icu46/source/i18n/uspoof.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/uspoof.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ***************************************************************************

	3 * Copyright (C) 2008-2009, International Business Machines Corporation

	4 * and others. All Rights Reserved.

	5 ***************************************************************************

	6 * file name: uspoof.cpp

	7 * encoding: US-ASCII

	8 * tab size: 8 (not used)

	9 * indentation:4

	10 *

	11 * created on: 2008Feb13

	12 * created by: Andy Heninger

	13 *

	14 * Unicode Spoof Detection

	15 */

	16 #include "unicode/utypes.h"

	17 #include "unicode/uspoof.h"

	18 #include "unicode/unorm.h"

	19 #include "unicode/ustring.h"

	20 #include "cmemory.h"

	21 #include "uspoof_impl.h"

	22 #include "uassert.h"

	23

	24

	25 #if !UCONFIG_NO_NORMALIZATION

	26

	27

	28 #include <stdio.h> // debug

	29

	30 U_NAMESPACE_USE

	31

	32

	33 U_CAPI USpoofChecker * U_EXPORT2

	34 uspoof_open(UErrorCode *status) {

	35 if (U_FAILURE(*status)) {

	36 return NULL;

	37 }

	38 SpoofImpl si = new SpoofImpl(SpoofData::getDefault(status), *status);

	39 if (U_FAILURE(*status)) {

	40 delete si;

	41 si = NULL;

	42 }

	43 return (USpoofChecker *)si;

	44 }

	45

	46

	47 U_CAPI USpoofChecker * U_EXPORT2

	48 uspoof_openFromSerialized(const void data, int32_t length, int32_t pActualLeng th,

	49 UErrorCode *status) {

	50 if (U_FAILURE(*status)) {

	51 return NULL;

	52 }

	53 SpoofData sd = new SpoofData(data, length, status);

	54 SpoofImpl si = new SpoofImpl(sd, status);

	55 if (U_FAILURE(*status)) {

	56 delete sd;

	57 delete si;

	58 return NULL;

	59 }

	60 if (sd == NULL \|\| si == NULL) {

	61 *status = U_MEMORY_ALLOCATION_ERROR;

	62 delete sd;

	63 delete si;

	64 return NULL;

	65 }

	66

	67 if (pActualLength != NULL) {

	68 *pActualLength = sd->fRawData->fLength;

	69 }

	70 return reinterpret_cast<USpoofChecker *>(si);

	71 }

	72

	73

	74 U_CAPI USpoofChecker * U_EXPORT2

	75 uspoof_clone(const USpoofChecker sc, UErrorCode status) {

	76 const SpoofImpl src = SpoofImpl::validateThis(sc, status);

	77 if (src == NULL) {

	78 return NULL;

	79 }

	80 SpoofImpl result = new SpoofImpl(src, *status); // copy constructor

	81 if (U_FAILURE(*status)) {

	82 delete result;

	83 result = NULL;

	84 }

	85 return (USpoofChecker *)result;

	86 }

	87

	88

	89 U_CAPI void U_EXPORT2

	90 uspoof_close(USpoofChecker *sc) {

	91 UErrorCode status = U_ZERO_ERROR;

	92 SpoofImpl *This = SpoofImpl::validateThis(sc, status);

	93 delete This;

	94 }

	95

	96

	97 U_CAPI void U_EXPORT2

	98 uspoof_setChecks(USpoofChecker sc, int32_t checks, UErrorCode status) {

	99 SpoofImpl This = SpoofImpl::validateThis(sc, status);

	100 if (This == NULL) {

	101 return;

	102 }

	103

	104 // Verify that the requested checks are all ones (bits) that

	105 // are acceptable, known values.

	106 if (checks & ~USPOOF_ALL_CHECKS) {

	107 *status = U_ILLEGAL_ARGUMENT_ERROR;

	108 return;

	109 }

	110

	111 This->fChecks = checks;

	112 }

	113

	114

	115 U_CAPI int32_t U_EXPORT2

	116 uspoof_getChecks(const USpoofChecker sc, UErrorCode status) {

	117 const SpoofImpl This = SpoofImpl::validateThis(sc, status);

	118 if (This == NULL) {

	119 return 0;

	120 }

	121 return This->fChecks;

	122 }

	123

	124 U_CAPI void U_EXPORT2

	125 uspoof_setAllowedLocales(USpoofChecker sc, const char localesList, UErrorCode *status) {

	126 SpoofImpl This = SpoofImpl::validateThis(sc, status);

	127 if (This == NULL) {

	128 return;

	129 }

	130 This->setAllowedLocales(localesList, *status);

	131 }

	132

	133 U_CAPI const char * U_EXPORT2

	134 uspoof_getAllowedLocales(USpoofChecker sc, UErrorCode status) {

	135 SpoofImpl This = SpoofImpl::validateThis(sc, status);

	136 if (This == NULL) {

	137 return NULL;

	138 }

	139 return This->getAllowedLocales(*status);

	140 }

	141

	142

	143 U_CAPI const USet * U_EXPORT2

	144 uspoof_getAllowedChars(const USpoofChecker sc, UErrorCode status) {

	145 const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);

	146 return reinterpret_cast<const USet *>(result);

	147 }

	148

	149 U_CAPI const UnicodeSet * U_EXPORT2

	150 uspoof_getAllowedUnicodeSet(const USpoofChecker sc, UErrorCode status) {

	151 const SpoofImpl This = SpoofImpl::validateThis(sc, status);

	152 if (This == NULL) {

	153 return NULL;

	154 }

	155 return This->fAllowedCharsSet;

	156 }

	157

	158

	159 U_CAPI void U_EXPORT2

	160 uspoof_setAllowedChars(USpoofChecker sc, const USet chars, UErrorCode *status) {

	161 const UnicodeSet set = reinterpret_cast<const UnicodeSet >(chars);

	162 uspoof_setAllowedUnicodeSet(sc, set, status);

	163 }

	164

	165

	166 U_CAPI void U_EXPORT2

	167 uspoof_setAllowedUnicodeSet(USpoofChecker sc, const UnicodeSet chars, UErrorCo de *status) {

	168 SpoofImpl This = SpoofImpl::validateThis(sc, status);

	169 if (This == NULL) {

	170 return;

	171 }

	172 if (chars->isBogus()) {

	173 *status = U_ILLEGAL_ARGUMENT_ERROR;

	174 return;

	175 }

	176 UnicodeSet clonedSet = static_cast<UnicodeSet >(chars->clone());

	177 if (clonedSet == NULL \|\| clonedSet->isBogus()) {

	178 *status = U_MEMORY_ALLOCATION_ERROR;

	179 return;

	180 }

	181 clonedSet->freeze();

	182 delete This->fAllowedCharsSet;

	183 This->fAllowedCharsSet = clonedSet;

	184 This->fChecks \|= USPOOF_CHAR_LIMIT;

	185 }

	186

	187

	188 U_CAPI int32_t U_EXPORT2

	189 uspoof_check(const USpoofChecker *sc,

	190 const UChar *text, int32_t length,

	191 int32_t *position,

	192 UErrorCode *status) {

	193

	194 const SpoofImpl This = SpoofImpl::validateThis(sc, status);

	195 if (This == NULL) {

	196 return 0;

	197 }

	198 if (length < -1) {

	199 *status = U_ILLEGAL_ARGUMENT_ERROR;

	200 return 0;

	201 }

	202 if (length == -1) {

	203 // It's not worth the bother to handle nul terminated strings everywhere .

	204 // Just get the length and be done with it.

	205 length = u_strlen(text);

	206 }

	207

	208 int32_t result = 0;

	209 int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32?

	210

	211 // A count of the number of non-Common or inherited scripts.

	212 // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE t ests.

	213 // Share the computation when possible. scriptCount == -1 means that we hav en't

	214 // done it yet.

	215 int32_t scriptCount = -1;

	216

	217 if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {

	218 scriptCount = This->scriptScan(text, length, failPos, *status);

	219 // printf("scriptCount (clipped to 2) = %d\n", scriptCount);

	220 if ( scriptCount >= 2) {

	221 // Note: scriptCount == 2 covers all cases of the number of scripts >= 2

	222 result \|= USPOOF_SINGLE_SCRIPT;

	223 }

	224 }

	225

	226 if (This->fChecks & USPOOF_CHAR_LIMIT) {

	227 int32_t i;

	228 UChar32 c;

	229 for (i=0; i<length ;) {

	230 U16_NEXT(text, i, length, c);

	231 if (!This->fAllowedCharsSet->contains(c)) {

	232 result \|= USPOOF_CHAR_LIMIT;

	233 if (i < failPos) {

	234 failPos = i;

	235 }

	236 break;

	237 }

	238 }

	239 }

	240

	241 if (This->fChecks &

	242 (USPOOF_WHOLE_SCRIPT_CONFUSABLE \| USPOOF_MIXED_SCRIPT_CONFUSABLE \| USPOO F_INVISIBLE)) {

	243 // These are the checks that need to be done on NFKD input

	244 NFKDBuffer normalizedInput(text, length, *status);

	245 const UChar *nfkdText = normalizedInput.getBuffer();

	246 int32_t nfkdLength = normalizedInput.getLength();

	247

	248 if (This->fChecks & USPOOF_INVISIBLE) {

	249

	250 // scan for more than one occurence of the same non-spacing mark

	251 // in a sequence of non-spacing marks.

	252 int32_t i;

	253 UChar32 c;

	254 UChar32 firstNonspacingMark = 0;

	255 UBool haveMultipleMarks = FALSE;

	256 UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.

	257

	258 for (i=0; i<length ;) {

	259 U16_NEXT(nfkdText, i, nfkdLength, c);

	260 if (u_charType(c) != U_NON_SPACING_MARK) {

	261 firstNonspacingMark = 0;

	262 if (haveMultipleMarks) {

	263 marksSeenSoFar.clear();

	264 haveMultipleMarks = FALSE;

	265 }

	266 continue;

	267 }

	268 if (firstNonspacingMark == 0) {

	269 firstNonspacingMark = c;

	270 continue;

	271 }

	272 if (!haveMultipleMarks) {

	273 marksSeenSoFar.add(firstNonspacingMark);

	274 haveMultipleMarks = TRUE;

	275 }

	276 if (marksSeenSoFar.contains(c)) {

	277 // report the error, and stop scanning.

	278 // No need to find more than the first failure.

	279 result \|= USPOOF_INVISIBLE;

	280 failPos = i;

	281 break;

	282 }

	283 marksSeenSoFar.add(c);

	284 }

	285 }

	286

	287

	288 if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE \| USPOOF_MIXED_SCRIP T_CONFUSABLE)) {

	289 // The basic test is the same for both whole and mixed script confus ables.

	290 // Compute the set of scripts that every input character has a confu sable in.

	291 // For this computation an input character is always considered to b e

	292 // confusable with itself in its own script.

	293 // If the number of such scripts is two or more, and the input consi sted of

	294 // characters all from a single script, we have a whole script con fusable.

	295 // (The two scripts will be the original script and the one that i s confusable)

	296 // If the number of such scripts >= one, and the original input cont ained characters from

	297 // more than one script, we have a mixed script confusable. (We c an transform

	298 // some of the characters, and end up with a visually similar stri ng all in

	299 // one script.)

	300

	301 if (scriptCount == -1) {

	302 int32_t t;

	303 scriptCount = This->scriptScan(text, length, t, *status);

	304 }

	305

	306 ScriptSet scripts;

	307 This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);

	308 int32_t confusableScriptCount = scripts.countMembers();

	309 //printf("confusableScriptCount = %d\n", confusableScriptCount);

	310

	311 if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&

	312 confusableScriptCount >= 2 &&

	313 scriptCount == 1) {

	314 result \|= USPOOF_WHOLE_SCRIPT_CONFUSABLE;

	315 }

	316

	317 if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&

	318 confusableScriptCount >= 1 &&

	319 scriptCount > 1) {

	320 result \|= USPOOF_MIXED_SCRIPT_CONFUSABLE;

	321 }

	322 }

	323 }

	324 if (position != NULL && failPos != 0x7fffffff) {

	325 *position = failPos;

	326 }

	327 return result;

	328 }

	329

	330

	331 U_CAPI int32_t U_EXPORT2

	332 uspoof_checkUTF8(const USpoofChecker *sc,

	333 const char *text, int32_t length,

	334 int32_t *position,

	335 UErrorCode *status) {

	336

	337 if (U_FAILURE(*status)) {

	338 return 0;

	339 }

	340 UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];

	341 UChar* text16 = stackBuf;

	342 int32_t len16;

	343

	344 u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status );

	345 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {

	346 return 0;

	347 }

	348 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	349 text16 = static_cast<UChar >(uprv_malloc(len16 sizeof(UChar) + 2));

	350 if (text16 == NULL) {

	351 *status = U_MEMORY_ALLOCATION_ERROR;

	352 return 0;

	353 }

	354 *status = U_ZERO_ERROR;

	355 u_strFromUTF8(text16, len16+1, NULL, text, length, status);

	356 }

	357

	358 int32_t position16 = -1;

	359 int32_t result = uspoof_check(sc, text16, len16, &position16, status);

	360 if (U_FAILURE(*status)) {

	361 return 0;

	362 }

	363

	364 if (position16 > 0) {

	365 // Translate a UTF-16 based error position back to a UTF-8 offset.

	366 // u_strToUTF8() in preflight mode is an easy way to do it.

	367 U_ASSERT(position16 <= len16);

	368 u_strToUTF8(NULL, 0, position, text16, position16, status);

	369 if (position > 0) {

	370 // position is the required buffer length from u_strToUTF8, which in cludes

	371 // space for a terminating NULL, which we don't want, hence the -1.

	372 *position -= 1;

	373 }

	374 *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERR OR.

	375 }

	376

	377 if (text16 != stackBuf) {

	378 uprv_free(text16);

	379 }

	380 return result;

	381

	382 }

	383

	384 /* A convenience wrapper around the public uspoof_getSkeleton that handles

	385 * allocating a larger buffer than provided if the original is too small.

	386 */

	387 static UChar getSkeleton(const USpoofChecker sc, uint32_t type, const UChar *s , int32_t inputLength,

	388 UChar dest, int32_t destCapacity, int32_t outputLengt h, UErrorCode *status) {

	389 int32_t requiredCapacity = 0;

	390 UChar *buf = dest;

	391

	392 if (U_FAILURE(*status)) {

	393 return NULL;

	394 }

	395 requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCa pacity, status);

	396 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	397 buf = static_cast<UChar >(uprv_malloc(requiredCapacity sizeof(UChar)) );

	398 if (buf == NULL) {

	399 *status = U_MEMORY_ALLOCATION_ERROR;

	400 return NULL;

	401 }

	402 *status = U_ZERO_ERROR;

	403 uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, stat us);

	404 }

	405 *outputLength = requiredCapacity;

	406 return buf;

	407 }

	408

	409

	410 U_CAPI int32_t U_EXPORT2

	411 uspoof_areConfusable(const USpoofChecker *sc,

	412 const UChar *s1, int32_t length1,

	413 const UChar *s2, int32_t length2,

	414 UErrorCode *status) {

	415 const SpoofImpl This = SpoofImpl::validateThis(sc, status);

	416 if (U_FAILURE(*status)) {

	417 return 0;

	418 }

	419 //

	420 // See section 4 of UAX 39 for the algorithm for checking whether two string s are confusable,

	421 // and for definitions of the types (single, whole, mixed-script) of confu sables.

	422

	423 // We only care about a few of the check flags. Ignore the others.

	424 // If no tests relavant to this function have been specified, return an erro r.

	425 // TODO: is this really the right thing to do? It's probably an error on t he caller's part,

	426 // but logically we would just return 0 (no error).

	427 if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE \| USPOOF_MIXED_SCRIPT_ CONFUSABLE \|

	428 USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {

	429 *status = U_INVALID_STATE_ERROR;

	430 return 0;

	431 }

	432 int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;

	433 UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];

	434 UChar *s1Skeleton;

	435 int32_t s1SkeletonLength = 0;

	436

	437 UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];

	438 UChar *s2Skeleton;

	439 int32_t s2SkeletonLength = 0;

	440

	441 int32_t result = 0;

	442 int32_t t;

	443 int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status);

	444 int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status);

	445

	446 if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {

	447 // Do the Single Script compare.

	448 if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {

	449 flagsForSkeleton \|= USPOOF_SINGLE_SCRIPT_CONFUSABLE;

	450 s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1Skelet onBuf,

	451 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1Ske letonLength, status);

	452 s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2Skelet onBuf,

	453 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2Ske letonLength, status);

	454 if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2 Skeleton, s1SkeletonLength) == 0) {

	455 result \|= USPOOF_SINGLE_SCRIPT_CONFUSABLE;

	456 }

	457 if (s1Skeleton != s1SkeletonBuf) {

	458 uprv_free(s1Skeleton);

	459 }

	460 if (s2Skeleton != s2SkeletonBuf) {

	461 uprv_free(s2Skeleton);

	462 }

	463 }

	464 }

	465

	466 if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {

	467 // If the two inputs are single script confusable they cannot also be

	468 // mixed or whole script confusable, according to the UAX39 definitions .

	469 // So we can skip those tests.

	470 return result;

	471 }

	472

	473 // Optimization for whole script confusables test: two identifiers are whol e script confusable if

	474 // each is of a single script and they are mixed script confusable.

	475 UBool possiblyWholeScriptConfusables =

	476 s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOL E_SCRIPT_CONFUSABLE);

	477

	478 //

	479 // Mixed Script Check

	480 //

	481 if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) \|\| possiblyWholeScriptC onfusables ) {

	482 // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us

	483 // the mixed script table skeleton, which is what we want.

	484 // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.

	485 flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;

	486 s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBu f,

	487 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1Skeleto nLength, status);

	488 s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBu f,

	489 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2Skeleto nLength, status);

	490 if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skel eton, s1SkeletonLength) == 0) {

	491 result \|= USPOOF_MIXED_SCRIPT_CONFUSABLE;

	492 if (possiblyWholeScriptConfusables) {

	493 result \|= USPOOF_WHOLE_SCRIPT_CONFUSABLE;

	494 }

	495 }

	496 if (s1Skeleton != s1SkeletonBuf) {

	497 uprv_free(s1Skeleton);

	498 }

	499 if (s2Skeleton != s2SkeletonBuf) {

	500 uprv_free(s2Skeleton);

	501 }

	502 }

	503

	504 return result;

	505 }

	506

	507

	508 // Convenience function for converting a UTF-8 input to a UChar * string, includ ing

	509 // reallocating a buffer when required. Parameters and their interpret ation mostly

	510 // match u_strFromUTF8.

	511

	512 static UChar * convertFromUTF8(UChar outBuf, int32_t outBufCapacity, int32_t o utputLength,

	513 const char in, int32_t inLength, UErrorCode sta tus) {

	514 if (U_FAILURE(*status)) {

	515 return NULL;

	516 }

	517 UChar *dest = outBuf;

	518 u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);

	519 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	520 dest = static_cast<UChar >(uprv_malloc(outputLength * sizeof(UChar)));

	521 if (dest == NULL) {

	522 *status = U_MEMORY_ALLOCATION_ERROR;

	523 return NULL;

	524 }

	525 *status = U_ZERO_ERROR;

	526 u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);

	527 }

	528 return dest;

	529 }

	530

	531

	532

	533 U_CAPI int32_t U_EXPORT2

	534 uspoof_areConfusableUTF8(const USpoofChecker *sc,

	535 const char *s1, int32_t length1,

	536 const char *s2, int32_t length2,

	537 UErrorCode *status) {

	538

	539 SpoofImpl::validateThis(sc, *status);

	540 if (U_FAILURE(*status)) {

	541 return 0;

	542 }

	543

	544 UChar s1Buf[USPOOF_STACK_BUFFER_SIZE];

	545 int32_t lengthS1U;

	546 UChar *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);

	547

	548 UChar s2Buf[USPOOF_STACK_BUFFER_SIZE];

	549 int32_t lengthS2U;

	550 UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);

	551

	552 int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, s tatus);

	553

	554 if (s1U != s1Buf) {

	555 uprv_free(s1U);

	556 }

	557 if (s2U != s2Buf) {

	558 uprv_free(s2U);

	559 }

	560 return results;

	561 }

	562

	563

	564 U_CAPI int32_t U_EXPORT2

	565 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,

	566 const U_NAMESPACE_QUALIFIER UnicodeString &s1,

	567 const U_NAMESPACE_QUALIFIER UnicodeString &s2,

	568 UErrorCode *status) {

	569

	570 const UChar *u1 = s1.getBuffer();

	571 int32_t length1 = s1.length();

	572 const UChar *u2 = s2.getBuffer();

	573 int32_t length2 = s2.length();

	574

	575 int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status );

	576 return results;

	577 }

	578

	579

	580

	581

	582 U_CAPI int32_t U_EXPORT2

	583 uspoof_checkUnicodeString(const USpoofChecker *sc,

	584 const U_NAMESPACE_QUALIFIER UnicodeString &text,

	585 int32_t *position,

	586 UErrorCode *status) {

	587 int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);

	588 return result;

	589 }

	590

	591

	592 U_CAPI int32_t U_EXPORT2

	593 uspoof_getSkeleton(const USpoofChecker *sc,

	594 uint32_t type,

	595 const UChar *s, int32_t length,

	596 UChar *dest, int32_t destCapacity,

	597 UErrorCode *status) {

	598

	599 // TODO: this function could be sped up a bit

	600 // Skip the input normalization when not needed, work from callers da ta.

	601 // Put the initial skeleton straight into the caller's destination bu ffer.

	602 // It probably won't need normalization.

	603 // But these would make the structure more complicated.

	604

	605 const SpoofImpl This = SpoofImpl::validateThis(sc, status);

	606 if (U_FAILURE(*status)) {

	607 return 0;

	608 }

	609 if (length<-1 \|\| destCapacity<0 \|\| (destCapacity==0 && dest!=NULL) \|\|

	610 (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE \| USPOOF_ANY_CASE)) != 0) {

	611 *status = U_ILLEGAL_ARGUMENT_ERROR;

	612 return 0;

	613 }

	614

	615 int32_t tableMask = 0;

	616 switch (type) {

	617 case 0:

	618 tableMask = USPOOF_ML_TABLE_FLAG;

	619 break;

	620 case USPOOF_SINGLE_SCRIPT_CONFUSABLE:

	621 tableMask = USPOOF_SL_TABLE_FLAG;

	622 break;

	623 case USPOOF_ANY_CASE:

	624 tableMask = USPOOF_MA_TABLE_FLAG;

	625 break;

	626 case USPOOF_SINGLE_SCRIPT_CONFUSABLE \| USPOOF_ANY_CASE:

	627 tableMask = USPOOF_SA_TABLE_FLAG;

	628 break;

	629 default:

	630 *status = U_ILLEGAL_ARGUMENT_ERROR;

	631 return 0;

	632 }

	633

	634 // NFKD transform of the user supplied input

	635

	636 UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];

	637 UChar *nfkdInput = nfkdStackBuf;

	638 int32_t normalizedLen = unorm_normalize(

	639 s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);

	640 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	641 nfkdInput = (UChar )uprv_malloc((normalizedLen+1)sizeof(UChar));

	642 if (nfkdInput == NULL) {

	643 *status = U_MEMORY_ALLOCATION_ERROR;

	644 return 0;

	645 }

	646 *status = U_ZERO_ERROR;

	647 normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,

	648 nfkdInput, normalizedLen+1, status);

	649 }

	650 if (U_FAILURE(*status)) {

	651 if (nfkdInput != nfkdStackBuf) {

	652 uprv_free(nfkdInput);

	653 }

	654 return 0;

	655 }

	656

	657 // buffer to hold the Unicode defined skeleton mappings for a single code po int

	658 UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

	659

	660 // Apply the skeleton mapping to the NFKD normalized input string

	661 // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.

	662 int32_t inputIndex = 0;

	663 UnicodeString skelStr;

	664 while (inputIndex < normalizedLen) {

	665 UChar32 c;

	666 U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);

	667 int32_t replaceLen = This->confusableLookup(c, tableMask, buf);

	668 skelStr.append(buf, replaceLen);

	669 }

	670

	671 if (nfkdInput != nfkdStackBuf) {

	672 uprv_free(nfkdInput);

	673 }

	674

	675 const UChar *result = skelStr.getBuffer();

	676 int32_t resultLen = skelStr.length();

	677 UChar *normedResult = NULL;

	678

	679 // Check the skeleton for NFKD, normalize it if needed.

	680 // Unnormalized results should be very rare.

	681 if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {

	682 normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);

	683 normedResult = static_cast<UChar >(uprv_malloc((normalizedLen+1)sizeof (UChar)));

	684 if (normedResult == NULL) {

	685 *status = U_MEMORY_ALLOCATION_ERROR;

	686 return 0;

	687 }

	688 *status = U_ZERO_ERROR;

	689 unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normaliz edLen+1, status);

	690 result = normedResult;

	691 resultLen = normalizedLen;

	692 }

	693

	694 // Copy the skeleton to the caller's buffer

	695 if (U_SUCCESS(*status)) {

	696 if (destCapacity == 0 \|\| resultLen > destCapacity) {

	697 *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRIN G_NOT_TERMINATED_WARNING;

	698 } else {

	699 u_memcpy(dest, result, resultLen);

	700 if (destCapacity > resultLen) {

	701 dest[resultLen] = 0;

	702 } else {

	703 *status = U_STRING_NOT_TERMINATED_WARNING;

	704 }

	705 }

	706 }

	707 uprv_free(normedResult);

	708 return resultLen;

	709 }

	710

	711

	712

	713 U_CAPI UnicodeString & U_EXPORT2

	714 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,

	715 uint32_t type,

	716 const UnicodeString &s,

	717 UnicodeString &dest,

	718 UErrorCode *status) {

	719 if (U_FAILURE(*status)) {

	720 return dest;

	721 }

	722 dest.remove();

	723

	724 const UChar *str = s.getBuffer();

	725 int32_t strLen = s.length();

	726 UChar smallBuf[USPOOF_STACK_BUFFER_SIZE];

	727 UChar *buf = smallBuf;

	728 int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USP OOF_STACK_BUFFER_SIZE, status);

	729 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	730 buf = static_cast<UChar >(uprv_malloc((outputSize+1)sizeof(UChar)));

	731 if (buf == NULL) {

	732 *status = U_MEMORY_ALLOCATION_ERROR;

	733 return dest;

	734 }

	735 *status = U_ZERO_ERROR;

	736 uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);

	737 }

	738 if (U_SUCCESS(*status)) {

	739 dest.setTo(buf, outputSize);

	740 }

	741

	742 if (buf != smallBuf) {

	743 uprv_free(buf);

	744 }

	745 return dest;

	746 }

	747

	748

	749 U_CAPI int32_t U_EXPORT2

	750 uspoof_getSkeletonUTF8(const USpoofChecker *sc,

	751 uint32_t type,

	752 const char *s, int32_t length,

	753 char *dest, int32_t destCapacity,

	754 UErrorCode *status) {

	755 // Lacking a UTF-8 normalization API, just converting the input to

	756 // UTF-16 seems as good an approach as any. In typical use, input will

	757 // be an identifier, which is to say not too long for stack buffers.

	758 if (U_FAILURE(*status)) {

	759 return 0;

	760 }

	761 // Buffers for the UChar form of the input and skeleton strings.

	762 UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE];

	763 UChar *inBuf = smallInBuf;

	764 UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE];

	765 UChar *outBuf = smallOutBuf;

	766

	767 int32_t lengthInUChars = 0;

	768 int32_t skelLengthInUChars = 0;

	769 int32_t skelLengthInUTF8 = 0;

	770

	771 u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,

	772 s, length, status);

	773 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	774 inBuf = static_cast<UChar >(uprv_malloc((lengthInUChars+1)sizeof(UChar )));

	775 if (inBuf == NULL) {

	776 *status = U_MEMORY_ALLOCATION_ERROR;

	777 goto cleanup;

	778 }

	779 *status = U_ZERO_ERROR;

	780 u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,

	781 s, length, status);

	782 }

	783

	784 skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,

	785 outBuf, USPOOF_STACK_BUFFER_SIZE, statu s);

	786 if (*status == U_BUFFER_OVERFLOW_ERROR) {

	787 outBuf = static_cast<UChar >(uprv_malloc((skelLengthInUChars+1)sizeof( UChar)));

	788 if (outBuf == NULL) {

	789 *status = U_MEMORY_ALLOCATION_ERROR;

	790 goto cleanup;

	791 }

	792 *status = U_ZERO_ERROR;

	793 skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,

	794 outBuf, skelLengthInUChars+1, status);

	795 }

	796

	797 u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,

	798 outBuf, skelLengthInUChars, status);

	799

	800 cleanup:

	801 if (inBuf != smallInBuf) {

	802 uprv_free(inBuf);

	803 }

	804 if (outBuf != smallOutBuf) {

	805 uprv_free(outBuf);

	806 }

	807 return skelLengthInUTF8;

	808 }

	809

	810

	811 U_CAPI int32_t U_EXPORT2

	812 uspoof_serialize(USpoofChecker sc,void buf, int32_t capacity, UErrorCode *stat us) {

	813 SpoofImpl This = SpoofImpl::validateThis(sc, status);

	814 if (This == NULL) {

	815 U_ASSERT(U_FAILURE(*status));

	816 return 0;

	817 }

	818 int32_t dataSize = This->fSpoofData->fRawData->fLength;

	819 if (capacity < dataSize) {

	820 *status = U_BUFFER_OVERFLOW_ERROR;

	821 return dataSize;

	822 }

	823 uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);

	824 return dataSize;

	825 }

	826

	827 #endif

OLD	NEW

« no previous file with comments | « icu46/source/i18n/usearch.cpp ('k') | icu46/source/i18n/uspoof_build.cpp » ('j') | no next file with comments »