OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2001-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: unormcmp.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2004sep13 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * unorm_compare() function moved here from unorm.cpp for better modularization
. |
| 17 * Depends on both normalization and case folding. |
| 18 * Allows unorm.cpp to not depend on any character properties code. |
| 19 */ |
| 20 |
| 21 #include "unicode/utypes.h" |
| 22 |
| 23 #if !UCONFIG_NO_NORMALIZATION |
| 24 |
| 25 #include "unicode/unorm.h" |
| 26 #include "unicode/ustring.h" |
| 27 #include "cmemory.h" |
| 28 #include "normalizer2impl.h" |
| 29 #include "ucase.h" |
| 30 #include "uprops.h" |
| 31 #include "ustr_imp.h" |
| 32 |
| 33 U_NAMESPACE_USE |
| 34 |
| 35 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
| 36 |
| 37 /* compare canonically equivalent ------------------------------------------- */ |
| 38 |
| 39 /* |
| 40 * Compare two strings for canonical equivalence. |
| 41 * Further options include case-insensitive comparison and |
| 42 * code point order (as opposed to code unit order). |
| 43 * |
| 44 * In this function, canonical equivalence is optional as well. |
| 45 * If canonical equivalence is tested, then both strings must fulfill |
| 46 * the FCD check. |
| 47 * |
| 48 * Semantically, this is equivalent to |
| 49 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) |
| 50 * where code point order, NFD and foldCase are all optional. |
| 51 * |
| 52 * String comparisons almost always yield results before processing both strings |
| 53 * completely. |
| 54 * They are generally more efficient working incrementally instead of |
| 55 * performing the sub-processing (strlen, normalization, case-folding) |
| 56 * on the entire strings first. |
| 57 * |
| 58 * It is also unnecessary to not normalize identical characters. |
| 59 * |
| 60 * This function works in principle as follows: |
| 61 * |
| 62 * loop { |
| 63 * get one code unit c1 from s1 (-1 if end of source) |
| 64 * get one code unit c2 from s2 (-1 if end of source) |
| 65 * |
| 66 * if(either string finished) { |
| 67 * return result; |
| 68 * } |
| 69 * if(c1==c2) { |
| 70 * continue; |
| 71 * } |
| 72 * |
| 73 * // c1!=c2 |
| 74 * try to decompose/case-fold c1/c2, and continue if one does; |
| 75 * |
| 76 * // still c1!=c2 and neither decomposes/case-folds, return result |
| 77 * return c1-c2; |
| 78 * } |
| 79 * |
| 80 * When a character decomposes, then the pointer for that source changes to |
| 81 * the decomposition, pushing the previous pointer onto a stack. |
| 82 * When the end of the decomposition is reached, then the code unit reader |
| 83 * pops the previous source from the stack. |
| 84 * (Same for case-folding.) |
| 85 * |
| 86 * This is complicated further by operating on variable-width UTF-16. |
| 87 * The top part of the loop works on code units, while lookups for decomposition |
| 88 * and case-folding need code points. |
| 89 * Code points are assembled after the equality/end-of-source part. |
| 90 * The source pointer is only advanced beyond all code units when the code point |
| 91 * actually decomposes/case-folds. |
| 92 * |
| 93 * If we were on a trail surrogate unit when assembling a code point, |
| 94 * and the code point decomposes/case-folds, then the decomposition/folding |
| 95 * result must be compared with the part of the other string that corresponds to |
| 96 * this string's lead surrogate. |
| 97 * Since we only assemble a code point when hitting a trail unit when the |
| 98 * preceding lead units were identical, we back up the other string by one unit |
| 99 * in such a case. |
| 100 * |
| 101 * The optional code point order comparison at the end works with |
| 102 * the same fix-up as the other code point order comparison functions. |
| 103 * See ustring.c and the comment near the end of this function. |
| 104 * |
| 105 * Assumption: A decomposition or case-folding result string never contains |
| 106 * a single surrogate. This is a safe assumption in the Unicode Standard. |
| 107 * Therefore, we do not need to check for surrogate pairs across |
| 108 * decomposition/case-folding boundaries. |
| 109 * |
| 110 * Further assumptions (see verifications tstnorm.cpp): |
| 111 * The API function checks for FCD first, while the core function |
| 112 * first case-folds and then decomposes. This requires that case-folding does no
t |
| 113 * un-FCD any strings. |
| 114 * |
| 115 * The API function may also NFD the input and turn off decomposition. |
| 116 * This requires that case-folding does not un-NFD strings either. |
| 117 * |
| 118 * TODO If any of the above two assumptions is violated, |
| 119 * then this entire code must be re-thought. |
| 120 * If this happens, then a simple solution is to case-fold both strings up front |
| 121 * and to turn off UNORM_INPUT_IS_FCD. |
| 122 * We already do this when not both strings are in FCD because makeFCD |
| 123 * would be a partial NFD before the case folding, which does not work. |
| 124 * Note that all of this is only a problem when case-folding _and_ |
| 125 * canonical equivalence come together. |
| 126 * (Comments in unorm_compare() are more up to date than this TODO.) |
| 127 */ |
| 128 |
| 129 /* stack element for previous-level source/decomposition pointers */ |
| 130 struct CmpEquivLevel { |
| 131 const UChar *start, *s, *limit; |
| 132 }; |
| 133 typedef struct CmpEquivLevel CmpEquivLevel; |
| 134 |
| 135 /** |
| 136 * Internal option for unorm_cmpEquivFold() for decomposing. |
| 137 * If not set, just do strcasecmp(). |
| 138 */ |
| 139 #define _COMPARE_EQUIV 0x80000 |
| 140 |
| 141 /* internal function */ |
| 142 static int32_t |
| 143 unorm_cmpEquivFold(const UChar *s1, int32_t length1, |
| 144 const UChar *s2, int32_t length2, |
| 145 uint32_t options, |
| 146 UErrorCode *pErrorCode) { |
| 147 const Normalizer2Impl *nfcImpl; |
| 148 const UCaseProps *csp; |
| 149 |
| 150 /* current-level start/limit - s1/s2 as current */ |
| 151 const UChar *start1, *start2, *limit1, *limit2; |
| 152 |
| 153 /* decomposition and case folding variables */ |
| 154 const UChar *p; |
| 155 int32_t length; |
| 156 |
| 157 /* stacks of previous-level start/current/limit */ |
| 158 CmpEquivLevel stack1[2], stack2[2]; |
| 159 |
| 160 /* buffers for algorithmic decompositions */ |
| 161 UChar decomp1[4], decomp2[4]; |
| 162 |
| 163 /* case folding buffers, only use current-level start/limit */ |
| 164 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; |
| 165 |
| 166 /* track which is the current level per string */ |
| 167 int32_t level1, level2; |
| 168 |
| 169 /* current code units, and code points for lookups */ |
| 170 UChar32 c1, c2, cp1, cp2; |
| 171 |
| 172 /* no argument error checking because this itself is not an API */ |
| 173 |
| 174 /* |
| 175 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNO
RE_CASE is set |
| 176 * otherwise this function must behave exactly as uprv_strCompare() |
| 177 * not checking for that here makes testing this function easier |
| 178 */ |
| 179 |
| 180 /* normalization/properties data loaded? */ |
| 181 if((options&_COMPARE_EQUIV)!=0) { |
| 182 nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode); |
| 183 } else { |
| 184 nfcImpl=NULL; |
| 185 } |
| 186 if((options&U_COMPARE_IGNORE_CASE)!=0) { |
| 187 csp=ucase_getSingleton(); |
| 188 } else { |
| 189 csp=NULL; |
| 190 } |
| 191 if(U_FAILURE(*pErrorCode)) { |
| 192 return 0; |
| 193 } |
| 194 |
| 195 /* initialize */ |
| 196 start1=s1; |
| 197 if(length1==-1) { |
| 198 limit1=NULL; |
| 199 } else { |
| 200 limit1=s1+length1; |
| 201 } |
| 202 |
| 203 start2=s2; |
| 204 if(length2==-1) { |
| 205 limit2=NULL; |
| 206 } else { |
| 207 limit2=s2+length2; |
| 208 } |
| 209 |
| 210 level1=level2=0; |
| 211 c1=c2=-1; |
| 212 |
| 213 /* comparison loop */ |
| 214 for(;;) { |
| 215 /* |
| 216 * here a code unit value of -1 means "get another code unit" |
| 217 * below it will mean "this source is finished" |
| 218 */ |
| 219 |
| 220 if(c1<0) { |
| 221 /* get next code unit from string 1, post-increment */ |
| 222 for(;;) { |
| 223 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRN
CMP_STYLE)))) { |
| 224 if(level1==0) { |
| 225 c1=-1; |
| 226 break; |
| 227 } |
| 228 } else { |
| 229 ++s1; |
| 230 break; |
| 231 } |
| 232 |
| 233 /* reached end of level buffer, pop one level */ |
| 234 do { |
| 235 --level1; |
| 236 start1=stack1[level1].start; |
| 237 } while(start1==NULL); |
| 238 s1=stack1[level1].s; |
| 239 limit1=stack1[level1].limit; |
| 240 } |
| 241 } |
| 242 |
| 243 if(c2<0) { |
| 244 /* get next code unit from string 2, post-increment */ |
| 245 for(;;) { |
| 246 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRN
CMP_STYLE)))) { |
| 247 if(level2==0) { |
| 248 c2=-1; |
| 249 break; |
| 250 } |
| 251 } else { |
| 252 ++s2; |
| 253 break; |
| 254 } |
| 255 |
| 256 /* reached end of level buffer, pop one level */ |
| 257 do { |
| 258 --level2; |
| 259 start2=stack2[level2].start; |
| 260 } while(start2==NULL); |
| 261 s2=stack2[level2].s; |
| 262 limit2=stack2[level2].limit; |
| 263 } |
| 264 } |
| 265 |
| 266 /* |
| 267 * compare c1 and c2 |
| 268 * either variable c1, c2 is -1 only if the corresponding string is fini
shed |
| 269 */ |
| 270 if(c1==c2) { |
| 271 if(c1<0) { |
| 272 return 0; /* c1==c2==-1 indicating end of strings */ |
| 273 } |
| 274 c1=c2=-1; /* make us fetch new code units */ |
| 275 continue; |
| 276 } else if(c1<0) { |
| 277 return -1; /* string 1 ends before string 2 */ |
| 278 } else if(c2<0) { |
| 279 return 1; /* string 2 ends before string 1 */ |
| 280 } |
| 281 /* c1!=c2 && c1>=0 && c2>=0 */ |
| 282 |
| 283 /* get complete code points for c1, c2 for lookups if either is a surrog
ate */ |
| 284 cp1=c1; |
| 285 if(U_IS_SURROGATE(c1)) { |
| 286 UChar c; |
| 287 |
| 288 if(U_IS_SURROGATE_LEAD(c1)) { |
| 289 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { |
| 290 /* advance ++s1; only below if cp1 decomposes/case-folds */ |
| 291 cp1=U16_GET_SUPPLEMENTARY(c1, c); |
| 292 } |
| 293 } else /* isTrail(c1) */ { |
| 294 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { |
| 295 cp1=U16_GET_SUPPLEMENTARY(c, c1); |
| 296 } |
| 297 } |
| 298 } |
| 299 |
| 300 cp2=c2; |
| 301 if(U_IS_SURROGATE(c2)) { |
| 302 UChar c; |
| 303 |
| 304 if(U_IS_SURROGATE_LEAD(c2)) { |
| 305 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { |
| 306 /* advance ++s2; only below if cp2 decomposes/case-folds */ |
| 307 cp2=U16_GET_SUPPLEMENTARY(c2, c); |
| 308 } |
| 309 } else /* isTrail(c2) */ { |
| 310 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { |
| 311 cp2=U16_GET_SUPPLEMENTARY(c, c2); |
| 312 } |
| 313 } |
| 314 } |
| 315 |
| 316 /* |
| 317 * go down one level for each string |
| 318 * continue with the main loop as soon as there is a real change |
| 319 */ |
| 320 |
| 321 if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && |
| 322 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 |
| 323 ) { |
| 324 /* cp1 case-folds to the code point "length" or to p[length] */ |
| 325 if(U_IS_SURROGATE(c1)) { |
| 326 if(U_IS_SURROGATE_LEAD(c1)) { |
| 327 /* advance beyond source surrogate pair if it case-folds */ |
| 328 ++s1; |
| 329 } else /* isTrail(c1) */ { |
| 330 /* |
| 331 * we got a supplementary code point when hitting its trail
surrogate, |
| 332 * therefore the lead surrogate must have been the same as i
n the other string; |
| 333 * compare this decomposition with the lead surrogate in the
other string |
| 334 * remember that this simulates bulk text replacement: |
| 335 * the decomposition would replace the entire code point |
| 336 */ |
| 337 --s2; |
| 338 c2=*(s2-1); |
| 339 } |
| 340 } |
| 341 |
| 342 /* push current level pointers */ |
| 343 stack1[0].start=start1; |
| 344 stack1[0].s=s1; |
| 345 stack1[0].limit=limit1; |
| 346 ++level1; |
| 347 |
| 348 /* copy the folding result to fold1[] */ |
| 349 if(length<=UCASE_MAX_STRING_LENGTH) { |
| 350 u_memcpy(fold1, p, length); |
| 351 } else { |
| 352 int32_t i=0; |
| 353 U16_APPEND_UNSAFE(fold1, i, length); |
| 354 length=i; |
| 355 } |
| 356 |
| 357 /* set next level pointers to case folding */ |
| 358 start1=s1=fold1; |
| 359 limit1=fold1+length; |
| 360 |
| 361 /* get ready to read from decomposition, continue with loop */ |
| 362 c1=-1; |
| 363 continue; |
| 364 } |
| 365 |
| 366 if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && |
| 367 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 |
| 368 ) { |
| 369 /* cp2 case-folds to the code point "length" or to p[length] */ |
| 370 if(U_IS_SURROGATE(c2)) { |
| 371 if(U_IS_SURROGATE_LEAD(c2)) { |
| 372 /* advance beyond source surrogate pair if it case-folds */ |
| 373 ++s2; |
| 374 } else /* isTrail(c2) */ { |
| 375 /* |
| 376 * we got a supplementary code point when hitting its trail
surrogate, |
| 377 * therefore the lead surrogate must have been the same as i
n the other string; |
| 378 * compare this decomposition with the lead surrogate in the
other string |
| 379 * remember that this simulates bulk text replacement: |
| 380 * the decomposition would replace the entire code point |
| 381 */ |
| 382 --s1; |
| 383 c1=*(s1-1); |
| 384 } |
| 385 } |
| 386 |
| 387 /* push current level pointers */ |
| 388 stack2[0].start=start2; |
| 389 stack2[0].s=s2; |
| 390 stack2[0].limit=limit2; |
| 391 ++level2; |
| 392 |
| 393 /* copy the folding result to fold2[] */ |
| 394 if(length<=UCASE_MAX_STRING_LENGTH) { |
| 395 u_memcpy(fold2, p, length); |
| 396 } else { |
| 397 int32_t i=0; |
| 398 U16_APPEND_UNSAFE(fold2, i, length); |
| 399 length=i; |
| 400 } |
| 401 |
| 402 /* set next level pointers to case folding */ |
| 403 start2=s2=fold2; |
| 404 limit2=fold2+length; |
| 405 |
| 406 /* get ready to read from decomposition, continue with loop */ |
| 407 c2=-1; |
| 408 continue; |
| 409 } |
| 410 |
| 411 if( level1<2 && (options&_COMPARE_EQUIV) && |
| 412 0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length)) |
| 413 ) { |
| 414 /* cp1 decomposes into p[length] */ |
| 415 if(U_IS_SURROGATE(c1)) { |
| 416 if(U_IS_SURROGATE_LEAD(c1)) { |
| 417 /* advance beyond source surrogate pair if it decomposes */ |
| 418 ++s1; |
| 419 } else /* isTrail(c1) */ { |
| 420 /* |
| 421 * we got a supplementary code point when hitting its trail
surrogate, |
| 422 * therefore the lead surrogate must have been the same as i
n the other string; |
| 423 * compare this decomposition with the lead surrogate in the
other string |
| 424 * remember that this simulates bulk text replacement: |
| 425 * the decomposition would replace the entire code point |
| 426 */ |
| 427 --s2; |
| 428 c2=*(s2-1); |
| 429 } |
| 430 } |
| 431 |
| 432 /* push current level pointers */ |
| 433 stack1[level1].start=start1; |
| 434 stack1[level1].s=s1; |
| 435 stack1[level1].limit=limit1; |
| 436 ++level1; |
| 437 |
| 438 /* set empty intermediate level if skipped */ |
| 439 if(level1<2) { |
| 440 stack1[level1++].start=NULL; |
| 441 } |
| 442 |
| 443 /* set next level pointers to decomposition */ |
| 444 start1=s1=p; |
| 445 limit1=p+length; |
| 446 |
| 447 /* get ready to read from decomposition, continue with loop */ |
| 448 c1=-1; |
| 449 continue; |
| 450 } |
| 451 |
| 452 if( level2<2 && (options&_COMPARE_EQUIV) && |
| 453 0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length)) |
| 454 ) { |
| 455 /* cp2 decomposes into p[length] */ |
| 456 if(U_IS_SURROGATE(c2)) { |
| 457 if(U_IS_SURROGATE_LEAD(c2)) { |
| 458 /* advance beyond source surrogate pair if it decomposes */ |
| 459 ++s2; |
| 460 } else /* isTrail(c2) */ { |
| 461 /* |
| 462 * we got a supplementary code point when hitting its trail
surrogate, |
| 463 * therefore the lead surrogate must have been the same as i
n the other string; |
| 464 * compare this decomposition with the lead surrogate in the
other string |
| 465 * remember that this simulates bulk text replacement: |
| 466 * the decomposition would replace the entire code point |
| 467 */ |
| 468 --s1; |
| 469 c1=*(s1-1); |
| 470 } |
| 471 } |
| 472 |
| 473 /* push current level pointers */ |
| 474 stack2[level2].start=start2; |
| 475 stack2[level2].s=s2; |
| 476 stack2[level2].limit=limit2; |
| 477 ++level2; |
| 478 |
| 479 /* set empty intermediate level if skipped */ |
| 480 if(level2<2) { |
| 481 stack2[level2++].start=NULL; |
| 482 } |
| 483 |
| 484 /* set next level pointers to decomposition */ |
| 485 start2=s2=p; |
| 486 limit2=p+length; |
| 487 |
| 488 /* get ready to read from decomposition, continue with loop */ |
| 489 c2=-1; |
| 490 continue; |
| 491 } |
| 492 |
| 493 /* |
| 494 * no decomposition/case folding, max level for both sides: |
| 495 * return difference result |
| 496 * |
| 497 * code point order comparison must not just return cp1-cp2 |
| 498 * because when single surrogates are present then the surrogate pairs |
| 499 * that formed cp1 and cp2 may be from different string indexes |
| 500 * |
| 501 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code
units |
| 502 * c1=d800 cp1=10001 c2=dc00 cp2=10000 |
| 503 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < {
10000 } |
| 504 * |
| 505 * therefore, use same fix-up as in ustring.c/uprv_strCompare() |
| 506 * except: uprv_strCompare() fetches c=*s while this functions fetches c
=*s++ |
| 507 * so we have slightly different pointer/start/limit comparisons here |
| 508 */ |
| 509 |
| 510 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { |
| 511 /* subtract 0x2800 from BMP code points to make them smaller than su
pplementary ones */ |
| 512 if( |
| 513 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || |
| 514 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) |
| 515 ) { |
| 516 /* part of a surrogate pair, leave >=d800 */ |
| 517 } else { |
| 518 /* BMP code point - may be surrogate code point - make <d800 */ |
| 519 c1-=0x2800; |
| 520 } |
| 521 |
| 522 if( |
| 523 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || |
| 524 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) |
| 525 ) { |
| 526 /* part of a surrogate pair, leave >=d800 */ |
| 527 } else { |
| 528 /* BMP code point - may be surrogate code point - make <d800 */ |
| 529 c2-=0x2800; |
| 530 } |
| 531 } |
| 532 |
| 533 return c1-c2; |
| 534 } |
| 535 } |
| 536 |
| 537 U_CAPI int32_t U_EXPORT2 |
| 538 unorm_compare(const UChar *s1, int32_t length1, |
| 539 const UChar *s2, int32_t length2, |
| 540 uint32_t options, |
| 541 UErrorCode *pErrorCode) { |
| 542 /* argument checking */ |
| 543 if(U_FAILURE(*pErrorCode)) { |
| 544 return 0; |
| 545 } |
| 546 if(s1==0 || length1<-1 || s2==0 || length2<-1) { |
| 547 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 548 return 0; |
| 549 } |
| 550 |
| 551 UnicodeString fcd1, fcd2; |
| 552 int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); |
| 553 options|=_COMPARE_EQUIV; |
| 554 |
| 555 /* |
| 556 * UAX #21 Case Mappings, as fixed for Unicode version 4 |
| 557 * (see Jitterbug 2021), defines a canonical caseless match as |
| 558 * |
| 559 * A string X is a canonical caseless match |
| 560 * for a string Y if and only if |
| 561 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) |
| 562 * |
| 563 * For better performance, we check for FCD (or let the caller tell us that |
| 564 * both strings are in FCD) for the inner normalization. |
| 565 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that |
| 566 * case-folding preserves the FCD-ness of a string. |
| 567 * The outer normalization is then only performed by unorm_cmpEquivFold() |
| 568 * when there is a difference. |
| 569 * |
| 570 * Exception: When using the Turkic case-folding option, we do perform |
| 571 * full NFD first. This is because in the Turkic case precomposed characters |
| 572 * with 0049 capital I or 0069 small i fold differently whether they |
| 573 * are first decomposed or not, so an FCD check - a check only for |
| 574 * canonical order - is not sufficient. |
| 575 */ |
| 576 if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I))
{ |
| 577 const Normalizer2 *n2; |
| 578 if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { |
| 579 n2=Normalizer2Factory::getNFDInstance(*pErrorCode); |
| 580 } else { |
| 581 n2=Normalizer2Factory::getFCDInstance(*pErrorCode); |
| 582 } |
| 583 if (U_FAILURE(*pErrorCode)) { |
| 584 return 0; |
| 585 } |
| 586 |
| 587 // check if s1 and/or s2 fulfill the FCD conditions |
| 588 const UnicodeSet *uni32; |
| 589 if(normOptions&UNORM_UNICODE_3_2) { |
| 590 uni32=uniset_getUnicode32Instance(*pErrorCode); |
| 591 } else { |
| 592 uni32=NULL; // unused |
| 593 } |
| 594 FilteredNormalizer2 fn2(*n2, *uni32); |
| 595 if(normOptions&UNORM_UNICODE_3_2) { |
| 596 n2=&fn2; |
| 597 } |
| 598 |
| 599 UnicodeString str1(length1<0, s1, length1); |
| 600 UnicodeString str2(length2<0, s2, length2); |
| 601 int32_t spanQCYes1=n2->spanQuickCheckYes(str1, *pErrorCode); |
| 602 int32_t spanQCYes2=n2->spanQuickCheckYes(str2, *pErrorCode); |
| 603 if(U_FAILURE(*pErrorCode)) { |
| 604 return 0; |
| 605 } |
| 606 |
| 607 /* |
| 608 * ICU 2.4 had a further optimization: |
| 609 * If both strings were not in FCD, then they were both NFD'ed, |
| 610 * and the _COMPARE_EQUIV option was turned off. |
| 611 * It is not entirely clear that this is valid with the current |
| 612 * definition of the canonical caseless match. |
| 613 * Therefore, ICU 2.6 removes that optimization. |
| 614 */ |
| 615 |
| 616 if(spanQCYes1<str1.length()) { |
| 617 UnicodeString unnormalized=str1.tempSubString(spanQCYes1); |
| 618 fcd1.setTo(FALSE, str1.getBuffer(), spanQCYes1); |
| 619 n2->normalizeSecondAndAppend(fcd1, unnormalized, *pErrorCode); |
| 620 s1=fcd1.getBuffer(); |
| 621 length1=fcd1.length(); |
| 622 } |
| 623 if(spanQCYes2<str2.length()) { |
| 624 UnicodeString unnormalized=str2.tempSubString(spanQCYes2); |
| 625 fcd2.setTo(FALSE, str2.getBuffer(), spanQCYes2); |
| 626 n2->normalizeSecondAndAppend(fcd2, unnormalized, *pErrorCode); |
| 627 s2=fcd2.getBuffer(); |
| 628 length2=fcd2.length(); |
| 629 } |
| 630 } |
| 631 |
| 632 if(U_SUCCESS(*pErrorCode)) { |
| 633 return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode)
; |
| 634 } else { |
| 635 return 0; |
| 636 } |
| 637 } |
| 638 |
| 639 #endif /* #if !UCONFIG_NO_NORMALIZATION */ |
OLD | NEW |