OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationfastlatin.cpp |
| 7 * |
| 8 * created on: 2013aug18 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/ucol.h" |
| 17 #include "collationdata.h" |
| 18 #include "collationfastlatin.h" |
| 19 #include "collationsettings.h" |
| 20 #include "putilimp.h" // U_ALIGN_CODE |
| 21 #include "uassert.h" |
| 22 |
| 23 U_NAMESPACE_BEGIN |
| 24 |
| 25 int32_t |
| 26 CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
s &settings, |
| 27 uint16_t *primaries, int32_t capacity) { |
| 28 const uint16_t *table = data->fastLatinTable; |
| 29 if(table == NULL) { return -1; } |
| 30 U_ASSERT(capacity == LATIN_LIMIT); |
| 31 if(capacity != LATIN_LIMIT) { return -1; } |
| 32 |
| 33 uint32_t miniVarTop; |
| 34 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { |
| 35 // No mini primaries are variable, set a variableTop just below the |
| 36 // lowest long mini primary. |
| 37 miniVarTop = MIN_LONG - 1; |
| 38 } else { |
| 39 uint32_t v1 = settings.variableTop >> 24; |
| 40 int32_t headerLength = *table & 0xff; |
| 41 int32_t i = headerLength - 1; |
| 42 if(i <= 0 || v1 > (table[i] & 0x7f)) { |
| 43 return -1; // variableTop >= digits, should not occur |
| 44 } |
| 45 while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; } |
| 46 // In the table header, the miniVarTop is in bits 15..7, with 4 zero bit
s 19..16 implied. |
| 47 // Shift right to make it comparable with long mini primaries in bits 15
..3. |
| 48 miniVarTop = (table[i] & 0xff80) >> 4; |
| 49 } |
| 50 |
| 51 const uint8_t *reorderTable = settings.reorderTable; |
| 52 if(reorderTable != NULL) { |
| 53 const uint16_t *scripts = data->scripts; |
| 54 int32_t length = data->scriptsLength; |
| 55 uint32_t prevLastByte = 0; |
| 56 for(int32_t i = 0; i < length;) { |
| 57 // reordered last byte of the group |
| 58 uint32_t lastByte = reorderTable[scripts[i] & 0xff]; |
| 59 if(lastByte < prevLastByte) { |
| 60 // The permutation affects the groups up to Latin. |
| 61 return -1; |
| 62 } |
| 63 if(scripts[i + 2] == USCRIPT_LATIN) { break; } |
| 64 i = i + 2 + scripts[i + 1]; |
| 65 prevLastByte = lastByte; |
| 66 } |
| 67 } |
| 68 |
| 69 table += (table[0] & 0xff); // skip the header |
| 70 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { |
| 71 uint32_t p = table[c]; |
| 72 if(p >= MIN_SHORT) { |
| 73 p &= SHORT_PRIMARY_MASK; |
| 74 } else if(p > miniVarTop) { |
| 75 p &= LONG_PRIMARY_MASK; |
| 76 } else { |
| 77 p = 0; |
| 78 } |
| 79 primaries[c] = (uint16_t)p; |
| 80 } |
| 81 if((settings.options & CollationSettings::NUMERIC) != 0) { |
| 82 // Bail out for digits. |
| 83 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } |
| 84 } |
| 85 |
| 86 // Shift the miniVarTop above other options. |
| 87 return ((int32_t)miniVarTop << 16) | settings.options; |
| 88 } |
| 89 |
| 90 int32_t |
| 91 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primarie
s, int32_t options, |
| 92 const UChar *left, int32_t leftLength, |
| 93 const UChar *right, int32_t rightLength) { |
| 94 // This is a modified copy of CollationCompare::compareUpToQuaternary(), |
| 95 // optimized for common Latin text. |
| 96 // Keep them in sync! |
| 97 // Keep compareUTF16() and compareUTF8() in sync very closely! |
| 98 |
| 99 U_ASSERT((table[0] >> 8) == VERSION); |
| 100 table += (table[0] & 0xff); // skip the header |
| 101 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() |
| 102 options &= 0xffff; // needed for CollationSettings::getStrength() to work |
| 103 |
| 104 // Check for supported characters, fetch mini CEs, and compare primaries. |
| 105 U_ALIGN_CODE(16); |
| 106 int32_t leftIndex = 0, rightIndex = 0; |
| 107 /** |
| 108 * Single mini CE or a pair. |
| 109 * The current mini CE is in the lower 16 bits, the next one is in the upper
16 bits. |
| 110 * If there is only one, then it is in the lower bits, and the upper bits ar
e 0. |
| 111 */ |
| 112 uint32_t leftPair = 0, rightPair = 0; |
| 113 for(;;) { |
| 114 // We fetch CEs until we get a non-ignorable primary or reach the end. |
| 115 while(leftPair == 0) { |
| 116 if(leftIndex == leftLength) { |
| 117 leftPair = EOS; |
| 118 break; |
| 119 } |
| 120 UChar32 c = left[leftIndex++]; |
| 121 if(c <= LATIN_MAX) { |
| 122 leftPair = primaries[c]; |
| 123 if(leftPair != 0) { break; } |
| 124 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMER
IC) != 0) { |
| 125 return BAIL_OUT_RESULT; |
| 126 } |
| 127 leftPair = table[c]; |
| 128 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 129 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 130 } else { |
| 131 leftPair = lookup(table, c); |
| 132 } |
| 133 if(leftPair >= MIN_SHORT) { |
| 134 leftPair &= SHORT_PRIMARY_MASK; |
| 135 break; |
| 136 } else if(leftPair > variableTop) { |
| 137 leftPair &= LONG_PRIMARY_MASK; |
| 138 break; |
| 139 } else { |
| 140 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, l
eftLength); |
| 141 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 142 leftPair = getPrimaries(variableTop, leftPair); |
| 143 } |
| 144 } |
| 145 |
| 146 while(rightPair == 0) { |
| 147 if(rightIndex == rightLength) { |
| 148 rightPair = EOS; |
| 149 break; |
| 150 } |
| 151 UChar32 c = right[rightIndex++]; |
| 152 if(c <= LATIN_MAX) { |
| 153 rightPair = primaries[c]; |
| 154 if(rightPair != 0) { break; } |
| 155 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMER
IC) != 0) { |
| 156 return BAIL_OUT_RESULT; |
| 157 } |
| 158 rightPair = table[c]; |
| 159 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 160 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 161 } else { |
| 162 rightPair = lookup(table, c); |
| 163 } |
| 164 if(rightPair >= MIN_SHORT) { |
| 165 rightPair &= SHORT_PRIMARY_MASK; |
| 166 break; |
| 167 } else if(rightPair > variableTop) { |
| 168 rightPair &= LONG_PRIMARY_MASK; |
| 169 break; |
| 170 } else { |
| 171 rightPair = nextPair(table, c, rightPair, right, NULL, rightInde
x, rightLength); |
| 172 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 173 rightPair = getPrimaries(variableTop, rightPair); |
| 174 } |
| 175 } |
| 176 |
| 177 if(leftPair == rightPair) { |
| 178 if(leftPair == EOS) { break; } |
| 179 leftPair = rightPair = 0; |
| 180 continue; |
| 181 } |
| 182 uint32_t leftPrimary = leftPair & 0xffff; |
| 183 uint32_t rightPrimary = rightPair & 0xffff; |
| 184 if(leftPrimary != rightPrimary) { |
| 185 // Return the primary difference. |
| 186 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
| 187 } |
| 188 if(leftPair == EOS) { break; } |
| 189 leftPair >>= 16; |
| 190 rightPair >>= 16; |
| 191 } |
| 192 // In the following, we need to re-fetch each character because we did not b
uffer the CEs, |
| 193 // but we know that the string is well-formed and |
| 194 // only contains supported characters and mappings. |
| 195 |
| 196 // We might skip the secondary level but continue with the case level |
| 197 // which is turned on separately. |
| 198 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
| 199 leftIndex = rightIndex = 0; |
| 200 leftPair = rightPair = 0; |
| 201 for(;;) { |
| 202 while(leftPair == 0) { |
| 203 if(leftIndex == leftLength) { |
| 204 leftPair = EOS; |
| 205 break; |
| 206 } |
| 207 UChar32 c = left[leftIndex++]; |
| 208 if(c <= LATIN_MAX) { |
| 209 leftPair = table[c]; |
| 210 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 211 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 212 } else { |
| 213 leftPair = lookup(table, c); |
| 214 } |
| 215 if(leftPair >= MIN_SHORT) { |
| 216 leftPair = getSecondariesFromOneShortCE(leftPair); |
| 217 break; |
| 218 } else if(leftPair > variableTop) { |
| 219 leftPair = COMMON_SEC_PLUS_OFFSET; |
| 220 break; |
| 221 } else { |
| 222 leftPair = nextPair(table, c, leftPair, left, NULL, leftInde
x, leftLength); |
| 223 leftPair = getSecondaries(variableTop, leftPair); |
| 224 } |
| 225 } |
| 226 |
| 227 while(rightPair == 0) { |
| 228 if(rightIndex == rightLength) { |
| 229 rightPair = EOS; |
| 230 break; |
| 231 } |
| 232 UChar32 c = right[rightIndex++]; |
| 233 if(c <= LATIN_MAX) { |
| 234 rightPair = table[c]; |
| 235 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 236 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; |
| 237 } else { |
| 238 rightPair = lookup(table, c); |
| 239 } |
| 240 if(rightPair >= MIN_SHORT) { |
| 241 rightPair = getSecondariesFromOneShortCE(rightPair); |
| 242 break; |
| 243 } else if(rightPair > variableTop) { |
| 244 rightPair = COMMON_SEC_PLUS_OFFSET; |
| 245 break; |
| 246 } else { |
| 247 rightPair = nextPair(table, c, rightPair, right, NULL, right
Index, rightLength); |
| 248 rightPair = getSecondaries(variableTop, rightPair); |
| 249 } |
| 250 } |
| 251 |
| 252 if(leftPair == rightPair) { |
| 253 if(leftPair == EOS) { break; } |
| 254 leftPair = rightPair = 0; |
| 255 continue; |
| 256 } |
| 257 uint32_t leftSecondary = leftPair & 0xffff; |
| 258 uint32_t rightSecondary = rightPair & 0xffff; |
| 259 if(leftSecondary != rightSecondary) { |
| 260 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
| 261 // Full support for backwards secondary requires backwards c
ontraction matching |
| 262 // and moving backwards between merge separators. |
| 263 return BAIL_OUT_RESULT; |
| 264 } |
| 265 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREAT
ER; |
| 266 } |
| 267 if(leftPair == EOS) { break; } |
| 268 leftPair >>= 16; |
| 269 rightPair >>= 16; |
| 270 } |
| 271 } |
| 272 |
| 273 if((options & CollationSettings::CASE_LEVEL) != 0) { |
| 274 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCO
L_PRIMARY; |
| 275 leftIndex = rightIndex = 0; |
| 276 leftPair = rightPair = 0; |
| 277 for(;;) { |
| 278 while(leftPair == 0) { |
| 279 if(leftIndex == leftLength) { |
| 280 leftPair = EOS; |
| 281 break; |
| 282 } |
| 283 UChar32 c = left[leftIndex++]; |
| 284 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 285 if(leftPair < MIN_LONG) { |
| 286 leftPair = nextPair(table, c, leftPair, left, NULL, leftInde
x, leftLength); |
| 287 } |
| 288 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); |
| 289 } |
| 290 |
| 291 while(rightPair == 0) { |
| 292 if(rightIndex == rightLength) { |
| 293 rightPair = EOS; |
| 294 break; |
| 295 } |
| 296 UChar32 c = right[rightIndex++]; |
| 297 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 298 if(rightPair < MIN_LONG) { |
| 299 rightPair = nextPair(table, c, rightPair, right, NULL, right
Index, rightLength); |
| 300 } |
| 301 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); |
| 302 } |
| 303 |
| 304 if(leftPair == rightPair) { |
| 305 if(leftPair == EOS) { break; } |
| 306 leftPair = rightPair = 0; |
| 307 continue; |
| 308 } |
| 309 uint32_t leftCase = leftPair & 0xffff; |
| 310 uint32_t rightCase = rightPair & 0xffff; |
| 311 if(leftCase != rightCase) { |
| 312 if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 313 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
| 314 } else { |
| 315 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
| 316 } |
| 317 } |
| 318 if(leftPair == EOS) { break; } |
| 319 leftPair >>= 16; |
| 320 rightPair >>= 16; |
| 321 } |
| 322 } |
| 323 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_
EQUAL; } |
| 324 |
| 325 // Remove the case bits from the tertiary weight when caseLevel is on or cas
eFirst is off. |
| 326 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); |
| 327 |
| 328 leftIndex = rightIndex = 0; |
| 329 leftPair = rightPair = 0; |
| 330 for(;;) { |
| 331 while(leftPair == 0) { |
| 332 if(leftIndex == leftLength) { |
| 333 leftPair = EOS; |
| 334 break; |
| 335 } |
| 336 UChar32 c = left[leftIndex++]; |
| 337 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 338 if(leftPair < MIN_LONG) { |
| 339 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, l
eftLength); |
| 340 } |
| 341 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); |
| 342 } |
| 343 |
| 344 while(rightPair == 0) { |
| 345 if(rightIndex == rightLength) { |
| 346 rightPair = EOS; |
| 347 break; |
| 348 } |
| 349 UChar32 c = right[rightIndex++]; |
| 350 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 351 if(rightPair < MIN_LONG) { |
| 352 rightPair = nextPair(table, c, rightPair, right, NULL, rightInde
x, rightLength); |
| 353 } |
| 354 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); |
| 355 } |
| 356 |
| 357 if(leftPair == rightPair) { |
| 358 if(leftPair == EOS) { break; } |
| 359 leftPair = rightPair = 0; |
| 360 continue; |
| 361 } |
| 362 uint32_t leftTertiary = leftPair & 0xffff; |
| 363 uint32_t rightTertiary = rightPair & 0xffff; |
| 364 if(leftTertiary != rightTertiary) { |
| 365 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
| 366 // Pass through EOS and MERGE_WEIGHT |
| 367 // and keep real tertiary weights larger than the MERGE_WEIGHT. |
| 368 // Tertiary CEs (secondary ignorables) are not supported in fast
Latin. |
| 369 if(leftTertiary > MERGE_WEIGHT) { |
| 370 leftTertiary ^= CASE_MASK; |
| 371 } |
| 372 if(rightTertiary > MERGE_WEIGHT) { |
| 373 rightTertiary ^= CASE_MASK; |
| 374 } |
| 375 } |
| 376 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
| 377 } |
| 378 if(leftPair == EOS) { break; } |
| 379 leftPair >>= 16; |
| 380 rightPair >>= 16; |
| 381 } |
| 382 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_E
QUAL; } |
| 383 |
| 384 leftIndex = rightIndex = 0; |
| 385 leftPair = rightPair = 0; |
| 386 for(;;) { |
| 387 while(leftPair == 0) { |
| 388 if(leftIndex == leftLength) { |
| 389 leftPair = EOS; |
| 390 break; |
| 391 } |
| 392 UChar32 c = left[leftIndex++]; |
| 393 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 394 if(leftPair < MIN_LONG) { |
| 395 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, l
eftLength); |
| 396 } |
| 397 leftPair = getQuaternaries(variableTop, leftPair); |
| 398 } |
| 399 |
| 400 while(rightPair == 0) { |
| 401 if(rightIndex == rightLength) { |
| 402 rightPair = EOS; |
| 403 break; |
| 404 } |
| 405 UChar32 c = right[rightIndex++]; |
| 406 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
| 407 if(rightPair < MIN_LONG) { |
| 408 rightPair = nextPair(table, c, rightPair, right, NULL, rightInde
x, rightLength); |
| 409 } |
| 410 rightPair = getQuaternaries(variableTop, rightPair); |
| 411 } |
| 412 |
| 413 if(leftPair == rightPair) { |
| 414 if(leftPair == EOS) { break; } |
| 415 leftPair = rightPair = 0; |
| 416 continue; |
| 417 } |
| 418 uint32_t leftQuaternary = leftPair & 0xffff; |
| 419 uint32_t rightQuaternary = rightPair & 0xffff; |
| 420 if(leftQuaternary != rightQuaternary) { |
| 421 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER
; |
| 422 } |
| 423 if(leftPair == EOS) { break; } |
| 424 leftPair >>= 16; |
| 425 rightPair >>= 16; |
| 426 } |
| 427 return UCOL_EQUAL; |
| 428 } |
| 429 |
| 430 int32_t |
| 431 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries
, int32_t options, |
| 432 const uint8_t *left, int32_t leftLength, |
| 433 const uint8_t *right, int32_t rightLength) { |
| 434 // Keep compareUTF16() and compareUTF8() in sync very closely! |
| 435 |
| 436 U_ASSERT((table[0] >> 8) == VERSION); |
| 437 table += (table[0] & 0xff); // skip the header |
| 438 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::g
etFastLatinOptions() |
| 439 options &= 0xffff; // needed for CollationSettings::getStrength() to work |
| 440 |
| 441 // Check for supported characters, fetch mini CEs, and compare primaries. |
| 442 U_ALIGN_CODE(16); |
| 443 int32_t leftIndex = 0, rightIndex = 0; |
| 444 /** |
| 445 * Single mini CE or a pair. |
| 446 * The current mini CE is in the lower 16 bits, the next one is in the upper
16 bits. |
| 447 * If there is only one, then it is in the lower bits, and the upper bits ar
e 0. |
| 448 */ |
| 449 uint32_t leftPair = 0, rightPair = 0; |
| 450 // Note: There is no need to assemble the code point. |
| 451 // We only need to look up the table entry for the character, |
| 452 // and nextPair() looks for whether c==0. |
| 453 for(;;) { |
| 454 // We fetch CEs until we get a non-ignorable primary or reach the end. |
| 455 while(leftPair == 0) { |
| 456 if(leftIndex == leftLength) { |
| 457 leftPair = EOS; |
| 458 break; |
| 459 } |
| 460 UChar32 c = left[leftIndex++]; |
| 461 uint8_t t; |
| 462 if(c <= 0x7f) { |
| 463 leftPair = primaries[c]; |
| 464 if(leftPair != 0) { break; } |
| 465 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMER
IC) != 0) { |
| 466 return BAIL_OUT_RESULT; |
| 467 } |
| 468 leftPair = table[c]; |
| 469 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != left
Length && |
| 470 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { |
| 471 ++leftIndex; |
| 472 c = ((c - 0xc2) << 6) + t; |
| 473 leftPair = primaries[c]; |
| 474 if(leftPair != 0) { break; } |
| 475 leftPair = table[c]; |
| 476 } else { |
| 477 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); |
| 478 } |
| 479 if(leftPair >= MIN_SHORT) { |
| 480 leftPair &= SHORT_PRIMARY_MASK; |
| 481 break; |
| 482 } else if(leftPair > variableTop) { |
| 483 leftPair &= LONG_PRIMARY_MASK; |
| 484 break; |
| 485 } else { |
| 486 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, l
eftLength); |
| 487 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 488 leftPair = getPrimaries(variableTop, leftPair); |
| 489 } |
| 490 } |
| 491 |
| 492 while(rightPair == 0) { |
| 493 if(rightIndex == rightLength) { |
| 494 rightPair = EOS; |
| 495 break; |
| 496 } |
| 497 UChar32 c = right[rightIndex++]; |
| 498 uint8_t t; |
| 499 if(c <= 0x7f) { |
| 500 rightPair = primaries[c]; |
| 501 if(rightPair != 0) { break; } |
| 502 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMER
IC) != 0) { |
| 503 return BAIL_OUT_RESULT; |
| 504 } |
| 505 rightPair = table[c]; |
| 506 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rig
htLength && |
| 507 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { |
| 508 ++rightIndex; |
| 509 c = ((c - 0xc2) << 6) + t; |
| 510 rightPair = primaries[c]; |
| 511 if(rightPair != 0) { break; } |
| 512 rightPair = table[c]; |
| 513 } else { |
| 514 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength)
; |
| 515 } |
| 516 if(rightPair >= MIN_SHORT) { |
| 517 rightPair &= SHORT_PRIMARY_MASK; |
| 518 break; |
| 519 } else if(rightPair > variableTop) { |
| 520 rightPair &= LONG_PRIMARY_MASK; |
| 521 break; |
| 522 } else { |
| 523 rightPair = nextPair(table, c, rightPair, NULL, right, rightInde
x, rightLength); |
| 524 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
| 525 rightPair = getPrimaries(variableTop, rightPair); |
| 526 } |
| 527 } |
| 528 |
| 529 if(leftPair == rightPair) { |
| 530 if(leftPair == EOS) { break; } |
| 531 leftPair = rightPair = 0; |
| 532 continue; |
| 533 } |
| 534 uint32_t leftPrimary = leftPair & 0xffff; |
| 535 uint32_t rightPrimary = rightPair & 0xffff; |
| 536 if(leftPrimary != rightPrimary) { |
| 537 // Return the primary difference. |
| 538 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
| 539 } |
| 540 if(leftPair == EOS) { break; } |
| 541 leftPair >>= 16; |
| 542 rightPair >>= 16; |
| 543 } |
| 544 // In the following, we need to re-fetch each character because we did not b
uffer the CEs, |
| 545 // but we know that the string is well-formed and |
| 546 // only contains supported characters and mappings. |
| 547 |
| 548 // We might skip the secondary level but continue with the case level |
| 549 // which is turned on separately. |
| 550 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
| 551 leftIndex = rightIndex = 0; |
| 552 leftPair = rightPair = 0; |
| 553 for(;;) { |
| 554 while(leftPair == 0) { |
| 555 if(leftIndex == leftLength) { |
| 556 leftPair = EOS; |
| 557 break; |
| 558 } |
| 559 UChar32 c = left[leftIndex++]; |
| 560 if(c <= 0x7f) { |
| 561 leftPair = table[c]; |
| 562 } else if(c <= LATIN_MAX_UTF8_LEAD) { |
| 563 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; |
| 564 } else { |
| 565 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); |
| 566 } |
| 567 if(leftPair >= MIN_SHORT) { |
| 568 leftPair = getSecondariesFromOneShortCE(leftPair); |
| 569 break; |
| 570 } else if(leftPair > variableTop) { |
| 571 leftPair = COMMON_SEC_PLUS_OFFSET; |
| 572 break; |
| 573 } else { |
| 574 leftPair = nextPair(table, c, leftPair, NULL, left, leftInde
x, leftLength); |
| 575 leftPair = getSecondaries(variableTop, leftPair); |
| 576 } |
| 577 } |
| 578 |
| 579 while(rightPair == 0) { |
| 580 if(rightIndex == rightLength) { |
| 581 rightPair = EOS; |
| 582 break; |
| 583 } |
| 584 UChar32 c = right[rightIndex++]; |
| 585 if(c <= 0x7f) { |
| 586 rightPair = table[c]; |
| 587 } else if(c <= LATIN_MAX_UTF8_LEAD) { |
| 588 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; |
| 589 } else { |
| 590 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); |
| 591 } |
| 592 if(rightPair >= MIN_SHORT) { |
| 593 rightPair = getSecondariesFromOneShortCE(rightPair); |
| 594 break; |
| 595 } else if(rightPair > variableTop) { |
| 596 rightPair = COMMON_SEC_PLUS_OFFSET; |
| 597 break; |
| 598 } else { |
| 599 rightPair = nextPair(table, c, rightPair, NULL, right, right
Index, rightLength); |
| 600 rightPair = getSecondaries(variableTop, rightPair); |
| 601 } |
| 602 } |
| 603 |
| 604 if(leftPair == rightPair) { |
| 605 if(leftPair == EOS) { break; } |
| 606 leftPair = rightPair = 0; |
| 607 continue; |
| 608 } |
| 609 uint32_t leftSecondary = leftPair & 0xffff; |
| 610 uint32_t rightSecondary = rightPair & 0xffff; |
| 611 if(leftSecondary != rightSecondary) { |
| 612 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
| 613 // Full support for backwards secondary requires backwards c
ontraction matching |
| 614 // and moving backwards between merge separators. |
| 615 return BAIL_OUT_RESULT; |
| 616 } |
| 617 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREAT
ER; |
| 618 } |
| 619 if(leftPair == EOS) { break; } |
| 620 leftPair >>= 16; |
| 621 rightPair >>= 16; |
| 622 } |
| 623 } |
| 624 |
| 625 if((options & CollationSettings::CASE_LEVEL) != 0) { |
| 626 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCO
L_PRIMARY; |
| 627 leftIndex = rightIndex = 0; |
| 628 leftPair = rightPair = 0; |
| 629 for(;;) { |
| 630 while(leftPair == 0) { |
| 631 if(leftIndex == leftLength) { |
| 632 leftPair = EOS; |
| 633 break; |
| 634 } |
| 635 UChar32 c = left[leftIndex++]; |
| 636 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, l
eft, leftIndex); |
| 637 if(leftPair < MIN_LONG) { |
| 638 leftPair = nextPair(table, c, leftPair, NULL, left, leftInde
x, leftLength); |
| 639 } |
| 640 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); |
| 641 } |
| 642 |
| 643 while(rightPair == 0) { |
| 644 if(rightIndex == rightLength) { |
| 645 rightPair = EOS; |
| 646 break; |
| 647 } |
| 648 UChar32 c = right[rightIndex++]; |
| 649 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c,
right, rightIndex); |
| 650 if(rightPair < MIN_LONG) { |
| 651 rightPair = nextPair(table, c, rightPair, NULL, right, right
Index, rightLength); |
| 652 } |
| 653 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); |
| 654 } |
| 655 |
| 656 if(leftPair == rightPair) { |
| 657 if(leftPair == EOS) { break; } |
| 658 leftPair = rightPair = 0; |
| 659 continue; |
| 660 } |
| 661 uint32_t leftCase = leftPair & 0xffff; |
| 662 uint32_t rightCase = rightPair & 0xffff; |
| 663 if(leftCase != rightCase) { |
| 664 if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 665 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
| 666 } else { |
| 667 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
| 668 } |
| 669 } |
| 670 if(leftPair == EOS) { break; } |
| 671 leftPair >>= 16; |
| 672 rightPair >>= 16; |
| 673 } |
| 674 } |
| 675 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_
EQUAL; } |
| 676 |
| 677 // Remove the case bits from the tertiary weight when caseLevel is on or cas
eFirst is off. |
| 678 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); |
| 679 |
| 680 leftIndex = rightIndex = 0; |
| 681 leftPair = rightPair = 0; |
| 682 for(;;) { |
| 683 while(leftPair == 0) { |
| 684 if(leftIndex == leftLength) { |
| 685 leftPair = EOS; |
| 686 break; |
| 687 } |
| 688 UChar32 c = left[leftIndex++]; |
| 689 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left,
leftIndex); |
| 690 if(leftPair < MIN_LONG) { |
| 691 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, l
eftLength); |
| 692 } |
| 693 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); |
| 694 } |
| 695 |
| 696 while(rightPair == 0) { |
| 697 if(rightIndex == rightLength) { |
| 698 rightPair = EOS; |
| 699 break; |
| 700 } |
| 701 UChar32 c = right[rightIndex++]; |
| 702 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, righ
t, rightIndex); |
| 703 if(rightPair < MIN_LONG) { |
| 704 rightPair = nextPair(table, c, rightPair, NULL, right, rightInde
x, rightLength); |
| 705 } |
| 706 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); |
| 707 } |
| 708 |
| 709 if(leftPair == rightPair) { |
| 710 if(leftPair == EOS) { break; } |
| 711 leftPair = rightPair = 0; |
| 712 continue; |
| 713 } |
| 714 uint32_t leftTertiary = leftPair & 0xffff; |
| 715 uint32_t rightTertiary = rightPair & 0xffff; |
| 716 if(leftTertiary != rightTertiary) { |
| 717 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
| 718 // Pass through EOS and MERGE_WEIGHT |
| 719 // and keep real tertiary weights larger than the MERGE_WEIGHT. |
| 720 // Tertiary CEs (secondary ignorables) are not supported in fast
Latin. |
| 721 if(leftTertiary > MERGE_WEIGHT) { |
| 722 leftTertiary ^= CASE_MASK; |
| 723 } |
| 724 if(rightTertiary > MERGE_WEIGHT) { |
| 725 rightTertiary ^= CASE_MASK; |
| 726 } |
| 727 } |
| 728 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
| 729 } |
| 730 if(leftPair == EOS) { break; } |
| 731 leftPair >>= 16; |
| 732 rightPair >>= 16; |
| 733 } |
| 734 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_E
QUAL; } |
| 735 |
| 736 leftIndex = rightIndex = 0; |
| 737 leftPair = rightPair = 0; |
| 738 for(;;) { |
| 739 while(leftPair == 0) { |
| 740 if(leftIndex == leftLength) { |
| 741 leftPair = EOS; |
| 742 break; |
| 743 } |
| 744 UChar32 c = left[leftIndex++]; |
| 745 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left,
leftIndex); |
| 746 if(leftPair < MIN_LONG) { |
| 747 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, l
eftLength); |
| 748 } |
| 749 leftPair = getQuaternaries(variableTop, leftPair); |
| 750 } |
| 751 |
| 752 while(rightPair == 0) { |
| 753 if(rightIndex == rightLength) { |
| 754 rightPair = EOS; |
| 755 break; |
| 756 } |
| 757 UChar32 c = right[rightIndex++]; |
| 758 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, righ
t, rightIndex); |
| 759 if(rightPair < MIN_LONG) { |
| 760 rightPair = nextPair(table, c, rightPair, NULL, right, rightInde
x, rightLength); |
| 761 } |
| 762 rightPair = getQuaternaries(variableTop, rightPair); |
| 763 } |
| 764 |
| 765 if(leftPair == rightPair) { |
| 766 if(leftPair == EOS) { break; } |
| 767 leftPair = rightPair = 0; |
| 768 continue; |
| 769 } |
| 770 uint32_t leftQuaternary = leftPair & 0xffff; |
| 771 uint32_t rightQuaternary = rightPair & 0xffff; |
| 772 if(leftQuaternary != rightQuaternary) { |
| 773 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER
; |
| 774 } |
| 775 if(leftPair == EOS) { break; } |
| 776 leftPair >>= 16; |
| 777 rightPair >>= 16; |
| 778 } |
| 779 return UCOL_EQUAL; |
| 780 } |
| 781 |
| 782 uint32_t |
| 783 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { |
| 784 U_ASSERT(c > LATIN_MAX); |
| 785 if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
| 786 return table[c - PUNCT_START + LATIN_LIMIT]; |
| 787 } else if(c == 0xfffe) { |
| 788 return MERGE_WEIGHT; |
| 789 } else if(c == 0xffff) { |
| 790 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; |
| 791 } else { |
| 792 return BAIL_OUT; |
| 793 } |
| 794 } |
| 795 |
| 796 uint32_t |
| 797 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, |
| 798 const uint8_t *s8, int32_t &sIndex, int32_t sLeng
th) { |
| 799 // The caller handled ASCII and valid/supported Latin. |
| 800 U_ASSERT(c > 0x7f); |
| 801 int32_t i2 = sIndex + 1; |
| 802 if(i2 < sLength || sLength < 0) { |
| 803 uint8_t t1 = s8[sIndex]; |
| 804 uint8_t t2 = s8[i2]; |
| 805 sIndex += 2; |
| 806 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { |
| 807 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01B
F |
| 808 } else if(c == 0xef && t1 == 0xbf) { |
| 809 if(t2 == 0xbe) { |
| 810 return MERGE_WEIGHT; // U+FFFE |
| 811 } else if(t2 == 0xbf) { |
| 812 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+F
FFF |
| 813 } |
| 814 } |
| 815 } |
| 816 return BAIL_OUT; |
| 817 } |
| 818 |
| 819 uint32_t |
| 820 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, |
| 821 const uint8_t *s8, int32_t &sIndex) { |
| 822 // The caller handled ASCII. |
| 823 // The string is well-formed and contains only supported characters. |
| 824 U_ASSERT(c > 0x7f); |
| 825 if(c <= LATIN_MAX_UTF8_LEAD) { |
| 826 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F |
| 827 } |
| 828 uint8_t t2 = s8[sIndex + 1]; |
| 829 sIndex += 2; |
| 830 if(c == 0xe2) { |
| 831 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF |
| 832 } else if(t2 == 0xbe) { |
| 833 return MERGE_WEIGHT; // U+FFFE |
| 834 } else { |
| 835 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF |
| 836 } |
| 837 } |
| 838 |
| 839 uint32_t |
| 840 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, |
| 841 const UChar *s16, const uint8_t *s8, int32_t &sInde
x, int32_t &sLength) { |
| 842 if(ce >= MIN_LONG || ce < CONTRACTION) { |
| 843 return ce; // simple or special mini CE |
| 844 } else if(ce >= EXPANSION) { |
| 845 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); |
| 846 return ((uint32_t)table[index + 1] << 16) | table[index]; |
| 847 } else /* ce >= CONTRACTION */ { |
| 848 if(c == 0 && sLength < 0) { |
| 849 sLength = sIndex - 1; |
| 850 return EOS; |
| 851 } |
| 852 // Contraction list: Default mapping followed by |
| 853 // 0 or more single-character contraction suffix mappings. |
| 854 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); |
| 855 if(sIndex != sLength) { |
| 856 // Read the next character. |
| 857 int32_t c2; |
| 858 int32_t nextIndex = sIndex; |
| 859 if(s16 != NULL) { |
| 860 c2 = s16[nextIndex++]; |
| 861 if(c2 > LATIN_MAX) { |
| 862 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { |
| 863 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0
180..01BF |
| 864 } else if(c2 == 0xfffe || c2 == 0xffff) { |
| 865 c2 = -1; // U+FFFE & U+FFFF cannot occur in contraction
s. |
| 866 } else { |
| 867 return BAIL_OUT; |
| 868 } |
| 869 } |
| 870 } else { |
| 871 c2 = s8[nextIndex++]; |
| 872 if(c2 > 0x7f) { |
| 873 uint8_t t; |
| 874 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && |
| 875 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { |
| 876 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F |
| 877 ++nextIndex; |
| 878 } else { |
| 879 int32_t i2 = nextIndex + 1; |
| 880 if(i2 < sLength || sLength < 0) { |
| 881 if(c2 == 0xe2 && s8[nextIndex] == 0x80 && |
| 882 0x80 <= (t = s8[i2]) && t <= 0xbf) { |
| 883 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F ->
0180..01BF |
| 884 } else if(c2 == 0xef && s8[nextIndex] == 0xbf && |
| 885 ((t = s8[i2]) == 0xbe || t == 0xbf)) { |
| 886 c2 = -1; // U+FFFE & U+FFFF cannot occur in con
tractions. |
| 887 } else { |
| 888 return BAIL_OUT; |
| 889 } |
| 890 } else { |
| 891 return BAIL_OUT; |
| 892 } |
| 893 nextIndex += 2; |
| 894 } |
| 895 } |
| 896 } |
| 897 if(c2 == 0 && sLength < 0) { |
| 898 sLength = sIndex; |
| 899 c2 = -1; |
| 900 } |
| 901 // Look for the next character in the contraction suffix list, |
| 902 // which is in ascending order of single suffix characters. |
| 903 int32_t i = index; |
| 904 int32_t head = table[i]; // first skip the default mapping |
| 905 int32_t x; |
| 906 do { |
| 907 i += head >> CONTR_LENGTH_SHIFT; |
| 908 head = table[i]; |
| 909 x = head & CONTR_CHAR_MASK; |
| 910 } while(x < c2); |
| 911 if(x == c2) { |
| 912 index = i; |
| 913 sIndex = nextIndex; |
| 914 } |
| 915 } |
| 916 // Return the CE or CEs for the default or contraction mapping. |
| 917 int32_t length = table[index] >> CONTR_LENGTH_SHIFT; |
| 918 if(length == 1) { |
| 919 return BAIL_OUT; |
| 920 } |
| 921 ce = table[index + 1]; |
| 922 if(length == 2) { |
| 923 return ce; |
| 924 } else { |
| 925 return ((uint32_t)table[index + 2] << 16) | ce; |
| 926 } |
| 927 } |
| 928 } |
| 929 |
| 930 uint32_t |
| 931 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { |
| 932 if(pair <= 0xffff) { |
| 933 // one mini CE |
| 934 if(pair >= MIN_SHORT) { |
| 935 pair = getSecondariesFromOneShortCE(pair); |
| 936 } else if(pair > variableTop) { |
| 937 pair = COMMON_SEC_PLUS_OFFSET; |
| 938 } else if(pair >= MIN_LONG) { |
| 939 pair = 0; // variable |
| 940 } |
| 941 // else special mini CE |
| 942 } else { |
| 943 uint32_t ce = pair & 0xffff; |
| 944 if(ce >= MIN_SHORT) { |
| 945 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; |
| 946 } else if(ce > variableTop) { |
| 947 pair = TWO_COMMON_SEC_PLUS_OFFSET; |
| 948 } else { |
| 949 U_ASSERT(ce >= MIN_LONG); |
| 950 pair = 0; // variable |
| 951 } |
| 952 } |
| 953 return pair; |
| 954 } |
| 955 |
| 956 uint32_t |
| 957 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint
32_t pair) { |
| 958 // Primary+caseLevel: Ignore case level weights of primary ignorables. |
| 959 // Otherwise: Ignore case level weights of secondary ignorables. |
| 960 // For details see the comments in the CollationCompare class. |
| 961 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
| 962 if(pair <= 0xffff) { |
| 963 // one mini CE |
| 964 if(pair >= MIN_SHORT) { |
| 965 // A high secondary weight means we really have two CEs, |
| 966 // a primary CE and a secondary CE. |
| 967 uint32_t ce = pair; |
| 968 pair &= CASE_MASK; // explicit weight of primary CE |
| 969 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 970 pair |= LOWER_CASE << 16; // implied weight of secondary CE |
| 971 } |
| 972 } else if(pair > variableTop) { |
| 973 pair = LOWER_CASE; |
| 974 } else if(pair >= MIN_LONG) { |
| 975 pair = 0; // variable |
| 976 } |
| 977 // else special mini CE |
| 978 } else { |
| 979 // two mini CEs, same primary groups, neither expands like above |
| 980 uint32_t ce = pair & 0xffff; |
| 981 if(ce >= MIN_SHORT) { |
| 982 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { |
| 983 pair &= CASE_MASK; |
| 984 } else { |
| 985 pair &= TWO_CASES_MASK; |
| 986 } |
| 987 } else if(ce > variableTop) { |
| 988 pair = TWO_LOWER_CASES; |
| 989 } else { |
| 990 U_ASSERT(ce >= MIN_LONG); |
| 991 pair = 0; // variable |
| 992 } |
| 993 } |
| 994 return pair; |
| 995 } |
| 996 |
| 997 uint32_t |
| 998 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint
32_t pair) { |
| 999 if(pair <= 0xffff) { |
| 1000 // one mini CE |
| 1001 if(pair >= MIN_SHORT) { |
| 1002 // A high secondary weight means we really have two CEs, |
| 1003 // a primary CE and a secondary CE. |
| 1004 uint32_t ce = pair; |
| 1005 if(withCaseBits) { |
| 1006 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; |
| 1007 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 1008 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; |
| 1009 } |
| 1010 } else { |
| 1011 pair = (pair & TERTIARY_MASK) + TER_OFFSET; |
| 1012 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 1013 pair |= COMMON_TER_PLUS_OFFSET << 16; |
| 1014 } |
| 1015 } |
| 1016 } else if(pair > variableTop) { |
| 1017 pair = (pair & TERTIARY_MASK) + TER_OFFSET; |
| 1018 if(withCaseBits) { |
| 1019 pair |= LOWER_CASE; |
| 1020 } |
| 1021 } else if(pair >= MIN_LONG) { |
| 1022 pair = 0; // variable |
| 1023 } |
| 1024 // else special mini CE |
| 1025 } else { |
| 1026 // two mini CEs, same primary groups, neither expands like above |
| 1027 uint32_t ce = pair & 0xffff; |
| 1028 if(ce >= MIN_SHORT) { |
| 1029 if(withCaseBits) { |
| 1030 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; |
| 1031 } else { |
| 1032 pair &= TWO_TERTIARIES_MASK; |
| 1033 } |
| 1034 pair += TWO_TER_OFFSETS; |
| 1035 } else if(ce > variableTop) { |
| 1036 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; |
| 1037 if(withCaseBits) { |
| 1038 pair |= TWO_LOWER_CASES; |
| 1039 } |
| 1040 } else { |
| 1041 U_ASSERT(ce >= MIN_LONG); |
| 1042 pair = 0; // variable |
| 1043 } |
| 1044 } |
| 1045 return pair; |
| 1046 } |
| 1047 |
| 1048 uint32_t |
| 1049 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { |
| 1050 // Return the primary weight of a variable CE, |
| 1051 // or the maximum primary weight for a non-variable, not-completely-ignorabl
e CE. |
| 1052 if(pair <= 0xffff) { |
| 1053 // one mini CE |
| 1054 if(pair >= MIN_SHORT) { |
| 1055 // A high secondary weight means we really have two CEs, |
| 1056 // a primary CE and a secondary CE. |
| 1057 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
| 1058 pair = TWO_SHORT_PRIMARIES_MASK; |
| 1059 } else { |
| 1060 pair = SHORT_PRIMARY_MASK; |
| 1061 } |
| 1062 } else if(pair > variableTop) { |
| 1063 pair = SHORT_PRIMARY_MASK; |
| 1064 } else if(pair >= MIN_LONG) { |
| 1065 pair &= LONG_PRIMARY_MASK; // variable |
| 1066 } |
| 1067 // else special mini CE |
| 1068 } else { |
| 1069 // two mini CEs, same primary groups, neither expands like above |
| 1070 uint32_t ce = pair & 0xffff; |
| 1071 if(ce > variableTop) { |
| 1072 pair = TWO_SHORT_PRIMARIES_MASK; |
| 1073 } else { |
| 1074 U_ASSERT(ce >= MIN_LONG); |
| 1075 pair &= TWO_LONG_PRIMARIES_MASK; // variable |
| 1076 } |
| 1077 } |
| 1078 return pair; |
| 1079 } |
| 1080 |
| 1081 U_NAMESPACE_END |
| 1082 |
| 1083 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |