| OLD | NEW |
| 1 /* | 1 /* |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines | 3 * Copyright (C) 2012-2015, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 * collationkeys.cpp | 6 * collationkeys.cpp |
| 7 * | 7 * |
| 8 * created on: 2012sep02 | 8 * created on: 2012sep02 |
| 9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
| 10 */ | 10 */ |
| 11 | 11 |
| 12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
| 13 | 13 |
| (...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 239 levels &= ~(((uint32_t)1 << minLevel) - 1); | 239 levels &= ~(((uint32_t)1 << minLevel) - 1); |
| 240 if(levels == 0) { return; } | 240 if(levels == 0) { return; } |
| 241 | 241 |
| 242 uint32_t variableTop; | 242 uint32_t variableTop; |
| 243 if((options & CollationSettings::ALTERNATE_MASK) == 0) { | 243 if((options & CollationSettings::ALTERNATE_MASK) == 0) { |
| 244 variableTop = 0; | 244 variableTop = 0; |
| 245 } else { | 245 } else { |
| 246 // +1 so that we can use "<" and primary ignorables test out early. | 246 // +1 so that we can use "<" and primary ignorables test out early. |
| 247 variableTop = settings.variableTop + 1; | 247 variableTop = settings.variableTop + 1; |
| 248 } | 248 } |
| 249 const uint8_t *reorderTable = settings.reorderTable; | |
| 250 | 249 |
| 251 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); | 250 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); |
| 252 | 251 |
| 253 SortKeyLevel cases; | 252 SortKeyLevel cases; |
| 254 SortKeyLevel secondaries; | 253 SortKeyLevel secondaries; |
| 255 SortKeyLevel tertiaries; | 254 SortKeyLevel tertiaries; |
| 256 SortKeyLevel quaternaries; | 255 SortKeyLevel quaternaries; |
| 257 | 256 |
| 258 uint32_t compressedP1 = 0; // 0==no compression; otherwise reordered compre
ssible lead byte | 257 uint32_t prevReorderedPrimary = 0; // 0==no compression |
| 259 int32_t commonCases = 0; | 258 int32_t commonCases = 0; |
| 260 int32_t commonSecondaries = 0; | 259 int32_t commonSecondaries = 0; |
| 261 int32_t commonTertiaries = 0; | 260 int32_t commonTertiaries = 0; |
| 262 int32_t commonQuaternaries = 0; | 261 int32_t commonQuaternaries = 0; |
| 263 | 262 |
| 264 uint32_t prevSecondary = 0; | 263 uint32_t prevSecondary = 0; |
| 265 UBool anyMergeSeparators = FALSE; | 264 int32_t secSegmentStart = 0; |
| 266 | 265 |
| 267 for(;;) { | 266 for(;;) { |
| 268 // No need to keep all CEs in the buffer when we write a sort key. | 267 // No need to keep all CEs in the buffer when we write a sort key. |
| 269 iter.clearCEsIfNoneRemaining(); | 268 iter.clearCEsIfNoneRemaining(); |
| 270 int64_t ce = iter.nextCE(errorCode); | 269 int64_t ce = iter.nextCE(errorCode); |
| 271 uint32_t p = (uint32_t)(ce >> 32); | 270 uint32_t p = (uint32_t)(ce >> 32); |
| 272 if(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY) { | 271 if(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY) { |
| 273 // Variable CE, shift it to quaternary level. | 272 // Variable CE, shift it to quaternary level. |
| 274 // Ignore all following primary ignorables, and shift further variab
le CEs. | 273 // Ignore all following primary ignorables, and shift further variab
le CEs. |
| 275 if(commonQuaternaries != 0) { | 274 if(commonQuaternaries != 0) { |
| 276 --commonQuaternaries; | 275 --commonQuaternaries; |
| 277 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { | 276 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { |
| 278 quaternaries.appendByte(QUAT_COMMON_MIDDLE); | 277 quaternaries.appendByte(QUAT_COMMON_MIDDLE); |
| 279 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; | 278 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; |
| 280 } | 279 } |
| 281 // Shifted primary weights are lower than the common weight. | 280 // Shifted primary weights are lower than the common weight. |
| 282 quaternaries.appendByte(QUAT_COMMON_LOW + commonQuaternaries); | 281 quaternaries.appendByte(QUAT_COMMON_LOW + commonQuaternaries); |
| 283 commonQuaternaries = 0; | 282 commonQuaternaries = 0; |
| 284 } | 283 } |
| 285 do { | 284 do { |
| 286 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { | 285 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
| 287 uint32_t p1 = p >> 24; | 286 if(settings.hasReordering()) { |
| 288 if(reorderTable != NULL) { p1 = reorderTable[p1]; } | 287 p = settings.reorder(p); |
| 289 if(p1 >= QUAT_SHIFTED_LIMIT_BYTE) { | 288 } |
| 289 if((p >> 24) >= QUAT_SHIFTED_LIMIT_BYTE) { |
| 290 // Prevent shifted primary lead bytes from | 290 // Prevent shifted primary lead bytes from |
| 291 // overlapping with the common compression range. | 291 // overlapping with the common compression range. |
| 292 quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE); | 292 quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE); |
| 293 } | 293 } |
| 294 quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff)); | 294 quaternaries.appendWeight32(p); |
| 295 } | 295 } |
| 296 do { | 296 do { |
| 297 ce = iter.nextCE(errorCode); | 297 ce = iter.nextCE(errorCode); |
| 298 p = (uint32_t)(ce >> 32); | 298 p = (uint32_t)(ce >> 32); |
| 299 } while(p == 0); | 299 } while(p == 0); |
| 300 } while(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY); | 300 } while(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY); |
| 301 } | 301 } |
| 302 // ce could be primary ignorable, or NO_CE, or the merge separator, | 302 // ce could be primary ignorable, or NO_CE, or the merge separator, |
| 303 // or a regular primary CE, but it is not variable. | 303 // or a regular primary CE, but it is not variable. |
| 304 // If ce==NO_CE, then write nothing for the primary level but | 304 // If ce==NO_CE, then write nothing for the primary level but |
| 305 // terminate compression on all levels and then exit the loop. | 305 // terminate compression on all levels and then exit the loop. |
| 306 if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FL
AG) != 0) { | 306 if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FL
AG) != 0) { |
| 307 // Test the un-reordered primary for compressibility. |
| 308 UBool isCompressible = compressibleBytes[p >> 24]; |
| 309 if(settings.hasReordering()) { |
| 310 p = settings.reorder(p); |
| 311 } |
| 307 uint32_t p1 = p >> 24; | 312 uint32_t p1 = p >> 24; |
| 308 if(reorderTable != NULL) { p1 = reorderTable[p1]; } | 313 if(!isCompressible || p1 != (prevReorderedPrimary >> 24)) { |
| 309 if(p1 != compressedP1) { | 314 if(prevReorderedPrimary != 0) { |
| 310 if(compressedP1 != 0) { | 315 if(p < prevReorderedPrimary) { |
| 311 if(p1 < compressedP1) { | |
| 312 // No primary compression terminator | 316 // No primary compression terminator |
| 313 // at the end of the level or merged segment. | 317 // at the end of the level or merged segment. |
| 314 if(p1 > Collation::MERGE_SEPARATOR_BYTE) { | 318 if(p1 > Collation::MERGE_SEPARATOR_BYTE) { |
| 315 sink.Append(Collation::PRIMARY_COMPRESSION_LOW_BYTE)
; | 319 sink.Append(Collation::PRIMARY_COMPRESSION_LOW_BYTE)
; |
| 316 } | 320 } |
| 317 } else { | 321 } else { |
| 318 sink.Append(Collation::PRIMARY_COMPRESSION_HIGH_BYTE); | 322 sink.Append(Collation::PRIMARY_COMPRESSION_HIGH_BYTE); |
| 319 } | 323 } |
| 320 } | 324 } |
| 321 sink.Append(p1); | 325 sink.Append(p1); |
| 322 // Test the un-reordered lead byte for compressibility but | 326 if(isCompressible) { |
| 323 // remember the reordered lead byte. | 327 prevReorderedPrimary = p; |
| 324 if(compressibleBytes[p >> 24]) { | |
| 325 compressedP1 = p1; | |
| 326 } else { | 328 } else { |
| 327 compressedP1 = 0; | 329 prevReorderedPrimary = 0; |
| 328 } | 330 } |
| 329 } | 331 } |
| 330 char p2 = (char)(p >> 16); | 332 char p2 = (char)(p >> 16); |
| 331 if(p2 != 0) { | 333 if(p2 != 0) { |
| 332 char buffer[3] = { p2, (char)(p >> 8), (char)p }; | 334 char buffer[3] = { p2, (char)(p >> 8), (char)p }; |
| 333 sink.Append(buffer, (buffer[1] == 0) ? 1 : (buffer[2] == 0) ? 2
: 3); | 335 sink.Append(buffer, (buffer[1] == 0) ? 1 : (buffer[2] == 0) ? 2
: 3); |
| 334 } | 336 } |
| 335 // Optimization for internalNextSortKeyPart(): | 337 // Optimization for internalNextSortKeyPart(): |
| 336 // When the primary level overflows we can stop because we need not | 338 // When the primary level overflows we can stop because we need not |
| 337 // calculate (preflight) the whole sort key length. | 339 // calculate (preflight) the whole sort key length. |
| 338 if(!preflight && sink.Overflowed()) { | 340 if(!preflight && sink.Overflowed()) { |
| 339 if(U_SUCCESS(errorCode) && !sink.IsOk()) { | 341 if(U_SUCCESS(errorCode) && !sink.IsOk()) { |
| 340 errorCode = U_MEMORY_ALLOCATION_ERROR; | 342 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 341 } | 343 } |
| 342 return; | 344 return; |
| 343 } | 345 } |
| 344 } | 346 } |
| 345 | 347 |
| 346 uint32_t lower32 = (uint32_t)ce; | 348 uint32_t lower32 = (uint32_t)ce; |
| 347 if(lower32 == 0) { continue; } // completely ignorable, no secondary/ca
se/tertiary/quaternary | 349 if(lower32 == 0) { continue; } // completely ignorable, no secondary/ca
se/tertiary/quaternary |
| 348 | 350 |
| 349 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { | 351 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { |
| 350 uint32_t s = lower32 >> 16; | 352 uint32_t s = lower32 >> 16; |
| 351 if(s == 0) { | 353 if(s == 0) { |
| 352 // secondary ignorable | 354 // secondary ignorable |
| 353 } else if(s == Collation::COMMON_WEIGHT16) { | 355 } else if(s == Collation::COMMON_WEIGHT16 && |
| 356 ((options & CollationSettings::BACKWARD_SECONDARY) == 0 || |
| 357 p != Collation::MERGE_SEPARATOR_PRIMARY)) { |
| 358 // s is a common secondary weight, and |
| 359 // backwards-secondary is off or the ce is not the merge separat
or. |
| 354 ++commonSecondaries; | 360 ++commonSecondaries; |
| 355 } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { | 361 } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { |
| 356 if(commonSecondaries != 0) { | 362 if(commonSecondaries != 0) { |
| 357 --commonSecondaries; | 363 --commonSecondaries; |
| 358 while(commonSecondaries >= SEC_COMMON_MAX_COUNT) { | 364 while(commonSecondaries >= SEC_COMMON_MAX_COUNT) { |
| 359 secondaries.appendByte(SEC_COMMON_MIDDLE); | 365 secondaries.appendByte(SEC_COMMON_MIDDLE); |
| 360 commonSecondaries -= SEC_COMMON_MAX_COUNT; | 366 commonSecondaries -= SEC_COMMON_MAX_COUNT; |
| 361 } | 367 } |
| 362 uint32_t b; | 368 uint32_t b; |
| 363 if(s < Collation::COMMON_WEIGHT16) { | 369 if(s < Collation::COMMON_WEIGHT16) { |
| (...skipping 18 matching lines...) Expand all Loading... |
| 382 } | 388 } |
| 383 secondaries.appendByte(b); | 389 secondaries.appendByte(b); |
| 384 commonSecondaries -= remainder; | 390 commonSecondaries -= remainder; |
| 385 // commonSecondaries is now a multiple of SEC_COMMON_MAX_COU
NT. | 391 // commonSecondaries is now a multiple of SEC_COMMON_MAX_COU
NT. |
| 386 while(commonSecondaries > 0) { // same as >= SEC_COMMON_MAX
_COUNT | 392 while(commonSecondaries > 0) { // same as >= SEC_COMMON_MAX
_COUNT |
| 387 secondaries.appendByte(SEC_COMMON_MIDDLE); | 393 secondaries.appendByte(SEC_COMMON_MIDDLE); |
| 388 commonSecondaries -= SEC_COMMON_MAX_COUNT; | 394 commonSecondaries -= SEC_COMMON_MAX_COUNT; |
| 389 } | 395 } |
| 390 // commonSecondaries == 0 | 396 // commonSecondaries == 0 |
| 391 } | 397 } |
| 392 // Reduce separators so that we can look for byte<=1 later. | 398 if(0 < p && p <= Collation::MERGE_SEPARATOR_PRIMARY) { |
| 393 if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) { | 399 // The backwards secondary level compares secondary weights
backwards |
| 394 if(s == Collation::MERGE_SEPARATOR_WEIGHT16) { | 400 // within segments separated by the merge separator (U+FFFE)
. |
| 395 anyMergeSeparators = TRUE; | 401 uint8_t *secs = secondaries.data(); |
| 402 int32_t last = secondaries.length() - 1; |
| 403 if(secSegmentStart < last) { |
| 404 uint8_t *p = secs + secSegmentStart; |
| 405 uint8_t *q = secs + last; |
| 406 do { |
| 407 uint8_t b = *p; |
| 408 *p++ = *q; |
| 409 *q-- = b; |
| 410 } while(p < q); |
| 396 } | 411 } |
| 397 secondaries.appendByte((s >> 8) - 1); | 412 secondaries.appendByte(p == Collation::NO_CE_PRIMARY ? |
| 413 Collation::LEVEL_SEPARATOR_BYTE : Collation::MERGE_SEPAR
ATOR_BYTE); |
| 414 prevSecondary = 0; |
| 415 secSegmentStart = secondaries.length(); |
| 398 } else { | 416 } else { |
| 399 secondaries.appendReverseWeight16(s); | 417 secondaries.appendReverseWeight16(s); |
| 418 prevSecondary = s; |
| 400 } | 419 } |
| 401 prevSecondary = s; | |
| 402 } | 420 } |
| 403 } | 421 } |
| 404 | 422 |
| 405 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { | 423 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { |
| 406 if((CollationSettings::getStrength(options) == UCOL_PRIMARY) ? | 424 if((CollationSettings::getStrength(options) == UCOL_PRIMARY) ? |
| 407 p == 0 : lower32 <= 0xffff) { | 425 p == 0 : lower32 <= 0xffff) { |
| 408 // Primary+caseLevel: Ignore case level weights of primary ignor
ables. | 426 // Primary+caseLevel: Ignore case level weights of primary ignor
ables. |
| 409 // Otherwise: Ignore case level weights of secondary ignorables. | 427 // Otherwise: Ignore case level weights of secondary ignorables. |
| 410 // For details see the comments in the CollationCompare class. | 428 // For details see the comments in the CollationCompare class. |
| 411 } else { | 429 } else { |
| 412 uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lea
d byte | 430 uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lea
d byte |
| 413 U_ASSERT((c & 0xc0) != 0xc0); | 431 U_ASSERT((c & 0xc0) != 0xc0); |
| 414 if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) { | 432 if((c & 0xc0) == 0 && c > Collation::LEVEL_SEPARATOR_BYTE) { |
| 415 ++commonCases; | 433 ++commonCases; |
| 416 } else { | 434 } else { |
| 417 if((options & CollationSettings::UPPER_FIRST) == 0) { | 435 if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 418 // lowerFirst: Compress common weights to nibbles 1..7..
13, mixed=14, upper=15. | 436 // lowerFirst: Compress common weights to nibbles 1..7..
13, mixed=14, upper=15. |
| 419 if(commonCases != 0) { | 437 // If there are only common (=lowest) weights in the who
le level, |
| 438 // then we need not write anything. |
| 439 // Level length differences are handled already on the n
ext-higher level. |
| 440 if(commonCases != 0 && |
| 441 (c > Collation::LEVEL_SEPARATOR_BYTE || !cases.i
sEmpty())) { |
| 420 --commonCases; | 442 --commonCases; |
| 421 while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COU
NT) { | 443 while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COU
NT) { |
| 422 cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE
<< 4); | 444 cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE
<< 4); |
| 423 commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT
; | 445 commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT
; |
| 424 } | 446 } |
| 425 uint32_t b; | 447 uint32_t b; |
| 426 if(c <= Collation::MERGE_SEPARATOR_BYTE) { | 448 if(c <= Collation::LEVEL_SEPARATOR_BYTE) { |
| 427 b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; | 449 b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; |
| 428 } else { | 450 } else { |
| 429 b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; | 451 b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; |
| 430 } | 452 } |
| 431 cases.appendByte(b << 4); | 453 cases.appendByte(b << 4); |
| 432 commonCases = 0; | 454 commonCases = 0; |
| 433 } | 455 } |
| 434 if(c > Collation::MERGE_SEPARATOR_BYTE) { | 456 if(c > Collation::LEVEL_SEPARATOR_BYTE) { |
| 435 c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4;
// 14 or 15 | 457 c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4;
// 14 or 15 |
| 436 } | 458 } |
| 437 } else { | 459 } else { |
| 438 // upperFirst: Compress common weights to nibbles 3..15,
mixed=2, upper=1. | 460 // upperFirst: Compress common weights to nibbles 3..15,
mixed=2, upper=1. |
| 439 // The compressed common case weights only go up from th
e "low" value | 461 // The compressed common case weights only go up from th
e "low" value |
| 440 // because with upperFirst the common weight is the high
est one. | 462 // because with upperFirst the common weight is the high
est one. |
| 441 if(commonCases != 0) { | 463 if(commonCases != 0) { |
| 442 --commonCases; | 464 --commonCases; |
| 443 while(commonCases >= CASE_UPPER_FIRST_COMMON_MAX_COU
NT) { | 465 while(commonCases >= CASE_UPPER_FIRST_COMMON_MAX_COU
NT) { |
| 444 cases.appendByte(CASE_UPPER_FIRST_COMMON_LOW <<
4); | 466 cases.appendByte(CASE_UPPER_FIRST_COMMON_LOW <<
4); |
| 445 commonCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT
; | 467 commonCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT
; |
| 446 } | 468 } |
| 447 cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + comm
onCases) << 4); | 469 cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + comm
onCases) << 4); |
| 448 commonCases = 0; | 470 commonCases = 0; |
| 449 } | 471 } |
| 450 if(c > Collation::MERGE_SEPARATOR_BYTE) { | 472 if(c > Collation::LEVEL_SEPARATOR_BYTE) { |
| 451 c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4;
// 2 or 1 | 473 c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4;
// 2 or 1 |
| 452 } | 474 } |
| 453 } | 475 } |
| 454 // c is a separator byte 01 or 02, | 476 // c is a separator byte 01, |
| 455 // or a left-shifted nibble 0x10, 0x20, ... 0xf0. | 477 // or a left-shifted nibble 0x10, 0x20, ... 0xf0. |
| 456 cases.appendByte(c); | 478 cases.appendByte(c); |
| 457 } | 479 } |
| 458 } | 480 } |
| 459 } | 481 } |
| 460 | 482 |
| 461 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { | 483 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { |
| 462 uint32_t t = lower32 & tertiaryMask; | 484 uint32_t t = lower32 & tertiaryMask; |
| 463 U_ASSERT((lower32 & 0xc000) != 0xc000); | 485 U_ASSERT((lower32 & 0xc000) != 0xc000); |
| 464 if(t == Collation::COMMON_WEIGHT16) { | 486 if(t == Collation::COMMON_WEIGHT16) { |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 503 } | 525 } |
| 504 if(t > Collation::COMMON_WEIGHT16) { t += 0x4000; } | 526 if(t > Collation::COMMON_WEIGHT16) { t += 0x4000; } |
| 505 tertiaries.appendWeight16(t); | 527 tertiaries.appendWeight16(t); |
| 506 } else { | 528 } else { |
| 507 // Tertiary weights with caseFirst=upperFirst. | 529 // Tertiary weights with caseFirst=upperFirst. |
| 508 // Do not change the artificial uppercase weight of a tertiary C
E (0.0.ut), | 530 // Do not change the artificial uppercase weight of a tertiary C
E (0.0.ut), |
| 509 // to keep tertiary CEs well-formed. | 531 // to keep tertiary CEs well-formed. |
| 510 // Their case+tertiary weights must be greater than those of | 532 // Their case+tertiary weights must be greater than those of |
| 511 // primary and secondary CEs. | 533 // primary and secondary CEs. |
| 512 // | 534 // |
| 513 // Separators 01..02 -> 01..02 (unchanged) | 535 // Separator 01 -> 01 (unchanged) |
| 514 // Lowercase 03..04 -> 83..84 (includes uncased) | 536 // Lowercase 02..04 -> 82..84 (includes uncased) |
| 515 // Common weight 05 -> 85..C5 (common-weight compression ra
nge) | 537 // Common weight 05 -> 85..C5 (common-weight compression ra
nge) |
| 516 // Lowercase 06..3F -> C6..FF | 538 // Lowercase 06..3F -> C6..FF |
| 517 // Mixed case 43..7F -> 43..7F | 539 // Mixed case 42..7F -> 42..7F |
| 518 // Uppercase 83..BF -> 03..3F | 540 // Uppercase 82..BF -> 02..3F |
| 519 // Tertiary CE 86..BF -> C6..FF | 541 // Tertiary CE 86..BF -> C6..FF |
| 520 if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) { | 542 if(t <= Collation::NO_CE_WEIGHT16) { |
| 521 // Keep separators unchanged. | 543 // Keep separators unchanged. |
| 522 } else if(lower32 > 0xffff) { | 544 } else if(lower32 > 0xffff) { |
| 523 // Invert case bits of primary & secondary CEs. | 545 // Invert case bits of primary & secondary CEs. |
| 524 t ^= 0xc000; | 546 t ^= 0xc000; |
| 525 if(t < (TER_UPPER_FIRST_COMMON_HIGH << 8)) { | 547 if(t < (TER_UPPER_FIRST_COMMON_HIGH << 8)) { |
| 526 t -= 0x4000; | 548 t -= 0x4000; |
| 527 } | 549 } |
| 528 } else { | 550 } else { |
| 529 // Keep uppercase bits of tertiary CEs. | 551 // Keep uppercase bits of tertiary CEs. |
| 530 U_ASSERT(0x8600 <= t && t <= 0xbfff); | 552 U_ASSERT(0x8600 <= t && t <= 0xbfff); |
| (...skipping 13 matching lines...) Expand all Loading... |
| 544 } | 566 } |
| 545 tertiaries.appendByte(b); | 567 tertiaries.appendByte(b); |
| 546 commonTertiaries = 0; | 568 commonTertiaries = 0; |
| 547 } | 569 } |
| 548 tertiaries.appendWeight16(t); | 570 tertiaries.appendWeight16(t); |
| 549 } | 571 } |
| 550 } | 572 } |
| 551 | 573 |
| 552 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { | 574 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
| 553 uint32_t q = lower32 & 0xffff; | 575 uint32_t q = lower32 & 0xffff; |
| 554 if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) { | 576 if((q & 0xc0) == 0 && q > Collation::NO_CE_WEIGHT16) { |
| 555 ++commonQuaternaries; | 577 ++commonQuaternaries; |
| 556 } else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 && | 578 } else if(q == Collation::NO_CE_WEIGHT16 && |
| 557 (options & CollationSettings::ALTERNATE_MASK) == 0 && | 579 (options & CollationSettings::ALTERNATE_MASK) == 0 && |
| 558 (quaternaries.isEmpty() || | 580 quaternaries.isEmpty()) { |
| 559 quaternaries[quaternaries.length() - 1] == Collation::ME
RGE_SEPARATOR_BYTE)) { | 581 // If alternate=non-ignorable and there are only common quaterna
ry weights, |
| 560 // If alternate=non-ignorable and there are only | 582 // then we need not write anything. |
| 561 // common quaternary weights between two separators, | |
| 562 // then we need not write anything between these separators. | |
| 563 // The only weights greater than the merge separator and less th
an the common weight | 583 // The only weights greater than the merge separator and less th
an the common weight |
| 564 // are shifted primary weights, which are not generated for alte
rnate=non-ignorable. | 584 // are shifted primary weights, which are not generated for alte
rnate=non-ignorable. |
| 565 // There are also exactly as many quaternary weights as tertiary
weights, | 585 // There are also exactly as many quaternary weights as tertiary
weights, |
| 566 // so level length differences are handled already on tertiary l
evel. | 586 // so level length differences are handled already on tertiary l
evel. |
| 567 // Any above-common quaternary weight will compare greater regar
dless. | 587 // Any above-common quaternary weight will compare greater regar
dless. |
| 568 quaternaries.appendByte(q >> 8); | 588 quaternaries.appendByte(Collation::LEVEL_SEPARATOR_BYTE); |
| 569 } else { | 589 } else { |
| 570 if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) { | 590 if(q == Collation::NO_CE_WEIGHT16) { |
| 571 q >>= 8; | 591 q = Collation::LEVEL_SEPARATOR_BYTE; |
| 572 } else { | 592 } else { |
| 573 q = 0xfc + ((q >> 6) & 3); | 593 q = 0xfc + ((q >> 6) & 3); |
| 574 } | 594 } |
| 575 if(commonQuaternaries != 0) { | 595 if(commonQuaternaries != 0) { |
| 576 --commonQuaternaries; | 596 --commonQuaternaries; |
| 577 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { | 597 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { |
| 578 quaternaries.appendByte(QUAT_COMMON_MIDDLE); | 598 quaternaries.appendByte(QUAT_COMMON_MIDDLE); |
| 579 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; | 599 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; |
| 580 } | 600 } |
| 581 uint32_t b; | 601 uint32_t b; |
| (...skipping 13 matching lines...) Expand all Loading... |
| 595 } | 615 } |
| 596 | 616 |
| 597 if(U_FAILURE(errorCode)) { return; } | 617 if(U_FAILURE(errorCode)) { return; } |
| 598 | 618 |
| 599 // Append the beyond-primary levels. | 619 // Append the beyond-primary levels. |
| 600 UBool ok = TRUE; | 620 UBool ok = TRUE; |
| 601 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { | 621 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { |
| 602 if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } | 622 if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } |
| 603 ok &= secondaries.isOk(); | 623 ok &= secondaries.isOk(); |
| 604 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 624 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 605 uint8_t *secs = secondaries.data(); | 625 secondaries.appendTo(sink); |
| 606 int32_t length = secondaries.length() - 1; // Ignore the trailing NO_CE
. | |
| 607 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
| 608 // The backwards secondary level compares secondary weights backward
s | |
| 609 // within segments separated by the merge separator (U+FFFE, weight
02). | |
| 610 // The separator weights 01 & 02 were reduced to 00 & 01 so that | |
| 611 // we do not accidentally separate at a _second_ weight byte of 02. | |
| 612 int32_t start = 0; | |
| 613 for(;;) { | |
| 614 // Find the merge separator or the NO_CE terminator. | |
| 615 int32_t limit; | |
| 616 if(anyMergeSeparators) { | |
| 617 limit = start; | |
| 618 while(secs[limit] > 1) { ++limit; } | |
| 619 } else { | |
| 620 limit = length; | |
| 621 } | |
| 622 // Reverse this segment. | |
| 623 if(start < limit) { | |
| 624 uint8_t *p = secs + start; | |
| 625 uint8_t *q = secs + limit - 1; | |
| 626 while(p < q) { | |
| 627 uint8_t s = *p; | |
| 628 *p++ = *q; | |
| 629 *q-- = s; | |
| 630 } | |
| 631 } | |
| 632 // Did we reach the end of the string? | |
| 633 if(secs[limit] == 0) { break; } | |
| 634 // Restore the merge separator. | |
| 635 secs[limit] = 2; | |
| 636 // Skip the merge separator and continue. | |
| 637 start = limit + 1; | |
| 638 } | |
| 639 } | |
| 640 sink.Append(reinterpret_cast<char *>(secs), length); | |
| 641 } | 626 } |
| 642 | 627 |
| 643 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { | 628 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { |
| 644 if(!callback.needToWrite(Collation::CASE_LEVEL)) { return; } | 629 if(!callback.needToWrite(Collation::CASE_LEVEL)) { return; } |
| 645 ok &= cases.isOk(); | 630 ok &= cases.isOk(); |
| 646 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 631 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 647 // Write pairs of nibbles as bytes, except separator bytes as themselves
. | 632 // Write pairs of nibbles as bytes, except separator bytes as themselves
. |
| 648 int32_t length = cases.length() - 1; // Ignore the trailing NO_CE. | 633 int32_t length = cases.length() - 1; // Ignore the trailing NO_CE. |
| 649 uint8_t b = 0; | 634 uint8_t b = 0; |
| 650 for(int32_t i = 0; i < length; ++i) { | 635 for(int32_t i = 0; i < length; ++i) { |
| 651 uint8_t c = (uint8_t)cases[i]; | 636 uint8_t c = (uint8_t)cases[i]; |
| 652 if(c <= Collation::MERGE_SEPARATOR_BYTE) { | 637 U_ASSERT((c & 0xf) == 0 && c != 0); |
| 653 U_ASSERT(c != 0); | 638 if(b == 0) { |
| 654 if(b != 0) { | 639 b = c; |
| 655 sink.Append(b); | |
| 656 b = 0; | |
| 657 } | |
| 658 sink.Append(c); | |
| 659 } else { | 640 } else { |
| 660 U_ASSERT((c & 0xf) == 0); | 641 sink.Append(b | (c >> 4)); |
| 661 if(b == 0) { | 642 b = 0; |
| 662 b = c; | |
| 663 } else { | |
| 664 sink.Append(b | (c >> 4)); | |
| 665 b = 0; | |
| 666 } | |
| 667 } | 643 } |
| 668 } | 644 } |
| 669 if(b != 0) { | 645 if(b != 0) { |
| 670 sink.Append(b); | 646 sink.Append(b); |
| 671 } | 647 } |
| 672 } | 648 } |
| 673 | 649 |
| 674 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { | 650 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { |
| 675 if(!callback.needToWrite(Collation::TERTIARY_LEVEL)) { return; } | 651 if(!callback.needToWrite(Collation::TERTIARY_LEVEL)) { return; } |
| 676 ok &= tertiaries.isOk(); | 652 ok &= tertiaries.isOk(); |
| 677 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 653 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 678 tertiaries.appendTo(sink); | 654 tertiaries.appendTo(sink); |
| 679 } | 655 } |
| 680 | 656 |
| 681 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { | 657 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
| 682 if(!callback.needToWrite(Collation::QUATERNARY_LEVEL)) { return; } | 658 if(!callback.needToWrite(Collation::QUATERNARY_LEVEL)) { return; } |
| 683 ok &= quaternaries.isOk(); | 659 ok &= quaternaries.isOk(); |
| 684 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 660 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 685 quaternaries.appendTo(sink); | 661 quaternaries.appendTo(sink); |
| 686 } | 662 } |
| 687 | 663 |
| 688 if(!ok || !sink.IsOk()) { | 664 if(!ok || !sink.IsOk()) { |
| 689 errorCode = U_MEMORY_ALLOCATION_ERROR; | 665 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 690 } | 666 } |
| 691 } | 667 } |
| 692 | 668 |
| 693 U_NAMESPACE_END | 669 U_NAMESPACE_END |
| 694 | 670 |
| 695 #endif // !UCONFIG_NO_COLLATION | 671 #endif // !UCONFIG_NO_COLLATION |
| OLD | NEW |