OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2012-2014, International Business Machines | 3 * Copyright (C) 2012-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationkeys.cpp | 6 * collationkeys.cpp |
7 * | 7 * |
8 * created on: 2012sep02 | 8 * created on: 2012sep02 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
(...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
239 levels &= ~(((uint32_t)1 << minLevel) - 1); | 239 levels &= ~(((uint32_t)1 << minLevel) - 1); |
240 if(levels == 0) { return; } | 240 if(levels == 0) { return; } |
241 | 241 |
242 uint32_t variableTop; | 242 uint32_t variableTop; |
243 if((options & CollationSettings::ALTERNATE_MASK) == 0) { | 243 if((options & CollationSettings::ALTERNATE_MASK) == 0) { |
244 variableTop = 0; | 244 variableTop = 0; |
245 } else { | 245 } else { |
246 // +1 so that we can use "<" and primary ignorables test out early. | 246 // +1 so that we can use "<" and primary ignorables test out early. |
247 variableTop = settings.variableTop + 1; | 247 variableTop = settings.variableTop + 1; |
248 } | 248 } |
249 const uint8_t *reorderTable = settings.reorderTable; | |
250 | 249 |
251 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); | 250 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); |
252 | 251 |
253 SortKeyLevel cases; | 252 SortKeyLevel cases; |
254 SortKeyLevel secondaries; | 253 SortKeyLevel secondaries; |
255 SortKeyLevel tertiaries; | 254 SortKeyLevel tertiaries; |
256 SortKeyLevel quaternaries; | 255 SortKeyLevel quaternaries; |
257 | 256 |
258 uint32_t compressedP1 = 0; // 0==no compression; otherwise reordered compre
ssible lead byte | 257 uint32_t prevReorderedPrimary = 0; // 0==no compression |
259 int32_t commonCases = 0; | 258 int32_t commonCases = 0; |
260 int32_t commonSecondaries = 0; | 259 int32_t commonSecondaries = 0; |
261 int32_t commonTertiaries = 0; | 260 int32_t commonTertiaries = 0; |
262 int32_t commonQuaternaries = 0; | 261 int32_t commonQuaternaries = 0; |
263 | 262 |
264 uint32_t prevSecondary = 0; | 263 uint32_t prevSecondary = 0; |
265 UBool anyMergeSeparators = FALSE; | 264 int32_t secSegmentStart = 0; |
266 | 265 |
267 for(;;) { | 266 for(;;) { |
268 // No need to keep all CEs in the buffer when we write a sort key. | 267 // No need to keep all CEs in the buffer when we write a sort key. |
269 iter.clearCEsIfNoneRemaining(); | 268 iter.clearCEsIfNoneRemaining(); |
270 int64_t ce = iter.nextCE(errorCode); | 269 int64_t ce = iter.nextCE(errorCode); |
271 uint32_t p = (uint32_t)(ce >> 32); | 270 uint32_t p = (uint32_t)(ce >> 32); |
272 if(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY) { | 271 if(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY) { |
273 // Variable CE, shift it to quaternary level. | 272 // Variable CE, shift it to quaternary level. |
274 // Ignore all following primary ignorables, and shift further variab
le CEs. | 273 // Ignore all following primary ignorables, and shift further variab
le CEs. |
275 if(commonQuaternaries != 0) { | 274 if(commonQuaternaries != 0) { |
276 --commonQuaternaries; | 275 --commonQuaternaries; |
277 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { | 276 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { |
278 quaternaries.appendByte(QUAT_COMMON_MIDDLE); | 277 quaternaries.appendByte(QUAT_COMMON_MIDDLE); |
279 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; | 278 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; |
280 } | 279 } |
281 // Shifted primary weights are lower than the common weight. | 280 // Shifted primary weights are lower than the common weight. |
282 quaternaries.appendByte(QUAT_COMMON_LOW + commonQuaternaries); | 281 quaternaries.appendByte(QUAT_COMMON_LOW + commonQuaternaries); |
283 commonQuaternaries = 0; | 282 commonQuaternaries = 0; |
284 } | 283 } |
285 do { | 284 do { |
286 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { | 285 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
287 uint32_t p1 = p >> 24; | 286 if(settings.hasReordering()) { |
288 if(reorderTable != NULL) { p1 = reorderTable[p1]; } | 287 p = settings.reorder(p); |
289 if(p1 >= QUAT_SHIFTED_LIMIT_BYTE) { | 288 } |
| 289 if((p >> 24) >= QUAT_SHIFTED_LIMIT_BYTE) { |
290 // Prevent shifted primary lead bytes from | 290 // Prevent shifted primary lead bytes from |
291 // overlapping with the common compression range. | 291 // overlapping with the common compression range. |
292 quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE); | 292 quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE); |
293 } | 293 } |
294 quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff)); | 294 quaternaries.appendWeight32(p); |
295 } | 295 } |
296 do { | 296 do { |
297 ce = iter.nextCE(errorCode); | 297 ce = iter.nextCE(errorCode); |
298 p = (uint32_t)(ce >> 32); | 298 p = (uint32_t)(ce >> 32); |
299 } while(p == 0); | 299 } while(p == 0); |
300 } while(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY); | 300 } while(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY); |
301 } | 301 } |
302 // ce could be primary ignorable, or NO_CE, or the merge separator, | 302 // ce could be primary ignorable, or NO_CE, or the merge separator, |
303 // or a regular primary CE, but it is not variable. | 303 // or a regular primary CE, but it is not variable. |
304 // If ce==NO_CE, then write nothing for the primary level but | 304 // If ce==NO_CE, then write nothing for the primary level but |
305 // terminate compression on all levels and then exit the loop. | 305 // terminate compression on all levels and then exit the loop. |
306 if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FL
AG) != 0) { | 306 if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FL
AG) != 0) { |
| 307 // Test the un-reordered primary for compressibility. |
| 308 UBool isCompressible = compressibleBytes[p >> 24]; |
| 309 if(settings.hasReordering()) { |
| 310 p = settings.reorder(p); |
| 311 } |
307 uint32_t p1 = p >> 24; | 312 uint32_t p1 = p >> 24; |
308 if(reorderTable != NULL) { p1 = reorderTable[p1]; } | 313 if(!isCompressible || p1 != (prevReorderedPrimary >> 24)) { |
309 if(p1 != compressedP1) { | 314 if(prevReorderedPrimary != 0) { |
310 if(compressedP1 != 0) { | 315 if(p < prevReorderedPrimary) { |
311 if(p1 < compressedP1) { | |
312 // No primary compression terminator | 316 // No primary compression terminator |
313 // at the end of the level or merged segment. | 317 // at the end of the level or merged segment. |
314 if(p1 > Collation::MERGE_SEPARATOR_BYTE) { | 318 if(p1 > Collation::MERGE_SEPARATOR_BYTE) { |
315 sink.Append(Collation::PRIMARY_COMPRESSION_LOW_BYTE)
; | 319 sink.Append(Collation::PRIMARY_COMPRESSION_LOW_BYTE)
; |
316 } | 320 } |
317 } else { | 321 } else { |
318 sink.Append(Collation::PRIMARY_COMPRESSION_HIGH_BYTE); | 322 sink.Append(Collation::PRIMARY_COMPRESSION_HIGH_BYTE); |
319 } | 323 } |
320 } | 324 } |
321 sink.Append(p1); | 325 sink.Append(p1); |
322 // Test the un-reordered lead byte for compressibility but | 326 if(isCompressible) { |
323 // remember the reordered lead byte. | 327 prevReorderedPrimary = p; |
324 if(compressibleBytes[p >> 24]) { | |
325 compressedP1 = p1; | |
326 } else { | 328 } else { |
327 compressedP1 = 0; | 329 prevReorderedPrimary = 0; |
328 } | 330 } |
329 } | 331 } |
330 char p2 = (char)(p >> 16); | 332 char p2 = (char)(p >> 16); |
331 if(p2 != 0) { | 333 if(p2 != 0) { |
332 char buffer[3] = { p2, (char)(p >> 8), (char)p }; | 334 char buffer[3] = { p2, (char)(p >> 8), (char)p }; |
333 sink.Append(buffer, (buffer[1] == 0) ? 1 : (buffer[2] == 0) ? 2
: 3); | 335 sink.Append(buffer, (buffer[1] == 0) ? 1 : (buffer[2] == 0) ? 2
: 3); |
334 } | 336 } |
335 // Optimization for internalNextSortKeyPart(): | 337 // Optimization for internalNextSortKeyPart(): |
336 // When the primary level overflows we can stop because we need not | 338 // When the primary level overflows we can stop because we need not |
337 // calculate (preflight) the whole sort key length. | 339 // calculate (preflight) the whole sort key length. |
338 if(!preflight && sink.Overflowed()) { | 340 if(!preflight && sink.Overflowed()) { |
339 if(U_SUCCESS(errorCode) && !sink.IsOk()) { | 341 if(U_SUCCESS(errorCode) && !sink.IsOk()) { |
340 errorCode = U_MEMORY_ALLOCATION_ERROR; | 342 errorCode = U_MEMORY_ALLOCATION_ERROR; |
341 } | 343 } |
342 return; | 344 return; |
343 } | 345 } |
344 } | 346 } |
345 | 347 |
346 uint32_t lower32 = (uint32_t)ce; | 348 uint32_t lower32 = (uint32_t)ce; |
347 if(lower32 == 0) { continue; } // completely ignorable, no secondary/ca
se/tertiary/quaternary | 349 if(lower32 == 0) { continue; } // completely ignorable, no secondary/ca
se/tertiary/quaternary |
348 | 350 |
349 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { | 351 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { |
350 uint32_t s = lower32 >> 16; | 352 uint32_t s = lower32 >> 16; |
351 if(s == 0) { | 353 if(s == 0) { |
352 // secondary ignorable | 354 // secondary ignorable |
353 } else if(s == Collation::COMMON_WEIGHT16) { | 355 } else if(s == Collation::COMMON_WEIGHT16 && |
| 356 ((options & CollationSettings::BACKWARD_SECONDARY) == 0 || |
| 357 p != Collation::MERGE_SEPARATOR_PRIMARY)) { |
| 358 // s is a common secondary weight, and |
| 359 // backwards-secondary is off or the ce is not the merge separat
or. |
354 ++commonSecondaries; | 360 ++commonSecondaries; |
355 } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { | 361 } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { |
356 if(commonSecondaries != 0) { | 362 if(commonSecondaries != 0) { |
357 --commonSecondaries; | 363 --commonSecondaries; |
358 while(commonSecondaries >= SEC_COMMON_MAX_COUNT) { | 364 while(commonSecondaries >= SEC_COMMON_MAX_COUNT) { |
359 secondaries.appendByte(SEC_COMMON_MIDDLE); | 365 secondaries.appendByte(SEC_COMMON_MIDDLE); |
360 commonSecondaries -= SEC_COMMON_MAX_COUNT; | 366 commonSecondaries -= SEC_COMMON_MAX_COUNT; |
361 } | 367 } |
362 uint32_t b; | 368 uint32_t b; |
363 if(s < Collation::COMMON_WEIGHT16) { | 369 if(s < Collation::COMMON_WEIGHT16) { |
(...skipping 18 matching lines...) Expand all Loading... |
382 } | 388 } |
383 secondaries.appendByte(b); | 389 secondaries.appendByte(b); |
384 commonSecondaries -= remainder; | 390 commonSecondaries -= remainder; |
385 // commonSecondaries is now a multiple of SEC_COMMON_MAX_COU
NT. | 391 // commonSecondaries is now a multiple of SEC_COMMON_MAX_COU
NT. |
386 while(commonSecondaries > 0) { // same as >= SEC_COMMON_MAX
_COUNT | 392 while(commonSecondaries > 0) { // same as >= SEC_COMMON_MAX
_COUNT |
387 secondaries.appendByte(SEC_COMMON_MIDDLE); | 393 secondaries.appendByte(SEC_COMMON_MIDDLE); |
388 commonSecondaries -= SEC_COMMON_MAX_COUNT; | 394 commonSecondaries -= SEC_COMMON_MAX_COUNT; |
389 } | 395 } |
390 // commonSecondaries == 0 | 396 // commonSecondaries == 0 |
391 } | 397 } |
392 // Reduce separators so that we can look for byte<=1 later. | 398 if(0 < p && p <= Collation::MERGE_SEPARATOR_PRIMARY) { |
393 if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) { | 399 // The backwards secondary level compares secondary weights
backwards |
394 if(s == Collation::MERGE_SEPARATOR_WEIGHT16) { | 400 // within segments separated by the merge separator (U+FFFE)
. |
395 anyMergeSeparators = TRUE; | 401 uint8_t *secs = secondaries.data(); |
| 402 int32_t last = secondaries.length() - 1; |
| 403 if(secSegmentStart < last) { |
| 404 uint8_t *p = secs + secSegmentStart; |
| 405 uint8_t *q = secs + last; |
| 406 do { |
| 407 uint8_t b = *p; |
| 408 *p++ = *q; |
| 409 *q-- = b; |
| 410 } while(p < q); |
396 } | 411 } |
397 secondaries.appendByte((s >> 8) - 1); | 412 secondaries.appendByte(p == Collation::NO_CE_PRIMARY ? |
| 413 Collation::LEVEL_SEPARATOR_BYTE : Collation::MERGE_SEPAR
ATOR_BYTE); |
| 414 prevSecondary = 0; |
| 415 secSegmentStart = secondaries.length(); |
398 } else { | 416 } else { |
399 secondaries.appendReverseWeight16(s); | 417 secondaries.appendReverseWeight16(s); |
| 418 prevSecondary = s; |
400 } | 419 } |
401 prevSecondary = s; | |
402 } | 420 } |
403 } | 421 } |
404 | 422 |
405 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { | 423 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { |
406 if((CollationSettings::getStrength(options) == UCOL_PRIMARY) ? | 424 if((CollationSettings::getStrength(options) == UCOL_PRIMARY) ? |
407 p == 0 : lower32 <= 0xffff) { | 425 p == 0 : lower32 <= 0xffff) { |
408 // Primary+caseLevel: Ignore case level weights of primary ignor
ables. | 426 // Primary+caseLevel: Ignore case level weights of primary ignor
ables. |
409 // Otherwise: Ignore case level weights of secondary ignorables. | 427 // Otherwise: Ignore case level weights of secondary ignorables. |
410 // For details see the comments in the CollationCompare class. | 428 // For details see the comments in the CollationCompare class. |
411 } else { | 429 } else { |
412 uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lea
d byte | 430 uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lea
d byte |
413 U_ASSERT((c & 0xc0) != 0xc0); | 431 U_ASSERT((c & 0xc0) != 0xc0); |
414 if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) { | 432 if((c & 0xc0) == 0 && c > Collation::LEVEL_SEPARATOR_BYTE) { |
415 ++commonCases; | 433 ++commonCases; |
416 } else { | 434 } else { |
417 if((options & CollationSettings::UPPER_FIRST) == 0) { | 435 if((options & CollationSettings::UPPER_FIRST) == 0) { |
418 // lowerFirst: Compress common weights to nibbles 1..7..
13, mixed=14, upper=15. | 436 // lowerFirst: Compress common weights to nibbles 1..7..
13, mixed=14, upper=15. |
419 if(commonCases != 0) { | 437 // If there are only common (=lowest) weights in the who
le level, |
| 438 // then we need not write anything. |
| 439 // Level length differences are handled already on the n
ext-higher level. |
| 440 if(commonCases != 0 && |
| 441 (c > Collation::LEVEL_SEPARATOR_BYTE || !cases.i
sEmpty())) { |
420 --commonCases; | 442 --commonCases; |
421 while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COU
NT) { | 443 while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COU
NT) { |
422 cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE
<< 4); | 444 cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE
<< 4); |
423 commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT
; | 445 commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT
; |
424 } | 446 } |
425 uint32_t b; | 447 uint32_t b; |
426 if(c <= Collation::MERGE_SEPARATOR_BYTE) { | 448 if(c <= Collation::LEVEL_SEPARATOR_BYTE) { |
427 b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; | 449 b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; |
428 } else { | 450 } else { |
429 b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; | 451 b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; |
430 } | 452 } |
431 cases.appendByte(b << 4); | 453 cases.appendByte(b << 4); |
432 commonCases = 0; | 454 commonCases = 0; |
433 } | 455 } |
434 if(c > Collation::MERGE_SEPARATOR_BYTE) { | 456 if(c > Collation::LEVEL_SEPARATOR_BYTE) { |
435 c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4;
// 14 or 15 | 457 c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4;
// 14 or 15 |
436 } | 458 } |
437 } else { | 459 } else { |
438 // upperFirst: Compress common weights to nibbles 3..15,
mixed=2, upper=1. | 460 // upperFirst: Compress common weights to nibbles 3..15,
mixed=2, upper=1. |
439 // The compressed common case weights only go up from th
e "low" value | 461 // The compressed common case weights only go up from th
e "low" value |
440 // because with upperFirst the common weight is the high
est one. | 462 // because with upperFirst the common weight is the high
est one. |
441 if(commonCases != 0) { | 463 if(commonCases != 0) { |
442 --commonCases; | 464 --commonCases; |
443 while(commonCases >= CASE_UPPER_FIRST_COMMON_MAX_COU
NT) { | 465 while(commonCases >= CASE_UPPER_FIRST_COMMON_MAX_COU
NT) { |
444 cases.appendByte(CASE_UPPER_FIRST_COMMON_LOW <<
4); | 466 cases.appendByte(CASE_UPPER_FIRST_COMMON_LOW <<
4); |
445 commonCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT
; | 467 commonCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT
; |
446 } | 468 } |
447 cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + comm
onCases) << 4); | 469 cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + comm
onCases) << 4); |
448 commonCases = 0; | 470 commonCases = 0; |
449 } | 471 } |
450 if(c > Collation::MERGE_SEPARATOR_BYTE) { | 472 if(c > Collation::LEVEL_SEPARATOR_BYTE) { |
451 c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4;
// 2 or 1 | 473 c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4;
// 2 or 1 |
452 } | 474 } |
453 } | 475 } |
454 // c is a separator byte 01 or 02, | 476 // c is a separator byte 01, |
455 // or a left-shifted nibble 0x10, 0x20, ... 0xf0. | 477 // or a left-shifted nibble 0x10, 0x20, ... 0xf0. |
456 cases.appendByte(c); | 478 cases.appendByte(c); |
457 } | 479 } |
458 } | 480 } |
459 } | 481 } |
460 | 482 |
461 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { | 483 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { |
462 uint32_t t = lower32 & tertiaryMask; | 484 uint32_t t = lower32 & tertiaryMask; |
463 U_ASSERT((lower32 & 0xc000) != 0xc000); | 485 U_ASSERT((lower32 & 0xc000) != 0xc000); |
464 if(t == Collation::COMMON_WEIGHT16) { | 486 if(t == Collation::COMMON_WEIGHT16) { |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
503 } | 525 } |
504 if(t > Collation::COMMON_WEIGHT16) { t += 0x4000; } | 526 if(t > Collation::COMMON_WEIGHT16) { t += 0x4000; } |
505 tertiaries.appendWeight16(t); | 527 tertiaries.appendWeight16(t); |
506 } else { | 528 } else { |
507 // Tertiary weights with caseFirst=upperFirst. | 529 // Tertiary weights with caseFirst=upperFirst. |
508 // Do not change the artificial uppercase weight of a tertiary C
E (0.0.ut), | 530 // Do not change the artificial uppercase weight of a tertiary C
E (0.0.ut), |
509 // to keep tertiary CEs well-formed. | 531 // to keep tertiary CEs well-formed. |
510 // Their case+tertiary weights must be greater than those of | 532 // Their case+tertiary weights must be greater than those of |
511 // primary and secondary CEs. | 533 // primary and secondary CEs. |
512 // | 534 // |
513 // Separators 01..02 -> 01..02 (unchanged) | 535 // Separator 01 -> 01 (unchanged) |
514 // Lowercase 03..04 -> 83..84 (includes uncased) | 536 // Lowercase 02..04 -> 82..84 (includes uncased) |
515 // Common weight 05 -> 85..C5 (common-weight compression ra
nge) | 537 // Common weight 05 -> 85..C5 (common-weight compression ra
nge) |
516 // Lowercase 06..3F -> C6..FF | 538 // Lowercase 06..3F -> C6..FF |
517 // Mixed case 43..7F -> 43..7F | 539 // Mixed case 42..7F -> 42..7F |
518 // Uppercase 83..BF -> 03..3F | 540 // Uppercase 82..BF -> 02..3F |
519 // Tertiary CE 86..BF -> C6..FF | 541 // Tertiary CE 86..BF -> C6..FF |
520 if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) { | 542 if(t <= Collation::NO_CE_WEIGHT16) { |
521 // Keep separators unchanged. | 543 // Keep separators unchanged. |
522 } else if(lower32 > 0xffff) { | 544 } else if(lower32 > 0xffff) { |
523 // Invert case bits of primary & secondary CEs. | 545 // Invert case bits of primary & secondary CEs. |
524 t ^= 0xc000; | 546 t ^= 0xc000; |
525 if(t < (TER_UPPER_FIRST_COMMON_HIGH << 8)) { | 547 if(t < (TER_UPPER_FIRST_COMMON_HIGH << 8)) { |
526 t -= 0x4000; | 548 t -= 0x4000; |
527 } | 549 } |
528 } else { | 550 } else { |
529 // Keep uppercase bits of tertiary CEs. | 551 // Keep uppercase bits of tertiary CEs. |
530 U_ASSERT(0x8600 <= t && t <= 0xbfff); | 552 U_ASSERT(0x8600 <= t && t <= 0xbfff); |
(...skipping 13 matching lines...) Expand all Loading... |
544 } | 566 } |
545 tertiaries.appendByte(b); | 567 tertiaries.appendByte(b); |
546 commonTertiaries = 0; | 568 commonTertiaries = 0; |
547 } | 569 } |
548 tertiaries.appendWeight16(t); | 570 tertiaries.appendWeight16(t); |
549 } | 571 } |
550 } | 572 } |
551 | 573 |
552 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { | 574 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
553 uint32_t q = lower32 & 0xffff; | 575 uint32_t q = lower32 & 0xffff; |
554 if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) { | 576 if((q & 0xc0) == 0 && q > Collation::NO_CE_WEIGHT16) { |
555 ++commonQuaternaries; | 577 ++commonQuaternaries; |
556 } else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 && | 578 } else if(q == Collation::NO_CE_WEIGHT16 && |
557 (options & CollationSettings::ALTERNATE_MASK) == 0 && | 579 (options & CollationSettings::ALTERNATE_MASK) == 0 && |
558 (quaternaries.isEmpty() || | 580 quaternaries.isEmpty()) { |
559 quaternaries[quaternaries.length() - 1] == Collation::ME
RGE_SEPARATOR_BYTE)) { | 581 // If alternate=non-ignorable and there are only common quaterna
ry weights, |
560 // If alternate=non-ignorable and there are only | 582 // then we need not write anything. |
561 // common quaternary weights between two separators, | |
562 // then we need not write anything between these separators. | |
563 // The only weights greater than the merge separator and less th
an the common weight | 583 // The only weights greater than the merge separator and less th
an the common weight |
564 // are shifted primary weights, which are not generated for alte
rnate=non-ignorable. | 584 // are shifted primary weights, which are not generated for alte
rnate=non-ignorable. |
565 // There are also exactly as many quaternary weights as tertiary
weights, | 585 // There are also exactly as many quaternary weights as tertiary
weights, |
566 // so level length differences are handled already on tertiary l
evel. | 586 // so level length differences are handled already on tertiary l
evel. |
567 // Any above-common quaternary weight will compare greater regar
dless. | 587 // Any above-common quaternary weight will compare greater regar
dless. |
568 quaternaries.appendByte(q >> 8); | 588 quaternaries.appendByte(Collation::LEVEL_SEPARATOR_BYTE); |
569 } else { | 589 } else { |
570 if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) { | 590 if(q == Collation::NO_CE_WEIGHT16) { |
571 q >>= 8; | 591 q = Collation::LEVEL_SEPARATOR_BYTE; |
572 } else { | 592 } else { |
573 q = 0xfc + ((q >> 6) & 3); | 593 q = 0xfc + ((q >> 6) & 3); |
574 } | 594 } |
575 if(commonQuaternaries != 0) { | 595 if(commonQuaternaries != 0) { |
576 --commonQuaternaries; | 596 --commonQuaternaries; |
577 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { | 597 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { |
578 quaternaries.appendByte(QUAT_COMMON_MIDDLE); | 598 quaternaries.appendByte(QUAT_COMMON_MIDDLE); |
579 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; | 599 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; |
580 } | 600 } |
581 uint32_t b; | 601 uint32_t b; |
(...skipping 13 matching lines...) Expand all Loading... |
595 } | 615 } |
596 | 616 |
597 if(U_FAILURE(errorCode)) { return; } | 617 if(U_FAILURE(errorCode)) { return; } |
598 | 618 |
599 // Append the beyond-primary levels. | 619 // Append the beyond-primary levels. |
600 UBool ok = TRUE; | 620 UBool ok = TRUE; |
601 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { | 621 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { |
602 if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } | 622 if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } |
603 ok &= secondaries.isOk(); | 623 ok &= secondaries.isOk(); |
604 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 624 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
605 uint8_t *secs = secondaries.data(); | 625 secondaries.appendTo(sink); |
606 int32_t length = secondaries.length() - 1; // Ignore the trailing NO_CE
. | |
607 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { | |
608 // The backwards secondary level compares secondary weights backward
s | |
609 // within segments separated by the merge separator (U+FFFE, weight
02). | |
610 // The separator weights 01 & 02 were reduced to 00 & 01 so that | |
611 // we do not accidentally separate at a _second_ weight byte of 02. | |
612 int32_t start = 0; | |
613 for(;;) { | |
614 // Find the merge separator or the NO_CE terminator. | |
615 int32_t limit; | |
616 if(anyMergeSeparators) { | |
617 limit = start; | |
618 while(secs[limit] > 1) { ++limit; } | |
619 } else { | |
620 limit = length; | |
621 } | |
622 // Reverse this segment. | |
623 if(start < limit) { | |
624 uint8_t *p = secs + start; | |
625 uint8_t *q = secs + limit - 1; | |
626 while(p < q) { | |
627 uint8_t s = *p; | |
628 *p++ = *q; | |
629 *q-- = s; | |
630 } | |
631 } | |
632 // Did we reach the end of the string? | |
633 if(secs[limit] == 0) { break; } | |
634 // Restore the merge separator. | |
635 secs[limit] = 2; | |
636 // Skip the merge separator and continue. | |
637 start = limit + 1; | |
638 } | |
639 } | |
640 sink.Append(reinterpret_cast<char *>(secs), length); | |
641 } | 626 } |
642 | 627 |
643 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { | 628 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { |
644 if(!callback.needToWrite(Collation::CASE_LEVEL)) { return; } | 629 if(!callback.needToWrite(Collation::CASE_LEVEL)) { return; } |
645 ok &= cases.isOk(); | 630 ok &= cases.isOk(); |
646 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 631 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
647 // Write pairs of nibbles as bytes, except separator bytes as themselves
. | 632 // Write pairs of nibbles as bytes, except separator bytes as themselves
. |
648 int32_t length = cases.length() - 1; // Ignore the trailing NO_CE. | 633 int32_t length = cases.length() - 1; // Ignore the trailing NO_CE. |
649 uint8_t b = 0; | 634 uint8_t b = 0; |
650 for(int32_t i = 0; i < length; ++i) { | 635 for(int32_t i = 0; i < length; ++i) { |
651 uint8_t c = (uint8_t)cases[i]; | 636 uint8_t c = (uint8_t)cases[i]; |
652 if(c <= Collation::MERGE_SEPARATOR_BYTE) { | 637 U_ASSERT((c & 0xf) == 0 && c != 0); |
653 U_ASSERT(c != 0); | 638 if(b == 0) { |
654 if(b != 0) { | 639 b = c; |
655 sink.Append(b); | |
656 b = 0; | |
657 } | |
658 sink.Append(c); | |
659 } else { | 640 } else { |
660 U_ASSERT((c & 0xf) == 0); | 641 sink.Append(b | (c >> 4)); |
661 if(b == 0) { | 642 b = 0; |
662 b = c; | |
663 } else { | |
664 sink.Append(b | (c >> 4)); | |
665 b = 0; | |
666 } | |
667 } | 643 } |
668 } | 644 } |
669 if(b != 0) { | 645 if(b != 0) { |
670 sink.Append(b); | 646 sink.Append(b); |
671 } | 647 } |
672 } | 648 } |
673 | 649 |
674 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { | 650 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { |
675 if(!callback.needToWrite(Collation::TERTIARY_LEVEL)) { return; } | 651 if(!callback.needToWrite(Collation::TERTIARY_LEVEL)) { return; } |
676 ok &= tertiaries.isOk(); | 652 ok &= tertiaries.isOk(); |
677 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 653 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
678 tertiaries.appendTo(sink); | 654 tertiaries.appendTo(sink); |
679 } | 655 } |
680 | 656 |
681 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { | 657 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
682 if(!callback.needToWrite(Collation::QUATERNARY_LEVEL)) { return; } | 658 if(!callback.needToWrite(Collation::QUATERNARY_LEVEL)) { return; } |
683 ok &= quaternaries.isOk(); | 659 ok &= quaternaries.isOk(); |
684 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); | 660 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
685 quaternaries.appendTo(sink); | 661 quaternaries.appendTo(sink); |
686 } | 662 } |
687 | 663 |
688 if(!ok || !sink.IsOk()) { | 664 if(!ok || !sink.IsOk()) { |
689 errorCode = U_MEMORY_ALLOCATION_ERROR; | 665 errorCode = U_MEMORY_ALLOCATION_ERROR; |
690 } | 666 } |
691 } | 667 } |
692 | 668 |
693 U_NAMESPACE_END | 669 U_NAMESPACE_END |
694 | 670 |
695 #endif // !UCONFIG_NO_COLLATION | 671 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |