OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationkeys.cpp |
| 7 * |
| 8 * created on: 2012sep02 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/bytestream.h" |
| 17 #include "collation.h" |
| 18 #include "collationiterator.h" |
| 19 #include "collationkeys.h" |
| 20 #include "collationsettings.h" |
| 21 #include "uassert.h" |
| 22 |
| 23 U_NAMESPACE_BEGIN |
| 24 |
| 25 SortKeyByteSink::~SortKeyByteSink() {} |
| 26 |
| 27 void |
| 28 SortKeyByteSink::Append(const char *bytes, int32_t n) { |
| 29 if (n <= 0 || bytes == NULL) { |
| 30 return; |
| 31 } |
| 32 if (ignore_ > 0) { |
| 33 int32_t ignoreRest = ignore_ - n; |
| 34 if (ignoreRest >= 0) { |
| 35 ignore_ = ignoreRest; |
| 36 return; |
| 37 } else { |
| 38 bytes += ignore_; |
| 39 n = -ignoreRest; |
| 40 ignore_ = 0; |
| 41 } |
| 42 } |
| 43 int32_t length = appended_; |
| 44 appended_ += n; |
| 45 if ((buffer_ + length) == bytes) { |
| 46 return; // the caller used GetAppendBuffer() and wrote the bytes alread
y |
| 47 } |
| 48 int32_t available = capacity_ - length; |
| 49 if (n <= available) { |
| 50 uprv_memcpy(buffer_ + length, bytes, n); |
| 51 } else { |
| 52 AppendBeyondCapacity(bytes, n, length); |
| 53 } |
| 54 } |
| 55 |
| 56 char * |
| 57 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, |
| 58 int32_t desired_capacity_hint, |
| 59 char *scratch, |
| 60 int32_t scratch_capacity, |
| 61 int32_t *result_capacity) { |
| 62 if (min_capacity < 1 || scratch_capacity < min_capacity) { |
| 63 *result_capacity = 0; |
| 64 return NULL; |
| 65 } |
| 66 if (ignore_ > 0) { |
| 67 // Do not write ignored bytes right at the end of the buffer. |
| 68 *result_capacity = scratch_capacity; |
| 69 return scratch; |
| 70 } |
| 71 int32_t available = capacity_ - appended_; |
| 72 if (available >= min_capacity) { |
| 73 *result_capacity = available; |
| 74 return buffer_ + appended_; |
| 75 } else if (Resize(desired_capacity_hint, appended_)) { |
| 76 *result_capacity = capacity_ - appended_; |
| 77 return buffer_ + appended_; |
| 78 } else { |
| 79 *result_capacity = scratch_capacity; |
| 80 return scratch; |
| 81 } |
| 82 } |
| 83 |
| 84 namespace { |
| 85 |
| 86 /** |
| 87 * uint8_t byte buffer, similar to CharString but simpler. |
| 88 */ |
| 89 class SortKeyLevel : public UMemory { |
| 90 public: |
| 91 SortKeyLevel() : len(0), ok(TRUE) {} |
| 92 ~SortKeyLevel() {} |
| 93 |
| 94 /** @return FALSE if memory allocation failed */ |
| 95 UBool isOk() const { return ok; } |
| 96 UBool isEmpty() const { return len == 0; } |
| 97 int32_t length() const { return len; } |
| 98 const uint8_t *data() const { return buffer.getAlias(); } |
| 99 uint8_t operator[](int32_t index) const { return buffer[index]; } |
| 100 |
| 101 uint8_t *data() { return buffer.getAlias(); } |
| 102 |
| 103 void appendByte(uint32_t b); |
| 104 void appendWeight16(uint32_t w); |
| 105 void appendWeight32(uint32_t w); |
| 106 void appendReverseWeight16(uint32_t w); |
| 107 |
| 108 /** Appends all but the last byte to the sink. The last byte should be the 0
1 terminator. */ |
| 109 void appendTo(ByteSink &sink) const { |
| 110 U_ASSERT(len > 0 && buffer[len - 1] == 1); |
| 111 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len - 1); |
| 112 } |
| 113 |
| 114 private: |
| 115 MaybeStackArray<uint8_t, 40> buffer; |
| 116 int32_t len; |
| 117 UBool ok; |
| 118 |
| 119 UBool ensureCapacity(int32_t appendCapacity); |
| 120 |
| 121 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class |
| 122 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of thi
s class |
| 123 }; |
| 124 |
| 125 void SortKeyLevel::appendByte(uint32_t b) { |
| 126 if(len < buffer.getCapacity() || ensureCapacity(1)) { |
| 127 buffer[len++] = (uint8_t)b; |
| 128 } |
| 129 } |
| 130 |
| 131 void |
| 132 SortKeyLevel::appendWeight16(uint32_t w) { |
| 133 U_ASSERT((w & 0xffff) != 0); |
| 134 uint8_t b0 = (uint8_t)(w >> 8); |
| 135 uint8_t b1 = (uint8_t)w; |
| 136 int32_t appendLength = (b1 == 0) ? 1 : 2; |
| 137 if((len + appendLength) <= buffer.getCapacity() || ensureCapacity(appendLeng
th)) { |
| 138 buffer[len++] = b0; |
| 139 if(b1 != 0) { |
| 140 buffer[len++] = b1; |
| 141 } |
| 142 } |
| 143 } |
| 144 |
| 145 void |
| 146 SortKeyLevel::appendWeight32(uint32_t w) { |
| 147 U_ASSERT(w != 0); |
| 148 uint8_t bytes[4] = { (uint8_t)(w >> 24), (uint8_t)(w >> 16), (uint8_t)(w >>
8), (uint8_t)w }; |
| 149 int32_t appendLength = (bytes[1] == 0) ? 1 : (bytes[2] == 0) ? 2 : (bytes[3]
== 0) ? 3 : 4; |
| 150 if((len + appendLength) <= buffer.getCapacity() || ensureCapacity(appendLeng
th)) { |
| 151 buffer[len++] = bytes[0]; |
| 152 if(bytes[1] != 0) { |
| 153 buffer[len++] = bytes[1]; |
| 154 if(bytes[2] != 0) { |
| 155 buffer[len++] = bytes[2]; |
| 156 if(bytes[3] != 0) { |
| 157 buffer[len++] = bytes[3]; |
| 158 } |
| 159 } |
| 160 } |
| 161 } |
| 162 } |
| 163 |
| 164 void |
| 165 SortKeyLevel::appendReverseWeight16(uint32_t w) { |
| 166 U_ASSERT((w & 0xffff) != 0); |
| 167 uint8_t b0 = (uint8_t)(w >> 8); |
| 168 uint8_t b1 = (uint8_t)w; |
| 169 int32_t appendLength = (b1 == 0) ? 1 : 2; |
| 170 if((len + appendLength) <= buffer.getCapacity() || ensureCapacity(appendLeng
th)) { |
| 171 if(b1 == 0) { |
| 172 buffer[len++] = b0; |
| 173 } else { |
| 174 buffer[len] = b1; |
| 175 buffer[len + 1] = b0; |
| 176 len += 2; |
| 177 } |
| 178 } |
| 179 } |
| 180 |
| 181 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { |
| 182 if(!ok) { |
| 183 return FALSE; |
| 184 } |
| 185 int32_t newCapacity = 2 * buffer.getCapacity(); |
| 186 int32_t altCapacity = len + 2 * appendCapacity; |
| 187 if (newCapacity < altCapacity) { |
| 188 newCapacity = altCapacity; |
| 189 } |
| 190 if (newCapacity < 200) { |
| 191 newCapacity = 200; |
| 192 } |
| 193 if(buffer.resize(newCapacity, len)==NULL) { |
| 194 return ok = FALSE; |
| 195 } |
| 196 return TRUE; |
| 197 } |
| 198 |
| 199 } // namespace |
| 200 |
| 201 CollationKeys::LevelCallback::~LevelCallback() {} |
| 202 |
| 203 UBool |
| 204 CollationKeys::LevelCallback::needToWrite(Collation::Level /*level*/) { return T
RUE; } |
| 205 |
| 206 /** |
| 207 * Map from collation strength (UColAttributeValue) |
| 208 * to a mask of Collation::Level bits up to that strength, |
| 209 * excluding the CASE_LEVEL which is independent of the strength, |
| 210 * and excluding IDENTICAL_LEVEL which this function does not write. |
| 211 */ |
| 212 static const uint32_t levelMasks[UCOL_STRENGTH_LIMIT] = { |
| 213 2, // UCOL_PRIMARY -> PRIMARY_LEVEL |
| 214 6, // UCOL_SECONDARY -> up to SECONDARY_LEVEL |
| 215 0x16, // UCOL_TERTIARY -> up to TERTIARY_LEVEL |
| 216 0x36, // UCOL_QUATERNARY -> up to QUATERNARY_LEVEL |
| 217 0, 0, 0, 0, |
| 218 0, 0, 0, 0, |
| 219 0, 0, 0, |
| 220 0x36 // UCOL_IDENTICAL -> up to QUATERNARY_LEVEL |
| 221 }; |
| 222 |
| 223 void |
| 224 CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter, |
| 225 const UBool *compressibleBytes, |
| 226 const CollationSettings &settings, |
| 227 SortKeyByteSink &sink, |
| 228 Collation::Level minLevel, LevelCallba
ck &callback, |
| 229 UBool preflight, UErrorCode &errorCode
) { |
| 230 if(U_FAILURE(errorCode)) { return; } |
| 231 |
| 232 int32_t options = settings.options; |
| 233 // Set of levels to process and write. |
| 234 uint32_t levels = levelMasks[CollationSettings::getStrength(options)]; |
| 235 if((options & CollationSettings::CASE_LEVEL) != 0) { |
| 236 levels |= Collation::CASE_LEVEL_FLAG; |
| 237 } |
| 238 // Minus the levels below minLevel. |
| 239 levels &= ~(((uint32_t)1 << minLevel) - 1); |
| 240 if(levels == 0) { return; } |
| 241 |
| 242 uint32_t variableTop; |
| 243 if((options & CollationSettings::ALTERNATE_MASK) == 0) { |
| 244 variableTop = 0; |
| 245 } else { |
| 246 // +1 so that we can use "<" and primary ignorables test out early. |
| 247 variableTop = settings.variableTop + 1; |
| 248 } |
| 249 const uint8_t *reorderTable = settings.reorderTable; |
| 250 |
| 251 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); |
| 252 |
| 253 SortKeyLevel cases; |
| 254 SortKeyLevel secondaries; |
| 255 SortKeyLevel tertiaries; |
| 256 SortKeyLevel quaternaries; |
| 257 |
| 258 uint32_t compressedP1 = 0; // 0==no compression; otherwise reordered compre
ssible lead byte |
| 259 int32_t commonCases = 0; |
| 260 int32_t commonSecondaries = 0; |
| 261 int32_t commonTertiaries = 0; |
| 262 int32_t commonQuaternaries = 0; |
| 263 |
| 264 uint32_t prevSecondary = 0; |
| 265 UBool anyMergeSeparators = FALSE; |
| 266 |
| 267 for(;;) { |
| 268 // No need to keep all CEs in the buffer when we write a sort key. |
| 269 iter.clearCEsIfNoneRemaining(); |
| 270 int64_t ce = iter.nextCE(errorCode); |
| 271 uint32_t p = (uint32_t)(ce >> 32); |
| 272 if(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY) { |
| 273 // Variable CE, shift it to quaternary level. |
| 274 // Ignore all following primary ignorables, and shift further variab
le CEs. |
| 275 if(commonQuaternaries != 0) { |
| 276 --commonQuaternaries; |
| 277 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { |
| 278 quaternaries.appendByte(QUAT_COMMON_MIDDLE); |
| 279 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; |
| 280 } |
| 281 // Shifted primary weights are lower than the common weight. |
| 282 quaternaries.appendByte(QUAT_COMMON_LOW + commonQuaternaries); |
| 283 commonQuaternaries = 0; |
| 284 } |
| 285 do { |
| 286 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
| 287 uint32_t p1 = p >> 24; |
| 288 if(reorderTable != NULL) { p1 = reorderTable[p1]; } |
| 289 if(p1 >= QUAT_SHIFTED_LIMIT_BYTE) { |
| 290 // Prevent shifted primary lead bytes from |
| 291 // overlapping with the common compression range. |
| 292 quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE); |
| 293 } |
| 294 quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff)); |
| 295 } |
| 296 do { |
| 297 ce = iter.nextCE(errorCode); |
| 298 p = (uint32_t)(ce >> 32); |
| 299 } while(p == 0); |
| 300 } while(p < variableTop && p > Collation::MERGE_SEPARATOR_PRIMARY); |
| 301 } |
| 302 // ce could be primary ignorable, or NO_CE, or the merge separator, |
| 303 // or a regular primary CE, but it is not variable. |
| 304 // If ce==NO_CE, then write nothing for the primary level but |
| 305 // terminate compression on all levels and then exit the loop. |
| 306 if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FL
AG) != 0) { |
| 307 uint32_t p1 = p >> 24; |
| 308 if(reorderTable != NULL) { p1 = reorderTable[p1]; } |
| 309 if(p1 != compressedP1) { |
| 310 if(compressedP1 != 0) { |
| 311 if(p1 < compressedP1) { |
| 312 // No primary compression terminator |
| 313 // at the end of the level or merged segment. |
| 314 if(p1 > Collation::MERGE_SEPARATOR_BYTE) { |
| 315 sink.Append(Collation::PRIMARY_COMPRESSION_LOW_BYTE)
; |
| 316 } |
| 317 } else { |
| 318 sink.Append(Collation::PRIMARY_COMPRESSION_HIGH_BYTE); |
| 319 } |
| 320 } |
| 321 sink.Append(p1); |
| 322 // Test the un-reordered lead byte for compressibility but |
| 323 // remember the reordered lead byte. |
| 324 if(compressibleBytes[p >> 24]) { |
| 325 compressedP1 = p1; |
| 326 } else { |
| 327 compressedP1 = 0; |
| 328 } |
| 329 } |
| 330 char p2 = (char)(p >> 16); |
| 331 if(p2 != 0) { |
| 332 char buffer[3] = { p2, (char)(p >> 8), (char)p }; |
| 333 sink.Append(buffer, (buffer[1] == 0) ? 1 : (buffer[2] == 0) ? 2
: 3); |
| 334 } |
| 335 // Optimization for internalNextSortKeyPart(): |
| 336 // When the primary level overflows we can stop because we need not |
| 337 // calculate (preflight) the whole sort key length. |
| 338 if(!preflight && sink.Overflowed()) { |
| 339 if(U_SUCCESS(errorCode) && !sink.IsOk()) { |
| 340 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 341 } |
| 342 return; |
| 343 } |
| 344 } |
| 345 |
| 346 uint32_t lower32 = (uint32_t)ce; |
| 347 if(lower32 == 0) { continue; } // completely ignorable, no secondary/ca
se/tertiary/quaternary |
| 348 |
| 349 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { |
| 350 uint32_t s = lower32 >> 16; |
| 351 if(s == 0) { |
| 352 // secondary ignorable |
| 353 } else if(s == Collation::COMMON_WEIGHT16) { |
| 354 ++commonSecondaries; |
| 355 } else if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { |
| 356 if(commonSecondaries != 0) { |
| 357 --commonSecondaries; |
| 358 while(commonSecondaries >= SEC_COMMON_MAX_COUNT) { |
| 359 secondaries.appendByte(SEC_COMMON_MIDDLE); |
| 360 commonSecondaries -= SEC_COMMON_MAX_COUNT; |
| 361 } |
| 362 uint32_t b; |
| 363 if(s < Collation::COMMON_WEIGHT16) { |
| 364 b = SEC_COMMON_LOW + commonSecondaries; |
| 365 } else { |
| 366 b = SEC_COMMON_HIGH - commonSecondaries; |
| 367 } |
| 368 secondaries.appendByte(b); |
| 369 commonSecondaries = 0; |
| 370 } |
| 371 secondaries.appendWeight16(s); |
| 372 } else { |
| 373 if(commonSecondaries != 0) { |
| 374 --commonSecondaries; |
| 375 // Append reverse weights. The level will be re-reversed lat
er. |
| 376 int32_t remainder = commonSecondaries % SEC_COMMON_MAX_COUNT
; |
| 377 uint32_t b; |
| 378 if(prevSecondary < Collation::COMMON_WEIGHT16) { |
| 379 b = SEC_COMMON_LOW + remainder; |
| 380 } else { |
| 381 b = SEC_COMMON_HIGH - remainder; |
| 382 } |
| 383 secondaries.appendByte(b); |
| 384 commonSecondaries -= remainder; |
| 385 // commonSecondaries is now a multiple of SEC_COMMON_MAX_COU
NT. |
| 386 while(commonSecondaries > 0) { // same as >= SEC_COMMON_MAX
_COUNT |
| 387 secondaries.appendByte(SEC_COMMON_MIDDLE); |
| 388 commonSecondaries -= SEC_COMMON_MAX_COUNT; |
| 389 } |
| 390 // commonSecondaries == 0 |
| 391 } |
| 392 // Reduce separators so that we can look for byte<=1 later. |
| 393 if(s <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 394 if(s == Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 395 anyMergeSeparators = TRUE; |
| 396 } |
| 397 secondaries.appendByte((s >> 8) - 1); |
| 398 } else { |
| 399 secondaries.appendReverseWeight16(s); |
| 400 } |
| 401 prevSecondary = s; |
| 402 } |
| 403 } |
| 404 |
| 405 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { |
| 406 if((CollationSettings::getStrength(options) == UCOL_PRIMARY) ? |
| 407 p == 0 : lower32 <= 0xffff) { |
| 408 // Primary+caseLevel: Ignore case level weights of primary ignor
ables. |
| 409 // Otherwise: Ignore case level weights of secondary ignorables. |
| 410 // For details see the comments in the CollationCompare class. |
| 411 } else { |
| 412 uint32_t c = (lower32 >> 8) & 0xff; // case bits & tertiary lea
d byte |
| 413 U_ASSERT((c & 0xc0) != 0xc0); |
| 414 if((c & 0xc0) == 0 && c > Collation::MERGE_SEPARATOR_BYTE) { |
| 415 ++commonCases; |
| 416 } else { |
| 417 if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 418 // lowerFirst: Compress common weights to nibbles 1..7..
13, mixed=14, upper=15. |
| 419 if(commonCases != 0) { |
| 420 --commonCases; |
| 421 while(commonCases >= CASE_LOWER_FIRST_COMMON_MAX_COU
NT) { |
| 422 cases.appendByte(CASE_LOWER_FIRST_COMMON_MIDDLE
<< 4); |
| 423 commonCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT
; |
| 424 } |
| 425 uint32_t b; |
| 426 if(c <= Collation::MERGE_SEPARATOR_BYTE) { |
| 427 b = CASE_LOWER_FIRST_COMMON_LOW + commonCases; |
| 428 } else { |
| 429 b = CASE_LOWER_FIRST_COMMON_HIGH - commonCases; |
| 430 } |
| 431 cases.appendByte(b << 4); |
| 432 commonCases = 0; |
| 433 } |
| 434 if(c > Collation::MERGE_SEPARATOR_BYTE) { |
| 435 c = (CASE_LOWER_FIRST_COMMON_HIGH + (c >> 6)) << 4;
// 14 or 15 |
| 436 } |
| 437 } else { |
| 438 // upperFirst: Compress common weights to nibbles 3..15,
mixed=2, upper=1. |
| 439 // The compressed common case weights only go up from th
e "low" value |
| 440 // because with upperFirst the common weight is the high
est one. |
| 441 if(commonCases != 0) { |
| 442 --commonCases; |
| 443 while(commonCases >= CASE_UPPER_FIRST_COMMON_MAX_COU
NT) { |
| 444 cases.appendByte(CASE_UPPER_FIRST_COMMON_LOW <<
4); |
| 445 commonCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT
; |
| 446 } |
| 447 cases.appendByte((CASE_UPPER_FIRST_COMMON_LOW + comm
onCases) << 4); |
| 448 commonCases = 0; |
| 449 } |
| 450 if(c > Collation::MERGE_SEPARATOR_BYTE) { |
| 451 c = (CASE_UPPER_FIRST_COMMON_LOW - (c >> 6)) << 4;
// 2 or 1 |
| 452 } |
| 453 } |
| 454 // c is a separator byte 01 or 02, |
| 455 // or a left-shifted nibble 0x10, 0x20, ... 0xf0. |
| 456 cases.appendByte(c); |
| 457 } |
| 458 } |
| 459 } |
| 460 |
| 461 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { |
| 462 uint32_t t = lower32 & tertiaryMask; |
| 463 U_ASSERT((lower32 & 0xc000) != 0xc000); |
| 464 if(t == Collation::COMMON_WEIGHT16) { |
| 465 ++commonTertiaries; |
| 466 } else if((tertiaryMask & 0x8000) == 0) { |
| 467 // Tertiary weights without case bits. |
| 468 // Move lead bytes 06..3F to C6..FF for a large common-weight ra
nge. |
| 469 if(commonTertiaries != 0) { |
| 470 --commonTertiaries; |
| 471 while(commonTertiaries >= TER_ONLY_COMMON_MAX_COUNT) { |
| 472 tertiaries.appendByte(TER_ONLY_COMMON_MIDDLE); |
| 473 commonTertiaries -= TER_ONLY_COMMON_MAX_COUNT; |
| 474 } |
| 475 uint32_t b; |
| 476 if(t < Collation::COMMON_WEIGHT16) { |
| 477 b = TER_ONLY_COMMON_LOW + commonTertiaries; |
| 478 } else { |
| 479 b = TER_ONLY_COMMON_HIGH - commonTertiaries; |
| 480 } |
| 481 tertiaries.appendByte(b); |
| 482 commonTertiaries = 0; |
| 483 } |
| 484 if(t > Collation::COMMON_WEIGHT16) { t += 0xc000; } |
| 485 tertiaries.appendWeight16(t); |
| 486 } else if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 487 // Tertiary weights with caseFirst=lowerFirst. |
| 488 // Move lead bytes 06..BF to 46..FF for the common-weight range. |
| 489 if(commonTertiaries != 0) { |
| 490 --commonTertiaries; |
| 491 while(commonTertiaries >= TER_LOWER_FIRST_COMMON_MAX_COUNT)
{ |
| 492 tertiaries.appendByte(TER_LOWER_FIRST_COMMON_MIDDLE); |
| 493 commonTertiaries -= TER_LOWER_FIRST_COMMON_MAX_COUNT; |
| 494 } |
| 495 uint32_t b; |
| 496 if(t < Collation::COMMON_WEIGHT16) { |
| 497 b = TER_LOWER_FIRST_COMMON_LOW + commonTertiaries; |
| 498 } else { |
| 499 b = TER_LOWER_FIRST_COMMON_HIGH - commonTertiaries; |
| 500 } |
| 501 tertiaries.appendByte(b); |
| 502 commonTertiaries = 0; |
| 503 } |
| 504 if(t > Collation::COMMON_WEIGHT16) { t += 0x4000; } |
| 505 tertiaries.appendWeight16(t); |
| 506 } else { |
| 507 // Tertiary weights with caseFirst=upperFirst. |
| 508 // Do not change the artificial uppercase weight of a tertiary C
E (0.0.ut), |
| 509 // to keep tertiary CEs well-formed. |
| 510 // Their case+tertiary weights must be greater than those of |
| 511 // primary and secondary CEs. |
| 512 // |
| 513 // Separators 01..02 -> 01..02 (unchanged) |
| 514 // Lowercase 03..04 -> 83..84 (includes uncased) |
| 515 // Common weight 05 -> 85..C5 (common-weight compression ra
nge) |
| 516 // Lowercase 06..3F -> C6..FF |
| 517 // Mixed case 43..7F -> 43..7F |
| 518 // Uppercase 83..BF -> 03..3F |
| 519 // Tertiary CE 86..BF -> C6..FF |
| 520 if(t <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 521 // Keep separators unchanged. |
| 522 } else if(lower32 > 0xffff) { |
| 523 // Invert case bits of primary & secondary CEs. |
| 524 t ^= 0xc000; |
| 525 if(t < (TER_UPPER_FIRST_COMMON_HIGH << 8)) { |
| 526 t -= 0x4000; |
| 527 } |
| 528 } else { |
| 529 // Keep uppercase bits of tertiary CEs. |
| 530 U_ASSERT(0x8600 <= t && t <= 0xbfff); |
| 531 t += 0x4000; |
| 532 } |
| 533 if(commonTertiaries != 0) { |
| 534 --commonTertiaries; |
| 535 while(commonTertiaries >= TER_UPPER_FIRST_COMMON_MAX_COUNT)
{ |
| 536 tertiaries.appendByte(TER_UPPER_FIRST_COMMON_MIDDLE); |
| 537 commonTertiaries -= TER_UPPER_FIRST_COMMON_MAX_COUNT; |
| 538 } |
| 539 uint32_t b; |
| 540 if(t < (TER_UPPER_FIRST_COMMON_LOW << 8)) { |
| 541 b = TER_UPPER_FIRST_COMMON_LOW + commonTertiaries; |
| 542 } else { |
| 543 b = TER_UPPER_FIRST_COMMON_HIGH - commonTertiaries; |
| 544 } |
| 545 tertiaries.appendByte(b); |
| 546 commonTertiaries = 0; |
| 547 } |
| 548 tertiaries.appendWeight16(t); |
| 549 } |
| 550 } |
| 551 |
| 552 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
| 553 uint32_t q = lower32 & 0xffff; |
| 554 if((q & 0xc0) == 0 && q > Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 555 ++commonQuaternaries; |
| 556 } else if(q <= Collation::MERGE_SEPARATOR_WEIGHT16 && |
| 557 (options & CollationSettings::ALTERNATE_MASK) == 0 && |
| 558 (quaternaries.isEmpty() || |
| 559 quaternaries[quaternaries.length() - 1] == Collation::ME
RGE_SEPARATOR_BYTE)) { |
| 560 // If alternate=non-ignorable and there are only |
| 561 // common quaternary weights between two separators, |
| 562 // then we need not write anything between these separators. |
| 563 // The only weights greater than the merge separator and less th
an the common weight |
| 564 // are shifted primary weights, which are not generated for alte
rnate=non-ignorable. |
| 565 // There are also exactly as many quaternary weights as tertiary
weights, |
| 566 // so level length differences are handled already on tertiary l
evel. |
| 567 // Any above-common quaternary weight will compare greater regar
dless. |
| 568 quaternaries.appendByte(q >> 8); |
| 569 } else { |
| 570 if(q <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 571 q >>= 8; |
| 572 } else { |
| 573 q = 0xfc + ((q >> 6) & 3); |
| 574 } |
| 575 if(commonQuaternaries != 0) { |
| 576 --commonQuaternaries; |
| 577 while(commonQuaternaries >= QUAT_COMMON_MAX_COUNT) { |
| 578 quaternaries.appendByte(QUAT_COMMON_MIDDLE); |
| 579 commonQuaternaries -= QUAT_COMMON_MAX_COUNT; |
| 580 } |
| 581 uint32_t b; |
| 582 if(q < QUAT_COMMON_LOW) { |
| 583 b = QUAT_COMMON_LOW + commonQuaternaries; |
| 584 } else { |
| 585 b = QUAT_COMMON_HIGH - commonQuaternaries; |
| 586 } |
| 587 quaternaries.appendByte(b); |
| 588 commonQuaternaries = 0; |
| 589 } |
| 590 quaternaries.appendByte(q); |
| 591 } |
| 592 } |
| 593 |
| 594 if((lower32 >> 24) == Collation::LEVEL_SEPARATOR_BYTE) { break; } // ce
== NO_CE |
| 595 } |
| 596 |
| 597 if(U_FAILURE(errorCode)) { return; } |
| 598 |
| 599 // Append the beyond-primary levels. |
| 600 UBool ok = TRUE; |
| 601 if((levels & Collation::SECONDARY_LEVEL_FLAG) != 0) { |
| 602 if(!callback.needToWrite(Collation::SECONDARY_LEVEL)) { return; } |
| 603 ok &= secondaries.isOk(); |
| 604 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 605 uint8_t *secs = secondaries.data(); |
| 606 int32_t length = secondaries.length() - 1; // Ignore the trailing NO_CE
. |
| 607 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
| 608 // The backwards secondary level compares secondary weights backward
s |
| 609 // within segments separated by the merge separator (U+FFFE, weight
02). |
| 610 // The separator weights 01 & 02 were reduced to 00 & 01 so that |
| 611 // we do not accidentally separate at a _second_ weight byte of 02. |
| 612 int32_t start = 0; |
| 613 for(;;) { |
| 614 // Find the merge separator or the NO_CE terminator. |
| 615 int32_t limit; |
| 616 if(anyMergeSeparators) { |
| 617 limit = start; |
| 618 while(secs[limit] > 1) { ++limit; } |
| 619 } else { |
| 620 limit = length; |
| 621 } |
| 622 // Reverse this segment. |
| 623 if(start < limit) { |
| 624 uint8_t *p = secs + start; |
| 625 uint8_t *q = secs + limit - 1; |
| 626 while(p < q) { |
| 627 uint8_t s = *p; |
| 628 *p++ = *q; |
| 629 *q-- = s; |
| 630 } |
| 631 } |
| 632 // Did we reach the end of the string? |
| 633 if(secs[limit] == 0) { break; } |
| 634 // Restore the merge separator. |
| 635 secs[limit] = 2; |
| 636 // Skip the merge separator and continue. |
| 637 start = limit + 1; |
| 638 } |
| 639 } |
| 640 sink.Append(reinterpret_cast<char *>(secs), length); |
| 641 } |
| 642 |
| 643 if((levels & Collation::CASE_LEVEL_FLAG) != 0) { |
| 644 if(!callback.needToWrite(Collation::CASE_LEVEL)) { return; } |
| 645 ok &= cases.isOk(); |
| 646 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 647 // Write pairs of nibbles as bytes, except separator bytes as themselves
. |
| 648 int32_t length = cases.length() - 1; // Ignore the trailing NO_CE. |
| 649 uint8_t b = 0; |
| 650 for(int32_t i = 0; i < length; ++i) { |
| 651 uint8_t c = (uint8_t)cases[i]; |
| 652 if(c <= Collation::MERGE_SEPARATOR_BYTE) { |
| 653 U_ASSERT(c != 0); |
| 654 if(b != 0) { |
| 655 sink.Append(b); |
| 656 b = 0; |
| 657 } |
| 658 sink.Append(c); |
| 659 } else { |
| 660 U_ASSERT((c & 0xf) == 0); |
| 661 if(b == 0) { |
| 662 b = c; |
| 663 } else { |
| 664 sink.Append(b | (c >> 4)); |
| 665 b = 0; |
| 666 } |
| 667 } |
| 668 } |
| 669 if(b != 0) { |
| 670 sink.Append(b); |
| 671 } |
| 672 } |
| 673 |
| 674 if((levels & Collation::TERTIARY_LEVEL_FLAG) != 0) { |
| 675 if(!callback.needToWrite(Collation::TERTIARY_LEVEL)) { return; } |
| 676 ok &= tertiaries.isOk(); |
| 677 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 678 tertiaries.appendTo(sink); |
| 679 } |
| 680 |
| 681 if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) { |
| 682 if(!callback.needToWrite(Collation::QUATERNARY_LEVEL)) { return; } |
| 683 ok &= quaternaries.isOk(); |
| 684 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); |
| 685 quaternaries.appendTo(sink); |
| 686 } |
| 687 |
| 688 if(!ok || !sink.IsOk()) { |
| 689 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 690 } |
| 691 } |
| 692 |
| 693 U_NAMESPACE_END |
| 694 |
| 695 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |