OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2013-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationbuilder.cpp |
| 7 * |
| 8 * (replaced the former ucol_bld.cpp) |
| 9 * |
| 10 * created on: 2013may06 |
| 11 * created by: Markus W. Scherer |
| 12 */ |
| 13 |
| 14 #ifdef DEBUG_COLLATION_BUILDER |
| 15 #include <stdio.h> |
| 16 #endif |
| 17 |
| 18 #include "unicode/utypes.h" |
| 19 |
| 20 #if !UCONFIG_NO_COLLATION |
| 21 |
| 22 #include "unicode/caniter.h" |
| 23 #include "unicode/normalizer2.h" |
| 24 #include "unicode/tblcoll.h" |
| 25 #include "unicode/parseerr.h" |
| 26 #include "unicode/uchar.h" |
| 27 #include "unicode/ucol.h" |
| 28 #include "unicode/unistr.h" |
| 29 #include "unicode/usetiter.h" |
| 30 #include "unicode/utf16.h" |
| 31 #include "unicode/uversion.h" |
| 32 #include "cmemory.h" |
| 33 #include "collation.h" |
| 34 #include "collationbuilder.h" |
| 35 #include "collationdata.h" |
| 36 #include "collationdatabuilder.h" |
| 37 #include "collationfastlatin.h" |
| 38 #include "collationroot.h" |
| 39 #include "collationrootelements.h" |
| 40 #include "collationruleparser.h" |
| 41 #include "collationsettings.h" |
| 42 #include "collationtailoring.h" |
| 43 #include "collationweights.h" |
| 44 #include "normalizer2impl.h" |
| 45 #include "uassert.h" |
| 46 #include "ucol_imp.h" |
| 47 #include "utf16collationiterator.h" |
| 48 |
| 49 U_NAMESPACE_BEGIN |
| 50 |
| 51 namespace { |
| 52 |
| 53 class BundleImporter : public CollationRuleParser::Importer { |
| 54 public: |
| 55 BundleImporter() {} |
| 56 virtual ~BundleImporter(); |
| 57 virtual void getRules( |
| 58 const char *localeID, const char *collationType, |
| 59 UnicodeString &rules, |
| 60 const char *&errorReason, UErrorCode &errorCode); |
| 61 }; |
| 62 |
| 63 BundleImporter::~BundleImporter() {} |
| 64 |
| 65 void |
| 66 BundleImporter::getRules( |
| 67 const char *localeID, const char *collationType, |
| 68 UnicodeString &rules, |
| 69 const char *& /*errorReason*/, UErrorCode &errorCode) { |
| 70 CollationLoader::loadRules(localeID, collationType, rules, errorCode); |
| 71 } |
| 72 |
| 73 } // namespace |
| 74 |
| 75 // RuleBasedCollator implementation ---------------------------------------- *** |
| 76 |
| 77 // These methods are here, rather than in rulebasedcollator.cpp, |
| 78 // for modularization: |
| 79 // Most code using Collator does not need to build a Collator from rules. |
| 80 // By moving these constructors and helper methods to a separate file, |
| 81 // most code will not have a static dependency on the builder code. |
| 82 |
| 83 RuleBasedCollator::RuleBasedCollator() |
| 84 : data(NULL), |
| 85 settings(NULL), |
| 86 tailoring(NULL), |
| 87 cacheEntry(NULL), |
| 88 validLocale(""), |
| 89 explicitlySetAttributes(0), |
| 90 actualLocaleIsSameAsValid(FALSE) { |
| 91 } |
| 92 |
| 93 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &err
orCode) |
| 94 : data(NULL), |
| 95 settings(NULL), |
| 96 tailoring(NULL), |
| 97 cacheEntry(NULL), |
| 98 validLocale(""), |
| 99 explicitlySetAttributes(0), |
| 100 actualLocaleIsSameAsValid(FALSE) { |
| 101 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorC
ode); |
| 102 } |
| 103 |
| 104 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStren
gth strength, |
| 105 UErrorCode &errorCode) |
| 106 : data(NULL), |
| 107 settings(NULL), |
| 108 tailoring(NULL), |
| 109 cacheEntry(NULL), |
| 110 validLocale(""), |
| 111 explicitlySetAttributes(0), |
| 112 actualLocaleIsSameAsValid(FALSE) { |
| 113 internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode)
; |
| 114 } |
| 115 |
| 116 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, |
| 117 UColAttributeValue decompositionMode, |
| 118 UErrorCode &errorCode) |
| 119 : data(NULL), |
| 120 settings(NULL), |
| 121 tailoring(NULL), |
| 122 cacheEntry(NULL), |
| 123 validLocale(""), |
| 124 explicitlySetAttributes(0), |
| 125 actualLocaleIsSameAsValid(FALSE) { |
| 126 internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, e
rrorCode); |
| 127 } |
| 128 |
| 129 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, |
| 130 ECollationStrength strength, |
| 131 UColAttributeValue decompositionMode, |
| 132 UErrorCode &errorCode) |
| 133 : data(NULL), |
| 134 settings(NULL), |
| 135 tailoring(NULL), |
| 136 cacheEntry(NULL), |
| 137 validLocale(""), |
| 138 explicitlySetAttributes(0), |
| 139 actualLocaleIsSameAsValid(FALSE) { |
| 140 internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, error
Code); |
| 141 } |
| 142 |
| 143 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, |
| 144 UParseError &parseError, UnicodeString &rea
son, |
| 145 UErrorCode &errorCode) |
| 146 : data(NULL), |
| 147 settings(NULL), |
| 148 tailoring(NULL), |
| 149 cacheEntry(NULL), |
| 150 validLocale(""), |
| 151 explicitlySetAttributes(0), |
| 152 actualLocaleIsSameAsValid(FALSE) { |
| 153 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reas
on, errorCode); |
| 154 } |
| 155 |
| 156 void |
| 157 RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, |
| 158 int32_t strength, |
| 159 UColAttributeValue decompositionMode, |
| 160 UParseError *outParseError, UnicodeStr
ing *outReason, |
| 161 UErrorCode &errorCode) { |
| 162 const CollationTailoring *base = CollationRoot::getRoot(errorCode); |
| 163 if(U_FAILURE(errorCode)) { return; } |
| 164 if(outReason != NULL) { outReason->remove(); } |
| 165 CollationBuilder builder(base, errorCode); |
| 166 UVersionInfo noVersion = { 0, 0, 0, 0 }; |
| 167 BundleImporter importer; |
| 168 LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, |
| 169 &importer, |
| 170 outParseError, erro
rCode)); |
| 171 if(U_FAILURE(errorCode)) { |
| 172 const char *reason = builder.getErrorReason(); |
| 173 if(reason != NULL && outReason != NULL) { |
| 174 *outReason = UnicodeString(reason, -1, US_INV); |
| 175 } |
| 176 return; |
| 177 } |
| 178 t->actualLocale.setToBogus(); |
| 179 adoptTailoring(t.orphan(), errorCode); |
| 180 // Set attributes after building the collator, |
| 181 // to keep the default settings consistent with the rule string. |
| 182 if(strength != UCOL_DEFAULT) { |
| 183 setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode); |
| 184 } |
| 185 if(decompositionMode != UCOL_DEFAULT) { |
| 186 setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode); |
| 187 } |
| 188 } |
| 189 |
| 190 // CollationBuilder implementation ----------------------------------------- *** |
| 191 |
| 192 CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &erro
rCode) |
| 193 : nfd(*Normalizer2::getNFDInstance(errorCode)), |
| 194 fcd(*Normalizer2Factory::getFCDInstance(errorCode)), |
| 195 nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), |
| 196 base(b), |
| 197 baseData(b->data), |
| 198 rootElements(b->data->rootElements, b->data->rootElementsLength), |
| 199 variableTop(0), |
| 200 dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRU
E), |
| 201 errorReason(NULL), |
| 202 cesLength(0), |
| 203 rootPrimaryIndexes(errorCode), nodes(errorCode) { |
| 204 nfcImpl.ensureCanonIterData(errorCode); |
| 205 if(U_FAILURE(errorCode)) { |
| 206 errorReason = "CollationBuilder fields initialization failed"; |
| 207 return; |
| 208 } |
| 209 if(dataBuilder == NULL) { |
| 210 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 211 return; |
| 212 } |
| 213 dataBuilder->initForTailoring(baseData, errorCode); |
| 214 if(U_FAILURE(errorCode)) { |
| 215 errorReason = "CollationBuilder initialization failed"; |
| 216 } |
| 217 } |
| 218 |
| 219 CollationBuilder::~CollationBuilder() { |
| 220 delete dataBuilder; |
| 221 } |
| 222 |
| 223 CollationTailoring * |
| 224 CollationBuilder::parseAndBuild(const UnicodeString &ruleString, |
| 225 const UVersionInfo rulesVersion, |
| 226 CollationRuleParser::Importer *importer, |
| 227 UParseError *outParseError, |
| 228 UErrorCode &errorCode) { |
| 229 if(U_FAILURE(errorCode)) { return NULL; } |
| 230 if(baseData->rootElements == NULL) { |
| 231 errorCode = U_MISSING_RESOURCE_ERROR; |
| 232 errorReason = "missing root elements data, tailoring not supported"; |
| 233 return NULL; |
| 234 } |
| 235 LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->sett
ings)); |
| 236 if(tailoring.isNull() || tailoring->isBogus()) { |
| 237 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 238 return NULL; |
| 239 } |
| 240 CollationRuleParser parser(baseData, errorCode); |
| 241 if(U_FAILURE(errorCode)) { return NULL; } |
| 242 // Note: This always bases &[last variable] and &[first regular] |
| 243 // on the root collator's maxVariable/variableTop. |
| 244 // If we wanted this to change after [maxVariable x], then we would keep |
| 245 // the tailoring.settings pointer here and read its variableTop when we need
it. |
| 246 // See http://unicode.org/cldr/trac/ticket/6070 |
| 247 variableTop = base->settings->variableTop; |
| 248 parser.setSink(this); |
| 249 parser.setImporter(importer); |
| 250 CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->set
tings); |
| 251 parser.parse(ruleString, ownedSettings, outParseError, errorCode); |
| 252 errorReason = parser.getErrorReason(); |
| 253 if(U_FAILURE(errorCode)) { return NULL; } |
| 254 if(dataBuilder->hasMappings()) { |
| 255 makeTailoredCEs(errorCode); |
| 256 closeOverComposites(errorCode); |
| 257 finalizeCEs(errorCode); |
| 258 // Copy all of ASCII, and Latin-1 letters, into each tailoring. |
| 259 optimizeSet.add(0, 0x7f); |
| 260 optimizeSet.add(0xc0, 0xff); |
| 261 // Hangul is decomposed on the fly during collation, |
| 262 // and the tailoring data is always built with HANGUL_TAG specials. |
| 263 optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); |
| 264 dataBuilder->optimize(optimizeSet, errorCode); |
| 265 tailoring->ensureOwnedData(errorCode); |
| 266 if(U_FAILURE(errorCode)) { return NULL; } |
| 267 if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } |
| 268 dataBuilder->build(*tailoring->ownedData, errorCode); |
| 269 tailoring->builder = dataBuilder; |
| 270 dataBuilder = NULL; |
| 271 } else { |
| 272 tailoring->data = baseData; |
| 273 } |
| 274 if(U_FAILURE(errorCode)) { return NULL; } |
| 275 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( |
| 276 tailoring->data, ownedSettings, |
| 277 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinP
rimaries)); |
| 278 tailoring->rules = ruleString; |
| 279 tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination |
| 280 tailoring->setVersion(base->version, rulesVersion); |
| 281 return tailoring.orphan(); |
| 282 } |
| 283 |
| 284 void |
| 285 CollationBuilder::addReset(int32_t strength, const UnicodeString &str, |
| 286 const char *&parserErrorReason, UErrorCode &errorCode
) { |
| 287 if(U_FAILURE(errorCode)) { return; } |
| 288 U_ASSERT(!str.isEmpty()); |
| 289 if(str.charAt(0) == CollationRuleParser::POS_LEAD) { |
| 290 ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); |
| 291 cesLength = 1; |
| 292 if(U_FAILURE(errorCode)) { return; } |
| 293 U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); |
| 294 } else { |
| 295 // normal reset to a character or string |
| 296 UnicodeString nfdString = nfd.normalize(str, errorCode); |
| 297 if(U_FAILURE(errorCode)) { |
| 298 parserErrorReason = "normalizing the reset position"; |
| 299 return; |
| 300 } |
| 301 cesLength = dataBuilder->getCEs(nfdString, ces, 0); |
| 302 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { |
| 303 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 304 parserErrorReason = "reset position maps to too many collation eleme
nts (more than 31)"; |
| 305 return; |
| 306 } |
| 307 } |
| 308 if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position |
| 309 |
| 310 // &[before strength]position |
| 311 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); |
| 312 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCod
e); |
| 313 if(U_FAILURE(errorCode)) { return; } |
| 314 |
| 315 int64_t node = nodes.elementAti(index); |
| 316 // If the index is for a "weaker" tailored node, |
| 317 // then skip backwards over this and further "weaker" nodes. |
| 318 while(strengthFromNode(node) > strength) { |
| 319 index = previousIndexFromNode(node); |
| 320 node = nodes.elementAti(index); |
| 321 } |
| 322 |
| 323 // Find or insert a node whose index we will put into a temporary CE. |
| 324 if(strengthFromNode(node) == strength && isTailoredNode(node)) { |
| 325 // Reset to just before this same-strength tailored node. |
| 326 index = previousIndexFromNode(node); |
| 327 } else if(strength == UCOL_PRIMARY) { |
| 328 // root primary node (has no previous index) |
| 329 uint32_t p = weight32FromNode(node); |
| 330 if(p == 0) { |
| 331 errorCode = U_UNSUPPORTED_ERROR; |
| 332 parserErrorReason = "reset primary-before ignorable not possible"; |
| 333 return; |
| 334 } |
| 335 if(p <= rootElements.getFirstPrimary()) { |
| 336 // There is no primary gap between ignorables and the space-first-pr
imary. |
| 337 errorCode = U_UNSUPPORTED_ERROR; |
| 338 parserErrorReason = "reset primary-before first non-ignorable not su
pported"; |
| 339 return; |
| 340 } |
| 341 if(p == Collation::FIRST_TRAILING_PRIMARY) { |
| 342 // We do not support tailoring to an unassigned-implicit CE. |
| 343 errorCode = U_UNSUPPORTED_ERROR; |
| 344 parserErrorReason = "reset primary-before [first trailing] not suppo
rted"; |
| 345 return; |
| 346 } |
| 347 p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p))
; |
| 348 index = findOrInsertNodeForPrimary(p, errorCode); |
| 349 // Go to the last node in this list: |
| 350 // Tailor after the last node between adjacent root nodes. |
| 351 for(;;) { |
| 352 node = nodes.elementAti(index); |
| 353 int32_t nextIndex = nextIndexFromNode(node); |
| 354 if(nextIndex == 0) { break; } |
| 355 index = nextIndex; |
| 356 } |
| 357 } else { |
| 358 // &[before 2] or &[before 3] |
| 359 index = findCommonNode(index, UCOL_SECONDARY); |
| 360 if(strength >= UCOL_TERTIARY) { |
| 361 index = findCommonNode(index, UCOL_TERTIARY); |
| 362 } |
| 363 node = nodes.elementAti(index); |
| 364 if(strengthFromNode(node) == strength) { |
| 365 // Found a same-strength node with an explicit weight. |
| 366 uint32_t weight16 = weight16FromNode(node); |
| 367 if(weight16 == 0) { |
| 368 errorCode = U_UNSUPPORTED_ERROR; |
| 369 if(strength == UCOL_SECONDARY) { |
| 370 parserErrorReason = "reset secondary-before secondary ignora
ble not possible"; |
| 371 } else { |
| 372 parserErrorReason = "reset tertiary-before completely ignora
ble not possible"; |
| 373 } |
| 374 return; |
| 375 } |
| 376 U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16); |
| 377 int32_t previousIndex = previousIndexFromNode(node); |
| 378 if(weight16 == Collation::COMMON_WEIGHT16) { |
| 379 // Reset to just before this same-strength common-weight node. |
| 380 index = previousIndex; |
| 381 } else { |
| 382 // A non-common weight is only possible from a root CE. |
| 383 // Find the higher-level weights, which must all be explicit, |
| 384 // and then find the preceding weight for this level. |
| 385 uint32_t previousWeight16 = 0; |
| 386 int32_t previousWeightIndex = -1; |
| 387 int32_t i = index; |
| 388 if(strength == UCOL_SECONDARY) { |
| 389 uint32_t p; |
| 390 do { |
| 391 i = previousIndexFromNode(node); |
| 392 node = nodes.elementAti(i); |
| 393 if(strengthFromNode(node) == UCOL_SECONDARY && !isTailor
edNode(node) && |
| 394 previousWeightIndex < 0) { |
| 395 previousWeightIndex = i; |
| 396 previousWeight16 = weight16FromNode(node); |
| 397 } |
| 398 } while(strengthFromNode(node) > UCOL_PRIMARY); |
| 399 U_ASSERT(!isTailoredNode(node)); |
| 400 p = weight32FromNode(node); |
| 401 weight16 = rootElements.getSecondaryBefore(p, weight16); |
| 402 } else { |
| 403 uint32_t p, s; |
| 404 do { |
| 405 i = previousIndexFromNode(node); |
| 406 node = nodes.elementAti(i); |
| 407 if(strengthFromNode(node) == UCOL_TERTIARY && !isTailore
dNode(node) && |
| 408 previousWeightIndex < 0) { |
| 409 previousWeightIndex = i; |
| 410 previousWeight16 = weight16FromNode(node); |
| 411 } |
| 412 } while(strengthFromNode(node) > UCOL_SECONDARY); |
| 413 U_ASSERT(!isTailoredNode(node)); |
| 414 if(strengthFromNode(node) == UCOL_SECONDARY) { |
| 415 s = weight16FromNode(node); |
| 416 do { |
| 417 i = previousIndexFromNode(node); |
| 418 node = nodes.elementAti(i); |
| 419 } while(strengthFromNode(node) > UCOL_PRIMARY); |
| 420 U_ASSERT(!isTailoredNode(node)); |
| 421 } else { |
| 422 U_ASSERT(!nodeHasBefore2(node)); |
| 423 s = Collation::COMMON_WEIGHT16; |
| 424 } |
| 425 p = weight32FromNode(node); |
| 426 weight16 = rootElements.getTertiaryBefore(p, s, weight16); |
| 427 U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); |
| 428 } |
| 429 // Find or insert the new explicit weight before the current one
. |
| 430 if(previousWeightIndex >= 0 && weight16 == previousWeight16) { |
| 431 // Tailor after the last node between adjacent root nodes. |
| 432 index = previousIndex; |
| 433 } else { |
| 434 node = nodeFromWeight16(weight16) | nodeFromStrength(strengt
h); |
| 435 index = insertNodeBetween(previousIndex, index, node, errorC
ode); |
| 436 } |
| 437 } |
| 438 } else { |
| 439 // Found a stronger node with implied strength-common weight. |
| 440 int64_t hasBefore3 = 0; |
| 441 if(strength == UCOL_SECONDARY) { |
| 442 U_ASSERT(!nodeHasBefore2(node)); |
| 443 // Move the HAS_BEFORE3 flag from the parent node |
| 444 // to the new secondary common node. |
| 445 hasBefore3 = node & HAS_BEFORE3; |
| 446 node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2; |
| 447 } else { |
| 448 U_ASSERT(!nodeHasBefore3(node)); |
| 449 node |= HAS_BEFORE3; |
| 450 } |
| 451 nodes.setElementAt(node, index); |
| 452 int32_t nextIndex = nextIndexFromNode(node); |
| 453 // Insert default nodes with weights 02 and 05, reset to the 02 node
. |
| 454 node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength
); |
| 455 index = insertNodeBetween(index, nextIndex, node, errorCode); |
| 456 node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | |
| 457 nodeFromStrength(strength); |
| 458 insertNodeBetween(index, nextIndex, node, errorCode); |
| 459 } |
| 460 // Strength of the temporary CE = strength of its reset position. |
| 461 // Code above raises an error if the before-strength is stronger. |
| 462 strength = ceStrength(ces[cesLength - 1]); |
| 463 } |
| 464 if(U_FAILURE(errorCode)) { |
| 465 parserErrorReason = "inserting reset position for &[before n]"; |
| 466 return; |
| 467 } |
| 468 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); |
| 469 } |
| 470 |
| 471 int64_t |
| 472 CollationBuilder::getSpecialResetPosition(const UnicodeString &str, |
| 473 const char *&parserErrorReason, UError
Code &errorCode) { |
| 474 U_ASSERT(str.length() == 2); |
| 475 int64_t ce; |
| 476 int32_t strength = UCOL_PRIMARY; |
| 477 UBool isBoundary = FALSE; |
| 478 UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; |
| 479 U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); |
| 480 switch(pos) { |
| 481 case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: |
| 482 // Quaternary CEs are not supported. |
| 483 // Non-zero quaternary weights are possible only on tertiary or stronger
CEs. |
| 484 return 0; |
| 485 case CollationRuleParser::LAST_TERTIARY_IGNORABLE: |
| 486 return 0; |
| 487 case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { |
| 488 // Look for a tailored tertiary node after [0, 0, 0]. |
| 489 int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); |
| 490 if(U_FAILURE(errorCode)) { return 0; } |
| 491 int64_t node = nodes.elementAti(index); |
| 492 if((index = nextIndexFromNode(node)) != 0) { |
| 493 node = nodes.elementAti(index); |
| 494 U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); |
| 495 if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY)
{ |
| 496 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); |
| 497 } |
| 498 } |
| 499 return rootElements.getFirstTertiaryCE(); |
| 500 // No need to look for nodeHasAnyBefore() on a tertiary node. |
| 501 } |
| 502 case CollationRuleParser::LAST_SECONDARY_IGNORABLE: |
| 503 ce = rootElements.getLastTertiaryCE(); |
| 504 strength = UCOL_TERTIARY; |
| 505 break; |
| 506 case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { |
| 507 // Look for a tailored secondary node after [0, 0, *]. |
| 508 int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); |
| 509 if(U_FAILURE(errorCode)) { return 0; } |
| 510 int64_t node = nodes.elementAti(index); |
| 511 while((index = nextIndexFromNode(node)) != 0) { |
| 512 node = nodes.elementAti(index); |
| 513 strength = strengthFromNode(node); |
| 514 if(strength < UCOL_SECONDARY) { break; } |
| 515 if(strength == UCOL_SECONDARY) { |
| 516 if(isTailoredNode(node)) { |
| 517 if(nodeHasBefore3(node)) { |
| 518 index = nextIndexFromNode(nodes.elementAti(nextIndexFrom
Node(node))); |
| 519 U_ASSERT(isTailoredNode(nodes.elementAti(index))); |
| 520 } |
| 521 return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); |
| 522 } else { |
| 523 break; |
| 524 } |
| 525 } |
| 526 } |
| 527 ce = rootElements.getFirstSecondaryCE(); |
| 528 strength = UCOL_SECONDARY; |
| 529 break; |
| 530 } |
| 531 case CollationRuleParser::LAST_PRIMARY_IGNORABLE: |
| 532 ce = rootElements.getLastSecondaryCE(); |
| 533 strength = UCOL_SECONDARY; |
| 534 break; |
| 535 case CollationRuleParser::FIRST_VARIABLE: |
| 536 ce = rootElements.getFirstPrimaryCE(); |
| 537 isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary |
| 538 break; |
| 539 case CollationRuleParser::LAST_VARIABLE: |
| 540 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); |
| 541 break; |
| 542 case CollationRuleParser::FIRST_REGULAR: |
| 543 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); |
| 544 isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primar
y |
| 545 break; |
| 546 case CollationRuleParser::LAST_REGULAR: |
| 547 // Use the Hani-first-primary rather than the actual last "regular" CE b
efore it, |
| 548 // for backward compatibility with behavior before the introduction of |
| 549 // script-first-primary CEs in the root collator. |
| 550 ce = rootElements.firstCEWithPrimaryAtLeast( |
| 551 baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); |
| 552 break; |
| 553 case CollationRuleParser::FIRST_IMPLICIT: |
| 554 ce = baseData->getSingleCE(0x4e00, errorCode); |
| 555 break; |
| 556 case CollationRuleParser::LAST_IMPLICIT: |
| 557 // We do not support tailoring to an unassigned-implicit CE. |
| 558 errorCode = U_UNSUPPORTED_ERROR; |
| 559 parserErrorReason = "reset to [last implicit] not supported"; |
| 560 return 0; |
| 561 case CollationRuleParser::FIRST_TRAILING: |
| 562 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); |
| 563 isBoundary = TRUE; // trailing first primary (there is no mapping for i
t) |
| 564 break; |
| 565 case CollationRuleParser::LAST_TRAILING: |
| 566 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 567 parserErrorReason = "LDML forbids tailoring to U+FFFF"; |
| 568 return 0; |
| 569 default: |
| 570 U_ASSERT(FALSE); |
| 571 return 0; |
| 572 } |
| 573 |
| 574 int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); |
| 575 if(U_FAILURE(errorCode)) { return 0; } |
| 576 int64_t node = nodes.elementAti(index); |
| 577 if((pos & 1) == 0) { |
| 578 // even pos = [first xyz] |
| 579 if(!nodeHasAnyBefore(node) && isBoundary) { |
| 580 // A <group> first primary boundary is artificially added to Fractio
nalUCA.txt. |
| 581 // It is reachable via its special contraction, but is not normally
used. |
| 582 // Find the first character tailored after the boundary CE, |
| 583 // or the first real root CE after it. |
| 584 if((index = nextIndexFromNode(node)) != 0) { |
| 585 // If there is a following node, then it must be tailored |
| 586 // because there are no root CEs with a boundary primary |
| 587 // and non-common secondary/tertiary weights. |
| 588 node = nodes.elementAti(index); |
| 589 U_ASSERT(isTailoredNode(node)); |
| 590 ce = tempCEFromIndexAndStrength(index, strength); |
| 591 } else { |
| 592 U_ASSERT(strength == UCOL_PRIMARY); |
| 593 uint32_t p = (uint32_t)(ce >> 32); |
| 594 int32_t pIndex = rootElements.findPrimary(p); |
| 595 UBool isCompressible = baseData->isCompressiblePrimary(p); |
| 596 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); |
| 597 ce = Collation::makeCE(p); |
| 598 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); |
| 599 if(U_FAILURE(errorCode)) { return 0; } |
| 600 node = nodes.elementAti(index); |
| 601 } |
| 602 } |
| 603 if(nodeHasAnyBefore(node)) { |
| 604 // Get the first node that was tailored before this one at a weaker
strength. |
| 605 if(nodeHasBefore2(node)) { |
| 606 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(nod
e))); |
| 607 node = nodes.elementAti(index); |
| 608 } |
| 609 if(nodeHasBefore3(node)) { |
| 610 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(nod
e))); |
| 611 } |
| 612 U_ASSERT(isTailoredNode(nodes.elementAti(index))); |
| 613 ce = tempCEFromIndexAndStrength(index, strength); |
| 614 } |
| 615 } else { |
| 616 // odd pos = [last xyz] |
| 617 // Find the last node that was tailored after the [last xyz] |
| 618 // at a strength no greater than the position's strength. |
| 619 for(;;) { |
| 620 int32_t nextIndex = nextIndexFromNode(node); |
| 621 if(nextIndex == 0) { break; } |
| 622 int64_t nextNode = nodes.elementAti(nextIndex); |
| 623 if(strengthFromNode(nextNode) < strength) { break; } |
| 624 index = nextIndex; |
| 625 node = nextNode; |
| 626 } |
| 627 // Do not make a temporary CE for a root node. |
| 628 // This last node might be the node for the root CE itself, |
| 629 // or a node with a common secondary or tertiary weight. |
| 630 if(isTailoredNode(node)) { |
| 631 ce = tempCEFromIndexAndStrength(index, strength); |
| 632 } |
| 633 } |
| 634 return ce; |
| 635 } |
| 636 |
| 637 void |
| 638 CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, |
| 639 const UnicodeString &str, const UnicodeString &ext
ension, |
| 640 const char *&parserErrorReason, UErrorCode &errorC
ode) { |
| 641 if(U_FAILURE(errorCode)) { return; } |
| 642 UnicodeString nfdPrefix; |
| 643 if(!prefix.isEmpty()) { |
| 644 nfd.normalize(prefix, nfdPrefix, errorCode); |
| 645 if(U_FAILURE(errorCode)) { |
| 646 parserErrorReason = "normalizing the relation prefix"; |
| 647 return; |
| 648 } |
| 649 } |
| 650 UnicodeString nfdString = nfd.normalize(str, errorCode); |
| 651 if(U_FAILURE(errorCode)) { |
| 652 parserErrorReason = "normalizing the relation string"; |
| 653 return; |
| 654 } |
| 655 |
| 656 // The runtime code decomposes Hangul syllables on the fly, |
| 657 // with recursive processing but without making the Jamo pieces visible for
matching. |
| 658 // It does not work with certain types of contextual mappings. |
| 659 int32_t nfdLength = nfdString.length(); |
| 660 if(nfdLength >= 2) { |
| 661 UChar c = nfdString.charAt(0); |
| 662 if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { |
| 663 // While handling a Hangul syllable, contractions starting with Jamo
L or V |
| 664 // would not see the following Jamo of that syllable. |
| 665 errorCode = U_UNSUPPORTED_ERROR; |
| 666 parserErrorReason = "contractions starting with conjoining Jamo L or
V not supported"; |
| 667 return; |
| 668 } |
| 669 c = nfdString.charAt(nfdLength - 1); |
| 670 if(Hangul::isJamoL(c) || |
| 671 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLengt
h - 2)))) { |
| 672 // A contraction ending with Jamo L or L+V would require |
| 673 // generating Hangul syllables in addTailComposites() (588 for a Jam
o L), |
| 674 // or decomposing a following Hangul syllable on the fly, during con
traction matching. |
| 675 errorCode = U_UNSUPPORTED_ERROR; |
| 676 parserErrorReason = "contractions ending with conjoining Jamo L or L
+V not supported"; |
| 677 return; |
| 678 } |
| 679 // A Hangul syllable completely inside a contraction is ok. |
| 680 } |
| 681 // Note: If there is a prefix, then the parser checked that |
| 682 // both the prefix and the string beging with NFC boundaries (not Jamo V or
T). |
| 683 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) |
| 684 // (While handling a Hangul syllable, prefixes on Jamo V or T |
| 685 // would not see the previous Jamo of that syllable.) |
| 686 |
| 687 if(strength != UCOL_IDENTICAL) { |
| 688 // Find the node index after which we insert the new tailored node. |
| 689 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, erro
rCode); |
| 690 U_ASSERT(cesLength > 0); |
| 691 int64_t ce = ces[cesLength - 1]; |
| 692 if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) ==
0) { |
| 693 // There is no primary gap between ignorables and the space-first-pr
imary. |
| 694 errorCode = U_UNSUPPORTED_ERROR; |
| 695 parserErrorReason = "tailoring primary after ignorables not supporte
d"; |
| 696 return; |
| 697 } |
| 698 if(strength == UCOL_QUATERNARY && ce == 0) { |
| 699 // The CE data structure does not support non-zero quaternary weight
s |
| 700 // on tertiary ignorables. |
| 701 errorCode = U_UNSUPPORTED_ERROR; |
| 702 parserErrorReason = "tailoring quaternary after tertiary ignorables
not supported"; |
| 703 return; |
| 704 } |
| 705 // Insert the new tailored node. |
| 706 index = insertTailoredNodeAfter(index, strength, errorCode); |
| 707 if(U_FAILURE(errorCode)) { |
| 708 parserErrorReason = "modifying collation elements"; |
| 709 return; |
| 710 } |
| 711 // Strength of the temporary CE: |
| 712 // The new relation may yield a stronger CE but not a weaker one. |
| 713 int32_t tempStrength = ceStrength(ce); |
| 714 if(strength < tempStrength) { tempStrength = strength; } |
| 715 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); |
| 716 } |
| 717 |
| 718 setCaseBits(nfdString, parserErrorReason, errorCode); |
| 719 if(U_FAILURE(errorCode)) { return; } |
| 720 |
| 721 int32_t cesLengthBeforeExtension = cesLength; |
| 722 if(!extension.isEmpty()) { |
| 723 UnicodeString nfdExtension = nfd.normalize(extension, errorCode); |
| 724 if(U_FAILURE(errorCode)) { |
| 725 parserErrorReason = "normalizing the relation extension"; |
| 726 return; |
| 727 } |
| 728 cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); |
| 729 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { |
| 730 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 731 parserErrorReason = |
| 732 "extension string adds too many collation elements (more than 31
total)"; |
| 733 return; |
| 734 } |
| 735 } |
| 736 uint32_t ce32 = Collation::UNASSIGNED_CE32; |
| 737 if((prefix != nfdPrefix || str != nfdString) && |
| 738 !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { |
| 739 // Map from the original input to the CEs. |
| 740 // We do this in case the canonical closure is incomplete, |
| 741 // so that it is possible to explicitly provide the missing mappings. |
| 742 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); |
| 743 } |
| 744 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); |
| 745 if(U_FAILURE(errorCode)) { |
| 746 parserErrorReason = "writing collation elements"; |
| 747 return; |
| 748 } |
| 749 cesLength = cesLengthBeforeExtension; |
| 750 } |
| 751 |
| 752 int32_t |
| 753 CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserEr
rorReason, |
| 754 UErrorCode &errorCode) { |
| 755 if(U_FAILURE(errorCode)) { return 0; } |
| 756 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); |
| 757 |
| 758 // Find the last CE that is at least as "strong" as the requested difference
. |
| 759 // Note: Stronger is smaller (UCOL_PRIMARY=0). |
| 760 int64_t ce; |
| 761 for(;; --cesLength) { |
| 762 if(cesLength == 0) { |
| 763 ce = ces[0] = 0; |
| 764 cesLength = 1; |
| 765 break; |
| 766 } else { |
| 767 ce = ces[cesLength - 1]; |
| 768 } |
| 769 if(ceStrength(ce) <= strength) { break; } |
| 770 } |
| 771 |
| 772 if(isTempCE(ce)) { |
| 773 // No need to findCommonNode() here for lower levels |
| 774 // because insertTailoredNodeAfter() will do that anyway. |
| 775 return indexFromTempCE(ce); |
| 776 } |
| 777 |
| 778 // root CE |
| 779 if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { |
| 780 errorCode = U_UNSUPPORTED_ERROR; |
| 781 parserErrorReason = "tailoring relative to an unassigned code point not
supported"; |
| 782 return 0; |
| 783 } |
| 784 return findOrInsertNodeForRootCE(ce, strength, errorCode); |
| 785 } |
| 786 |
| 787 int32_t |
| 788 CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UError
Code &errorCode) { |
| 789 if(U_FAILURE(errorCode)) { return 0; } |
| 790 U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); |
| 791 |
| 792 // Find or insert the node for each of the root CE's weights, |
| 793 // down to the requested level/strength. |
| 794 // Root CEs must have common=zero quaternary weights (for which we never ins
ert any nodes). |
| 795 U_ASSERT((ce & 0xc0) == 0); |
| 796 int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode)
; |
| 797 if(strength >= UCOL_SECONDARY) { |
| 798 uint32_t lower32 = (uint32_t)ce; |
| 799 index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, error
Code); |
| 800 if(strength >= UCOL_TERTIARY) { |
| 801 index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIA
RY_MASK, |
| 802 UCOL_TERTIARY, errorCode); |
| 803 } |
| 804 } |
| 805 return index; |
| 806 } |
| 807 |
namespace {

/**
 * Binary search over the sorted list of root primary nodes.
 * Like Java Collections.binarySearch(List, key, Comparator).
 *
 * rootPrimaryIndexes[0..length) holds indexes into nodes[]; the referenced
 * nodes must be sorted ascending by the primary weight stored in the upper
 * 32 bits of each node.
 *
 * @param rootPrimaryIndexes sorted array of indexes into nodes
 * @param length             number of entries in rootPrimaryIndexes
 * @param nodes              the node array
 * @param p                  the primary weight to look for
 * @return the index>=0 (into rootPrimaryIndexes) where p was found,
 *         or the index<0 such that ~index is the position where a node
 *         for p has to be inserted to keep the list sorted
 */
int32_t
binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
                               const int64_t *nodes, uint32_t p) {
    // Empty (or nonsensical negative) length: insert at position 0.
    if(length <= 0) { return ~0; }
    int32_t start = 0;
    int32_t limit = length;
    for (;;) {
        // Overflow-safe midpoint; start < limit always holds here.
        int32_t i = start + (limit - start) / 2;
        int64_t node = nodes[rootPrimaryIndexes[i]];
        uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
        if (p == nodePrimary) {
            return i;
        } else if (p < nodePrimary) {
            if (i == start) {
                return ~start;  // insert p before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert p after i
            }
            start = i;
        }
    }
}

}  // namespace
| 844 |
| 845 int32_t |
| 846 CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode)
{ |
| 847 if(U_FAILURE(errorCode)) { return 0; } |
| 848 |
| 849 int32_t rootIndex = binarySearchForRootPrimaryNode( |
| 850 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuff
er(), p); |
| 851 if(rootIndex >= 0) { |
| 852 return rootPrimaryIndexes.elementAti(rootIndex); |
| 853 } else { |
| 854 // Start a new list of nodes with this primary. |
| 855 int32_t index = nodes.size(); |
| 856 nodes.addElement(nodeFromWeight32(p), errorCode); |
| 857 rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); |
| 858 return index; |
| 859 } |
| 860 } |
| 861 |
| 862 int32_t |
| 863 CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t
level, UErrorCode &errorCode) { |
| 864 if(U_FAILURE(errorCode)) { return 0; } |
| 865 U_ASSERT(0 <= index && index < nodes.size()); |
| 866 |
| 867 U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16); |
| 868 // Only reset-before inserts common weights. |
| 869 if(weight16 == Collation::COMMON_WEIGHT16) { |
| 870 return findCommonNode(index, level); |
| 871 } |
| 872 // Find the root CE's weight for this level. |
| 873 // Postpone insertion if not found: |
| 874 // Insert the new root node before the next stronger node, |
| 875 // or before the next root node with the same strength and a larger weight. |
| 876 int64_t node = nodes.elementAti(index); |
| 877 int32_t nextIndex; |
| 878 while((nextIndex = nextIndexFromNode(node)) != 0) { |
| 879 node = nodes.elementAti(nextIndex); |
| 880 int32_t nextStrength = strengthFromNode(node); |
| 881 if(nextStrength <= level) { |
| 882 // Insert before a stronger node. |
| 883 if(nextStrength < level) { break; } |
| 884 // nextStrength == level |
| 885 if(!isTailoredNode(node)) { |
| 886 uint32_t nextWeight16 = weight16FromNode(node); |
| 887 if(nextWeight16 == weight16) { |
| 888 // Found the node for the root CE up to this level. |
| 889 return nextIndex; |
| 890 } |
| 891 // Insert before a node with a larger same-strength weight. |
| 892 if(nextWeight16 > weight16) { break; } |
| 893 } |
| 894 } |
| 895 // Skip the next node. |
| 896 index = nextIndex; |
| 897 } |
| 898 node = nodeFromWeight16(weight16) | nodeFromStrength(level); |
| 899 return insertNodeBetween(index, nextIndex, node, errorCode); |
| 900 } |
| 901 |
| 902 int32_t |
| 903 CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErro
rCode &errorCode) { |
| 904 if(U_FAILURE(errorCode)) { return 0; } |
| 905 U_ASSERT(0 <= index && index < nodes.size()); |
| 906 if(strength >= UCOL_SECONDARY) { |
| 907 index = findCommonNode(index, UCOL_SECONDARY); |
| 908 if(strength >= UCOL_TERTIARY) { |
| 909 index = findCommonNode(index, UCOL_TERTIARY); |
| 910 } |
| 911 } |
| 912 // Postpone insertion: |
| 913 // Insert the new node before the next one with a strength at least as stron
g. |
| 914 int64_t node = nodes.elementAti(index); |
| 915 int32_t nextIndex; |
| 916 while((nextIndex = nextIndexFromNode(node)) != 0) { |
| 917 node = nodes.elementAti(nextIndex); |
| 918 if(strengthFromNode(node) <= strength) { break; } |
| 919 // Skip the next node which has a weaker (larger) strength than the new
one. |
| 920 index = nextIndex; |
| 921 } |
| 922 node = IS_TAILORED | nodeFromStrength(strength); |
| 923 return insertNodeBetween(index, nextIndex, node, errorCode); |
| 924 } |
| 925 |
| 926 int32_t |
| 927 CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t no
de, |
| 928 UErrorCode &errorCode) { |
| 929 if(U_FAILURE(errorCode)) { return 0; } |
| 930 U_ASSERT(previousIndexFromNode(node) == 0); |
| 931 U_ASSERT(nextIndexFromNode(node) == 0); |
| 932 U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); |
| 933 // Append the new node and link it to the existing nodes. |
| 934 int32_t newIndex = nodes.size(); |
| 935 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); |
| 936 nodes.addElement(node, errorCode); |
| 937 if(U_FAILURE(errorCode)) { return 0; } |
| 938 // nodes[index].nextIndex = newIndex |
| 939 node = nodes.elementAti(index); |
| 940 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); |
| 941 // nodes[nextIndex].previousIndex = newIndex |
| 942 if(nextIndex != 0) { |
| 943 node = nodes.elementAti(nextIndex); |
| 944 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); |
| 945 } |
| 946 return newIndex; |
| 947 } |
| 948 |
| 949 int32_t |
| 950 CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { |
| 951 U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); |
| 952 int64_t node = nodes.elementAti(index); |
| 953 if(strengthFromNode(node) >= strength) { |
| 954 // The current node is no stronger. |
| 955 return index; |
| 956 } |
| 957 if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node
)) { |
| 958 // The current node implies the strength-common weight. |
| 959 return index; |
| 960 } |
| 961 index = nextIndexFromNode(node); |
| 962 node = nodes.elementAti(index); |
| 963 U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && |
| 964 weight16FromNode(node) == BEFORE_WEIGHT16); |
| 965 // Skip to the explicit common node. |
| 966 do { |
| 967 index = nextIndexFromNode(node); |
| 968 node = nodes.elementAti(index); |
| 969 U_ASSERT(strengthFromNode(node) >= strength); |
| 970 } while(isTailoredNode(node) || strengthFromNode(node) > strength); |
| 971 U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); |
| 972 return index; |
| 973 } |
| 974 |
| 975 void |
| 976 CollationBuilder::setCaseBits(const UnicodeString &nfdString, |
| 977 const char *&parserErrorReason, UErrorCode &errorC
ode) { |
| 978 if(U_FAILURE(errorCode)) { return; } |
| 979 int32_t numTailoredPrimaries = 0; |
| 980 for(int32_t i = 0; i < cesLength; ++i) { |
| 981 if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } |
| 982 } |
| 983 // We should not be able to get too many case bits because |
| 984 // cesLength<=31==MAX_EXPANSION_LENGTH. |
| 985 // 31 pairs of case bits fit into an int64_t without setting its sign bit. |
| 986 U_ASSERT(numTailoredPrimaries <= 31); |
| 987 |
| 988 int64_t cases = 0; |
| 989 if(numTailoredPrimaries > 0) { |
| 990 const UChar *s = nfdString.getBuffer(); |
| 991 UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.leng
th()); |
| 992 int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; |
| 993 if(U_FAILURE(errorCode)) { |
| 994 parserErrorReason = "fetching root CEs for tailored string"; |
| 995 return; |
| 996 } |
| 997 U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation
::NO_CE); |
| 998 |
| 999 uint32_t lastCase = 0; |
| 1000 int32_t numBasePrimaries = 0; |
| 1001 for(int32_t i = 0; i < baseCEsLength; ++i) { |
| 1002 int64_t ce = baseCEs.getCE(i); |
| 1003 if((ce >> 32) != 0) { |
| 1004 ++numBasePrimaries; |
| 1005 uint32_t c = ((uint32_t)ce >> 14) & 3; |
| 1006 U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed
case in any base CE |
| 1007 if(numBasePrimaries < numTailoredPrimaries) { |
| 1008 cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); |
| 1009 } else if(numBasePrimaries == numTailoredPrimaries) { |
| 1010 lastCase = c; |
| 1011 } else if(c != lastCase) { |
| 1012 // There are more base primary CEs than tailored primaries. |
| 1013 // Set mixed case if the case bits of the remainder differ. |
| 1014 lastCase = 1; |
| 1015 // Nothing more can change. |
| 1016 break; |
| 1017 } |
| 1018 } |
| 1019 } |
| 1020 if(numBasePrimaries >= numTailoredPrimaries) { |
| 1021 cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); |
| 1022 } |
| 1023 } |
| 1024 |
| 1025 for(int32_t i = 0; i < cesLength; ++i) { |
| 1026 int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bi
ts |
| 1027 int32_t strength = ceStrength(ce); |
| 1028 if(strength == UCOL_PRIMARY) { |
| 1029 ce |= (cases & 3) << 14; |
| 1030 cases >>= 2; |
| 1031 } else if(strength == UCOL_TERTIARY) { |
| 1032 // Tertiary CEs must have uppercase bits. |
| 1033 // See the LDML spec, and comments in class CollationCompare. |
| 1034 ce |= 0x8000; |
| 1035 } |
| 1036 // Tertiary ignorable CEs must have 0 case bits. |
| 1037 // We set 0 case bits for secondary CEs too |
| 1038 // since currently only U+0345 is cased and maps to a secondary CE, |
| 1039 // and it is lowercase. Other secondaries are uncased. |
| 1040 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. |
| 1041 ces[i] = ce; |
| 1042 } |
| 1043 } |
| 1044 |
| 1045 void |
| 1046 CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parse
rErrorReason, |
| 1047 UErrorCode &errorCode) { |
| 1048 if(U_FAILURE(errorCode)) { return; } |
| 1049 dataBuilder->suppressContractions(set, errorCode); |
| 1050 if(U_FAILURE(errorCode)) { |
| 1051 parserErrorReason = "application of [suppressContractions [set]] failed"
; |
| 1052 } |
| 1053 } |
| 1054 |
| 1055 void |
| 1056 CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorRe
ason */, |
| 1057 UErrorCode &errorCode) { |
| 1058 if(U_FAILURE(errorCode)) { return; } |
| 1059 optimizeSet.addAll(set); |
| 1060 } |
| 1061 |
| 1062 uint32_t |
| 1063 CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeSt
ring &nfdString, |
| 1064 const int64_t newCEs[], int32_t newCEsLength, u
int32_t ce32, |
| 1065 UErrorCode &errorCode) { |
| 1066 // Map from the NFD input to the CEs. |
| 1067 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, erro
rCode); |
| 1068 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, erro
rCode); |
| 1069 addTailComposites(nfdPrefix, nfdString, errorCode); |
| 1070 return ce32; |
| 1071 } |
| 1072 |
| 1073 uint32_t |
| 1074 CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeSt
ring &nfdString, |
| 1075 const int64_t newCEs[], int32_t newCEsLength, u
int32_t ce32, |
| 1076 UErrorCode &errorCode) { |
| 1077 if(U_FAILURE(errorCode)) { return ce32; } |
| 1078 |
| 1079 // Map from canonically equivalent input to the CEs. (But not from the all-N
FD input.) |
| 1080 if(nfdPrefix.isEmpty()) { |
| 1081 CanonicalIterator stringIter(nfdString, errorCode); |
| 1082 if(U_FAILURE(errorCode)) { return ce32; } |
| 1083 UnicodeString prefix; |
| 1084 for(;;) { |
| 1085 UnicodeString str = stringIter.next(); |
| 1086 if(str.isBogus()) { break; } |
| 1087 if(ignoreString(str, errorCode) || str == nfdString) { continue; } |
| 1088 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, error
Code); |
| 1089 if(U_FAILURE(errorCode)) { return ce32; } |
| 1090 } |
| 1091 } else { |
| 1092 CanonicalIterator prefixIter(nfdPrefix, errorCode); |
| 1093 CanonicalIterator stringIter(nfdString, errorCode); |
| 1094 if(U_FAILURE(errorCode)) { return ce32; } |
| 1095 for(;;) { |
| 1096 UnicodeString prefix = prefixIter.next(); |
| 1097 if(prefix.isBogus()) { break; } |
| 1098 if(ignorePrefix(prefix, errorCode)) { continue; } |
| 1099 UBool samePrefix = prefix == nfdPrefix; |
| 1100 for(;;) { |
| 1101 UnicodeString str = stringIter.next(); |
| 1102 if(str.isBogus()) { break; } |
| 1103 if(ignoreString(str, errorCode) || (samePrefix && str == nfdStri
ng)) { continue; } |
| 1104 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, e
rrorCode); |
| 1105 if(U_FAILURE(errorCode)) { return ce32; } |
| 1106 } |
| 1107 stringIter.reset(); |
| 1108 } |
| 1109 } |
| 1110 return ce32; |
| 1111 } |
| 1112 |
| 1113 void |
| 1114 CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const Unicod
eString &nfdString, |
| 1115 UErrorCode &errorCode) { |
| 1116 if(U_FAILURE(errorCode)) { return; } |
| 1117 |
| 1118 // Look for the last starter in the NFD string. |
| 1119 UChar32 lastStarter; |
| 1120 int32_t indexAfterLastStarter = nfdString.length(); |
| 1121 for(;;) { |
| 1122 if(indexAfterLastStarter == 0) { return; } // no starter at all |
| 1123 lastStarter = nfdString.char32At(indexAfterLastStarter - 1); |
| 1124 if(nfd.getCombiningClass(lastStarter) == 0) { break; } |
| 1125 indexAfterLastStarter -= U16_LENGTH(lastStarter); |
| 1126 } |
| 1127 // No closure to Hangul syllables since we decompose them on the fly. |
| 1128 if(Hangul::isJamoL(lastStarter)) { return; } |
| 1129 |
| 1130 // Are there any composites whose decomposition starts with the lastStarter? |
| 1131 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=May
be characters. |
| 1132 // We might find some more equivalent mappings here if it did. |
| 1133 UnicodeSet composites; |
| 1134 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } |
| 1135 |
| 1136 UnicodeString decomp; |
| 1137 UnicodeString newNFDString, newString; |
| 1138 int64_t newCEs[Collation::MAX_EXPANSION_LENGTH]; |
| 1139 UnicodeSetIterator iter(composites); |
| 1140 while(iter.next()) { |
| 1141 U_ASSERT(!iter.isString()); |
| 1142 UChar32 composite = iter.getCodepoint(); |
| 1143 nfd.getDecomposition(composite, decomp); |
| 1144 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite
, decomp, |
| 1145 newNFDString, newString, errorCode)) { |
| 1146 continue; |
| 1147 } |
| 1148 int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newC
Es, 0); |
| 1149 if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { |
| 1150 // Ignore mappings that we cannot store. |
| 1151 continue; |
| 1152 } |
| 1153 // Note: It is possible that the newCEs do not make use of the mapping |
| 1154 // for which we are adding the tail composites, in which case we might b
e adding |
| 1155 // unnecessary mappings. |
| 1156 // For example, when we add tail composites for ae^ (^=combining circumf
lex), |
| 1157 // UCA discontiguous-contraction matching does not find any matches |
| 1158 // for ae_^ (_=any combining diacritic below) *unless* there is also |
| 1159 // a contraction mapping for ae. |
| 1160 // Thus, if there is no ae contraction, then the ae^ mapping is ignored |
| 1161 // while fetching the newCEs for ae_^. |
| 1162 // TODO: Try to detect this effectively. |
| 1163 // (Alternatively, print a warning when prefix contractions are missing.
) |
| 1164 |
| 1165 // We do not need an explicit mapping for the NFD strings. |
| 1166 // It is fine if the NFD input collates like this via a sequence of mapp
ings. |
| 1167 // It also saves a little bit of space, and may reduce the set of charac
ters with contractions. |
| 1168 uint32_t ce32 = addIfDifferent(nfdPrefix, newString, |
| 1169 newCEs, newCEsLength, Collation::UNASSIGN
ED_CE32, errorCode); |
| 1170 if(ce32 != Collation::UNASSIGNED_CE32) { |
| 1171 // was different, was added |
| 1172 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32,
errorCode); |
| 1173 } |
| 1174 } |
| 1175 } |
| 1176 |
| 1177 UBool |
| 1178 CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, |
| 1179 int32_t indexAfterLastStarter, |
| 1180 UChar32 composite, const UnicodeStrin
g &decomp, |
| 1181 UnicodeString &newNFDString, UnicodeS
tring &newString, |
| 1182 UErrorCode &errorCode) const { |
| 1183 if(U_FAILURE(errorCode)) { return FALSE; } |
| 1184 U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)
); |
| 1185 int32_t lastStarterLength = decomp.moveIndex32(0, 1); |
| 1186 if(lastStarterLength == decomp.length()) { |
| 1187 // Singleton decompositions should be found by addWithClosure() |
| 1188 // and the CanonicalIterator, so we can ignore them here. |
| 1189 return FALSE; |
| 1190 } |
| 1191 if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, |
| 1192 decomp, lastStarterLength, 0x7fffffff) == 0) { |
| 1193 // same strings, nothing new to be found here |
| 1194 return FALSE; |
| 1195 } |
| 1196 |
| 1197 // Make new FCD strings that combine a composite, or its decomposition, |
| 1198 // into the nfdString's last starter and the combining marks following it. |
| 1199 // Make an NFD version, and a version with the composite. |
| 1200 newNFDString.setTo(nfdString, 0, indexAfterLastStarter); |
| 1201 newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).app
end(composite); |
| 1202 |
| 1203 // The following is related to discontiguous contraction matching, |
| 1204 // but builds only FCD strings (or else returns FALSE). |
| 1205 int32_t sourceIndex = indexAfterLastStarter; |
| 1206 int32_t decompIndex = lastStarterLength; |
| 1207 // Small optimization: We keep the source character across loop iterations |
| 1208 // because we do not always consume it, |
| 1209 // and then need not fetch it again nor look up its combining class again. |
| 1210 UChar32 sourceChar = U_SENTINEL; |
| 1211 // The cc variables need to be declared before the loop so that at the end |
| 1212 // they are set to the last combining classes seen. |
| 1213 uint8_t sourceCC = 0; |
| 1214 uint8_t decompCC = 0; |
| 1215 for(;;) { |
| 1216 if(sourceChar < 0) { |
| 1217 if(sourceIndex >= nfdString.length()) { break; } |
| 1218 sourceChar = nfdString.char32At(sourceIndex); |
| 1219 sourceCC = nfd.getCombiningClass(sourceChar); |
| 1220 U_ASSERT(sourceCC != 0); |
| 1221 } |
| 1222 // We consume a decomposition character in each iteration. |
| 1223 if(decompIndex >= decomp.length()) { break; } |
| 1224 UChar32 decompChar = decomp.char32At(decompIndex); |
| 1225 decompCC = nfd.getCombiningClass(decompChar); |
| 1226 // Compare the two characters and their combining classes. |
| 1227 if(decompCC == 0) { |
| 1228 // Unable to merge because the source contains a non-zero combining
mark |
| 1229 // but the composite's decomposition contains another starter. |
| 1230 // The strings would not be equivalent. |
| 1231 return FALSE; |
| 1232 } else if(sourceCC < decompCC) { |
| 1233 // Composite + sourceChar would not be FCD. |
| 1234 return FALSE; |
| 1235 } else if(decompCC < sourceCC) { |
| 1236 newNFDString.append(decompChar); |
| 1237 decompIndex += U16_LENGTH(decompChar); |
| 1238 } else if(decompChar != sourceChar) { |
| 1239 // Blocked because same combining class. |
| 1240 return FALSE; |
| 1241 } else { // match: decompChar == sourceChar |
| 1242 newNFDString.append(decompChar); |
| 1243 decompIndex += U16_LENGTH(decompChar); |
| 1244 sourceIndex += U16_LENGTH(decompChar); |
| 1245 sourceChar = U_SENTINEL; |
| 1246 } |
| 1247 } |
| 1248 // We are at the end of at least one of the two inputs. |
| 1249 if(sourceChar >= 0) { // more characters from nfdString but not from decomp |
| 1250 if(sourceCC < decompCC) { |
| 1251 // Appending the next source character to the composite would not be
FCD. |
| 1252 return FALSE; |
| 1253 } |
| 1254 newNFDString.append(nfdString, sourceIndex, 0x7fffffff); |
| 1255 newString.append(nfdString, sourceIndex, 0x7fffffff); |
| 1256 } else if(decompIndex < decomp.length()) { // more characters from decomp,
not from nfdString |
| 1257 newNFDString.append(decomp, decompIndex, 0x7fffffff); |
| 1258 } |
| 1259 U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); |
| 1260 U_ASSERT(fcd.isNormalized(newString, errorCode)); |
| 1261 U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonical
ly equivalent |
| 1262 return TRUE; |
| 1263 } |
| 1264 |
| 1265 UBool |
| 1266 CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) co
nst { |
| 1267 // Do not map non-FCD prefixes. |
| 1268 return !isFCD(s, errorCode); |
| 1269 } |
| 1270 |
| 1271 UBool |
| 1272 CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) co
nst { |
| 1273 // Do not map non-FCD strings. |
| 1274 // Do not map strings that start with Hangul syllables: We decompose those o
n the fly. |
| 1275 return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0)); |
| 1276 } |
| 1277 |
| 1278 UBool |
| 1279 CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { |
| 1280 return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode); |
| 1281 } |
| 1282 |
| 1283 void |
| 1284 CollationBuilder::closeOverComposites(UErrorCode &errorCode) { |
| 1285 UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); //
Java: static final |
| 1286 if(U_FAILURE(errorCode)) { return; } |
| 1287 // Hangul is decomposed on the fly during collation. |
| 1288 composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); |
| 1289 UnicodeString prefix; // empty |
| 1290 UnicodeString nfdString; |
| 1291 UnicodeSetIterator iter(composites); |
| 1292 while(iter.next()) { |
| 1293 U_ASSERT(!iter.isString()); |
| 1294 nfd.getDecomposition(iter.getCodepoint(), nfdString); |
| 1295 cesLength = dataBuilder->getCEs(nfdString, ces, 0); |
| 1296 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { |
| 1297 // Too many CEs from the decomposition (unusual), ignore this compos
ite. |
| 1298 // We could add a capacity parameter to getCEs() and reallocate if n
ecessary. |
| 1299 // However, this can only really happen in contrived cases. |
| 1300 continue; |
| 1301 } |
| 1302 const UnicodeString &composite(iter.getString()); |
| 1303 addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_
CE32, errorCode); |
| 1304 } |
| 1305 } |
| 1306 |
| 1307 uint32_t |
| 1308 CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeStrin
g &str, |
| 1309 const int64_t newCEs[], int32_t newCEsLength, u
int32_t ce32, |
| 1310 UErrorCode &errorCode) { |
| 1311 if(U_FAILURE(errorCode)) { return ce32; } |
| 1312 int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH]; |
| 1313 int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0); |
| 1314 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { |
| 1315 if(ce32 == Collation::UNASSIGNED_CE32) { |
| 1316 ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); |
| 1317 } |
| 1318 dataBuilder->addCE32(prefix, str, ce32, errorCode); |
| 1319 } |
| 1320 return ce32; |
| 1321 } |
| 1322 |
| 1323 UBool |
| 1324 CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, |
| 1325 const int64_t ces2[], int32_t ces2Length) { |
| 1326 if(ces1Length != ces2Length) { |
| 1327 return FALSE; |
| 1328 } |
| 1329 U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); |
| 1330 for(int32_t i = 0; i < ces1Length; ++i) { |
| 1331 if(ces1[i] != ces2[i]) { return FALSE; } |
| 1332 } |
| 1333 return TRUE; |
| 1334 } |
| 1335 |
| 1336 #ifdef DEBUG_COLLATION_BUILDER |
| 1337 |
/**
 * Debug helper: shifts a weight right by whole bytes until its lowest byte
 * is non-zero. Zero is returned unchanged.
 */
uint32_t
alignWeightRight(uint32_t w) {
    if(w == 0) { return 0; }
    uint32_t aligned = w;
    while((aligned & 0xff) == 0) {
        aligned >>= 8;
    }
    return aligned;
}
| 1345 |
| 1346 #endif |
| 1347 |
| 1348 void |
| 1349 CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { |
| 1350 if(U_FAILURE(errorCode)) { return; } |
| 1351 |
| 1352 CollationWeights primaries, secondaries, tertiaries; |
| 1353 int64_t *nodesArray = nodes.getBuffer(); |
| 1354 |
| 1355 for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { |
| 1356 int32_t i = rootPrimaryIndexes.elementAti(rpi); |
| 1357 int64_t node = nodesArray[i]; |
| 1358 uint32_t p = weight32FromNode(node); |
| 1359 uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; |
| 1360 uint32_t t = s; |
| 1361 uint32_t q = 0; |
| 1362 UBool pIsTailored = FALSE; |
| 1363 UBool sIsTailored = FALSE; |
| 1364 UBool tIsTailored = FALSE; |
| 1365 #ifdef DEBUG_COLLATION_BUILDER |
| 1366 printf("\nprimary %lx\n", (long)alignWeightRight(p)); |
| 1367 #endif |
| 1368 int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); |
| 1369 int32_t nextIndex = nextIndexFromNode(node); |
| 1370 while(nextIndex != 0) { |
| 1371 i = nextIndex; |
| 1372 node = nodesArray[i]; |
| 1373 nextIndex = nextIndexFromNode(node); |
| 1374 int32_t strength = strengthFromNode(node); |
| 1375 if(strength == UCOL_QUATERNARY) { |
| 1376 U_ASSERT(isTailoredNode(node)); |
| 1377 #ifdef DEBUG_COLLATION_BUILDER |
| 1378 printf(" quat+ "); |
| 1379 #endif |
| 1380 if(q == 3) { |
| 1381 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 1382 errorReason = "quaternary tailoring gap too small"; |
| 1383 return; |
| 1384 } |
| 1385 ++q; |
| 1386 } else { |
| 1387 if(strength == UCOL_TERTIARY) { |
| 1388 if(isTailoredNode(node)) { |
| 1389 #ifdef DEBUG_COLLATION_BUILDER |
| 1390 printf(" ter+ "); |
| 1391 #endif |
| 1392 if(!tIsTailored) { |
| 1393 // First tailored tertiary node for [p, s]. |
| 1394 int32_t tCount = countTailoredNodes(nodesArray, next
Index, |
| 1395 UCOL_TERTIARY) +
1; |
| 1396 uint32_t tLimit; |
| 1397 if(t == 0) { |
| 1398 // Gap at the beginning of the tertiary CE range
. |
| 1399 t = rootElements.getTertiaryBoundary() - 0x100; |
| 1400 tLimit = rootElements.getFirstTertiaryCE() & Col
lation::ONLY_TERTIARY_MASK; |
| 1401 } else if(t == BEFORE_WEIGHT16) { |
| 1402 tLimit = Collation::COMMON_WEIGHT16; |
| 1403 } else if(!pIsTailored && !sIsTailored) { |
| 1404 // p and s are root weights. |
| 1405 tLimit = rootElements.getTertiaryAfter(pIndex, s
, t); |
| 1406 } else { |
| 1407 // [p, s] is tailored. |
| 1408 U_ASSERT(t == Collation::COMMON_WEIGHT16); |
| 1409 tLimit = rootElements.getTertiaryBoundary(); |
| 1410 } |
| 1411 U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::O
NLY_TERTIARY_MASK) == 0); |
| 1412 tertiaries.initForTertiary(); |
| 1413 if(!tertiaries.allocWeights(t, tLimit, tCount)) { |
| 1414 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 1415 errorReason = "tertiary tailoring gap too small"
; |
| 1416 return; |
| 1417 } |
| 1418 tIsTailored = TRUE; |
| 1419 } |
| 1420 t = tertiaries.nextWeight(); |
| 1421 U_ASSERT(t != 0xffffffff); |
| 1422 } else { |
| 1423 t = weight16FromNode(node); |
| 1424 tIsTailored = FALSE; |
| 1425 #ifdef DEBUG_COLLATION_BUILDER |
| 1426 printf(" ter %lx\n", (long)alignWeightRight(t)); |
| 1427 #endif |
| 1428 } |
| 1429 } else { |
| 1430 if(strength == UCOL_SECONDARY) { |
| 1431 if(isTailoredNode(node)) { |
| 1432 #ifdef DEBUG_COLLATION_BUILDER |
| 1433 printf(" sec+ "); |
| 1434 #endif |
| 1435 if(!sIsTailored) { |
| 1436 // First tailored secondary node for p. |
| 1437 int32_t sCount = countTailoredNodes(nodesArray,
nextIndex, |
| 1438 UCOL_SECONDA
RY) + 1; |
| 1439 uint32_t sLimit; |
| 1440 if(s == 0) { |
| 1441 // Gap at the beginning of the secondary CE
range. |
| 1442 s = rootElements.getSecondaryBoundary() - 0x
100; |
| 1443 sLimit = rootElements.getFirstSecondaryCE()
>> 16; |
| 1444 } else if(s == BEFORE_WEIGHT16) { |
| 1445 sLimit = Collation::COMMON_WEIGHT16; |
| 1446 } else if(!pIsTailored) { |
| 1447 // p is a root primary. |
| 1448 sLimit = rootElements.getSecondaryAfter(pInd
ex, s); |
| 1449 } else { |
| 1450 // p is a tailored primary. |
| 1451 U_ASSERT(s == Collation::COMMON_WEIGHT16); |
| 1452 sLimit = rootElements.getSecondaryBoundary()
; |
| 1453 } |
| 1454 if(s == Collation::COMMON_WEIGHT16) { |
| 1455 // Do not tailor into the getSortKey() range
of |
| 1456 // compressed common secondaries. |
| 1457 s = rootElements.getLastCommonSecondary(); |
| 1458 } |
| 1459 secondaries.initForSecondary(); |
| 1460 if(!secondaries.allocWeights(s, sLimit, sCount))
{ |
| 1461 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 1462 errorReason = "secondary tailoring gap too s
mall"; |
| 1463 return; |
| 1464 } |
| 1465 sIsTailored = TRUE; |
| 1466 } |
| 1467 s = secondaries.nextWeight(); |
| 1468 U_ASSERT(s != 0xffffffff); |
| 1469 } else { |
| 1470 s = weight16FromNode(node); |
| 1471 sIsTailored = FALSE; |
| 1472 #ifdef DEBUG_COLLATION_BUILDER |
| 1473 printf(" sec %lx\n", (long)alignWeightRight(s
)); |
| 1474 #endif |
| 1475 } |
| 1476 } else /* UCOL_PRIMARY */ { |
| 1477 U_ASSERT(isTailoredNode(node)); |
| 1478 #ifdef DEBUG_COLLATION_BUILDER |
| 1479 printf("pri+ "); |
| 1480 #endif |
| 1481 if(!pIsTailored) { |
| 1482 // First tailored primary node in this list. |
| 1483 int32_t pCount = countTailoredNodes(nodesArray, next
Index, |
| 1484 UCOL_PRIMARY) +
1; |
| 1485 UBool isCompressible = baseData->isCompressiblePrima
ry(p); |
| 1486 uint32_t pLimit = |
| 1487 rootElements.getPrimaryAfter(p, pIndex, isCompre
ssible); |
| 1488 primaries.initForPrimary(isCompressible); |
| 1489 if(!primaries.allocWeights(p, pLimit, pCount)) { |
| 1490 errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: i
ntroduce a more specific UErrorCode? |
| 1491 errorReason = "primary tailoring gap too small"; |
| 1492 return; |
| 1493 } |
| 1494 pIsTailored = TRUE; |
| 1495 } |
| 1496 p = primaries.nextWeight(); |
| 1497 U_ASSERT(p != 0xffffffff); |
| 1498 s = Collation::COMMON_WEIGHT16; |
| 1499 sIsTailored = FALSE; |
| 1500 } |
| 1501 t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; |
| 1502 tIsTailored = FALSE; |
| 1503 } |
| 1504 q = 0; |
| 1505 } |
| 1506 if(isTailoredNode(node)) { |
| 1507 nodesArray[i] = Collation::makeCE(p, s, t, q); |
| 1508 #ifdef DEBUG_COLLATION_BUILDER |
| 1509 printf("%016llx\n", (long long)nodesArray[i]); |
| 1510 #endif |
| 1511 } |
| 1512 } |
| 1513 } |
| 1514 } |
| 1515 |
| 1516 int32_t |
| 1517 CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32
_t strength) { |
| 1518 int32_t count = 0; |
| 1519 for(;;) { |
| 1520 if(i == 0) { break; } |
| 1521 int64_t node = nodesArray[i]; |
| 1522 if(strengthFromNode(node) < strength) { break; } |
| 1523 if(strengthFromNode(node) == strength) { |
| 1524 if(isTailoredNode(node)) { |
| 1525 ++count; |
| 1526 } else { |
| 1527 break; |
| 1528 } |
| 1529 } |
| 1530 i = nextIndexFromNode(node); |
| 1531 } |
| 1532 return count; |
| 1533 } |
| 1534 |
| 1535 class CEFinalizer : public CollationDataBuilder::CEModifier { |
| 1536 public: |
| 1537 CEFinalizer(const int64_t *ces) : finalCEs(ces) {} |
| 1538 virtual ~CEFinalizer(); |
| 1539 virtual int64_t modifyCE32(uint32_t ce32) const { |
| 1540 U_ASSERT(!Collation::isSpecialCE32(ce32)); |
| 1541 if(CollationBuilder::isTempCE32(ce32)) { |
| 1542 // retain case bits |
| 1543 return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32
& 0xc0) << 8); |
| 1544 } else { |
| 1545 return Collation::NO_CE; |
| 1546 } |
| 1547 } |
| 1548 virtual int64_t modifyCE(int64_t ce) const { |
| 1549 if(CollationBuilder::isTempCE(ce)) { |
| 1550 // retain case bits |
| 1551 return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc00
0); |
| 1552 } else { |
| 1553 return Collation::NO_CE; |
| 1554 } |
| 1555 } |
| 1556 |
| 1557 private: |
| 1558 const int64_t *finalCEs; |
| 1559 }; |
| 1560 |
| 1561 CEFinalizer::~CEFinalizer() {} |
| 1562 |
| 1563 void |
| 1564 CollationBuilder::finalizeCEs(UErrorCode &errorCode) { |
| 1565 if(U_FAILURE(errorCode)) { return; } |
| 1566 LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(error
Code)); |
| 1567 if(newBuilder.isNull()) { |
| 1568 errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 1569 return; |
| 1570 } |
| 1571 newBuilder->initForTailoring(baseData, errorCode); |
| 1572 CEFinalizer finalizer(nodes.getBuffer()); |
| 1573 newBuilder->copyFrom(*dataBuilder, finalizer, errorCode); |
| 1574 if(U_FAILURE(errorCode)) { return; } |
| 1575 delete dataBuilder; |
| 1576 dataBuilder = newBuilder.orphan(); |
| 1577 } |
| 1578 |
| 1579 int32_t |
| 1580 CollationBuilder::ceStrength(int64_t ce) { |
| 1581 return |
| 1582 isTempCE(ce) ? strengthFromTempCE(ce) : |
| 1583 (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY : |
| 1584 ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY : |
| 1585 ce != 0 ? UCOL_TERTIARY : |
| 1586 UCOL_IDENTICAL; |
| 1587 } |
| 1588 |
| 1589 U_NAMESPACE_END |
| 1590 |
| 1591 U_NAMESPACE_USE |
| 1592 |
| 1593 U_CAPI UCollator * U_EXPORT2 |
| 1594 ucol_openRules(const UChar *rules, int32_t rulesLength, |
| 1595 UColAttributeValue normalizationMode, UCollationStrength strength
, |
| 1596 UParseError *parseError, UErrorCode *pErrorCode) { |
| 1597 if(U_FAILURE(*pErrorCode)) { return NULL; } |
| 1598 if(rules == NULL && rulesLength != 0) { |
| 1599 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 1600 return NULL; |
| 1601 } |
| 1602 RuleBasedCollator *coll = new RuleBasedCollator(); |
| 1603 if(coll == NULL) { |
| 1604 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 1605 return NULL; |
| 1606 } |
| 1607 UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength); |
| 1608 coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NUL
L, *pErrorCode); |
| 1609 if(U_FAILURE(*pErrorCode)) { |
| 1610 delete coll; |
| 1611 return NULL; |
| 1612 } |
| 1613 return coll->toUCollator(); |
| 1614 } |
| 1615 |
| 1616 static const int32_t internalBufferSize = 512; |
| 1617 |
| 1618 // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp |
| 1619 // because it calls UnicodeSet "builder" code that depends on all Unicode proper
ties, |
| 1620 // and the rest of the collation "runtime" code only depends on normalization. |
| 1621 // This function is not related to the collation builder, |
| 1622 // but it did not seem worth moving it into its own .cpp file, |
| 1623 // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. |
| 1624 U_CAPI int32_t U_EXPORT2 |
| 1625 ucol_getUnsafeSet( const UCollator *coll, |
| 1626 USet *unsafe, |
| 1627 UErrorCode *status) |
| 1628 { |
| 1629 UChar buffer[internalBufferSize]; |
| 1630 int32_t len = 0; |
| 1631 |
| 1632 uset_clear(unsafe); |
| 1633 |
| 1634 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant |
| 1635 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x
63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, |
| 1636 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x
3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; |
| 1637 |
| 1638 // add chars that fail the fcd check |
| 1639 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); |
| 1640 |
| 1641 // add lead/trail surrogates |
| 1642 // (trail surrogates should need to be unsafe only if the caller tests for U
TF-16 code *units*, |
| 1643 // not when testing code *points*) |
| 1644 uset_addRange(unsafe, 0xd800, 0xdfff); |
| 1645 |
| 1646 USet *contractions = uset_open(0,0); |
| 1647 |
| 1648 int32_t i = 0, j = 0; |
| 1649 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); |
| 1650 int32_t contsSize = uset_size(contractions); |
| 1651 UChar32 c = 0; |
| 1652 // Contraction set consists only of strings |
| 1653 // to get unsafe code points, we need to |
| 1654 // break the strings apart and add them to the unsafe set |
| 1655 for(i = 0; i < contsSize; i++) { |
| 1656 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSi
ze, status); |
| 1657 if(len > 0) { |
| 1658 j = 0; |
| 1659 while(j < len) { |
| 1660 U16_NEXT(buffer, j, len, c); |
| 1661 if(j < len) { |
| 1662 uset_add(unsafe, c); |
| 1663 } |
| 1664 } |
| 1665 } |
| 1666 } |
| 1667 |
| 1668 uset_close(contractions); |
| 1669 |
| 1670 return uset_size(unsafe); |
| 1671 } |
| 1672 |
| 1673 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |