Index: source/i18n/collationbuilder.cpp |
diff --git a/source/i18n/collationbuilder.cpp b/source/i18n/collationbuilder.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..37f701ce775cc9a6cf6a8a842d35ed3381700ab6 |
--- /dev/null |
+++ b/source/i18n/collationbuilder.cpp |
@@ -0,0 +1,1673 @@ |
+/* |
+******************************************************************************* |
+* Copyright (C) 2013-2014, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+******************************************************************************* |
+* collationbuilder.cpp |
+* |
+* (replaced the former ucol_bld.cpp) |
+* |
+* created on: 2013may06 |
+* created by: Markus W. Scherer |
+*/ |
+ |
+#ifdef DEBUG_COLLATION_BUILDER |
+#include <stdio.h> |
+#endif |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_COLLATION |
+ |
+#include "unicode/caniter.h" |
+#include "unicode/normalizer2.h" |
+#include "unicode/tblcoll.h" |
+#include "unicode/parseerr.h" |
+#include "unicode/uchar.h" |
+#include "unicode/ucol.h" |
+#include "unicode/unistr.h" |
+#include "unicode/usetiter.h" |
+#include "unicode/utf16.h" |
+#include "unicode/uversion.h" |
+#include "cmemory.h" |
+#include "collation.h" |
+#include "collationbuilder.h" |
+#include "collationdata.h" |
+#include "collationdatabuilder.h" |
+#include "collationfastlatin.h" |
+#include "collationroot.h" |
+#include "collationrootelements.h" |
+#include "collationruleparser.h" |
+#include "collationsettings.h" |
+#include "collationtailoring.h" |
+#include "collationweights.h" |
+#include "normalizer2impl.h" |
+#include "uassert.h" |
+#include "ucol_imp.h" |
+#include "utf16collationiterator.h" |
+ |
+U_NAMESPACE_BEGIN |
+ |
+namespace { |
+ |
+class BundleImporter : public CollationRuleParser::Importer { /* File-local Importer whose getRules() forwards to CollationLoader::loadRules(). */ |
+public: |
+ BundleImporter() {} |
+ virtual ~BundleImporter(); /* defined out of line, after the class */ |
+ virtual void getRules( /* fills `rules` for the given locale ID and collation type; status is reported via errorCode */ |
+ const char *localeID, const char *collationType, |
+ UnicodeString &rules, |
+ const char *&errorReason, UErrorCode &errorCode); |
+}; |
+ |
+BundleImporter::~BundleImporter() {} /* Empty out-of-line definition of the virtual destructor. */ |
+ |
+void /* Loads the rule string for (localeID, collationType) into `rules` by forwarding to CollationLoader::loadRules(); failures are reported through errorCode only. */ |
+BundleImporter::getRules( |
+ const char *localeID, const char *collationType, |
+ UnicodeString &rules, |
+ const char *& /*errorReason*/, UErrorCode &errorCode) { |
+ CollationLoader::loadRules(localeID, collationType, rules, errorCode); /* the errorReason parameter is unnamed and therefore never set here */ |
+} |
+ |
+} // namespace |
+ |
+// RuleBasedCollator implementation ---------------------------------------- *** |
+ |
+// These methods are here, rather than in rulebasedcollator.cpp, |
+// for modularization: |
+// Most code using Collator does not need to build a Collator from rules. |
+// By moving these constructors and helper methods to a separate file, |
+// most code will not have a static dependency on the builder code. |
+ |
+RuleBasedCollator::RuleBasedCollator() /* Default constructor: all pointers NULL, empty valid locale, no explicitly set attributes; nothing is built. */ |
+ : data(NULL), |
+ settings(NULL), |
+ tailoring(NULL), |
+ cacheEntry(NULL), |
+ validLocale(""), |
+ explicitlySetAttributes(0), |
+ actualLocaleIsSameAsValid(FALSE) { |
+} |
+ |
+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode) /* Builds a collator from a rule string; strength and decomposition mode are passed as UCOL_DEFAULT, so no attribute is overridden after building. */ |
+ : data(NULL), |
+ settings(NULL), |
+ tailoring(NULL), |
+ cacheEntry(NULL), |
+ validLocale(""), |
+ explicitlySetAttributes(0), |
+ actualLocaleIsSameAsValid(FALSE) { |
+ internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode); /* no parse-error or reason output requested */ |
+} |
+ |
+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength, /* Builds a collator from a rule string, then applies `strength` unless it equals UCOL_DEFAULT. */ |
+ UErrorCode &errorCode) |
+ : data(NULL), |
+ settings(NULL), |
+ tailoring(NULL), |
+ cacheEntry(NULL), |
+ validLocale(""), |
+ explicitlySetAttributes(0), |
+ actualLocaleIsSameAsValid(FALSE) { |
+ internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode); /* decomposition mode is not overridden */ |
+} |
+ |
+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, /* Builds a collator from a rule string, then applies `decompositionMode` unless it equals UCOL_DEFAULT. */ |
+ UColAttributeValue decompositionMode, |
+ UErrorCode &errorCode) |
+ : data(NULL), |
+ settings(NULL), |
+ tailoring(NULL), |
+ cacheEntry(NULL), |
+ validLocale(""), |
+ explicitlySetAttributes(0), |
+ actualLocaleIsSameAsValid(FALSE) { |
+ internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode); /* strength is not overridden */ |
+} |
+ |
+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, /* Builds a collator from a rule string, then applies `strength` and `decompositionMode`, each unless it equals UCOL_DEFAULT. */ |
+ ECollationStrength strength, |
+ UColAttributeValue decompositionMode, |
+ UErrorCode &errorCode) |
+ : data(NULL), |
+ settings(NULL), |
+ tailoring(NULL), |
+ cacheEntry(NULL), |
+ validLocale(""), |
+ explicitlySetAttributes(0), |
+ actualLocaleIsSameAsValid(FALSE) { |
+ internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode); |
+} |
+ |
+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, /* Builds a collator from a rule string with default attributes; parseError and reason are passed on as outputs for rule-building failures. */ |
+ UParseError &parseError, UnicodeString &reason, |
+ UErrorCode &errorCode) |
+ : data(NULL), |
+ settings(NULL), |
+ tailoring(NULL), |
+ cacheEntry(NULL), |
+ validLocale(""), |
+ explicitlySetAttributes(0), |
+ actualLocaleIsSameAsValid(FALSE) { |
+ internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode); /* reason is cleared first, then filled only if building fails with a non-NULL reason */ |
+} |
+ |
+void /* Common implementation of the rule-string constructors: builds a tailoring from `rules` on top of the root collator, adopts it, then applies explicit strength/decomposition overrides. outParseError and outReason may be NULL. */ |
+RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, |
+ int32_t strength, |
+ UColAttributeValue decompositionMode, |
+ UParseError *outParseError, UnicodeString *outReason, |
+ UErrorCode &errorCode) { |
+ const CollationTailoring *base = CollationRoot::getRoot(errorCode); |
+ if(U_FAILURE(errorCode)) { return; } |
+ if(outReason != NULL) { outReason->remove(); } /* start with an empty reason string */ |
+ CollationBuilder builder(base, errorCode); |
+ UVersionInfo noVersion = { 0, 0, 0, 0 }; /* all-zero rules version handed to parseAndBuild() */ |
+ BundleImporter importer; |
+ LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, |
+ &importer, |
+ outParseError, errorCode)); |
+ if(U_FAILURE(errorCode)) { |
+ const char *reason = builder.getErrorReason(); |
+ if(reason != NULL && outReason != NULL) { |
+ *outReason = UnicodeString(reason, -1, US_INV); /* copy the builder's reason text for the caller */ |
+ } |
+ return; |
+ } |
+ t->actualLocale.setToBogus(); |
+ adoptTailoring(t.orphan(), errorCode); /* the tailoring is released from the LocalPointer and handed to adoptTailoring() */ |
+ // Set attributes after building the collator, |
+ // to keep the default settings consistent with the rule string. |
+ if(strength != UCOL_DEFAULT) { |
+ setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode); |
+ } |
+ if(decompositionMode != UCOL_DEFAULT) { |
+ setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode); |
+ } |
+} |
+ |
+// CollationBuilder implementation ----------------------------------------- *** |
+ |
+CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) /* Prepares a builder that tailors on top of base tailoring b; b and b->data are dereferenced in the initializer list. Failures are reported via errorCode, with errorReason set where shown below. */ |
+ : nfd(*Normalizer2::getNFDInstance(errorCode)), |
+ fcd(*Normalizer2Factory::getFCDInstance(errorCode)), |
+ nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), |
+ base(b), |
+ baseData(b->data), |
+ rootElements(b->data->rootElements, b->data->rootElementsLength), |
+ variableTop(0), |
+ dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), |
+ errorReason(NULL), |
+ cesLength(0), |
+ rootPrimaryIndexes(errorCode), nodes(errorCode) { |
+ nfcImpl.ensureCanonIterData(errorCode); |
+ if(U_FAILURE(errorCode)) { /* covers failures from the initializer list as well as ensureCanonIterData() */ |
+ errorReason = "CollationBuilder fields initialization failed"; |
+ return; |
+ } |
+ if(dataBuilder == NULL) { /* the allocation in the initializer list failed; no errorReason is set for this case */ |
+ errorCode = U_MEMORY_ALLOCATION_ERROR; |
+ return; |
+ } |
+ dataBuilder->initForTailoring(baseData, errorCode); |
+ if(U_FAILURE(errorCode)) { |
+ errorReason = "CollationBuilder initialization failed"; |
+ } |
+} |
+ |
+CollationBuilder::~CollationBuilder() { /* Releases the data builder unless parseAndBuild() handed it to a tailoring and set the member to NULL. */ |
+ delete dataBuilder; |
+} |
+ |
+CollationTailoring * /* Parses ruleString and builds a new CollationTailoring on top of `base`. Returns the tailoring released from its LocalPointer (tailoring.orphan()), or NULL with errorCode set; errorReason is taken from the parser after parsing. */ |
+CollationBuilder::parseAndBuild(const UnicodeString &ruleString, |
+ const UVersionInfo rulesVersion, |
+ CollationRuleParser::Importer *importer, |
+ UParseError *outParseError, |
+ UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return NULL; } |
+ if(baseData->rootElements == NULL) { |
+ errorCode = U_MISSING_RESOURCE_ERROR; |
+ errorReason = "missing root elements data, tailoring not supported"; |
+ return NULL; |
+ } |
+ LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings)); |
+ if(tailoring.isNull() || tailoring->isBogus()) { |
+ errorCode = U_MEMORY_ALLOCATION_ERROR; |
+ return NULL; |
+ } |
+ CollationRuleParser parser(baseData, errorCode); |
+ if(U_FAILURE(errorCode)) { return NULL; } |
+ // Note: This always bases &[last variable] and &[first regular] |
+ // on the root collator's maxVariable/variableTop. |
+ // If we wanted this to change after [maxVariable x], then we would keep |
+ // the tailoring.settings pointer here and read its variableTop when we need it. |
+ // See http://unicode.org/cldr/trac/ticket/6070 |
+ variableTop = base->settings->variableTop; |
+ parser.setSink(this); /* this builder is registered as the parser's sink */ |
+ parser.setImporter(importer); |
+ CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings); /* settings object obtained via copyOnWrite(), passed to the parser and updated below */ |
+ parser.parse(ruleString, ownedSettings, outParseError, errorCode); |
+ errorReason = parser.getErrorReason(); /* assigned whether or not parsing succeeded */ |
+ if(U_FAILURE(errorCode)) { return NULL; } |
+ if(dataBuilder->hasMappings()) { |
+ makeTailoredCEs(errorCode); |
+ closeOverComposites(errorCode); |
+ finalizeCEs(errorCode); |
+ // Copy all of ASCII, and Latin-1 letters, into each tailoring. |
+ optimizeSet.add(0, 0x7f); |
+ optimizeSet.add(0xc0, 0xff); |
+ // Hangul is decomposed on the fly during collation, |
+ // and the tailoring data is always built with HANGUL_TAG specials. |
+ optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); |
+ dataBuilder->optimize(optimizeSet, errorCode); |
+ tailoring->ensureOwnedData(errorCode); |
+ if(U_FAILURE(errorCode)) { return NULL; } |
+ if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } |
+ dataBuilder->build(*tailoring->ownedData, errorCode); |
+ tailoring->builder = dataBuilder; /* the tailoring now holds the data builder ... */ |
+ dataBuilder = NULL; /* ... so the destructor's delete of this member becomes a no-op */ |
+ } else { |
+ tailoring->data = baseData; /* no mappings were added: point the tailoring at the base data */ |
+ } |
+ if(U_FAILURE(errorCode)) { return NULL; } |
+ ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( |
+ tailoring->data, ownedSettings, |
+ ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries)); |
+ tailoring->rules = ruleString; |
+ tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination |
+ tailoring->setVersion(base->version, rulesVersion); |
+ return tailoring.orphan(); |
+} |
+ |
+void /* Processes a reset. Fills ces[0..cesLength) with the CEs of the reset position: a special position when str starts with POS_LEAD, otherwise the NFD form of str. With strength == UCOL_IDENTICAL that is all; for &[before strength] it also finds or inserts the node to tailor after and stores a temporary CE for that node in the last ces[] slot. Errors are reported via errorCode, in most cases together with parserErrorReason. */ |
+CollationBuilder::addReset(int32_t strength, const UnicodeString &str, |
+ const char *&parserErrorReason, UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return; } |
+ U_ASSERT(!str.isEmpty()); |
+ if(str.charAt(0) == CollationRuleParser::POS_LEAD) { |
+ ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); |
+ cesLength = 1; |
+ if(U_FAILURE(errorCode)) { return; } |
+ U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); |
+ } else { |
+ // normal reset to a character or string |
+ UnicodeString nfdString = nfd.normalize(str, errorCode); |
+ if(U_FAILURE(errorCode)) { |
+ parserErrorReason = "normalizing the reset position"; |
+ return; |
+ } |
+ cesLength = dataBuilder->getCEs(nfdString, ces, 0); |
+ if(cesLength > Collation::MAX_EXPANSION_LENGTH) { |
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ parserErrorReason = "reset position maps to too many collation elements (more than 31)"; |
+ return; |
+ } |
+ } |
+ if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position |
+ |
+ // &[before strength]position |
+ U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); |
+ int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); |
+ if(U_FAILURE(errorCode)) { return; } /* keeps whatever reason the callee set */ |
+ |
+ int64_t node = nodes.elementAti(index); |
+ // If the index is for a "weaker" tailored node, |
+ // then skip backwards over this and further "weaker" nodes. |
+ while(strengthFromNode(node) > strength) { |
+ index = previousIndexFromNode(node); |
+ node = nodes.elementAti(index); |
+ } |
+ |
+ // Find or insert a node whose index we will put into a temporary CE. |
+ if(strengthFromNode(node) == strength && isTailoredNode(node)) { |
+ // Reset to just before this same-strength tailored node. |
+ index = previousIndexFromNode(node); |
+ } else if(strength == UCOL_PRIMARY) { |
+ // root primary node (has no previous index) |
+ uint32_t p = weight32FromNode(node); |
+ if(p == 0) { |
+ errorCode = U_UNSUPPORTED_ERROR; |
+ parserErrorReason = "reset primary-before ignorable not possible"; |
+ return; |
+ } |
+ if(p <= rootElements.getFirstPrimary()) { |
+ // There is no primary gap between ignorables and the space-first-primary. |
+ errorCode = U_UNSUPPORTED_ERROR; |
+ parserErrorReason = "reset primary-before first non-ignorable not supported"; |
+ return; |
+ } |
+ if(p == Collation::FIRST_TRAILING_PRIMARY) { |
+ // We do not support tailoring to an unassigned-implicit CE. |
+ errorCode = U_UNSUPPORTED_ERROR; |
+ parserErrorReason = "reset primary-before [first trailing] not supported"; |
+ return; |
+ } |
+ p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p)); |
+ index = findOrInsertNodeForPrimary(p, errorCode); |
+ // Go to the last node in this list: |
+ // Tailor after the last node between adjacent root nodes. |
+ for(;;) { |
+ node = nodes.elementAti(index); |
+ int32_t nextIndex = nextIndexFromNode(node); |
+ if(nextIndex == 0) { break; } |
+ index = nextIndex; |
+ } |
+ } else { |
+ // &[before 2] or &[before 3] |
+ index = findCommonNode(index, UCOL_SECONDARY); |
+ if(strength >= UCOL_TERTIARY) { |
+ index = findCommonNode(index, UCOL_TERTIARY); |
+ } |
+ node = nodes.elementAti(index); |
+ if(strengthFromNode(node) == strength) { |
+ // Found a same-strength node with an explicit weight. |
+ uint32_t weight16 = weight16FromNode(node); |
+ if(weight16 == 0) { |
+ errorCode = U_UNSUPPORTED_ERROR; |
+ if(strength == UCOL_SECONDARY) { |
+ parserErrorReason = "reset secondary-before secondary ignorable not possible"; |
+ } else { |
+ parserErrorReason = "reset tertiary-before completely ignorable not possible"; |
+ } |
+ return; |
+ } |
+ U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16); |
+ int32_t previousIndex = previousIndexFromNode(node); |
+ if(weight16 == Collation::COMMON_WEIGHT16) { |
+ // Reset to just before this same-strength common-weight node. |
+ index = previousIndex; |
+ } else { |
+ // A non-common weight is only possible from a root CE. |
+ // Find the higher-level weights, which must all be explicit, |
+ // and then find the preceding weight for this level. |
+ uint32_t previousWeight16 = 0; |
+ int32_t previousWeightIndex = -1; /* stays -1 until a preceding same-level root node is seen while walking back */ |
+ int32_t i = index; |
+ if(strength == UCOL_SECONDARY) { |
+ uint32_t p; |
+ do { |
+ i = previousIndexFromNode(node); |
+ node = nodes.elementAti(i); |
+ if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) && |
+ previousWeightIndex < 0) { |
+ previousWeightIndex = i; |
+ previousWeight16 = weight16FromNode(node); |
+ } |
+ } while(strengthFromNode(node) > UCOL_PRIMARY); |
+ U_ASSERT(!isTailoredNode(node)); |
+ p = weight32FromNode(node); |
+ weight16 = rootElements.getSecondaryBefore(p, weight16); |
+ } else { |
+ uint32_t p, s; |
+ do { |
+ i = previousIndexFromNode(node); |
+ node = nodes.elementAti(i); |
+ if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) && |
+ previousWeightIndex < 0) { |
+ previousWeightIndex = i; |
+ previousWeight16 = weight16FromNode(node); |
+ } |
+ } while(strengthFromNode(node) > UCOL_SECONDARY); |
+ U_ASSERT(!isTailoredNode(node)); |
+ if(strengthFromNode(node) == UCOL_SECONDARY) { |
+ s = weight16FromNode(node); |
+ do { |
+ i = previousIndexFromNode(node); |
+ node = nodes.elementAti(i); |
+ } while(strengthFromNode(node) > UCOL_PRIMARY); |
+ U_ASSERT(!isTailoredNode(node)); |
+ } else { |
+ U_ASSERT(!nodeHasBefore2(node)); |
+ s = Collation::COMMON_WEIGHT16; |
+ } |
+ p = weight32FromNode(node); |
+ weight16 = rootElements.getTertiaryBefore(p, s, weight16); |
+ U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); |
+ } |
+ // Find or insert the new explicit weight before the current one. |
+ if(previousWeightIndex >= 0 && weight16 == previousWeight16) { |
+ // Tailor after the last node between adjacent root nodes. |
+ index = previousIndex; |
+ } else { |
+ node = nodeFromWeight16(weight16) | nodeFromStrength(strength); |
+ index = insertNodeBetween(previousIndex, index, node, errorCode); |
+ } |
+ } |
+ } else { |
+ // Found a stronger node with implied strength-common weight. |
+ int64_t hasBefore3 = 0; |
+ if(strength == UCOL_SECONDARY) { |
+ U_ASSERT(!nodeHasBefore2(node)); |
+ // Move the HAS_BEFORE3 flag from the parent node |
+ // to the new secondary common node. |
+ hasBefore3 = node & HAS_BEFORE3; |
+ node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2; |
+ } else { |
+ U_ASSERT(!nodeHasBefore3(node)); |
+ node |= HAS_BEFORE3; |
+ } |
+ nodes.setElementAt(node, index); /* store the updated HAS_BEFORE flags on the stronger node */ |
+ int32_t nextIndex = nextIndexFromNode(node); |
+ // Insert default nodes with weights 02 and 05, reset to the 02 node. |
+ node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength); |
+ index = insertNodeBetween(index, nextIndex, node, errorCode); |
+ node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | |
+ nodeFromStrength(strength); |
+ insertNodeBetween(index, nextIndex, node, errorCode); |
+ } |
+ // Strength of the temporary CE = strength of its reset position. |
+ // Code above raises an error if the before-strength is stronger. |
+ strength = ceStrength(ces[cesLength - 1]); |
+ } |
+ if(U_FAILURE(errorCode)) { |
+ parserErrorReason = "inserting reset position for &[before n]"; |
+ return; |
+ } |
+ ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); /* the last CE now refers to the chosen node */ |
+} |
+ |
+int64_t /* Maps a two-unit special reset position string (str.charAt(1) is POS_BASE plus a position code) to a CE: either a root CE or a temporary CE that refers to a tailored node. Returns 0 for the tertiary-ignorable positions and on error; unsupported positions set errorCode and parserErrorReason. */ |
+CollationBuilder::getSpecialResetPosition(const UnicodeString &str, |
+ const char *&parserErrorReason, UErrorCode &errorCode) { |
+ U_ASSERT(str.length() == 2); |
+ int64_t ce; |
+ int32_t strength = UCOL_PRIMARY; |
+ UBool isBoundary = FALSE; |
+ UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; /* position code relative to POS_BASE */ |
+ U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); |
+ switch(pos) { |
+ case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: |
+ // Quaternary CEs are not supported. |
+ // Non-zero quaternary weights are possible only on tertiary or stronger CEs. |
+ return 0; |
+ case CollationRuleParser::LAST_TERTIARY_IGNORABLE: |
+ return 0; |
+ case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { |
+ // Look for a tailored tertiary node after [0, 0, 0]. |
+ int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); |
+ if(U_FAILURE(errorCode)) { return 0; } |
+ int64_t node = nodes.elementAti(index); |
+ if((index = nextIndexFromNode(node)) != 0) { |
+ node = nodes.elementAti(index); |
+ U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); |
+ if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { |
+ return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); |
+ } |
+ } |
+ return rootElements.getFirstTertiaryCE(); |
+ // No need to look for nodeHasAnyBefore() on a tertiary node. |
+ } |
+ case CollationRuleParser::LAST_SECONDARY_IGNORABLE: |
+ ce = rootElements.getLastTertiaryCE(); |
+ strength = UCOL_TERTIARY; |
+ break; |
+ case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { |
+ // Look for a tailored secondary node after [0, 0, *]. |
+ int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); |
+ if(U_FAILURE(errorCode)) { return 0; } |
+ int64_t node = nodes.elementAti(index); |
+ while((index = nextIndexFromNode(node)) != 0) { |
+ node = nodes.elementAti(index); |
+ strength = strengthFromNode(node); |
+ if(strength < UCOL_SECONDARY) { break; } |
+ if(strength == UCOL_SECONDARY) { |
+ if(isTailoredNode(node)) { |
+ if(nodeHasBefore3(node)) { |
+ index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); /* advance two nodes along the list */ |
+ U_ASSERT(isTailoredNode(nodes.elementAti(index))); |
+ } |
+ return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); |
+ } else { |
+ break; |
+ } |
+ } |
+ } |
+ ce = rootElements.getFirstSecondaryCE(); |
+ strength = UCOL_SECONDARY; |
+ break; |
+ } |
+ case CollationRuleParser::LAST_PRIMARY_IGNORABLE: |
+ ce = rootElements.getLastSecondaryCE(); |
+ strength = UCOL_SECONDARY; |
+ break; |
+ case CollationRuleParser::FIRST_VARIABLE: |
+ ce = rootElements.getFirstPrimaryCE(); |
+ isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary |
+ break; |
+ case CollationRuleParser::LAST_VARIABLE: |
+ ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); |
+ break; |
+ case CollationRuleParser::FIRST_REGULAR: |
+ ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); |
+ isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary |
+ break; |
+ case CollationRuleParser::LAST_REGULAR: |
+ // Use the Hani-first-primary rather than the actual last "regular" CE before it, |
+ // for backward compatibility with behavior before the introduction of |
+ // script-first-primary CEs in the root collator. |
+ ce = rootElements.firstCEWithPrimaryAtLeast( |
+ baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); |
+ break; |
+ case CollationRuleParser::FIRST_IMPLICIT: |
+ ce = baseData->getSingleCE(0x4e00, errorCode); |
+ break; |
+ case CollationRuleParser::LAST_IMPLICIT: |
+ // We do not support tailoring to an unassigned-implicit CE. |
+ errorCode = U_UNSUPPORTED_ERROR; |
+ parserErrorReason = "reset to [last implicit] not supported"; |
+ return 0; |
+ case CollationRuleParser::FIRST_TRAILING: |
+ ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); |
+ isBoundary = TRUE; // trailing first primary (there is no mapping for it) |
+ break; |
+ case CollationRuleParser::LAST_TRAILING: |
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ parserErrorReason = "LDML forbids tailoring to U+FFFF"; |
+ return 0; |
+ default: |
+ U_ASSERT(FALSE); |
+ return 0; |
+ } |
+ |
+ int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); |
+ if(U_FAILURE(errorCode)) { return 0; } |
+ int64_t node = nodes.elementAti(index); |
+ if((pos & 1) == 0) { |
+ // even pos = [first xyz] |
+ if(!nodeHasAnyBefore(node) && isBoundary) { |
+ // A <group> first primary boundary is artificially added to FractionalUCA.txt. |
+ // It is reachable via its special contraction, but is not normally used. |
+ // Find the first character tailored after the boundary CE, |
+ // or the first real root CE after it. |
+ if((index = nextIndexFromNode(node)) != 0) { |
+ // If there is a following node, then it must be tailored |
+ // because there are no root CEs with a boundary primary |
+ // and non-common secondary/tertiary weights. |
+ node = nodes.elementAti(index); |
+ U_ASSERT(isTailoredNode(node)); |
+ ce = tempCEFromIndexAndStrength(index, strength); |
+ } else { |
+ U_ASSERT(strength == UCOL_PRIMARY); |
+ uint32_t p = (uint32_t)(ce >> 32); /* primary weight = upper 32 bits of the CE */ |
+ int32_t pIndex = rootElements.findPrimary(p); |
+ UBool isCompressible = baseData->isCompressiblePrimary(p); |
+ p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); |
+ ce = Collation::makeCE(p); |
+ index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); |
+ if(U_FAILURE(errorCode)) { return 0; } |
+ node = nodes.elementAti(index); |
+ } |
+ } |
+ if(nodeHasAnyBefore(node)) { |
+ // Get the first node that was tailored before this one at a weaker strength. |
+ if(nodeHasBefore2(node)) { |
+ index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); |
+ node = nodes.elementAti(index); |
+ } |
+ if(nodeHasBefore3(node)) { |
+ index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); |
+ } |
+ U_ASSERT(isTailoredNode(nodes.elementAti(index))); |
+ ce = tempCEFromIndexAndStrength(index, strength); |
+ } |
+ } else { |
+ // odd pos = [last xyz] |
+ // Find the last node that was tailored after the [last xyz] |
+ // at a strength no greater than the position's strength. |
+ for(;;) { |
+ int32_t nextIndex = nextIndexFromNode(node); |
+ if(nextIndex == 0) { break; } |
+ int64_t nextNode = nodes.elementAti(nextIndex); |
+ if(strengthFromNode(nextNode) < strength) { break; } |
+ index = nextIndex; |
+ node = nextNode; |
+ } |
+ // Do not make a temporary CE for a root node. |
+ // This last node might be the node for the root CE itself, |
+ // or a node with a common secondary or tertiary weight. |
+ if(isTailoredNode(node)) { |
+ ce = tempCEFromIndexAndStrength(index, strength); |
+ } |
+ } |
+ return ce; |
+} |
+ |
+// Adds one tailoring relation for prefix|str (plus an optional extension) at the given strength. |
+// |
+// - Normalizes prefix and str to NFD and rejects contractions that start with |
+//   conjoining Jamo L or V, or end with Jamo L or L+V (U_UNSUPPORTED_ERROR). |
+// - Unless strength is UCOL_IDENTICAL: finds or inserts the node for the current ces[], |
+//   inserts a new tailored node after it, and replaces the last CE with a temporary CE |
+//   that refers to the new node. |
+// - Sets the case bits, temporarily appends the CEs of the NFD extension, and writes the |
+//   mapping (also for the original strings if they differ from their NFD forms |
+//   and are not ignored). |
+// |
+// On failure, errorCode is set and parserErrorReason points to a description. |
+// cesLength is restored to its pre-extension value once the mapping has been written; |
+// the error returns leave it as it is at that point. |
+void |
+CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, |
+                              const UnicodeString &str, const UnicodeString &extension, |
+                              const char *&parserErrorReason, UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return; } |
+    UnicodeString nfdPrefix; |
+    if(!prefix.isEmpty()) { |
+        nfd.normalize(prefix, nfdPrefix, errorCode); |
+        if(U_FAILURE(errorCode)) { |
+            parserErrorReason = "normalizing the relation prefix"; |
+            return; |
+        } |
+    } |
+    UnicodeString nfdString = nfd.normalize(str, errorCode); |
+    if(U_FAILURE(errorCode)) { |
+        parserErrorReason = "normalizing the relation string"; |
+        return; |
+    } |
+ |
+    // The runtime code decomposes Hangul syllables on the fly, |
+    // with recursive processing but without making the Jamo pieces visible for matching. |
+    // It does not work with certain types of contextual mappings. |
+    int32_t nfdLength = nfdString.length(); |
+    if(nfdLength >= 2) { |
+        UChar c = nfdString.charAt(0); |
+        if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { |
+            // While handling a Hangul syllable, contractions starting with Jamo L or V |
+            // would not see the following Jamo of that syllable. |
+            errorCode = U_UNSUPPORTED_ERROR; |
+            parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; |
+            return; |
+        } |
+        c = nfdString.charAt(nfdLength - 1); |
+        if(Hangul::isJamoL(c) || |
+                (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { |
+            // A contraction ending with Jamo L or L+V would require |
+            // generating Hangul syllables in addTailComposites() (588 for a Jamo L), |
+            // or decomposing a following Hangul syllable on the fly, during contraction matching. |
+            errorCode = U_UNSUPPORTED_ERROR; |
+            parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; |
+            return; |
+        } |
+        // A Hangul syllable completely inside a contraction is ok. |
+    } |
+    // Note: If there is a prefix, then the parser checked that |
+    // both the prefix and the string begin with NFC boundaries (not Jamo V or T). |
+    // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) |
+    // (While handling a Hangul syllable, prefixes on Jamo V or T |
+    // would not see the previous Jamo of that syllable.) |
+ |
+    if(strength != UCOL_IDENTICAL) { |
+        // Find the node index after which we insert the new tailored node. |
+        const char *reasonBefore = parserErrorReason; |
+        int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); |
+        if(U_FAILURE(errorCode)) { |
+            // Stop here, like addReset() does after the same call. |
+            // Falling through would run the checks and the node insertion below on a failed |
+            // lookup and replace the callee's specific reason with the generic one. |
+            // Supply the generic reason only if the callee did not set one. |
+            if(parserErrorReason == reasonBefore) { |
+                parserErrorReason = "modifying collation elements"; |
+            } |
+            return; |
+        } |
+        U_ASSERT(cesLength > 0); |
+        int64_t ce = ces[cesLength - 1]; |
+        if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { |
+            // There is no primary gap between ignorables and the space-first-primary. |
+            errorCode = U_UNSUPPORTED_ERROR; |
+            parserErrorReason = "tailoring primary after ignorables not supported"; |
+            return; |
+        } |
+        if(strength == UCOL_QUATERNARY && ce == 0) { |
+            // The CE data structure does not support non-zero quaternary weights |
+            // on tertiary ignorables. |
+            errorCode = U_UNSUPPORTED_ERROR; |
+            parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; |
+            return; |
+        } |
+        // Insert the new tailored node. |
+        index = insertTailoredNodeAfter(index, strength, errorCode); |
+        if(U_FAILURE(errorCode)) { |
+            parserErrorReason = "modifying collation elements"; |
+            return; |
+        } |
+        // Strength of the temporary CE: |
+        // The new relation may yield a stronger CE but not a weaker one. |
+        int32_t tempStrength = ceStrength(ce); |
+        if(strength < tempStrength) { tempStrength = strength; } |
+        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); |
+    } |
+ |
+    setCaseBits(nfdString, parserErrorReason, errorCode); |
+    if(U_FAILURE(errorCode)) { return; } |
+ |
+    // Remember the length so that the extension CEs can be dropped again below. |
+    int32_t cesLengthBeforeExtension = cesLength; |
+    if(!extension.isEmpty()) { |
+        UnicodeString nfdExtension = nfd.normalize(extension, errorCode); |
+        if(U_FAILURE(errorCode)) { |
+            parserErrorReason = "normalizing the relation extension"; |
+            return; |
+        } |
+        cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); |
+        if(cesLength > Collation::MAX_EXPANSION_LENGTH) { |
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+            parserErrorReason = |
+                    "extension string adds too many collation elements (more than 31 total)"; |
+            return; |
+        } |
+    } |
+    uint32_t ce32 = Collation::UNASSIGNED_CE32; |
+    if((prefix != nfdPrefix || str != nfdString) && |
+            !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { |
+        // Map from the original input to the CEs. |
+        // We do this in case the canonical closure is incomplete, |
+        // so that it is possible to explicitly provide the missing mappings. |
+        ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); |
+    } |
+    addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); |
+    if(U_FAILURE(errorCode)) { |
+        parserErrorReason = "writing collation elements"; |
+        return; |
+    } |
+    cesLength = cesLengthBeforeExtension; |
+} |
+ |
+int32_t /* Returns the node index for the last CE in ces[] that is at least as strong as `strength`, dropping weaker trailing CEs (cesLength is reduced accordingly). A temporary CE yields the node index stored in it; a root CE gets its node found or inserted. Returns 0 on error. */ |
+CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason, |
+ UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return 0; } |
+ U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); |
+ |
+ // Find the last CE that is at least as "strong" as the requested difference. |
+ // Note: Stronger is smaller (UCOL_PRIMARY=0). |
+ int64_t ce; |
+ for(;; --cesLength) { |
+ if(cesLength == 0) { |
+ ce = ces[0] = 0; /* no CE was strong enough: continue with the single CE 0 */ |
+ cesLength = 1; |
+ break; |
+ } else { |
+ ce = ces[cesLength - 1]; |
+ } |
+ if(ceStrength(ce) <= strength) { break; } |
+ } |
+ |
+ if(isTempCE(ce)) { |
+ // No need to findCommonNode() here for lower levels |
+ // because insertTailoredNodeAfter() will do that anyway. |
+ return indexFromTempCE(ce); |
+ } |
+ |
+ // root CE |
+ if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { /* tests the top byte of the CE */ |
+ errorCode = U_UNSUPPORTED_ERROR; |
+ parserErrorReason = "tailoring relative to an unassigned code point not supported"; |
+ return 0; |
+ } |
+ return findOrInsertNodeForRootCE(ce, strength, errorCode); |
+} |
+ |
+int32_t /* Finds or inserts the node for the root CE's primary weight and, if strength is secondary or weaker, the nodes for its secondary and then tertiary weights; returns the index of the node for the weakest level handled. Returns 0 if errorCode already indicates failure. */ |
+CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return 0; } |
+ U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); |
+ |
+ // Find or insert the node for each of the root CE's weights, |
+ // down to the requested level/strength. |
+ // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). |
+ U_ASSERT((ce & 0xc0) == 0); |
+ int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode); /* primary weight = upper 32 bits of the CE */ |
+ if(strength >= UCOL_SECONDARY) { |
+ uint32_t lower32 = (uint32_t)ce; |
+ index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); /* secondary weight = bits 31..16 */ |
+ if(strength >= UCOL_TERTIARY) { |
+ index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK, |
+ UCOL_TERTIARY, errorCode); |
+ } |
+ } |
+ return index; |
+} |
+ |
+namespace { |
+ |
+/** |
+ * Like Java Collections.binarySearch(List, key, Comparator). |
+ * Looks up primary weight p among the nodes referenced by rootPrimaryIndexes, which must be sorted by node primary. |
+ * @return the index>=0 where the item was found, |
+ * or the index<0 for inserting the primary at ~index in sorted order |
+ * (index into rootPrimaryIndexes) |
+ */ |
+int32_t |
+binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length, |
+ const int64_t *nodes, uint32_t p) { |
+ if(length == 0) { return ~0; } /* empty list: insert at position 0 */ |
+ int32_t start = 0; |
+ int32_t limit = length; |
+ for (;;) { |
+ int32_t i = (start + limit) / 2; |
+ int64_t node = nodes[rootPrimaryIndexes[i]]; |
+ uint32_t nodePrimary = (uint32_t)(node >> 32); // weight32FromNode(node) |
+ if (p == nodePrimary) { |
+ return i; |
+ } else if (p < nodePrimary) { |
+ if (i == start) { |
+ return ~start; // insert p before i |
+ } |
+ limit = i; |
+ } else { |
+ if (i == start) { |
+ return ~(start + 1); // insert p after i |
+ } |
+ start = i; |
+ } |
+ } |
+} |
+ |
+} // namespace |
+ |
+/** |
+ * Returns the index of the root node for primary weight p, |
+ * appending a new node and registering it in rootPrimaryIndexes if there is none yet. |
+ * |
+ * @param p         the primary weight |
+ * @param errorCode ICU in/out error code; 0 is returned immediately if it already indicates failure |
+ * @return the index of the existing or newly appended node in nodes; |
+ *         0 if the new node could not be appended |
+ */ |
+int32_t |
+CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return 0; } |
+ |
+    int32_t rootIndex = binarySearchForRootPrimaryNode( |
+        rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); |
+    if(rootIndex >= 0) { |
+        return rootPrimaryIndexes.elementAti(rootIndex); |
+    } |
+    // Start a new list of nodes with this primary. |
+    int32_t index = nodes.size(); |
+    nodes.addElement(nodeFromWeight32(p), errorCode); |
+    // Stop here on failure (same check as in insertNodeBetween()) so that |
+    // rootPrimaryIndexes never refers to a node that was not actually appended. |
+    if(U_FAILURE(errorCode)) { return 0; } |
+    // ~rootIndex is the sorted insertion position reported by the binary search. |
+    rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); |
+    return index; |
+} |
+ |
+/** |
+ * Finds the root node with the given secondary or tertiary weight in the list |
+ * that follows nodes[index], or inserts such a node at its sorted position. |
+ * |
+ * @param index     index of the node after which to search |
+ * @param weight16  the 16-bit weight; asserted to be 0 or at least the common weight |
+ * @param level     the strength of the weight |
+ * @param errorCode ICU in/out error code; 0 is returned immediately if it already indicates failure |
+ * @return the index of the found or inserted node |
+ */ |
+int32_t |
+CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return 0; } |
+    U_ASSERT(0 <= index && index < nodes.size()); |
+    U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16); |
+ |
+    // Only reset-before inserts common weights. |
+    if(weight16 == Collation::COMMON_WEIGHT16) { |
+        return findCommonNode(index, level); |
+    } |
+ |
+    // Walk the list looking for the root CE's weight on this level. |
+    // The insertion point is postponed as far as possible: |
+    // the new root node goes before the next stronger node, |
+    // or before the next root node with the same strength and a larger weight. |
+    int32_t prevIndex = index; |
+    int32_t nextIndex = nextIndexFromNode(nodes.elementAti(prevIndex)); |
+    while(nextIndex != 0) { |
+        const int64_t candidate = nodes.elementAti(nextIndex); |
+        const int32_t candidateStrength = strengthFromNode(candidate); |
+        // Insert before a stronger node. |
+        if(candidateStrength < level) { break; } |
+        if(candidateStrength == level && !isTailoredNode(candidate)) { |
+            const uint32_t candidateWeight16 = weight16FromNode(candidate); |
+            // Found the node for the root CE up to this level. |
+            if(candidateWeight16 == weight16) { return nextIndex; } |
+            // Insert before a node with a larger same-strength weight. |
+            if(candidateWeight16 > weight16) { break; } |
+        } |
+        // Skip this node and look at its successor. |
+        prevIndex = nextIndex; |
+        nextIndex = nextIndexFromNode(candidate); |
+    } |
+    const int64_t newNode = nodeFromWeight16(weight16) | nodeFromStrength(level); |
+    return insertNodeBetween(prevIndex, nextIndex, newNode, errorCode); |
+} |
+ |
+/** |
+ * Inserts a new tailored node of the given strength after nodes[index]. |
+ * For secondary and weaker strengths the starting point is first moved |
+ * to the matching common-weight node(s). |
+ * |
+ * @param index     index of the node after which the new node is to be placed |
+ * @param strength  the strength of the new tailored node |
+ * @param errorCode ICU in/out error code; 0 is returned immediately if it already indicates failure |
+ * @return the result of insertNodeBetween() for the new node |
+ */ |
+int32_t |
+CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return 0; } |
+    U_ASSERT(0 <= index && index < nodes.size()); |
+ |
+    int32_t prevIndex = index; |
+    if(strength >= UCOL_SECONDARY) { |
+        prevIndex = findCommonNode(prevIndex, UCOL_SECONDARY); |
+    } |
+    if(strength >= UCOL_TERTIARY) { |
+        prevIndex = findCommonNode(prevIndex, UCOL_TERTIARY); |
+    } |
+ |
+    // Postpone insertion: |
+    // the new node goes before the next one with a strength at least as strong. |
+    int32_t nextIndex = nextIndexFromNode(nodes.elementAti(prevIndex)); |
+    while(nextIndex != 0) { |
+        const int64_t following = nodes.elementAti(nextIndex); |
+        if(strengthFromNode(following) <= strength) { break; } |
+        // The following node has a weaker (larger) strength than the new one: skip it. |
+        prevIndex = nextIndex; |
+        nextIndex = nextIndexFromNode(following); |
+    } |
+    const int64_t tailoredNode = IS_TAILORED | nodeFromStrength(strength); |
+    return insertNodeBetween(prevIndex, nextIndex, tailoredNode, errorCode); |
+} |
+ |
+/** |
+ * Appends a node to nodes and links it into the doubly linked list |
+ * between nodes[index] and nodes[nextIndex]. |
+ * |
+ * @param index     index of the predecessor; its next-index must currently be nextIndex |
+ * @param nextIndex index of the successor, or 0 if the new node becomes the last one |
+ * @param node      the new node; its previous/next index fields must still be 0 |
+ * @param errorCode ICU in/out error code; 0 is returned on prior failure or if appending fails |
+ * @return the index of the new node in nodes |
+ */ |
+int32_t |
+CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node, |
+                                    UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return 0; } |
+    U_ASSERT(previousIndexFromNode(node) == 0); |
+    U_ASSERT(nextIndexFromNode(node) == 0); |
+    U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); |
+ |
+    // The new node is stored at the end of the array, with both links filled in. |
+    const int32_t newIndex = nodes.size(); |
+    const int64_t linkedNode = node | nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); |
+    nodes.addElement(linkedNode, errorCode); |
+    if(U_FAILURE(errorCode)) { return 0; } |
+ |
+    // nodes[index].nextIndex = newIndex |
+    const int64_t predecessor = nodes.elementAti(index); |
+    nodes.setElementAt(changeNodeNextIndex(predecessor, newIndex), index); |
+ |
+    // nodes[nextIndex].previousIndex = newIndex, unless there is no successor |
+    if(nextIndex == 0) { return newIndex; } |
+    const int64_t successor = nodes.elementAti(nextIndex); |
+    nodes.setElementAt(changeNodePreviousIndex(successor, newIndex), nextIndex); |
+    return newIndex; |
+} |
+ |
+/** |
+ * Starting from nodes[index], finds the node that stands for the common weight |
+ * on the given level. |
+ * |
+ * @param index    index of the starting node |
+ * @param strength UCOL_SECONDARY or UCOL_TERTIARY |
+ * @return index itself if the starting node is no stronger than the level or has no |
+ *         "before" node on that level; otherwise the index of the explicit common-weight node |
+ */ |
+int32_t |
+CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { |
+    U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); |
+    const int64_t startNode = nodes.elementAti(index); |
+    // The current node is no stronger. |
+    if(strengthFromNode(startNode) >= strength) { return index; } |
+    // Without a before-node on this level, the current node implies the strength-common weight. |
+    if(!(strength == UCOL_SECONDARY ? nodeHasBefore2(startNode) : nodeHasBefore3(startNode))) { |
+        return index; |
+    } |
+ |
+    // The immediate successor must be the BEFORE_WEIGHT16 node of this level. |
+    int32_t current = nextIndexFromNode(startNode); |
+    int64_t node = nodes.elementAti(current); |
+    U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && |
+             weight16FromNode(node) == BEFORE_WEIGHT16); |
+ |
+    // Skip to the explicit common node: |
+    // the first following node that is not tailored and not weaker than this level. |
+    for(;;) { |
+        current = nextIndexFromNode(node); |
+        node = nodes.elementAti(current); |
+        U_ASSERT(strengthFromNode(node) >= strength); |
+        if(!isTailoredNode(node) && strengthFromNode(node) <= strength) { break; } |
+    } |
+    U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); |
+    return current; |
+} |
+ |
+/** |
+ * Recomputes the case bits (0xc000) of the CEs currently held in ces[0..cesLength-1]. |
+ * |
+ * CEs of primary strength receive case bits derived from the base CEs of nfdString, |
+ * CEs of tertiary strength receive 0x8000, and all other CEs end up with zero case bits. |
+ * |
+ * @param nfdString         the tailored string whose base CEs supply the case bits |
+ * @param parserErrorReason set to a static message if fetching the base CEs fails |
+ * @param errorCode         ICU in/out error code; nothing is done if it already indicates failure |
+ */ |
+void |
+CollationBuilder::setCaseBits(const UnicodeString &nfdString, |
+ const char *&parserErrorReason, UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return; } |
+ // Count the tailored CEs of primary strength; only those take case bits from the base CEs. |
+ int32_t numTailoredPrimaries = 0; |
+ for(int32_t i = 0; i < cesLength; ++i) { |
+ if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } |
+ } |
+ // We should not be able to get too many case bits because |
+ // cesLength<=31==MAX_EXPANSION_LENGTH. |
+ // 31 pairs of case bits fit into an int64_t without setting its sign bit. |
+ U_ASSERT(numTailoredPrimaries <= 31); |
+ |
+ // One pair of case bits per tailored primary CE, the first pair in the lowest two bits. |
+ int64_t cases = 0; |
+ if(numTailoredPrimaries > 0) { |
+ const UChar *s = nfdString.getBuffer(); |
+ UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length()); |
+ // One less than the fetchCEs() result: the assertion below expects NO_CE at that index. |
+ int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; |
+ if(U_FAILURE(errorCode)) { |
+ parserErrorReason = "fetching root CEs for tailored string"; |
+ return; |
+ } |
+ U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE); |
+ |
+ // Case bits for the last tailored primary; they may summarize several trailing base primaries. |
+ uint32_t lastCase = 0; |
+ int32_t numBasePrimaries = 0; |
+ for(int32_t i = 0; i < baseCEsLength; ++i) { |
+ int64_t ce = baseCEs.getCE(i); |
+ // Only base CEs with a non-zero primary weight contribute case bits. |
+ if((ce >> 32) != 0) { |
+ ++numBasePrimaries; |
+ // The case bits are bits 15..14 of the CE. |
+ uint32_t c = ((uint32_t)ce >> 14) & 3; |
+ U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE |
+ if(numBasePrimaries < numTailoredPrimaries) { |
+ cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); |
+ } else if(numBasePrimaries == numTailoredPrimaries) { |
+ lastCase = c; |
+ } else if(c != lastCase) { |
+ // There are more base primary CEs than tailored primaries. |
+ // Set mixed case if the case bits of the remainder differ. |
+ lastCase = 1; |
+ // Nothing more can change. |
+ break; |
+ } |
+ } |
+ } |
+ // The last pair is only set if there were at least as many base primaries as tailored ones. |
+ if(numBasePrimaries >= numTailoredPrimaries) { |
+ cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); |
+ } |
+ } |
+ |
+ // Write the case bits back, consuming one pair from cases per primary CE. |
+ for(int32_t i = 0; i < cesLength; ++i) { |
+ int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits |
+ int32_t strength = ceStrength(ce); |
+ if(strength == UCOL_PRIMARY) { |
+ ce |= (cases & 3) << 14; |
+ cases >>= 2; |
+ } else if(strength == UCOL_TERTIARY) { |
+ // Tertiary CEs must have uppercase bits. |
+ // See the LDML spec, and comments in class CollationCompare. |
+ ce |= 0x8000; |
+ } |
+ // Tertiary ignorable CEs must have 0 case bits. |
+ // We set 0 case bits for secondary CEs too |
+ // since currently only U+0345 is cased and maps to a secondary CE, |
+ // and it is lowercase. Other secondaries are uncased. |
+ // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. |
+ ces[i] = ce; |
+ } |
+} |
+ |
+/** |
+ * Forwards a [suppressContractions [set]] rule to the data builder. |
+ * |
+ * @param set               the set handed on to dataBuilder->suppressContractions() |
+ * @param parserErrorReason set to a static message if the data builder reports a failure |
+ * @param errorCode         ICU in/out error code; nothing is done if it already indicates failure |
+ */ |
+void |
+CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason, |
+                                       UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return; } |
+    dataBuilder->suppressContractions(set, errorCode); |
+    if(U_SUCCESS(errorCode)) { return; } |
+    parserErrorReason = "application of [suppressContractions [set]] failed"; |
+} |
+ |
+/** |
+ * Adds the given set to optimizeSet. |
+ * The parser error reason is never set by this function. |
+ * |
+ * @param set       the characters to add to optimizeSet |
+ * @param errorCode ICU in/out error code; nothing is added if it already indicates failure |
+ */ |
+void |
+CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */, |
+                           UErrorCode &errorCode) { |
+    if(U_SUCCESS(errorCode)) { |
+        optimizeSet.addAll(set); |
+    } |
+} |
+ |
+/** |
+ * Adds the mapping from the NFD input to the CEs, then the mappings for its |
+ * canonically equivalent forms, and finally the tail composites. |
+ * |
+ * @param nfdPrefix    the prefix (context before the string) |
+ * @param nfdString    the string that is being mapped |
+ * @param newCEs       the CEs to map to |
+ * @param newCEsLength number of CEs in newCEs |
+ * @param ce32         the CE32 to use, as passed on to addIfDifferent() |
+ * @param errorCode    ICU in/out error code, passed on to the three steps |
+ * @return the CE32 value as returned by addOnlyClosure() |
+ */ |
+uint32_t |
+CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, |
+                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, |
+                                 UErrorCode &errorCode) { |
+    // Step 1: map from the NFD input to the CEs. |
+    const uint32_t nfdCE32 = |
+        addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); |
+    // Step 2: map from the canonically equivalent inputs, reusing the CE32 from step 1. |
+    const uint32_t closureCE32 = |
+        addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, nfdCE32, errorCode); |
+    // Step 3: composites that merge into the tail of the string. |
+    addTailComposites(nfdPrefix, nfdString, errorCode); |
+    return closureCE32; |
+} |
+ |
+/** |
+ * Maps the canonically equivalent forms of prefix+string to the given CEs, |
+ * but not the all-NFD input itself. |
+ * |
+ * @param nfdPrefix    the prefix; may be empty |
+ * @param nfdString    the string |
+ * @param newCEs       the CEs to map to |
+ * @param newCEsLength number of CEs in newCEs |
+ * @param ce32         the CE32 to use, as passed on to addIfDifferent() |
+ * @param errorCode    ICU in/out error code; ce32 is returned unchanged if it already indicates failure |
+ * @return the CE32 value as last returned by addIfDifferent(), or the input ce32 if nothing was added |
+ */ |
+uint32_t |
+CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, |
+                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, |
+                                 UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return ce32; } |
+ |
+    if(nfdPrefix.isEmpty()) { |
+        // No prefix: only the equivalents of the string need to be enumerated. |
+        const UnicodeString emptyPrefix; |
+        CanonicalIterator equivalents(nfdString, errorCode); |
+        if(U_FAILURE(errorCode)) { return ce32; } |
+        for(;;) { |
+            const UnicodeString candidate = equivalents.next(); |
+            if(candidate.isBogus()) { return ce32; }  // iteration finished |
+            // Skip unusable strings and the NFD input itself. |
+            const UBool skip = ignoreString(candidate, errorCode) || candidate == nfdString; |
+            if(skip) { continue; } |
+            ce32 = addIfDifferent(emptyPrefix, candidate, newCEs, newCEsLength, ce32, errorCode); |
+            if(U_FAILURE(errorCode)) { return ce32; } |
+        } |
+    } |
+ |
+    // With a prefix: combine every usable prefix form with every usable string form. |
+    CanonicalIterator prefixIter(nfdPrefix, errorCode); |
+    CanonicalIterator stringIter(nfdString, errorCode); |
+    if(U_FAILURE(errorCode)) { return ce32; } |
+    for(;;) { |
+        const UnicodeString prefix = prefixIter.next(); |
+        if(prefix.isBogus()) { break; } |
+        if(ignorePrefix(prefix, errorCode)) { continue; } |
+        const UBool isNFDPrefix = prefix == nfdPrefix; |
+        for(;;) { |
+            const UnicodeString str = stringIter.next(); |
+            if(str.isBogus()) { break; } |
+            if(ignoreString(str, errorCode)) { continue; } |
+            // The all-NFD combination is not added here. |
+            if(isNFDPrefix && str == nfdString) { continue; } |
+            ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); |
+            if(U_FAILURE(errorCode)) { return ce32; } |
+        } |
+        // Restart the string forms for the next prefix form. |
+        stringIter.reset(); |
+    } |
+    return ce32; |
+} |
+ |
+/** |
+ * Looks at the last starter of nfdString and at the composites whose decomposition |
+ * starts with it. For each composite that can be merged into the tail of the string, |
+ * adds a mapping for the merged string (and its closure) if its CEs differ. |
+ * |
+ * @param nfdPrefix the prefix used for all CE lookups and added mappings |
+ * @param nfdString the string whose tail is examined |
+ * @param errorCode ICU in/out error code; nothing is done if it already indicates failure |
+ */ |
+void |
+CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, |
+                                    UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return; } |
+ |
+    // Look for the last starter in the NFD string: |
+    // step backwards over code points with a non-zero combining class. |
+    int32_t indexAfterLastStarter = nfdString.length(); |
+    if(indexAfterLastStarter == 0) { return; }  // no starter at all |
+    UChar32 lastStarter = nfdString.char32At(indexAfterLastStarter - 1); |
+    while(nfd.getCombiningClass(lastStarter) != 0) { |
+        indexAfterLastStarter -= U16_LENGTH(lastStarter); |
+        if(indexAfterLastStarter == 0) { return; }  // no starter at all |
+        lastStarter = nfdString.char32At(indexAfterLastStarter - 1); |
+    } |
+    // No closure to Hangul syllables since we decompose them on the fly. |
+    if(Hangul::isJamoL(lastStarter)) { return; } |
+ |
+    // Are there any composites whose decomposition starts with the lastStarter? |
+    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. |
+    // We might find some more equivalent mappings here if it did. |
+    UnicodeSet composites; |
+    if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } |
+ |
+    UnicodeString decomposition; |
+    UnicodeString mergedNFD, mergedComposed; |
+    int64_t mergedCEs[Collation::MAX_EXPANSION_LENGTH]; |
+    for(UnicodeSetIterator iter(composites); iter.next();) { |
+        U_ASSERT(!iter.isString()); |
+        const UChar32 composite = iter.getCodepoint(); |
+        nfd.getDecomposition(composite, decomposition); |
+        if(mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomposition, |
+                                    mergedNFD, mergedComposed, errorCode)) { |
+            const int32_t mergedCEsLength = |
+                dataBuilder->getCEs(nfdPrefix, mergedNFD, mergedCEs, 0); |
+            // Mappings that we cannot store are ignored. |
+            if(mergedCEsLength <= Collation::MAX_EXPANSION_LENGTH) { |
+                // Note: It is possible that the merged CEs do not make use of the mapping |
+                // for which we are adding the tail composites, in which case we might be adding |
+                // unnecessary mappings. |
+                // For example, when we add tail composites for ae^ (^=combining circumflex), |
+                // UCA discontiguous-contraction matching does not find any matches |
+                // for ae_^ (_=any combining diacritic below) *unless* there is also |
+                // a contraction mapping for ae. |
+                // Thus, if there is no ae contraction, then the ae^ mapping is ignored |
+                // while fetching the CEs for ae_^. |
+                // TODO: Try to detect this effectively. |
+                // (Alternatively, print a warning when prefix contractions are missing.) |
+ |
+                // We do not need an explicit mapping for the NFD strings. |
+                // It is fine if the NFD input collates like this via a sequence of mappings. |
+                // It also saves a little bit of space, |
+                // and may reduce the set of characters with contractions. |
+                const uint32_t addedCE32 = addIfDifferent( |
+                    nfdPrefix, mergedComposed, |
+                    mergedCEs, mergedCEsLength, Collation::UNASSIGNED_CE32, errorCode); |
+                if(addedCE32 != Collation::UNASSIGNED_CE32) { |
+                    // was different, was added |
+                    addOnlyClosure(nfdPrefix, mergedNFD, mergedCEs, mergedCEsLength, |
+                                   addedCE32, errorCode); |
+                } |
+            } |
+        } |
+    } |
+} |
+ |
+/** |
+ * Tries to merge a composite's decomposition into the tail of nfdString, |
+ * that is, into its last starter and the combining marks that follow it. |
+ * |
+ * @param nfdString             the source string |
+ * @param indexAfterLastStarter index just after the last starter in nfdString; |
+ *                              that starter is asserted to equal the first code point of decomp |
+ * @param composite             the composite code point |
+ * @param decomp                the decomposition of the composite |
+ * @param newNFDString          output: the merged string with the decomposition's characters |
+ * @param newString             output: the merged string with the composite itself |
+ * @param errorCode             ICU in/out error code; FALSE is returned if it already indicates failure |
+ * @return TRUE if both output strings were built; FALSE if the decomposition is a single |
+ *         code point, if nothing new would result, or if the merge is not possible |
+ */ |
+UBool |
+CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, |
+ int32_t indexAfterLastStarter, |
+ UChar32 composite, const UnicodeString &decomp, |
+ UnicodeString &newNFDString, UnicodeString &newString, |
+ UErrorCode &errorCode) const { |
+ if(U_FAILURE(errorCode)) { return FALSE; } |
+ U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)); |
+ // Length in code units of the first code point of the decomposition. |
+ int32_t lastStarterLength = decomp.moveIndex32(0, 1); |
+ if(lastStarterLength == decomp.length()) { |
+ // Singleton decompositions should be found by addWithClosure() |
+ // and the CanonicalIterator, so we can ignore them here. |
+ return FALSE; |
+ } |
+ // Compare what follows the last starter in nfdString with what follows the starter in decomp. |
+ if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, |
+ decomp, lastStarterLength, 0x7fffffff) == 0) { |
+ // same strings, nothing new to be found here |
+ return FALSE; |
+ } |
+ |
+ // Make new FCD strings that combine a composite, or its decomposition, |
+ // into the nfdString's last starter and the combining marks following it. |
+ // Make an NFD version, and a version with the composite. |
+ newNFDString.setTo(nfdString, 0, indexAfterLastStarter); |
+ newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite); |
+ |
+ // The following is related to discontiguous contraction matching, |
+ // but builds only FCD strings (or else returns FALSE). |
+ int32_t sourceIndex = indexAfterLastStarter; |
+ int32_t decompIndex = lastStarterLength; |
+ // Small optimization: We keep the source character across loop iterations |
+ // because we do not always consume it, |
+ // and then need not fetch it again nor look up its combining class again. |
+ UChar32 sourceChar = U_SENTINEL; |
+ // The cc variables need to be declared before the loop so that at the end |
+ // they are set to the last combining classes seen. |
+ uint8_t sourceCC = 0; |
+ uint8_t decompCC = 0; |
+ for(;;) { |
+ // A negative sourceChar means that the previous one was consumed: fetch the next one. |
+ if(sourceChar < 0) { |
+ if(sourceIndex >= nfdString.length()) { break; } |
+ sourceChar = nfdString.char32At(sourceIndex); |
+ sourceCC = nfd.getCombiningClass(sourceChar); |
+ U_ASSERT(sourceCC != 0); |
+ } |
+ // We consume a decomposition character in each iteration. |
+ if(decompIndex >= decomp.length()) { break; } |
+ UChar32 decompChar = decomp.char32At(decompIndex); |
+ decompCC = nfd.getCombiningClass(decompChar); |
+ // Compare the two characters and their combining classes. |
+ if(decompCC == 0) { |
+ // Unable to merge because the source contains a non-zero combining mark |
+ // but the composite's decomposition contains another starter. |
+ // The strings would not be equivalent. |
+ return FALSE; |
+ } else if(sourceCC < decompCC) { |
+ // Composite + sourceChar would not be FCD. |
+ return FALSE; |
+ } else if(decompCC < sourceCC) { |
+ // The decomposition character sorts before the pending source character: |
+ // append it to the NFD string only and keep the source character for later. |
+ newNFDString.append(decompChar); |
+ decompIndex += U16_LENGTH(decompChar); |
+ } else if(decompChar != sourceChar) { |
+ // Blocked because same combining class. |
+ return FALSE; |
+ } else { // match: decompChar == sourceChar |
+ newNFDString.append(decompChar); |
+ decompIndex += U16_LENGTH(decompChar); |
+ sourceIndex += U16_LENGTH(decompChar); |
+ sourceChar = U_SENTINEL; |
+ } |
+ } |
+ // We are at the end of at least one of the two inputs. |
+ if(sourceChar >= 0) { // more characters from nfdString but not from decomp |
+ if(sourceCC < decompCC) { |
+ // Appending the next source character to the composite would not be FCD. |
+ return FALSE; |
+ } |
+ // The remaining source characters go into both output strings. |
+ newNFDString.append(nfdString, sourceIndex, 0x7fffffff); |
+ newString.append(nfdString, sourceIndex, 0x7fffffff); |
+ } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString |
+ // The rest of the decomposition is already covered by the composite in newString. |
+ newNFDString.append(decomp, decompIndex, 0x7fffffff); |
+ } |
+ U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); |
+ U_ASSERT(fcd.isNormalized(newString, errorCode)); |
+ U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent |
+ return TRUE; |
+} |
+ |
+/** |
+ * Tells whether a prefix must be skipped: non-FCD prefixes are not mapped. |
+ * |
+ * @param s         the prefix to test |
+ * @param errorCode ICU in/out error code, passed on to isFCD() |
+ * @return TRUE if s is not FCD (which includes the case of a failed errorCode) |
+ */ |
+UBool |
+CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { |
+    const UBool prefixIsFCD = isFCD(s, errorCode); |
+    return !prefixIsFCD; |
+} |
+ |
+/** |
+ * Tells whether a string must be skipped when adding mappings. |
+ * |
+ * @param s         the string to test |
+ * @param errorCode ICU in/out error code, passed on to isFCD() |
+ * @return TRUE if s is not FCD, or if it starts with a Hangul syllable |
+ */ |
+UBool |
+CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { |
+    // Do not map non-FCD strings. |
+    if(!isFCD(s, errorCode)) { return TRUE; } |
+    // Do not map strings that start with Hangul syllables: We decompose those on the fly. |
+    return Hangul::isHangul(s.charAt(0)); |
+} |
+ |
+/** |
+ * Tests a string with the fcd normalizer. |
+ * |
+ * @param s         the string to test |
+ * @param errorCode ICU in/out error code |
+ * @return FALSE if errorCode already indicates failure, |
+ *         otherwise the result of fcd.isNormalized() |
+ */ |
+UBool |
+CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { |
+    if(U_FAILURE(errorCode)) { return FALSE; } |
+    return fcd.isNormalized(s, errorCode); |
+} |
+ |
+/** |
+ * For every code point in [:NFD_QC=N:] except the Hangul syllables, |
+ * fetches the CEs of its decomposition and adds a mapping for the composite |
+ * if those CEs differ from what the composite currently maps to. |
+ * Uses the ces/cesLength members as scratch space. |
+ * |
+ * @param errorCode ICU in/out error code |
+ */ |
+void |
+CollationBuilder::closeOverComposites(UErrorCode &errorCode) { |
+    UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final |
+    if(U_FAILURE(errorCode)) { return; } |
+    // Hangul is decomposed on the fly during collation. |
+    composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); |
+ |
+    const UnicodeString noPrefix;  // empty |
+    UnicodeString decomposition; |
+    for(UnicodeSetIterator iter(composites); iter.next();) { |
+        U_ASSERT(!iter.isString()); |
+        nfd.getDecomposition(iter.getCodepoint(), decomposition); |
+        cesLength = dataBuilder->getCEs(decomposition, ces, 0); |
+        // Too many CEs from the decomposition (unusual): ignore this composite. |
+        // We could add a capacity parameter to getCEs() and reallocate if necessary. |
+        // However, this can only really happen in contrived cases. |
+        if(cesLength <= Collation::MAX_EXPANSION_LENGTH) { |
+            addIfDifferent(noPrefix, iter.getString(), ces, cesLength, |
+                           Collation::UNASSIGNED_CE32, errorCode); |
+        } |
+    } |
+} |
+ |
+/** |
+ * Adds a mapping from prefix+str to the new CEs, |
+ * but only if the data builder currently yields different CEs for that input. |
+ * |
+ * @param prefix       the prefix |
+ * @param str          the string |
+ * @param newCEs       the CEs to map to |
+ * @param newCEsLength number of CEs in newCEs |
+ * @param ce32         a CE32 for the new CEs, or Collation::UNASSIGNED_CE32 if they |
+ *                     have not been encoded yet |
+ * @param errorCode    ICU in/out error code; ce32 is returned unchanged if it already indicates failure |
+ * @return ce32, replaced by the freshly encoded value if it was unassigned and a mapping was added |
+ */ |
+uint32_t |
+CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str, |
+                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, |
+                                 UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return ce32; } |
+ |
+    int64_t currentCEs[Collation::MAX_EXPANSION_LENGTH]; |
+    const int32_t currentCEsLength = dataBuilder->getCEs(prefix, str, currentCEs, 0); |
+    // Nothing to do if the input already maps to the same CEs. |
+    if(sameCEs(newCEs, newCEsLength, currentCEs, currentCEsLength)) { return ce32; } |
+ |
+    // Encode the CEs only once; later calls reuse the returned CE32. |
+    if(ce32 == Collation::UNASSIGNED_CE32) { |
+        ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); |
+    } |
+    dataBuilder->addCE32(prefix, str, ce32, errorCode); |
+    return ce32; |
+} |
+ |
+/** |
+ * Compares two CE arrays. |
+ * |
+ * @param ces1       first array |
+ * @param ces1Length number of CEs in ces1 |
+ * @param ces2       second array |
+ * @param ces2Length number of CEs in ces2 |
+ * @return TRUE if both arrays have the same length and the same CEs |
+ */ |
+UBool |
+CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, |
+                          const int64_t ces2[], int32_t ces2Length) { |
+    if(ces1Length != ces2Length) { return FALSE; } |
+    U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); |
+    // Advance over the common leading part; equal arrays are consumed completely. |
+    int32_t i = 0; |
+    while(i < ces1Length && ces1[i] == ces2[i]) { ++i; } |
+    return i >= ces1Length; |
+} |
+ |
+#ifdef DEBUG_COLLATION_BUILDER |
+ |
+/** |
+ * Debug helper: shifts a weight right by whole bytes until its lowest byte is non-zero. |
+ * |
+ * @param w the weight |
+ * @return w without its trailing zero bytes; 0 stays 0 |
+ */ |
+uint32_t |
+alignWeightRight(uint32_t w) { |
+    if(w == 0) { return 0; } |
+    for(; (w & 0xff) == 0; w >>= 8) {} |
+    return w; |
+} |
+ |
+#endif |
+ |
+/** |
+ * Walks the node list of every root primary and replaces each tailored node |
+ * in the nodes array with a final CE built by Collation::makeCE(p, s, t, q). |
+ * |
+ * While walking a list, p/s/t/q hold the weights of the most recent node per level. |
+ * For the first tailored node in a run on a level, a weight range is allocated via |
+ * CollationWeights between the current weight and a limit; the following tailored |
+ * nodes of that run take the next weights from that range. |
+ * Quaternary nodes just count q up, to at most 3. |
+ * |
+ * On a gap that is too small, errorCode is set to U_BUFFER_OVERFLOW_ERROR, |
+ * errorReason is set to a static message, and the function returns early. |
+ * |
+ * @param errorCode ICU in/out error code; nothing is done if it already indicates failure |
+ */ |
+void |
+CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return; } |
+ |
+ CollationWeights primaries, secondaries, tertiaries; |
+ int64_t *nodesArray = nodes.getBuffer(); |
+ |
+ // One iteration per root primary: each one starts its own linked list of nodes. |
+ for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { |
+ int32_t i = rootPrimaryIndexes.elementAti(rpi); |
+ int64_t node = nodesArray[i]; |
+ uint32_t p = weight32FromNode(node); |
+ // A zero primary starts with zero lower-level weights, otherwise with common weights. |
+ uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; |
+ uint32_t t = s; |
+ uint32_t q = 0; |
+ // Whether the current weight on each level came from a tailored node. |
+ UBool pIsTailored = FALSE; |
+ UBool sIsTailored = FALSE; |
+ UBool tIsTailored = FALSE; |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf("\nprimary %lx\n", (long)alignWeightRight(p)); |
+#endif |
+ int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); |
+ int32_t nextIndex = nextIndexFromNode(node); |
+ // Follow the list until a node has no successor (next index 0). |
+ while(nextIndex != 0) { |
+ i = nextIndex; |
+ node = nodesArray[i]; |
+ nextIndex = nextIndexFromNode(node); |
+ int32_t strength = strengthFromNode(node); |
+ if(strength == UCOL_QUATERNARY) { |
+ U_ASSERT(isTailoredNode(node)); |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf(" quat+ "); |
+#endif |
+ // q cannot go beyond 3. |
+ if(q == 3) { |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; |
+ errorReason = "quaternary tailoring gap too small"; |
+ return; |
+ } |
+ ++q; |
+ } else { |
+ if(strength == UCOL_TERTIARY) { |
+ if(isTailoredNode(node)) { |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf(" ter+ "); |
+#endif |
+ if(!tIsTailored) { |
+ // First tailored tertiary node for [p, s]. |
+ // Count this node plus the tailored tertiary nodes that follow it. |
+ int32_t tCount = countTailoredNodes(nodesArray, nextIndex, |
+ UCOL_TERTIARY) + 1; |
+ uint32_t tLimit; |
+ if(t == 0) { |
+ // Gap at the beginning of the tertiary CE range. |
+ t = rootElements.getTertiaryBoundary() - 0x100; |
+ tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; |
+ } else if(t == BEFORE_WEIGHT16) { |
+ tLimit = Collation::COMMON_WEIGHT16; |
+ } else if(!pIsTailored && !sIsTailored) { |
+ // p and s are root weights. |
+ tLimit = rootElements.getTertiaryAfter(pIndex, s, t); |
+ } else { |
+ // [p, s] is tailored. |
+ U_ASSERT(t == Collation::COMMON_WEIGHT16); |
+ tLimit = rootElements.getTertiaryBoundary(); |
+ } |
+ U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0); |
+ tertiaries.initForTertiary(); |
+ if(!tertiaries.allocWeights(t, tLimit, tCount)) { |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; |
+ errorReason = "tertiary tailoring gap too small"; |
+ return; |
+ } |
+ tIsTailored = TRUE; |
+ } |
+ t = tertiaries.nextWeight(); |
+ U_ASSERT(t != 0xffffffff); |
+ } else { |
+ // Root tertiary node: take its weight as the new current tertiary. |
+ t = weight16FromNode(node); |
+ tIsTailored = FALSE; |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf(" ter %lx\n", (long)alignWeightRight(t)); |
+#endif |
+ } |
+ } else { |
+ if(strength == UCOL_SECONDARY) { |
+ if(isTailoredNode(node)) { |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf(" sec+ "); |
+#endif |
+ if(!sIsTailored) { |
+ // First tailored secondary node for p. |
+ // Count this node plus the tailored secondary nodes that follow it. |
+ int32_t sCount = countTailoredNodes(nodesArray, nextIndex, |
+ UCOL_SECONDARY) + 1; |
+ uint32_t sLimit; |
+ if(s == 0) { |
+ // Gap at the beginning of the secondary CE range. |
+ s = rootElements.getSecondaryBoundary() - 0x100; |
+ sLimit = rootElements.getFirstSecondaryCE() >> 16; |
+ } else if(s == BEFORE_WEIGHT16) { |
+ sLimit = Collation::COMMON_WEIGHT16; |
+ } else if(!pIsTailored) { |
+ // p is a root primary. |
+ sLimit = rootElements.getSecondaryAfter(pIndex, s); |
+ } else { |
+ // p is a tailored primary. |
+ U_ASSERT(s == Collation::COMMON_WEIGHT16); |
+ sLimit = rootElements.getSecondaryBoundary(); |
+ } |
+ if(s == Collation::COMMON_WEIGHT16) { |
+ // Do not tailor into the getSortKey() range of |
+ // compressed common secondaries. |
+ s = rootElements.getLastCommonSecondary(); |
+ } |
+ secondaries.initForSecondary(); |
+ if(!secondaries.allocWeights(s, sLimit, sCount)) { |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; |
+ errorReason = "secondary tailoring gap too small"; |
+ return; |
+ } |
+ sIsTailored = TRUE; |
+ } |
+ s = secondaries.nextWeight(); |
+ U_ASSERT(s != 0xffffffff); |
+ } else { |
+ // Root secondary node: take its weight as the new current secondary. |
+ s = weight16FromNode(node); |
+ sIsTailored = FALSE; |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf(" sec %lx\n", (long)alignWeightRight(s)); |
+#endif |
+ } |
+ } else /* UCOL_PRIMARY */ { |
+ U_ASSERT(isTailoredNode(node)); |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf("pri+ "); |
+#endif |
+ if(!pIsTailored) { |
+ // First tailored primary node in this list. |
+ // Count this node plus the tailored primary nodes that follow it. |
+ int32_t pCount = countTailoredNodes(nodesArray, nextIndex, |
+ UCOL_PRIMARY) + 1; |
+ UBool isCompressible = baseData->isCompressiblePrimary(p); |
+ uint32_t pLimit = |
+ rootElements.getPrimaryAfter(p, pIndex, isCompressible); |
+ primaries.initForPrimary(isCompressible); |
+ if(!primaries.allocWeights(p, pLimit, pCount)) { |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode? |
+ errorReason = "primary tailoring gap too small"; |
+ return; |
+ } |
+ pIsTailored = TRUE; |
+ } |
+ p = primaries.nextWeight(); |
+ U_ASSERT(p != 0xffffffff); |
+ // A new primary resets the secondary to the common weight. |
+ s = Collation::COMMON_WEIGHT16; |
+ sIsTailored = FALSE; |
+ } |
+ // A new primary or secondary resets the tertiary. |
+ t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; |
+ tIsTailored = FALSE; |
+ } |
+ // Any node stronger than quaternary resets the quaternary counter. |
+ q = 0; |
+ } |
+ // Only tailored nodes are overwritten with their final CE; root nodes stay as they are. |
+ if(isTailoredNode(node)) { |
+ nodesArray[i] = Collation::makeCE(p, s, t, q); |
+#ifdef DEBUG_COLLATION_BUILDER |
+ printf("%016llx\n", (long long)nodesArray[i]); |
+#endif |
+ } |
+ } |
+ } |
+} |
+ |
+/** |
+ * Counts the tailored nodes of the given strength, starting at nodesArray[i] |
+ * and following the next-index links. |
+ * Counting stops at the end of the list (index 0), at a stronger node, |
+ * or at a non-tailored node of the same strength. Weaker nodes are skipped. |
+ * |
+ * @param nodesArray the node array |
+ * @param i          index of the first node to look at; 0 yields a count of 0 |
+ * @param strength   the strength to count |
+ * @return the number of tailored nodes found |
+ */ |
+int32_t |
+CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { |
+    int32_t tailoredCount = 0; |
+    while(i != 0) { |
+        const int64_t node = nodesArray[i]; |
+        const int32_t nodeStrength = strengthFromNode(node); |
+        if(nodeStrength < strength) { break; } |
+        if(nodeStrength == strength) { |
+            if(!isTailoredNode(node)) { break; } |
+            ++tailoredCount; |
+        } |
+        i = nextIndexFromNode(node); |
+    } |
+    return tailoredCount; |
+} |
+ |
+/** |
+ * CE modifier that replaces temporary CEs and CE32s with the final CEs |
+ * from the given array while keeping their case bits. |
+ * Values that are not temporary are answered with Collation::NO_CE. |
+ */ |
+class CEFinalizer : public CollationDataBuilder::CEModifier { |
+public: |
+    /** @param ces array of final CEs, indexed by the index stored in a temporary CE/CE32; not owned */ |
+    CEFinalizer(const int64_t *ces) : finalCEs(ces) {} |
+    virtual ~CEFinalizer(); |
+ |
+    /** Returns the final CE for a temporary CE32, or Collation::NO_CE for any other CE32. */ |
+    virtual int64_t modifyCE32(uint32_t ce32) const { |
+        U_ASSERT(!Collation::isSpecialCE32(ce32)); |
+        if(!CollationBuilder::isTempCE32(ce32)) { return Collation::NO_CE; } |
+        const int64_t finalCE = finalCEs[CollationBuilder::indexFromTempCE32(ce32)]; |
+        // retain case bits: bits 7..6 of the CE32 move to bits 15..14 of the CE |
+        return finalCE | ((ce32 & 0xc0) << 8); |
+    } |
+ |
+    /** Returns the final CE for a temporary CE, or Collation::NO_CE for any other CE. */ |
+    virtual int64_t modifyCE(int64_t ce) const { |
+        if(!CollationBuilder::isTempCE(ce)) { return Collation::NO_CE; } |
+        const int64_t finalCE = finalCEs[CollationBuilder::indexFromTempCE(ce)]; |
+        // retain case bits |
+        return finalCE | (ce & 0xc000); |
+    } |
+ |
+private: |
+    const int64_t *finalCEs; |
+}; |
+ |
+CEFinalizer::~CEFinalizer() {} |
+ |
+/** |
+ * Replaces dataBuilder with a new CollationDataBuilder that holds a copy |
+ * of the old data, made through a CEFinalizer over the nodes array. |
+ * The old builder is deleted only if the copy succeeded. |
+ * |
+ * @param errorCode ICU in/out error code; set to U_MEMORY_ALLOCATION_ERROR |
+ *                  if the new builder cannot be allocated |
+ */ |
+void |
+CollationBuilder::finalizeCEs(UErrorCode &errorCode) { |
+    if(U_FAILURE(errorCode)) { return; } |
+    LocalPointer<CollationDataBuilder> finalBuilder(new CollationDataBuilder(errorCode)); |
+    if(finalBuilder.isNull()) { |
+        errorCode = U_MEMORY_ALLOCATION_ERROR; |
+        return; |
+    } |
+    CEFinalizer finalizer(nodes.getBuffer()); |
+    finalBuilder->initForTailoring(baseData, errorCode); |
+    finalBuilder->copyFrom(*dataBuilder, finalizer, errorCode); |
+    // Swap in the new builder only after a successful copy; |
+    // otherwise the LocalPointer deletes it and dataBuilder stays as it was. |
+    if(U_SUCCESS(errorCode)) { |
+        delete dataBuilder; |
+        dataBuilder = finalBuilder.orphan(); |
+    } |
+} |
+ |
+/** |
+ * Determines the strength of a CE. |
+ * |
+ * @param ce a temporary or regular CE |
+ * @return the strength stored in a temporary CE; otherwise UCOL_PRIMARY if the top byte |
+ *         is non-zero, UCOL_SECONDARY if the top byte of the lower 32 bits is non-zero, |
+ *         UCOL_TERTIARY if anything else is non-zero, and UCOL_IDENTICAL for 0 |
+ */ |
+int32_t |
+CollationBuilder::ceStrength(int64_t ce) { |
+    if(isTempCE(ce)) { return strengthFromTempCE(ce); } |
+    if((ce & INT64_C(0xff00000000000000)) != 0) { return UCOL_PRIMARY; } |
+    if(((uint32_t)ce & 0xff000000) != 0) { return UCOL_SECONDARY; } |
+    if(ce != 0) { return UCOL_TERTIARY; } |
+    return UCOL_IDENTICAL; |
+} |
+ |
+U_NAMESPACE_END |
+ |
+U_NAMESPACE_USE |
+ |
+/** |
+ * C API: builds a collator from a rule string. |
+ * |
+ * @param rules             the rule string; may be NULL only if rulesLength is 0 |
+ * @param rulesLength       length of rules; a negative value marks the string as NUL-terminated |
+ * @param normalizationMode passed on to internalBuildTailoring() as the decomposition mode |
+ * @param strength          passed on to internalBuildTailoring() |
+ * @param parseError        passed on to internalBuildTailoring() |
+ * @param pErrorCode        ICU in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for a NULL |
+ *                          rule string with non-zero length, or to U_MEMORY_ALLOCATION_ERROR |
+ * @return the new collator, or NULL on failure |
+ */ |
+U_CAPI UCollator * U_EXPORT2 |
+ucol_openRules(const UChar *rules, int32_t rulesLength, |
+               UColAttributeValue normalizationMode, UCollationStrength strength, |
+               UParseError *parseError, UErrorCode *pErrorCode) { |
+    if(U_FAILURE(*pErrorCode)) { return NULL; } |
+    if(rules == NULL && rulesLength != 0) { |
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+        return NULL; |
+    } |
+    // The LocalPointer deletes the collator on every failure path below. |
+    LocalPointer<RuleBasedCollator> collator(new RuleBasedCollator()); |
+    if(collator.isNull()) { |
+        *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
+        return NULL; |
+    } |
+    const UBool isTerminated = (UBool)(rulesLength < 0); |
+    UnicodeString ruleString(isTerminated, rules, rulesLength); |
+    collator->internalBuildTailoring(ruleString, strength, normalizationMode, |
+                                     parseError, NULL, *pErrorCode); |
+    if(U_FAILURE(*pErrorCode)) { return NULL; } |
+    return collator.orphan()->toUCollator(); |
+} |
+ |
+// Capacity, in UTF-16 code units, of the stack buffer that receives one contraction string. |
+static const int32_t internalBufferSize = 512; |
+ |
+// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp |
+// because it calls UnicodeSet "builder" code that depends on all Unicode properties, |
+// and the rest of the collation "runtime" code only depends on normalization. |
+// This function is not related to the collation builder, |
+// but it did not seem worth moving it into its own .cpp file, |
+// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. |
+/** |
+ * Fills a set with the code points that are "unsafe" for the collator: |
+ * the characters matching [[:^tccc=0:][:^lccc=0:]], all surrogates, |
+ * and every code point of a contraction string except the last one. |
+ * |
+ * @param coll   the collator whose contractions are examined |
+ * @param unsafe the set to fill; it is cleared first |
+ * @param status ICU in/out error code |
+ * @return the size of the unsafe set, as reported by uset_size() |
+ */ |
+U_CAPI int32_t U_EXPORT2 |
+ucol_getUnsafeSet( const UCollator *coll, |
+                  USet *unsafe, |
+                  UErrorCode *status) |
+{ |
+    UChar buffer[internalBufferSize]; |
+    int32_t len = 0; |
+ |
+    uset_clear(unsafe); |
+ |
+    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant |
+    static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, |
+                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; |
+ |
+    // add chars that fail the fcd check |
+    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); |
+ |
+    // add lead/trail surrogates |
+    // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*, |
+    // not when testing code *points*) |
+    uset_addRange(unsafe, 0xd800, 0xdfff); |
+ |
+    USet *contractions = uset_open(0,0); |
+    if(contractions == NULL) { |
+        // Without the scratch set the contraction characters cannot be collected. |
+        if(U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } |
+        return uset_size(unsafe); |
+    } |
+ |
+    int32_t i = 0, j = 0; |
+    ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); |
+    int32_t contsSize = uset_size(contractions); |
+    UChar32 c = 0; |
+    // Contraction set consists only of strings |
+    // to get unsafe code points, we need to |
+    // break the strings apart and add them to the unsafe set |
+    for(i = 0; i < contsSize; i++) { |
+        len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); |
+        // Once the status is a failure (for example a string that does not fit into the buffer), |
+        // the returned length must not be used for reading the buffer, |
+        // and no further item can be fetched with this status. |
+        if(U_FAILURE(*status)) { break; } |
+        if(len > 0) { |
+            j = 0; |
+            while(j < len) { |
+                U16_NEXT(buffer, j, len, c); |
+                // Every code point but the last one of the string is unsafe. |
+                if(j < len) { |
+                    uset_add(unsafe, c); |
+                } |
+            } |
+        } |
+    } |
+ |
+    uset_close(contractions); |
+ |
+    return uset_size(unsafe); |
+} |
+ |
+#endif // !UCONFIG_NO_COLLATION |