source/i18n/collationbuilder.cpp - Issue 845603002: Update ICU to 54.1 step 1

Unified Diff: source/i18n/collationbuilder.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: source/i18n/collationbuilder.cpp

diff --git a/source/i18n/collationbuilder.cpp b/source/i18n/collationbuilder.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..37f701ce775cc9a6cf6a8a842d35ed3381700ab6

--- /dev/null

+++ b/source/i18n/collationbuilder.cpp

@@ -0,0 +1,1673 @@

+/*

+*******************************************************************************

+* collationbuilder.cpp

+* (replaced the former ucol_bld.cpp)

+* created on: 2013may06

+* created by: Markus W. Scherer

+*/

+#ifdef DEBUG_COLLATION_BUILDER

+#include <stdio.h>

+#endif

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_COLLATION

+#include "unicode/caniter.h"

+#include "unicode/normalizer2.h"

+#include "unicode/tblcoll.h"

+#include "unicode/parseerr.h"

+#include "unicode/uchar.h"

+#include "unicode/ucol.h"

+#include "unicode/unistr.h"

+#include "unicode/usetiter.h"

+#include "unicode/utf16.h"

+#include "unicode/uversion.h"

+#include "cmemory.h"

+#include "collation.h"

+#include "collationbuilder.h"

+#include "collationdata.h"

+#include "collationdatabuilder.h"

+#include "collationfastlatin.h"

+#include "collationroot.h"

+#include "collationrootelements.h"

+#include "collationruleparser.h"

+#include "collationsettings.h"

+#include "collationtailoring.h"

+#include "collationweights.h"

+#include "normalizer2impl.h"

+#include "uassert.h"

+#include "ucol_imp.h"

+#include "utf16collationiterator.h"

+U_NAMESPACE_BEGIN

+namespace {

+class BundleImporter : public CollationRuleParser::Importer {

+public:

+ BundleImporter() {}

+ virtual ~BundleImporter();

+ virtual void getRules(

+ const char *localeID, const char *collationType,

+ UnicodeString &rules,

+ const char *&errorReason, UErrorCode &errorCode);

+};

+BundleImporter::~BundleImporter() {}

+void

+BundleImporter::getRules(

+ const char *localeID, const char *collationType,

+ UnicodeString &rules,

+ const char *& /*errorReason*/, UErrorCode &errorCode) {

+ CollationLoader::loadRules(localeID, collationType, rules, errorCode);

+} // namespace

+// RuleBasedCollator implementation ---------------------------------------- ***

+// These methods are here, rather than in rulebasedcollator.cpp,

+// for modularization:

+// Most code using Collator does not need to build a Collator from rules.

+// By moving these constructors and helper methods to a separate file,

+// most code will not have a static dependency on the builder code.

+RuleBasedCollator::RuleBasedCollator()

+ : data(NULL),

+ settings(NULL),

+ tailoring(NULL),

+ cacheEntry(NULL),

+ validLocale(""),

+ explicitlySetAttributes(0),

+ actualLocaleIsSameAsValid(FALSE) {

+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)

+ : data(NULL),

+ settings(NULL),

+ tailoring(NULL),

+ cacheEntry(NULL),

+ validLocale(""),

+ explicitlySetAttributes(0),

+ actualLocaleIsSameAsValid(FALSE) {

+ internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode);

+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,

+ UErrorCode &errorCode)

+ : data(NULL),

+ settings(NULL),

+ tailoring(NULL),

+ cacheEntry(NULL),

+ validLocale(""),

+ explicitlySetAttributes(0),

+ actualLocaleIsSameAsValid(FALSE) {

+ internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode);

+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,

+ UColAttributeValue decompositionMode,

+ UErrorCode &errorCode)

+ : data(NULL),

+ settings(NULL),

+ tailoring(NULL),

+ cacheEntry(NULL),

+ validLocale(""),

+ explicitlySetAttributes(0),

+ actualLocaleIsSameAsValid(FALSE) {

+ internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode);

+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,

+ ECollationStrength strength,

+ UColAttributeValue decompositionMode,

+ UErrorCode &errorCode)

+ : data(NULL),

+ settings(NULL),

+ tailoring(NULL),

+ cacheEntry(NULL),

+ validLocale(""),

+ explicitlySetAttributes(0),

+ actualLocaleIsSameAsValid(FALSE) {

+ internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode);

+RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,

+ UParseError &parseError, UnicodeString &reason,

+ UErrorCode &errorCode)

+ : data(NULL),

+ settings(NULL),

+ tailoring(NULL),

+ cacheEntry(NULL),

+ validLocale(""),

+ explicitlySetAttributes(0),

+ actualLocaleIsSameAsValid(FALSE) {

+ internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);

+void

+RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,

+ int32_t strength,

+ UColAttributeValue decompositionMode,

+ UParseError *outParseError, UnicodeString *outReason,

+ UErrorCode &errorCode) {

+ const CollationTailoring *base = CollationRoot::getRoot(errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ if(outReason != NULL) { outReason->remove(); }

+ CollationBuilder builder(base, errorCode);

+ UVersionInfo noVersion = { 0, 0, 0, 0 };

+ BundleImporter importer;

+ LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,

+ &importer,

+ outParseError, errorCode));

+ if(U_FAILURE(errorCode)) {

+ const char *reason = builder.getErrorReason();

+ if(reason != NULL && outReason != NULL) {

+ *outReason = UnicodeString(reason, -1, US_INV);

+ }

+ return;

+ }

+ t->actualLocale.setToBogus();

+ adoptTailoring(t.orphan(), errorCode);

+ // Set attributes after building the collator,

+ // to keep the default settings consistent with the rule string.

+ if(strength != UCOL_DEFAULT) {

+ setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);

+ }

+ if(decompositionMode != UCOL_DEFAULT) {

+ setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);

+ }

+// CollationBuilder implementation ----------------------------------------- ***

+CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)

+ : nfd(*Normalizer2::getNFDInstance(errorCode)),

+ fcd(*Normalizer2Factory::getFCDInstance(errorCode)),

+ nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),

+ base(b),

+ baseData(b->data),

+ rootElements(b->data->rootElements, b->data->rootElementsLength),

+ variableTop(0),

+ dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),

+ errorReason(NULL),

+ cesLength(0),

+ rootPrimaryIndexes(errorCode), nodes(errorCode) {

+ nfcImpl.ensureCanonIterData(errorCode);

+ if(U_FAILURE(errorCode)) {

+ errorReason = "CollationBuilder fields initialization failed";

+ return;

+ }

+ if(dataBuilder == NULL) {

+ errorCode = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ dataBuilder->initForTailoring(baseData, errorCode);

+ if(U_FAILURE(errorCode)) {

+ errorReason = "CollationBuilder initialization failed";

+ }

+CollationBuilder::~CollationBuilder() {

+ delete dataBuilder;

+CollationTailoring *

+CollationBuilder::parseAndBuild(const UnicodeString &ruleString,

+ const UVersionInfo rulesVersion,

+ CollationRuleParser::Importer *importer,

+ UParseError *outParseError,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return NULL; }

+ if(baseData->rootElements == NULL) {

+ errorCode = U_MISSING_RESOURCE_ERROR;

+ errorReason = "missing root elements data, tailoring not supported";

+ return NULL;

+ }

+ LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));

+ if(tailoring.isNull() || tailoring->isBogus()) {

+ errorCode = U_MEMORY_ALLOCATION_ERROR;

+ return NULL;

+ }

+ CollationRuleParser parser(baseData, errorCode);

+ if(U_FAILURE(errorCode)) { return NULL; }

+ // Note: This always bases &[last variable] and &[first regular]

+ // on the root collator's maxVariable/variableTop.

+ // If we wanted this to change after [maxVariable x], then we would keep

+ // the tailoring.settings pointer here and read its variableTop when we need it.

+ // See http://unicode.org/cldr/trac/ticket/6070

+ variableTop = base->settings->variableTop;

+ parser.setSink(this);

+ parser.setImporter(importer);

+ CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);

+ parser.parse(ruleString, ownedSettings, outParseError, errorCode);

+ errorReason = parser.getErrorReason();

+ if(U_FAILURE(errorCode)) { return NULL; }

+ if(dataBuilder->hasMappings()) {

+ makeTailoredCEs(errorCode);

+ closeOverComposites(errorCode);

+ finalizeCEs(errorCode);

+ // Copy all of ASCII, and Latin-1 letters, into each tailoring.

+ optimizeSet.add(0, 0x7f);

+ optimizeSet.add(0xc0, 0xff);

+ // Hangul is decomposed on the fly during collation,

+ // and the tailoring data is always built with HANGUL_TAG specials.

+ optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);

+ dataBuilder->optimize(optimizeSet, errorCode);

+ tailoring->ensureOwnedData(errorCode);

+ if(U_FAILURE(errorCode)) { return NULL; }

+ if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }

+ dataBuilder->build(*tailoring->ownedData, errorCode);

+ tailoring->builder = dataBuilder;

+ dataBuilder = NULL;

+ } else {

+ tailoring->data = baseData;

+ }

+ if(U_FAILURE(errorCode)) { return NULL; }

+ ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(

+ tailoring->data, ownedSettings,

+ ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));

+ tailoring->rules = ruleString;

+ tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination

+ tailoring->setVersion(base->version, rulesVersion);

+ return tailoring.orphan();

+void

+CollationBuilder::addReset(int32_t strength, const UnicodeString &str,

+ const char *&parserErrorReason, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ U_ASSERT(!str.isEmpty());

+ if(str.charAt(0) == CollationRuleParser::POS_LEAD) {

+ ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);

+ cesLength = 1;

+ if(U_FAILURE(errorCode)) { return; }

+ U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);

+ } else {

+ // normal reset to a character or string

+ UnicodeString nfdString = nfd.normalize(str, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "normalizing the reset position";

+ return;

+ }

+ cesLength = dataBuilder->getCEs(nfdString, ces, 0);

+ if(cesLength > Collation::MAX_EXPANSION_LENGTH) {

+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;

+ parserErrorReason = "reset position maps to too many collation elements (more than 31)";

+ return;

+ }

+ if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position

+ // &[before strength]position

+ U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);

+ int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ int64_t node = nodes.elementAti(index);

+ // If the index is for a "weaker" tailored node,

+ // then skip backwards over this and further "weaker" nodes.

+ while(strengthFromNode(node) > strength) {

+ index = previousIndexFromNode(node);

+ node = nodes.elementAti(index);

+ }

+ // Find or insert a node whose index we will put into a temporary CE.

+ if(strengthFromNode(node) == strength && isTailoredNode(node)) {

+ // Reset to just before this same-strength tailored node.

+ index = previousIndexFromNode(node);

+ } else if(strength == UCOL_PRIMARY) {

+ // root primary node (has no previous index)

+ uint32_t p = weight32FromNode(node);

+ if(p == 0) {

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "reset primary-before ignorable not possible";

+ return;

+ }

+ if(p <= rootElements.getFirstPrimary()) {

+ // There is no primary gap between ignorables and the space-first-primary.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "reset primary-before first non-ignorable not supported";

+ return;

+ }

+ if(p == Collation::FIRST_TRAILING_PRIMARY) {

+ // We do not support tailoring to an unassigned-implicit CE.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "reset primary-before [first trailing] not supported";

+ return;

+ }

+ p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));

+ index = findOrInsertNodeForPrimary(p, errorCode);

+ // Go to the last node in this list:

+ // Tailor after the last node between adjacent root nodes.

+ for(;;) {

+ node = nodes.elementAti(index);

+ int32_t nextIndex = nextIndexFromNode(node);

+ if(nextIndex == 0) { break; }

+ index = nextIndex;

+ }

+ } else {

+ // &[before 2] or &[before 3]

+ index = findCommonNode(index, UCOL_SECONDARY);

+ if(strength >= UCOL_TERTIARY) {

+ index = findCommonNode(index, UCOL_TERTIARY);

+ }

+ node = nodes.elementAti(index);

+ if(strengthFromNode(node) == strength) {

+ // Found a same-strength node with an explicit weight.

+ uint32_t weight16 = weight16FromNode(node);

+ if(weight16 == 0) {

+ errorCode = U_UNSUPPORTED_ERROR;

+ if(strength == UCOL_SECONDARY) {

+ parserErrorReason = "reset secondary-before secondary ignorable not possible";

+ } else {

+ parserErrorReason = "reset tertiary-before completely ignorable not possible";

+ }

+ return;

+ }

+ U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16);

+ int32_t previousIndex = previousIndexFromNode(node);

+ if(weight16 == Collation::COMMON_WEIGHT16) {

+ // Reset to just before this same-strength common-weight node.

+ index = previousIndex;

+ } else {

+ // A non-common weight is only possible from a root CE.

+ // Find the higher-level weights, which must all be explicit,

+ // and then find the preceding weight for this level.

+ uint32_t previousWeight16 = 0;

+ int32_t previousWeightIndex = -1;

+ int32_t i = index;

+ if(strength == UCOL_SECONDARY) {

+ uint32_t p;

+ do {

+ i = previousIndexFromNode(node);

+ node = nodes.elementAti(i);

+ if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) &&

+ previousWeightIndex < 0) {

+ previousWeightIndex = i;

+ previousWeight16 = weight16FromNode(node);

+ }

+ } while(strengthFromNode(node) > UCOL_PRIMARY);

+ U_ASSERT(!isTailoredNode(node));

+ p = weight32FromNode(node);

+ weight16 = rootElements.getSecondaryBefore(p, weight16);

+ } else {

+ uint32_t p, s;

+ do {

+ i = previousIndexFromNode(node);

+ node = nodes.elementAti(i);

+ if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) &&

+ previousWeightIndex < 0) {

+ previousWeightIndex = i;

+ previousWeight16 = weight16FromNode(node);

+ }

+ } while(strengthFromNode(node) > UCOL_SECONDARY);

+ U_ASSERT(!isTailoredNode(node));

+ if(strengthFromNode(node) == UCOL_SECONDARY) {

+ s = weight16FromNode(node);

+ do {

+ i = previousIndexFromNode(node);

+ node = nodes.elementAti(i);

+ } while(strengthFromNode(node) > UCOL_PRIMARY);

+ U_ASSERT(!isTailoredNode(node));

+ } else {

+ U_ASSERT(!nodeHasBefore2(node));

+ s = Collation::COMMON_WEIGHT16;

+ }

+ p = weight32FromNode(node);

+ weight16 = rootElements.getTertiaryBefore(p, s, weight16);

+ U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);

+ }

+ // Find or insert the new explicit weight before the current one.

+ if(previousWeightIndex >= 0 && weight16 == previousWeight16) {

+ // Tailor after the last node between adjacent root nodes.

+ index = previousIndex;

+ } else {

+ node = nodeFromWeight16(weight16) | nodeFromStrength(strength);

+ index = insertNodeBetween(previousIndex, index, node, errorCode);

+ }

+ } else {

+ // Found a stronger node with implied strength-common weight.

+ int64_t hasBefore3 = 0;

+ if(strength == UCOL_SECONDARY) {

+ U_ASSERT(!nodeHasBefore2(node));

+ // Move the HAS_BEFORE3 flag from the parent node

+ // to the new secondary common node.

+ hasBefore3 = node & HAS_BEFORE3;

+ node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2;

+ } else {

+ U_ASSERT(!nodeHasBefore3(node));

+ node |= HAS_BEFORE3;

+ }

+ nodes.setElementAt(node, index);

+ int32_t nextIndex = nextIndexFromNode(node);

+ // Insert default nodes with weights 02 and 05, reset to the 02 node.

+ node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength);

+ index = insertNodeBetween(index, nextIndex, node, errorCode);

+ node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |

+ nodeFromStrength(strength);

+ insertNodeBetween(index, nextIndex, node, errorCode);

+ }

+ // Strength of the temporary CE = strength of its reset position.

+ // Code above raises an error if the before-strength is stronger.

+ strength = ceStrength(ces[cesLength - 1]);

+ }

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "inserting reset position for &[before n]";

+ return;

+ }

+ ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);

+int64_t

+CollationBuilder::getSpecialResetPosition(const UnicodeString &str,

+ const char *&parserErrorReason, UErrorCode &errorCode) {

+ U_ASSERT(str.length() == 2);

+ int64_t ce;

+ int32_t strength = UCOL_PRIMARY;

+ UBool isBoundary = FALSE;

+ UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;

+ U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);

+ switch(pos) {

+ case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:

+ // Quaternary CEs are not supported.

+ // Non-zero quaternary weights are possible only on tertiary or stronger CEs.

+ return 0;

+ case CollationRuleParser::LAST_TERTIARY_IGNORABLE:

+ return 0;

+ case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {

+ // Look for a tailored tertiary node after [0, 0, 0].

+ int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);

+ if(U_FAILURE(errorCode)) { return 0; }

+ int64_t node = nodes.elementAti(index);

+ if((index = nextIndexFromNode(node)) != 0) {

+ node = nodes.elementAti(index);

+ U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);

+ if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {

+ return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);

+ }

+ return rootElements.getFirstTertiaryCE();

+ // No need to look for nodeHasAnyBefore() on a tertiary node.

+ }

+ case CollationRuleParser::LAST_SECONDARY_IGNORABLE:

+ ce = rootElements.getLastTertiaryCE();

+ strength = UCOL_TERTIARY;

+ break;

+ case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {

+ // Look for a tailored secondary node after [0, 0, *].

+ int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);

+ if(U_FAILURE(errorCode)) { return 0; }

+ int64_t node = nodes.elementAti(index);

+ while((index = nextIndexFromNode(node)) != 0) {

+ node = nodes.elementAti(index);

+ strength = strengthFromNode(node);

+ if(strength < UCOL_SECONDARY) { break; }

+ if(strength == UCOL_SECONDARY) {

+ if(isTailoredNode(node)) {

+ if(nodeHasBefore3(node)) {

+ index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));

+ U_ASSERT(isTailoredNode(nodes.elementAti(index)));

+ }

+ return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);

+ } else {

+ break;

+ }

+ ce = rootElements.getFirstSecondaryCE();

+ strength = UCOL_SECONDARY;

+ break;

+ }

+ case CollationRuleParser::LAST_PRIMARY_IGNORABLE:

+ ce = rootElements.getLastSecondaryCE();

+ strength = UCOL_SECONDARY;

+ break;

+ case CollationRuleParser::FIRST_VARIABLE:

+ ce = rootElements.getFirstPrimaryCE();

+ isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary

+ break;

+ case CollationRuleParser::LAST_VARIABLE:

+ ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);

+ break;

+ case CollationRuleParser::FIRST_REGULAR:

+ ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);

+ isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary

+ break;

+ case CollationRuleParser::LAST_REGULAR:

+ // Use the Hani-first-primary rather than the actual last "regular" CE before it,

+ // for backward compatibility with behavior before the introduction of

+ // script-first-primary CEs in the root collator.

+ ce = rootElements.firstCEWithPrimaryAtLeast(

+ baseData->getFirstPrimaryForGroup(USCRIPT_HAN));

+ break;

+ case CollationRuleParser::FIRST_IMPLICIT:

+ ce = baseData->getSingleCE(0x4e00, errorCode);

+ break;

+ case CollationRuleParser::LAST_IMPLICIT:

+ // We do not support tailoring to an unassigned-implicit CE.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "reset to [last implicit] not supported";

+ return 0;

+ case CollationRuleParser::FIRST_TRAILING:

+ ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);

+ isBoundary = TRUE; // trailing first primary (there is no mapping for it)

+ break;

+ case CollationRuleParser::LAST_TRAILING:

+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;

+ parserErrorReason = "LDML forbids tailoring to U+FFFF";

+ return 0;

+ default:

+ U_ASSERT(FALSE);

+ return 0;

+ }

+ int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);

+ if(U_FAILURE(errorCode)) { return 0; }

+ int64_t node = nodes.elementAti(index);

+ if((pos & 1) == 0) {

+ // even pos = [first xyz]

+ if(!nodeHasAnyBefore(node) && isBoundary) {

+ // A <group> first primary boundary is artificially added to FractionalUCA.txt.

+ // It is reachable via its special contraction, but is not normally used.

+ // Find the first character tailored after the boundary CE,

+ // or the first real root CE after it.

+ if((index = nextIndexFromNode(node)) != 0) {

+ // If there is a following node, then it must be tailored

+ // because there are no root CEs with a boundary primary

+ // and non-common secondary/tertiary weights.

+ node = nodes.elementAti(index);

+ U_ASSERT(isTailoredNode(node));

+ ce = tempCEFromIndexAndStrength(index, strength);

+ } else {

+ U_ASSERT(strength == UCOL_PRIMARY);

+ uint32_t p = (uint32_t)(ce >> 32);

+ int32_t pIndex = rootElements.findPrimary(p);

+ UBool isCompressible = baseData->isCompressiblePrimary(p);

+ p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);

+ ce = Collation::makeCE(p);

+ index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);

+ if(U_FAILURE(errorCode)) { return 0; }

+ node = nodes.elementAti(index);

+ }

+ if(nodeHasAnyBefore(node)) {

+ // Get the first node that was tailored before this one at a weaker strength.

+ if(nodeHasBefore2(node)) {

+ index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));

+ node = nodes.elementAti(index);

+ }

+ if(nodeHasBefore3(node)) {

+ index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));

+ }

+ U_ASSERT(isTailoredNode(nodes.elementAti(index)));

+ ce = tempCEFromIndexAndStrength(index, strength);

+ }

+ } else {

+ // odd pos = [last xyz]

+ // Find the last node that was tailored after the [last xyz]

+ // at a strength no greater than the position's strength.

+ for(;;) {

+ int32_t nextIndex = nextIndexFromNode(node);

+ if(nextIndex == 0) { break; }

+ int64_t nextNode = nodes.elementAti(nextIndex);

+ if(strengthFromNode(nextNode) < strength) { break; }

+ index = nextIndex;

+ node = nextNode;

+ }

+ // Do not make a temporary CE for a root node.

+ // This last node might be the node for the root CE itself,

+ // or a node with a common secondary or tertiary weight.

+ if(isTailoredNode(node)) {

+ ce = tempCEFromIndexAndStrength(index, strength);

+ }

+ return ce;

+void

+CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,

+ const UnicodeString &str, const UnicodeString &extension,

+ const char *&parserErrorReason, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ UnicodeString nfdPrefix;

+ if(!prefix.isEmpty()) {

+ nfd.normalize(prefix, nfdPrefix, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "normalizing the relation prefix";

+ return;

+ }

+ UnicodeString nfdString = nfd.normalize(str, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "normalizing the relation string";

+ return;

+ }

+ // The runtime code decomposes Hangul syllables on the fly,

+ // with recursive processing but without making the Jamo pieces visible for matching.

+ // It does not work with certain types of contextual mappings.

+ int32_t nfdLength = nfdString.length();

+ if(nfdLength >= 2) {

+ UChar c = nfdString.charAt(0);

+ if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {

+ // While handling a Hangul syllable, contractions starting with Jamo L or V

+ // would not see the following Jamo of that syllable.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";

+ return;

+ }

+ c = nfdString.charAt(nfdLength - 1);

+ if(Hangul::isJamoL(c) ||

+ (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {

+ // A contraction ending with Jamo L or L+V would require

+ // generating Hangul syllables in addTailComposites() (588 for a Jamo L),

+ // or decomposing a following Hangul syllable on the fly, during contraction matching.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";

+ return;

+ }

+ // A Hangul syllable completely inside a contraction is ok.

+ }

+ // Note: If there is a prefix, then the parser checked that

+ // both the prefix and the string begin with NFC boundaries (not Jamo V or T).

+ // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))

+ // (While handling a Hangul syllable, prefixes on Jamo V or T

+ // would not see the previous Jamo of that syllable.)

+ if(strength != UCOL_IDENTICAL) {

+ // Find the node index after which we insert the new tailored node.

+ int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);

+ U_ASSERT(cesLength > 0);

+ int64_t ce = ces[cesLength - 1];

+ if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) {

+ // There is no primary gap between ignorables and the space-first-primary.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "tailoring primary after ignorables not supported";

+ return;

+ }

+ if(strength == UCOL_QUATERNARY && ce == 0) {

+ // The CE data structure does not support non-zero quaternary weights

+ // on tertiary ignorables.

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";

+ return;

+ }

+ // Insert the new tailored node.

+ index = insertTailoredNodeAfter(index, strength, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "modifying collation elements";

+ return;

+ }

+ // Strength of the temporary CE:

+ // The new relation may yield a stronger CE but not a weaker one.

+ int32_t tempStrength = ceStrength(ce);

+ if(strength < tempStrength) { tempStrength = strength; }

+ ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);

+ }

+ setCaseBits(nfdString, parserErrorReason, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ int32_t cesLengthBeforeExtension = cesLength;

+ if(!extension.isEmpty()) {

+ UnicodeString nfdExtension = nfd.normalize(extension, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "normalizing the relation extension";

+ return;

+ }

+ cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);

+ if(cesLength > Collation::MAX_EXPANSION_LENGTH) {

+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;

+ parserErrorReason =

+ "extension string adds too many collation elements (more than 31 total)";

+ return;

+ }

+ uint32_t ce32 = Collation::UNASSIGNED_CE32;

+ if((prefix != nfdPrefix || str != nfdString) &&

+ !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {

+ // Map from the original input to the CEs.

+ // We do this in case the canonical closure is incomplete,

+ // so that it is possible to explicitly provide the missing mappings.

+ ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);

+ }

+ addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "writing collation elements";

+ return;

+ }

+ cesLength = cesLengthBeforeExtension;

+int32_t

+CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);

+ // Find the last CE that is at least as "strong" as the requested difference.

+ // Note: Stronger is smaller (UCOL_PRIMARY=0).

+ int64_t ce;

+ for(;; --cesLength) {

+ if(cesLength == 0) {

+ ce = ces[0] = 0;

+ cesLength = 1;

+ break;

+ } else {

+ ce = ces[cesLength - 1];

+ }

+ if(ceStrength(ce) <= strength) { break; }

+ }

+ if(isTempCE(ce)) {

+ // No need to findCommonNode() here for lower levels

+ // because insertTailoredNodeAfter() will do that anyway.

+ return indexFromTempCE(ce);

+ }

+ // root CE

+ if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {

+ errorCode = U_UNSUPPORTED_ERROR;

+ parserErrorReason = "tailoring relative to an unassigned code point not supported";

+ return 0;

+ }

+ return findOrInsertNodeForRootCE(ce, strength, errorCode);

+int32_t

+CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);

+ // Find or insert the node for each of the root CE's weights,

+ // down to the requested level/strength.

+ // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).

+ U_ASSERT((ce & 0xc0) == 0);

+ int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode);

+ if(strength >= UCOL_SECONDARY) {

+ uint32_t lower32 = (uint32_t)ce;

+ index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);

+ if(strength >= UCOL_TERTIARY) {

+ index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,

+ UCOL_TERTIARY, errorCode);

+ }

+ return index;

+namespace {

/**
 * Like Java Collections.binarySearch(List, key, Comparator).
 *
 * Looks for the primary weight p among the nodes that rootPrimaryIndexes
 * refers to. The search assumes that those nodes are in ascending order
 * of primary weight.
 *
 * @param rootPrimaryIndexes indexes into nodes[]
 * @param length             number of entries in rootPrimaryIndexes
 * @param nodes              node array; a node's primary weight is its upper 32 bits
 * @param p                  the primary weight to look for
 * @return the index>=0 where the primary was found,
 *         or the index<0 for inserting the primary at ~index in sorted order
 *         (index into rootPrimaryIndexes)
 */
int32_t
binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
                               const int64_t *nodes, uint32_t p) {
    // Empty (or nonsensical negative) length: insert at position 0.
    if(length <= 0) { return ~0; }
    int32_t start = 0;
    int32_t limit = length;
    for (;;) {
        // Midpoint computed from the width of the range,
        // so that it cannot overflow int32_t for large lengths.
        int32_t i = start + (limit - start) / 2;
        int64_t node = nodes[rootPrimaryIndexes[i]];
        uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
        if (p == nodePrimary) {
            return i;
        } else if (p < nodePrimary) {
            if (i == start) {
                return ~start;  // insert p before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert p after i
            }
            start = i;
        }
    }
}

+} // namespace

+int32_t

+CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ int32_t rootIndex = binarySearchForRootPrimaryNode(

+ rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);

+ if(rootIndex >= 0) {

+ return rootPrimaryIndexes.elementAti(rootIndex);

+ } else {

+ // Start a new list of nodes with this primary.

+ int32_t index = nodes.size();

+ nodes.addElement(nodeFromWeight32(p), errorCode);

+ rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);

+ return index;

+ }

+int32_t

+CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ U_ASSERT(0 <= index && index < nodes.size());

+ U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16);

+ // Only reset-before inserts common weights.

+ if(weight16 == Collation::COMMON_WEIGHT16) {

+ return findCommonNode(index, level);

+ }

+ // Find the root CE's weight for this level.

+ // Postpone insertion if not found:

+ // Insert the new root node before the next stronger node,

+ // or before the next root node with the same strength and a larger weight.

+ int64_t node = nodes.elementAti(index);

+ int32_t nextIndex;

+ while((nextIndex = nextIndexFromNode(node)) != 0) {

+ node = nodes.elementAti(nextIndex);

+ int32_t nextStrength = strengthFromNode(node);

+ if(nextStrength <= level) {

+ // Insert before a stronger node.

+ if(nextStrength < level) { break; }

+ // nextStrength == level

+ if(!isTailoredNode(node)) {

+ uint32_t nextWeight16 = weight16FromNode(node);

+ if(nextWeight16 == weight16) {

+ // Found the node for the root CE up to this level.

+ return nextIndex;

+ }

+ // Insert before a node with a larger same-strength weight.

+ if(nextWeight16 > weight16) { break; }

+ }

+ // Skip the next node.

+ index = nextIndex;

+ }

+ node = nodeFromWeight16(weight16) | nodeFromStrength(level);

+ return insertNodeBetween(index, nextIndex, node, errorCode);

+int32_t

+CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ U_ASSERT(0 <= index && index < nodes.size());

+ if(strength >= UCOL_SECONDARY) {

+ index = findCommonNode(index, UCOL_SECONDARY);

+ if(strength >= UCOL_TERTIARY) {

+ index = findCommonNode(index, UCOL_TERTIARY);

+ }

+ // Postpone insertion:

+ // Insert the new node before the next one with a strength at least as strong.

+ int64_t node = nodes.elementAti(index);

+ int32_t nextIndex;

+ while((nextIndex = nextIndexFromNode(node)) != 0) {

+ node = nodes.elementAti(nextIndex);

+ if(strengthFromNode(node) <= strength) { break; }

+ // Skip the next node which has a weaker (larger) strength than the new one.

+ index = nextIndex;

+ }

+ node = IS_TAILORED | nodeFromStrength(strength);

+ return insertNodeBetween(index, nextIndex, node, errorCode);

+int32_t

+CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return 0; }

+ U_ASSERT(previousIndexFromNode(node) == 0);

+ U_ASSERT(nextIndexFromNode(node) == 0);

+ U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);

+ // Append the new node and link it to the existing nodes.

+ int32_t newIndex = nodes.size();

+ node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);

+ nodes.addElement(node, errorCode);

+ if(U_FAILURE(errorCode)) { return 0; }

+ // nodes[index].nextIndex = newIndex

+ node = nodes.elementAti(index);

+ nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);

+ // nodes[nextIndex].previousIndex = newIndex

+ if(nextIndex != 0) {

+ node = nodes.elementAti(nextIndex);

+ nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);

+ }

+ return newIndex;

+int32_t

+CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {

+ U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);

+ int64_t node = nodes.elementAti(index);

+ if(strengthFromNode(node) >= strength) {

+ // The current node is no stronger.

+ return index;

+ }

+ if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {

+ // The current node implies the strength-common weight.

+ return index;

+ }

+ index = nextIndexFromNode(node);

+ node = nodes.elementAti(index);

+ U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&

+ weight16FromNode(node) == BEFORE_WEIGHT16);

+ // Skip to the explicit common node.

+ do {

+ index = nextIndexFromNode(node);

+ node = nodes.elementAti(index);

+ U_ASSERT(strengthFromNode(node) >= strength);

+ } while(isTailoredNode(node) || strengthFromNode(node) > strength);

+ U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);

+ return index;

+void

+CollationBuilder::setCaseBits(const UnicodeString &nfdString,

+ const char *&parserErrorReason, UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ int32_t numTailoredPrimaries = 0;

+ for(int32_t i = 0; i < cesLength; ++i) {

+ if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }

+ }

+ // We should not be able to get too many case bits because

+ // cesLength<=31==MAX_EXPANSION_LENGTH.

+ // 31 pairs of case bits fit into an int64_t without setting its sign bit.

+ U_ASSERT(numTailoredPrimaries <= 31);

+ int64_t cases = 0;

+ if(numTailoredPrimaries > 0) {

+ const UChar *s = nfdString.getBuffer();

+ UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length());

+ int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "fetching root CEs for tailored string";

+ return;

+ }

+ U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);

+ uint32_t lastCase = 0;

+ int32_t numBasePrimaries = 0;

+ for(int32_t i = 0; i < baseCEsLength; ++i) {

+ int64_t ce = baseCEs.getCE(i);

+ if((ce >> 32) != 0) {

+ ++numBasePrimaries;

+ uint32_t c = ((uint32_t)ce >> 14) & 3;

+ U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE

+ if(numBasePrimaries < numTailoredPrimaries) {

+ cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);

+ } else if(numBasePrimaries == numTailoredPrimaries) {

+ lastCase = c;

+ } else if(c != lastCase) {

+ // There are more base primary CEs than tailored primaries.

+ // Set mixed case if the case bits of the remainder differ.

+ lastCase = 1;

+ // Nothing more can change.

+ break;

+ }

+ if(numBasePrimaries >= numTailoredPrimaries) {

+ cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);

+ }

+ for(int32_t i = 0; i < cesLength; ++i) {

+ int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits

+ int32_t strength = ceStrength(ce);

+ if(strength == UCOL_PRIMARY) {

+ ce |= (cases & 3) << 14;

+ cases >>= 2;

+ } else if(strength == UCOL_TERTIARY) {

+ // Tertiary CEs must have uppercase bits.

+ // See the LDML spec, and comments in class CollationCompare.

+ ce |= 0x8000;

+ }

+ // Tertiary ignorable CEs must have 0 case bits.

+ // We set 0 case bits for secondary CEs too

+ // since currently only U+0345 is cased and maps to a secondary CE,

+ // and it is lowercase. Other secondaries are uncased.

+ // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.

+ ces[i] = ce;

+ }

+void

+CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ dataBuilder->suppressContractions(set, errorCode);

+ if(U_FAILURE(errorCode)) {

+ parserErrorReason = "application of [suppressContractions [set]] failed";

+ }

+void

+CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ optimizeSet.addAll(set);

+uint32_t

+CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,

+ const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,

+ UErrorCode &errorCode) {

+ // Map from the NFD input to the CEs.

+ ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);

+ ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);

+ addTailComposites(nfdPrefix, nfdString, errorCode);

+ return ce32;

+uint32_t

+CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,

+ const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return ce32; }

+ // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)

+ if(nfdPrefix.isEmpty()) {

+ CanonicalIterator stringIter(nfdString, errorCode);

+ if(U_FAILURE(errorCode)) { return ce32; }

+ UnicodeString prefix;

+ for(;;) {

+ UnicodeString str = stringIter.next();

+ if(str.isBogus()) { break; }

+ if(ignoreString(str, errorCode) || str == nfdString) { continue; }

+ ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);

+ if(U_FAILURE(errorCode)) { return ce32; }

+ }

+ } else {

+ CanonicalIterator prefixIter(nfdPrefix, errorCode);

+ CanonicalIterator stringIter(nfdString, errorCode);

+ if(U_FAILURE(errorCode)) { return ce32; }

+ for(;;) {

+ UnicodeString prefix = prefixIter.next();

+ if(prefix.isBogus()) { break; }

+ if(ignorePrefix(prefix, errorCode)) { continue; }

+ UBool samePrefix = prefix == nfdPrefix;

+ for(;;) {

+ UnicodeString str = stringIter.next();

+ if(str.isBogus()) { break; }

+ if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }

+ ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);

+ if(U_FAILURE(errorCode)) { return ce32; }

+ }

+ stringIter.reset();

+ }

+ return ce32;

+void

+CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ // Look for the last starter in the NFD string.

+ UChar32 lastStarter;

+ int32_t indexAfterLastStarter = nfdString.length();

+ for(;;) {

+ if(indexAfterLastStarter == 0) { return; } // no starter at all

+ lastStarter = nfdString.char32At(indexAfterLastStarter - 1);

+ if(nfd.getCombiningClass(lastStarter) == 0) { break; }

+ indexAfterLastStarter -= U16_LENGTH(lastStarter);

+ }

+ // No closure to Hangul syllables since we decompose them on the fly.

+ if(Hangul::isJamoL(lastStarter)) { return; }

+ // Are there any composites whose decomposition starts with the lastStarter?

+ // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.

+ // We might find some more equivalent mappings here if it did.

+ UnicodeSet composites;

+ if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }

+ UnicodeString decomp;

+ UnicodeString newNFDString, newString;

+ int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];

+ UnicodeSetIterator iter(composites);

+ while(iter.next()) {

+ U_ASSERT(!iter.isString());

+ UChar32 composite = iter.getCodepoint();

+ nfd.getDecomposition(composite, decomp);

+ if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,

+ newNFDString, newString, errorCode)) {

+ continue;

+ }

+ int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);

+ if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {

+ // Ignore mappings that we cannot store.

+ continue;

+ }

+ // Note: It is possible that the newCEs do not make use of the mapping

+ // for which we are adding the tail composites, in which case we might be adding

+ // unnecessary mappings.

+ // For example, when we add tail composites for ae^ (^=combining circumflex),

+ // UCA discontiguous-contraction matching does not find any matches

+ // for ae_^ (_=any combining diacritic below) *unless* there is also

+ // a contraction mapping for ae.

+ // Thus, if there is no ae contraction, then the ae^ mapping is ignored

+ // while fetching the newCEs for ae_^.

+ // TODO: Try to detect this effectively.

+ // (Alternatively, print a warning when prefix contractions are missing.)

+ // We do not need an explicit mapping for the NFD strings.

+ // It is fine if the NFD input collates like this via a sequence of mappings.

+ // It also saves a little bit of space, and may reduce the set of characters with contractions.

+ uint32_t ce32 = addIfDifferent(nfdPrefix, newString,

+ newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);

+ if(ce32 != Collation::UNASSIGNED_CE32) {

+ // was different, was added

+ addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);

+ }

+UBool

+CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,

+ int32_t indexAfterLastStarter,

+ UChar32 composite, const UnicodeString &decomp,

+ UnicodeString &newNFDString, UnicodeString &newString,

+ UErrorCode &errorCode) const {

+ if(U_FAILURE(errorCode)) { return FALSE; }

+ U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));

+ int32_t lastStarterLength = decomp.moveIndex32(0, 1);

+ if(lastStarterLength == decomp.length()) {

+ // Singleton decompositions should be found by addWithClosure()

+ // and the CanonicalIterator, so we can ignore them here.

+ return FALSE;

+ }

+ if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,

+ decomp, lastStarterLength, 0x7fffffff) == 0) {

+ // same strings, nothing new to be found here

+ return FALSE;

+ }

+ // Make new FCD strings that combine a composite, or its decomposition,

+ // into the nfdString's last starter and the combining marks following it.

+ // Make an NFD version, and a version with the composite.

+ newNFDString.setTo(nfdString, 0, indexAfterLastStarter);

+ newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);

+ // The following is related to discontiguous contraction matching,

+ // but builds only FCD strings (or else returns FALSE).

+ int32_t sourceIndex = indexAfterLastStarter;

+ int32_t decompIndex = lastStarterLength;

+ // Small optimization: We keep the source character across loop iterations

+ // because we do not always consume it,

+ // and then need not fetch it again nor look up its combining class again.

+ UChar32 sourceChar = U_SENTINEL;

+ // The cc variables need to be declared before the loop so that at the end

+ // they are set to the last combining classes seen.

+ uint8_t sourceCC = 0;

+ uint8_t decompCC = 0;

+ for(;;) {

+ if(sourceChar < 0) {

+ if(sourceIndex >= nfdString.length()) { break; }

+ sourceChar = nfdString.char32At(sourceIndex);

+ sourceCC = nfd.getCombiningClass(sourceChar);

+ U_ASSERT(sourceCC != 0);

+ }

+ // We consume a decomposition character in each iteration.

+ if(decompIndex >= decomp.length()) { break; }

+ UChar32 decompChar = decomp.char32At(decompIndex);

+ decompCC = nfd.getCombiningClass(decompChar);

+ // Compare the two characters and their combining classes.

+ if(decompCC == 0) {

+ // Unable to merge because the source contains a non-zero combining mark

+ // but the composite's decomposition contains another starter.

+ // The strings would not be equivalent.

+ return FALSE;

+ } else if(sourceCC < decompCC) {

+ // Composite + sourceChar would not be FCD.

+ return FALSE;

+ } else if(decompCC < sourceCC) {

+ newNFDString.append(decompChar);

+ decompIndex += U16_LENGTH(decompChar);

+ } else if(decompChar != sourceChar) {

+ // Blocked because same combining class.

+ return FALSE;

+ } else { // match: decompChar == sourceChar

+ newNFDString.append(decompChar);

+ decompIndex += U16_LENGTH(decompChar);

+ sourceIndex += U16_LENGTH(decompChar);

+ sourceChar = U_SENTINEL;

+ }

+ // We are at the end of at least one of the two inputs.

+ if(sourceChar >= 0) { // more characters from nfdString but not from decomp

+ if(sourceCC < decompCC) {

+ // Appending the next source character to the composite would not be FCD.

+ return FALSE;

+ }

+ newNFDString.append(nfdString, sourceIndex, 0x7fffffff);

+ newString.append(nfdString, sourceIndex, 0x7fffffff);

+ } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString

+ newNFDString.append(decomp, decompIndex, 0x7fffffff);

+ }

+ U_ASSERT(nfd.isNormalized(newNFDString, errorCode));

+ U_ASSERT(fcd.isNormalized(newString, errorCode));

+ U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent

+ return TRUE;

+UBool

+CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {

+ // Do not map non-FCD prefixes.

+ return !isFCD(s, errorCode);

+UBool

+CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {

+ // Do not map non-FCD strings.

+ // Do not map strings that start with Hangul syllables: We decompose those on the fly.

+ return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));

+UBool

+CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {

+ return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);

+void

+CollationBuilder::closeOverComposites(UErrorCode &errorCode) {

+ UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); // Java: static final

+ if(U_FAILURE(errorCode)) { return; }

+ // Hangul is decomposed on the fly during collation.

+ composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);

+ UnicodeString prefix; // empty

+ UnicodeString nfdString;

+ UnicodeSetIterator iter(composites);

+ while(iter.next()) {

+ U_ASSERT(!iter.isString());

+ nfd.getDecomposition(iter.getCodepoint(), nfdString);

+ cesLength = dataBuilder->getCEs(nfdString, ces, 0);

+ if(cesLength > Collation::MAX_EXPANSION_LENGTH) {

+ // Too many CEs from the decomposition (unusual), ignore this composite.

+ // We could add a capacity parameter to getCEs() and reallocate if necessary.

+ // However, this can only really happen in contrived cases.

+ continue;

+ }

+ const UnicodeString &composite(iter.getString());

+ addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);

+ }

+uint32_t

+CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,

+ const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,

+ UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return ce32; }

+ int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];

+ int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);

+ if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {

+ if(ce32 == Collation::UNASSIGNED_CE32) {

+ ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);

+ }

+ dataBuilder->addCE32(prefix, str, ce32, errorCode);

+ }

+ return ce32;

+UBool

+CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,

+ const int64_t ces2[], int32_t ces2Length) {

+ if(ces1Length != ces2Length) {

+ return FALSE;

+ }

+ U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);

+ for(int32_t i = 0; i < ces1Length; ++i) {

+ if(ces1[i] != ces2[i]) { return FALSE; }

+ }

+ return TRUE;

+#ifdef DEBUG_COLLATION_BUILDER

/**
 * Debug helper: shifts a weight right by whole bytes until its lowest byte
 * is non-zero, for printing.
 *
 * @param w the weight
 * @return w without trailing zero bytes; 0 stays 0
 */
uint32_t
alignWeightRight(uint32_t w) {
    if(w != 0) {
        // w != 0 guarantees that this loop terminates.
        while((w & 0xff) == 0) { w >>= 8; }
    }
    return w;
}

+#endif

+void

+CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ CollationWeights primaries, secondaries, tertiaries;

+ int64_t *nodesArray = nodes.getBuffer();

+ for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {

+ int32_t i = rootPrimaryIndexes.elementAti(rpi);

+ int64_t node = nodesArray[i];

+ uint32_t p = weight32FromNode(node);

+ uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;

+ uint32_t t = s;

+ uint32_t q = 0;

+ UBool pIsTailored = FALSE;

+ UBool sIsTailored = FALSE;

+ UBool tIsTailored = FALSE;

+#ifdef DEBUG_COLLATION_BUILDER

+ printf("\nprimary %lx\n", (long)alignWeightRight(p));

+#endif

+ int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);

+ int32_t nextIndex = nextIndexFromNode(node);

+ while(nextIndex != 0) {

+ i = nextIndex;

+ node = nodesArray[i];

+ nextIndex = nextIndexFromNode(node);

+ int32_t strength = strengthFromNode(node);

+ if(strength == UCOL_QUATERNARY) {

+ U_ASSERT(isTailoredNode(node));

+#ifdef DEBUG_COLLATION_BUILDER

+ printf(" quat+ ");

+#endif

+ if(q == 3) {

+ errorCode = U_BUFFER_OVERFLOW_ERROR;

+ errorReason = "quaternary tailoring gap too small";

+ return;

+ }

+ ++q;

+ } else {

+ if(strength == UCOL_TERTIARY) {

+ if(isTailoredNode(node)) {

+#ifdef DEBUG_COLLATION_BUILDER

+ printf(" ter+ ");

+#endif

+ if(!tIsTailored) {

+ // First tailored tertiary node for [p, s].

+ int32_t tCount = countTailoredNodes(nodesArray, nextIndex,

+ UCOL_TERTIARY) + 1;

+ uint32_t tLimit;

+ if(t == 0) {

+ // Gap at the beginning of the tertiary CE range.

+ t = rootElements.getTertiaryBoundary() - 0x100;

+ tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;

+ } else if(t == BEFORE_WEIGHT16) {

+ tLimit = Collation::COMMON_WEIGHT16;

+ } else if(!pIsTailored && !sIsTailored) {

+ // p and s are root weights.

+ tLimit = rootElements.getTertiaryAfter(pIndex, s, t);

+ } else {

+ // [p, s] is tailored.

+ U_ASSERT(t == Collation::COMMON_WEIGHT16);

+ tLimit = rootElements.getTertiaryBoundary();

+ }

+ U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);

+ tertiaries.initForTertiary();

+ if(!tertiaries.allocWeights(t, tLimit, tCount)) {

+ errorCode = U_BUFFER_OVERFLOW_ERROR;

+ errorReason = "tertiary tailoring gap too small";

+ return;

+ }

+ tIsTailored = TRUE;

+ }

+ t = tertiaries.nextWeight();

+ U_ASSERT(t != 0xffffffff);

+ } else {

+ t = weight16FromNode(node);

+ tIsTailored = FALSE;

+#ifdef DEBUG_COLLATION_BUILDER

+ printf(" ter %lx\n", (long)alignWeightRight(t));

+#endif

+ }

+ } else {

+ if(strength == UCOL_SECONDARY) {

+ if(isTailoredNode(node)) {

+#ifdef DEBUG_COLLATION_BUILDER

+ printf(" sec+ ");

+#endif

+ if(!sIsTailored) {

+ // First tailored secondary node for p.

+ int32_t sCount = countTailoredNodes(nodesArray, nextIndex,

+ UCOL_SECONDARY) + 1;

+ uint32_t sLimit;

+ if(s == 0) {

+ // Gap at the beginning of the secondary CE range.

+ s = rootElements.getSecondaryBoundary() - 0x100;

+ sLimit = rootElements.getFirstSecondaryCE() >> 16;

+ } else if(s == BEFORE_WEIGHT16) {

+ sLimit = Collation::COMMON_WEIGHT16;

+ } else if(!pIsTailored) {

+ // p is a root primary.

+ sLimit = rootElements.getSecondaryAfter(pIndex, s);

+ } else {

+ // p is a tailored primary.

+ U_ASSERT(s == Collation::COMMON_WEIGHT16);

+ sLimit = rootElements.getSecondaryBoundary();

+ }

+ if(s == Collation::COMMON_WEIGHT16) {

+ // Do not tailor into the getSortKey() range of

+ // compressed common secondaries.

+ s = rootElements.getLastCommonSecondary();

+ }

+ secondaries.initForSecondary();

+ if(!secondaries.allocWeights(s, sLimit, sCount)) {

+ errorCode = U_BUFFER_OVERFLOW_ERROR;

+ errorReason = "secondary tailoring gap too small";

+ return;

+ }

+ sIsTailored = TRUE;

+ }

+ s = secondaries.nextWeight();

+ U_ASSERT(s != 0xffffffff);

+ } else {

+ s = weight16FromNode(node);

+ sIsTailored = FALSE;

+#ifdef DEBUG_COLLATION_BUILDER

+ printf(" sec %lx\n", (long)alignWeightRight(s));

+#endif

+ }

+ } else /* UCOL_PRIMARY */ {

+ U_ASSERT(isTailoredNode(node));

+#ifdef DEBUG_COLLATION_BUILDER

+ printf("pri+ ");

+#endif

+ if(!pIsTailored) {

+ // First tailored primary node in this list.

+ int32_t pCount = countTailoredNodes(nodesArray, nextIndex,

+ UCOL_PRIMARY) + 1;

+ UBool isCompressible = baseData->isCompressiblePrimary(p);

+ uint32_t pLimit =

+ rootElements.getPrimaryAfter(p, pIndex, isCompressible);

+ primaries.initForPrimary(isCompressible);

+ if(!primaries.allocWeights(p, pLimit, pCount)) {

+ errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode?

+ errorReason = "primary tailoring gap too small";

+ return;

+ }

+ pIsTailored = TRUE;

+ }

+ p = primaries.nextWeight();

+ U_ASSERT(p != 0xffffffff);

+ s = Collation::COMMON_WEIGHT16;

+ sIsTailored = FALSE;

+ }

+ t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;

+ tIsTailored = FALSE;

+ }

+ q = 0;

+ }

+ if(isTailoredNode(node)) {

+ nodesArray[i] = Collation::makeCE(p, s, t, q);

+#ifdef DEBUG_COLLATION_BUILDER

+ printf("%016llx\n", (long long)nodesArray[i]);

+#endif

+ }

+int32_t

+CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {

+ int32_t count = 0;

+ for(;;) {

+ if(i == 0) { break; }

+ int64_t node = nodesArray[i];

+ if(strengthFromNode(node) < strength) { break; }

+ if(strengthFromNode(node) == strength) {

+ if(isTailoredNode(node)) {

+ ++count;

+ } else {

+ break;

+ }

+ i = nextIndexFromNode(node);

+ }

+ return count;

+class CEFinalizer : public CollationDataBuilder::CEModifier {

+public:

+ CEFinalizer(const int64_t *ces) : finalCEs(ces) {}

+ virtual ~CEFinalizer();

+ virtual int64_t modifyCE32(uint32_t ce32) const {

+ U_ASSERT(!Collation::isSpecialCE32(ce32));

+ if(CollationBuilder::isTempCE32(ce32)) {

+ // retain case bits

+ return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);

+ } else {

+ return Collation::NO_CE;

+ }

+ virtual int64_t modifyCE(int64_t ce) const {

+ if(CollationBuilder::isTempCE(ce)) {

+ // retain case bits

+ return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);

+ } else {

+ return Collation::NO_CE;

+ }

+private:

+ const int64_t *finalCEs;

+};

+CEFinalizer::~CEFinalizer() {}

+void

+CollationBuilder::finalizeCEs(UErrorCode &errorCode) {

+ if(U_FAILURE(errorCode)) { return; }

+ LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode));

+ if(newBuilder.isNull()) {

+ errorCode = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ newBuilder->initForTailoring(baseData, errorCode);

+ CEFinalizer finalizer(nodes.getBuffer());

+ newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);

+ if(U_FAILURE(errorCode)) { return; }

+ delete dataBuilder;

+ dataBuilder = newBuilder.orphan();

+int32_t

+CollationBuilder::ceStrength(int64_t ce) {

+ return

+ isTempCE(ce) ? strengthFromTempCE(ce) :

+ (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :

+ ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :

+ ce != 0 ? UCOL_TERTIARY :

+ UCOL_IDENTICAL;

+U_NAMESPACE_END

+U_NAMESPACE_USE

+U_CAPI UCollator * U_EXPORT2

+ucol_openRules(const UChar *rules, int32_t rulesLength,

+ UColAttributeValue normalizationMode, UCollationStrength strength,

+ UParseError *parseError, UErrorCode *pErrorCode) {

+ if(U_FAILURE(*pErrorCode)) { return NULL; }

+ if(rules == NULL && rulesLength != 0) {

+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+ }

+ RuleBasedCollator *coll = new RuleBasedCollator();

+ if(coll == NULL) {

+ *pErrorCode = U_MEMORY_ALLOCATION_ERROR;

+ return NULL;

+ }

+ UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength);

+ coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode);

+ if(U_FAILURE(*pErrorCode)) {

+ delete coll;

+ return NULL;

+ }

+ return coll->toUCollator();

+static const int32_t internalBufferSize = 512;

+// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp

+// because it calls UnicodeSet "builder" code that depends on all Unicode properties,

+// and the rest of the collation "runtime" code only depends on normalization.

+// This function is not related to the collation builder,

+// but it did not seem worth moving it into its own .cpp file,

+// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.

+U_CAPI int32_t U_EXPORT2

+ucol_getUnsafeSet( const UCollator *coll,

+ USet *unsafe,

+ UErrorCode *status)

+ UChar buffer[internalBufferSize];

+ int32_t len = 0;

+ uset_clear(unsafe);

+ // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant

+ static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,

+ 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };

+ // add chars that fail the fcd check

+ uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);

+ // add lead/trail surrogates

+ // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,

+ // not when testing code *points*)

+ uset_addRange(unsafe, 0xd800, 0xdfff);

+ USet *contractions = uset_open(0,0);

+ int32_t i = 0, j = 0;

+ ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);

+ int32_t contsSize = uset_size(contractions);

+ UChar32 c = 0;

+ // Contraction set consists only of strings

+ // to get unsafe code points, we need to

+ // break the strings apart and add them to the unsafe set

+ for(i = 0; i < contsSize; i++) {

+ len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);

+ if(len > 0) {

+ j = 0;

+ while(j < len) {

+ U16_NEXT(buffer, j, len, c);

+ if(j < len) {

+ uset_add(unsafe, c);

+ }

+ uset_close(contractions);

+ return uset_size(unsafe);

+#endif // !UCONFIG_NO_COLLATION

« no previous file with comments | « source/i18n/collationbuilder.h ('k') | source/i18n/collationcompare.h » ('j') | no next file with comments »