| Index: source/i18n/collationbasedatabuilder.cpp
|
| diff --git a/source/i18n/collationbasedatabuilder.cpp b/source/i18n/collationbasedatabuilder.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..cb09ee6dd568bd969ba101a28ff8aa6ab33028ec
|
| --- /dev/null
|
| +++ b/source/i18n/collationbasedatabuilder.cpp
|
| @@ -0,0 +1,492 @@
|
| +/*
|
| +*******************************************************************************
|
| +* Copyright (C) 2012-2014, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*******************************************************************************
|
| +* collationbasedatabuilder.cpp
|
| +*
|
| +* created on: 2012aug11
|
| +* created by: Markus W. Scherer
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_COLLATION
|
| +
|
| +#include "unicode/localpointer.h"
|
| +#include "unicode/ucharstriebuilder.h"
|
| +#include "unicode/uniset.h"
|
| +#include "unicode/unistr.h"
|
| +#include "unicode/utf16.h"
|
| +#include "collation.h"
|
| +#include "collationbasedatabuilder.h"
|
| +#include "collationdata.h"
|
| +#include "collationdatabuilder.h"
|
| +#include "collationrootelements.h"
|
| +#include "normalizer2impl.h"
|
| +#include "uassert.h"
|
| +#include "utrie2.h"
|
| +#include "uvectr32.h"
|
| +#include "uvectr64.h"
|
| +#include "uvector.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +namespace {
|
| +
|
| +/**
|
| + * Compare two signed int64_t values as if they were unsigned.
|
| + */
|
| +int32_t
|
| +compareInt64AsUnsigned(int64_t a, int64_t b) {
|
| + if((uint64_t)a < (uint64_t)b) {
|
| + return -1;
|
| + } else if((uint64_t)a > (uint64_t)b) {
|
| + return 1;
|
| + } else {
|
| + return 0;
|
| + }
|
| +}
|
| +
|
| +// TODO: Try to merge this with the binarySearch in alphaindex.cpp.
|
| +/**
|
| + * Like Java Collections.binarySearch(List, String, Comparator).
|
| + *
|
| + * @return the index>=0 where the item was found,
|
| + * or the index<0 for inserting the string at ~index in sorted order
|
| + */
|
| +int32_t
|
| +binarySearch(const UVector64 &list, int64_t ce) {
|
| + if (list.size() == 0) { return ~0; }
|
| + int32_t start = 0;
|
| + int32_t limit = list.size();
|
| + for (;;) {
|
| + int32_t i = (start + limit) / 2;
|
| + int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
|
| + if (cmp == 0) {
|
| + return i;
|
| + } else if (cmp < 0) {
|
| + if (i == start) {
|
| + return ~start; // insert ce before i
|
| + }
|
| + limit = i;
|
| + } else {
|
| + if (i == start) {
|
| + return ~(start + 1); // insert ce after i
|
| + }
|
| + start = i;
|
| + }
|
| + }
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| +CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
|
| + : CollationDataBuilder(errorCode),
|
| + numericPrimary(0x12000000),
|
| + firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
|
| + rootElements(errorCode) {
|
| +}
|
| +
|
| +CollationBaseDataBuilder::~CollationBaseDataBuilder() {
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::init(UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode)) { return; }
|
| + if(trie != NULL) {
|
| + errorCode = U_INVALID_STATE_ERROR;
|
| + return;
|
| + }
|
| +
|
| + // Not compressible:
|
| + // - digits
|
| + // - Latin
|
| + // - Hani
|
| + // - trail weights
|
| + // Some scripts are compressible, some are not.
|
| + uprv_memset(compressibleBytes, FALSE, 256);
|
| + compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE;
|
| +
|
| + // For a base, the default is to compute an unassigned-character implicit CE.
|
| + // This includes surrogate code points; see the last option in
|
| + // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
|
| + trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
|
| +
|
| + // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
|
| + for(UChar32 c = 0; c < 0x180; ++c) {
|
| + utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
|
| + }
|
| +
|
| + utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
|
| + // No root element for the merge separator which has 02 weights.
|
| + // Some code assumes that the root first primary CE is the "space first primary"
|
| + // from FractionalUCA.txt.
|
| +
|
| + uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
|
| + utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
|
| +
|
| + // Add a mapping for the first-unassigned boundary,
|
| + // which is the AlphabeticIndex overflow boundary.
|
| + UnicodeString s((UChar)0xfdd1); // Script boundary contractions start with U+FDD1.
|
| + s.append((UChar)0xfdd0); // Zzzz script sample character U+FDD0.
|
| + int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
|
| + add(UnicodeString(), s, &ce, 1, errorCode);
|
| +
|
| + // Add a tailoring boundary, but not a mapping, for [first trailing].
|
| + ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
|
| + rootElements.addElement(ce, errorCode);
|
| +
|
| + // U+FFFD maps to a CE with the third-highest primary weight,
|
| + // for predictable handling of ill-formed UTF-8.
|
| + uint32_t ce32 = Collation::FFFD_CE32;
|
| + utrie2_set32(trie, 0xfffd, ce32, &errorCode);
|
| + addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
|
| +
|
| + // U+FFFF maps to a CE with the highest primary weight.
|
| + ce32 = Collation::MAX_REGULAR_CE32;
|
| + utrie2_set32(trie, 0xffff, ce32, &errorCode);
|
| + addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
|
| + UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode) || length == 0) { return; }
|
| + if((length & 1) != 0) { // incomplete start/end pairs
|
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
| + return;
|
| + }
|
| + if(isAssigned(0x4e00)) { // already set
|
| + errorCode = U_INVALID_STATE_ERROR;
|
| + return;
|
| + }
|
| + int32_t numHanCodePoints = 0;
|
| + for(int32_t i = 0; i < length; i += 2) {
|
| + UChar32 start = ranges[i];
|
| + UChar32 end = ranges[i + 1];
|
| + numHanCodePoints += end - start + 1;
|
| + }
|
| + // Multiply the number of code points by (gap+1).
|
| + // Add hanStep+2 for tailoring after the last Han character.
|
| + int32_t gap = 1;
|
| + hanStep = gap + 1;
|
| + int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
|
| + // Numbers of Han primaries per lead byte determined by
|
| + // numbers of 2nd (not compressible) times 3rd primary byte values.
|
| + int32_t numHanPerLeadByte = 254 * 254;
|
| + int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
|
| + uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
|
| + hanPrimary |= 0x20200;
|
| + firstHanPrimary = hanPrimary;
|
| + for(int32_t i = 0; i < length; i += 2) {
|
| + UChar32 start = ranges[i];
|
| + UChar32 end = ranges[i + 1];
|
| + hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
|
| + }
|
| + // One past the actual last one, but that is harmless for tailoring.
|
| + // It saves us from subtracting "hanStep" and handling underflows.
|
| + lastHanPrimary = hanPrimary;
|
| +}
|
| +
|
| +UBool
|
| +CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
|
| + return compressibleBytes[b];
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
|
| + compressibleBytes[b] = TRUE;
|
| +}
|
| +
|
| +int32_t
|
| +CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
|
| + if((p1 & 0xff000000) == (p2 & 0xff000000)) {
|
| + // Same lead bytes.
|
| + return (int32_t)(p2 - p1) >> 16;
|
| + } else {
|
| + int32_t linear1;
|
| + int32_t linear2;
|
| + int32_t factor;
|
| + if(isCompressible) {
|
| + // Second byte for compressible lead byte: 251 bytes 04..FE
|
| + linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
|
| + linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
|
| + factor = 251;
|
| + } else {
|
| + // Second byte for incompressible lead byte: 254 bytes 02..FF
|
| + linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
|
| + linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
|
| + factor = 254;
|
| + }
|
| + linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
|
| + linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
|
| + return linear2 - linear1;
|
| + }
|
| +}
|
| +
|
| +int32_t
|
| +CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
|
| + if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
|
| + // Same first two bytes.
|
| + return (int32_t)(p2 - p1) >> 8;
|
| + } else {
|
| + // Third byte: 254 bytes 02..FF
|
| + int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
|
| + int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
|
| + int32_t factor;
|
| + if(isCompressible) {
|
| + // Second byte for compressible lead byte: 251 bytes 04..FE
|
| + linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
|
| + linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
|
| + factor = 251 * 254;
|
| + } else {
|
| + // Second byte for incompressible lead byte: 254 bytes 02..FF
|
| + linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
|
| + linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
|
| + factor = 254 * 254;
|
| + }
|
| + linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
|
| + linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
|
| + return linear2 - linear1;
|
| + }
|
| +}
|
| +
|
| +uint32_t
|
| +CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
|
| + addRootElements(ces, cesLength, errorCode);
|
| + return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
|
| + UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode)) { return; }
|
| + for(int32_t i = 0; i < cesLength; ++i) {
|
| + addRootElement(ces[i], errorCode);
|
| + }
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode) || ce == 0) { return; }
|
| + // Remove case bits.
|
| + ce &= INT64_C(0xffffffffffff3fff);
|
| + U_ASSERT((ce & 0xc0) == 0); // quaternary==0
|
| + // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
|
| + // We will add it later, as part of the Han ranges.
|
| + uint32_t p = (uint32_t)(ce >> 32);
|
| + uint32_t secTer = (uint32_t)ce;
|
| + if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
|
| + if(firstHanPrimary <= p && p <= lastHanPrimary) {
|
| + return;
|
| + }
|
| + } else {
|
| + // Check that secondary and tertiary weights are >= "common".
|
| + uint32_t s = secTer >> 16;
|
| + uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
|
| + if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collation::COMMON_WEIGHT16)) {
|
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
| + return;
|
| + }
|
| + }
|
| + // Check that primaries have at most 3 bytes.
|
| + if((p & 0xff) != 0) {
|
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
| + return;
|
| + }
|
| + int32_t i = binarySearch(rootElements, ce);
|
| + if(i < 0) {
|
| + rootElements.insertElementAt(ce, ~i, errorCode);
|
| + }
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
|
| + const UnicodeString &groupScripts,
|
| + UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode)) { return; }
|
| + if(groupScripts.isEmpty()) {
|
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
| + return;
|
| + }
|
| + if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
|
| + // Zzzz must not occur.
|
| + // It is the code used in the API to separate low and high scripts.
|
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
| + return;
|
| + }
|
| + // Note: We are mostly trusting the input data,
|
| + // rather than verifying that reordering groups do not intersect
|
| + // with their lead byte ranges nor their sets of scripts,
|
| + // and that all script codes are valid.
|
| + scripts.append((UChar)((firstByte << 8) | lastByte));
|
| + scripts.append((UChar)groupScripts.length());
|
| + scripts.append(groupScripts);
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
|
| + buildMappings(data, errorCode);
|
| + data.numericPrimary = numericPrimary;
|
| + data.compressibleBytes = compressibleBytes;
|
| + data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
|
| + data.scriptsLength = scripts.length();
|
| + buildFastLatinTable(data, errorCode);
|
| +}
|
| +
|
| +void
|
| +CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode)) { return; }
|
| + uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the last Han range.
|
| + uint32_t prevPrimary = 0; // Start with primary ignorable CEs.
|
| + UBool tryRange = FALSE;
|
| + for(int32_t i = 0; i < rootElements.size(); ++i) {
|
| + int64_t ce = rootElements.elementAti(i);
|
| + uint32_t p = (uint32_t)(ce >> 32);
|
| + uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
|
| + if(p != prevPrimary) {
|
| + U_ASSERT((p & 0xff) == 0);
|
| + int32_t end;
|
| + if(p >= nextHanPrimary) {
|
| + // Add a Han primary weight or range.
|
| + // We omitted them initially, and omitted all CEs with Han primaries
|
| + // and common secondary/tertiary weights.
|
| + U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE);
|
| + if(p == nextHanPrimary) {
|
| + // One single Han primary with non-common secondary/tertiary weights.
|
| + table.addElement((int32_t)p, errorCode);
|
| + if(p < lastHanPrimary) {
|
| + // Prepare for the next Han range.
|
| + nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
|
| + } else {
|
| + // p is the last Han primary.
|
| + nextHanPrimary = 0xffffffff;
|
| + }
|
| + } else {
|
| + // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
|
| + table.addElement((int32_t)nextHanPrimary, errorCode);
|
| + if(nextHanPrimary == lastHanPrimary) {
|
| + // nextHanPrimary == lastHanPrimary < p
|
| + // We just wrote the single last Han primary.
|
| + nextHanPrimary = 0xffffffff;
|
| + } else if(p < lastHanPrimary) {
|
| + // nextHanPrimary < p < lastHanPrimary
|
| + // End the Han range on p, prepare for the next range.
|
| + table.addElement((int32_t)p | hanStep, errorCode);
|
| + nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
|
| + } else if(p == lastHanPrimary) {
|
| + // nextHanPrimary < p == lastHanPrimary
|
| + // End the last Han range on p.
|
| + table.addElement((int32_t)p | hanStep, errorCode);
|
| + nextHanPrimary = 0xffffffff;
|
| + } else {
|
| + // nextHanPrimary < lastHanPrimary < p
|
| + // End the last Han range, then write p.
|
| + table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
|
| + nextHanPrimary = 0xffffffff;
|
| + table.addElement((int32_t)p, errorCode);
|
| + }
|
| + }
|
| + } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
|
| + (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
|
| + // Multiple CEs with only common secondary/tertiary weights were
|
| + // combined into a primary range.
|
| + // The range end was written, ending with the primary of rootElements[end].
|
| + ce = rootElements.elementAti(end);
|
| + p = (uint32_t)(ce >> 32);
|
| + secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
|
| + i = end;
|
| + } else {
|
| + // Write the primary weight of a normal CE.
|
| + table.addElement((int32_t)p, errorCode);
|
| + }
|
| + prevPrimary = p;
|
| + }
|
| + if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
|
| + // The common secondar/tertiary weights are implied in the primary unit.
|
| + // If there is no intervening delta unit, then we will try to combine
|
| + // the next several primaries into a range.
|
| + tryRange = TRUE;
|
| + } else {
|
| + // For each new set of secondary/tertiary weights we write a delta unit.
|
| + table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
|
| + tryRange = FALSE;
|
| + }
|
| + }
|
| +
|
| + // Limit sentinel for root elements.
|
| + // This allows us to reduce range checks at runtime.
|
| + table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
|
| +}
|
| +
|
| +int32_t
|
| +CollationBaseDataBuilder::writeRootElementsRange(
|
| + uint32_t prevPrimary, uint32_t p, int32_t i,
|
| + UVector32 &table, UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
|
| + U_ASSERT(prevPrimary < p);
|
| + // No ranges of single-byte primaries.
|
| + if((p & prevPrimary & 0xff0000) == 0) { return 0; }
|
| + // Lead bytes of compressible primaries must match.
|
| + UBool isCompressible = isCompressiblePrimary(p);
|
| + if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
|
| + (p & 0xff000000) != (prevPrimary & 0xff000000)) {
|
| + return 0;
|
| + }
|
| + // Number of bytes in the primaries.
|
| + UBool twoBytes;
|
| + // Number of primaries from prevPrimary to p.
|
| + int32_t step;
|
| + if((p & 0xff00) == 0) {
|
| + // 2-byte primary
|
| + if((prevPrimary & 0xff00) != 0) { return 0; } // length mismatch
|
| + twoBytes = TRUE;
|
| + step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
|
| + } else {
|
| + // 3-byte primary
|
| + if((prevPrimary & 0xff00) == 0) { return 0; } // length mismatch
|
| + twoBytes = FALSE;
|
| + step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
|
| + }
|
| + if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
|
| + // See if there are more than two CEs with primaries increasing by "step"
|
| + // and with only common secondary/tertiary weights on all but the last one.
|
| + int32_t end = 0; // Initially 0: No range for just two primaries.
|
| + for(;;) {
|
| + prevPrimary = p;
|
| + // Calculate which primary we expect next.
|
| + uint32_t nextPrimary; // = p + step
|
| + if(twoBytes) {
|
| + nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
|
| + } else {
|
| + nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
|
| + }
|
| + // Fetch the actual next CE.
|
| + int64_t ce = rootElements.elementAti(i);
|
| + p = (uint32_t)(ce >> 32);
|
| + uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
|
| + // Does this primary increase by "step" from the last one?
|
| + if(p != nextPrimary ||
|
| + // Do not cross into a new lead byte if either is compressible.
|
| + ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
|
| + (isCompressible || isCompressiblePrimary(p)))) {
|
| + // The range ends with the previous CE.
|
| + p = prevPrimary;
|
| + break;
|
| + }
|
| + // Extend the range to include this primary.
|
| + end = i++;
|
| + // This primary is the last in the range if it has non-common weights
|
| + // or if we are at the end of the list.
|
| + if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
|
| + }
|
| + if(end != 0) {
|
| + table.addElement((int32_t)p | step, errorCode);
|
| + }
|
| + return end;
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif // !UCONFIG_NO_COLLATION
|
|
|