| Index: source/i18n/collationcompare.cpp
|
| diff --git a/source/i18n/collationcompare.cpp b/source/i18n/collationcompare.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..6f9107e9db516416fdbdec4493f6cd585d9897ac
|
| --- /dev/null
|
| +++ b/source/i18n/collationcompare.cpp
|
| @@ -0,0 +1,363 @@
|
| +/*
|
| +*******************************************************************************
|
| +* Copyright (C) 1996-2014, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*******************************************************************************
|
| +* collationcompare.cpp
|
| +*
|
| +* created on: 2012feb14 with new and old collation code
|
| +* created by: Markus W. Scherer
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_COLLATION
|
| +
|
| +#include "unicode/ucol.h"
|
| +#include "cmemory.h"
|
| +#include "collation.h"
|
| +#include "collationcompare.h"
|
| +#include "collationiterator.h"
|
| +#include "collationsettings.h"
|
| +#include "uassert.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +UCollationResult  // Compares the CEs of two iterators level by level (primary first); returns at the first difference found.
|
| +CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
|
| + const CollationSettings &settings,
|
| + UErrorCode &errorCode) {
|
| + if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }  // Incoming error: return without reading either iterator.
|
| +
|
| + int32_t options = settings.options;
|
| + uint32_t variableTop;
|
| + if((options & CollationSettings::ALTERNATE_MASK) == 0) {
|
| + variableTop = 0;  // No unsigned primary is < 0, so the "variable CE" branches below are never taken.
|
| + } else {
|
| + // +1 so that we can use "<" and primary ignorables test out early.
|
| + variableTop = settings.variableTop + 1;
|
| + }
|
| + UBool anyVariable = FALSE;  // Set when either side yields a variable CE; tested again before the quaternary loop.
|
| +
|
| + // Fetch CEs, compare primaries, store secondary & tertiary weights.
|
| + U_ALIGN_CODE(16);
|
| + for(;;) {
|
| + // We fetch CEs until we get a non-ignorable primary or reach the end.
|
| + uint32_t leftPrimary;
|
| + do {
|
| + int64_t ce = left.nextCE(errorCode);
|
| + leftPrimary = (uint32_t)(ce >> 32);  // The upper 32 bits of the CE are used as the primary weight.
|
| + if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
|
| + // Variable CE, shift it to quaternary level.
|
| + // Ignore all following primary ignorables, and shift further variable CEs.
|
| + anyVariable = TRUE;
|
| + do {
|
| + // Store only the primary of the variable CE.
|
| + left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
|
| + for(;;) {
|
| + ce = left.nextCE(errorCode);
|
| + leftPrimary = (uint32_t)(ce >> 32);
|
| + if(leftPrimary == 0) {
|
| + left.setCurrentCE(0);  // Replace a primary ignorable that follows a variable CE with 0.
|
| + } else {
|
| + break;
|
| + }
|
| + }
|
| + } while(leftPrimary < variableTop &&
|
| + leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
|
| + }
|
| + } while(leftPrimary == 0);  // Keep fetching while the primary weight is 0.
|
| +
|
| + uint32_t rightPrimary;
|
| + do {  // Same fetching logic as for the left side.
|
| + int64_t ce = right.nextCE(errorCode);
|
| + rightPrimary = (uint32_t)(ce >> 32);
|
| + if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
|
| + // Variable CE, shift it to quaternary level.
|
| + // Ignore all following primary ignorables, and shift further variable CEs.
|
| + anyVariable = TRUE;
|
| + do {
|
| + // Store only the primary of the variable CE.
|
| + right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
|
| + for(;;) {
|
| + ce = right.nextCE(errorCode);
|
| + rightPrimary = (uint32_t)(ce >> 32);
|
| + if(rightPrimary == 0) {
|
| + right.setCurrentCE(0);  // Replace a primary ignorable that follows a variable CE with 0.
|
| + } else {
|
| + break;
|
| + }
|
| + }
|
| + } while(rightPrimary < variableTop &&
|
| + rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
|
| + }
|
| + } while(rightPrimary == 0);  // Keep fetching while the primary weight is 0.
|
| +
|
| + if(leftPrimary != rightPrimary) {
|
| + // Return the primary difference, with script reordering.
|
| + const uint8_t *reorderTable = settings.reorderTable;
|
| + if (reorderTable != NULL) {
|
| + leftPrimary = Collation::reorder(reorderTable, leftPrimary);
|
| + rightPrimary = Collation::reorder(reorderTable, rightPrimary);
|
| + }
|
| + return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
|
| + }
|
| + if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }  // Primaries are equal here, so both sides hit NO_CE_PRIMARY together.
|
| + }
|
| + if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }  // errorCode was passed to every nextCE() call above.
|
| +
|
| + // Compare the buffered secondary & tertiary weights.
|
| + // We might skip the secondary level but continue with the case level
|
| + // which is turned on separately.
|
| + if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
|
| + if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
|
| + int32_t leftIndex = 0;
|
| + int32_t rightIndex = 0;
|
| + for(;;) {
|
| + uint32_t leftSecondary;
|
| + do {
|
| + leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;  // Bits 31..16 of the CE.
|
| + } while(leftSecondary == 0);  // Skip CEs whose secondary weight is 0.
|
| +
|
| + uint32_t rightSecondary;
|
| + do {
|
| + rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;  // Bits 31..16 of the CE.
|
| + } while(rightSecondary == 0);  // Skip CEs whose secondary weight is 0.
|
| +
|
| + if(leftSecondary != rightSecondary) {
|
| + return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
|
| + }
|
| + if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }  // Equal weights, and both are the NO_CE weight: level done.
|
| + }
|
| + } else {
|
| + // The backwards secondary level compares secondary weights backwards
|
| + // within segments separated by the merge separator (U+FFFE, weight 02).
|
| + int32_t leftStart = 0;
|
| + int32_t rightStart = 0;
|
| + for(;;) {
|
| + // Find the merge separator or the NO_CE terminator.
|
| + int32_t leftLimit = leftStart;
|
| + uint32_t leftLower32;
|
| + while((leftLower32 = (uint32_t)left.getCE(leftLimit)) >
|
| + Collation::MERGE_SEPARATOR_LOWER32 ||
|
| + leftLower32 == 0) {
|
| + ++leftLimit;
|
| + }
|
| + int32_t rightLimit = rightStart;
|
| + uint32_t rightLower32;
|
| + while((rightLower32 = (uint32_t)right.getCE(rightLimit)) >
|
| + Collation::MERGE_SEPARATOR_LOWER32 ||
|
| + rightLower32 == 0) {
|
| + ++rightLimit;
|
| + }
|
| +
|
| + // Compare the segments.
|
| + int32_t leftIndex = leftLimit;
|
| + int32_t rightIndex = rightLimit;
|
| + for(;;) {
|
| + int32_t leftSecondary = 0;  // Stays 0 if the segment start is reached without a non-zero weight.
|
| + while(leftSecondary == 0 && leftIndex > leftStart) {
|
| + leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;  // Walk backwards from the segment limit.
|
| + }
|
| +
|
| + int32_t rightSecondary = 0;  // Stays 0 if the segment start is reached without a non-zero weight.
|
| + while(rightSecondary == 0 && rightIndex > rightStart) {
|
| + rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;  // Walk backwards from the segment limit.
|
| + }
|
| +
|
| + if(leftSecondary != rightSecondary) {
|
| + return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
|
| + }
|
| + if(leftSecondary == 0) { break; }  // Both are 0: both segments are exhausted without a difference.
|
| + }
|
| +
|
| + // Did we reach the end of either string?
|
| + // Both strings have the same number of merge separators,
|
| + // or else there would have been a primary-level difference.
|
| + U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
|
| + if(left.getCE(leftLimit) == Collation::NO_CE) { break; }
|
| + // Skip both merge separators and continue.
|
| + leftStart = leftLimit + 1;
|
| + rightStart = rightLimit + 1;
|
| + }
|
| + }
|
| + }
|
| +
|
| + if((options & CollationSettings::CASE_LEVEL) != 0) {
|
| + int32_t strength = CollationSettings::getStrength(options);
|
| + int32_t leftIndex = 0;
|
| + int32_t rightIndex = 0;
|
| + for(;;) {
|
| + uint32_t leftCase, leftLower32, rightCase;  // leftLower32 keeps the unmasked word for the end-of-level test below.
|
| + if(strength == UCOL_PRIMARY) {
|
| + // Primary+caseLevel: Ignore case level weights of primary ignorables.
|
| + // Otherwise we would get a-umlaut > a
|
| + // which is not desirable for accent-insensitive sorting.
|
| + // Check for (lower 32 bits) == 0 as well because variable CEs are stored
|
| + // with only primary weights.
|
| + int64_t ce;
|
| + do {
|
| + ce = left.getCE(leftIndex++);
|
| + leftCase = (uint32_t)ce;
|
| + } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
|
| + leftLower32 = leftCase;
|
| + leftCase &= 0xc000;  // Keep only bits 15..14, which are compared as the case weight.
|
| +
|
| + do {
|
| + ce = right.getCE(rightIndex++);
|
| + rightCase = (uint32_t)ce;
|
| + } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
|
| + rightCase &= 0xc000;  // Keep only bits 15..14, which are compared as the case weight.
|
| + } else {
|
| + // Secondary+caseLevel: By analogy with the above,
|
| + // ignore case level weights of secondary ignorables.
|
| + //
|
| + // Note: A tertiary CE has uppercase case bits (0.0.ut)
|
| + // to keep tertiary+caseFirst well-formed.
|
| + //
|
| + // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
|
| + // Otherwise a tertiary CE's uppercase would be no greater than
|
| + // a primary/secondary CE's uppercase.
|
| + // (See UCA well-formedness condition 2.)
|
| + // We could construct a special case weight higher than uppercase,
|
| + // but it's simpler to always ignore case weights of secondary ignorables,
|
| + // turning 0.0.ut into 0.0.0.t.
|
| + // (See LDML Collation, Case Parameters.)
|
| + do {
|
| + leftCase = (uint32_t)left.getCE(leftIndex++);
|
| + } while(leftCase <= 0xffff);  // Lower 32 bits <= 0xffff means bits 31..16 (the secondary weight) are 0.
|
| + leftLower32 = leftCase;
|
| + leftCase &= 0xc000;  // Keep only bits 15..14, which are compared as the case weight.
|
| +
|
| + do {
|
| + rightCase = (uint32_t)right.getCE(rightIndex++);
|
| + } while(rightCase <= 0xffff);  // Lower 32 bits <= 0xffff means bits 31..16 (the secondary weight) are 0.
|
| + rightCase &= 0xc000;  // Keep only bits 15..14, which are compared as the case weight.
|
| + }
|
| +
|
| + // No need to handle NO_CE and MERGE_SEPARATOR specially:
|
| + // There is one case weight for each previous-level weight,
|
| + // so level length differences were handled there.
|
| + if(leftCase != rightCase) {
|
| + if((options & CollationSettings::UPPER_FIRST) == 0) {
|
| + return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
|
| + } else {
|
| + return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;  // UPPER_FIRST inverts the result of the case comparison.
|
| + }
|
| + }
|
| + if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }  // Bits 31..16 of the left word equal the NO_CE weight: level done.
|
| + }
|
| + }
|
| + if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }  // Strength stops before the tertiary level.
|
| +
|
| + uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
|
| +
|
| + int32_t leftIndex = 0;
|
| + int32_t rightIndex = 0;
|
| + uint32_t anyQuaternaries = 0;  // OR of every lower-32-bit CE word read in the loop below; bits 7..6 are tested afterwards.
|
| + for(;;) {
|
| + uint32_t leftLower32, leftTertiary;
|
| + do {
|
| + leftLower32 = (uint32_t)left.getCE(leftIndex++);
|
| + anyQuaternaries |= leftLower32;
|
| + U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
|
| + (leftLower32 & 0xc0c0) == 0);
|
| + leftTertiary = leftLower32 & tertiaryMask;
|
| + } while(leftTertiary == 0);  // Skip CEs whose masked tertiary weight is 0.
|
| +
|
| + uint32_t rightLower32, rightTertiary;
|
| + do {
|
| + rightLower32 = (uint32_t)right.getCE(rightIndex++);
|
| + anyQuaternaries |= rightLower32;
|
| + U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
|
| + (rightLower32 & 0xc0c0) == 0);
|
| + rightTertiary = rightLower32 & tertiaryMask;
|
| + } while(rightTertiary == 0);  // Skip CEs whose masked tertiary weight is 0.
|
| +
|
| + if(leftTertiary != rightTertiary) {
|
| + if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
|
| + // Pass through NO_CE and MERGE_SEPARATOR
|
| + // and keep real tertiary weights larger than the MERGE_SEPARATOR.
|
| + // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
|
| + // to keep tertiary CEs well-formed.
|
| + // Their case+tertiary weights must be greater than those of
|
| + // primary and secondary CEs.
|
| + if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
|
| + if(leftLower32 > 0xffff) {  // Bits 31..16 are non-zero, i.e. the CE has a secondary weight.
|
| + leftTertiary ^= 0xc000;
|
| + } else {
|
| + leftTertiary += 0x4000;
|
| + }
|
| + }
|
| + if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
|
| + if(rightLower32 > 0xffff) {  // Bits 31..16 are non-zero, i.e. the CE has a secondary weight.
|
| + rightTertiary ^= 0xc000;
|
| + } else {
|
| + rightTertiary += 0x4000;
|
| + }
|
| + }
|
| + }
|
| + return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
|
| + }
|
| + if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }  // Equal weights, and both are the NO_CE weight: level done.
|
| + }
|
| + if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }  // Strength stops before the quaternary level.
|
| +
|
| + if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
|
| + // If there are no "variable" CEs and no non-zero quaternary weights,
|
| + // then there are no quaternary differences.
|
| + return UCOL_EQUAL;
|
| + }
|
| +
|
| + leftIndex = 0;  // Restart from the first buffered CE on both sides.
|
| + rightIndex = 0;
|
| + for(;;) {
|
| + uint32_t leftQuaternary;
|
| + do {
|
| + int64_t ce = left.getCE(leftIndex++);
|
| + leftQuaternary = (uint32_t)ce & 0xffff;  // Start from the low 16 bits of the CE.
|
| + if(leftQuaternary == 0) {
|
| + // Variable primary or completely ignorable.
|
| + leftQuaternary = (uint32_t)(ce >> 32);
|
| + } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
| + // Leave NO_CE or MERGE_SEPARATOR as is.
|
| + } else {
|
| + // Regular CE, not tertiary ignorable.
|
| + // Preserve the quaternary weight in bits 7..6.
|
| + leftQuaternary |= 0xffffff3f;
|
| + }
|
| + } while(leftQuaternary == 0);  // Skip CEs that yield no quaternary weight.
|
| +
|
| + uint32_t rightQuaternary;
|
| + do {
|
| + int64_t ce = right.getCE(rightIndex++);
|
| + rightQuaternary = (uint32_t)ce & 0xffff;  // Start from the low 16 bits of the CE.
|
| + if(rightQuaternary == 0) {
|
| + // Variable primary or completely ignorable.
|
| + rightQuaternary = (uint32_t)(ce >> 32);
|
| + } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
|
| + // Leave NO_CE or MERGE_SEPARATOR as is.
|
| + } else {
|
| + // Regular CE, not tertiary ignorable.
|
| + // Preserve the quaternary weight in bits 7..6.
|
| + rightQuaternary |= 0xffffff3f;
|
| + }
|
| + } while(rightQuaternary == 0);  // Skip CEs that yield no quaternary weight.
|
| +
|
| + if(leftQuaternary != rightQuaternary) {
|
| + // Return the difference, with script reordering.
|
| + const uint8_t *reorderTable = settings.reorderTable;
|
| + if (reorderTable != NULL) {
|
| + leftQuaternary = Collation::reorder(reorderTable, leftQuaternary);
|
| + rightQuaternary = Collation::reorder(reorderTable, rightQuaternary);
|
| + }
|
| + return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
|
| + }
|
| + if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; }  // Equal weights, and both are the NO_CE weight: level done.
|
| + }
|
| + return UCOL_EQUAL;  // No difference found on any compared level.
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif // !UCONFIG_NO_COLLATION
|
|
|