Index: source/i18n/collationcompare.cpp |
diff --git a/source/i18n/collationcompare.cpp b/source/i18n/collationcompare.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..6f9107e9db516416fdbdec4493f6cd585d9897ac |
--- /dev/null |
+++ b/source/i18n/collationcompare.cpp |
@@ -0,0 +1,363 @@ |
+/* |
+******************************************************************************* |
+* Copyright (C) 1996-2014, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+******************************************************************************* |
+* collationcompare.cpp |
+* |
+* created on: 2012feb14 with new and old collation code |
+* created by: Markus W. Scherer |
+*/ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_COLLATION |
+ |
+#include "unicode/ucol.h" |
+#include "cmemory.h" |
+#include "collation.h" |
+#include "collationcompare.h" |
+#include "collationiterator.h" |
+#include "collationsettings.h" |
+#include "uassert.h" |
+ |
+U_NAMESPACE_BEGIN |
+ |
+UCollationResult |
+CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, |
+ const CollationSettings &settings, |
+ UErrorCode &errorCode) { |
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } |
+ |
+ int32_t options = settings.options; |
+ uint32_t variableTop; |
+ if((options & CollationSettings::ALTERNATE_MASK) == 0) { |
+ variableTop = 0; |
+ } else { |
+ // +1 so that we can use "<" and primary ignorables test out early. |
+ variableTop = settings.variableTop + 1; |
+ } |
+ UBool anyVariable = FALSE; |
+ |
+ // Fetch CEs, compare primaries, store secondary & tertiary weights. |
+ U_ALIGN_CODE(16); |
+ for(;;) { |
+ // We fetch CEs until we get a non-ignorable primary or reach the end. |
+ uint32_t leftPrimary; |
+ do { |
+ int64_t ce = left.nextCE(errorCode); |
+ leftPrimary = (uint32_t)(ce >> 32); |
+ if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { |
+ // Variable CE, shift it to quaternary level. |
+ // Ignore all following primary ignorables, and shift further variable CEs. |
+ anyVariable = TRUE; |
+ do { |
+ // Store only the primary of the variable CE. |
+ left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); |
+ for(;;) { |
+ ce = left.nextCE(errorCode); |
+ leftPrimary = (uint32_t)(ce >> 32); |
+ if(leftPrimary == 0) { |
+ left.setCurrentCE(0); |
+ } else { |
+ break; |
+ } |
+ } |
+ } while(leftPrimary < variableTop && |
+ leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); |
+ } |
+ } while(leftPrimary == 0); |
+ |
+ uint32_t rightPrimary; |
+ do { |
+ int64_t ce = right.nextCE(errorCode); |
+ rightPrimary = (uint32_t)(ce >> 32); |
+ if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { |
+ // Variable CE, shift it to quaternary level. |
+ // Ignore all following primary ignorables, and shift further variable CEs. |
+ anyVariable = TRUE; |
+ do { |
+ // Store only the primary of the variable CE. |
+ right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); |
+ for(;;) { |
+ ce = right.nextCE(errorCode); |
+ rightPrimary = (uint32_t)(ce >> 32); |
+ if(rightPrimary == 0) { |
+ right.setCurrentCE(0); |
+ } else { |
+ break; |
+ } |
+ } |
+ } while(rightPrimary < variableTop && |
+ rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); |
+ } |
+ } while(rightPrimary == 0); |
+ |
+ if(leftPrimary != rightPrimary) { |
+ // Return the primary difference, with script reordering. |
+ const uint8_t *reorderTable = settings.reorderTable; |
+ if (reorderTable != NULL) { |
+ leftPrimary = Collation::reorder(reorderTable, leftPrimary); |
+ rightPrimary = Collation::reorder(reorderTable, rightPrimary); |
+ } |
+ return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
+ } |
+ if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } |
+ } |
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } |
+ |
+ // Compare the buffered secondary & tertiary weights. |
+ // We might skip the secondary level but continue with the case level |
+ // which is turned on separately. |
+ if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
+ if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { |
+ int32_t leftIndex = 0; |
+ int32_t rightIndex = 0; |
+ for(;;) { |
+ uint32_t leftSecondary; |
+ do { |
+ leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; |
+ } while(leftSecondary == 0); |
+ |
+ uint32_t rightSecondary; |
+ do { |
+ rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; |
+ } while(rightSecondary == 0); |
+ |
+ if(leftSecondary != rightSecondary) { |
+ return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
+ } |
+ if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } |
+ } |
+ } else { |
+ // The backwards secondary level compares secondary weights backwards |
+ // within segments separated by the merge separator (U+FFFE, weight 02). |
+ int32_t leftStart = 0; |
+ int32_t rightStart = 0; |
+ for(;;) { |
+ // Find the merge separator or the NO_CE terminator. |
+ int32_t leftLimit = leftStart; |
+ uint32_t leftLower32; |
+ while((leftLower32 = (uint32_t)left.getCE(leftLimit)) > |
+ Collation::MERGE_SEPARATOR_LOWER32 || |
+ leftLower32 == 0) { |
+ ++leftLimit; |
+ } |
+ int32_t rightLimit = rightStart; |
+ uint32_t rightLower32; |
+ while((rightLower32 = (uint32_t)right.getCE(rightLimit)) > |
+ Collation::MERGE_SEPARATOR_LOWER32 || |
+ rightLower32 == 0) { |
+ ++rightLimit; |
+ } |
+ |
+ // Compare the segments. |
+ int32_t leftIndex = leftLimit; |
+ int32_t rightIndex = rightLimit; |
+ for(;;) { |
+ int32_t leftSecondary = 0; |
+ while(leftSecondary == 0 && leftIndex > leftStart) { |
+ leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; |
+ } |
+ |
+ int32_t rightSecondary = 0; |
+ while(rightSecondary == 0 && rightIndex > rightStart) { |
+ rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; |
+ } |
+ |
+ if(leftSecondary != rightSecondary) { |
+ return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
+ } |
+ if(leftSecondary == 0) { break; } |
+ } |
+ |
+ // Did we reach the end of either string? |
+ // Both strings have the same number of merge separators, |
+ // or else there would have been a primary-level difference. |
+ U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); |
+ if(left.getCE(leftLimit) == Collation::NO_CE) { break; } |
+ // Skip both merge separators and continue. |
+ leftStart = leftLimit + 1; |
+ rightStart = rightLimit + 1; |
+ } |
+ } |
+ } |
+ |
+ if((options & CollationSettings::CASE_LEVEL) != 0) { |
+ int32_t strength = CollationSettings::getStrength(options); |
+ int32_t leftIndex = 0; |
+ int32_t rightIndex = 0; |
+ for(;;) { |
+ uint32_t leftCase, leftLower32, rightCase; |
+ if(strength == UCOL_PRIMARY) { |
+ // Primary+caseLevel: Ignore case level weights of primary ignorables. |
+ // Otherwise we would get a-umlaut > a |
+ // which is not desirable for accent-insensitive sorting. |
+ // Check for (lower 32 bits) == 0 as well because variable CEs are stored |
+ // with only primary weights. |
+ int64_t ce; |
+ do { |
+ ce = left.getCE(leftIndex++); |
+ leftCase = (uint32_t)ce; |
+ } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); |
+ leftLower32 = leftCase; |
+ leftCase &= 0xc000; |
+ |
+ do { |
+ ce = right.getCE(rightIndex++); |
+ rightCase = (uint32_t)ce; |
+ } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); |
+ rightCase &= 0xc000; |
+ } else { |
+ // Secondary+caseLevel: By analogy with the above, |
+ // ignore case level weights of secondary ignorables. |
+ // |
+ // Note: A tertiary CE has uppercase case bits (0.0.ut) |
+ // to keep tertiary+caseFirst well-formed. |
+ // |
+ // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. |
+ // Otherwise a tertiary CE's uppercase would be no greater than |
+ // a primary/secondary CE's uppercase. |
+ // (See UCA well-formedness condition 2.) |
+ // We could construct a special case weight higher than uppercase, |
+ // but it's simpler to always ignore case weights of secondary ignorables, |
+ // turning 0.0.ut into 0.0.0.t. |
+ // (See LDML Collation, Case Parameters.) |
+ do { |
+ leftCase = (uint32_t)left.getCE(leftIndex++); |
+ } while(leftCase <= 0xffff); |
+ leftLower32 = leftCase; |
+ leftCase &= 0xc000; |
+ |
+ do { |
+ rightCase = (uint32_t)right.getCE(rightIndex++); |
+ } while(rightCase <= 0xffff); |
+ rightCase &= 0xc000; |
+ } |
+ |
+ // No need to handle NO_CE and MERGE_SEPARATOR specially: |
+ // There is one case weight for each previous-level weight, |
+ // so level length differences were handled there. |
+ if(leftCase != rightCase) { |
+ if((options & CollationSettings::UPPER_FIRST) == 0) { |
+ return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
+ } else { |
+ return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
+ } |
+ } |
+ if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } |
+ } |
+ } |
+ if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } |
+ |
+ uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); |
+ |
+ int32_t leftIndex = 0; |
+ int32_t rightIndex = 0; |
+ uint32_t anyQuaternaries = 0; |
+ for(;;) { |
+ uint32_t leftLower32, leftTertiary; |
+ do { |
+ leftLower32 = (uint32_t)left.getCE(leftIndex++); |
+ anyQuaternaries |= leftLower32; |
+ U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || |
+ (leftLower32 & 0xc0c0) == 0); |
+ leftTertiary = leftLower32 & tertiaryMask; |
+ } while(leftTertiary == 0); |
+ |
+ uint32_t rightLower32, rightTertiary; |
+ do { |
+ rightLower32 = (uint32_t)right.getCE(rightIndex++); |
+ anyQuaternaries |= rightLower32; |
+ U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || |
+ (rightLower32 & 0xc0c0) == 0); |
+ rightTertiary = rightLower32 & tertiaryMask; |
+ } while(rightTertiary == 0); |
+ |
+ if(leftTertiary != rightTertiary) { |
+ if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
+ // Pass through NO_CE and MERGE_SEPARATOR |
+ // and keep real tertiary weights larger than the MERGE_SEPARATOR. |
+ // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), |
+ // to keep tertiary CEs well-formed. |
+ // Their case+tertiary weights must be greater than those of |
+ // primary and secondary CEs. |
+ if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { |
+ if(leftLower32 > 0xffff) { |
+ leftTertiary ^= 0xc000; |
+ } else { |
+ leftTertiary += 0x4000; |
+ } |
+ } |
+ if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { |
+ if(rightLower32 > 0xffff) { |
+ rightTertiary ^= 0xc000; |
+ } else { |
+ rightTertiary += 0x4000; |
+ } |
+ } |
+ } |
+ return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
+ } |
+ if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } |
+ } |
+ if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } |
+ |
+ if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { |
+ // If there are no "variable" CEs and no non-zero quaternary weights, |
+ // then there are no quaternary differences. |
+ return UCOL_EQUAL; |
+ } |
+ |
+ leftIndex = 0; |
+ rightIndex = 0; |
+ for(;;) { |
+ uint32_t leftQuaternary; |
+ do { |
+ int64_t ce = left.getCE(leftIndex++); |
+ leftQuaternary = (uint32_t)ce & 0xffff; |
+ if(leftQuaternary == 0) { |
+ // Variable primary or completely ignorable. |
+ leftQuaternary = (uint32_t)(ce >> 32); |
+ } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
+ // Leave NO_CE or MERGE_SEPARATOR as is. |
+ } else { |
+ // Regular CE, not tertiary ignorable. |
+ // Preserve the quaternary weight in bits 7..6. |
+ leftQuaternary |= 0xffffff3f; |
+ } |
+ } while(leftQuaternary == 0); |
+ |
+ uint32_t rightQuaternary; |
+ do { |
+ int64_t ce = right.getCE(rightIndex++); |
+ rightQuaternary = (uint32_t)ce & 0xffff; |
+ if(rightQuaternary == 0) { |
+ // Variable primary or completely ignorable. |
+ rightQuaternary = (uint32_t)(ce >> 32); |
+ } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
+ // Leave NO_CE or MERGE_SEPARATOR as is. |
+ } else { |
+ // Regular CE, not tertiary ignorable. |
+ // Preserve the quaternary weight in bits 7..6. |
+ rightQuaternary |= 0xffffff3f; |
+ } |
+ } while(rightQuaternary == 0); |
+ |
+ if(leftQuaternary != rightQuaternary) { |
+ // Return the difference, with script reordering. |
+ const uint8_t *reorderTable = settings.reorderTable; |
+ if (reorderTable != NULL) { |
+ leftQuaternary = Collation::reorder(reorderTable, leftQuaternary); |
+ rightQuaternary = Collation::reorder(reorderTable, rightQuaternary); |
+ } |
+ return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; |
+ } |
+ if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; } |
+ } |
+ return UCOL_EQUAL; |
+} |
+ |
+U_NAMESPACE_END |
+ |
+#endif // !UCONFIG_NO_COLLATION |