OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 1996-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationcompare.cpp |
| 7 * |
| 8 * created on: 2012feb14 with new and old collation code |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/ucol.h" |
| 17 #include "cmemory.h" |
| 18 #include "collation.h" |
| 19 #include "collationcompare.h" |
| 20 #include "collationiterator.h" |
| 21 #include "collationsettings.h" |
| 22 #include "uassert.h" |
| 23 |
| 24 U_NAMESPACE_BEGIN |
| 25 |
| 26 UCollationResult |
| 27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
or &right, |
| 28 const CollationSettings &settings, |
| 29 UErrorCode &errorCode) { |
| 30 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } |
| 31 |
| 32 int32_t options = settings.options; |
| 33 uint32_t variableTop; |
| 34 if((options & CollationSettings::ALTERNATE_MASK) == 0) { |
| 35 variableTop = 0; |
| 36 } else { |
| 37 // +1 so that we can use "<" and primary ignorables test out early. |
| 38 variableTop = settings.variableTop + 1; |
| 39 } |
| 40 UBool anyVariable = FALSE; |
| 41 |
| 42 // Fetch CEs, compare primaries, store secondary & tertiary weights. |
| 43 U_ALIGN_CODE(16); |
| 44 for(;;) { |
| 45 // We fetch CEs until we get a non-ignorable primary or reach the end. |
| 46 uint32_t leftPrimary; |
| 47 do { |
| 48 int64_t ce = left.nextCE(errorCode); |
| 49 leftPrimary = (uint32_t)(ce >> 32); |
| 50 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPAR
ATOR_PRIMARY) { |
| 51 // Variable CE, shift it to quaternary level. |
| 52 // Ignore all following primary ignorables, and shift further va
riable CEs. |
| 53 anyVariable = TRUE; |
| 54 do { |
| 55 // Store only the primary of the variable CE. |
| 56 left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); |
| 57 for(;;) { |
| 58 ce = left.nextCE(errorCode); |
| 59 leftPrimary = (uint32_t)(ce >> 32); |
| 60 if(leftPrimary == 0) { |
| 61 left.setCurrentCE(0); |
| 62 } else { |
| 63 break; |
| 64 } |
| 65 } |
| 66 } while(leftPrimary < variableTop && |
| 67 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); |
| 68 } |
| 69 } while(leftPrimary == 0); |
| 70 |
| 71 uint32_t rightPrimary; |
| 72 do { |
| 73 int64_t ce = right.nextCE(errorCode); |
| 74 rightPrimary = (uint32_t)(ce >> 32); |
| 75 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEP
ARATOR_PRIMARY) { |
| 76 // Variable CE, shift it to quaternary level. |
| 77 // Ignore all following primary ignorables, and shift further va
riable CEs. |
| 78 anyVariable = TRUE; |
| 79 do { |
| 80 // Store only the primary of the variable CE. |
| 81 right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); |
| 82 for(;;) { |
| 83 ce = right.nextCE(errorCode); |
| 84 rightPrimary = (uint32_t)(ce >> 32); |
| 85 if(rightPrimary == 0) { |
| 86 right.setCurrentCE(0); |
| 87 } else { |
| 88 break; |
| 89 } |
| 90 } |
| 91 } while(rightPrimary < variableTop && |
| 92 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); |
| 93 } |
| 94 } while(rightPrimary == 0); |
| 95 |
| 96 if(leftPrimary != rightPrimary) { |
| 97 // Return the primary difference, with script reordering. |
| 98 const uint8_t *reorderTable = settings.reorderTable; |
| 99 if (reorderTable != NULL) { |
| 100 leftPrimary = Collation::reorder(reorderTable, leftPrimary); |
| 101 rightPrimary = Collation::reorder(reorderTable, rightPrimary); |
| 102 } |
| 103 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
| 104 } |
| 105 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } |
| 106 } |
| 107 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } |
| 108 |
| 109 // Compare the buffered secondary & tertiary weights. |
| 110 // We might skip the secondary level but continue with the case level |
| 111 // which is turned on separately. |
| 112 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
| 113 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { |
| 114 int32_t leftIndex = 0; |
| 115 int32_t rightIndex = 0; |
| 116 for(;;) { |
| 117 uint32_t leftSecondary; |
| 118 do { |
| 119 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; |
| 120 } while(leftSecondary == 0); |
| 121 |
| 122 uint32_t rightSecondary; |
| 123 do { |
| 124 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16
; |
| 125 } while(rightSecondary == 0); |
| 126 |
| 127 if(leftSecondary != rightSecondary) { |
| 128 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_G
REATER; |
| 129 } |
| 130 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } |
| 131 } |
| 132 } else { |
| 133 // The backwards secondary level compares secondary weights backward
s |
| 134 // within segments separated by the merge separator (U+FFFE, weight
02). |
| 135 int32_t leftStart = 0; |
| 136 int32_t rightStart = 0; |
| 137 for(;;) { |
| 138 // Find the merge separator or the NO_CE terminator. |
| 139 int32_t leftLimit = leftStart; |
| 140 uint32_t leftLower32; |
| 141 while((leftLower32 = (uint32_t)left.getCE(leftLimit)) > |
| 142 Collation::MERGE_SEPARATOR_LOWER32 || |
| 143 leftLower32 == 0) { |
| 144 ++leftLimit; |
| 145 } |
| 146 int32_t rightLimit = rightStart; |
| 147 uint32_t rightLower32; |
| 148 while((rightLower32 = (uint32_t)right.getCE(rightLimit)) > |
| 149 Collation::MERGE_SEPARATOR_LOWER32 || |
| 150 rightLower32 == 0) { |
| 151 ++rightLimit; |
| 152 } |
| 153 |
| 154 // Compare the segments. |
| 155 int32_t leftIndex = leftLimit; |
| 156 int32_t rightIndex = rightLimit; |
| 157 for(;;) { |
| 158 int32_t leftSecondary = 0; |
| 159 while(leftSecondary == 0 && leftIndex > leftStart) { |
| 160 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 1
6; |
| 161 } |
| 162 |
| 163 int32_t rightSecondary = 0; |
| 164 while(rightSecondary == 0 && rightIndex > rightStart) { |
| 165 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >
> 16; |
| 166 } |
| 167 |
| 168 if(leftSecondary != rightSecondary) { |
| 169 return (leftSecondary < rightSecondary) ? UCOL_LESS : UC
OL_GREATER; |
| 170 } |
| 171 if(leftSecondary == 0) { break; } |
| 172 } |
| 173 |
| 174 // Did we reach the end of either string? |
| 175 // Both strings have the same number of merge separators, |
| 176 // or else there would have been a primary-level difference. |
| 177 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); |
| 178 if(left.getCE(leftLimit) == Collation::NO_CE) { break; } |
| 179 // Skip both merge separators and continue. |
| 180 leftStart = leftLimit + 1; |
| 181 rightStart = rightLimit + 1; |
| 182 } |
| 183 } |
| 184 } |
| 185 |
| 186 if((options & CollationSettings::CASE_LEVEL) != 0) { |
| 187 int32_t strength = CollationSettings::getStrength(options); |
| 188 int32_t leftIndex = 0; |
| 189 int32_t rightIndex = 0; |
| 190 for(;;) { |
| 191 uint32_t leftCase, leftLower32, rightCase; |
| 192 if(strength == UCOL_PRIMARY) { |
| 193 // Primary+caseLevel: Ignore case level weights of primary ignor
ables. |
| 194 // Otherwise we would get a-umlaut > a |
| 195 // which is not desirable for accent-insensitive sorting. |
| 196 // Check for (lower 32 bits) == 0 as well because variable CEs a
re stored |
| 197 // with only primary weights. |
| 198 int64_t ce; |
| 199 do { |
| 200 ce = left.getCE(leftIndex++); |
| 201 leftCase = (uint32_t)ce; |
| 202 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); |
| 203 leftLower32 = leftCase; |
| 204 leftCase &= 0xc000; |
| 205 |
| 206 do { |
| 207 ce = right.getCE(rightIndex++); |
| 208 rightCase = (uint32_t)ce; |
| 209 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); |
| 210 rightCase &= 0xc000; |
| 211 } else { |
| 212 // Secondary+caseLevel: By analogy with the above, |
| 213 // ignore case level weights of secondary ignorables. |
| 214 // |
| 215 // Note: A tertiary CE has uppercase case bits (0.0.ut) |
| 216 // to keep tertiary+caseFirst well-formed. |
| 217 // |
| 218 // Tertiary+caseLevel: Also ignore case level weights of seconda
ry ignorables. |
| 219 // Otherwise a tertiary CE's uppercase would be no greater than |
| 220 // a primary/secondary CE's uppercase. |
| 221 // (See UCA well-formedness condition 2.) |
| 222 // We could construct a special case weight higher than uppercas
e, |
| 223 // but it's simpler to always ignore case weights of secondary i
gnorables, |
| 224 // turning 0.0.ut into 0.0.0.t. |
| 225 // (See LDML Collation, Case Parameters.) |
| 226 do { |
| 227 leftCase = (uint32_t)left.getCE(leftIndex++); |
| 228 } while(leftCase <= 0xffff); |
| 229 leftLower32 = leftCase; |
| 230 leftCase &= 0xc000; |
| 231 |
| 232 do { |
| 233 rightCase = (uint32_t)right.getCE(rightIndex++); |
| 234 } while(rightCase <= 0xffff); |
| 235 rightCase &= 0xc000; |
| 236 } |
| 237 |
| 238 // No need to handle NO_CE and MERGE_SEPARATOR specially: |
| 239 // There is one case weight for each previous-level weight, |
| 240 // so level length differences were handled there. |
| 241 if(leftCase != rightCase) { |
| 242 if((options & CollationSettings::UPPER_FIRST) == 0) { |
| 243 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
| 244 } else { |
| 245 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
| 246 } |
| 247 } |
| 248 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } |
| 249 } |
| 250 } |
| 251 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_
EQUAL; } |
| 252 |
| 253 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); |
| 254 |
| 255 int32_t leftIndex = 0; |
| 256 int32_t rightIndex = 0; |
| 257 uint32_t anyQuaternaries = 0; |
| 258 for(;;) { |
| 259 uint32_t leftLower32, leftTertiary; |
| 260 do { |
| 261 leftLower32 = (uint32_t)left.getCE(leftIndex++); |
| 262 anyQuaternaries |= leftLower32; |
| 263 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || |
| 264 (leftLower32 & 0xc0c0) == 0); |
| 265 leftTertiary = leftLower32 & tertiaryMask; |
| 266 } while(leftTertiary == 0); |
| 267 |
| 268 uint32_t rightLower32, rightTertiary; |
| 269 do { |
| 270 rightLower32 = (uint32_t)right.getCE(rightIndex++); |
| 271 anyQuaternaries |= rightLower32; |
| 272 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || |
| 273 (rightLower32 & 0xc0c0) == 0); |
| 274 rightTertiary = rightLower32 & tertiaryMask; |
| 275 } while(rightTertiary == 0); |
| 276 |
| 277 if(leftTertiary != rightTertiary) { |
| 278 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
| 279 // Pass through NO_CE and MERGE_SEPARATOR |
| 280 // and keep real tertiary weights larger than the MERGE_SEPARATO
R. |
| 281 // Do not change the artificial uppercase weight of a tertiary C
E (0.0.ut), |
| 282 // to keep tertiary CEs well-formed. |
| 283 // Their case+tertiary weights must be greater than those of |
| 284 // primary and secondary CEs. |
| 285 if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 286 if(leftLower32 > 0xffff) { |
| 287 leftTertiary ^= 0xc000; |
| 288 } else { |
| 289 leftTertiary += 0x4000; |
| 290 } |
| 291 } |
| 292 if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 293 if(rightLower32 > 0xffff) { |
| 294 rightTertiary ^= 0xc000; |
| 295 } else { |
| 296 rightTertiary += 0x4000; |
| 297 } |
| 298 } |
| 299 } |
| 300 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
| 301 } |
| 302 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } |
| 303 } |
| 304 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_E
QUAL; } |
| 305 |
| 306 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { |
| 307 // If there are no "variable" CEs and no non-zero quaternary weights, |
| 308 // then there are no quaternary differences. |
| 309 return UCOL_EQUAL; |
| 310 } |
| 311 |
| 312 leftIndex = 0; |
| 313 rightIndex = 0; |
| 314 for(;;) { |
| 315 uint32_t leftQuaternary; |
| 316 do { |
| 317 int64_t ce = left.getCE(leftIndex++); |
| 318 leftQuaternary = (uint32_t)ce & 0xffff; |
| 319 if(leftQuaternary == 0) { |
| 320 // Variable primary or completely ignorable. |
| 321 leftQuaternary = (uint32_t)(ce >> 32); |
| 322 } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 323 // Leave NO_CE or MERGE_SEPARATOR as is. |
| 324 } else { |
| 325 // Regular CE, not tertiary ignorable. |
| 326 // Preserve the quaternary weight in bits 7..6. |
| 327 leftQuaternary |= 0xffffff3f; |
| 328 } |
| 329 } while(leftQuaternary == 0); |
| 330 |
| 331 uint32_t rightQuaternary; |
| 332 do { |
| 333 int64_t ce = right.getCE(rightIndex++); |
| 334 rightQuaternary = (uint32_t)ce & 0xffff; |
| 335 if(rightQuaternary == 0) { |
| 336 // Variable primary or completely ignorable. |
| 337 rightQuaternary = (uint32_t)(ce >> 32); |
| 338 } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { |
| 339 // Leave NO_CE or MERGE_SEPARATOR as is. |
| 340 } else { |
| 341 // Regular CE, not tertiary ignorable. |
| 342 // Preserve the quaternary weight in bits 7..6. |
| 343 rightQuaternary |= 0xffffff3f; |
| 344 } |
| 345 } while(rightQuaternary == 0); |
| 346 |
| 347 if(leftQuaternary != rightQuaternary) { |
| 348 // Return the difference, with script reordering. |
| 349 const uint8_t *reorderTable = settings.reorderTable; |
| 350 if (reorderTable != NULL) { |
| 351 leftQuaternary = Collation::reorder(reorderTable, leftQuaternary
); |
| 352 rightQuaternary = Collation::reorder(reorderTable, rightQuaterna
ry); |
| 353 } |
| 354 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER
; |
| 355 } |
| 356 if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; } |
| 357 } |
| 358 return UCOL_EQUAL; |
| 359 } |
| 360 |
| 361 U_NAMESPACE_END |
| 362 |
| 363 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |