OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationtest.cpp |
| 7 * |
| 8 * created on: 2012apr27 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_COLLATION |
| 15 |
| 16 #include "unicode/coll.h" |
| 17 #include "unicode/errorcode.h" |
| 18 #include "unicode/localpointer.h" |
| 19 #include "unicode/normalizer2.h" |
| 20 #include "unicode/sortkey.h" |
| 21 #include "unicode/std_string.h" |
| 22 #include "unicode/strenum.h" |
| 23 #include "unicode/tblcoll.h" |
| 24 #include "unicode/uiter.h" |
| 25 #include "unicode/uniset.h" |
| 26 #include "unicode/unistr.h" |
| 27 #include "unicode/usetiter.h" |
| 28 #include "unicode/ustring.h" |
| 29 #include "charstr.h" |
| 30 #include "cmemory.h" |
| 31 #include "collation.h" |
| 32 #include "collationdata.h" |
| 33 #include "collationfcd.h" |
| 34 #include "collationiterator.h" |
| 35 #include "collationroot.h" |
| 36 #include "collationrootelements.h" |
| 37 #include "collationruleparser.h" |
| 38 #include "collationweights.h" |
| 39 #include "cstring.h" |
| 40 #include "intltest.h" |
| 41 #include "normalizer2impl.h" |
| 42 #include "ucbuf.h" |
| 43 #include "uhash.h" |
| 44 #include "uitercollationiterator.h" |
| 45 #include "utf16collationiterator.h" |
| 46 #include "utf8collationiterator.h" |
| 47 #include "uvectr32.h" |
| 48 #include "uvectr64.h" |
| 49 #include "writesrc.h" |
| 50 |
// TODO: Move to ucbuf.h
// Declares LocalUCHARBUFPointer, a local-pointer type for UCHARBUF that is
// parameterized with ucbuf_close as its close function.
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);

// Forward declaration; the class itself is defined further down in this file
// and is needed here for the checkFCD() signature.
class CodePointIterator;
| 55 |
| 56 // TODO: try to share code with IntlTestCollator; for example, prettify(Collatio
nKey) |
| 57 |
// Test suite for the collation implementation: unit tests for low-level pieces
// (implicit weights, FCD iteration, weight allocation, root elements) plus a
// data-driven test that reads its cases from a file.
class CollationTest : public IntlTest {
public:
    // All pointers start out NULL; the collator is created lazily by the tests.
    CollationTest()
            : fcd(NULL), nfd(NULL),
              fileLineNumber(0),
              coll(NULL) {}

    // The test object owns the current collator.
    ~CollationTest() {
        delete coll;
    }

    // Dispatcher that maps a test index to one of the Test*() methods below.
    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);

    void TestMinMax();
    void TestImplicits();
    void TestNulTerminated();
    void TestIllegalUTF8();
    void TestShortFCDData();
    void TestFCD();
    void TestCollationWeights();
    void TestRootElements();
    void TestTailoredElements();
    void TestDataDriven();

private:
    // Compares the code points delivered by ci against the expected sequence in cpi.
    void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
    // Allocates n weights between the limits and checks the weights that come back.
    void checkAllocWeights(CollationWeights &cw,
                           uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
                           int32_t someLength, int32_t minCount);

    static UnicodeString printSortKey(const uint8_t *p, int32_t length);
    static UnicodeString printCollationKey(const CollationKey &key);

    // Helpers & fields for data-driven test.
    // Character classification for the test file syntax.
    static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
    // TAB, SPACE and U+3000 count as white space.
    static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
    static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
    // Returns the index of the first character of fileLine at or after i
    // for which isSpace() is false.
    int32_t skipSpaces(int32_t i) {
        while(isSpace(fileLine[i])) { ++i; }
        return i;
    }

    UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
    void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
    Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
    void parseAndSetAttribute(IcuTestErrorCode &errorCode);
    void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
    void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
    void setRootCollator(IcuTestErrorCode &errorCode);
    void setLocaleCollator(IcuTestErrorCode &errorCode);

    UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;

    UBool getSortKeyParts(const UChar *s, int32_t length,
                          CharString &dest, int32_t partSize,
                          IcuTestErrorCode &errorCode);
    UBool getCollationKey(const char *norm, const UnicodeString &line,
                          const UChar *s, int32_t length,
                          CollationKey &key, IcuTestErrorCode &errorCode);
    UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                          const UnicodeString &prevString, const UnicodeString &s,
                          UCollationResult expectedOrder, Collation::Level expectedLevel,
                          IcuTestErrorCode &errorCode);
    void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);

    // Normalizer instances; initialized to NULL by the constructor and
    // never deleted by this class.
    const Normalizer2 *fcd, *nfd;
    // State of the data-driven test: current line, its number, current test name.
    UnicodeString fileLine;
    int32_t fileLineNumber;
    UnicodeString fileTestName;
    // Collator under test; deleted in the destructor.
    Collator *coll;
};
| 129 |
// Factory function: returns a newly allocated CollationTest as an IntlTest.
// NOTE(review): presumably the caller takes ownership of the object -- confirm at the call site.
extern IntlTest *createCollationTest() {
    return new CollationTest();
}
| 133 |
// Test dispatcher. Logs the suite name when exec is true, then hands the
// index/exec/name triple to the TESTCASE_AUTO machinery, which registers the
// test methods in the order listed here. The par argument is not used.
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
| 151 |
| 152 void CollationTest::TestMinMax() { |
| 153 IcuTestErrorCode errorCode(*this, "TestMinMax"); |
| 154 |
| 155 setRootCollator(errorCode); |
| 156 if(errorCode.isFailure()) { |
| 157 errorCode.reset(); |
| 158 return; |
| 159 } |
| 160 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll); |
| 161 if(rbc == NULL) { |
| 162 errln("the root collator is not a RuleBasedCollator"); |
| 163 return; |
| 164 } |
| 165 |
| 166 static const UChar s[2] = { 0xfffe, 0xffff }; |
| 167 UVector64 ces(errorCode); |
| 168 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode); |
| 169 errorCode.assertSuccess(); |
| 170 if(ces.size() != 2) { |
| 171 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size()); |
| 172 return; |
| 173 } |
| 174 int64_t ce = ces.elementAti(0); |
| 175 int64_t expected = |
| 176 ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) | |
| 177 Collation::MERGE_SEPARATOR_LOWER32; |
| 178 if(ce != expected) { |
| 179 errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce); |
| 180 } |
| 181 |
| 182 ce = ces.elementAti(1); |
| 183 expected = Collation::makeCE(Collation::MAX_PRIMARY); |
| 184 if(ce != expected) { |
| 185 errln("CE(U+ffff)=%04lx != max..", (long)ce); |
| 186 } |
| 187 } |
| 188 |
| 189 void CollationTest::TestImplicits() { |
| 190 IcuTestErrorCode errorCode(*this, "TestImplicits"); |
| 191 |
| 192 const CollationData *cd = CollationRoot::getData(errorCode); |
| 193 if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) { |
| 194 return; |
| 195 } |
| 196 |
| 197 // Implicit primary weights should be assigned for the following sets, |
| 198 // and sort in ascending order by set and then code point. |
| 199 // See http://www.unicode.org/reports/tr10/#Implicit_Weights |
| 200 |
| 201 // core Han Unified Ideographs |
| 202 UnicodeSet coreHan("[\\p{unified_ideograph}&" |
| 203 "[\\p{Block=CJK_Unified_Ideographs}" |
| 204 "\\p{Block=CJK_Compatibility_Ideographs}]]", |
| 205 errorCode); |
| 206 // all other Unified Han ideographs |
| 207 UnicodeSet otherHan("[\\p{unified ideograph}-" |
| 208 "[\\p{Block=CJK_Unified_Ideographs}" |
| 209 "\\p{Block=CJK_Compatibility_Ideographs}]]", |
| 210 errorCode); |
| 211 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode); |
| 212 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings
. |
| 213 |
| 214 // Starting with CLDR 26/ICU 54, the root Han order may instead be |
| 215 // the Unihan radical-stroke order. |
| 216 // The tests should pass either way, so we only test the order of a small se
t of Han characters |
| 217 // whose radical-stroke order is the same as their code point order. |
| 218 UnicodeSet someHanInCPOrder( |
| 219 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" |
| 220 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3
-\\u50F6]", |
| 221 errorCode); |
| 222 UnicodeSet inOrder(someHanInCPOrder); |
| 223 inOrder.addAll(unassigned).freeze(); |
| 224 if(errorCode.logIfFailureAndReset("UnicodeSet")) { |
| 225 return; |
| 226 } |
| 227 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned }; |
| 228 UChar32 prev = 0; |
| 229 uint32_t prevPrimary = 0; |
| 230 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL); |
| 231 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) { |
| 232 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i])); |
| 233 while(iter->next()) { |
| 234 UChar32 c = iter->getCodepoint(); |
| 235 UnicodeString s(c); |
| 236 ci.setText(s.getBuffer(), s.getBuffer() + s.length()); |
| 237 int64_t ce = ci.nextCE(errorCode); |
| 238 int64_t ce2 = ci.nextCE(errorCode); |
| 239 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { |
| 240 return; |
| 241 } |
| 242 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) { |
| 243 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly o
ne CE", (long)c); |
| 244 continue; |
| 245 } |
| 246 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) { |
| 247 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter
weights: %08lx", |
| 248 (long)c, (long)(ce & 0xffffffff)); |
| 249 continue; |
| 250 } |
| 251 uint32_t primary = (uint32_t)(ce >> 32); |
| 252 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contai
ns(prev)) { |
| 253 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx.."
, |
| 254 (long)c, (long)primary, (long)prev, (long)prevPrimary); |
| 255 } |
| 256 prev = c; |
| 257 prevPrimary = primary; |
| 258 } |
| 259 } |
| 260 } |
| 261 |
| 262 void CollationTest::TestNulTerminated() { |
| 263 IcuTestErrorCode errorCode(*this, "TestNulTerminated"); |
| 264 const CollationData *data = CollationRoot::getData(errorCode); |
| 265 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { |
| 266 return; |
| 267 } |
| 268 |
| 269 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 }; |
| 270 |
| 271 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2); |
| 272 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL); |
| 273 for(int32_t i = 0;; ++i) { |
| 274 int64_t ce1 = ci1.nextCE(errorCode); |
| 275 int64_t ce2 = ci2.nextCE(errorCode); |
| 276 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { |
| 277 return; |
| 278 } |
| 279 if(ce1 != ce2) { |
| 280 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminate
d) at CE %d", (int)i); |
| 281 break; |
| 282 } |
| 283 if(ce1 == Collation::NO_CE) { break; } |
| 284 } |
| 285 } |
| 286 |
| 287 void CollationTest::TestIllegalUTF8() { |
| 288 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8"); |
| 289 |
| 290 setRootCollator(errorCode); |
| 291 if(errorCode.isFailure()) { |
| 292 errorCode.reset(); |
| 293 return; |
| 294 } |
| 295 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode); |
| 296 |
| 297 static const char *strings[] = { |
| 298 // U+FFFD |
| 299 "a\xef\xbf\xbdz", |
| 300 // illegal byte sequences |
| 301 "a\x80z", // trail byte |
| 302 "a\xc1\x81z", // non-shortest form |
| 303 "a\xe0\x82\x83z", // non-shortest form |
| 304 "a\xed\xa0\x80z", // lead surrogate: would be U+D800 |
| 305 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF |
| 306 "a\xf0\x8f\xbf\xbfz", // non-shortest form |
| 307 "a\xf4\x90\x80\x80z" // out of range: would be U+110000 |
| 308 }; |
| 309 |
| 310 StringPiece fffd(strings[0]); |
| 311 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) { |
| 312 StringPiece illegal(strings[i]); |
| 313 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode); |
| 314 if(order != UCOL_EQUAL) { |
| 315 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_
EQUAL", |
| 316 (int)i, order); |
| 317 } |
| 318 } |
| 319 } |
| 320 |
| 321 namespace { |
| 322 |
| 323 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest)
{ |
| 324 for(UChar32 c = 0x10000; c < 0x110000;) { |
| 325 UChar32 next = c + 0x400; |
| 326 if(src.containsSome(c, next - 1)) { |
| 327 dest.add(U16_LEAD(c)); |
| 328 } |
| 329 c = next; |
| 330 } |
| 331 } |
| 332 |
| 333 } // namespace |
| 334 |
| 335 void CollationTest::TestShortFCDData() { |
| 336 // See CollationFCD class comments. |
| 337 IcuTestErrorCode errorCode(*this, "TestShortFCDData"); |
| 338 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode); |
| 339 errorCode.assertSuccess(); |
| 340 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates |
| 341 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); |
| 342 UnicodeSet lccc; // actual |
| 343 for(UChar32 c = 0; c <= 0xffff; ++c) { |
| 344 if(CollationFCD::hasLccc(c)) { lccc.add(c); } |
| 345 } |
| 346 UnicodeSet diff(expectedLccc); |
| 347 diff.removeAll(lccc); |
| 348 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP |
| 349 UnicodeString empty("[]"); |
| 350 UnicodeString diffString; |
| 351 diff.toPattern(diffString, TRUE); |
| 352 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); |
| 353 diff = lccc; |
| 354 diff.removeAll(expectedLccc); |
| 355 diff.toPattern(diffString, TRUE); |
| 356 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, T
RUE); |
| 357 |
| 358 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode); |
| 359 if (errorCode.isSuccess()) { |
| 360 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); |
| 361 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); |
| 362 UnicodeSet tccc; // actual |
| 363 for(UChar32 c = 0; c <= 0xffff; ++c) { |
| 364 if(CollationFCD::hasTccc(c)) { tccc.add(c); } |
| 365 } |
| 366 diff = expectedTccc; |
| 367 diff.removeAll(tccc); |
| 368 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP |
| 369 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffStrin
g); |
| 370 diff = tccc; |
| 371 diff.removeAll(expectedTccc); |
| 372 diff.toPattern(diffString, TRUE); |
| 373 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffStrin
g); |
| 374 } |
| 375 } |
| 376 |
| 377 class CodePointIterator { |
| 378 public: |
| 379 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length
), pos(0) {} |
| 380 void resetToStart() { pos = 0; } |
| 381 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; } |
| 382 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; } |
| 383 int32_t getLength() const { return length; } |
| 384 int getIndex() const { return (int)pos; } |
| 385 private: |
| 386 const UChar32 *cp; |
| 387 int32_t length; |
| 388 int32_t pos; |
| 389 }; |
| 390 |
| 391 void CollationTest::checkFCD(const char *name, |
| 392 CollationIterator &ci, CodePointIterator &cpi) { |
| 393 IcuTestErrorCode errorCode(*this, "checkFCD"); |
| 394 |
| 395 // Iterate forward to the limit. |
| 396 for(;;) { |
| 397 UChar32 c1 = ci.nextCodePoint(errorCode); |
| 398 UChar32 c2 = cpi.next(); |
| 399 if(c1 != c2) { |
| 400 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at
%d", |
| 401 name, (long)c1, (long)c2, cpi.getIndex()); |
| 402 return; |
| 403 } |
| 404 if(c1 < 0) { break; } |
| 405 } |
| 406 |
| 407 // Iterate backward most of the way. |
| 408 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) { |
| 409 UChar32 c1 = ci.previousCodePoint(errorCode); |
| 410 UChar32 c2 = cpi.previous(); |
| 411 if(c1 != c2) { |
| 412 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d", |
| 413 name, (long)c1, (long)c2, cpi.getIndex()); |
| 414 return; |
| 415 } |
| 416 } |
| 417 |
| 418 // Forward again. |
| 419 for(;;) { |
| 420 UChar32 c1 = ci.nextCodePoint(errorCode); |
| 421 UChar32 c2 = cpi.next(); |
| 422 if(c1 != c2) { |
| 423 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d", |
| 424 name, (long)c1, (long)c2, cpi.getIndex()); |
| 425 return; |
| 426 } |
| 427 if(c1 < 0) { break; } |
| 428 } |
| 429 |
| 430 // Iterate backward to the start. |
| 431 for(;;) { |
| 432 UChar32 c1 = ci.previousCodePoint(errorCode); |
| 433 UChar32 c2 = cpi.previous(); |
| 434 if(c1 != c2) { |
| 435 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d", |
| 436 name, (long)c1, (long)c2, cpi.getIndex()); |
| 437 return; |
| 438 } |
| 439 if(c1 < 0) { break; } |
| 440 } |
| 441 } |
| 442 |
| 443 void CollationTest::TestFCD() { |
| 444 IcuTestErrorCode errorCode(*this, "TestFCD"); |
| 445 const CollationData *data = CollationRoot::getData(errorCode); |
| 446 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { |
| 447 return; |
| 448 } |
| 449 |
| 450 // Input string, not FCD, NUL-terminated. |
| 451 static const UChar s[] = { |
| 452 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62, |
| 453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1
D158 1D165, ccc=0, 216 |
| 454 0x327, 0x308, // ccc=202, 230 |
| 455 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGM
ENTATION DOT, ccc=226 |
| 456 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), |
| 457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), |
| 458 0xac01, |
| 459 0xe7, // Character with tccc!=0 decomposed together with mis-ordered se
quence. |
| 460 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D
165), |
| 461 0xe1, // Character with tccc!=0 decomposed together with decomposed seq
uence. |
| 462 0xf73, 0xf75, // Tibetan composite vowels must be decomposed. |
| 463 0x4e00, 0xf81, |
| 464 0 |
| 465 }; |
| 466 // Expected code points. |
| 467 static const UChar32 cp[] = { |
| 468 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, |
| 469 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, |
| 470 0x1D15F, 0x1D16D, |
| 471 0xac01, |
| 472 0x63, 0x327, 0x1D165, 0x1D16D, |
| 473 0x61, |
| 474 0xf71, 0xf71, 0xf72, 0xf74, 0x301, |
| 475 0x4e00, 0xf71, 0xf80 |
| 476 }; |
| 477 |
| 478 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL); |
| 479 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor"))
{ |
| 480 return; |
| 481 } |
| 482 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp)); |
| 483 checkFCD("FCDUTF16CollationIterator", u16ci, cpi); |
| 484 |
| 485 #if U_HAVE_STD_STRING |
| 486 cpi.resetToStart(); |
| 487 std::string utf8; |
| 488 UnicodeString(s).toUTF8String(utf8); |
| 489 FCDUTF8CollationIterator u8ci(data, FALSE, |
| 490 reinterpret_cast<const uint8_t *>(utf8.c_str()
), 0, -1); |
| 491 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) { |
| 492 return; |
| 493 } |
| 494 checkFCD("FCDUTF8CollationIterator", u8ci, cpi); |
| 495 #endif |
| 496 |
| 497 cpi.resetToStart(); |
| 498 UCharIterator iter; |
| 499 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the termina
ting NUL |
| 500 FCDUIterCollationIterator uici(data, FALSE, iter, 0); |
| 501 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor"))
{ |
| 502 return; |
| 503 } |
| 504 checkFCD("FCDUIterCollationIterator", uici, cpi); |
| 505 } |
| 506 |
| 507 void CollationTest::checkAllocWeights(CollationWeights &cw, |
| 508 uint32_t lowerLimit, uint32_t upperLimit,
int32_t n, |
| 509 int32_t someLength, int32_t minCount) { |
| 510 if(!cw.allocWeights(lowerLimit, upperLimit, n)) { |
| 511 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE", |
| 512 (long)lowerLimit, (long)upperLimit, (long)n); |
| 513 return; |
| 514 } |
| 515 uint32_t previous = lowerLimit; |
| 516 int32_t count = 0; // number of weights that have someLength |
| 517 for(int32_t i = 0; i < n; ++i) { |
| 518 uint32_t w = cw.nextWeight(); |
| 519 if(w == 0xffffffff) { |
| 520 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " |
| 521 "returns only %ld weights", |
| 522 (long)lowerLimit, (long)upperLimit, (long)n, (long)i); |
| 523 return; |
| 524 } |
| 525 if(!(previous < w && w < upperLimit)) { |
| 526 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " |
| 527 "number %ld -> %lx not between %lx and %lx", |
| 528 (long)lowerLimit, (long)upperLimit, (long)n, |
| 529 (long)(i + 1), (long)w, (long)previous, (long)upperLimit); |
| 530 return; |
| 531 } |
| 532 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; } |
| 533 } |
| 534 if(count < minCount) { |
| 535 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " |
| 536 "returns only %ld < %ld weights of length %d", |
| 537 (long)lowerLimit, (long)upperLimit, (long)n, |
| 538 (long)count, (long)minCount, (int)someLength); |
| 539 } |
| 540 } |
| 541 |
// Exercises CollationWeights allocation for primary (non-compressible and
// compressible), secondary and tertiary weights. Each checkAllocWeights()
// call passes: lower limit, upper limit, number of weights to allocate,
// a weight length in bytes, and the minimum number of weights expected
// to have that length.
void CollationTest::TestCollationWeights() {
    CollationWeights cw;

    // Non-compressible primaries use 254 second bytes 02..FF.
    logln("CollationWeights.initForPrimary(non-compressible)");
    cw.initForPrimary(FALSE);
    // Expect 1 weight 11 and 254 weights 12xx.
    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
    // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
    // Expect 254 two-byte weights from the ranges 10ff and 11xx.
    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
    // Expect 254^2=64516 three-byte weights.
    // During computation, there should be 3 three-byte ranges
    // 10ffff, 11xxxx, 120202.
    // The middle one should be split 64515:1,
    // and the newly-split-off range and the last range lengthened.
    checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
    // Expect weights 1102 & 1103.
    checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
    // Expect weights 102102 & 102103.
    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);

    // Compressible primaries use 251 second bytes 04..FE.
    logln("CollationWeights.initForPrimary(compressible)");
    cw.initForPrimary(TRUE);
    // Expect 1 weight 11 and 251 weights 12xx.
    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
    // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
    checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
    // Expect weights 1104 & 1105.
    checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
    // Expect weights 102102 & 102103.
    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);

    // Secondary and tertiary weights use only bytes 3 & 4.
    logln("CollationWeights.initForSecondary()");
    cw.initForSecondary();
    // Expect weights fbxx and all four fc..ff.
    checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);

    logln("CollationWeights.initForTertiary()");
    cw.initForTertiary();
    // Expect weights 3dxx and both 3e & 3f.
    checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
}
| 590 |
| 591 namespace { |
| 592 |
| 593 UBool isValidCE(const CollationRootElements &re, const CollationData &data, |
| 594 uint32_t p, uint32_t s, uint32_t ctq) { |
| 595 uint32_t p1 = p >> 24; |
| 596 uint32_t p2 = (p >> 16) & 0xff; |
| 597 uint32_t p3 = (p >> 8) & 0xff; |
| 598 uint32_t p4 = p & 0xff; |
| 599 uint32_t s1 = s >> 8; |
| 600 uint32_t s2 = s & 0xff; |
| 601 // ctq = Case, Tertiary, Quaternary |
| 602 uint32_t c = (ctq & Collation::CASE_MASK) >> 14; |
| 603 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK; |
| 604 uint32_t t1 = t >> 8; |
| 605 uint32_t t2 = t & 0xff; |
| 606 uint32_t q = ctq & Collation::QUATERNARY_MASK; |
| 607 // No leading zero bytes. |
| 608 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { |
| 609 return FALSE; |
| 610 } |
| 611 // No intermediate zero bytes. |
| 612 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { |
| 613 return FALSE; |
| 614 } |
| 615 if(p2 != 0 && p3 == 0 && p4 != 0) { |
| 616 return FALSE; |
| 617 } |
| 618 // Minimum & maximum lead bytes. |
| 619 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) || |
| 620 (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) || |
| 621 (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) { |
| 622 return FALSE; |
| 623 } |
| 624 if(t1 != 0 && t1 > 0x3f) { |
| 625 return FALSE; |
| 626 } |
| 627 if(c > 2) { |
| 628 return FALSE; |
| 629 } |
| 630 // The valid byte range for the second primary byte depends on compressibili
ty. |
| 631 if(p2 != 0) { |
| 632 if(data.isCompressibleLeadByte(p1)) { |
| 633 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE || |
| 634 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { |
| 635 return FALSE; |
| 636 } |
| 637 } else { |
| 638 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) { |
| 639 return FALSE; |
| 640 } |
| 641 } |
| 642 } |
| 643 // Other bytes just need to avoid the level separator. |
| 644 // Trailing zeros are ok. |
| 645 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1); |
| 646 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR
_BYTE || |
| 647 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPA
RATOR_BYTE) { |
| 648 return FALSE; |
| 649 } |
| 650 // Well-formed CEs. |
| 651 if(p == 0) { |
| 652 if(s == 0) { |
| 653 if(t == 0) { |
| 654 // Completely ignorable CE. |
| 655 // Quaternary CEs are not supported. |
| 656 if(c != 0 || q != 0) { |
| 657 return FALSE; |
| 658 } |
| 659 } else { |
| 660 // Tertiary CE. |
| 661 if(t < re.getTertiaryBoundary() || c != 2) { |
| 662 return FALSE; |
| 663 } |
| 664 } |
| 665 } else { |
| 666 // Secondary CE. |
| 667 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBou
ndary()) { |
| 668 return FALSE; |
| 669 } |
| 670 } |
| 671 } else { |
| 672 // Primary CE. |
| 673 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSec
ondary()) || |
| 674 s >= re.getSecondaryBoundary()) { |
| 675 return FALSE; |
| 676 } |
| 677 if(t == 0 || t >= re.getTertiaryBoundary()) { |
| 678 return FALSE; |
| 679 } |
| 680 } |
| 681 return TRUE; |
| 682 } |
| 683 |
| 684 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int6
4_t ce) { |
| 685 uint32_t p = (uint32_t)(ce >> 32); |
| 686 uint32_t secTer = (uint32_t)ce; |
| 687 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff); |
| 688 } |
| 689 |
| 690 class RootElementsIterator { |
| 691 public: |
| 692 RootElementsIterator(const CollationData &root) |
| 693 : data(root), |
| 694 elements(root.rootElements), length(root.rootElementsLength), |
| 695 pri(0), secTer(0), |
| 696 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_I
NDEX]) {} |
| 697 |
| 698 UBool next() { |
| 699 if(index >= length) { return FALSE; } |
| 700 uint32_t p = elements[index]; |
| 701 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; } |
| 702 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { |
| 703 ++index; |
| 704 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG; |
| 705 return TRUE; |
| 706 } |
| 707 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) { |
| 708 // End of a range, enumerate the primaries in the range. |
| 709 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK
; |
| 710 p &= 0xffffff00; |
| 711 if(pri == p) { |
| 712 // Finished the range, return the next CE after it. |
| 713 ++index; |
| 714 return next(); |
| 715 } |
| 716 U_ASSERT(pri < p); |
| 717 // Return the next primary in this range. |
| 718 UBool isCompressible = data.isCompressiblePrimary(pri); |
| 719 if((pri & 0xffff) == 0) { |
| 720 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible,
step); |
| 721 } else { |
| 722 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible
, step); |
| 723 } |
| 724 return TRUE; |
| 725 } |
| 726 // Simple primary CE. |
| 727 ++index; |
| 728 pri = p; |
| 729 secTer = Collation::COMMON_SEC_AND_TER_CE; |
| 730 return TRUE; |
| 731 } |
| 732 |
| 733 uint32_t getPrimary() const { return pri; } |
| 734 uint32_t getSecTer() const { return secTer; } |
| 735 |
| 736 private: |
| 737 const CollationData &data; |
| 738 const uint32_t *elements; |
| 739 int32_t length; |
| 740 |
| 741 uint32_t pri; |
| 742 uint32_t secTer; |
| 743 int32_t index; |
| 744 }; |
| 745 |
| 746 } // namespace |
| 747 |
| 748 void CollationTest::TestRootElements() { |
| 749 IcuTestErrorCode errorCode(*this, "TestRootElements"); |
| 750 const CollationData *root = CollationRoot::getData(errorCode); |
| 751 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { |
| 752 return; |
| 753 } |
| 754 CollationRootElements rootElements(root->rootElements, root->rootElementsLen
gth); |
| 755 RootElementsIterator iter(*root); |
| 756 |
| 757 // We check each root CE for validity, |
| 758 // and we also verify that there is a tailoring gap between each two CEs. |
| 759 CollationWeights cw1c; // compressible primary weights |
| 760 CollationWeights cw1u; // uncompressible primary weights |
| 761 CollationWeights cw2; |
| 762 CollationWeights cw3; |
| 763 |
| 764 cw1c.initForPrimary(TRUE); |
| 765 cw1u.initForPrimary(FALSE); |
| 766 cw2.initForSecondary(); |
| 767 cw3.initForTertiary(); |
| 768 |
| 769 // Note: The root elements do not include Han-implicit or unassigned-implici
t CEs, |
| 770 // nor the special merge-separator CE for U+FFFE. |
| 771 uint32_t prevPri = 0; |
| 772 uint32_t prevSec = 0; |
| 773 uint32_t prevTer = 0; |
| 774 while(iter.next()) { |
| 775 uint32_t pri = iter.getPrimary(); |
| 776 uint32_t secTer = iter.getSecTer(); |
| 777 // CollationRootElements CEs must have 0 case and quaternary bits. |
| 778 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) { |
| 779 errln("CollationRootElements CE has non-zero case and/or quaternary
bits: %08lx %08lx", |
| 780 (long)pri, (long)secTer); |
| 781 } |
| 782 uint32_t sec = secTer >> 16; |
| 783 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK; |
| 784 uint32_t ctq = ter; |
| 785 if(pri == 0 && sec == 0 && ter != 0) { |
| 786 // Tertiary CEs must have uppercase bits, |
| 787 // but they are not stored in the CollationRootElements. |
| 788 ctq |= 0x8000; |
| 789 } |
| 790 if(!isValidCE(rootElements, *root, pri, sec, ctq)) { |
| 791 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer); |
| 792 } else { |
| 793 if(pri != prevPri) { |
| 794 uint32_t newWeight = 0; |
| 795 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) { |
| 796 // There is currently no tailoring gap after primary ignorab
les, |
| 797 // and we forbid tailoring after U+FFFD and U+FFFF. |
| 798 } else if(root->isCompressiblePrimary(prevPri)) { |
| 799 if(!cw1c.allocWeights(prevPri, pri, 1)) { |
| 800 errln("no primary/compressible tailoring gap between %08
lx and %08lx", |
| 801 (long)prevPri, (long)pri); |
| 802 } else { |
| 803 newWeight = cw1c.nextWeight(); |
| 804 } |
| 805 } else { |
| 806 if(!cw1u.allocWeights(prevPri, pri, 1)) { |
| 807 errln("no primary/uncompressible tailoring gap between %
08lx and %08lx", |
| 808 (long)prevPri, (long)pri); |
| 809 } else { |
| 810 newWeight = cw1u.nextWeight(); |
| 811 } |
| 812 } |
| 813 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri))
{ |
| 814 errln("mis-allocated primary weight, should get %08lx < %08l
x < %08lx", |
| 815 (long)prevPri, (long)newWeight, (long)pri); |
| 816 } |
| 817 } else if(sec != prevSec) { |
| 818 uint32_t lowerLimit = |
| 819 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 :
prevSec; |
| 820 if(!cw2.allocWeights(lowerLimit, sec, 1)) { |
| 821 errln("no secondary tailoring gap between %04x and %04x", lo
werLimit, sec); |
| 822 } else { |
| 823 uint32_t newWeight = cw2.nextWeight(); |
| 824 if(!(prevSec < newWeight && newWeight < sec)) { |
| 825 errln("mis-allocated secondary weight, should get %04x <
%04x < %04x", |
| 826 (long)lowerLimit, (long)newWeight, (long)sec); |
| 827 } |
| 828 } |
| 829 } else if(ter != prevTer) { |
| 830 uint32_t lowerLimit = |
| 831 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 :
prevTer; |
| 832 if(!cw3.allocWeights(lowerLimit, ter, 1)) { |
| 833 errln("no teriary tailoring gap between %04x and %04x", lowe
rLimit, ter); |
| 834 } else { |
| 835 uint32_t newWeight = cw3.nextWeight(); |
| 836 if(!(prevTer < newWeight && newWeight < ter)) { |
| 837 errln("mis-allocated secondary weight, should get %04x <
%04x < %04x", |
| 838 (long)lowerLimit, (long)newWeight, (long)ter); |
| 839 } |
| 840 } |
| 841 } else { |
| 842 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer); |
| 843 } |
| 844 } |
| 845 prevPri = pri; |
| 846 prevSec = sec; |
| 847 prevTer = ter; |
| 848 } |
| 849 } |
| 850 |
| 851 void CollationTest::TestTailoredElements() { |
| 852 IcuTestErrorCode errorCode(*this, "TestTailoredElements"); |
| 853 const CollationData *root = CollationRoot::getData(errorCode); |
| 854 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { |
| 855 return; |
| 856 } |
| 857 CollationRootElements rootElements(root->rootElements, root->rootElementsLen
gth); |
| 858 |
| 859 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NU
LL, errorCode); |
| 860 if(errorCode.logIfFailureAndReset("failed to create a hash table")) { |
| 861 return; |
| 862 } |
| 863 uhash_setKeyDeleter(prevLocales, uprv_free); |
| 864 // TestRootElements() tests the root collator which does not have tailorings
. |
| 865 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode); |
| 866 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode); |
| 867 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode
); |
| 868 |
| 869 UVector64 ces(errorCode); |
| 870 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales()); |
| 871 U_ASSERT(locales.isValid()); |
| 872 const char *localeID = "root"; |
| 873 do { |
| 874 Locale locale(localeID); |
| 875 LocalPointer<StringEnumeration> types( |
| 876 Collator::getKeywordValuesForLocale("collation", locale, FALSE,
errorCode)); |
| 877 errorCode.assertSuccess(); |
| 878 const char *type; // first: default type |
| 879 while((type = types->next(NULL, errorCode)) != NULL) { |
| 880 if(strncmp(type, "private-", 8) == 0) { |
| 881 errln("Collator::getKeywordValuesForLocale(%s) returns private c
ollation keyword: %s", |
| 882 localeID, type); |
| 883 } |
| 884 Locale localeWithType(locale); |
| 885 localeWithType.setKeywordValue("collation", type, errorCode); |
| 886 errorCode.assertSuccess(); |
| 887 LocalPointer<Collator> coll(Collator::createInstance(localeWithType,
errorCode)); |
| 888 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)", |
| 889 localeWithType.getName())) { |
| 890 continue; |
| 891 } |
| 892 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode); |
| 893 if(uhash_geti(prevLocales, actual.getName()) != 0) { |
| 894 continue; |
| 895 } |
| 896 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode)
; |
| 897 errorCode.assertSuccess(); |
| 898 logln("TestTailoredElements(): requested %s -> actual %s", |
| 899 localeWithType.getName(), actual.getName()); |
| 900 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getA
lias()); |
| 901 if(rbc == NULL) { |
| 902 continue; |
| 903 } |
| 904 // Note: It would be better to get tailored strings such that we can |
| 905 // identify the prefix, and only get the CEs for the prefix+string, |
| 906 // not also for the prefix. |
| 907 // There is currently no API for that. |
| 908 // It would help in an unusual case where a contraction starting in
the prefix |
| 909 // extends past its end, and we do not see the intended mapping. |
| 910 // For example, for a mapping p|st, if there is also a contraction p
s, |
| 911 // then we get CEs(ps)+CEs(t), rather than CEs(p|st). |
| 912 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode)); |
| 913 errorCode.assertSuccess(); |
| 914 UnicodeSetIterator iter(*tailored); |
| 915 while(iter.next()) { |
| 916 const UnicodeString &s = iter.getString(); |
| 917 ces.removeAllElements(); |
| 918 rbc->internalGetCEs(s, ces, errorCode); |
| 919 errorCode.assertSuccess(); |
| 920 for(int32_t i = 0; i < ces.size(); ++i) { |
| 921 int64_t ce = ces.elementAti(i); |
| 922 if(!isValidCE(rootElements, *root, ce)) { |
| 923 errln("invalid tailored CE %016llx at CE index %d from s
tring:", |
| 924 (long long)ce, (int)i); |
| 925 infoln(prettify(s)); |
| 926 } |
| 927 } |
| 928 } |
| 929 } |
| 930 } while((localeID = locales->next(NULL, errorCode)) != NULL); |
| 931 uhash_close(prevLocales); |
| 932 } |
| 933 |
| 934 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) { |
| 935 UnicodeString s; |
| 936 for(int32_t i = 0; i < length; ++i) { |
| 937 if(i > 0) { s.append((UChar)0x20); } |
| 938 uint8_t b = p[i]; |
| 939 if(b == 0) { |
| 940 s.append((UChar)0x2e); // period |
| 941 } else if(b == 1) { |
| 942 s.append((UChar)0x7c); // vertical bar |
| 943 } else { |
| 944 appendHex(b, 2, s); |
| 945 } |
| 946 } |
| 947 return s; |
| 948 } |
| 949 |
| 950 UnicodeString CollationTest::printCollationKey(const CollationKey &key) { |
| 951 int32_t length; |
| 952 const uint8_t *p = key.getByteArray(length); |
| 953 return printSortKey(p, length); |
| 954 } |
| 955 |
| 956 UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) { |
| 957 int32_t lineLength; |
| 958 const UChar *line = ucbuf_readline(f, &lineLength, errorCode); |
| 959 if(line == NULL || errorCode.isFailure()) { |
| 960 fileLine.remove(); |
| 961 return FALSE; |
| 962 } |
| 963 ++fileLineNumber; |
| 964 // Strip trailing CR/LF, comments, and spaces. |
| 965 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#' |
| 966 if(comment != NULL) { |
| 967 lineLength = (int32_t)(comment - line); |
| 968 } else { |
| 969 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength;
} |
| 970 } |
| 971 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; } |
| 972 fileLine.setTo(FALSE, line, lineLength); |
| 973 return TRUE; |
| 974 } |
| 975 |
| 976 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeSt
ring &s, |
| 977 UErrorCode &errorCode) { |
| 978 int32_t length = fileLine.length(); |
| 979 int32_t i; |
| 980 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {} |
| 981 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|' |
| 982 if(pipeIndex >= 0) { |
| 983 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape(); |
| 984 if(prefix.isEmpty()) { |
| 985 errln("empty prefix on line %d", (int)fileLineNumber); |
| 986 infoln(fileLine); |
| 987 errorCode = U_PARSE_ERROR; |
| 988 return; |
| 989 } |
| 990 start = pipeIndex + 1; |
| 991 } else { |
| 992 prefix.remove(); |
| 993 } |
| 994 s = fileLine.tempSubStringBetween(start, i).unescape(); |
| 995 if(s.isEmpty()) { |
| 996 errln("empty string on line %d", (int)fileLineNumber); |
| 997 infoln(fileLine); |
| 998 errorCode = U_PARSE_ERROR; |
| 999 return; |
| 1000 } |
| 1001 start = i; |
| 1002 } |
| 1003 |
| 1004 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTest
ErrorCode &errorCode) { |
| 1005 Collation::Level relation; |
| 1006 int32_t start; |
| 1007 if(fileLine[0] == 0x3c) { // < |
| 1008 UChar second = fileLine[1]; |
| 1009 start = 2; |
| 1010 switch(second) { |
| 1011 case 0x31: // <1 |
| 1012 relation = Collation::PRIMARY_LEVEL; |
| 1013 break; |
| 1014 case 0x32: // <2 |
| 1015 relation = Collation::SECONDARY_LEVEL; |
| 1016 break; |
| 1017 case 0x33: // <3 |
| 1018 relation = Collation::TERTIARY_LEVEL; |
| 1019 break; |
| 1020 case 0x34: // <4 |
| 1021 relation = Collation::QUATERNARY_LEVEL; |
| 1022 break; |
| 1023 case 0x63: // <c |
| 1024 relation = Collation::CASE_LEVEL; |
| 1025 break; |
| 1026 case 0x69: // <i |
| 1027 relation = Collation::IDENTICAL_LEVEL; |
| 1028 break; |
| 1029 default: // just < |
| 1030 relation = Collation::NO_LEVEL; |
| 1031 start = 1; |
| 1032 break; |
| 1033 } |
| 1034 } else if(fileLine[0] == 0x3d) { // = |
| 1035 relation = Collation::ZERO_LEVEL; |
| 1036 start = 1; |
| 1037 } else { |
| 1038 start = 0; |
| 1039 } |
| 1040 if(start == 0 || !isSpace(fileLine[start])) { |
| 1041 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (in
t)fileLineNumber); |
| 1042 infoln(fileLine); |
| 1043 errorCode.set(U_PARSE_ERROR); |
| 1044 return Collation::NO_LEVEL; |
| 1045 } |
| 1046 start = skipSpaces(start); |
| 1047 UnicodeString prefix; |
| 1048 parseString(start, prefix, s, errorCode); |
| 1049 if(errorCode.isSuccess() && !prefix.isEmpty()) { |
| 1050 errln("prefix string not allowed for test string: on line %d", (int)file
LineNumber); |
| 1051 infoln(fileLine); |
| 1052 errorCode.set(U_PARSE_ERROR); |
| 1053 return Collation::NO_LEVEL; |
| 1054 } |
| 1055 if(start < fileLine.length()) { |
| 1056 errln("unexpected line contents after test string on line %d", (int)file
LineNumber); |
| 1057 infoln(fileLine); |
| 1058 errorCode.set(U_PARSE_ERROR); |
| 1059 return Collation::NO_LEVEL; |
| 1060 } |
| 1061 return relation; |
| 1062 } |
| 1063 |
| 1064 static const struct { |
| 1065 const char *name; |
| 1066 UColAttribute attr; |
| 1067 } attributes[] = { |
| 1068 { "backwards", UCOL_FRENCH_COLLATION }, |
| 1069 { "alternate", UCOL_ALTERNATE_HANDLING }, |
| 1070 { "caseFirst", UCOL_CASE_FIRST }, |
| 1071 { "caseLevel", UCOL_CASE_LEVEL }, |
| 1072 // UCOL_NORMALIZATION_MODE is turned on and off automatically. |
| 1073 { "strength", UCOL_STRENGTH }, |
| 1074 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated. |
| 1075 { "numeric", UCOL_NUMERIC_COLLATION } |
| 1076 }; |
| 1077 |
| 1078 static const struct { |
| 1079 const char *name; |
| 1080 UColAttributeValue value; |
| 1081 } attributeValues[] = { |
| 1082 { "default", UCOL_DEFAULT }, |
| 1083 { "primary", UCOL_PRIMARY }, |
| 1084 { "secondary", UCOL_SECONDARY }, |
| 1085 { "tertiary", UCOL_TERTIARY }, |
| 1086 { "quaternary", UCOL_QUATERNARY }, |
| 1087 { "identical", UCOL_IDENTICAL }, |
| 1088 { "off", UCOL_OFF }, |
| 1089 { "on", UCOL_ON }, |
| 1090 { "shifted", UCOL_SHIFTED }, |
| 1091 { "non-ignorable", UCOL_NON_IGNORABLE }, |
| 1092 { "lower", UCOL_LOWER_FIRST }, |
| 1093 { "upper", UCOL_UPPER_FIRST } |
| 1094 }; |
| 1095 |
| 1096 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) { |
| 1097 int32_t start = skipSpaces(1); |
| 1098 int32_t equalPos = fileLine.indexOf(0x3d); |
| 1099 if(equalPos < 0) { |
| 1100 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) { |
| 1101 parseAndSetReorderCodes(start + 7, errorCode); |
| 1102 return; |
| 1103 } |
| 1104 errln("missing '=' on line %d", (int)fileLineNumber); |
| 1105 infoln(fileLine); |
| 1106 errorCode.set(U_PARSE_ERROR); |
| 1107 return; |
| 1108 } |
| 1109 |
| 1110 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos); |
| 1111 UnicodeString valueString = fileLine.tempSubString(equalPos+1); |
| 1112 if(attrString == UNICODE_STRING("maxVariable", 11)) { |
| 1113 UColReorderCode max; |
| 1114 if(valueString == UNICODE_STRING("space", 5)) { |
| 1115 max = UCOL_REORDER_CODE_SPACE; |
| 1116 } else if(valueString == UNICODE_STRING("punct", 5)) { |
| 1117 max = UCOL_REORDER_CODE_PUNCTUATION; |
| 1118 } else if(valueString == UNICODE_STRING("symbol", 6)) { |
| 1119 max = UCOL_REORDER_CODE_SYMBOL; |
| 1120 } else if(valueString == UNICODE_STRING("currency", 8)) { |
| 1121 max = UCOL_REORDER_CODE_CURRENCY; |
| 1122 } else { |
| 1123 errln("invalid attribute value name on line %d", (int)fileLineNumber
); |
| 1124 infoln(fileLine); |
| 1125 errorCode.set(U_PARSE_ERROR); |
| 1126 return; |
| 1127 } |
| 1128 coll->setMaxVariable(max, errorCode); |
| 1129 if(errorCode.isFailure()) { |
| 1130 errln("setMaxVariable() failed on line %d: %s", |
| 1131 (int)fileLineNumber, errorCode.errorName()); |
| 1132 infoln(fileLine); |
| 1133 return; |
| 1134 } |
| 1135 fileLine.remove(); |
| 1136 return; |
| 1137 } |
| 1138 |
| 1139 UColAttribute attr; |
| 1140 for(int32_t i = 0;; ++i) { |
| 1141 if(i == UPRV_LENGTHOF(attributes)) { |
| 1142 errln("invalid attribute name on line %d", (int)fileLineNumber); |
| 1143 infoln(fileLine); |
| 1144 errorCode.set(U_PARSE_ERROR); |
| 1145 return; |
| 1146 } |
| 1147 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) { |
| 1148 attr = attributes[i].attr; |
| 1149 break; |
| 1150 } |
| 1151 } |
| 1152 |
| 1153 UColAttributeValue value; |
| 1154 for(int32_t i = 0;; ++i) { |
| 1155 if(i == UPRV_LENGTHOF(attributeValues)) { |
| 1156 errln("invalid attribute value name on line %d", (int)fileLineNumber
); |
| 1157 infoln(fileLine); |
| 1158 errorCode.set(U_PARSE_ERROR); |
| 1159 return; |
| 1160 } |
| 1161 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) { |
| 1162 value = attributeValues[i].value; |
| 1163 break; |
| 1164 } |
| 1165 } |
| 1166 |
| 1167 coll->setAttribute(attr, value, errorCode); |
| 1168 if(errorCode.isFailure()) { |
| 1169 errln("illegal attribute=value combination on line %d: %s", |
| 1170 (int)fileLineNumber, errorCode.errorName()); |
| 1171 infoln(fileLine); |
| 1172 return; |
| 1173 } |
| 1174 fileLine.remove(); |
| 1175 } |
| 1176 |
| 1177 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &err
orCode) { |
| 1178 UVector32 reorderCodes(errorCode); |
| 1179 while(start < fileLine.length()) { |
| 1180 start = skipSpaces(start); |
| 1181 int32_t limit = start; |
| 1182 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit;
} |
| 1183 CharString name; |
| 1184 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), e
rrorCode); |
| 1185 int32_t code = CollationRuleParser::getReorderCode(name.data()); |
| 1186 if(code < 0) { |
| 1187 if(uprv_stricmp(name.data(), "default") == 0) { |
| 1188 code = UCOL_REORDER_CODE_DEFAULT; // -1 |
| 1189 } else { |
| 1190 errln("invalid reorder code '%s' on line %d", name.data(), (int)
fileLineNumber); |
| 1191 infoln(fileLine); |
| 1192 errorCode.set(U_PARSE_ERROR); |
| 1193 return; |
| 1194 } |
| 1195 } |
| 1196 reorderCodes.addElement(code, errorCode); |
| 1197 start = limit; |
| 1198 } |
| 1199 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCo
de); |
| 1200 if(errorCode.isFailure()) { |
| 1201 errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, er
rorCode.errorName()); |
| 1202 infoln(fileLine); |
| 1203 return; |
| 1204 } |
| 1205 fileLine.remove(); |
| 1206 } |
| 1207 |
| 1208 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) { |
| 1209 UnicodeString rules; |
| 1210 while(readLine(f, errorCode)) { |
| 1211 if(fileLine.isEmpty()) { continue; } |
| 1212 if(isSectionStarter(fileLine[0])) { break; } |
| 1213 rules.append(fileLine.unescape()); |
| 1214 } |
| 1215 if(errorCode.isFailure()) { return; } |
| 1216 logln(rules); |
| 1217 |
| 1218 UParseError parseError; |
| 1219 UnicodeString reason; |
| 1220 delete coll; |
| 1221 coll = new RuleBasedCollator(rules, parseError, reason, errorCode); |
| 1222 if(coll == NULL) { |
| 1223 errln("unable to allocate a new collator"); |
| 1224 errorCode.set(U_MEMORY_ALLOCATION_ERROR); |
| 1225 return; |
| 1226 } |
| 1227 if(errorCode.isFailure()) { |
| 1228 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName())
; |
| 1229 infoln(UnicodeString(" reason: ") + reason); |
| 1230 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseErro
r.offset); } |
| 1231 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { |
| 1232 infoln(UnicodeString(" snippet: ...") + |
| 1233 parseError.preContext + "(!)" + parseError.postContext + "..."); |
| 1234 } |
| 1235 } else { |
| 1236 assertEquals("no error reason when RuleBasedCollator(rules) succeeds", |
| 1237 UnicodeString(), reason); |
| 1238 } |
| 1239 } |
| 1240 |
| 1241 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) { |
| 1242 if(errorCode.isFailure()) { return; } |
| 1243 delete coll; |
| 1244 coll = Collator::createInstance(Locale::getRoot(), errorCode); |
| 1245 if(errorCode.isFailure()) { |
| 1246 dataerrln("unable to create a root collator"); |
| 1247 return; |
| 1248 } |
| 1249 } |
| 1250 |
| 1251 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) { |
| 1252 if(errorCode.isFailure()) { return; } |
| 1253 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant |
| 1254 if(at >= 0) { |
| 1255 fileLine.setCharAt(at, (UChar)0x2a); // * |
| 1256 } |
| 1257 CharString localeID; |
| 1258 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode); |
| 1259 if(at >= 0) { |
| 1260 localeID.data()[at - 9] = '@'; |
| 1261 } |
| 1262 Locale locale(localeID.data()); |
| 1263 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) { |
| 1264 errln("invalid language tag on line %d", (int)fileLineNumber); |
| 1265 infoln(fileLine); |
| 1266 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); } |
| 1267 return; |
| 1268 } |
| 1269 |
| 1270 logln("creating a collator for locale ID %s", locale.getName()); |
| 1271 Collator *newColl = Collator::createInstance(locale, errorCode); |
| 1272 if(errorCode.isFailure()) { |
| 1273 dataerrln("unable to create a collator for locale %s on line %d", |
| 1274 locale.getName(), (int)fileLineNumber); |
| 1275 infoln(fileLine); |
| 1276 return; |
| 1277 } |
| 1278 delete coll; |
| 1279 coll = newColl; |
| 1280 } |
| 1281 |
| 1282 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &erro
rCode) const { |
| 1283 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE;
} |
| 1284 // In some sequences with Tibetan composite vowel signs, |
| 1285 // even if the string passes the FCD check, |
| 1286 // those composites must be decomposed. |
| 1287 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. |
| 1288 int32_t index = 0; |
| 1289 while((index = s.indexOf((UChar)0xf71, index)) >= 0) { |
| 1290 if(++index < s.length()) { |
| 1291 UChar c = s[index]; |
| 1292 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; } |
| 1293 } |
| 1294 } |
| 1295 return FALSE; |
| 1296 } |
| 1297 |
| 1298 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length, |
| 1299 CharString &dest, int32_t partSize, |
| 1300 IcuTestErrorCode &errorCode) { |
| 1301 if(errorCode.isFailure()) { return FALSE; } |
| 1302 uint8_t part[32]; |
| 1303 U_ASSERT(partSize <= UPRV_LENGTHOF(part)); |
| 1304 UCharIterator iter; |
| 1305 uiter_setString(&iter, s, length); |
| 1306 uint32_t state[2] = { 0, 0 }; |
| 1307 for(;;) { |
| 1308 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, p
artSize, errorCode); |
| 1309 UBool done = partLength < partSize; |
| 1310 if(done) { |
| 1311 // At the end, append the next byte as well which should be 00. |
| 1312 ++partLength; |
| 1313 } |
| 1314 dest.append(reinterpret_cast<char *>(part), partLength, errorCode); |
| 1315 if(done) { |
| 1316 return errorCode.isSuccess(); |
| 1317 } |
| 1318 } |
| 1319 } |
| 1320 |
| 1321 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
, |
| 1322 const UChar *s, int32_t length, |
| 1323 CollationKey &key, IcuTestErrorCode &errorC
ode) { |
| 1324 if(errorCode.isFailure()) { return FALSE; } |
| 1325 coll->getCollationKey(s, length, key, errorCode); |
| 1326 if(errorCode.isFailure()) { |
| 1327 infoln(fileTestName); |
| 1328 errln("Collator(%s).getCollationKey() failed: %s", |
| 1329 norm, errorCode.errorName()); |
| 1330 infoln(line); |
| 1331 return FALSE; |
| 1332 } |
| 1333 int32_t keyLength; |
| 1334 const uint8_t *keyBytes = key.getByteArray(keyLength); |
| 1335 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) { |
| 1336 infoln(fileTestName); |
| 1337 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key
", |
| 1338 norm); |
| 1339 infoln(line); |
| 1340 infoln(printCollationKey(key)); |
| 1341 return FALSE; |
| 1342 } |
| 1343 |
| 1344 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode); |
| 1345 if(numLevels < UCOL_IDENTICAL) { |
| 1346 ++numLevels; |
| 1347 } else { |
| 1348 numLevels = 5; |
| 1349 } |
| 1350 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) { |
| 1351 ++numLevels; |
| 1352 } |
| 1353 errorCode.assertSuccess(); |
| 1354 int32_t numLevelSeparators = 0; |
| 1355 for(int32_t i = 0; i < (keyLength - 1); ++i) { |
| 1356 uint8_t b = keyBytes[i]; |
| 1357 if(b == 0) { |
| 1358 infoln(fileTestName); |
| 1359 errln("Collator(%s).getCollationKey() contains a 00 byte", norm); |
| 1360 infoln(line); |
| 1361 infoln(printCollationKey(key)); |
| 1362 return FALSE; |
| 1363 } |
| 1364 if(b == 1) { ++numLevelSeparators; } |
| 1365 } |
| 1366 if(numLevelSeparators != (numLevels - 1)) { |
| 1367 infoln(fileTestName); |
| 1368 errln("Collator(%s).getCollationKey() has %d level separators for %d lev
els", |
| 1369 norm, (int)numLevelSeparators, (int)numLevels); |
| 1370 infoln(line); |
| 1371 infoln(printCollationKey(key)); |
| 1372 return FALSE; |
| 1373 } |
| 1374 |
| 1375 // If s contains U+FFFE, check that merged segments make the same key. |
| 1376 LocalMemory<uint8_t> mergedKey; |
| 1377 int32_t mergedKeyLength = 0; |
| 1378 int32_t mergedKeyCapacity = 0; |
| 1379 int32_t sLength = (length >= 0) ? length : u_strlen(s); |
| 1380 int32_t segmentStart = 0; |
| 1381 for(int32_t i = 0;;) { |
| 1382 if(i == sLength) { |
| 1383 if(segmentStart == 0) { |
| 1384 // s does not contain any U+FFFE. |
| 1385 break; |
| 1386 } |
| 1387 } else if(s[i] != 0xfffe) { |
| 1388 ++i; |
| 1389 continue; |
| 1390 } |
| 1391 // Get the sort key for another segment and merge it into mergedKey. |
| 1392 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the
bytes |
| 1393 CollationKey key2; |
| 1394 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCod
e); |
| 1395 int32_t key1Length, key2Length; |
| 1396 const uint8_t *key1Bytes = key1.getByteArray(key1Length); |
| 1397 const uint8_t *key2Bytes = key2.getByteArray(key2Length); |
| 1398 uint8_t *dest; |
| 1399 int32_t minCapacity = key1Length + key2Length; |
| 1400 if(key1Length > 0) { --minCapacity; } |
| 1401 if(minCapacity <= mergedKeyCapacity) { |
| 1402 dest = mergedKey.getAlias(); |
| 1403 } else { |
| 1404 if(minCapacity <= 200) { |
| 1405 mergedKeyCapacity = 200; |
| 1406 } else if(minCapacity <= 2 * mergedKeyCapacity) { |
| 1407 mergedKeyCapacity *= 2; |
| 1408 } else { |
| 1409 mergedKeyCapacity = minCapacity; |
| 1410 } |
| 1411 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity); |
| 1412 } |
| 1413 U_ASSERT(dest != NULL || mergedKeyCapacity == 0); |
| 1414 if(key1Length == 0) { |
| 1415 // key2 is the sort key for the first segment. |
| 1416 uprv_memcpy(dest, key2Bytes, key2Length); |
| 1417 mergedKeyLength = key2Length; |
| 1418 } else { |
| 1419 mergedKeyLength = |
| 1420 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length, |
| 1421 dest, mergedKeyCapacity); |
| 1422 } |
| 1423 if(i == sLength) { break; } |
| 1424 segmentStart = ++i; |
| 1425 } |
| 1426 if(segmentStart != 0 && |
| 1427 (mergedKeyLength != keyLength || |
| 1428 uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) { |
| 1429 infoln(fileTestName); |
| 1430 errln("Collator(%s).getCollationKey(with U+FFFE) != " |
| 1431 "ucol_mergeSortkeys(segments)", |
| 1432 norm); |
| 1433 infoln(line); |
| 1434 infoln(printCollationKey(key)); |
| 1435 infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength)); |
| 1436 return FALSE; |
| 1437 } |
| 1438 |
| 1439 // Check that internalNextSortKeyPart() makes the same key, with several par
t sizes. |
| 1440 static const int32_t partSizes[] = { 32, 3, 1 }; |
| 1441 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) { |
| 1442 int32_t partSize = partSizes[psi]; |
| 1443 CharString parts; |
| 1444 if(!getSortKeyParts(s, length, parts, 32, errorCode)) { |
| 1445 infoln(fileTestName); |
| 1446 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s", |
| 1447 norm, (int)partSize, errorCode.errorName()); |
| 1448 infoln(line); |
| 1449 return FALSE; |
| 1450 } |
| 1451 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), ke
yLength) != 0) { |
| 1452 infoln(fileTestName); |
| 1453 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)
", |
| 1454 norm, (int)partSize); |
| 1455 infoln(line); |
| 1456 infoln(printCollationKey(key)); |
| 1457 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts
.length())); |
| 1458 return FALSE; |
| 1459 } |
| 1460 } |
| 1461 return TRUE; |
| 1462 } |
| 1463 |
| 1464 namespace { |
| 1465 |
| 1466 /** |
| 1467 * Replaces unpaired surrogates with U+FFFD. |
| 1468 * Returns s if no replacement was made, otherwise buffer. |
| 1469 */ |
| 1470 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf
fer) { |
| 1471 int32_t i = 0; |
| 1472 while(i < s.length()) { |
| 1473 UChar32 c = s.char32At(i); |
| 1474 if(U_IS_SURROGATE(c)) { |
| 1475 if(buffer.length() < i) { |
| 1476 buffer.append(s, buffer.length(), i - buffer.length()); |
| 1477 } |
| 1478 buffer.append((UChar)0xfffd); |
| 1479 } |
| 1480 i += U16_LENGTH(c); |
| 1481 } |
| 1482 if(buffer.isEmpty()) { |
| 1483 return s; |
| 1484 } |
| 1485 if(buffer.length() < i) { |
| 1486 buffer.append(s, buffer.length(), i - buffer.length()); |
| 1487 } |
| 1488 return buffer; |
| 1489 } |
| 1490 |
| 1491 } |
| 1492 |
| 1493 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
FileLine, |
| 1494 const UnicodeString &prevString, const Unic
odeString &s, |
| 1495 UCollationResult expectedOrder, Collation::
Level expectedLevel, |
| 1496 IcuTestErrorCode &errorCode) { |
| 1497 if(errorCode.isFailure()) { return FALSE; } |
| 1498 |
| 1499 // Get the sort keys first, for error debug output. |
| 1500 CollationKey prevKey; |
| 1501 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.l
ength(), |
| 1502 prevKey, errorCode)) { |
| 1503 return FALSE; |
| 1504 } |
| 1505 CollationKey key; |
| 1506 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCod
e)) { return FALSE; } |
| 1507 |
| 1508 UCollationResult order = coll->compare(prevString, s, errorCode); |
| 1509 if(order != expectedOrder || errorCode.isFailure()) { |
| 1510 infoln(fileTestName); |
| 1511 errln("line %d Collator(%s).compare(previous, current) wrong order: %d !
= %d (%s)", |
| 1512 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa
me()); |
| 1513 infoln(prevFileLine); |
| 1514 infoln(fileLine); |
| 1515 infoln(printCollationKey(prevKey)); |
| 1516 infoln(printCollationKey(key)); |
| 1517 return FALSE; |
| 1518 } |
| 1519 order = coll->compare(s, prevString, errorCode); |
| 1520 if(order != -expectedOrder || errorCode.isFailure()) { |
| 1521 infoln(fileTestName); |
| 1522 errln("line %d Collator(%s).compare(current, previous) wrong order: %d !
= %d (%s)", |
| 1523 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorN
ame()); |
| 1524 infoln(prevFileLine); |
| 1525 infoln(fileLine); |
| 1526 infoln(printCollationKey(prevKey)); |
| 1527 infoln(printCollationKey(key)); |
| 1528 return FALSE; |
| 1529 } |
| 1530 // Test NUL-termination if the strings do not contain NUL characters. |
| 1531 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0)
>= 0; |
| 1532 if(!containNUL) { |
| 1533 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, err
orCode); |
| 1534 if(order != expectedOrder || errorCode.isFailure()) { |
| 1535 infoln(fileTestName); |
| 1536 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong
order: %d != %d (%s)", |
| 1537 (int)fileLineNumber, norm, order, expectedOrder, errorCode.err
orName()); |
| 1538 infoln(prevFileLine); |
| 1539 infoln(fileLine); |
| 1540 infoln(printCollationKey(prevKey)); |
| 1541 infoln(printCollationKey(key)); |
| 1542 return FALSE; |
| 1543 } |
| 1544 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, err
orCode); |
| 1545 if(order != -expectedOrder || errorCode.isFailure()) { |
| 1546 infoln(fileTestName); |
| 1547 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong
order: %d != %d (%s)", |
| 1548 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.er
rorName()); |
| 1549 infoln(prevFileLine); |
| 1550 infoln(fileLine); |
| 1551 infoln(printCollationKey(prevKey)); |
| 1552 infoln(printCollationKey(key)); |
| 1553 return FALSE; |
| 1554 } |
| 1555 } |
| 1556 |
| 1557 #if U_HAVE_STD_STRING |
| 1558 // compare(UTF-16) treats unpaired surrogates like unassigned code points. |
| 1559 // Unpaired surrogates cannot be converted to UTF-8. |
| 1560 // Create valid UTF-16 strings if necessary, and use those for |
| 1561 // both the expected compare() result and for the input to compare(UTF-8). |
| 1562 UnicodeString prevBuffer, sBuffer; |
| 1563 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer); |
| 1564 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer); |
| 1565 std::string prevUTF8, sUTF8; |
| 1566 UnicodeString(prevValid).toUTF8String(prevUTF8); |
| 1567 UnicodeString(sValid).toUTF8String(sUTF8); |
| 1568 UCollationResult expectedUTF8Order; |
| 1569 if(&prevValid == &prevString && &sValid == &s) { |
| 1570 expectedUTF8Order = expectedOrder; |
| 1571 } else { |
| 1572 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode); |
| 1573 } |
| 1574 |
| 1575 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode); |
| 1576 if(order != expectedUTF8Order || errorCode.isFailure()) { |
| 1577 infoln(fileTestName); |
| 1578 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order:
%d != %d (%s)", |
| 1579 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.err
orName()); |
| 1580 infoln(prevFileLine); |
| 1581 infoln(fileLine); |
| 1582 infoln(printCollationKey(prevKey)); |
| 1583 infoln(printCollationKey(key)); |
| 1584 return FALSE; |
| 1585 } |
| 1586 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode); |
| 1587 if(order != -expectedUTF8Order || errorCode.isFailure()) { |
| 1588 infoln(fileTestName); |
| 1589 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order:
%d != %d (%s)", |
| 1590 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.er
rorName()); |
| 1591 infoln(prevFileLine); |
| 1592 infoln(fileLine); |
| 1593 infoln(printCollationKey(prevKey)); |
| 1594 infoln(printCollationKey(key)); |
| 1595 return FALSE; |
| 1596 } |
| 1597 // Test NUL-termination if the strings do not contain NUL characters. |
| 1598 if(!containNUL) { |
| 1599 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -
1, errorCode); |
| 1600 if(order != expectedUTF8Order || errorCode.isFailure()) { |
| 1601 infoln(fileTestName); |
| 1602 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, curren
t-NUL) wrong order: %d != %d (%s)", |
| 1603 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode
.errorName()); |
| 1604 infoln(prevFileLine); |
| 1605 infoln(fileLine); |
| 1606 infoln(printCollationKey(prevKey)); |
| 1607 infoln(printCollationKey(key)); |
| 1608 return FALSE; |
| 1609 } |
| 1610 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -
1, errorCode); |
| 1611 if(order != -expectedUTF8Order || errorCode.isFailure()) { |
| 1612 infoln(fileTestName); |
| 1613 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previou
s-NUL) wrong order: %d != %d (%s)", |
| 1614 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCod
e.errorName()); |
| 1615 infoln(prevFileLine); |
| 1616 infoln(fileLine); |
| 1617 infoln(printCollationKey(prevKey)); |
| 1618 infoln(printCollationKey(key)); |
| 1619 return FALSE; |
| 1620 } |
| 1621 } |
| 1622 #endif |
| 1623 |
| 1624 UCharIterator leftIter; |
| 1625 UCharIterator rightIter; |
| 1626 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length()); |
| 1627 uiter_setString(&rightIter, s.getBuffer(), s.length()); |
| 1628 order = coll->compare(leftIter, rightIter, errorCode); |
| 1629 if(order != expectedOrder || errorCode.isFailure()) { |
| 1630 infoln(fileTestName); |
| 1631 errln("line %d Collator(%s).compare(UCharIterator: previous, current) " |
| 1632 "wrong order: %d != %d (%s)", |
| 1633 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa
me()); |
| 1634 infoln(prevFileLine); |
| 1635 infoln(fileLine); |
| 1636 infoln(printCollationKey(prevKey)); |
| 1637 infoln(printCollationKey(key)); |
| 1638 return FALSE; |
| 1639 } |
| 1640 |
| 1641 order = prevKey.compareTo(key, errorCode); |
| 1642 if(order != expectedOrder || errorCode.isFailure()) { |
| 1643 infoln(fileTestName); |
| 1644 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo
() wrong order: %d != %d (%s)", |
| 1645 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa
me()); |
| 1646 infoln(prevFileLine); |
| 1647 infoln(fileLine); |
| 1648 infoln(printCollationKey(prevKey)); |
| 1649 infoln(printCollationKey(key)); |
| 1650 return FALSE; |
| 1651 } |
| 1652 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { |
| 1653 int32_t prevKeyLength; |
| 1654 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength); |
| 1655 int32_t keyLength; |
| 1656 const uint8_t *bytes = key.getByteArray(keyLength); |
| 1657 int32_t level = Collation::PRIMARY_LEVEL; |
| 1658 for(int32_t i = 0;; ++i) { |
| 1659 uint8_t b = prevBytes[i]; |
| 1660 if(b != bytes[i]) { break; } |
| 1661 if(b == Collation::LEVEL_SEPARATOR_BYTE) { |
| 1662 ++level; |
| 1663 if(level == Collation::CASE_LEVEL && |
| 1664 coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_O
FF) { |
| 1665 ++level; |
| 1666 } |
| 1667 } |
| 1668 } |
| 1669 if(level != expectedLevel) { |
| 1670 infoln(fileTestName); |
| 1671 errln("line %d Collator(%s).getCollationKey(previous, current).compa
reTo()=%d wrong level: %d != %d", |
| 1672 (int)fileLineNumber, norm, order, level, expectedLevel); |
| 1673 infoln(prevFileLine); |
| 1674 infoln(fileLine); |
| 1675 infoln(printCollationKey(prevKey)); |
| 1676 infoln(printCollationKey(key)); |
| 1677 return FALSE; |
| 1678 } |
| 1679 } |
| 1680 return TRUE; |
| 1681 } |
| 1682 |
| 1683 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode
) { |
| 1684 if(errorCode.isFailure()) { return; } |
| 1685 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6); |
| 1686 UnicodeString prevString, s; |
| 1687 prevString.getTerminatedBuffer(); // Ensure NUL-termination. |
| 1688 while(readLine(f, errorCode)) { |
| 1689 if(fileLine.isEmpty()) { continue; } |
| 1690 if(isSectionStarter(fileLine[0])) { break; } |
| 1691 Collation::Level relation = parseRelationAndString(s, errorCode); |
| 1692 if(errorCode.isFailure()) { |
| 1693 errorCode.reset(); |
| 1694 break; |
| 1695 } |
| 1696 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? U
COL_EQUAL : UCOL_LESS; |
| 1697 Collation::Level expectedLevel = relation; |
| 1698 s.getTerminatedBuffer(); // Ensure NUL-termination. |
| 1699 UBool isOk = TRUE; |
| 1700 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s,
errorCode)) { |
| 1701 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode); |
| 1702 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString,
s, |
| 1703 expectedOrder, expectedLevel, errorCode); |
| 1704 } |
| 1705 if(isOk) { |
| 1706 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode); |
| 1707 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString
, s, |
| 1708 expectedOrder, expectedLevel, errorCode); |
| 1709 } |
| 1710 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormali
zed(s, errorCode))) { |
| 1711 UnicodeString pn = nfd->normalize(prevString, errorCode); |
| 1712 UnicodeString n = nfd->normalize(s, errorCode); |
| 1713 pn.getTerminatedBuffer(); |
| 1714 n.getTerminatedBuffer(); |
| 1715 errorCode.assertSuccess(); |
| 1716 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, |
| 1717 expectedOrder, expectedLevel, errorCode); |
| 1718 } |
| 1719 if(!isOk) { |
| 1720 errorCode.reset(); // already reported |
| 1721 } |
| 1722 prevFileLine = fileLine; |
| 1723 prevString = s; |
| 1724 prevString.getTerminatedBuffer(); // Ensure NUL-termination. |
| 1725 } |
| 1726 } |
| 1727 |
| 1728 void CollationTest::TestDataDriven() { |
| 1729 IcuTestErrorCode errorCode(*this, "TestDataDriven"); |
| 1730 |
| 1731 fcd = Normalizer2Factory::getFCDInstance(errorCode); |
| 1732 nfd = Normalizer2::getNFDInstance(errorCode); |
| 1733 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance()
or getNFDInstance()")) { |
| 1734 return; |
| 1735 } |
| 1736 |
| 1737 CharString path(getSourceTestData(errorCode), errorCode); |
| 1738 path.appendPathPart("collationtest.txt", errorCode); |
| 1739 const char *codePage = "UTF-8"; |
| 1740 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, error
Code)); |
| 1741 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) { |
| 1742 return; |
| 1743 } |
| 1744 while(errorCode.isSuccess()) { |
| 1745 // Read a new line if necessary. |
| 1746 // Sub-parsers leave the first line set that they do not handle. |
| 1747 if(fileLine.isEmpty()) { |
| 1748 if(!readLine(f.getAlias(), errorCode)) { break; } |
| 1749 continue; |
| 1750 } |
| 1751 if(!isSectionStarter(fileLine[0])) { |
| 1752 errln("syntax error on line %d", (int)fileLineNumber); |
| 1753 infoln(fileLine); |
| 1754 return; |
| 1755 } |
| 1756 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) { |
| 1757 fileTestName = fileLine; |
| 1758 logln(fileLine); |
| 1759 fileLine.remove(); |
| 1760 } else if(fileLine == UNICODE_STRING("@ root", 6)) { |
| 1761 setRootCollator(errorCode); |
| 1762 fileLine.remove(); |
| 1763 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) { |
| 1764 setLocaleCollator(errorCode); |
| 1765 fileLine.remove(); |
| 1766 } else if(fileLine == UNICODE_STRING("@ rules", 7)) { |
| 1767 buildTailoring(f.getAlias(), errorCode); |
| 1768 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // % |
| 1769 parseAndSetAttribute(errorCode); |
| 1770 } else if(fileLine == UNICODE_STRING("* compare", 9)) { |
| 1771 checkCompareStrings(f.getAlias(), errorCode); |
| 1772 } else { |
| 1773 errln("syntax error on line %d", (int)fileLineNumber); |
| 1774 infoln(fileLine); |
| 1775 return; |
| 1776 } |
| 1777 } |
| 1778 } |
| 1779 |
| 1780 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |