Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(387)

Side by Side Diff: source/test/intltest/collationtest.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/test/intltest/cntabcol.cpp ('k') | source/test/intltest/colldata.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationtest.cpp
7 *
8 * created on: 2012apr27
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/coll.h"
17 #include "unicode/errorcode.h"
18 #include "unicode/localpointer.h"
19 #include "unicode/normalizer2.h"
20 #include "unicode/sortkey.h"
21 #include "unicode/std_string.h"
22 #include "unicode/strenum.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/uiter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/unistr.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "collation.h"
32 #include "collationdata.h"
33 #include "collationfcd.h"
34 #include "collationiterator.h"
35 #include "collationroot.h"
36 #include "collationrootelements.h"
37 #include "collationruleparser.h"
38 #include "collationweights.h"
39 #include "cstring.h"
40 #include "intltest.h"
41 #include "normalizer2impl.h"
42 #include "ucbuf.h"
43 #include "uhash.h"
44 #include "uitercollationiterator.h"
45 #include "utf16collationiterator.h"
46 #include "utf8collationiterator.h"
47 #include "uvectr32.h"
48 #include "uvectr64.h"
49 #include "writesrc.h"
50
51 // TODO: Move to ucbuf.h
52 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
53
54 class CodePointIterator;
55
56 // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58 class CollationTest : public IntlTest {
59 public:
60 CollationTest()
61 : fcd(NULL), nfd(NULL),
62 fileLineNumber(0),
63 coll(NULL) {}
64
65 ~CollationTest() {
66 delete coll;
67 }
68
69 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par= NULL);
70
71 void TestMinMax();
72 void TestImplicits();
73 void TestNulTerminated();
74 void TestIllegalUTF8();
75 void TestShortFCDData();
76 void TestFCD();
77 void TestCollationWeights();
78 void TestRootElements();
79 void TestTailoredElements();
80 void TestDataDriven();
81
82 private:
83 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cp i);
84 void checkAllocWeights(CollationWeights &cw,
85 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
86 int32_t someLength, int32_t minCount);
87
88 static UnicodeString printSortKey(const uint8_t *p, int32_t length);
89 static UnicodeString printCollationKey(const CollationKey &key);
90
91 // Helpers & fields for data-driven test.
92 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
93 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
94 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@
95 int32_t skipSpaces(int32_t i) {
96 while(isSpace(fileLine[i])) { ++i; }
97 return i;
98 }
99
100 UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
101 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UE rrorCode &errorCode);
102 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode & errorCode);
103 void parseAndSetAttribute(IcuTestErrorCode &errorCode);
104 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
105 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
106 void setRootCollator(IcuTestErrorCode &errorCode);
107 void setLocaleCollator(IcuTestErrorCode &errorCode);
108
109 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) cons t;
110
111 UBool getSortKeyParts(const UChar *s, int32_t length,
112 CharString &dest, int32_t partSize,
113 IcuTestErrorCode &errorCode);
114 UBool getCollationKey(const char *norm, const UnicodeString &line,
115 const UChar *s, int32_t length,
116 CollationKey &key, IcuTestErrorCode &errorCode);
117 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
118 const UnicodeString &prevString, const UnicodeString & s,
119 UCollationResult expectedOrder, Collation::Level expec tedLevel,
120 IcuTestErrorCode &errorCode);
121 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
122
123 const Normalizer2 *fcd, *nfd;
124 UnicodeString fileLine;
125 int32_t fileLineNumber;
126 UnicodeString fileTestName;
127 Collator *coll;
128 };
129
130 extern IntlTest *createCollationTest() {
131 return new CollationTest();
132 }
133
134 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
135 if(exec) {
136 logln("TestSuite CollationTest: ");
137 }
138 TESTCASE_AUTO_BEGIN;
139 TESTCASE_AUTO(TestMinMax);
140 TESTCASE_AUTO(TestImplicits);
141 TESTCASE_AUTO(TestNulTerminated);
142 TESTCASE_AUTO(TestIllegalUTF8);
143 TESTCASE_AUTO(TestShortFCDData);
144 TESTCASE_AUTO(TestFCD);
145 TESTCASE_AUTO(TestCollationWeights);
146 TESTCASE_AUTO(TestRootElements);
147 TESTCASE_AUTO(TestTailoredElements);
148 TESTCASE_AUTO(TestDataDriven);
149 TESTCASE_AUTO_END;
150 }
151
152 void CollationTest::TestMinMax() {
153 IcuTestErrorCode errorCode(*this, "TestMinMax");
154
155 setRootCollator(errorCode);
156 if(errorCode.isFailure()) {
157 errorCode.reset();
158 return;
159 }
160 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
161 if(rbc == NULL) {
162 errln("the root collator is not a RuleBasedCollator");
163 return;
164 }
165
166 static const UChar s[2] = { 0xfffe, 0xffff };
167 UVector64 ces(errorCode);
168 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
169 errorCode.assertSuccess();
170 if(ces.size() != 2) {
171 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
172 return;
173 }
174 int64_t ce = ces.elementAti(0);
175 int64_t expected =
176 ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
177 Collation::MERGE_SEPARATOR_LOWER32;
178 if(ce != expected) {
179 errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
180 }
181
182 ce = ces.elementAti(1);
183 expected = Collation::makeCE(Collation::MAX_PRIMARY);
184 if(ce != expected) {
185 errln("CE(U+ffff)=%04lx != max..", (long)ce);
186 }
187 }
188
189 void CollationTest::TestImplicits() {
190 IcuTestErrorCode errorCode(*this, "TestImplicits");
191
192 const CollationData *cd = CollationRoot::getData(errorCode);
193 if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
194 return;
195 }
196
197 // Implicit primary weights should be assigned for the following sets,
198 // and sort in ascending order by set and then code point.
199 // See http://www.unicode.org/reports/tr10/#Implicit_Weights
200
201 // core Han Unified Ideographs
202 UnicodeSet coreHan("[\\p{unified_ideograph}&"
203 "[\\p{Block=CJK_Unified_Ideographs}"
204 "\\p{Block=CJK_Compatibility_Ideographs}]]",
205 errorCode);
206 // all other Unified Han ideographs
207 UnicodeSet otherHan("[\\p{unified ideograph}-"
208 "[\\p{Block=CJK_Unified_Ideographs}"
209 "\\p{Block=CJK_Compatibility_Ideographs}]]",
210 errorCode);
211 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
212 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings .
213
214 // Starting with CLDR 26/ICU 54, the root Han order may instead be
215 // the Unihan radical-stroke order.
216 // The tests should pass either way, so we only test the order of a small se t of Han characters
217 // whose radical-stroke order is the same as their code point order.
218 UnicodeSet someHanInCPOrder(
219 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
220 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3 -\\u50F6]",
221 errorCode);
222 UnicodeSet inOrder(someHanInCPOrder);
223 inOrder.addAll(unassigned).freeze();
224 if(errorCode.logIfFailureAndReset("UnicodeSet")) {
225 return;
226 }
227 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
228 UChar32 prev = 0;
229 uint32_t prevPrimary = 0;
230 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
231 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
232 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
233 while(iter->next()) {
234 UChar32 c = iter->getCodepoint();
235 UnicodeString s(c);
236 ci.setText(s.getBuffer(), s.getBuffer() + s.length());
237 int64_t ce = ci.nextCE(errorCode);
238 int64_t ce2 = ci.nextCE(errorCode);
239 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
240 return;
241 }
242 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
243 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly o ne CE", (long)c);
244 continue;
245 }
246 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
247 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
248 (long)c, (long)(ce & 0xffffffff));
249 continue;
250 }
251 uint32_t primary = (uint32_t)(ce >> 32);
252 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contai ns(prev)) {
253 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx.." ,
254 (long)c, (long)primary, (long)prev, (long)prevPrimary);
255 }
256 prev = c;
257 prevPrimary = primary;
258 }
259 }
260 }
261
262 void CollationTest::TestNulTerminated() {
263 IcuTestErrorCode errorCode(*this, "TestNulTerminated");
264 const CollationData *data = CollationRoot::getData(errorCode);
265 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
266 return;
267 }
268
269 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
270
271 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
272 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
273 for(int32_t i = 0;; ++i) {
274 int64_t ce1 = ci1.nextCE(errorCode);
275 int64_t ce2 = ci2.nextCE(errorCode);
276 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
277 return;
278 }
279 if(ce1 != ce2) {
280 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminate d) at CE %d", (int)i);
281 break;
282 }
283 if(ce1 == Collation::NO_CE) { break; }
284 }
285 }
286
287 void CollationTest::TestIllegalUTF8() {
288 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
289
290 setRootCollator(errorCode);
291 if(errorCode.isFailure()) {
292 errorCode.reset();
293 return;
294 }
295 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
296
297 static const char *strings[] = {
298 // U+FFFD
299 "a\xef\xbf\xbdz",
300 // illegal byte sequences
301 "a\x80z", // trail byte
302 "a\xc1\x81z", // non-shortest form
303 "a\xe0\x82\x83z", // non-shortest form
304 "a\xed\xa0\x80z", // lead surrogate: would be U+D800
305 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
306 "a\xf0\x8f\xbf\xbfz", // non-shortest form
307 "a\xf4\x90\x80\x80z" // out of range: would be U+110000
308 };
309
310 StringPiece fffd(strings[0]);
311 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
312 StringPiece illegal(strings[i]);
313 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
314 if(order != UCOL_EQUAL) {
315 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_ EQUAL",
316 (int)i, order);
317 }
318 }
319 }
320
321 namespace {
322
323 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
324 for(UChar32 c = 0x10000; c < 0x110000;) {
325 UChar32 next = c + 0x400;
326 if(src.containsSome(c, next - 1)) {
327 dest.add(U16_LEAD(c));
328 }
329 c = next;
330 }
331 }
332
333 } // namespace
334
335 void CollationTest::TestShortFCDData() {
336 // See CollationFCD class comments.
337 IcuTestErrorCode errorCode(*this, "TestShortFCDData");
338 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
339 errorCode.assertSuccess();
340 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates
341 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
342 UnicodeSet lccc; // actual
343 for(UChar32 c = 0; c <= 0xffff; ++c) {
344 if(CollationFCD::hasLccc(c)) { lccc.add(c); }
345 }
346 UnicodeSet diff(expectedLccc);
347 diff.removeAll(lccc);
348 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP
349 UnicodeString empty("[]");
350 UnicodeString diffString;
351 diff.toPattern(diffString, TRUE);
352 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
353 diff = lccc;
354 diff.removeAll(expectedLccc);
355 diff.toPattern(diffString, TRUE);
356 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, T RUE);
357
358 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
359 if (errorCode.isSuccess()) {
360 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
361 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
362 UnicodeSet tccc; // actual
363 for(UChar32 c = 0; c <= 0xffff; ++c) {
364 if(CollationFCD::hasTccc(c)) { tccc.add(c); }
365 }
366 diff = expectedTccc;
367 diff.removeAll(tccc);
368 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
369 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffStrin g);
370 diff = tccc;
371 diff.removeAll(expectedTccc);
372 diff.toPattern(diffString, TRUE);
373 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffStrin g);
374 }
375 }
376
377 class CodePointIterator {
378 public:
379 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length ), pos(0) {}
380 void resetToStart() { pos = 0; }
381 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
382 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
383 int32_t getLength() const { return length; }
384 int getIndex() const { return (int)pos; }
385 private:
386 const UChar32 *cp;
387 int32_t length;
388 int32_t pos;
389 };
390
391 void CollationTest::checkFCD(const char *name,
392 CollationIterator &ci, CodePointIterator &cpi) {
393 IcuTestErrorCode errorCode(*this, "checkFCD");
394
395 // Iterate forward to the limit.
396 for(;;) {
397 UChar32 c1 = ci.nextCodePoint(errorCode);
398 UChar32 c2 = cpi.next();
399 if(c1 != c2) {
400 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
401 name, (long)c1, (long)c2, cpi.getIndex());
402 return;
403 }
404 if(c1 < 0) { break; }
405 }
406
407 // Iterate backward most of the way.
408 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
409 UChar32 c1 = ci.previousCodePoint(errorCode);
410 UChar32 c2 = cpi.previous();
411 if(c1 != c2) {
412 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
413 name, (long)c1, (long)c2, cpi.getIndex());
414 return;
415 }
416 }
417
418 // Forward again.
419 for(;;) {
420 UChar32 c1 = ci.nextCodePoint(errorCode);
421 UChar32 c2 = cpi.next();
422 if(c1 != c2) {
423 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
424 name, (long)c1, (long)c2, cpi.getIndex());
425 return;
426 }
427 if(c1 < 0) { break; }
428 }
429
430 // Iterate backward to the start.
431 for(;;) {
432 UChar32 c1 = ci.previousCodePoint(errorCode);
433 UChar32 c2 = cpi.previous();
434 if(c1 != c2) {
435 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
436 name, (long)c1, (long)c2, cpi.getIndex());
437 return;
438 }
439 if(c1 < 0) { break; }
440 }
441 }
442
443 void CollationTest::TestFCD() {
444 IcuTestErrorCode errorCode(*this, "TestFCD");
445 const CollationData *data = CollationRoot::getData(errorCode);
446 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
447 return;
448 }
449
450 // Input string, not FCD, NUL-terminated.
451 static const UChar s[] = {
452 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1 D158 1D165, ccc=0, 216
454 0x327, 0x308, // ccc=202, 230
455 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGM ENTATION DOT, ccc=226
456 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
458 0xac01,
459 0xe7, // Character with tccc!=0 decomposed together with mis-ordered se quence.
460 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D 165),
461 0xe1, // Character with tccc!=0 decomposed together with decomposed seq uence.
462 0xf73, 0xf75, // Tibetan composite vowels must be decomposed.
463 0x4e00, 0xf81,
464 0
465 };
466 // Expected code points.
467 static const UChar32 cp[] = {
468 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
469 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
470 0x1D15F, 0x1D16D,
471 0xac01,
472 0x63, 0x327, 0x1D165, 0x1D16D,
473 0x61,
474 0xf71, 0xf71, 0xf72, 0xf74, 0x301,
475 0x4e00, 0xf71, 0xf80
476 };
477
478 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
479 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
480 return;
481 }
482 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
483 checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
484
485 #if U_HAVE_STD_STRING
486 cpi.resetToStart();
487 std::string utf8;
488 UnicodeString(s).toUTF8String(utf8);
489 FCDUTF8CollationIterator u8ci(data, FALSE,
490 reinterpret_cast<const uint8_t *>(utf8.c_str() ), 0, -1);
491 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
492 return;
493 }
494 checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
495 #endif
496
497 cpi.resetToStart();
498 UCharIterator iter;
499 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the termina ting NUL
500 FCDUIterCollationIterator uici(data, FALSE, iter, 0);
501 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
502 return;
503 }
504 checkFCD("FCDUIterCollationIterator", uici, cpi);
505 }
506
507 void CollationTest::checkAllocWeights(CollationWeights &cw,
508 uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
509 int32_t someLength, int32_t minCount) {
510 if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
511 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
512 (long)lowerLimit, (long)upperLimit, (long)n);
513 return;
514 }
515 uint32_t previous = lowerLimit;
516 int32_t count = 0; // number of weights that have someLength
517 for(int32_t i = 0; i < n; ++i) {
518 uint32_t w = cw.nextWeight();
519 if(w == 0xffffffff) {
520 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
521 "returns only %ld weights",
522 (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
523 return;
524 }
525 if(!(previous < w && w < upperLimit)) {
526 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
527 "number %ld -> %lx not between %lx and %lx",
528 (long)lowerLimit, (long)upperLimit, (long)n,
529 (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
530 return;
531 }
532 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
533 }
534 if(count < minCount) {
535 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
536 "returns only %ld < %ld weights of length %d",
537 (long)lowerLimit, (long)upperLimit, (long)n,
538 (long)count, (long)minCount, (int)someLength);
539 }
540 }
541
542 void CollationTest::TestCollationWeights() {
543 CollationWeights cw;
544
545 // Non-compressible primaries use 254 second bytes 02..FF.
546 logln("CollationWeights.initForPrimary(non-compressible)");
547 cw.initForPrimary(FALSE);
548 // Expect 1 weight 11 and 254 weights 12xx.
549 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
550 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
551 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
552 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
553 // Expect 254 two-byte weights from the ranges 10ff and 11xx.
554 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
555 // Expect 254^2=64516 three-byte weights.
556 // During computation, there should be 3 three-byte ranges
557 // 10ffff, 11xxxx, 120202.
558 // The middle one should be split 64515:1,
559 // and the newly-split-off range and the last ranged lengthened.
560 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516) ;
561 // Expect weights 1102 & 1103.
562 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
563 // Expect weights 102102 & 102103.
564 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
565
566 // Compressible primaries use 251 second bytes 04..FE.
567 logln("CollationWeights.initForPrimary(compressible)");
568 cw.initForPrimary(TRUE);
569 // Expect 1 weight 11 and 251 weights 12xx.
570 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
571 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
572 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
573 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
574 // Expect weights 1104 & 1105.
575 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
576 // Expect weights 102102 & 102103.
577 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
578
579 // Secondary and tertiary weights use only bytes 3 & 4.
580 logln("CollationWeights.initForSecondary()");
581 cw.initForSecondary();
582 // Expect weights fbxx and all four fc..ff.
583 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
584
585 logln("CollationWeights.initForTertiary()");
586 cw.initForTertiary();
587 // Expect weights 3dxx and both 3e & 3f.
588 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
589 }
590
591 namespace {
592
593 UBool isValidCE(const CollationRootElements &re, const CollationData &data,
594 uint32_t p, uint32_t s, uint32_t ctq) {
595 uint32_t p1 = p >> 24;
596 uint32_t p2 = (p >> 16) & 0xff;
597 uint32_t p3 = (p >> 8) & 0xff;
598 uint32_t p4 = p & 0xff;
599 uint32_t s1 = s >> 8;
600 uint32_t s2 = s & 0xff;
601 // ctq = Case, Tertiary, Quaternary
602 uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
603 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
604 uint32_t t1 = t >> 8;
605 uint32_t t2 = t & 0xff;
606 uint32_t q = ctq & Collation::QUATERNARY_MASK;
607 // No leading zero bytes.
608 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
609 return FALSE;
610 }
611 // No intermediate zero bytes.
612 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
613 return FALSE;
614 }
615 if(p2 != 0 && p3 == 0 && p4 != 0) {
616 return FALSE;
617 }
618 // Minimum & maximum lead bytes.
619 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
620 (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
621 (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
622 return FALSE;
623 }
624 if(t1 != 0 && t1 > 0x3f) {
625 return FALSE;
626 }
627 if(c > 2) {
628 return FALSE;
629 }
630 // The valid byte range for the second primary byte depends on compressibili ty.
631 if(p2 != 0) {
632 if(data.isCompressibleLeadByte(p1)) {
633 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
634 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
635 return FALSE;
636 }
637 } else {
638 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
639 return FALSE;
640 }
641 }
642 }
643 // Other bytes just need to avoid the level separator.
644 // Trailing zeros are ok.
645 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
646 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR _BYTE ||
647 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPA RATOR_BYTE) {
648 return FALSE;
649 }
650 // Well-formed CEs.
651 if(p == 0) {
652 if(s == 0) {
653 if(t == 0) {
654 // Completely ignorable CE.
655 // Quaternary CEs are not supported.
656 if(c != 0 || q != 0) {
657 return FALSE;
658 }
659 } else {
660 // Tertiary CE.
661 if(t < re.getTertiaryBoundary() || c != 2) {
662 return FALSE;
663 }
664 }
665 } else {
666 // Secondary CE.
667 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBou ndary()) {
668 return FALSE;
669 }
670 }
671 } else {
672 // Primary CE.
673 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSec ondary()) ||
674 s >= re.getSecondaryBoundary()) {
675 return FALSE;
676 }
677 if(t == 0 || t >= re.getTertiaryBoundary()) {
678 return FALSE;
679 }
680 }
681 return TRUE;
682 }
683
684 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int6 4_t ce) {
685 uint32_t p = (uint32_t)(ce >> 32);
686 uint32_t secTer = (uint32_t)ce;
687 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
688 }
689
690 class RootElementsIterator {
691 public:
692 RootElementsIterator(const CollationData &root)
693 : data(root),
694 elements(root.rootElements), length(root.rootElementsLength),
695 pri(0), secTer(0),
696 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_I NDEX]) {}
697
698 UBool next() {
699 if(index >= length) { return FALSE; }
700 uint32_t p = elements[index];
701 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
702 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
703 ++index;
704 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
705 return TRUE;
706 }
707 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
708 // End of a range, enumerate the primaries in the range.
709 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK ;
710 p &= 0xffffff00;
711 if(pri == p) {
712 // Finished the range, return the next CE after it.
713 ++index;
714 return next();
715 }
716 U_ASSERT(pri < p);
717 // Return the next primary in this range.
718 UBool isCompressible = data.isCompressiblePrimary(pri);
719 if((pri & 0xffff) == 0) {
720 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
721 } else {
722 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible , step);
723 }
724 return TRUE;
725 }
726 // Simple primary CE.
727 ++index;
728 pri = p;
729 secTer = Collation::COMMON_SEC_AND_TER_CE;
730 return TRUE;
731 }
732
733 uint32_t getPrimary() const { return pri; }
734 uint32_t getSecTer() const { return secTer; }
735
736 private:
737 const CollationData &data;
738 const uint32_t *elements;
739 int32_t length;
740
741 uint32_t pri;
742 uint32_t secTer;
743 int32_t index;
744 };
745
746 } // namespace
747
748 void CollationTest::TestRootElements() {
749 IcuTestErrorCode errorCode(*this, "TestRootElements");
750 const CollationData *root = CollationRoot::getData(errorCode);
751 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
752 return;
753 }
754 CollationRootElements rootElements(root->rootElements, root->rootElementsLen gth);
755 RootElementsIterator iter(*root);
756
757 // We check each root CE for validity,
758 // and we also verify that there is a tailoring gap between each two CEs.
759 CollationWeights cw1c; // compressible primary weights
760 CollationWeights cw1u; // uncompressible primary weights
761 CollationWeights cw2;
762 CollationWeights cw3;
763
764 cw1c.initForPrimary(TRUE);
765 cw1u.initForPrimary(FALSE);
766 cw2.initForSecondary();
767 cw3.initForTertiary();
768
769 // Note: The root elements do not include Han-implicit or unassigned-implici t CEs,
770 // nor the special merge-separator CE for U+FFFE.
771 uint32_t prevPri = 0;
772 uint32_t prevSec = 0;
773 uint32_t prevTer = 0;
774 while(iter.next()) {
775 uint32_t pri = iter.getPrimary();
776 uint32_t secTer = iter.getSecTer();
777 // CollationRootElements CEs must have 0 case and quaternary bits.
778 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
779 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
780 (long)pri, (long)secTer);
781 }
782 uint32_t sec = secTer >> 16;
783 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
784 uint32_t ctq = ter;
785 if(pri == 0 && sec == 0 && ter != 0) {
786 // Tertiary CEs must have uppercase bits,
787 // but they are not stored in the CollationRootElements.
788 ctq |= 0x8000;
789 }
790 if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
791 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
792 } else {
793 if(pri != prevPri) {
794 uint32_t newWeight = 0;
795 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
796 // There is currently no tailoring gap after primary ignorab les,
797 // and we forbid tailoring after U+FFFD and U+FFFF.
798 } else if(root->isCompressiblePrimary(prevPri)) {
799 if(!cw1c.allocWeights(prevPri, pri, 1)) {
800 errln("no primary/compressible tailoring gap between %08 lx and %08lx",
801 (long)prevPri, (long)pri);
802 } else {
803 newWeight = cw1c.nextWeight();
804 }
805 } else {
806 if(!cw1u.allocWeights(prevPri, pri, 1)) {
807 errln("no primary/uncompressible tailoring gap between % 08lx and %08lx",
808 (long)prevPri, (long)pri);
809 } else {
810 newWeight = cw1u.nextWeight();
811 }
812 }
813 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
814 errln("mis-allocated primary weight, should get %08lx < %08l x < %08lx",
815 (long)prevPri, (long)newWeight, (long)pri);
816 }
817 } else if(sec != prevSec) {
818 uint32_t lowerLimit =
819 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
820 if(!cw2.allocWeights(lowerLimit, sec, 1)) {
821 errln("no secondary tailoring gap between %04x and %04x", lo werLimit, sec);
822 } else {
823 uint32_t newWeight = cw2.nextWeight();
824 if(!(prevSec < newWeight && newWeight < sec)) {
825 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
826 (long)lowerLimit, (long)newWeight, (long)sec);
827 }
828 }
829 } else if(ter != prevTer) {
830 uint32_t lowerLimit =
831 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
832 if(!cw3.allocWeights(lowerLimit, ter, 1)) {
833 errln("no teriary tailoring gap between %04x and %04x", lowe rLimit, ter);
834 } else {
835 uint32_t newWeight = cw3.nextWeight();
836 if(!(prevTer < newWeight && newWeight < ter)) {
837 errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
838 (long)lowerLimit, (long)newWeight, (long)ter);
839 }
840 }
841 } else {
842 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
843 }
844 }
845 prevPri = pri;
846 prevSec = sec;
847 prevTer = ter;
848 }
849 }
850
851 void CollationTest::TestTailoredElements() {
852 IcuTestErrorCode errorCode(*this, "TestTailoredElements");
853 const CollationData *root = CollationRoot::getData(errorCode);
854 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
855 return;
856 }
857 CollationRootElements rootElements(root->rootElements, root->rootElementsLen gth);
858
859 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NU LL, errorCode);
860 if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
861 return;
862 }
863 uhash_setKeyDeleter(prevLocales, uprv_free);
864 // TestRootElements() tests the root collator which does not have tailorings .
865 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
866 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
867 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode );
868
869 UVector64 ces(errorCode);
870 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
871 U_ASSERT(locales.isValid());
872 const char *localeID = "root";
873 do {
874 Locale locale(localeID);
875 LocalPointer<StringEnumeration> types(
876 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
877 errorCode.assertSuccess();
878 const char *type; // first: default type
879 while((type = types->next(NULL, errorCode)) != NULL) {
880 if(strncmp(type, "private-", 8) == 0) {
881 errln("Collator::getKeywordValuesForLocale(%s) returns private c ollation keyword: %s",
882 localeID, type);
883 }
884 Locale localeWithType(locale);
885 localeWithType.setKeywordValue("collation", type, errorCode);
886 errorCode.assertSuccess();
887 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
888 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
889 localeWithType.getName())) {
890 continue;
891 }
892 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
893 if(uhash_geti(prevLocales, actual.getName()) != 0) {
894 continue;
895 }
896 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode) ;
897 errorCode.assertSuccess();
898 logln("TestTailoredElements(): requested %s -> actual %s",
899 localeWithType.getName(), actual.getName());
900 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getA lias());
901 if(rbc == NULL) {
902 continue;
903 }
904 // Note: It would be better to get tailored strings such that we can
905 // identify the prefix, and only get the CEs for the prefix+string,
906 // not also for the prefix.
907 // There is currently no API for that.
908 // It would help in an unusual case where a contraction starting in the prefix
909 // extends past its end, and we do not see the intended mapping.
910 // For example, for a mapping p|st, if there is also a contraction p s,
911 // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
912 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
913 errorCode.assertSuccess();
914 UnicodeSetIterator iter(*tailored);
915 while(iter.next()) {
916 const UnicodeString &s = iter.getString();
917 ces.removeAllElements();
918 rbc->internalGetCEs(s, ces, errorCode);
919 errorCode.assertSuccess();
920 for(int32_t i = 0; i < ces.size(); ++i) {
921 int64_t ce = ces.elementAti(i);
922 if(!isValidCE(rootElements, *root, ce)) {
923 errln("invalid tailored CE %016llx at CE index %d from s tring:",
924 (long long)ce, (int)i);
925 infoln(prettify(s));
926 }
927 }
928 }
929 }
930 } while((localeID = locales->next(NULL, errorCode)) != NULL);
931 uhash_close(prevLocales);
932 }
933
934 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
935 UnicodeString s;
936 for(int32_t i = 0; i < length; ++i) {
937 if(i > 0) { s.append((UChar)0x20); }
938 uint8_t b = p[i];
939 if(b == 0) {
940 s.append((UChar)0x2e); // period
941 } else if(b == 1) {
942 s.append((UChar)0x7c); // vertical bar
943 } else {
944 appendHex(b, 2, s);
945 }
946 }
947 return s;
948 }
949
950 UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
951 int32_t length;
952 const uint8_t *p = key.getByteArray(length);
953 return printSortKey(p, length);
954 }
955
956 UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
957 int32_t lineLength;
958 const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
959 if(line == NULL || errorCode.isFailure()) {
960 fileLine.remove();
961 return FALSE;
962 }
963 ++fileLineNumber;
964 // Strip trailing CR/LF, comments, and spaces.
965 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#'
966 if(comment != NULL) {
967 lineLength = (int32_t)(comment - line);
968 } else {
969 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
970 }
971 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
972 fileLine.setTo(FALSE, line, lineLength);
973 return TRUE;
974 }
975
976 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeSt ring &s,
977 UErrorCode &errorCode) {
978 int32_t length = fileLine.length();
979 int32_t i;
980 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
981 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|'
982 if(pipeIndex >= 0) {
983 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
984 if(prefix.isEmpty()) {
985 errln("empty prefix on line %d", (int)fileLineNumber);
986 infoln(fileLine);
987 errorCode = U_PARSE_ERROR;
988 return;
989 }
990 start = pipeIndex + 1;
991 } else {
992 prefix.remove();
993 }
994 s = fileLine.tempSubStringBetween(start, i).unescape();
995 if(s.isEmpty()) {
996 errln("empty string on line %d", (int)fileLineNumber);
997 infoln(fileLine);
998 errorCode = U_PARSE_ERROR;
999 return;
1000 }
1001 start = i;
1002 }
1003
1004 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTest ErrorCode &errorCode) {
1005 Collation::Level relation;
1006 int32_t start;
1007 if(fileLine[0] == 0x3c) { // <
1008 UChar second = fileLine[1];
1009 start = 2;
1010 switch(second) {
1011 case 0x31: // <1
1012 relation = Collation::PRIMARY_LEVEL;
1013 break;
1014 case 0x32: // <2
1015 relation = Collation::SECONDARY_LEVEL;
1016 break;
1017 case 0x33: // <3
1018 relation = Collation::TERTIARY_LEVEL;
1019 break;
1020 case 0x34: // <4
1021 relation = Collation::QUATERNARY_LEVEL;
1022 break;
1023 case 0x63: // <c
1024 relation = Collation::CASE_LEVEL;
1025 break;
1026 case 0x69: // <i
1027 relation = Collation::IDENTICAL_LEVEL;
1028 break;
1029 default: // just <
1030 relation = Collation::NO_LEVEL;
1031 start = 1;
1032 break;
1033 }
1034 } else if(fileLine[0] == 0x3d) { // =
1035 relation = Collation::ZERO_LEVEL;
1036 start = 1;
1037 } else {
1038 start = 0;
1039 }
1040 if(start == 0 || !isSpace(fileLine[start])) {
1041 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (in t)fileLineNumber);
1042 infoln(fileLine);
1043 errorCode.set(U_PARSE_ERROR);
1044 return Collation::NO_LEVEL;
1045 }
1046 start = skipSpaces(start);
1047 UnicodeString prefix;
1048 parseString(start, prefix, s, errorCode);
1049 if(errorCode.isSuccess() && !prefix.isEmpty()) {
1050 errln("prefix string not allowed for test string: on line %d", (int)file LineNumber);
1051 infoln(fileLine);
1052 errorCode.set(U_PARSE_ERROR);
1053 return Collation::NO_LEVEL;
1054 }
1055 if(start < fileLine.length()) {
1056 errln("unexpected line contents after test string on line %d", (int)file LineNumber);
1057 infoln(fileLine);
1058 errorCode.set(U_PARSE_ERROR);
1059 return Collation::NO_LEVEL;
1060 }
1061 return relation;
1062 }
1063
1064 static const struct {
1065 const char *name;
1066 UColAttribute attr;
1067 } attributes[] = {
1068 { "backwards", UCOL_FRENCH_COLLATION },
1069 { "alternate", UCOL_ALTERNATE_HANDLING },
1070 { "caseFirst", UCOL_CASE_FIRST },
1071 { "caseLevel", UCOL_CASE_LEVEL },
1072 // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1073 { "strength", UCOL_STRENGTH },
1074 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1075 { "numeric", UCOL_NUMERIC_COLLATION }
1076 };
1077
1078 static const struct {
1079 const char *name;
1080 UColAttributeValue value;
1081 } attributeValues[] = {
1082 { "default", UCOL_DEFAULT },
1083 { "primary", UCOL_PRIMARY },
1084 { "secondary", UCOL_SECONDARY },
1085 { "tertiary", UCOL_TERTIARY },
1086 { "quaternary", UCOL_QUATERNARY },
1087 { "identical", UCOL_IDENTICAL },
1088 { "off", UCOL_OFF },
1089 { "on", UCOL_ON },
1090 { "shifted", UCOL_SHIFTED },
1091 { "non-ignorable", UCOL_NON_IGNORABLE },
1092 { "lower", UCOL_LOWER_FIRST },
1093 { "upper", UCOL_UPPER_FIRST }
1094 };
1095
1096 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1097 int32_t start = skipSpaces(1);
1098 int32_t equalPos = fileLine.indexOf(0x3d);
1099 if(equalPos < 0) {
1100 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1101 parseAndSetReorderCodes(start + 7, errorCode);
1102 return;
1103 }
1104 errln("missing '=' on line %d", (int)fileLineNumber);
1105 infoln(fileLine);
1106 errorCode.set(U_PARSE_ERROR);
1107 return;
1108 }
1109
1110 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1111 UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1112 if(attrString == UNICODE_STRING("maxVariable", 11)) {
1113 UColReorderCode max;
1114 if(valueString == UNICODE_STRING("space", 5)) {
1115 max = UCOL_REORDER_CODE_SPACE;
1116 } else if(valueString == UNICODE_STRING("punct", 5)) {
1117 max = UCOL_REORDER_CODE_PUNCTUATION;
1118 } else if(valueString == UNICODE_STRING("symbol", 6)) {
1119 max = UCOL_REORDER_CODE_SYMBOL;
1120 } else if(valueString == UNICODE_STRING("currency", 8)) {
1121 max = UCOL_REORDER_CODE_CURRENCY;
1122 } else {
1123 errln("invalid attribute value name on line %d", (int)fileLineNumber );
1124 infoln(fileLine);
1125 errorCode.set(U_PARSE_ERROR);
1126 return;
1127 }
1128 coll->setMaxVariable(max, errorCode);
1129 if(errorCode.isFailure()) {
1130 errln("setMaxVariable() failed on line %d: %s",
1131 (int)fileLineNumber, errorCode.errorName());
1132 infoln(fileLine);
1133 return;
1134 }
1135 fileLine.remove();
1136 return;
1137 }
1138
1139 UColAttribute attr;
1140 for(int32_t i = 0;; ++i) {
1141 if(i == UPRV_LENGTHOF(attributes)) {
1142 errln("invalid attribute name on line %d", (int)fileLineNumber);
1143 infoln(fileLine);
1144 errorCode.set(U_PARSE_ERROR);
1145 return;
1146 }
1147 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1148 attr = attributes[i].attr;
1149 break;
1150 }
1151 }
1152
1153 UColAttributeValue value;
1154 for(int32_t i = 0;; ++i) {
1155 if(i == UPRV_LENGTHOF(attributeValues)) {
1156 errln("invalid attribute value name on line %d", (int)fileLineNumber );
1157 infoln(fileLine);
1158 errorCode.set(U_PARSE_ERROR);
1159 return;
1160 }
1161 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1162 value = attributeValues[i].value;
1163 break;
1164 }
1165 }
1166
1167 coll->setAttribute(attr, value, errorCode);
1168 if(errorCode.isFailure()) {
1169 errln("illegal attribute=value combination on line %d: %s",
1170 (int)fileLineNumber, errorCode.errorName());
1171 infoln(fileLine);
1172 return;
1173 }
1174 fileLine.remove();
1175 }
1176
1177 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &err orCode) {
1178 UVector32 reorderCodes(errorCode);
1179 while(start < fileLine.length()) {
1180 start = skipSpaces(start);
1181 int32_t limit = start;
1182 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1183 CharString name;
1184 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), e rrorCode);
1185 int32_t code = CollationRuleParser::getReorderCode(name.data());
1186 if(code < 0) {
1187 if(uprv_stricmp(name.data(), "default") == 0) {
1188 code = UCOL_REORDER_CODE_DEFAULT; // -1
1189 } else {
1190 errln("invalid reorder code '%s' on line %d", name.data(), (int) fileLineNumber);
1191 infoln(fileLine);
1192 errorCode.set(U_PARSE_ERROR);
1193 return;
1194 }
1195 }
1196 reorderCodes.addElement(code, errorCode);
1197 start = limit;
1198 }
1199 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCo de);
1200 if(errorCode.isFailure()) {
1201 errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, er rorCode.errorName());
1202 infoln(fileLine);
1203 return;
1204 }
1205 fileLine.remove();
1206 }
1207
1208 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1209 UnicodeString rules;
1210 while(readLine(f, errorCode)) {
1211 if(fileLine.isEmpty()) { continue; }
1212 if(isSectionStarter(fileLine[0])) { break; }
1213 rules.append(fileLine.unescape());
1214 }
1215 if(errorCode.isFailure()) { return; }
1216 logln(rules);
1217
1218 UParseError parseError;
1219 UnicodeString reason;
1220 delete coll;
1221 coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1222 if(coll == NULL) {
1223 errln("unable to allocate a new collator");
1224 errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1225 return;
1226 }
1227 if(errorCode.isFailure()) {
1228 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName()) ;
1229 infoln(UnicodeString(" reason: ") + reason);
1230 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseErro r.offset); }
1231 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1232 infoln(UnicodeString(" snippet: ...") +
1233 parseError.preContext + "(!)" + parseError.postContext + "...");
1234 }
1235 } else {
1236 assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1237 UnicodeString(), reason);
1238 }
1239 }
1240
1241 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1242 if(errorCode.isFailure()) { return; }
1243 delete coll;
1244 coll = Collator::createInstance(Locale::getRoot(), errorCode);
1245 if(errorCode.isFailure()) {
1246 dataerrln("unable to create a root collator");
1247 return;
1248 }
1249 }
1250
1251 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1252 if(errorCode.isFailure()) { return; }
1253 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
1254 if(at >= 0) {
1255 fileLine.setCharAt(at, (UChar)0x2a); // *
1256 }
1257 CharString localeID;
1258 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1259 if(at >= 0) {
1260 localeID.data()[at - 9] = '@';
1261 }
1262 Locale locale(localeID.data());
1263 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1264 errln("invalid language tag on line %d", (int)fileLineNumber);
1265 infoln(fileLine);
1266 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1267 return;
1268 }
1269
1270 logln("creating a collator for locale ID %s", locale.getName());
1271 Collator *newColl = Collator::createInstance(locale, errorCode);
1272 if(errorCode.isFailure()) {
1273 dataerrln("unable to create a collator for locale %s on line %d",
1274 locale.getName(), (int)fileLineNumber);
1275 infoln(fileLine);
1276 return;
1277 }
1278 delete coll;
1279 coll = newColl;
1280 }
1281
1282 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &erro rCode) const {
1283 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1284 // In some sequences with Tibetan composite vowel signs,
1285 // even if the string passes the FCD check,
1286 // those composites must be decomposed.
1287 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1288 int32_t index = 0;
1289 while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1290 if(++index < s.length()) {
1291 UChar c = s[index];
1292 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1293 }
1294 }
1295 return FALSE;
1296 }
1297
1298 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1299 CharString &dest, int32_t partSize,
1300 IcuTestErrorCode &errorCode) {
1301 if(errorCode.isFailure()) { return FALSE; }
1302 uint8_t part[32];
1303 U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1304 UCharIterator iter;
1305 uiter_setString(&iter, s, length);
1306 uint32_t state[2] = { 0, 0 };
1307 for(;;) {
1308 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, p artSize, errorCode);
1309 UBool done = partLength < partSize;
1310 if(done) {
1311 // At the end, append the next byte as well which should be 00.
1312 ++partLength;
1313 }
1314 dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1315 if(done) {
1316 return errorCode.isSuccess();
1317 }
1318 }
1319 }
1320
1321 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line ,
1322 const UChar *s, int32_t length,
1323 CollationKey &key, IcuTestErrorCode &errorC ode) {
1324 if(errorCode.isFailure()) { return FALSE; }
1325 coll->getCollationKey(s, length, key, errorCode);
1326 if(errorCode.isFailure()) {
1327 infoln(fileTestName);
1328 errln("Collator(%s).getCollationKey() failed: %s",
1329 norm, errorCode.errorName());
1330 infoln(line);
1331 return FALSE;
1332 }
1333 int32_t keyLength;
1334 const uint8_t *keyBytes = key.getByteArray(keyLength);
1335 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1336 infoln(fileTestName);
1337 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key ",
1338 norm);
1339 infoln(line);
1340 infoln(printCollationKey(key));
1341 return FALSE;
1342 }
1343
1344 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1345 if(numLevels < UCOL_IDENTICAL) {
1346 ++numLevels;
1347 } else {
1348 numLevels = 5;
1349 }
1350 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1351 ++numLevels;
1352 }
1353 errorCode.assertSuccess();
1354 int32_t numLevelSeparators = 0;
1355 for(int32_t i = 0; i < (keyLength - 1); ++i) {
1356 uint8_t b = keyBytes[i];
1357 if(b == 0) {
1358 infoln(fileTestName);
1359 errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1360 infoln(line);
1361 infoln(printCollationKey(key));
1362 return FALSE;
1363 }
1364 if(b == 1) { ++numLevelSeparators; }
1365 }
1366 if(numLevelSeparators != (numLevels - 1)) {
1367 infoln(fileTestName);
1368 errln("Collator(%s).getCollationKey() has %d level separators for %d lev els",
1369 norm, (int)numLevelSeparators, (int)numLevels);
1370 infoln(line);
1371 infoln(printCollationKey(key));
1372 return FALSE;
1373 }
1374
1375 // If s contains U+FFFE, check that merged segments make the same key.
1376 LocalMemory<uint8_t> mergedKey;
1377 int32_t mergedKeyLength = 0;
1378 int32_t mergedKeyCapacity = 0;
1379 int32_t sLength = (length >= 0) ? length : u_strlen(s);
1380 int32_t segmentStart = 0;
1381 for(int32_t i = 0;;) {
1382 if(i == sLength) {
1383 if(segmentStart == 0) {
1384 // s does not contain any U+FFFE.
1385 break;
1386 }
1387 } else if(s[i] != 0xfffe) {
1388 ++i;
1389 continue;
1390 }
1391 // Get the sort key for another segment and merge it into mergedKey.
1392 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes
1393 CollationKey key2;
1394 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCod e);
1395 int32_t key1Length, key2Length;
1396 const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1397 const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1398 uint8_t *dest;
1399 int32_t minCapacity = key1Length + key2Length;
1400 if(key1Length > 0) { --minCapacity; }
1401 if(minCapacity <= mergedKeyCapacity) {
1402 dest = mergedKey.getAlias();
1403 } else {
1404 if(minCapacity <= 200) {
1405 mergedKeyCapacity = 200;
1406 } else if(minCapacity <= 2 * mergedKeyCapacity) {
1407 mergedKeyCapacity *= 2;
1408 } else {
1409 mergedKeyCapacity = minCapacity;
1410 }
1411 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1412 }
1413 U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1414 if(key1Length == 0) {
1415 // key2 is the sort key for the first segment.
1416 uprv_memcpy(dest, key2Bytes, key2Length);
1417 mergedKeyLength = key2Length;
1418 } else {
1419 mergedKeyLength =
1420 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1421 dest, mergedKeyCapacity);
1422 }
1423 if(i == sLength) { break; }
1424 segmentStart = ++i;
1425 }
1426 if(segmentStart != 0 &&
1427 (mergedKeyLength != keyLength ||
1428 uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1429 infoln(fileTestName);
1430 errln("Collator(%s).getCollationKey(with U+FFFE) != "
1431 "ucol_mergeSortkeys(segments)",
1432 norm);
1433 infoln(line);
1434 infoln(printCollationKey(key));
1435 infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1436 return FALSE;
1437 }
1438
1439 // Check that internalNextSortKeyPart() makes the same key, with several par t sizes.
1440 static const int32_t partSizes[] = { 32, 3, 1 };
1441 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1442 int32_t partSize = partSizes[psi];
1443 CharString parts;
1444 if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1445 infoln(fileTestName);
1446 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1447 norm, (int)partSize, errorCode.errorName());
1448 infoln(line);
1449 return FALSE;
1450 }
1451 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), ke yLength) != 0) {
1452 infoln(fileTestName);
1453 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d) ",
1454 norm, (int)partSize);
1455 infoln(line);
1456 infoln(printCollationKey(key));
1457 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts .length()));
1458 return FALSE;
1459 }
1460 }
1461 return TRUE;
1462 }
1463
1464 namespace {
1465
1466 /**
1467 * Replaces unpaired surrogates with U+FFFD.
1468 * Returns s if no replacement was made, otherwise buffer.
1469 */
1470 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf fer) {
1471 int32_t i = 0;
1472 while(i < s.length()) {
1473 UChar32 c = s.char32At(i);
1474 if(U_IS_SURROGATE(c)) {
1475 if(buffer.length() < i) {
1476 buffer.append(s, buffer.length(), i - buffer.length());
1477 }
1478 buffer.append((UChar)0xfffd);
1479 }
1480 i += U16_LENGTH(c);
1481 }
1482 if(buffer.isEmpty()) {
1483 return s;
1484 }
1485 if(buffer.length() < i) {
1486 buffer.append(s, buffer.length(), i - buffer.length());
1487 }
1488 return buffer;
1489 }
1490
1491 }
1492
1493 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev FileLine,
1494 const UnicodeString &prevString, const Unic odeString &s,
1495 UCollationResult expectedOrder, Collation:: Level expectedLevel,
1496 IcuTestErrorCode &errorCode) {
1497 if(errorCode.isFailure()) { return FALSE; }
1498
1499 // Get the sort keys first, for error debug output.
1500 CollationKey prevKey;
1501 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.l ength(),
1502 prevKey, errorCode)) {
1503 return FALSE;
1504 }
1505 CollationKey key;
1506 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCod e)) { return FALSE; }
1507
1508 UCollationResult order = coll->compare(prevString, s, errorCode);
1509 if(order != expectedOrder || errorCode.isFailure()) {
1510 infoln(fileTestName);
1511 errln("line %d Collator(%s).compare(previous, current) wrong order: %d ! = %d (%s)",
1512 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa me());
1513 infoln(prevFileLine);
1514 infoln(fileLine);
1515 infoln(printCollationKey(prevKey));
1516 infoln(printCollationKey(key));
1517 return FALSE;
1518 }
1519 order = coll->compare(s, prevString, errorCode);
1520 if(order != -expectedOrder || errorCode.isFailure()) {
1521 infoln(fileTestName);
1522 errln("line %d Collator(%s).compare(current, previous) wrong order: %d ! = %d (%s)",
1523 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorN ame());
1524 infoln(prevFileLine);
1525 infoln(fileLine);
1526 infoln(printCollationKey(prevKey));
1527 infoln(printCollationKey(key));
1528 return FALSE;
1529 }
1530 // Test NUL-termination if the strings do not contain NUL characters.
1531 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1532 if(!containNUL) {
1533 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, err orCode);
1534 if(order != expectedOrder || errorCode.isFailure()) {
1535 infoln(fileTestName);
1536 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1537 (int)fileLineNumber, norm, order, expectedOrder, errorCode.err orName());
1538 infoln(prevFileLine);
1539 infoln(fileLine);
1540 infoln(printCollationKey(prevKey));
1541 infoln(printCollationKey(key));
1542 return FALSE;
1543 }
1544 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, err orCode);
1545 if(order != -expectedOrder || errorCode.isFailure()) {
1546 infoln(fileTestName);
1547 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1548 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.er rorName());
1549 infoln(prevFileLine);
1550 infoln(fileLine);
1551 infoln(printCollationKey(prevKey));
1552 infoln(printCollationKey(key));
1553 return FALSE;
1554 }
1555 }
1556
1557 #if U_HAVE_STD_STRING
1558 // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1559 // Unpaired surrogates cannot be converted to UTF-8.
1560 // Create valid UTF-16 strings if necessary, and use those for
1561 // both the expected compare() result and for the input to compare(UTF-8).
1562 UnicodeString prevBuffer, sBuffer;
1563 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1564 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1565 std::string prevUTF8, sUTF8;
1566 UnicodeString(prevValid).toUTF8String(prevUTF8);
1567 UnicodeString(sValid).toUTF8String(sUTF8);
1568 UCollationResult expectedUTF8Order;
1569 if(&prevValid == &prevString && &sValid == &s) {
1570 expectedUTF8Order = expectedOrder;
1571 } else {
1572 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1573 }
1574
1575 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1576 if(order != expectedUTF8Order || errorCode.isFailure()) {
1577 infoln(fileTestName);
1578 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1579 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.err orName());
1580 infoln(prevFileLine);
1581 infoln(fileLine);
1582 infoln(printCollationKey(prevKey));
1583 infoln(printCollationKey(key));
1584 return FALSE;
1585 }
1586 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1587 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1588 infoln(fileTestName);
1589 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1590 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.er rorName());
1591 infoln(prevFileLine);
1592 infoln(fileLine);
1593 infoln(printCollationKey(prevKey));
1594 infoln(printCollationKey(key));
1595 return FALSE;
1596 }
1597 // Test NUL-termination if the strings do not contain NUL characters.
1598 if(!containNUL) {
1599 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), - 1, errorCode);
1600 if(order != expectedUTF8Order || errorCode.isFailure()) {
1601 infoln(fileTestName);
1602 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, curren t-NUL) wrong order: %d != %d (%s)",
1603 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode .errorName());
1604 infoln(prevFileLine);
1605 infoln(fileLine);
1606 infoln(printCollationKey(prevKey));
1607 infoln(printCollationKey(key));
1608 return FALSE;
1609 }
1610 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), - 1, errorCode);
1611 if(order != -expectedUTF8Order || errorCode.isFailure()) {
1612 infoln(fileTestName);
1613 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previou s-NUL) wrong order: %d != %d (%s)",
1614 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCod e.errorName());
1615 infoln(prevFileLine);
1616 infoln(fileLine);
1617 infoln(printCollationKey(prevKey));
1618 infoln(printCollationKey(key));
1619 return FALSE;
1620 }
1621 }
1622 #endif
1623
1624 UCharIterator leftIter;
1625 UCharIterator rightIter;
1626 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1627 uiter_setString(&rightIter, s.getBuffer(), s.length());
1628 order = coll->compare(leftIter, rightIter, errorCode);
1629 if(order != expectedOrder || errorCode.isFailure()) {
1630 infoln(fileTestName);
1631 errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1632 "wrong order: %d != %d (%s)",
1633 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa me());
1634 infoln(prevFileLine);
1635 infoln(fileLine);
1636 infoln(printCollationKey(prevKey));
1637 infoln(printCollationKey(key));
1638 return FALSE;
1639 }
1640
1641 order = prevKey.compareTo(key, errorCode);
1642 if(order != expectedOrder || errorCode.isFailure()) {
1643 infoln(fileTestName);
1644 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo () wrong order: %d != %d (%s)",
1645 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa me());
1646 infoln(prevFileLine);
1647 infoln(fileLine);
1648 infoln(printCollationKey(prevKey));
1649 infoln(printCollationKey(key));
1650 return FALSE;
1651 }
1652 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1653 int32_t prevKeyLength;
1654 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1655 int32_t keyLength;
1656 const uint8_t *bytes = key.getByteArray(keyLength);
1657 int32_t level = Collation::PRIMARY_LEVEL;
1658 for(int32_t i = 0;; ++i) {
1659 uint8_t b = prevBytes[i];
1660 if(b != bytes[i]) { break; }
1661 if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1662 ++level;
1663 if(level == Collation::CASE_LEVEL &&
1664 coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_O FF) {
1665 ++level;
1666 }
1667 }
1668 }
1669 if(level != expectedLevel) {
1670 infoln(fileTestName);
1671 errln("line %d Collator(%s).getCollationKey(previous, current).compa reTo()=%d wrong level: %d != %d",
1672 (int)fileLineNumber, norm, order, level, expectedLevel);
1673 infoln(prevFileLine);
1674 infoln(fileLine);
1675 infoln(printCollationKey(prevKey));
1676 infoln(printCollationKey(key));
1677 return FALSE;
1678 }
1679 }
1680 return TRUE;
1681 }
1682
1683 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode ) {
1684 if(errorCode.isFailure()) { return; }
1685 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1686 UnicodeString prevString, s;
1687 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1688 while(readLine(f, errorCode)) {
1689 if(fileLine.isEmpty()) { continue; }
1690 if(isSectionStarter(fileLine[0])) { break; }
1691 Collation::Level relation = parseRelationAndString(s, errorCode);
1692 if(errorCode.isFailure()) {
1693 errorCode.reset();
1694 break;
1695 }
1696 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? U COL_EQUAL : UCOL_LESS;
1697 Collation::Level expectedLevel = relation;
1698 s.getTerminatedBuffer(); // Ensure NUL-termination.
1699 UBool isOk = TRUE;
1700 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1701 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1702 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1703 expectedOrder, expectedLevel, errorCode);
1704 }
1705 if(isOk) {
1706 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1707 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString , s,
1708 expectedOrder, expectedLevel, errorCode);
1709 }
1710 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormali zed(s, errorCode))) {
1711 UnicodeString pn = nfd->normalize(prevString, errorCode);
1712 UnicodeString n = nfd->normalize(s, errorCode);
1713 pn.getTerminatedBuffer();
1714 n.getTerminatedBuffer();
1715 errorCode.assertSuccess();
1716 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1717 expectedOrder, expectedLevel, errorCode);
1718 }
1719 if(!isOk) {
1720 errorCode.reset(); // already reported
1721 }
1722 prevFileLine = fileLine;
1723 prevString = s;
1724 prevString.getTerminatedBuffer(); // Ensure NUL-termination.
1725 }
1726 }
1727
1728 void CollationTest::TestDataDriven() {
1729 IcuTestErrorCode errorCode(*this, "TestDataDriven");
1730
1731 fcd = Normalizer2Factory::getFCDInstance(errorCode);
1732 nfd = Normalizer2::getNFDInstance(errorCode);
1733 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1734 return;
1735 }
1736
1737 CharString path(getSourceTestData(errorCode), errorCode);
1738 path.appendPathPart("collationtest.txt", errorCode);
1739 const char *codePage = "UTF-8";
1740 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, error Code));
1741 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1742 return;
1743 }
1744 while(errorCode.isSuccess()) {
1745 // Read a new line if necessary.
1746 // Sub-parsers leave the first line set that they do not handle.
1747 if(fileLine.isEmpty()) {
1748 if(!readLine(f.getAlias(), errorCode)) { break; }
1749 continue;
1750 }
1751 if(!isSectionStarter(fileLine[0])) {
1752 errln("syntax error on line %d", (int)fileLineNumber);
1753 infoln(fileLine);
1754 return;
1755 }
1756 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1757 fileTestName = fileLine;
1758 logln(fileLine);
1759 fileLine.remove();
1760 } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1761 setRootCollator(errorCode);
1762 fileLine.remove();
1763 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1764 setLocaleCollator(errorCode);
1765 fileLine.remove();
1766 } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1767 buildTailoring(f.getAlias(), errorCode);
1768 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // %
1769 parseAndSetAttribute(errorCode);
1770 } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1771 checkCompareStrings(f.getAlias(), errorCode);
1772 } else {
1773 errln("syntax error on line %d", (int)fileLineNumber);
1774 infoln(fileLine);
1775 return;
1776 }
1777 }
1778 }
1779
1780 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/test/intltest/cntabcol.cpp ('k') | source/test/intltest/colldata.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698