OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************* | |
3 * Copyright (C) 2012-2015, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ******************************************************************************* | |
6 * collationtest.cpp | |
7 * | |
8 * created on: 2012apr27 | |
9 * created by: Markus W. Scherer | |
10 */ | |
11 | |
12 #include "unicode/utypes.h" | |
13 | |
14 #if !UCONFIG_NO_COLLATION | |
15 | |
16 #include "unicode/coll.h" | |
17 #include "unicode/errorcode.h" | |
18 #include "unicode/localpointer.h" | |
19 #include "unicode/normalizer2.h" | |
20 #include "unicode/sortkey.h" | |
21 #include "unicode/std_string.h" | |
22 #include "unicode/strenum.h" | |
23 #include "unicode/tblcoll.h" | |
24 #include "unicode/uiter.h" | |
25 #include "unicode/uniset.h" | |
26 #include "unicode/unistr.h" | |
27 #include "unicode/usetiter.h" | |
28 #include "unicode/ustring.h" | |
29 #include "charstr.h" | |
30 #include "cmemory.h" | |
31 #include "collation.h" | |
32 #include "collationdata.h" | |
33 #include "collationfcd.h" | |
34 #include "collationiterator.h" | |
35 #include "collationroot.h" | |
36 #include "collationrootelements.h" | |
37 #include "collationruleparser.h" | |
38 #include "collationweights.h" | |
39 #include "cstring.h" | |
40 #include "intltest.h" | |
41 #include "normalizer2impl.h" | |
42 #include "ucbuf.h" | |
43 #include "uhash.h" | |
44 #include "uitercollationiterator.h" | |
45 #include "utf16collationiterator.h" | |
46 #include "utf8collationiterator.h" | |
47 #include "uvectr32.h" | |
48 #include "uvectr64.h" | |
49 #include "writesrc.h" | |
50 | |
51 class CodePointIterator; | |
52 | |
53 // TODO: try to share code with IntlTestCollator; for example, prettify(Collatio
nKey) | |
54 | |
55 class CollationTest : public IntlTest { | |
56 public: | |
57 CollationTest() | |
58 : fcd(NULL), nfd(NULL), | |
59 fileLineNumber(0), | |
60 coll(NULL) {} | |
61 | |
62 ~CollationTest() { | |
63 delete coll; | |
64 } | |
65 | |
66 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=
NULL); | |
67 | |
68 void TestMinMax(); | |
69 void TestImplicits(); | |
70 void TestNulTerminated(); | |
71 void TestIllegalUTF8(); | |
72 void TestShortFCDData(); | |
73 void TestFCD(); | |
74 void TestCollationWeights(); | |
75 void TestRootElements(); | |
76 void TestTailoredElements(); | |
77 void TestDataDriven(); | |
78 | |
79 private: | |
80 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cp
i); | |
81 void checkAllocWeights(CollationWeights &cw, | |
82 uint32_t lowerLimit, uint32_t upperLimit, int32_t n, | |
83 int32_t someLength, int32_t minCount); | |
84 | |
85 static UnicodeString printSortKey(const uint8_t *p, int32_t length); | |
86 static UnicodeString printCollationKey(const CollationKey &key); | |
87 | |
88 // Helpers & fields for data-driven test. | |
89 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; } | |
90 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; } | |
91 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c
== 0x40; } // %*@ | |
92 int32_t skipSpaces(int32_t i) { | |
93 while(isSpace(fileLine[i])) { ++i; } | |
94 return i; | |
95 } | |
96 | |
97 UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode); | |
98 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UE
rrorCode &errorCode); | |
99 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &
errorCode); | |
100 void parseAndSetAttribute(IcuTestErrorCode &errorCode); | |
101 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode); | |
102 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode); | |
103 void setRootCollator(IcuTestErrorCode &errorCode); | |
104 void setLocaleCollator(IcuTestErrorCode &errorCode); | |
105 | |
106 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) cons
t; | |
107 | |
108 UBool getSortKeyParts(const UChar *s, int32_t length, | |
109 CharString &dest, int32_t partSize, | |
110 IcuTestErrorCode &errorCode); | |
111 UBool getCollationKey(const char *norm, const UnicodeString &line, | |
112 const UChar *s, int32_t length, | |
113 CollationKey &key, IcuTestErrorCode &errorCode); | |
114 UBool getMergedCollationKey(const UChar *s, int32_t length, | |
115 CollationKey &key, IcuTestErrorCode &errorCode); | |
116 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, | |
117 const UnicodeString &prevString, const UnicodeString &
s, | |
118 UCollationResult expectedOrder, Collation::Level expec
tedLevel, | |
119 IcuTestErrorCode &errorCode); | |
120 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode); | |
121 | |
122 const Normalizer2 *fcd, *nfd; | |
123 UnicodeString fileLine; | |
124 int32_t fileLineNumber; | |
125 UnicodeString fileTestName; | |
126 Collator *coll; | |
127 }; | |
128 | |
129 extern IntlTest *createCollationTest() { | |
130 return new CollationTest(); | |
131 } | |
132 | |
133 void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name,
char * /*par*/) { | |
134 if(exec) { | |
135 logln("TestSuite CollationTest: "); | |
136 } | |
137 TESTCASE_AUTO_BEGIN; | |
138 TESTCASE_AUTO(TestMinMax); | |
139 TESTCASE_AUTO(TestImplicits); | |
140 TESTCASE_AUTO(TestNulTerminated); | |
141 TESTCASE_AUTO(TestIllegalUTF8); | |
142 TESTCASE_AUTO(TestShortFCDData); | |
143 TESTCASE_AUTO(TestFCD); | |
144 TESTCASE_AUTO(TestCollationWeights); | |
145 TESTCASE_AUTO(TestRootElements); | |
146 TESTCASE_AUTO(TestTailoredElements); | |
147 TESTCASE_AUTO(TestDataDriven); | |
148 TESTCASE_AUTO_END; | |
149 } | |
150 | |
151 void CollationTest::TestMinMax() { | |
152 IcuTestErrorCode errorCode(*this, "TestMinMax"); | |
153 | |
154 setRootCollator(errorCode); | |
155 if(errorCode.isFailure()) { | |
156 errorCode.reset(); | |
157 return; | |
158 } | |
159 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll); | |
160 if(rbc == NULL) { | |
161 errln("the root collator is not a RuleBasedCollator"); | |
162 return; | |
163 } | |
164 | |
165 static const UChar s[2] = { 0xfffe, 0xffff }; | |
166 UVector64 ces(errorCode); | |
167 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode); | |
168 errorCode.assertSuccess(); | |
169 if(ces.size() != 2) { | |
170 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size()); | |
171 return; | |
172 } | |
173 int64_t ce = ces.elementAti(0); | |
174 int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY); | |
175 if(ce != expected) { | |
176 errln("CE(U+fffe)=%04lx != 02..", (long)ce); | |
177 } | |
178 | |
179 ce = ces.elementAti(1); | |
180 expected = Collation::makeCE(Collation::MAX_PRIMARY); | |
181 if(ce != expected) { | |
182 errln("CE(U+ffff)=%04lx != max..", (long)ce); | |
183 } | |
184 } | |
185 | |
186 void CollationTest::TestImplicits() { | |
187 IcuTestErrorCode errorCode(*this, "TestImplicits"); | |
188 | |
189 const CollationData *cd = CollationRoot::getData(errorCode); | |
190 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | |
191 return; | |
192 } | |
193 | |
194 // Implicit primary weights should be assigned for the following sets, | |
195 // and sort in ascending order by set and then code point. | |
196 // See http://www.unicode.org/reports/tr10/#Implicit_Weights | |
197 | |
198 // core Han Unified Ideographs | |
199 UnicodeSet coreHan("[\\p{unified_ideograph}&" | |
200 "[\\p{Block=CJK_Unified_Ideographs}" | |
201 "\\p{Block=CJK_Compatibility_Ideographs}]]", | |
202 errorCode); | |
203 // all other Unified Han ideographs | |
204 UnicodeSet otherHan("[\\p{unified ideograph}-" | |
205 "[\\p{Block=CJK_Unified_Ideographs}" | |
206 "\\p{Block=CJK_Compatibility_Ideographs}]]", | |
207 errorCode); | |
208 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode); | |
209 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings
. | |
210 | |
211 // Starting with CLDR 26/ICU 54, the root Han order may instead be | |
212 // the Unihan radical-stroke order. | |
213 // The tests should pass either way, so we only test the order of a small se
t of Han characters | |
214 // whose radical-stroke order is the same as their code point order. | |
215 UnicodeSet someHanInCPOrder( | |
216 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" | |
217 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3
-\\u50F6]", | |
218 errorCode); | |
219 UnicodeSet inOrder(someHanInCPOrder); | |
220 inOrder.addAll(unassigned).freeze(); | |
221 if(errorCode.logIfFailureAndReset("UnicodeSet")) { | |
222 return; | |
223 } | |
224 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned }; | |
225 UChar32 prev = 0; | |
226 uint32_t prevPrimary = 0; | |
227 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL); | |
228 for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) { | |
229 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i])); | |
230 while(iter->next()) { | |
231 UChar32 c = iter->getCodepoint(); | |
232 UnicodeString s(c); | |
233 ci.setText(s.getBuffer(), s.getBuffer() + s.length()); | |
234 int64_t ce = ci.nextCE(errorCode); | |
235 int64_t ce2 = ci.nextCE(errorCode); | |
236 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { | |
237 return; | |
238 } | |
239 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) { | |
240 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly o
ne CE", (long)c); | |
241 continue; | |
242 } | |
243 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) { | |
244 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter
weights: %08lx", | |
245 (long)c, (long)(ce & 0xffffffff)); | |
246 continue; | |
247 } | |
248 uint32_t primary = (uint32_t)(ce >> 32); | |
249 if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contai
ns(prev)) { | |
250 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx.."
, | |
251 (long)c, (long)primary, (long)prev, (long)prevPrimary); | |
252 } | |
253 prev = c; | |
254 prevPrimary = primary; | |
255 } | |
256 } | |
257 } | |
258 | |
259 void CollationTest::TestNulTerminated() { | |
260 IcuTestErrorCode errorCode(*this, "TestNulTerminated"); | |
261 const CollationData *data = CollationRoot::getData(errorCode); | |
262 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | |
263 return; | |
264 } | |
265 | |
266 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 }; | |
267 | |
268 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2); | |
269 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL); | |
270 for(int32_t i = 0;; ++i) { | |
271 int64_t ce1 = ci1.nextCE(errorCode); | |
272 int64_t ce2 = ci2.nextCE(errorCode); | |
273 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { | |
274 return; | |
275 } | |
276 if(ce1 != ce2) { | |
277 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminate
d) at CE %d", (int)i); | |
278 break; | |
279 } | |
280 if(ce1 == Collation::NO_CE) { break; } | |
281 } | |
282 } | |
283 | |
284 void CollationTest::TestIllegalUTF8() { | |
285 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8"); | |
286 | |
287 setRootCollator(errorCode); | |
288 if(errorCode.isFailure()) { | |
289 errorCode.reset(); | |
290 return; | |
291 } | |
292 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode); | |
293 | |
294 static const char *strings[] = { | |
295 // U+FFFD | |
296 "a\xef\xbf\xbdz", | |
297 // illegal byte sequences | |
298 "a\x80z", // trail byte | |
299 "a\xc1\x81z", // non-shortest form | |
300 "a\xe0\x82\x83z", // non-shortest form | |
301 "a\xed\xa0\x80z", // lead surrogate: would be U+D800 | |
302 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF | |
303 "a\xf0\x8f\xbf\xbfz", // non-shortest form | |
304 "a\xf4\x90\x80\x80z" // out of range: would be U+110000 | |
305 }; | |
306 | |
307 StringPiece fffd(strings[0]); | |
308 for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) { | |
309 StringPiece illegal(strings[i]); | |
310 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode); | |
311 if(order != UCOL_EQUAL) { | |
312 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_
EQUAL", | |
313 (int)i, order); | |
314 } | |
315 } | |
316 } | |
317 | |
318 namespace { | |
319 | |
320 void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest)
{ | |
321 for(UChar32 c = 0x10000; c < 0x110000;) { | |
322 UChar32 next = c + 0x400; | |
323 if(src.containsSome(c, next - 1)) { | |
324 dest.add(U16_LEAD(c)); | |
325 } | |
326 c = next; | |
327 } | |
328 } | |
329 | |
330 } // namespace | |
331 | |
332 void CollationTest::TestShortFCDData() { | |
333 // See CollationFCD class comments. | |
334 IcuTestErrorCode errorCode(*this, "TestShortFCDData"); | |
335 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode); | |
336 errorCode.assertSuccess(); | |
337 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates | |
338 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); | |
339 UnicodeSet lccc; // actual | |
340 for(UChar32 c = 0; c <= 0xffff; ++c) { | |
341 if(CollationFCD::hasLccc(c)) { lccc.add(c); } | |
342 } | |
343 UnicodeSet diff(expectedLccc); | |
344 diff.removeAll(lccc); | |
345 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP | |
346 UnicodeString empty("[]"); | |
347 UnicodeString diffString; | |
348 diff.toPattern(diffString, TRUE); | |
349 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); | |
350 diff = lccc; | |
351 diff.removeAll(expectedLccc); | |
352 diff.toPattern(diffString, TRUE); | |
353 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, T
RUE); | |
354 | |
355 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode); | |
356 if (errorCode.isSuccess()) { | |
357 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); | |
358 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); | |
359 UnicodeSet tccc; // actual | |
360 for(UChar32 c = 0; c <= 0xffff; ++c) { | |
361 if(CollationFCD::hasTccc(c)) { tccc.add(c); } | |
362 } | |
363 diff = expectedTccc; | |
364 diff.removeAll(tccc); | |
365 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP | |
366 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffStrin
g); | |
367 diff = tccc; | |
368 diff.removeAll(expectedTccc); | |
369 diff.toPattern(diffString, TRUE); | |
370 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffStrin
g); | |
371 } | |
372 } | |
373 | |
374 class CodePointIterator { | |
375 public: | |
376 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length
), pos(0) {} | |
377 void resetToStart() { pos = 0; } | |
378 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; } | |
379 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; } | |
380 int32_t getLength() const { return length; } | |
381 int getIndex() const { return (int)pos; } | |
382 private: | |
383 const UChar32 *cp; | |
384 int32_t length; | |
385 int32_t pos; | |
386 }; | |
387 | |
388 void CollationTest::checkFCD(const char *name, | |
389 CollationIterator &ci, CodePointIterator &cpi) { | |
390 IcuTestErrorCode errorCode(*this, "checkFCD"); | |
391 | |
392 // Iterate forward to the limit. | |
393 for(;;) { | |
394 UChar32 c1 = ci.nextCodePoint(errorCode); | |
395 UChar32 c2 = cpi.next(); | |
396 if(c1 != c2) { | |
397 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at
%d", | |
398 name, (long)c1, (long)c2, cpi.getIndex()); | |
399 return; | |
400 } | |
401 if(c1 < 0) { break; } | |
402 } | |
403 | |
404 // Iterate backward most of the way. | |
405 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) { | |
406 UChar32 c1 = ci.previousCodePoint(errorCode); | |
407 UChar32 c2 = cpi.previous(); | |
408 if(c1 != c2) { | |
409 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d", | |
410 name, (long)c1, (long)c2, cpi.getIndex()); | |
411 return; | |
412 } | |
413 } | |
414 | |
415 // Forward again. | |
416 for(;;) { | |
417 UChar32 c1 = ci.nextCodePoint(errorCode); | |
418 UChar32 c2 = cpi.next(); | |
419 if(c1 != c2) { | |
420 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d", | |
421 name, (long)c1, (long)c2, cpi.getIndex()); | |
422 return; | |
423 } | |
424 if(c1 < 0) { break; } | |
425 } | |
426 | |
427 // Iterate backward to the start. | |
428 for(;;) { | |
429 UChar32 c1 = ci.previousCodePoint(errorCode); | |
430 UChar32 c2 = cpi.previous(); | |
431 if(c1 != c2) { | |
432 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d", | |
433 name, (long)c1, (long)c2, cpi.getIndex()); | |
434 return; | |
435 } | |
436 if(c1 < 0) { break; } | |
437 } | |
438 } | |
439 | |
440 void CollationTest::TestFCD() { | |
441 IcuTestErrorCode errorCode(*this, "TestFCD"); | |
442 const CollationData *data = CollationRoot::getData(errorCode); | |
443 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | |
444 return; | |
445 } | |
446 | |
447 // Input string, not FCD, NUL-terminated. | |
448 static const UChar s[] = { | |
449 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62, | |
450 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1
D158 1D165, ccc=0, 216 | |
451 0x327, 0x308, // ccc=202, 230 | |
452 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGM
ENTATION DOT, ccc=226 | |
453 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), | |
454 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), | |
455 0xac01, | |
456 0xe7, // Character with tccc!=0 decomposed together with mis-ordered se
quence. | |
457 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D
165), | |
458 0xe1, // Character with tccc!=0 decomposed together with decomposed seq
uence. | |
459 0xf73, 0xf75, // Tibetan composite vowels must be decomposed. | |
460 0x4e00, 0xf81, | |
461 0 | |
462 }; | |
463 // Expected code points. | |
464 static const UChar32 cp[] = { | |
465 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, | |
466 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, | |
467 0x1D15F, 0x1D16D, | |
468 0xac01, | |
469 0x63, 0x327, 0x1D165, 0x1D16D, | |
470 0x61, | |
471 0xf71, 0xf71, 0xf72, 0xf74, 0x301, | |
472 0x4e00, 0xf71, 0xf80 | |
473 }; | |
474 | |
475 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL); | |
476 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor"))
{ | |
477 return; | |
478 } | |
479 CodePointIterator cpi(cp, UPRV_LENGTHOF(cp)); | |
480 checkFCD("FCDUTF16CollationIterator", u16ci, cpi); | |
481 | |
482 #if U_HAVE_STD_STRING | |
483 cpi.resetToStart(); | |
484 std::string utf8; | |
485 UnicodeString(s).toUTF8String(utf8); | |
486 FCDUTF8CollationIterator u8ci(data, FALSE, | |
487 reinterpret_cast<const uint8_t *>(utf8.c_str()
), 0, -1); | |
488 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) { | |
489 return; | |
490 } | |
491 checkFCD("FCDUTF8CollationIterator", u8ci, cpi); | |
492 #endif | |
493 | |
494 cpi.resetToStart(); | |
495 UCharIterator iter; | |
496 uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1); // -1: without the termina
ting NUL | |
497 FCDUIterCollationIterator uici(data, FALSE, iter, 0); | |
498 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor"))
{ | |
499 return; | |
500 } | |
501 checkFCD("FCDUIterCollationIterator", uici, cpi); | |
502 } | |
503 | |
504 void CollationTest::checkAllocWeights(CollationWeights &cw, | |
505 uint32_t lowerLimit, uint32_t upperLimit,
int32_t n, | |
506 int32_t someLength, int32_t minCount) { | |
507 if(!cw.allocWeights(lowerLimit, upperLimit, n)) { | |
508 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE", | |
509 (long)lowerLimit, (long)upperLimit, (long)n); | |
510 return; | |
511 } | |
512 uint32_t previous = lowerLimit; | |
513 int32_t count = 0; // number of weights that have someLength | |
514 for(int32_t i = 0; i < n; ++i) { | |
515 uint32_t w = cw.nextWeight(); | |
516 if(w == 0xffffffff) { | |
517 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " | |
518 "returns only %ld weights", | |
519 (long)lowerLimit, (long)upperLimit, (long)n, (long)i); | |
520 return; | |
521 } | |
522 if(!(previous < w && w < upperLimit)) { | |
523 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " | |
524 "number %ld -> %lx not between %lx and %lx", | |
525 (long)lowerLimit, (long)upperLimit, (long)n, | |
526 (long)(i + 1), (long)w, (long)previous, (long)upperLimit); | |
527 return; | |
528 } | |
529 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; } | |
530 } | |
531 if(count < minCount) { | |
532 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " | |
533 "returns only %ld < %ld weights of length %d", | |
534 (long)lowerLimit, (long)upperLimit, (long)n, | |
535 (long)count, (long)minCount, (int)someLength); | |
536 } | |
537 } | |
538 | |
539 void CollationTest::TestCollationWeights() { | |
540 CollationWeights cw; | |
541 | |
542 // Non-compressible primaries use 254 second bytes 02..FF. | |
543 logln("CollationWeights.initForPrimary(non-compressible)"); | |
544 cw.initForPrimary(FALSE); | |
545 // Expect 1 weight 11 and 254 weights 12xx. | |
546 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1); | |
547 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254); | |
548 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202. | |
549 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255); | |
550 // Expect 254 two-byte weights from the ranges 10ff and 11xx. | |
551 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254); | |
552 // Expect 254^2=64516 three-byte weights. | |
553 // During computation, there should be 3 three-byte ranges | |
554 // 10ffff, 11xxxx, 120202. | |
555 // The middle one should be split 64515:1, | |
556 // and the newly-split-off range and the last ranged lengthened. | |
557 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516)
; | |
558 // Expect weights 1102 & 1103. | |
559 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2); | |
560 // Expect weights 102102 & 102103. | |
561 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2); | |
562 | |
563 // Compressible primaries use 251 second bytes 04..FE. | |
564 logln("CollationWeights.initForPrimary(compressible)"); | |
565 cw.initForPrimary(TRUE); | |
566 // Expect 1 weight 11 and 251 weights 12xx. | |
567 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1); | |
568 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251); | |
569 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204. | |
570 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252); | |
571 // Expect weights 1104 & 1105. | |
572 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2); | |
573 // Expect weights 102102 & 102103. | |
574 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2); | |
575 | |
576 // Secondary and tertiary weights use only bytes 3 & 4. | |
577 logln("CollationWeights.initForSecondary()"); | |
578 cw.initForSecondary(); | |
579 // Expect weights fbxx and all four fc..ff. | |
580 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4); | |
581 | |
582 logln("CollationWeights.initForTertiary()"); | |
583 cw.initForTertiary(); | |
584 // Expect weights 3dxx and both 3e & 3f. | |
585 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2); | |
586 } | |
587 | |
588 namespace { | |
589 | |
590 UBool isValidCE(const CollationRootElements &re, const CollationData &data, | |
591 uint32_t p, uint32_t s, uint32_t ctq) { | |
592 uint32_t p1 = p >> 24; | |
593 uint32_t p2 = (p >> 16) & 0xff; | |
594 uint32_t p3 = (p >> 8) & 0xff; | |
595 uint32_t p4 = p & 0xff; | |
596 uint32_t s1 = s >> 8; | |
597 uint32_t s2 = s & 0xff; | |
598 // ctq = Case, Tertiary, Quaternary | |
599 uint32_t c = (ctq & Collation::CASE_MASK) >> 14; | |
600 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK; | |
601 uint32_t t1 = t >> 8; | |
602 uint32_t t2 = t & 0xff; | |
603 uint32_t q = ctq & Collation::QUATERNARY_MASK; | |
604 // No leading zero bytes. | |
605 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { | |
606 return FALSE; | |
607 } | |
608 // No intermediate zero bytes. | |
609 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { | |
610 return FALSE; | |
611 } | |
612 if(p2 != 0 && p3 == 0 && p4 != 0) { | |
613 return FALSE; | |
614 } | |
615 // Minimum & maximum lead bytes. | |
616 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) || | |
617 s1 == Collation::LEVEL_SEPARATOR_BYTE || | |
618 t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) { | |
619 return FALSE; | |
620 } | |
621 if(c > 2) { | |
622 return FALSE; | |
623 } | |
624 // The valid byte range for the second primary byte depends on compressibili
ty. | |
625 if(p2 != 0) { | |
626 if(data.isCompressibleLeadByte(p1)) { | |
627 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE || | |
628 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { | |
629 return FALSE; | |
630 } | |
631 } else { | |
632 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) { | |
633 return FALSE; | |
634 } | |
635 } | |
636 } | |
637 // Other bytes just need to avoid the level separator. | |
638 // Trailing zeros are ok. | |
639 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1); | |
640 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR
_BYTE || | |
641 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPA
RATOR_BYTE) { | |
642 return FALSE; | |
643 } | |
644 // Well-formed CEs. | |
645 if(p == 0) { | |
646 if(s == 0) { | |
647 if(t == 0) { | |
648 // Completely ignorable CE. | |
649 // Quaternary CEs are not supported. | |
650 if(c != 0 || q != 0) { | |
651 return FALSE; | |
652 } | |
653 } else { | |
654 // Tertiary CE. | |
655 if(t < re.getTertiaryBoundary() || c != 2) { | |
656 return FALSE; | |
657 } | |
658 } | |
659 } else { | |
660 // Secondary CE. | |
661 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBou
ndary()) { | |
662 return FALSE; | |
663 } | |
664 } | |
665 } else { | |
666 // Primary CE. | |
667 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSec
ondary()) || | |
668 s >= re.getSecondaryBoundary()) { | |
669 return FALSE; | |
670 } | |
671 if(t == 0 || t >= re.getTertiaryBoundary()) { | |
672 return FALSE; | |
673 } | |
674 } | |
675 return TRUE; | |
676 } | |
677 | |
678 UBool isValidCE(const CollationRootElements &re, const CollationData &data, int6
4_t ce) { | |
679 uint32_t p = (uint32_t)(ce >> 32); | |
680 uint32_t secTer = (uint32_t)ce; | |
681 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff); | |
682 } | |
683 | |
684 class RootElementsIterator { | |
685 public: | |
686 RootElementsIterator(const CollationData &root) | |
687 : data(root), | |
688 elements(root.rootElements), length(root.rootElementsLength), | |
689 pri(0), secTer(0), | |
690 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_I
NDEX]) {} | |
691 | |
692 UBool next() { | |
693 if(index >= length) { return FALSE; } | |
694 uint32_t p = elements[index]; | |
695 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; } | |
696 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { | |
697 ++index; | |
698 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG; | |
699 return TRUE; | |
700 } | |
701 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) { | |
702 // End of a range, enumerate the primaries in the range. | |
703 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK
; | |
704 p &= 0xffffff00; | |
705 if(pri == p) { | |
706 // Finished the range, return the next CE after it. | |
707 ++index; | |
708 return next(); | |
709 } | |
710 U_ASSERT(pri < p); | |
711 // Return the next primary in this range. | |
712 UBool isCompressible = data.isCompressiblePrimary(pri); | |
713 if((pri & 0xffff) == 0) { | |
714 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible,
step); | |
715 } else { | |
716 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible
, step); | |
717 } | |
718 return TRUE; | |
719 } | |
720 // Simple primary CE. | |
721 ++index; | |
722 pri = p; | |
723 // Does this have an explicit below-common sec/ter unit, | |
724 // or does it imply a common one? | |
725 if(index == length) { | |
726 secTer = Collation::COMMON_SEC_AND_TER_CE; | |
727 } else { | |
728 secTer = elements[index]; | |
729 if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) { | |
730 // No sec/ter delta. | |
731 secTer = Collation::COMMON_SEC_AND_TER_CE; | |
732 } else { | |
733 secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG; | |
734 if(secTer > Collation::COMMON_SEC_AND_TER_CE) { | |
735 // Implied sec/ter. | |
736 secTer = Collation::COMMON_SEC_AND_TER_CE; | |
737 } else { | |
738 // Explicit sec/ter below common/common. | |
739 ++index; | |
740 } | |
741 } | |
742 } | |
743 return TRUE; | |
744 } | |
745 | |
746 uint32_t getPrimary() const { return pri; } | |
747 uint32_t getSecTer() const { return secTer; } | |
748 | |
749 private: | |
750 const CollationData &data; | |
751 const uint32_t *elements; | |
752 int32_t length; | |
753 | |
754 uint32_t pri; | |
755 uint32_t secTer; | |
756 int32_t index; | |
757 }; | |
758 | |
759 } // namespace | |
760 | |
761 void CollationTest::TestRootElements() { | |
762 IcuTestErrorCode errorCode(*this, "TestRootElements"); | |
763 const CollationData *root = CollationRoot::getData(errorCode); | |
764 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | |
765 return; | |
766 } | |
767 CollationRootElements rootElements(root->rootElements, root->rootElementsLen
gth); | |
768 RootElementsIterator iter(*root); | |
769 | |
770 // We check each root CE for validity, | |
771 // and we also verify that there is a tailoring gap between each two CEs. | |
772 CollationWeights cw1c; // compressible primary weights | |
773 CollationWeights cw1u; // uncompressible primary weights | |
774 CollationWeights cw2; | |
775 CollationWeights cw3; | |
776 | |
777 cw1c.initForPrimary(TRUE); | |
778 cw1u.initForPrimary(FALSE); | |
779 cw2.initForSecondary(); | |
780 cw3.initForTertiary(); | |
781 | |
782 // Note: The root elements do not include Han-implicit or unassigned-implici
t CEs, | |
783 // nor the special merge-separator CE for U+FFFE. | |
784 uint32_t prevPri = 0; | |
785 uint32_t prevSec = 0; | |
786 uint32_t prevTer = 0; | |
787 while(iter.next()) { | |
788 uint32_t pri = iter.getPrimary(); | |
789 uint32_t secTer = iter.getSecTer(); | |
790 // CollationRootElements CEs must have 0 case and quaternary bits. | |
791 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) { | |
792 errln("CollationRootElements CE has non-zero case and/or quaternary
bits: %08lx %08lx", | |
793 (long)pri, (long)secTer); | |
794 } | |
795 uint32_t sec = secTer >> 16; | |
796 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK; | |
797 uint32_t ctq = ter; | |
798 if(pri == 0 && sec == 0 && ter != 0) { | |
799 // Tertiary CEs must have uppercase bits, | |
800 // but they are not stored in the CollationRootElements. | |
801 ctq |= 0x8000; | |
802 } | |
803 if(!isValidCE(rootElements, *root, pri, sec, ctq)) { | |
804 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer); | |
805 } else { | |
806 if(pri != prevPri) { | |
807 uint32_t newWeight = 0; | |
808 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) { | |
809 // There is currently no tailoring gap after primary ignorab
les, | |
810 // and we forbid tailoring after U+FFFD and U+FFFF. | |
811 } else if(root->isCompressiblePrimary(prevPri)) { | |
812 if(!cw1c.allocWeights(prevPri, pri, 1)) { | |
813 errln("no primary/compressible tailoring gap between %08
lx and %08lx", | |
814 (long)prevPri, (long)pri); | |
815 } else { | |
816 newWeight = cw1c.nextWeight(); | |
817 } | |
818 } else { | |
819 if(!cw1u.allocWeights(prevPri, pri, 1)) { | |
820 errln("no primary/uncompressible tailoring gap between %
08lx and %08lx", | |
821 (long)prevPri, (long)pri); | |
822 } else { | |
823 newWeight = cw1u.nextWeight(); | |
824 } | |
825 } | |
826 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri))
{ | |
827 errln("mis-allocated primary weight, should get %08lx < %08l
x < %08lx", | |
828 (long)prevPri, (long)newWeight, (long)pri); | |
829 } | |
830 } else if(sec != prevSec) { | |
831 uint32_t lowerLimit = | |
832 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 :
prevSec; | |
833 if(!cw2.allocWeights(lowerLimit, sec, 1)) { | |
834 errln("no secondary tailoring gap between %04x and %04x", lo
werLimit, sec); | |
835 } else { | |
836 uint32_t newWeight = cw2.nextWeight(); | |
837 if(!(prevSec < newWeight && newWeight < sec)) { | |
838 errln("mis-allocated secondary weight, should get %04x <
%04x < %04x", | |
839 (long)lowerLimit, (long)newWeight, (long)sec); | |
840 } | |
841 } | |
842 } else if(ter != prevTer) { | |
843 uint32_t lowerLimit = | |
844 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 :
prevTer; | |
845 if(!cw3.allocWeights(lowerLimit, ter, 1)) { | |
846 errln("no teriary tailoring gap between %04x and %04x", lowe
rLimit, ter); | |
847 } else { | |
848 uint32_t newWeight = cw3.nextWeight(); | |
849 if(!(prevTer < newWeight && newWeight < ter)) { | |
850 errln("mis-allocated secondary weight, should get %04x <
%04x < %04x", | |
851 (long)lowerLimit, (long)newWeight, (long)ter); | |
852 } | |
853 } | |
854 } else { | |
855 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer); | |
856 } | |
857 } | |
858 prevPri = pri; | |
859 prevSec = sec; | |
860 prevTer = ter; | |
861 } | |
862 } | |
863 | |
864 void CollationTest::TestTailoredElements() { | |
865 IcuTestErrorCode errorCode(*this, "TestTailoredElements"); | |
866 const CollationData *root = CollationRoot::getData(errorCode); | |
867 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | |
868 return; | |
869 } | |
870 CollationRootElements rootElements(root->rootElements, root->rootElementsLen
gth); | |
871 | |
872 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NU
LL, errorCode); | |
873 if(errorCode.logIfFailureAndReset("failed to create a hash table")) { | |
874 return; | |
875 } | |
876 uhash_setKeyDeleter(prevLocales, uprv_free); | |
877 // TestRootElements() tests the root collator which does not have tailorings
. | |
878 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode); | |
879 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode); | |
880 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode
); | |
881 | |
882 UVector64 ces(errorCode); | |
883 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales()); | |
884 U_ASSERT(locales.isValid()); | |
885 const char *localeID = "root"; | |
886 do { | |
887 Locale locale(localeID); | |
888 LocalPointer<StringEnumeration> types( | |
889 Collator::getKeywordValuesForLocale("collation", locale, FALSE,
errorCode)); | |
890 errorCode.assertSuccess(); | |
891 const char *type; // first: default type | |
892 while((type = types->next(NULL, errorCode)) != NULL) { | |
893 if(strncmp(type, "private-", 8) == 0) { | |
894 errln("Collator::getKeywordValuesForLocale(%s) returns private c
ollation keyword: %s", | |
895 localeID, type); | |
896 } | |
897 Locale localeWithType(locale); | |
898 localeWithType.setKeywordValue("collation", type, errorCode); | |
899 errorCode.assertSuccess(); | |
900 LocalPointer<Collator> coll(Collator::createInstance(localeWithType,
errorCode)); | |
901 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)", | |
902 localeWithType.getName())) { | |
903 continue; | |
904 } | |
905 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode); | |
906 if(uhash_geti(prevLocales, actual.getName()) != 0) { | |
907 continue; | |
908 } | |
909 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode)
; | |
910 errorCode.assertSuccess(); | |
911 logln("TestTailoredElements(): requested %s -> actual %s", | |
912 localeWithType.getName(), actual.getName()); | |
913 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getA
lias()); | |
914 if(rbc == NULL) { | |
915 continue; | |
916 } | |
917 // Note: It would be better to get tailored strings such that we can | |
918 // identify the prefix, and only get the CEs for the prefix+string, | |
919 // not also for the prefix. | |
920 // There is currently no API for that. | |
921 // It would help in an unusual case where a contraction starting in
the prefix | |
922 // extends past its end, and we do not see the intended mapping. | |
923 // For example, for a mapping p|st, if there is also a contraction p
s, | |
924 // then we get CEs(ps)+CEs(t), rather than CEs(p|st). | |
925 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode)); | |
926 errorCode.assertSuccess(); | |
927 UnicodeSetIterator iter(*tailored); | |
928 while(iter.next()) { | |
929 const UnicodeString &s = iter.getString(); | |
930 ces.removeAllElements(); | |
931 rbc->internalGetCEs(s, ces, errorCode); | |
932 errorCode.assertSuccess(); | |
933 for(int32_t i = 0; i < ces.size(); ++i) { | |
934 int64_t ce = ces.elementAti(i); | |
935 if(!isValidCE(rootElements, *root, ce)) { | |
936 errln("invalid tailored CE %016llx at CE index %d from s
tring:", | |
937 (long long)ce, (int)i); | |
938 infoln(prettify(s)); | |
939 } | |
940 } | |
941 } | |
942 } | |
943 } while((localeID = locales->next(NULL, errorCode)) != NULL); | |
944 uhash_close(prevLocales); | |
945 } | |
946 | |
947 UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) { | |
948 UnicodeString s; | |
949 for(int32_t i = 0; i < length; ++i) { | |
950 if(i > 0) { s.append((UChar)0x20); } | |
951 uint8_t b = p[i]; | |
952 if(b == 0) { | |
953 s.append((UChar)0x2e); // period | |
954 } else if(b == 1) { | |
955 s.append((UChar)0x7c); // vertical bar | |
956 } else { | |
957 appendHex(b, 2, s); | |
958 } | |
959 } | |
960 return s; | |
961 } | |
962 | |
963 UnicodeString CollationTest::printCollationKey(const CollationKey &key) { | |
964 int32_t length; | |
965 const uint8_t *p = key.getByteArray(length); | |
966 return printSortKey(p, length); | |
967 } | |
968 | |
969 UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode)
{ | |
970 for(;;) { | |
971 int32_t lineLength; | |
972 const UChar *line = ucbuf_readline(f, &lineLength, errorCode); | |
973 if(line == NULL || errorCode.isFailure()) { | |
974 fileLine.remove(); | |
975 return FALSE; | |
976 } | |
977 ++fileLineNumber; | |
978 // Strip trailing CR/LF, comments, and spaces. | |
979 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#' | |
980 if(comment != NULL) { | |
981 lineLength = (int32_t)(comment - line); | |
982 } else { | |
983 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLeng
th; } | |
984 } | |
985 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; } | |
986 if(lineLength != 0) { | |
987 fileLine.setTo(FALSE, line, lineLength); | |
988 return TRUE; | |
989 } | |
990 // Empty line, continue. | |
991 } | |
992 } | |
993 | |
994 void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeSt
ring &s, | |
995 UErrorCode &errorCode) { | |
996 int32_t length = fileLine.length(); | |
997 int32_t i; | |
998 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {} | |
999 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|' | |
1000 if(pipeIndex >= 0) { | |
1001 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape(); | |
1002 if(prefix.isEmpty()) { | |
1003 errln("empty prefix on line %d", (int)fileLineNumber); | |
1004 infoln(fileLine); | |
1005 errorCode = U_PARSE_ERROR; | |
1006 return; | |
1007 } | |
1008 start = pipeIndex + 1; | |
1009 } else { | |
1010 prefix.remove(); | |
1011 } | |
1012 s = fileLine.tempSubStringBetween(start, i).unescape(); | |
1013 if(s.isEmpty()) { | |
1014 errln("empty string on line %d", (int)fileLineNumber); | |
1015 infoln(fileLine); | |
1016 errorCode = U_PARSE_ERROR; | |
1017 return; | |
1018 } | |
1019 start = i; | |
1020 } | |
1021 | |
1022 Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTest
ErrorCode &errorCode) { | |
1023 Collation::Level relation; | |
1024 int32_t start; | |
1025 if(fileLine[0] == 0x3c) { // < | |
1026 UChar second = fileLine[1]; | |
1027 start = 2; | |
1028 switch(second) { | |
1029 case 0x31: // <1 | |
1030 relation = Collation::PRIMARY_LEVEL; | |
1031 break; | |
1032 case 0x32: // <2 | |
1033 relation = Collation::SECONDARY_LEVEL; | |
1034 break; | |
1035 case 0x33: // <3 | |
1036 relation = Collation::TERTIARY_LEVEL; | |
1037 break; | |
1038 case 0x34: // <4 | |
1039 relation = Collation::QUATERNARY_LEVEL; | |
1040 break; | |
1041 case 0x63: // <c | |
1042 relation = Collation::CASE_LEVEL; | |
1043 break; | |
1044 case 0x69: // <i | |
1045 relation = Collation::IDENTICAL_LEVEL; | |
1046 break; | |
1047 default: // just < | |
1048 relation = Collation::NO_LEVEL; | |
1049 start = 1; | |
1050 break; | |
1051 } | |
1052 } else if(fileLine[0] == 0x3d) { // = | |
1053 relation = Collation::ZERO_LEVEL; | |
1054 start = 1; | |
1055 } else { | |
1056 start = 0; | |
1057 } | |
1058 if(start == 0 || !isSpace(fileLine[start])) { | |
1059 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (in
t)fileLineNumber); | |
1060 infoln(fileLine); | |
1061 errorCode.set(U_PARSE_ERROR); | |
1062 return Collation::NO_LEVEL; | |
1063 } | |
1064 start = skipSpaces(start); | |
1065 UnicodeString prefix; | |
1066 parseString(start, prefix, s, errorCode); | |
1067 if(errorCode.isSuccess() && !prefix.isEmpty()) { | |
1068 errln("prefix string not allowed for test string: on line %d", (int)file
LineNumber); | |
1069 infoln(fileLine); | |
1070 errorCode.set(U_PARSE_ERROR); | |
1071 return Collation::NO_LEVEL; | |
1072 } | |
1073 if(start < fileLine.length()) { | |
1074 errln("unexpected line contents after test string on line %d", (int)file
LineNumber); | |
1075 infoln(fileLine); | |
1076 errorCode.set(U_PARSE_ERROR); | |
1077 return Collation::NO_LEVEL; | |
1078 } | |
1079 return relation; | |
1080 } | |
1081 | |
1082 static const struct { | |
1083 const char *name; | |
1084 UColAttribute attr; | |
1085 } attributes[] = { | |
1086 { "backwards", UCOL_FRENCH_COLLATION }, | |
1087 { "alternate", UCOL_ALTERNATE_HANDLING }, | |
1088 { "caseFirst", UCOL_CASE_FIRST }, | |
1089 { "caseLevel", UCOL_CASE_LEVEL }, | |
1090 // UCOL_NORMALIZATION_MODE is turned on and off automatically. | |
1091 { "strength", UCOL_STRENGTH }, | |
1092 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated. | |
1093 { "numeric", UCOL_NUMERIC_COLLATION } | |
1094 }; | |
1095 | |
1096 static const struct { | |
1097 const char *name; | |
1098 UColAttributeValue value; | |
1099 } attributeValues[] = { | |
1100 { "default", UCOL_DEFAULT }, | |
1101 { "primary", UCOL_PRIMARY }, | |
1102 { "secondary", UCOL_SECONDARY }, | |
1103 { "tertiary", UCOL_TERTIARY }, | |
1104 { "quaternary", UCOL_QUATERNARY }, | |
1105 { "identical", UCOL_IDENTICAL }, | |
1106 { "off", UCOL_OFF }, | |
1107 { "on", UCOL_ON }, | |
1108 { "shifted", UCOL_SHIFTED }, | |
1109 { "non-ignorable", UCOL_NON_IGNORABLE }, | |
1110 { "lower", UCOL_LOWER_FIRST }, | |
1111 { "upper", UCOL_UPPER_FIRST } | |
1112 }; | |
1113 | |
1114 void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) { | |
1115 // Parse attributes even if the Collator could not be created, | |
1116 // in order to report syntax errors. | |
1117 int32_t start = skipSpaces(1); | |
1118 int32_t equalPos = fileLine.indexOf(0x3d); | |
1119 if(equalPos < 0) { | |
1120 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) { | |
1121 parseAndSetReorderCodes(start + 7, errorCode); | |
1122 return; | |
1123 } | |
1124 errln("missing '=' on line %d", (int)fileLineNumber); | |
1125 infoln(fileLine); | |
1126 errorCode.set(U_PARSE_ERROR); | |
1127 return; | |
1128 } | |
1129 | |
1130 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos); | |
1131 UnicodeString valueString = fileLine.tempSubString(equalPos+1); | |
1132 if(attrString == UNICODE_STRING("maxVariable", 11)) { | |
1133 UColReorderCode max; | |
1134 if(valueString == UNICODE_STRING("space", 5)) { | |
1135 max = UCOL_REORDER_CODE_SPACE; | |
1136 } else if(valueString == UNICODE_STRING("punct", 5)) { | |
1137 max = UCOL_REORDER_CODE_PUNCTUATION; | |
1138 } else if(valueString == UNICODE_STRING("symbol", 6)) { | |
1139 max = UCOL_REORDER_CODE_SYMBOL; | |
1140 } else if(valueString == UNICODE_STRING("currency", 8)) { | |
1141 max = UCOL_REORDER_CODE_CURRENCY; | |
1142 } else { | |
1143 errln("invalid attribute value name on line %d", (int)fileLineNumber
); | |
1144 infoln(fileLine); | |
1145 errorCode.set(U_PARSE_ERROR); | |
1146 return; | |
1147 } | |
1148 if(coll != NULL) { | |
1149 coll->setMaxVariable(max, errorCode); | |
1150 if(errorCode.isFailure()) { | |
1151 errln("setMaxVariable() failed on line %d: %s", | |
1152 (int)fileLineNumber, errorCode.errorName()); | |
1153 infoln(fileLine); | |
1154 return; | |
1155 } | |
1156 } | |
1157 fileLine.remove(); | |
1158 return; | |
1159 } | |
1160 | |
1161 UColAttribute attr; | |
1162 for(int32_t i = 0;; ++i) { | |
1163 if(i == UPRV_LENGTHOF(attributes)) { | |
1164 errln("invalid attribute name on line %d", (int)fileLineNumber); | |
1165 infoln(fileLine); | |
1166 errorCode.set(U_PARSE_ERROR); | |
1167 return; | |
1168 } | |
1169 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) { | |
1170 attr = attributes[i].attr; | |
1171 break; | |
1172 } | |
1173 } | |
1174 | |
1175 UColAttributeValue value; | |
1176 for(int32_t i = 0;; ++i) { | |
1177 if(i == UPRV_LENGTHOF(attributeValues)) { | |
1178 errln("invalid attribute value name on line %d", (int)fileLineNumber
); | |
1179 infoln(fileLine); | |
1180 errorCode.set(U_PARSE_ERROR); | |
1181 return; | |
1182 } | |
1183 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) { | |
1184 value = attributeValues[i].value; | |
1185 break; | |
1186 } | |
1187 } | |
1188 | |
1189 if(coll != NULL) { | |
1190 coll->setAttribute(attr, value, errorCode); | |
1191 if(errorCode.isFailure()) { | |
1192 errln("illegal attribute=value combination on line %d: %s", | |
1193 (int)fileLineNumber, errorCode.errorName()); | |
1194 infoln(fileLine); | |
1195 return; | |
1196 } | |
1197 } | |
1198 fileLine.remove(); | |
1199 } | |
1200 | |
1201 void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &err
orCode) { | |
1202 UVector32 reorderCodes(errorCode); | |
1203 while(start < fileLine.length()) { | |
1204 start = skipSpaces(start); | |
1205 int32_t limit = start; | |
1206 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit;
} | |
1207 CharString name; | |
1208 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), e
rrorCode); | |
1209 int32_t code = CollationRuleParser::getReorderCode(name.data()); | |
1210 if(code < 0) { | |
1211 if(uprv_stricmp(name.data(), "default") == 0) { | |
1212 code = UCOL_REORDER_CODE_DEFAULT; // -1 | |
1213 } else { | |
1214 errln("invalid reorder code '%s' on line %d", name.data(), (int)
fileLineNumber); | |
1215 infoln(fileLine); | |
1216 errorCode.set(U_PARSE_ERROR); | |
1217 return; | |
1218 } | |
1219 } | |
1220 reorderCodes.addElement(code, errorCode); | |
1221 start = limit; | |
1222 } | |
1223 if(coll != NULL) { | |
1224 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), err
orCode); | |
1225 if(errorCode.isFailure()) { | |
1226 errln("setReorderCodes() failed on line %d: %s", | |
1227 (int)fileLineNumber, errorCode.errorName()); | |
1228 infoln(fileLine); | |
1229 return; | |
1230 } | |
1231 } | |
1232 fileLine.remove(); | |
1233 } | |
1234 | |
1235 void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) { | |
1236 UnicodeString rules; | |
1237 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) { | |
1238 rules.append(fileLine.unescape()); | |
1239 } | |
1240 if(errorCode.isFailure()) { return; } | |
1241 logln(rules); | |
1242 | |
1243 UParseError parseError; | |
1244 UnicodeString reason; | |
1245 delete coll; | |
1246 coll = new RuleBasedCollator(rules, parseError, reason, errorCode); | |
1247 if(coll == NULL) { | |
1248 errln("unable to allocate a new collator"); | |
1249 errorCode.set(U_MEMORY_ALLOCATION_ERROR); | |
1250 return; | |
1251 } | |
1252 if(errorCode.isFailure()) { | |
1253 dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName())
; | |
1254 infoln(UnicodeString(" reason: ") + reason); | |
1255 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseErro
r.offset); } | |
1256 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { | |
1257 infoln(UnicodeString(" snippet: ...") + | |
1258 parseError.preContext + "(!)" + parseError.postContext + "..."); | |
1259 } | |
1260 delete coll; | |
1261 coll = NULL; | |
1262 errorCode.reset(); | |
1263 } else { | |
1264 assertEquals("no error reason when RuleBasedCollator(rules) succeeds", | |
1265 UnicodeString(), reason); | |
1266 } | |
1267 } | |
1268 | |
1269 void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) { | |
1270 if(errorCode.isFailure()) { return; } | |
1271 delete coll; | |
1272 coll = Collator::createInstance(Locale::getRoot(), errorCode); | |
1273 if(errorCode.isFailure()) { | |
1274 dataerrln("unable to create a root collator"); | |
1275 return; | |
1276 } | |
1277 } | |
1278 | |
1279 void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) { | |
1280 if(errorCode.isFailure()) { return; } | |
1281 delete coll; | |
1282 coll = NULL; | |
1283 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant | |
1284 if(at >= 0) { | |
1285 fileLine.setCharAt(at, (UChar)0x2a); // * | |
1286 } | |
1287 CharString localeID; | |
1288 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode); | |
1289 if(at >= 0) { | |
1290 localeID.data()[at - 9] = '@'; | |
1291 } | |
1292 Locale locale(localeID.data()); | |
1293 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) { | |
1294 errln("invalid language tag on line %d", (int)fileLineNumber); | |
1295 infoln(fileLine); | |
1296 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); } | |
1297 return; | |
1298 } | |
1299 | |
1300 logln("creating a collator for locale ID %s", locale.getName()); | |
1301 coll = Collator::createInstance(locale, errorCode); | |
1302 if(errorCode.isFailure()) { | |
1303 dataerrln("unable to create a collator for locale %s on line %d", | |
1304 locale.getName(), (int)fileLineNumber); | |
1305 infoln(fileLine); | |
1306 delete coll; | |
1307 coll = NULL; | |
1308 errorCode.reset(); | |
1309 } | |
1310 } | |
1311 | |
1312 UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &erro
rCode) const { | |
1313 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE;
} | |
1314 // In some sequences with Tibetan composite vowel signs, | |
1315 // even if the string passes the FCD check, | |
1316 // those composites must be decomposed. | |
1317 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. | |
1318 int32_t index = 0; | |
1319 while((index = s.indexOf((UChar)0xf71, index)) >= 0) { | |
1320 if(++index < s.length()) { | |
1321 UChar c = s[index]; | |
1322 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; } | |
1323 } | |
1324 } | |
1325 return FALSE; | |
1326 } | |
1327 | |
1328 UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length, | |
1329 CharString &dest, int32_t partSize, | |
1330 IcuTestErrorCode &errorCode) { | |
1331 if(errorCode.isFailure()) { return FALSE; } | |
1332 uint8_t part[32]; | |
1333 U_ASSERT(partSize <= UPRV_LENGTHOF(part)); | |
1334 UCharIterator iter; | |
1335 uiter_setString(&iter, s, length); | |
1336 uint32_t state[2] = { 0, 0 }; | |
1337 for(;;) { | |
1338 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, p
artSize, errorCode); | |
1339 UBool done = partLength < partSize; | |
1340 if(done) { | |
1341 // At the end, append the next byte as well which should be 00. | |
1342 ++partLength; | |
1343 } | |
1344 dest.append(reinterpret_cast<char *>(part), partLength, errorCode); | |
1345 if(done) { | |
1346 return errorCode.isSuccess(); | |
1347 } | |
1348 } | |
1349 } | |
1350 | |
1351 UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line
, | |
1352 const UChar *s, int32_t length, | |
1353 CollationKey &key, IcuTestErrorCode &errorC
ode) { | |
1354 if(errorCode.isFailure()) { return FALSE; } | |
1355 coll->getCollationKey(s, length, key, errorCode); | |
1356 if(errorCode.isFailure()) { | |
1357 infoln(fileTestName); | |
1358 errln("Collator(%s).getCollationKey() failed: %s", | |
1359 norm, errorCode.errorName()); | |
1360 infoln(line); | |
1361 return FALSE; | |
1362 } | |
1363 int32_t keyLength; | |
1364 const uint8_t *keyBytes = key.getByteArray(keyLength); | |
1365 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) { | |
1366 infoln(fileTestName); | |
1367 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key
", | |
1368 norm); | |
1369 infoln(line); | |
1370 infoln(printCollationKey(key)); | |
1371 return FALSE; | |
1372 } | |
1373 | |
1374 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode); | |
1375 if(numLevels < UCOL_IDENTICAL) { | |
1376 ++numLevels; | |
1377 } else { | |
1378 numLevels = 5; | |
1379 } | |
1380 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) { | |
1381 ++numLevels; | |
1382 } | |
1383 errorCode.assertSuccess(); | |
1384 int32_t numLevelSeparators = 0; | |
1385 for(int32_t i = 0; i < (keyLength - 1); ++i) { | |
1386 uint8_t b = keyBytes[i]; | |
1387 if(b == 0) { | |
1388 infoln(fileTestName); | |
1389 errln("Collator(%s).getCollationKey() contains a 00 byte", norm); | |
1390 infoln(line); | |
1391 infoln(printCollationKey(key)); | |
1392 return FALSE; | |
1393 } | |
1394 if(b == 1) { ++numLevelSeparators; } | |
1395 } | |
1396 if(numLevelSeparators != (numLevels - 1)) { | |
1397 infoln(fileTestName); | |
1398 errln("Collator(%s).getCollationKey() has %d level separators for %d lev
els", | |
1399 norm, (int)numLevelSeparators, (int)numLevels); | |
1400 infoln(line); | |
1401 infoln(printCollationKey(key)); | |
1402 return FALSE; | |
1403 } | |
1404 | |
1405 // Check that internalNextSortKeyPart() makes the same key, with several par
t sizes. | |
1406 static const int32_t partSizes[] = { 32, 3, 1 }; | |
1407 for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) { | |
1408 int32_t partSize = partSizes[psi]; | |
1409 CharString parts; | |
1410 if(!getSortKeyParts(s, length, parts, 32, errorCode)) { | |
1411 infoln(fileTestName); | |
1412 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s", | |
1413 norm, (int)partSize, errorCode.errorName()); | |
1414 infoln(line); | |
1415 return FALSE; | |
1416 } | |
1417 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), ke
yLength) != 0) { | |
1418 infoln(fileTestName); | |
1419 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)
", | |
1420 norm, (int)partSize); | |
1421 infoln(line); | |
1422 infoln(printCollationKey(key)); | |
1423 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts
.length())); | |
1424 return FALSE; | |
1425 } | |
1426 } | |
1427 return TRUE; | |
1428 } | |
1429 | |
1430 /** | |
1431 * Changes the key to the merged segments of the U+FFFE-separated substrings of
s. | |
1432 * Leaves key unchanged if s does not contain U+FFFE. | |
1433 * @return TRUE if the key was successfully changed | |
1434 */ | |
1435 UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length, | |
1436 CollationKey &key, IcuTestErrorCode &
errorCode) { | |
1437 if(errorCode.isFailure()) { return FALSE; } | |
1438 LocalMemory<uint8_t> mergedKey; | |
1439 int32_t mergedKeyLength = 0; | |
1440 int32_t mergedKeyCapacity = 0; | |
1441 int32_t sLength = (length >= 0) ? length : u_strlen(s); | |
1442 int32_t segmentStart = 0; | |
1443 for(int32_t i = 0;;) { | |
1444 if(i == sLength) { | |
1445 if(segmentStart == 0) { | |
1446 // s does not contain any U+FFFE. | |
1447 return FALSE; | |
1448 } | |
1449 } else if(s[i] != 0xfffe) { | |
1450 ++i; | |
1451 continue; | |
1452 } | |
1453 // Get the sort key for another segment and merge it into mergedKey. | |
1454 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the
bytes | |
1455 CollationKey key2; | |
1456 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCod
e); | |
1457 int32_t key1Length, key2Length; | |
1458 const uint8_t *key1Bytes = key1.getByteArray(key1Length); | |
1459 const uint8_t *key2Bytes = key2.getByteArray(key2Length); | |
1460 uint8_t *dest; | |
1461 int32_t minCapacity = key1Length + key2Length; | |
1462 if(key1Length > 0) { --minCapacity; } | |
1463 if(minCapacity <= mergedKeyCapacity) { | |
1464 dest = mergedKey.getAlias(); | |
1465 } else { | |
1466 if(minCapacity <= 200) { | |
1467 mergedKeyCapacity = 200; | |
1468 } else if(minCapacity <= 2 * mergedKeyCapacity) { | |
1469 mergedKeyCapacity *= 2; | |
1470 } else { | |
1471 mergedKeyCapacity = minCapacity; | |
1472 } | |
1473 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity); | |
1474 } | |
1475 U_ASSERT(dest != NULL || mergedKeyCapacity == 0); | |
1476 if(key1Length == 0) { | |
1477 // key2 is the sort key for the first segment. | |
1478 uprv_memcpy(dest, key2Bytes, key2Length); | |
1479 mergedKeyLength = key2Length; | |
1480 } else { | |
1481 mergedKeyLength = | |
1482 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length, | |
1483 dest, mergedKeyCapacity); | |
1484 } | |
1485 if(i == sLength) { break; } | |
1486 segmentStart = ++i; | |
1487 } | |
1488 key = CollationKey(mergedKey.getAlias(), mergedKeyLength); | |
1489 return TRUE; | |
1490 } | |
1491 | |
1492 namespace { | |
1493 | |
1494 /** | |
1495 * Replaces unpaired surrogates with U+FFFD. | |
1496 * Returns s if no replacement was made, otherwise buffer. | |
1497 */ | |
1498 const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buf
fer) { | |
1499 int32_t i = 0; | |
1500 while(i < s.length()) { | |
1501 UChar32 c = s.char32At(i); | |
1502 if(U_IS_SURROGATE(c)) { | |
1503 if(buffer.length() < i) { | |
1504 buffer.append(s, buffer.length(), i - buffer.length()); | |
1505 } | |
1506 buffer.append((UChar)0xfffd); | |
1507 } | |
1508 i += U16_LENGTH(c); | |
1509 } | |
1510 if(buffer.isEmpty()) { | |
1511 return s; | |
1512 } | |
1513 if(buffer.length() < i) { | |
1514 buffer.append(s, buffer.length(), i - buffer.length()); | |
1515 } | |
1516 return buffer; | |
1517 } | |
1518 | |
1519 int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key, | |
1520 UCollationResult order, UBool collHasCaseLevel) { | |
1521 if(order == UCOL_EQUAL) { | |
1522 return Collation::NO_LEVEL; | |
1523 } | |
1524 int32_t prevKeyLength; | |
1525 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength); | |
1526 int32_t keyLength; | |
1527 const uint8_t *bytes = key.getByteArray(keyLength); | |
1528 int32_t level = Collation::PRIMARY_LEVEL; | |
1529 for(int32_t i = 0;; ++i) { | |
1530 uint8_t b = prevBytes[i]; | |
1531 if(b != bytes[i]) { break; } | |
1532 if(b == Collation::LEVEL_SEPARATOR_BYTE) { | |
1533 ++level; | |
1534 if(level == Collation::CASE_LEVEL && !collHasCaseLevel) { | |
1535 ++level; | |
1536 } | |
1537 } | |
1538 } | |
1539 return level; | |
1540 } | |
1541 | |
1542 } | |
1543 | |
1544 UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prev
FileLine, | |
1545 const UnicodeString &prevString, const Unic
odeString &s, | |
1546 UCollationResult expectedOrder, Collation::
Level expectedLevel, | |
1547 IcuTestErrorCode &errorCode) { | |
1548 if(errorCode.isFailure()) { return FALSE; } | |
1549 | |
1550 // Get the sort keys first, for error debug output. | |
1551 CollationKey prevKey; | |
1552 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.l
ength(), | |
1553 prevKey, errorCode)) { | |
1554 return FALSE; | |
1555 } | |
1556 CollationKey key; | |
1557 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCod
e)) { return FALSE; } | |
1558 | |
1559 UCollationResult order = coll->compare(prevString, s, errorCode); | |
1560 if(order != expectedOrder || errorCode.isFailure()) { | |
1561 infoln(fileTestName); | |
1562 errln("line %d Collator(%s).compare(previous, current) wrong order: %d !
= %d (%s)", | |
1563 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa
me()); | |
1564 infoln(prevFileLine); | |
1565 infoln(fileLine); | |
1566 infoln(printCollationKey(prevKey)); | |
1567 infoln(printCollationKey(key)); | |
1568 return FALSE; | |
1569 } | |
1570 order = coll->compare(s, prevString, errorCode); | |
1571 if(order != -expectedOrder || errorCode.isFailure()) { | |
1572 infoln(fileTestName); | |
1573 errln("line %d Collator(%s).compare(current, previous) wrong order: %d !
= %d (%s)", | |
1574 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorN
ame()); | |
1575 infoln(prevFileLine); | |
1576 infoln(fileLine); | |
1577 infoln(printCollationKey(prevKey)); | |
1578 infoln(printCollationKey(key)); | |
1579 return FALSE; | |
1580 } | |
1581 // Test NUL-termination if the strings do not contain NUL characters. | |
1582 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0)
>= 0; | |
1583 if(!containNUL) { | |
1584 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, err
orCode); | |
1585 if(order != expectedOrder || errorCode.isFailure()) { | |
1586 infoln(fileTestName); | |
1587 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong
order: %d != %d (%s)", | |
1588 (int)fileLineNumber, norm, order, expectedOrder, errorCode.err
orName()); | |
1589 infoln(prevFileLine); | |
1590 infoln(fileLine); | |
1591 infoln(printCollationKey(prevKey)); | |
1592 infoln(printCollationKey(key)); | |
1593 return FALSE; | |
1594 } | |
1595 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, err
orCode); | |
1596 if(order != -expectedOrder || errorCode.isFailure()) { | |
1597 infoln(fileTestName); | |
1598 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong
order: %d != %d (%s)", | |
1599 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.er
rorName()); | |
1600 infoln(prevFileLine); | |
1601 infoln(fileLine); | |
1602 infoln(printCollationKey(prevKey)); | |
1603 infoln(printCollationKey(key)); | |
1604 return FALSE; | |
1605 } | |
1606 } | |
1607 | |
1608 #if U_HAVE_STD_STRING | |
1609 // compare(UTF-16) treats unpaired surrogates like unassigned code points. | |
1610 // Unpaired surrogates cannot be converted to UTF-8. | |
1611 // Create valid UTF-16 strings if necessary, and use those for | |
1612 // both the expected compare() result and for the input to compare(UTF-8). | |
1613 UnicodeString prevBuffer, sBuffer; | |
1614 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer); | |
1615 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer); | |
1616 std::string prevUTF8, sUTF8; | |
1617 UnicodeString(prevValid).toUTF8String(prevUTF8); | |
1618 UnicodeString(sValid).toUTF8String(sUTF8); | |
1619 UCollationResult expectedUTF8Order; | |
1620 if(&prevValid == &prevString && &sValid == &s) { | |
1621 expectedUTF8Order = expectedOrder; | |
1622 } else { | |
1623 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode); | |
1624 } | |
1625 | |
1626 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode); | |
1627 if(order != expectedUTF8Order || errorCode.isFailure()) { | |
1628 infoln(fileTestName); | |
1629 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order:
%d != %d (%s)", | |
1630 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.err
orName()); | |
1631 infoln(prevFileLine); | |
1632 infoln(fileLine); | |
1633 infoln(printCollationKey(prevKey)); | |
1634 infoln(printCollationKey(key)); | |
1635 return FALSE; | |
1636 } | |
1637 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode); | |
1638 if(order != -expectedUTF8Order || errorCode.isFailure()) { | |
1639 infoln(fileTestName); | |
1640 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order:
%d != %d (%s)", | |
1641 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.er
rorName()); | |
1642 infoln(prevFileLine); | |
1643 infoln(fileLine); | |
1644 infoln(printCollationKey(prevKey)); | |
1645 infoln(printCollationKey(key)); | |
1646 return FALSE; | |
1647 } | |
1648 // Test NUL-termination if the strings do not contain NUL characters. | |
1649 if(!containNUL) { | |
1650 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -
1, errorCode); | |
1651 if(order != expectedUTF8Order || errorCode.isFailure()) { | |
1652 infoln(fileTestName); | |
1653 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, curren
t-NUL) wrong order: %d != %d (%s)", | |
1654 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode
.errorName()); | |
1655 infoln(prevFileLine); | |
1656 infoln(fileLine); | |
1657 infoln(printCollationKey(prevKey)); | |
1658 infoln(printCollationKey(key)); | |
1659 return FALSE; | |
1660 } | |
1661 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -
1, errorCode); | |
1662 if(order != -expectedUTF8Order || errorCode.isFailure()) { | |
1663 infoln(fileTestName); | |
1664 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previou
s-NUL) wrong order: %d != %d (%s)", | |
1665 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCod
e.errorName()); | |
1666 infoln(prevFileLine); | |
1667 infoln(fileLine); | |
1668 infoln(printCollationKey(prevKey)); | |
1669 infoln(printCollationKey(key)); | |
1670 return FALSE; | |
1671 } | |
1672 } | |
1673 #endif | |
1674 | |
1675 UCharIterator leftIter; | |
1676 UCharIterator rightIter; | |
1677 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length()); | |
1678 uiter_setString(&rightIter, s.getBuffer(), s.length()); | |
1679 order = coll->compare(leftIter, rightIter, errorCode); | |
1680 if(order != expectedOrder || errorCode.isFailure()) { | |
1681 infoln(fileTestName); | |
1682 errln("line %d Collator(%s).compare(UCharIterator: previous, current) " | |
1683 "wrong order: %d != %d (%s)", | |
1684 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa
me()); | |
1685 infoln(prevFileLine); | |
1686 infoln(fileLine); | |
1687 infoln(printCollationKey(prevKey)); | |
1688 infoln(printCollationKey(key)); | |
1689 return FALSE; | |
1690 } | |
1691 | |
1692 order = prevKey.compareTo(key, errorCode); | |
1693 if(order != expectedOrder || errorCode.isFailure()) { | |
1694 infoln(fileTestName); | |
1695 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo
() wrong order: %d != %d (%s)", | |
1696 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorNa
me()); | |
1697 infoln(prevFileLine); | |
1698 infoln(fileLine); | |
1699 infoln(printCollationKey(prevKey)); | |
1700 infoln(printCollationKey(key)); | |
1701 return FALSE; | |
1702 } | |
1703 UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == U
COL_ON; | |
1704 int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); | |
1705 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { | |
1706 if(level != expectedLevel) { | |
1707 infoln(fileTestName); | |
1708 errln("line %d Collator(%s).getCollationKey(previous, current).compa
reTo()=%d wrong level: %d != %d", | |
1709 (int)fileLineNumber, norm, order, level, expectedLevel); | |
1710 infoln(prevFileLine); | |
1711 infoln(fileLine); | |
1712 infoln(printCollationKey(prevKey)); | |
1713 infoln(printCollationKey(key)); | |
1714 return FALSE; | |
1715 } | |
1716 } | |
1717 | |
1718 // If either string contains U+FFFE, then their sort keys must compare the s
ame as | |
1719 // the merged sort keys of each string's between-FFFE segments. | |
1720 // | |
1721 // It is not required that | |
1722 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey
(str2)) | |
1723 // only that those two methods yield the same order. | |
1724 // | |
1725 // Use bit-wise OR so that getMergedCollationKey() is always called for both
strings. | |
1726 if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevK
ey, errorCode) | | |
1727 getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)
) || | |
1728 errorCode.isFailure()) { | |
1729 order = prevKey.compareTo(key, errorCode); | |
1730 if(order != expectedOrder || errorCode.isFailure()) { | |
1731 infoln(fileTestName); | |
1732 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey" | |
1733 "(previous, current segments between U+FFFE)).compareTo() wrong
order: %d != %d (%s)", | |
1734 (int)fileLineNumber, norm, order, expectedOrder, errorCode.error
Name()); | |
1735 infoln(prevFileLine); | |
1736 infoln(fileLine); | |
1737 infoln(printCollationKey(prevKey)); | |
1738 infoln(printCollationKey(key)); | |
1739 return FALSE; | |
1740 } | |
1741 int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCas
eLevel); | |
1742 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { | |
1743 if(mergedLevel != level) { | |
1744 infoln(fileTestName); | |
1745 errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey" | |
1746 "(previous, current segments between U+FFFE)).compareTo()=%d
wrong level: %d != %d", | |
1747 (int)fileLineNumber, norm, order, mergedLevel, level); | |
1748 infoln(prevFileLine); | |
1749 infoln(fileLine); | |
1750 infoln(printCollationKey(prevKey)); | |
1751 infoln(printCollationKey(key)); | |
1752 return FALSE; | |
1753 } | |
1754 } | |
1755 } | |
1756 return TRUE; | |
1757 } | |
1758 | |
1759 void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode
) { | |
1760 if(errorCode.isFailure()) { return; } | |
1761 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6); | |
1762 UnicodeString prevString, s; | |
1763 prevString.getTerminatedBuffer(); // Ensure NUL-termination. | |
1764 while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) { | |
1765 // Parse the line even if it will be ignored (when we do not have a Coll
ator) | |
1766 // in order to report syntax issues. | |
1767 Collation::Level relation = parseRelationAndString(s, errorCode); | |
1768 if(errorCode.isFailure()) { | |
1769 errorCode.reset(); | |
1770 break; | |
1771 } | |
1772 if(coll == NULL) { | |
1773 // We were unable to create the Collator but continue with tests. | |
1774 // Ignore test data for this Collator. | |
1775 // The next Collator creation might work. | |
1776 continue; | |
1777 } | |
1778 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? U
COL_EQUAL : UCOL_LESS; | |
1779 Collation::Level expectedLevel = relation; | |
1780 s.getTerminatedBuffer(); // Ensure NUL-termination. | |
1781 UBool isOk = TRUE; | |
1782 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s,
errorCode)) { | |
1783 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode); | |
1784 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString,
s, | |
1785 expectedOrder, expectedLevel, errorCode); | |
1786 } | |
1787 if(isOk) { | |
1788 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode); | |
1789 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString
, s, | |
1790 expectedOrder, expectedLevel, errorCode); | |
1791 } | |
1792 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormali
zed(s, errorCode))) { | |
1793 UnicodeString pn = nfd->normalize(prevString, errorCode); | |
1794 UnicodeString n = nfd->normalize(s, errorCode); | |
1795 pn.getTerminatedBuffer(); | |
1796 n.getTerminatedBuffer(); | |
1797 errorCode.assertSuccess(); | |
1798 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, | |
1799 expectedOrder, expectedLevel, errorCode); | |
1800 } | |
1801 if(!isOk) { | |
1802 errorCode.reset(); // already reported | |
1803 } | |
1804 prevFileLine = fileLine; | |
1805 prevString = s; | |
1806 prevString.getTerminatedBuffer(); // Ensure NUL-termination. | |
1807 } | |
1808 } | |
1809 | |
1810 void CollationTest::TestDataDriven() { | |
1811 IcuTestErrorCode errorCode(*this, "TestDataDriven"); | |
1812 | |
1813 fcd = Normalizer2Factory::getFCDInstance(errorCode); | |
1814 nfd = Normalizer2::getNFDInstance(errorCode); | |
1815 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance()
or getNFDInstance()")) { | |
1816 return; | |
1817 } | |
1818 | |
1819 CharString path(getSourceTestData(errorCode), errorCode); | |
1820 path.appendPathPart("collationtest.txt", errorCode); | |
1821 const char *codePage = "UTF-8"; | |
1822 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, error
Code)); | |
1823 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) { | |
1824 return; | |
1825 } | |
1826 // Read a new line if necessary. | |
1827 // Sub-parsers leave the first line set that they do not handle. | |
1828 while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.ge
tAlias(), errorCode))) { | |
1829 if(!isSectionStarter(fileLine[0])) { | |
1830 errln("syntax error on line %d", (int)fileLineNumber); | |
1831 infoln(fileLine); | |
1832 return; | |
1833 } | |
1834 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) { | |
1835 fileTestName = fileLine; | |
1836 logln(fileLine); | |
1837 fileLine.remove(); | |
1838 } else if(fileLine == UNICODE_STRING("@ root", 6)) { | |
1839 setRootCollator(errorCode); | |
1840 fileLine.remove(); | |
1841 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) { | |
1842 setLocaleCollator(errorCode); | |
1843 fileLine.remove(); | |
1844 } else if(fileLine == UNICODE_STRING("@ rules", 7)) { | |
1845 buildTailoring(f.getAlias(), errorCode); | |
1846 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // % | |
1847 parseAndSetAttribute(errorCode); | |
1848 } else if(fileLine == UNICODE_STRING("* compare", 9)) { | |
1849 checkCompareStrings(f.getAlias(), errorCode); | |
1850 } else { | |
1851 errln("syntax error on line %d", (int)fileLineNumber); | |
1852 infoln(fileLine); | |
1853 return; | |
1854 } | |
1855 } | |
1856 } | |
1857 | |
1858 #endif // !UCONFIG_NO_COLLATION | |
OLD | NEW |