OLD | NEW |
| (Empty) |
1 # Copyright (c) 2012-2015 International Business Machines | |
2 # Corporation and others. All Rights Reserved. | |
3 # | |
4 # This file should be in UTF-8 with a signature byte sequence ("BOM"). | |
5 # | |
6 # collationtest.txt: Collation test data. | |
7 # | |
8 # created on: 2012apr13 | |
9 # created by: Markus W. Scherer | |
10 | |
11 # A line with "** test: description" is used for verbose and error output. | |
12 | |
13 # A collator can be set with "@ root" or "@ locale language-tag", | |
14 # for example "@ locale de-u-co-phonebk". | |
15 # An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". | |
16 | |
17 # A collator can be built with "@ rules". | |
18 # An "@ rules" line is followed by one or more lines with the tailoring rules. | |
19 | |
20 # A collator can be modified with "% attribute=value". | |
21 | |
22 # "* compare" tests the order (= or <) of the following strings. | |
23 # The relation can be "=" or "<" (the level of the difference is not specified) | |
24 # or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference). | |
25 | |
26 # Test sections ("* compare") are terminated by | |
27 # definitions of new collators, changing attributes, or new test sections. | |
28 | |
29 ** test: simple CEs & expansions | |
30 # Many types of mappings are tested elsewhere, including via the UCA conformance tests. | |
31 # Here we mostly cover a few unusual mappings. | |
32 @ rules | |
33 &\x01 # most control codes are ignorable | |
34 <<<\u0300 # tertiary CE | |
35 &9<\x00 # NUL not ignorable | |
36 &\uA00A\uA00B=\uA002 # two long-primary CEs | |
37 &\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits | |
38 | |
39 * compare | |
40 = \x01 | |
41 = \x02 | |
42 <3 \u0300 | |
43 <1 9 | |
44 <1 \x00 | |
45 = \x01\x00\x02 | |
46 <1 a | |
47 <3 a\u0300 | |
48 <2 a\u0308 | |
49 = ä | |
50 <1 b | |
51 <1 か # Hiragana Ka (U+304B) | |
52 <2 か\u3099 # plus voiced sound mark | |
53 = が # Hiragana Ga (U+304C) | |
54 <1 \uA00A\uA00B | |
55 = \uA002 | |
56 <1 \uA00A\uA00B\u00050004 | |
57 <1 \uA00A\uA00B\u00050005 | |
58 = \uA003 | |
59 <1 \uA00A\uA00B\u00050006 | |
60 | |
61 ** test: contractions | |
62 # Create some interesting mappings, and map some normalization-inert characters | |
63 # (which are not subject to canonical reordering) | |
64 # to some of the same CEs to check the sequence of CEs. | |
65 @ rules | |
66 | |
67 # Contractions starting with 'a' should not continue with any character < U+0300 | |
68 # so that we can test a shortcut for that. | |
69 &a=ⓐ | |
70 &b<bz=ⓑ | |
71 &d<dz\u0301=ⓓ # d+z+acute | |
72 &z | |
73 <a\u0301=Ⓐ # a+acute sorts after z | |
74 <a\u0301\u0301=Ⓑ # a+acute+acute | |
75 <a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right | |
76 <a\u030a=Ⓓ # a+ring | |
77 <a\u0323=Ⓔ # a+dot below | |
78 <a\u0323\u0358=Ⓕ # a+dot below+dot above right | |
79 <a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring | |
80 <a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z | |
81 | |
82 &\U0001D158=⁰ # musical notehead black (has a symbol primary) | |
83 <\U0001D158\U0001D165=¼ # musical quarter note | |
84 | |
85 # deliberately missing prefix contractions: | |
86 # dz | |
87 # a\u0327 | |
88 # a\u0327\u0323 | |
89 # a\u0327\u0323b | |
90 | |
91 &\x01 | |
92 <<<\U0001D165=¹ # musical stem (ccc=216) | |
93 <<<\U0001D16D=² # musical augmentation dot (ccc=226) | |
94 <<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) | |
95 &\u0301=❶ # acute (ccc=230) | |
96 &\u030a=❷ # ring (ccc=230) | |
97 &\u0308=❸ # diaeresis (ccc=230) | |
98 <<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) | |
99 &\u0327=❺ # cedilla (ccc=202) | |
100 &\u0323=❻ # dot below (ccc=220) | |
101 &\u0331=❼ # macron below (ccc=220) | |
102 <<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) | |
103 &\u0334=❾ # tilde overlay (ccc=1) | |
104 &\u0358=❿ # dot above right (ccc=232) | |
105 | |
106 &\u0f71=① # tibetan vowel sign aa | |
107 &\u0f72=② # tibetan vowel sign i | |
108 # \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 | |
109 &\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) | |
110 | |
111 ** test: simple contractions | |
112 | |
113 # Some strings are chosen to cause incremental contiguous contraction matching to | |
114 # go into partial matches for prefixes of contractions | |
115 # (where the prefixes are deliberately not also contractions). | |
116 # When there is no complete match, then the matching code must back out of those | |
117 # so that discontiguous contractions work as specified. | |
118 | |
119 * compare | |
120 # contraction starter with no following text, or mismatch, or blocked | |
121 <1 a | |
122 = ⓐ | |
123 <1 aa | |
124 = ⓐⓐ | |
125 <1 ab | |
126 = ⓐb | |
127 <1 az | |
128 = ⓐz | |
129 | |
130 * compare | |
131 <1 a | |
132 <2 a\u0308\u030a # ring blocked by diaeresis | |
133 = ⓐ❸❷ | |
134 <2 a\u0327 | |
135 = ⓐ❺ | |
136 | |
137 * compare | |
138 <2 \u0308 | |
139 = ❸ | |
140 <2 \u0308\u030a\u0301 # acute blocked by ring | |
141 = ❸❷❶ | |
142 | |
143 * compare | |
144 <1 \U0001D158 | |
145 = ⁰ | |
146 <1 \U0001D158\U0001D165 | |
147 = ¼ | |
148 | |
149 # no discontiguous contraction because of missing prefix contraction d+z, | |
150 # and a starter ('z') after the 'd' | |
151 * compare | |
152 <1 dz\u0323\u0301 | |
153 = dz❻❶ | |
154 | |
155 # contiguous contractions | |
156 * compare | |
157 <1 abz | |
158 = ⓐⓑ | |
159 <1 abzz | |
160 = ⓐⓑz | |
161 | |
162 * compare | |
163 <1 a | |
164 <1 z | |
165 <1 a\u0301 | |
166 = Ⓐ | |
167 <1 a\u0301\u0301 | |
168 = Ⓑ | |
169 <1 a\u0301\u0301\u0358 | |
170 = Ⓒ | |
171 <1 a\u030a | |
172 = Ⓓ | |
173 <1 a\u0323\u0358 | |
174 = Ⓕ | |
175 <1 a\u0327\u0323\u030a # match despite missing prefix | |
176 = Ⓖ | |
177 <1 a\u0327\u0323bz | |
178 = Ⓗ | |
179 | |
180 * compare | |
181 <2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second | |
182 = ❸❹ | |
183 | |
184 * compare | |
185 <1 \U0001D158\U0001D165 | |
186 = ¼ | |
187 | |
188 * compare | |
189 <3 \U0001D165\U0001D16D | |
190 = ³ | |
191 | |
192 ** test: discontiguous contractions | |
193 * compare | |
194 <1 a\u0327\u030a # a+ring skips cedilla | |
195 = Ⓓ❺ | |
196 <2 a\u0327\u0327\u030a # a+ring skips 2 cedillas | |
197 = Ⓓ❺❺ | |
198 <2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas | |
199 = Ⓓ❺❺❺ | |
200 <2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas | |
201 = Ⓓ❾❺❺ | |
202 <1 a\u0327\u0323 # a+dot below skips cedilla | |
203 = Ⓔ❺ | |
204 <1 a\u0323\u0301\u0358 # a+dot below+dot ab.r.: 2-char match, then skips acute | |
205 = Ⓕ❶ | |
206 <2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay | |
207 = Ⓕ❾ | |
208 | |
209 * compare | |
210 <2 \u0331\u0331\u0358 # macron below+dot ab.r. skips the second macron below | |
211 = ❽❼ | |
212 | |
213 * compare | |
214 <1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) | |
215 = Ⓓ❺❼❻ | |
216 <1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla | |
217 = Ⓔ❺²❷ | |
218 <2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas | |
219 = Ⓔ❺❺❷ | |
220 <2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla | |
221 = Ⓔ❺❻❷ | |
222 <2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla | |
223 = Ⓔ❾❺❷ | |
224 | |
225 * compare | |
226 <1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla | |
227 = ¼❺ | |
228 <1 a\U0001D165\u0323 # a+dot below skips stem | |
229 = Ⓔ¹ | |
230 | |
231 # partial contiguous match, backs up, matches discontiguous contraction | |
232 <1 a\u0327\u0323b | |
233 = Ⓔ❺b | |
234 <1 a\u0327\u0323ba | |
235 = Ⓔ❺bⓐ | |
236 | |
237 # a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks | |
238 * compare | |
239 <1 a\u0327\u0301\u0301\u0358 | |
240 = Ⓒ❺ | |
241 | |
242 # FCD but not NFD | |
243 * compare | |
244 <1 a\u0f73\u0301 # a+acute skips tibetan ii | |
245 = Ⓐ③ | |
246 | |
247 # FCD but the 0f71 inside the 0f73 must be skipped | |
248 # to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 | |
249 * compare | |
250 <1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 | |
251 = ③① | |
252 | |
253 ** test: discontiguous contractions with nested contractions | |
254 * compare | |
255 <1 a\u0323\u0308\u0301\u0358 | |
256 = Ⓕ❹ | |
257 <2 a\u0323\u0308\u0301\u0308\u0301\u0358 | |
258 = Ⓕ❹❹ | |
259 | |
260 ** test: discontiguous contractions with interleaved contractions | |
261 * compare | |
262 # a+ring & cedilla & macron below+dot above right | |
263 <1 a\u0327\u0331\u030a\u0358 | |
264 = Ⓓ❺❽ | |
265 | |
266 # a+ring & 1x..3x macron below+dot above right | |
267 <2 a\u0331\u030a\u0358 | |
268 = Ⓓ❽ | |
269 <2 a\u0331\u0331\u030a\u0358\u0358 | |
270 = Ⓓ❽❽ | |
271 # also skips acute | |
272 <2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 | |
273 = Ⓓ❽❽❽❶ | |
274 | |
275 # a+dot below & stem+augmentation dot, followed by contiguous d+z+acute | |
276 <1 a\U0001D165\u0323\U0001D16Ddz\u0301 | |
277 = Ⓔ³ⓓ | |
278 | |
279 ** test: some simple string comparisons | |
280 @ root | |
281 * compare | |
282 # first string compares against "" | |
283 = \u0000 | |
284 < a | |
285 <1 b | |
286 <3 B | |
287 = \u0000B\u0000 | |
288 | |
289 ** test: compare with strength=primary | |
290 % strength=primary | |
291 * compare | |
292 <1 a | |
293 <1 b | |
294 = B | |
295 | |
296 ** test: compare with strength=secondary | |
297 % strength=secondary | |
298 * compare | |
299 <1 a | |
300 <1 b | |
301 = B | |
302 | |
303 ** test: compare with strength=tertiary | |
304 % strength=tertiary | |
305 * compare | |
306 <1 a | |
307 <1 b | |
308 <3 B | |
309 | |
310 ** test: compare with strength=quaternary | |
311 % strength=quaternary | |
312 * compare | |
313 <1 a | |
314 <1 b | |
315 <3 B | |
316 | |
317 ** test: compare with strength=identical | |
318 % strength=identical | |
319 * compare | |
320 <1 a | |
321 <1 b | |
322 <3 B | |
323 | |
324 ** test: côté with forwards secondary | |
325 @ root | |
326 * compare | |
327 <1 cote | |
328 <2 coté | |
329 <2 côte | |
330 <2 côté | |
331 | |
332 ** test: côté with forwards secondary vs. U+FFFE merge separator | |
333 # Merged sort keys: On each level, any difference in the first segment | |
334 # must trump any further difference. | |
335 * compare | |
336 <1 cote\uFFFEcôté | |
337 <2 coté\uFFFEcôte | |
338 <2 côte\uFFFEcoté | |
339 <2 côté\uFFFEcote | |
340 | |
341 ** test: côté with backwards secondary | |
342 % backwards=on | |
343 * compare | |
344 <1 cote | |
345 <2 côte | |
346 <2 coté | |
347 <2 côté | |
348 | |
349 ** test: côté with backwards secondary vs. U+FFFE merge separator | |
350 # Merged sort keys: On each level, any difference in the first segment | |
351 # must trump any further difference. | |
352 * compare | |
353 <1 cote\uFFFEcôté | |
354 <2 côte\uFFFEcoté | |
355 <2 coté\uFFFEcôte | |
356 <2 côté\uFFFEcote | |
357 | |
358 ** test: U+FFFE on identical level | |
359 @ root | |
360 % strength=identical | |
361 * compare | |
362 # All of these control codes are completely-ignorable, so that | |
363 # their low code points are compared with the merge separator. | |
364 # The merge separator must compare less than any other character. | |
365 <1 \uFFFE\u0001\u0002\u0003 | |
366 <i \u0001\uFFFE\u0002\u0003 | |
367 <i \u0001\u0002\uFFFE\u0003 | |
368 <i \u0001\u0002\u0003\uFFFE | |
369 | |
370 * compare | |
371 # The merge separator must even compare less than U+0000. | |
372 <1 \uFFFE\u0000\u0000 | |
373 <i \u0000\uFFFE\u0000 | |
374 <i \u0000\u0000\uFFFE | |
375 | |
376 ** test: Hani < surrogates < U+FFFD | |
377 # Note: compareUTF8() treats unpaired surrogates like U+FFFD, | |
378 # so with that the strings with surrogates will compare equal to each other | |
379 # and equal to the string with U+FFFD. | |
380 @ root | |
381 % strength=identical | |
382 * compare | |
383 <1 abz | |
384 <1 a\u4e00z | |
385 <1 a\U00020000z | |
386 <1 a\ud800z | |
387 <1 a\udbffz | |
388 <1 a\udc00z | |
389 <1 a\udfffz | |
390 <1 a\ufffdz | |
391 | |
392 ** test: script reordering | |
393 @ root | |
394 % reorder Hani Zzzz digit | |
395 * compare | |
396 <1 ? | |
397 <1 + | |
398 <1 丂 | |
399 <1 a | |
400 <1 α | |
401 <1 5 | |
402 | |
403 % reorder default | |
404 * compare | |
405 <1 ? | |
406 <1 + | |
407 <1 5 | |
408 <1 a | |
409 <1 α | |
410 <1 丂 | |
411 | |
412 ** test: empty rules | |
413 @ rules | |
414 * compare | |
415 <1 a | |
416 <2 ä | |
417 <3 Ä | |
418 <1 b | |
419 | |
420 ** test: very simple rules | |
421 @ rules | |
422 &a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z | |
423 % strength=quaternary | |
424 * compare | |
425 <1 a | |
426 = e | |
427 <4 q | |
428 <4 r | |
429 <1 x | |
430 <3 X | |
431 <2 y | |
432 <3 Y | |
433 <2 z | |
434 <3 Z | |
435 | |
436 ** test: tailoring twice before a root position: primary | |
437 @ rules | |
438 &[before 1]b<p | |
439 &[before 1]b<q | |
440 * compare | |
441 <1 a | |
442 <1 p | |
443 <1 q | |
444 <1 b | |
445 | |
446 ** test: tailoring twice before a root position: secondary | |
447 @ rules | |
448 &[before 2]ſ<<p | |
449 &[before 2]ſ<<q | |
450 * compare | |
451 <1 s | |
452 <2 p | |
453 <2 q | |
454 <2 ſ | |
455 | |
456 # secondary-before common weight | |
457 @ rules | |
458 &[before 2]b<<p | |
459 &[before 2]b<<q | |
460 * compare | |
461 <1 a | |
462 <1 p | |
463 <2 q | |
464 <2 b | |
465 | |
466 ** test: tailoring twice before a root position: tertiary | |
467 @ rules | |
468 &[before 3]B<<<p | |
469 &[before 3]B<<<q | |
470 * compare | |
471 <1 b | |
472 <3 p | |
473 <3 q | |
474 <3 B | |
475 | |
476 # tertiary-before common weight | |
477 @ rules | |
478 &[before 3]b<<<p | |
479 &[before 3]b<<<q | |
480 * compare | |
481 <1 a | |
482 <1 p | |
483 <3 q | |
484 <3 b | |
485 | |
486 @ rules | |
487 &[before 2]b<<s | |
488 &[before 3]s<<<p | |
489 &[before 3]s<<<q | |
490 * compare | |
491 <1 a | |
492 <1 p | |
493 <3 q | |
494 <3 s | |
495 <2 b | |
496 | |
497 ** test: tailor after completely ignorable | |
498 @ rules | |
499 &\x00<<<x<<y | |
500 * compare | |
501 = \x00 | |
502 = \x1F | |
503 <3 x | |
504 <2 y | |
505 | |
506 ** test: secondary tailoring gaps, ICU ticket 9362 | |
507 @ rules | |
508 &[before 2]s<<'_' | |
509 &s<<r # secondary between s and ſ (long s) | |
510 &ſ<<*a-q # more than 15 between ſ and secondary CE boundary | |
511 &[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE | |
512 &[last primary ignorable]<<y<<z | |
513 | |
514 * compare | |
515 <2 u | |
516 <2 v | |
517 <2 \u0332 # lowest secondary CE | |
518 <2 \u0308 | |
519 <2 y | |
520 <2 z | |
521 <1 s_ | |
522 <2 ss | |
523 <2 sr | |
524 <2 sſ | |
525 <2 sa | |
526 <2 sb | |
527 <2 sp | |
528 <2 sq | |
529 <2 sus | |
530 <2 svs | |
531 <2 rs | |
532 | |
533 ** test: tertiary tailoring gaps, ICU ticket 9362 | |
534 @ rules | |
535 &[before 3]t<<<'_' | |
536 &t<<<r # tertiary between t and fullwidth t | |
537 &ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary | |
538 &[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE | |
539 &[last secondary ignorable]<<<y<<<z | |
540 | |
541 * compare | |
542 <3 u | |
543 <3 v | |
544 # Note: The root collator currently does not map any characters to tertiary CEs. | |
545 <3 y | |
546 <3 z | |
547 <1 t_ | |
548 <3 tt | |
549 <3 tr | |
550 <3 tｔ | |
551 <3 tᵀ | |
552 <3 ta | |
553 <3 tb | |
554 <3 tp | |
555 <3 tq | |
556 <3 tut | |
557 <3 tvt | |
558 <3 rt | |
559 | |
560 ** test: secondary & tertiary around root character | |
561 @ rules | |
562 &[before 2]m<<r | |
563 &m<<s | |
564 &[before 3]m<<<u | |
565 &m<<<v | |
566 * compare | |
567 <1 l | |
568 <1 r | |
569 <2 u | |
570 <3 m | |
571 <3 v | |
572 <2 s | |
573 <1 n | |
574 | |
575 ** test: secondary & tertiary around tailored item | |
576 @ rules | |
577 &m<x | |
578 &[before 2]x<<r | |
579 &x<<s | |
580 &[before 3]x<<<u | |
581 &x<<<v | |
582 * compare | |
583 <1 m | |
584 <1 r | |
585 <2 u | |
586 <3 x | |
587 <3 v | |
588 <2 s | |
589 <1 n | |
590 | |
591 ** test: more nesting of secondary & tertiary before | |
592 @ rules | |
593 &[before 3]m<<<u | |
594 &[before 2]m<<r | |
595 &[before 3]r<<<q | |
596 &m<<<w | |
597 &m<<t | |
598 &[before 3]w<<<v | |
599 &w<<<x | |
600 &w<<s | |
601 * compare | |
602 <1 l | |
603 <1 q | |
604 <3 r | |
605 <2 u | |
606 <3 m | |
607 <3 v | |
608 <3 w | |
609 <3 x | |
610 <2 s | |
611 <2 t | |
612 <1 n | |
613 | |
614 ** test: case bits | |
615 @ rules | |
616 &w<x # tailored CE getting case bits | |
617 =uv=uV=Uv=UV # 2 chars -> 1 CE | |
618 &ae=ch=cH=Ch=CH # 2 chars -> 2 CEs | |
619 &rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs | |
620 % caseFirst=lower | |
621 * compare | |
622 <1 ae | |
623 = ch | |
624 <3 cH | |
625 <3 Ch | |
626 <3 CH | |
627 <1 rst | |
628 = yz | |
629 <3 yZ | |
630 <3 Yz | |
631 <3 YZ | |
632 <1 w | |
633 <1 x | |
634 = uv | |
635 <3 uV | |
636 = Uv # mixed case on single CE cannot distinguish variations | |
637 <3 UV | |
638 | |
639 ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower | |
640 @ rules | |
641 &\u0001<<<t<<<T # tertiary CEs | |
642 % caseFirst=lower | |
643 * compare | |
644 <1 aa | |
645 <3 aat | |
646 <3 aaT | |
647 <3 aA | |
648 <3 aAt | |
649 <3 ata | |
650 <3 aTa | |
651 | |
652 ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper | |
653 % caseFirst=upper | |
654 * compare | |
655 <1 aA | |
656 <3 aAt | |
657 <3 aa | |
658 <3 aat | |
659 <3 aaT | |
660 <3 ata | |
661 <3 aTa | |
662 | |
663 ** test: reset on expansion, ICU tickets 9415 & 9593 | |
664 @ rules | |
665 &æ<x # tailor the last primary CE so that x sorts between ae and af | |
666 &æb=bæ # copy all reset CEs to make bæ sort the same | |
667 &각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 | |
668 &⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference | |
669 &l·=z # handle the pre-context for · when fetching reset CEs | |
670 <<u # copy/tailor 2 CEs | |
671 | |
672 * compare | |
673 <1 ae | |
674 <2 æ | |
675 <1 x | |
676 <1 af | |
677 | |
678 * compare | |
679 <1 aeb | |
680 <2 æb | |
681 = bæ | |
682 | |
683 * compare | |
684 <1 각 | |
685 <1 h | |
686 <1 갂 | |
687 <1 갃 | |
688 | |
689 * compare | |
690 <1 · # by itself: primary CE | |
691 <1 l | |
692 <2 l· # l+middle dot has only a secondary difference from l | |
693 = z | |
694 <2 u | |
695 | |
696 * compare | |
697 <1 (13) | |
698 <3 ⒀ # DUCET sets special tertiary weights in all CEs | |
699 <2 y | |
700 <1 (13[ | |
701 | |
702 % alternate=shifted | |
703 * compare | |
704 <1 (13) | |
705 = 13 | |
706 <3 ⒀ | |
707 = y # alternate=shifted removes the tailoring difference on the last CE | |
708 <1 14 | |
709 | |
710 ** test: contraction inside extension, ICU ticket 9378 | |
711 @ rules | |
712 &а<<х/й # all letters are Cyrillic | |
713 * compare | |
714 <1 ай | |
715 <2 х | |
716 | |
717 ** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 | |
718 @ rules | |
719 &t<x &ᵀ<y # same primary weights | |
720 &q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent | |
721 * compare | |
722 <1 q | |
723 <1 u | |
724 <1 v | |
725 <1 ꝗ | |
726 <1 t | |
727 <3 ᵀ | |
728 <1 y | |
729 <1 x | |
730 | |
731 # Principle: Each rule builds on the state of preceding rules and ignores following rules. | |
732 | |
733 ** test: later rule does not affect earlier reset position, ICU ticket 10105 | |
734 @ rules | |
735 &a < u < v < w &ov < x &b < v | |
736 * compare | |
737 <1 oa | |
738 <1 ou | |
739 <1 x # CE(o) followed by CE between u and w | |
740 <1 ow | |
741 <1 ob | |
742 <1 ov | |
743 | |
744 ** test: later rule does not affect earlier extension (1), ICU ticket 10105 | |
745 @ rules | |
746 &a=x/b &v=b | |
747 % strength=secondary | |
748 * compare | |
749 <1 B | |
750 <1 c | |
751 <1 v | |
752 = b | |
753 * compare | |
754 <1 AB | |
755 = x | |
756 <1 ac | |
757 <1 av | |
758 = ab | |
759 | |
760 ** test: later rule does not affect earlier extension (2), ICU ticket 10105 | |
761 @ rules | |
762 &a <<< c / e &g <<< e / l | |
763 % strength=secondary | |
764 * compare | |
765 <1 AE | |
766 = c | |
767 <2 æ | |
768 <1 agl | |
769 = ae | |
770 | |
771 ** test: later rule does not affect earlier extension (3), ICU ticket 10105 | |
772 @ rules | |
773 &a = b / c &d = c / e | |
774 % strength=secondary | |
775 * compare | |
776 <1 AC # C is still only tertiary different from the original c | |
777 = b | |
778 <1 ade | |
779 = ac | |
780 | |
781 ** test: extension contains tailored character, ICU ticket 10105 | |
782 @ rules | |
783 &a=e &b=u/e | |
784 * compare | |
785 <1 a | |
786 = e | |
787 <1 ba | |
788 = be | |
789 = u | |
790 | |
791 ** test: add simple mappings for characters with root context | |
792 @ rules | |
793 &z=· # middle dot has a prefix mapping in the CLDR root | |
794 &n=и # и (U+0438) has contractions in the root | |
795 * compare | |
796 <1 l | |
797 <2 l· # root mapping for l|· still works | |
798 <1 z | |
799 = · | |
800 * compare | |
801 <1 n | |
802 = и | |
803 <1 И | |
804 <1 и\u0306 # root mapping for й=и\u0306 still works | |
805 = й | |
806 <3 Й | |
807 | |
808 ** test: add context mappings around characters with root context | |
809 @ rules | |
810 &z=·h # middle dot has a prefix mapping in the CLDR root | |
811 &n=ә|и # и (U+0438) has contractions in the root | |
812 * compare | |
813 <1 l | |
814 <2 l· # root mapping for l|· still works | |
815 <1 z | |
816 = ·h | |
817 * compare | |
818 <1 и | |
819 <3 И | |
820 <1 и\u0306 # root mapping for й=и\u0306 still works | |
821 = й | |
822 * compare | |
823 <1 әn | |
824 = әи | |
825 <1 әo | |
826 | |
827 ** test: many secondary CEs at the top of their range | |
828 @ rules | |
829 &[last primary ignorable]<<*\u2801-\u28ff | |
830 * compare | |
831 <2 \u0308 | |
832 <2 \u2801 | |
833 <2 \u2802 | |
834 <2 \u2803 | |
835 <2 \u2804 | |
836 <2 \u28fd | |
837 <2 \u28fe | |
838 <2 \u28ff | |
839 <1 \x20 | |
840 | |
841 ** test: many tertiary CEs at the top of their range | |
842 @ rules | |
843 &[last secondary ignorable]<<<*a-z | |
844 * compare | |
845 <3 a | |
846 <3 b | |
847 <3 c | |
848 <3 d | |
849 # e..w | |
850 <3 x | |
851 <3 y | |
852 <3 z | |
853 <2 \u0308 | |
854 | |
855 ** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 | |
856 @ rules | |
857 &a=p|x &b=px &c=op | |
858 * compare | |
859 <1 b | |
860 = px | |
861 <3 B | |
862 <1 c | |
863 = op | |
864 <3 C | |
865 * compare | |
866 <1 ca | |
867 = opx # first contraction op, then prefix p|x | |
868 <3 cA | |
869 <3 Ca | |
870 | |
871 ** test: reset position with prefix (pre-context), ICU ticket 10102 | |
872 @ rules | |
873 &a=p|x &px=y | |
874 * compare | |
875 <1 pa | |
876 = px | |
877 = y | |
878 <3 pA | |
879 <1 q | |
880 <1 x | |
881 | |
882 ** test: prefix+contraction together (1), ICU ticket 10071 | |
883 @ rules | |
884 &x=a|bc | |
885 * compare | |
886 <1 ab | |
887 <1 Abc | |
888 <1 abd | |
889 <1 ac | |
890 <1 aw | |
891 <1 ax | |
892 = abc | |
893 <3 aX | |
894 <3 Ax | |
895 <1 b | |
896 <1 bb | |
897 <1 bc | |
898 <3 bC | |
899 <3 Bc | |
900 <1 bd | |
901 | |
902 ** test: prefix+contraction together (2), ICU ticket 10071 | |
903 @ rules | |
904 &w=bc &x=a|b | |
905 * compare | |
906 <1 w | |
907 = bc | |
908 <3 W | |
909 * compare | |
910 <1 aw | |
911 <1 ax | |
912 = ab | |
913 <3 aX | |
914 <1 axb | |
915 <1 axc | |
916 = abc # prefix match a|b takes precedence over contraction match bc | |
917 <3 abC | |
918 <1 abd | |
919 <1 ay | |
920 | |
921 ** test: prefix+contraction together (3), ICU ticket 10071 | |
922 @ rules | |
923 &x=a|b &w=bc # reverse order of rules as previous test, order should not matter here | |
924 * compare # same "compare" sequences as previous test | |
925 <1 w | |
926 = bc | |
927 <3 W | |
928 * compare | |
929 <1 aw | |
930 <1 ax | |
931 = ab | |
932 <3 aX | |
933 <1 axb | |
934 <1 axc | |
935 = abc # prefix match a|b takes precedence over contraction match bc | |
936 <3 abC | |
937 <1 abd | |
938 <1 ay | |
939 | |
940 ** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 | |
941 @ rules | |
942 &d=ch &v=p|ci | |
943 * compare | |
944 <1 pc | |
945 <3 pC | |
946 <1 pcH | |
947 <1 pcI | |
948 <1 pd | |
949 = pch # no-prefix contraction ch matches | |
950 <3 pD | |
951 <1 pv | |
952 = pci # prefix+contraction p|ci matches | |
953 <3 pV | |
954 | |
955 ** test: tailor in & around compact ranges of root primaries | |
956 # The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs | |
957 # which should be reliably encoded as one range in the root elements data. | |
958 @ rules | |
959 &[before 1]ᚁ<a | |
960 &ᚁ<b | |
961 &[before 1]ᚂ<c | |
962 &ᚂ<d | |
963 &[before 1]ᚚ<y | |
964 &ᚚ<z | |
965 &[before 2]ᚁ<<r | |
966 &ᚁ<<s | |
967 &[before 3]ᚚ<<<t | |
968 &ᚚ<<<u | |
969 * compare | |
970 <1 ᣵ # U+18F5 last Canadian Aboriginal | |
971 <1 a | |
972 <1 r | |
973 <2 ᚁ | |
974 <2 s | |
975 <1 b | |
976 <1 c | |
977 <1 ᚂ | |
978 <1 d | |
979 <1 ᚃ | |
980 <1 ᚙ | |
981 <1 y | |
982 <1 t | |
983 <3 ᚚ | |
984 <3 u | |
985 <1 z | |
986 <1 ᚠ # U+16A0 first Runic | |
987 | |
988 ** test: suppressContractions | |
989 @ rules | |
990 &z<ch<әж [suppressContractions [·cә]] | |
991 * compare | |
992 <1 ch | |
993 <3 cH # ch was suppressed | |
994 <1 l | |
995 <1 l· # primary difference, not secondary, because l|· was suppressed | |
996 <1 ә | |
997 <2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed | |
998 <1 әж | |
999 <3 әЖ | |
1000 | |
1001 ** test: Hangul & Jamo | |
1002 @ rules | |
1003 &L=\u1100 # first Jamo L | |
1004 &V=\u1161 # first Jamo V | |
1005 &T=\u11A8 # first Jamo T | |
1006 &\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs | |
1007 * compare | |
1008 <1 Lv | |
1009 <3 LV | |
1010 = \u1100\u1161 | |
1011 = \uAC00 | |
1012 <1 LVt | |
1013 <3 LVT | |
1014 = \u1100\u1161\u11A8 | |
1015 = \uAC00\u11A8 | |
1016 = \uAC01 | |
1017 <2 LVT\u0308 | |
1018 <2 \u4E00 | |
1019 <2 \u4E01 | |
1020 <2 \u4E80 | |
1021 <2 \u4EFF | |
1022 <2 LV\u0308T | |
1023 <1 \uAC02 | |
1024 | |
1025 ** test: adjust special reset positions according to previous rules, CLDR ticket 6070 | |
1026 @ rules | |
1027 &[last variable]<x | |
1028 [maxVariable space] # has effect only after building, no effect on following ru
les | |
1029 &[last variable]<y | |
1030 &[before 1][first regular]<z | |
1031 * compare | |
1032 <1 ? # some punctuation | |
1033 <1 x | |
1034 <1 y | |
1035 <1 z | |
1036 <1 $ # some symbol | |
1037 | |
1038 @ rules | |
1039 &[last primary ignorable]<<x<<<y | |
1040 &[last primary ignorable]<<z | |
1041 * compare | |
1042 <2 \u0358 | |
1043 <2 x | |
1044 <3 y | |
1045 <2 z | |
1046 <1 \x20 | |
1047 | |
1048 @ rules | |
1049 &[last secondary ignorable]<<<x | |
1050 &[last secondary ignorable]<<<y | |
1051 * compare | |
1052 <3 x | |
1053 <3 y | |
1054 <2 \u0358 | |
1055 | |
1056 @ rules | |
1057 &[before 2][first variable]<<z | |
1058 &[before 2][first variable]<<y | |
1059 &[before 3][first variable]<<<x | |
1060 &[before 3][first variable]<<<w | |
1061 &[before 1][first variable]<v | |
1062 &[before 2][first variable]<<u | |
1063 &[before 3][first variable]<<<t | |
1064 &[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary | |
1065 * compare | |
1066 <2 \u0358 | |
1067 <1 s | |
1068 <2 \uFDD1\xA0 | |
1069 <1 t | |
1070 <3 u | |
1071 <2 v | |
1072 <1 w | |
1073 <3 x | |
1074 <3 y | |
1075 <2 z | |
1076 <2 \t | |
1077 | |
1078 @ rules | |
1079 &[before 2][first regular]<<z | |
1080 &[before 3][first regular]<<<y | |
1081 &[before 1][first regular]<x | |
1082 &[before 3][first regular]<<<w | |
1083 &[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary | |
1084 &[before 3][first regular]<<<u | |
1085 &[before 1][first regular]<p # primary before the boundary: becomes variable | |
1086 &[before 3][first regular]<<<t # not affected by p | |
1087 &[last variable]<q # after p! | |
1088 * compare | |
1089 <1 ? | |
1090 <1 p | |
1091 <1 q | |
1092 <1 t | |
1093 <3 u | |
1094 <3 v | |
1095 <1 w | |
1096 <3 x | |
1097 <1 y | |
1098 <3 z | |
1099 <1 $ | |
1100 | |
1101 # check that p & q are indeed variable | |
1102 % alternate=shifted | |
1103 * compare | |
1104 = ? | |
1105 = p | |
1106 = q | |
1107 <1 t | |
1108 <3 u | |
1109 <3 v | |
1110 <1 w | |
1111 <3 x | |
1112 <1 y | |
1113 <3 z | |
1114 <1 $ | |
1115 | |
1116 @ rules | |
1117 &[before 2][first trailing]<<z | |
1118 &[before 1][first trailing]<y | |
1119 &[before 3][first trailing]<<<x | |
1120 * compare | |
1121 <1 \u4E00 # first Han, first implicit | |
1122 <1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary | |
1123 # Note: The root collator currently does not map any characters to the trailing first boundary primary. | |
1124 <1 x | |
1125 <3 y | |
1126 <1 z | |
1127 <2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. | |
1128 | |
1129 @ rules | |
1130 &[before 2][first primary ignorable]<<z | |
1131 &[before 2][first primary ignorable]<<y | |
1132 &[before 3][first primary ignorable]<<<x | |
1133 &[before 3][first primary ignorable]<<<w | |
1134 * compare | |
1135 = \x01 | |
1136 <2 w | |
1137 <3 x | |
1138 <3 y | |
1139 <2 z | |
1140 <2 \u0301 | |
1141 | |
1142 @ rules | |
1143 &[before 3][first secondary ignorable]<<<y | |
1144 &[before 3][first secondary ignorable]<<<x | |
1145 * compare | |
1146 = \x01 | |
1147 <3 x | |
1148 <3 y | |
1149 <2 \u0301 | |
1150 | |
1151 ** test: canonical closure | |
1152 @ rules | |
1153 &X=A &U=Â | |
1154 * compare | |
1155 <1 U | |
1156 = Â | |
1157 = A\u0302 | |
1158 <2 Ú # U with acute | |
1159 = U\u0301 | |
1160 = Ấ # A with circumflex & acute | |
1161 = Â\u0301 | |
1162 = A\u0302\u0301 | |
1163 <1 X | |
1164 = A | |
1165 <2 X\u030A # with ring above | |
1166 = Å | |
1167 = A\u030A | |
1168 = \u212B # Angstrom sign | |
1169 | |
1170 @ rules | |
1171 &x=\u5140\u55C0 | |
1172 * compare | |
1173 <1 x | |
1174 = \u5140\u55C0 | |
1175 = \u5140\uFA0D | |
1176 = \uFA0C\u55C0 | |
1177 = \uFA0C\uFA0D # CJK compatibility characters | |
1178 <3 X | |
1179 | |
1180 # canonical closure on prefix rules, ICU ticket 9444 | |
1181 @ rules | |
1182 &x=ä|ŝ | |
1183 * compare | |
1184 <1 äs # not tailored | |
1185 <1 äx | |
1186 = äŝ | |
1187 = a\u0308s\u0302 | |
1188 = a\u0308ŝ | |
1189 = äs\u0302 | |
1190 <3 äX | |
1191 | |
1192 ** test: conjoining Jamo map to expansions | |
1193 @ rules | |
1194 &gg=\u1101 # Jamo Lead consonant GG | |
1195 &nj=\u11AC # Jamo Trail consonant NJ | |
1196 * compare | |
1197 <1 gg\u1161nj | |
1198 = \u1101\u1161\u11AC | |
1199 = \uAE4C\u11AC | |
1200 = \uAE51 | |
1201 <3 gg\u1161nJ | |
1202 <1 \u1100\u1100 | |
1203 | |
1204 ** test: canonical tail closure, ICU ticket 5913 | |
1205 @ rules | |
1206 &a<â | |
1207 * compare | |
1208 <1 a | |
1209 <1 â # tailored | |
1210 = a\u0302 | |
1211 <2 a\u0323\u0302 # discontiguous contraction | |
1212 = ạ\u0302 # equivalent | |
1213 = ậ # equivalent | |
1214 <1 b | |
1215 | |
1216 @ rules | |
1217 &a<ạ | |
1218 * compare | |
1219 <1 a | |
1220 <1 ạ # tailored | |
1221 = a\u0323 | |
1222 <2 a\u0323\u0302 # contiguous contraction plus extra diacritic | |
1223 = ạ\u0302 # equivalent | |
1224 = ậ # equivalent | |
1225 <1 b | |
1226 | |
1227 # Tail closure should work even if there is a prefix and/or contraction. | |
1228 @ rules | |
1229 &a<\u5140|câ | |
1230 # In order to find discontiguous contractions for \u5140|câ | |
1231 # there must exist a mapping for \u5140|ca, regardless of what it maps to. | |
1232 # (This follows from the UCA spec.) | |
1233 &x=\u5140|ca | |
1234 * compare | |
1235 <1 \u5140a | |
1236 = \uFA0Ca | |
1237 <1 \u5140câ # tailored | |
1238 = \uFA0Ccâ | |
1239 = \u5140ca\u0302 | |
1240 = \uFA0Cca\u0302 | |
1241 <2 \u5140ca\u0323\u0302 # discontiguous contraction | |
1242 = \uFA0Cca\u0323\u0302 | |
1243 = \u5140cạ\u0302 | |
1244 = \uFA0Ccạ\u0302 | |
1245 = \u5140cậ | |
1246 = \uFA0Ccậ | |
1247 <1 \u5140b | |
1248 = \uFA0Cb | |
1249 <1 \u5140x | |
1250 = \u5140ca | |
1251 | |
1252 # Double-check that without the extra mapping there will be no discontiguous match. | |
1253 @ rules | |
1254 &a<\u5140|câ | |
1255 * compare | |
1256 <1 \u5140a | |
1257 = \uFA0Ca | |
1258 <1 \u5140câ # tailored | |
1259 = \uFA0Ccâ | |
1260 = \u5140ca\u0302 | |
1261 = \uFA0Cca\u0302 | |
1262 <1 \u5140b | |
1263 = \uFA0Cb | |
1264 <1 \u5140ca\u0323\u0302 # no discontiguous contraction | |
1265 = \uFA0Cca\u0323\u0302 | |
1266 = \u5140cạ\u0302 | |
1267 = \uFA0Ccạ\u0302 | |
1268 = \u5140cậ | |
1269 = \uFA0Ccậ | |
1270 | |
1271 @ rules | |
1272 &a<cạ | |
1273 * compare | |
1274 <1 a | |
1275 <1 cạ # tailored | |
1276 = ca\u0323 | |
1277 <2 ca\u0323\u0302 # contiguous contraction plus extra diacritic | |
1278 = cạ\u0302 # equivalent | |
1279 = cậ # equivalent | |
1280 <1 b | |
1281 | |
1282 # ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI | |
1283 # = 03C9 0313 0300 0345 | |
1284 # ccc = 0, 230, 230, 240 | |
1285 @ rules | |
1286 &δ=αῳ | |
1287 # In order to find discontiguous contractions for αῳ | |
1288 # there must exist a mapping for αω, regardless of what it maps to. | |
1289 # (This follows from the UCA spec.) | |
1290 &ε=αω | |
1291 * compare | |
1292 <1 δ | |
1293 = αῳ | |
1294 = αω\u0345 | |
1295 <2 αω\u0313\u0300\u0345 # discontiguous contraction | |
1296 = αὠ\u0300\u0345 | |
1297 = αὢ\u0345 | |
1298 = αᾢ | |
1299 <2 αω\u0300\u0313\u0345 | |
1300 = αὼ\u0313\u0345 | |
1301 = αῲ\u0313 # not FCD | |
1302 <1 ε | |
1303 = αω | |
1304 | |
1305 # Double-check that without the extra mapping there will be no discontiguous match. | |
1306 @ rules | |
1307 &δ=αῳ | |
1308 * compare | |
1309 <1 αω\u0313\u0300\u0345 # no discontiguous contraction | |
1310 = αὠ\u0300\u0345 | |
1311 = αὢ\u0345 | |
1312 = αᾢ | |
1313 <2 αω\u0300\u0313\u0345 | |
1314 = αὼ\u0313\u0345 | |
1315 = αῲ\u0313 # not FCD | |
1316 <1 δ | |
1317 = αῳ | |
1318 = αω\u0345 | |
1319 | |
1320 # Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. | |
1321 # Tests code paths where the tailored string has a combining mark | |
1322 # that does not occur in any composite's decomposition. | |
1323 @ rules | |
1324 &δ=αὼ\u0315 | |
1325 * compare | |
1326 <1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. | |
1327 = αὠ\u0300\u0315 | |
1328 = αὢ\u0315 | |
1329 <1 δ | |
1330 = αὼ\u0315 | |
1331 = αω\u0300\u0315 | |
1332 <2 αω\u0300\u0315\u0345 | |
1333 = αὼ\u0315\u0345 | |
1334 = αῲ\u0315 # not FCD | |
1335 | |
1336 ** test: danish a+a vs. a-umlaut, ICU ticket 9319 | |
1337 @ rules | |
1338 &z<aa | |
1339 * compare | |
1340 <1 z | |
1341 <1 aa | |
1342 <2 aa\u0308 | |
1343 = aä | |
1344 | |
1345 ** test: Jamo L with and in prefix | |
1346 # Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). | |
1347 @ rules | |
1348 # Jamo Lead consonant G after G or GG | |
1349 &[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 | |
1350 # Jamo Lead consonant GG sorts like G+G | |
1351 &\u1100\u1100=\u1101 | |
1352 # Note: Making G|GG and GG|GG sort the same as G|G+G | |
1353 # would require the ability to reset on G|G+G, | |
1354 # or we could make G-after-G equal to some secondary-CE character, | |
1355 # and reset on a pair of those. | |
1356 # (It does not matter much if there are at most two G in a row in real text.) | |
1357 * compare | |
1358 <1 \u1100 | |
1359 <2 \u1100\u1100 # only one primary from a sequence of G lead consonants | |
1360 = \u1101 | |
1361 <2 \u1100\u1100\u1100 | |
1362 = \u1101\u1100 | |
1363 # but not = \u1100\u1101, see above | |
1364 <1 \u1100\u1161 | |
1365 = \uAC00 | |
1366 <2 \u1100\u1100\u1161 | |
1367 = \u1100\uAC00 # prefix match from the L of the LV syllable | |
1368 = \u1101\u1161 | |
1369 = \uAE4C | |
1370 | |
1371 ** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 | |
1372 @ rules | |
1373 # Low secondary CEs for Jamo V & T. | |
1374 # Note: T should sort before V for proper syllable order. | |
1375 &\u0332 # COMBINING LOW LINE (first primary ignorable) | |
1376 <<\u1161<<\u1162 | |
1377 | |
1378 # Korean Jamo lead consonant search rules, part 2: | |
1379 # Make modern compound L jamo primary equivalent to non-compound forms. | |
1380 | |
1381 # Secondary CEs for Jamo L-after-L, greater than Jamo V & T. | |
1382 &\u0313 # COMBINING COMMA ABOVE (second primary ignorable) | |
1383 =\u1100|\u1100 | |
1384 =\u1103|\u1103 | |
1385 =\u1107|\u1107 | |
1386 =\u1109|\u1109 | |
1387 =\u110C|\u110C | |
1388 | |
1389 # Compound L Jamo map to equivalent expansions of primary+secondary CE. | |
1390 &\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK | |
1391 &\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT | |
1392 &\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP | |
1393 &\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS | |
1394 &\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC | |
1395 | |
1396 * compare | |
1397 <1 \u1100\u1161 | |
1398 = \uAC00 | |
1399 <2 \u1100\u1162 | |
1400 = \uAC1C | |
1401 <2 \u1100\u1100\u1161 | |
1402 = \u1100\uAC00 | |
1403 = \u1101\u1161 | |
1404 = \uAE4C | |
1405 <3 \u3132\u1161 | |
1406 | |
1407 ** test: Hangul syllables in prefix & in the interior of a contraction | |
1408 @ rules | |
1409 &x=\u1100\u1161|a\u1102\u1162z | |
1410 * compare | |
1411 <1 \u1100\u1161x | |
1412 = \u1100\u1161a\u1102\u1162z | |
1413 = \u1100\u1161a\uB0B4z | |
1414 = \uAC00a\u1102\u1162z | |
1415 = \uAC00a\uB0B4z | |
1416 | |
1417 ** test: digits are unsafe-backwards when numeric=on | |
1418 @ root | |
1419 % numeric=on | |
1420 * compare | |
1421 # If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". | |
1422 # We need to back up before the identical prefix "1" and compare the full numbers. | |
1423 <1 11b | |
1424 <1 101a | |
1425 | |
1426 ** test: simple locale data test | |
1427 @ locale de | |
1428 * compare | |
1429 <1 a | |
1430 <2 ä | |
1431 <1 ae | |
1432 <2 æ | |
1433 | |
1434 @ locale de-u-co-phonebk | |
1435 * compare | |
1436 <1 a | |
1437 <1 ae | |
1438 <2 ä | |
1439 <2 æ | |
1440 | |
1441 # The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. | |
1442 | |
1443 ** test: DataDrivenCollationTest/TestMorePinyin | |
1444 # Testing the primary strength. | |
1445 @ locale zh | |
1446 % strength=primary | |
1447 * compare | |
1448 < lā | |
1449 = lĀ | |
1450 = Lā | |
1451 = LĀ | |
1452 < lān | |
1453 = lĀn | |
1454 < lē | |
1455 = lĒ | |
1456 = Lē | |
1457 = LĒ | |
1458 < lēn | |
1459 = lĒn | |
1460 | |
1461 ** test: DataDrivenCollationTest/TestLithuanian | |
1462 # Lithuanian sort order. | |
1463 @ locale lt | |
1464 * compare | |
1465 < cz | |
1466 < č | |
1467 < d | |
1468 < iz | |
1469 < j | |
1470 < sz | |
1471 < š | |
1472 < t | |
1473 < zz | |
1474 < ž | |
1475 | |
1476 ** test: DataDrivenCollationTest/TestLatvian | |
1477 # Latvian sort order. | |
1478 @ locale lv | |
1479 * compare | |
1480 < cz | |
1481 < č | |
1482 < d | |
1483 < gz | |
1484 < ģ | |
1485 < h | |
1486 < iz | |
1487 < j | |
1488 < kz | |
1489 < ķ | |
1490 < l | |
1491 < lz | |
1492 < ļ | |
1493 < m | |
1494 < nz | |
1495 < ņ | |
1496 < o | |
1497 < rz | |
1498 < ŗ | |
1499 < s | |
1500 < sz | |
1501 < š | |
1502 < t | |
1503 < zz | |
1504 < ž | |
1505 | |
1506 ** test: DataDrivenCollationTest/TestEstonian | |
1507 # Estonian sort order. | |
1508 @ locale et | |
1509 * compare | |
1510 < sy | |
1511 < š | |
1512 < šy | |
1513 < z | |
1514 < zy | |
1515 < ž | |
1516 < v | |
1517 < va | |
1518 < w | |
1519 < õ | |
1520 < õy | |
1521 < ä | |
1522 < äy | |
1523 < ö | |
1524 < öy | |
1525 < ü | |
1526 < üy | |
1527 < x | |
1528 | |
1529 ** test: DataDrivenCollationTest/TestAlbanian | |
1530 # Albanian sort order. | |
1531 @ locale sq | |
1532 * compare | |
1533 < cz | |
1534 < ç | |
1535 < d | |
1536 < dz | |
1537 < dh | |
1538 < e | |
1539 < ez | |
1540 < ë | |
1541 < f | |
1542 < gz | |
1543 < gj | |
1544 < h | |
1545 < lz | |
1546 < ll | |
1547 < m | |
1548 < nz | |
1549 < nj | |
1550 < o | |
1551 < rz | |
1552 < rr | |
1553 < s | |
1554 < sz | |
1555 < sh | |
1556 < t | |
1557 < tz | |
1558 < th | |
1559 < u | |
1560 < xz | |
1561 < xh | |
1562 < y | |
1563 < zz | |
1564 < zh | |
1565 | |
1566 ** test: DataDrivenCollationTest/TestSimplifiedChineseOrder | |
1567 # Sorted file has different order. | |
1568 @ root | |
1569 # normalization=on turned on & off automatically. | |
1570 * compare | |
1571 < \u5F20 | |
1572 < \u5F20\u4E00\u8E3F | |
1573 | |
1574 ** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash | |
1575 # This pretty much crashes. | |
1576 @ root | |
1577 * compare | |
1578 < \u0f71\u0f72\u0f80\u0f71\u0f72 | |
1579 < \u0f80 | |
1580 | |
1581 ** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems | |
1582 # These are examples of strings that caused trouble in partial sort key testing. | |
1583 @ locale th-TH | |
1584 * compare | |
1585 < \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C | |
1586 < \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 | |
1587 * compare | |
1588 < \u0E01\u0E07\u0E01\u0E32\u0E23 | |
1589 < \u0E01\u0E07\u0E42\u0E01\u0E49 | |
1590 * compare | |
1591 < \u0E01\u0E23\u0E19\u0E17\u0E32 | |
1592 < \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 | |
1593 * compare | |
1594 < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 | |
1595 < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 | |
1596 * compare | |
1597 < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D | |
1598 < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 | |
1599 | |
1600 ** test: DataDrivenCollationTest/TestJavaStyleRule | |
1601 # java.text allows rules to start as '<<<x<<<y...' | |
1602 # we emulate this by assuming a &[first tertiary ignorable] in this case. | |
1603 @ rules | |
1604 &\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b | |
1605 * compare | |
1606 = a | |
1607 = equal | |
1608 < z | |
1609 < x | |
1610 = b # x had become the new first primary ignorable | |
1611 < w | |
1612 | |
1613 ** test: DataDrivenCollationTest/TestShiftedIgnorable | |
1614 # The UCA states that primary ignorables should be completely | |
1615 # ignorable when following a shifted code point. | |
1616 @ root | |
1617 % alternate=shifted | |
1618 % strength=quaternary | |
1619 * compare | |
1620 < a\u0020b | |
1621 = a\u0020\u0300b | |
1622 = a\u0020\u0301b | |
1623 < a_b | |
1624 = a_\u0300b | |
1625 = a_\u0301b | |
1626 < A\u0020b | |
1627 = A\u0020\u0300b | |
1628 = A\u0020\u0301b | |
1629 < A_b | |
1630 = A_\u0300b | |
1631 = A_\u0301b | |
1632 < a\u0301b | |
1633 < A\u0301b | |
1634 < a\u0300b | |
1635 < A\u0300b | |
1636 | |
1637 ** test: DataDrivenCollationTest/TestNShiftedIgnorable | |
1638 # The UCA states that primary ignorables should be completely ignorable | |
1639 # when following a shifted code point. With alternate=non-ignorable nothing is shifted, so they are not ignored here. | |
1640 @ root | |
1641 % alternate=non-ignorable | |
1642 % strength=tertiary | |
1643 * compare | |
1644 < a\u0020b | |
1645 < A\u0020b | |
1646 < a\u0020\u0301b | |
1647 < A\u0020\u0301b | |
1648 < a\u0020\u0300b | |
1649 < A\u0020\u0300b | |
1650 < a_b | |
1651 < A_b | |
1652 < a_\u0301b | |
1653 < A_\u0301b | |
1654 < a_\u0300b | |
1655 < A_\u0300b | |
1656 < a\u0301b | |
1657 < A\u0301b | |
1658 < a\u0300b | |
1659 < A\u0300b | |
1660 | |
1661 ** test: DataDrivenCollationTest/TestSafeSurrogates | |
1662 # It turned out that surrogates were not skipped properly | |
1663 # when iterating backwards if they were in the middle of a | |
1664 # contraction. This test assures that this is fixed. | |
1665 @ rules | |
1666 &a < x\ud800\udc00b | |
1667 * compare | |
1668 < a | |
1669 < x\ud800\udc00b | |
1670 | |
1671 ** test: DataDrivenCollationTest/da_TestPrimary | |
1672 # This test goes through primary strength cases | |
1673 @ locale da | |
1674 % strength=primary | |
1675 * compare | |
1676 < Lvi | |
1677 < Lwi | |
1678 * compare | |
1679 < L\u00e4vi | |
1680 < L\u00f6wi | |
1681 * compare | |
1682 < L\u00fcbeck | |
1683 = Lybeck | |
1684 | |
1685 ** test: DataDrivenCollationTest/da_TestTertiary | |
1686 # This test goes through tertiary strength cases | |
1687 @ locale da | |
1688 % strength=tertiary | |
1689 * compare | |
1690 < Luc | |
1691 < luck | |
1692 * compare | |
1693 < luck | |
1694 < L\u00fcbeck | |
1695 * compare | |
1696 < lybeck | |
1697 < L\u00fcbeck | |
1698 * compare | |
1699 < L\u00e4vi | |
1700 < L\u00f6we | |
1701 * compare | |
1702 < L\u00f6ww | |
1703 < mast | |
1704 | |
1705 * compare | |
1706 < A/S | |
1707 < ANDRE | |
1708 < ANDR\u00c9 | |
1709 < ANDREAS | |
1710 < AS | |
1711 < CA | |
1712 < \u00c7A | |
1713 < CB | |
1714 < \u00c7C | |
1715 < D.S.B. | |
1716 < DA | |
1717 < \u00d0A | |
1718 < DB | |
1719 < \u00d0C | |
1720 < DSB | |
1721 < DSC | |
1722 < EKSTRA_ARBEJDE | |
1723 < EKSTRABUD0 | |
1724 < H\u00d8ST | |
1725 < HAAG | |
1726 < H\u00c5NDBOG | |
1727 < HAANDV\u00c6RKSBANKEN | |
1728 < Karl | |
1729 < karl | |
1730 < NIELS\u0020J\u00d8RGEN | |
1731 < NIELS-J\u00d8RGEN | |
1732 < NIELSEN | |
1733 < R\u00c9E,\u0020A | |
1734 < REE,\u0020B | |
1735 < R\u00c9E,\u0020L | |
1736 < REE,\u0020V | |
1737 < SCHYTT,\u0020B | |
1738 < SCHYTT,\u0020H | |
1739 < SCH\u00dcTT,\u0020H | |
1740 < SCHYTT,\u0020L | |
1741 < SCH\u00dcTT,\u0020M | |
1742 < SS | |
1743 < \u00df | |
1744 < SSA | |
1745 < STORE\u0020VILDMOSE | |
1746 < STOREK\u00c6R0 | |
1747 < STORM\u0020PETERSEN | |
1748 < STORMLY | |
1749 < THORVALD | |
1750 < THORVARDUR | |
1751 < \u00feORVAR\u00d0UR | |
1752 < THYGESEN | |
1753 < VESTERG\u00c5RD,\u0020A | |
1754 < VESTERGAARD,\u0020A | |
1755 < VESTERG\u00c5RD,\u0020B | |
1756 < \u00c6BLE | |
1757 < \u00c4BLE | |
1758 < \u00d8BERG | |
1759 < \u00d6BERG | |
1760 | |
1761 * compare | |
1762 < andere | |
1763 < chaque | |
1764 < chemin | |
1765 < cote | |
1766 < cot\u00e9 | |
1767 < c\u00f4te | |
1768 < c\u00f4t\u00e9 | |
1769 < \u010du\u010d\u0113t | |
1770 < Czech | |
1771 < hi\u0161a | |
1772 < irdisch | |
1773 < lie | |
1774 < lire | |
1775 < llama | |
1776 < l\u00f5ug | |
1777 < l\u00f2za | |
1778 < lu\u010d | |
1779 < luck | |
1780 < L\u00fcbeck | |
1781 < lye | |
1782 < l\u00e4vi | |
1783 < L\u00f6wen | |
1784 < m\u00e0\u0161ta | |
1785 < m\u00eer | |
1786 < myndig | |
1787 < M\u00e4nner | |
1788 < m\u00f6chten | |
1789 < pi\u00f1a | |
1790 < pint | |
1791 < pylon | |
1792 < \u0161\u00e0ran | |
1793 < savoir | |
1794 < \u0160erb\u016bra | |
1795 < Sietla | |
1796 < \u015blub | |
1797 < subtle | |
1798 < symbol | |
1799 < s\u00e4mtlich | |
1800 < verkehrt | |
1801 < vox | |
1802 < v\u00e4ga | |
1803 < waffle | |
1804 < wood | |
1805 < yen | |
1806 < yuan | |
1807 < yucca | |
1808 < \u017eal | |
1809 < \u017eena | |
1810 < \u017den\u0113va | |
1811 < zoo0 | |
1812 < Zviedrija | |
1813 < Z\u00fcrich | |
1814 < zysk0 | |
1815 < \u00e4ndere | |
1816 | |
1817 ** test: DataDrivenCollationTest/hi_TestNewRules | |
1818 # This test goes through new rules and tests against old rules | |
1819 @ locale hi | |
1820 * compare | |
1821 < कॐ | |
1822 < कं | |
1823 < कँ | |
1824 < कः | |
1825 | |
1826 ** test: DataDrivenCollationTest/ro_TestNewRules | |
1827 # This test goes through new rules and tests against old rules | |
1828 @ locale ro | |
1829 * compare | |
1830 < xAx | |
1831 < xă | |
1832 < xĂ | |
1833 < Xă | |
1834 < XĂ | |
1835 < xăx | |
1836 < xĂx | |
1837 < xâ | |
1838 < x | |
1839 < Xâ | |
1840 < XÂ | |
1841 < xâx | |
1842 < xÂx | |
1843 < xb | |
1844 < xIx | |
1845 < xî | |
1846 < xÎ | |
1847 < Xî | |
1848 < XÎ | |
1849 < xîx | |
1850 < xÎx | |
1851 < xj | |
1852 < xSx | |
1853 < xș | |
1854 = xş | |
1855 < xȘ | |
1856 = xŞ | |
1857 < Xș | |
1858 = Xş | |
1859 < XȘ | |
1860 = XŞ | |
1861 < xșx | |
1862 = xşx | |
1863 < xȘx | |
1864 = xŞx | |
1865 < xT | |
1866 < xTx | |
1867 < xț | |
1868 = xţ | |
1869 < xȚ | |
1870 = xŢ | |
1871 < Xț | |
1872 = Xţ | |
1873 < XȚ | |
1874 = XŢ | |
1875 < xțx | |
1876 = xţx | |
1877 < xȚx | |
1878 = xŢx | |
1879 < xU | |
1880 | |
1881 ** test: DataDrivenCollationTest/testOffsets | |
1882 # This tests cases where forwards and backwards iteration get different offsets | |
1883 @ locale en | |
1884 % strength=tertiary | |
1885 * compare | |
1886 < a\uD800\uDC00\uDC00 | |
1887 < b\uD800\uDC00\uDC00 | |
1888 * compare | |
1889 < \u0301A\u0301\u0301 | |
1890 < \u0301B\u0301\u0301 | |
1891 * compare | |
1892 < abcd\r\u0301 | |
1893 < abce\r\u0301 | |
1894 # TODO: test offsets in new CollationTest | |
1895 | |
1896 # End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. | |
1897 | |
1898 ** test: was ICU 52 cmsccoll/TestRedundantRules | |
1899 @ rules | |
1900 & a < b < c < d& [before 1] c < m | |
1901 * compare | |
1902 <1 a | |
1903 <1 b | |
1904 <1 m | |
1905 <1 c | |
1906 <1 d | |
1907 | |
1908 @ rules | |
1909 & a < b <<< c << d <<< e& [before 3] e <<< x | |
1910 * compare | |
1911 <1 a | |
1912 <1 b | |
1913 <3 c | |
1914 <2 d | |
1915 <3 x | |
1916 <3 e | |
1917 | |
1918 @ rules | |
1919 & a < b <<< c << d <<< e <<< f < g& [before 1] g < x | |
1920 * compare | |
1921 <1 a | |
1922 <1 b | |
1923 <3 c | |
1924 <2 d | |
1925 <3 e | |
1926 <3 f | |
1927 <1 x | |
1928 <1 g | |
1929 | |
1930 @ rules | |
1931 & a <<< b << c < d& a < m | |
1932 * compare | |
1933 <1 a | |
1934 <3 b | |
1935 <2 c | |
1936 <1 m | |
1937 <1 d | |
1938 | |
1939 @ rules | |
1940 &a<b<<b\u0301 &z<b | |
1941 * compare | |
1942 <1 a | |
1943 <1 b\u0301 | |
1944 <1 z | |
1945 <1 b | |
1946 | |
1947 @ rules | |
1948 &z<m<<<q<<<m | |
1949 * compare | |
1950 <1 z | |
1951 <1 q | |
1952 <3 m | |
1953 | |
1954 @ rules | |
1955 &z<<<m<q<<<m | |
1956 * compare | |
1957 <1 z | |
1958 <1 q | |
1959 <3 m | |
1960 | |
1961 @ rules | |
1962 & a < b < c < d& r < c | |
1963 * compare | |
1964 <1 a | |
1965 <1 b | |
1966 <1 d | |
1967 <1 r | |
1968 <1 c | |
1969 | |
1970 @ rules | |
1971 & a < b < c < d& c < m | |
1972 * compare | |
1973 <1 a | |
1974 <1 b | |
1975 <1 c | |
1976 <1 m | |
1977 <1 d | |
1978 | |
1979 @ rules | |
1980 & a < b < c < d& a < m | |
1981 * compare | |
1982 <1 a | |
1983 <1 m | |
1984 <1 b | |
1985 <1 c | |
1986 <1 d | |
1987 | |
1988 ** test: was ICU 52 cmsccoll/TestExpansionSyntax | |
1989 # The following two rules should sort the particular list of strings the same. | |
1990 @ rules | |
1991 &AE <<< a << b <<< c &d <<< f | |
1992 * compare | |
1993 <1 AE | |
1994 <3 a | |
1995 <2 b | |
1996 <3 c | |
1997 <1 d | |
1998 <3 f | |
1999 | |
2000 @ rules | |
2001 &A <<< a / E << b / E <<< c /E &d <<< f | |
2002 * compare | |
2003 <1 AE | |
2004 <3 a | |
2005 <2 b | |
2006 <3 c | |
2007 <1 d | |
2008 <3 f | |
2009 | |
2010 # The following two rules should sort the particular list of strings the same. | |
2011 @ rules | |
2012 &AE <<< a <<< b << c << d < e < f <<< g | |
2013 * compare | |
2014 <1 AE | |
2015 <3 a | |
2016 <3 b | |
2017 <2 c | |
2018 <2 d | |
2019 <1 e | |
2020 <1 f | |
2021 <3 g | |
2022 | |
2023 @ rules | |
2024 &A <<< a / E <<< b / E << c / E << d / E < e < f <<< g | |
2025 * compare | |
2026 <1 AE | |
2027 <3 a | |
2028 <3 b | |
2029 <2 c | |
2030 <2 d | |
2031 <1 e | |
2032 <1 f | |
2033 <3 g | |
2034 | |
2035 # The following two rules should sort the particular list of strings the same. | |
2036 @ rules | |
2037 &AE <<< B <<< C / D <<< F | |
2038 * compare | |
2039 <1 AE | |
2040 <3 B | |
2041 <3 F | |
2042 <1 AED | |
2043 <3 C | |
2044 | |
2045 @ rules | |
2046 &A <<< B / E <<< C / ED <<< F / E | |
2047 * compare | |
2048 <1 AE | |
2049 <3 B | |
2050 <3 F | |
2051 <1 AED | |
2052 <3 C | |
2053 | |
2054 ** test: never reorder trailing primaries | |
2055 @ root | |
2056 % reorder Zzzz Grek | |
2057 * compare | |
2058 <1 L | |
2059 <1 字 | |
2060 <1 Ω | |
2061 <1 \uFFFD | |
2062 <1 \uFFFF | |
2063 | |
2064 ** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes | |
2065 @ rules | |
2066 &u=ab|cd | |
2067 &v=b|ce | |
2068 * compare | |
2069 <1 abc | |
2070 <1 abcc | |
2071 <1 abcf | |
2072 <1 abcd | |
2073 = abu | |
2074 <1 abce | |
2075 = abv | |
2076 | |
2077 # With the following rules, there is only one prefix per composite ĉ or ç, | |
2078 # but both prefixes apply to just c in NFD form. | |
2079 # We would get different results for composed vs. NFD input | |
2080 # if we fell back directly from longest-prefix mappings to no-prefix mappings. | |
2081 @ rules | |
2082 &x=op|ĉ | |
2083 &y=p|ç | |
2084 * compare | |
2085 <1 opc | |
2086 <2 opć | |
2087 <1 opcz | |
2088 <1 opd | |
2089 <1 opĉ | |
2090 = opc\u0302 | |
2091 = opx | |
2092 <1 opç | |
2093 = opc\u0327 | |
2094 = opy | |
2095 | |
2096 # The mapping is used which has the longest matching prefix for which | |
2097 # there is also a suffix match, with the longest suffix match among several for that prefix. | |
2098 @ rules | |
2099 &❶=d | |
2100 &❷=de | |
2101 &❸=def | |
2102 &①=c|d | |
2103 &②=c|de | |
2104 &③=c|def | |
2105 &④=bc|d | |
2106 &⑤=bc|de | |
2107 &⑥=bc|def | |
2108 &⑦=abc|d | |
2109 &⑧=abc|de | |
2110 &⑨=abc|def | |
2111 * compare | |
2112 <1 9aadzz | |
2113 = 9aa❶zz | |
2114 <1 9aadez | |
2115 = 9aa❷z | |
2116 <1 9aadef | |
2117 = 9aa❸ | |
2118 <1 9acdzz | |
2119 = 9ac①zz | |
2120 <1 9acdez | |
2121 = 9ac②z | |
2122 <1 9acdef | |
2123 = 9ac③ | |
2124 <1 9bcdzz | |
2125 = 9bc④zz | |
2126 <1 9bcdez | |
2127 = 9bc⑤z | |
2128 <1 9bcdef | |
2129 = 9bc⑥ | |
2130 <1 abcdzz | |
2131 = abc⑦zz | |
2132 <1 abcdez | |
2133 = abc⑧z | |
2134 <1 abcdef | |
2135 = abc⑨ | |
2136 | |
2137 ** test: prefix + discontiguous contraction with missing prefix contraction | |
2138 # Unfortunate terminology: The first "prefix" here is the pre-context, | |
2139 # the second "prefix" refers to the contraction/relation string that is | |
2140 # one shorter than the one being tested. | |
2141 @ rules | |
2142 &x=p|e | |
2143 &y=p|ê | |
2144 &z=op|ê | |
2145 # No mapping for op|e: | |
2146 # Discontiguous contraction matching should not match op|ê in opệ | |
2147 # because it would have to skip the dot below and extend a match on op|e by the circumflex, | |
2148 # but there is no match on op|e. | |
2149 * compare | |
2150 <1 oPe | |
2151 <1 ope | |
2152 = opx | |
2153 <1 opệ | |
2154 = opy\u0323 # y not z | |
2155 <1 opê | |
2156 = opz | |
2157 | |
2158 # We cannot test for fallback by whether the contraction default CE32 | |
2159 # is for another contraction. With the following rules, there is no mapping for op|e, | |
2160 # and the fallback to prefix p has no contractions. | |
2161 @ rules | |
2162 &x=p|e | |
2163 &z=op|ê | |
2164 * compare | |
2165 <1 oPe | |
2166 <1 ope | |
2167 = opx | |
2168 <2 opệ | |
2169 = opx\u0323\u0302 # x not z | |
2170 <1 opê | |
2171 = opz | |
2172 | |
2173 # One more variation: Fallback to the simple code point, no shorter non-empty prefix. | |
2174 @ rules | |
2175 &x=e | |
2176 &z=op|ê | |
2177 * compare | |
2178 <1 ope | |
2179 = opx | |
2180 <3 oPe | |
2181 = oPx | |
2182 <2 opệ | |
2183 = opx\u0323\u0302 # x not z | |
2184 <1 opê | |
2185 = opz | |
2186 | |
2187 ** test: maxVariable via rules | |
2188 @ rules | |
2189 [maxVariable space][alternate shifted] | |
2190 * compare | |
2191 = \u0020 | |
2192 = \u000A | |
2193 <1 . | |
2194 <1 ° # degree sign | |
2195 <1 $ | |
2196 <1 0 | |
2197 | |
2198 ** test: maxVariable via setting | |
2199 @ root | |
2200 % maxVariable=currency | |
2201 % alternate=shifted | |
2202 * compare | |
2203 = \u0020 | |
2204 = \u000A | |
2205 = . | |
2206 = ° # degree sign | |
2207 = $ | |
2208 <1 0 | |
2209 | |
2210 ** test: ICU4J CollationMiscTest/TestContractionClosure (ää) | |
2211 # This tests canonical closure, but it also tests that CollationFastLatin | |
2212 # bails out properly for contractions with combining marks. | |
2213 # For that we need pairs of strings that remain in the Latin fastpath | |
2214 # long enough, hence the extra "= b" lines. | |
2215 @ rules | |
2216 &b=\u00e4\u00e4 | |
2217 * compare | |
2218 <1 b | |
2219 = \u00e4\u00e4 | |
2220 = b | |
2221 = a\u0308a\u0308 | |
2222 = b | |
2223 = \u00e4a\u0308 | |
2224 = b | |
2225 = a\u0308\u00e4 | |
2226 | |
2227 ** test: ICU4J CollationMiscTest/TestContractionClosure (Å) | |
2228 @ rules | |
2229 &b=\u00C5 | |
2230 * compare | |
2231 <1 b | |
2232 = \u00C5 | |
2233 = b | |
2234 = A\u030A | |
2235 = b | |
2236 = \u212B | |
2237 | |
2238 ** test: reset-before on already-tailored characters, ICU ticket 10108 | |
2239 @ rules | |
2240 &a<w<<x &[before 2]x<<y | |
2241 * compare | |
2242 <1 a | |
2243 <1 w | |
2244 <2 y | |
2245 <2 x | |
2246 | |
2247 @ rules | |
2248 &a<<w<<<x &[before 2]x<<y | |
2249 * compare | |
2250 <1 a | |
2251 <2 y | |
2252 <2 w | |
2253 <3 x | |
2254 | |
2255 @ rules | |
2256 &a<w<x &[before 2]x<<y | |
2257 * compare | |
2258 <1 a | |
2259 <1 w | |
2260 <1 y | |
2261 <2 x | |
2262 | |
2263 @ rules | |
2264 &a<w<<<x &[before 2]x<<y | |
2265 * compare | |
2266 <1 a | |
2267 <1 y | |
2268 <2 w | |
2269 <3 x | |
2270 | |
2271 ** test: numeric collation with other settings, ICU ticket 9092 | |
2272 @ root | |
2273 % strength=identical | |
2274 % caseFirst=upper | |
2275 % numeric=on | |
2276 * compare | |
2277 <1 100\u0020a | |
2278 <1 101 | |
2279 | |
2280 ** test: collation type fallback from unsupported type, ICU ticket 10149 | |
2281 @ locale fr-CA-u-co-phonebk | |
2282 # Expect the same result as with fr-CA, using backwards-secondary order. | |
2283 # That is, we should fall back from the unsupported collation type | |
2284 # to the locale's default collation type. | |
2285 * compare | |
2286 <1 cote | |
2287 <2 côte | |
2288 <2 coté | |
2289 <2 côté | |
2290 | |
2291 ** test: @ is equivalent to [backwards 2], ICU ticket 9956 | |
2292 @ rules | |
2293 &b<a @ &v<<w | |
2294 * compare | |
2295 <1 b | |
2296 <1 a | |
2297 <1 cote | |
2298 <2 côte | |
2299 <2 coté | |
2300 <2 côté | |
2301 <1 v | |
2302 <2 w | |
2303 <1 x | |
2304 | |
2305 ** test: shifted+reordering, ICU ticket 9507 | |
2306 @ root | |
2307 % reorder Grek punct space | |
2308 % alternate=shifted | |
2309 % strength=quaternary | |
2310 # Which primaries are "variable" should be determined without script reordering, | |
2311 # and then primaries should be reordered whether they are shifted to quaternary or not. | |
2312 * compare | |
2313 <4 ( # punctuation | |
2314 <4 ) | |
2315 <4 \u0020 # space | |
2316 <1 ` # symbol | |
2317 <1 ^ | |
2318 <1 $ # currency symbol | |
2319 <1 € | |
2320 <1 0 # numbers | |
2321 <1 ε # Greek | |
2322 <1 e # Latin | |
2323 <1 e(e | |
2324 <4 e)e | |
2325 <4 e\u0020e | |
2326 <4 ee | |
2327 <3 e(E | |
2328 <4 e)E | |
2329 <4 e\u0020E | |
2330 <4 eE | |
2331 | |
2332 ** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 | |
2333 @ rules | |
2334 &\u0001<<<b<<<B | |
2335 % caseFirst=upper | |
2336 * compare | |
2337 <1 aaa | |
2338 <3 aaaB | |
2339 | |
2340 ** test: secondary+case ignores secondary ignorables, ICU ticket 9355 | |
2341 @ rules | |
2342 &\u0001<<<b<<<B | |
2343 % strength=secondary | |
2344 % caseLevel=on | |
2345 * compare | |
2346 <1 a | |
2347 = ab | |
2348 = aB | |
2349 | |
2350 ** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 | |
2351 @ rules | |
2352 &[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 | |
2353 * compare | |
2354 <1 ൗx | |
2355 <2 ൌx | |
2356 <1 ൗy | |
2357 <2 ൌy | |
2358 | |
2359 ** test: quoted apostrophe in compact syntax, ICU ticket 8204 | |
2360 @ rules | |
2361 &q<<*a''c | |
2362 * compare | |
2363 <1 d | |
2364 <1 p | |
2365 <1 q | |
2366 <2 a | |
2367 <2 \u0027 | |
2368 <2 c | |
2369 <1 r | |
2370 | |
2371 # ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" | |
2372 ** test: locale -u- with collation keywords, ICU ticket 8260 | |
2373 @ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 | |
2374 * compare | |
2375 <4 \u0020 # space is shifted, strength=quaternary | |
2376 <1 ! # punctuation is regular | |
2377 <1 2 | |
2378 <1 12 # numeric sorting | |
2379 <1 B | |
2380 <c b # uppercase first on case level | |
2381 <1 x\u0301\u0308 | |
2382 <2 x\u0308\u0301 # normalization off | |
2383 | |
2384 ** test: locale @ with collation keywords, ICU ticket 8260 | |
2385 @ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted | |
2386 * compare | |
2387 <4 $ # currency symbols are shifted, strength=quaternary | |
2388 <1 àla | |
2389 <2 alà # backwards secondary level | |
2390 | |
2391 ** test: locale -u- with script reordering, ICU ticket 8260 | |
2392 @ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai | |
2393 * compare | |
2394 <1 \u0020 | |
2395 <1 あ | |
2396 <1 ☂ | |
2397 <1 Ω | |
2398 <1 丂 | |
2399 <1 ж | |
2400 <1 L | |
2401 <1 4 | |
2402 <1 Ձ | |
2403 <1 अ | |
2404 <1 ሄ | |
2405 <1 ฉ | |
2406 | |
2407 ** test: locale @collation=type should be case-insensitive | |
2408 @ locale de@coLLation=PhoneBook | |
2409 * compare | |
2410 <1 ae | |
2411 <2 ä | |
2412 <3 Ä | |
2413 | |
2414 ** test: import root search rules plus German phonebook rules, ICU ticket 8962 | |
2415 @ locale de-u-co-search | |
2416 * compare | |
2417 <1 = | |
2418 <1 ≠ | |
2419 <1 a | |
2420 <1 ae | |
2421 <2 ä | |
2422 | |
2423 # Once more, but with runtime builder. | |
2424 @ rules | |
2425 [import und-u-co-search][import de-u-co-phonebk] | |
2426 * compare | |
2427 <1 = | |
2428 <1 ≠ | |
2429 <1 a | |
2430 <1 ae | |
2431 <2 ä | |
2432 | |
2433 # Once again, with import from "root" not "und" (as in a proper language tag). | |
2434 @ rules | |
2435 [import root-u-co-search][import de-u-co-phonebk] | |
2436 * compare | |
2437 <1 = | |
2438 <1 ≠ | |
2439 <1 a | |
2440 <1 ae | |
2441 <2 ä | |
2442 | |
2443 ** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998 | |
2444 # Greek should sort Greek first. | |
2445 @ rules | |
2446 [import el] | |
2447 * compare | |
2448 <1 4 | |
2449 <1 Ω | |
2450 <1 L | |
2451 | |
2452 # Import Greek, and then reset the reordering. | |
2453 @ rules | |
2454 [import el][reorder Zzzz] | |
2455 * compare | |
2456 <1 4 | |
2457 <1 L | |
2458 <1 Ω | |
2459 | |
2460 # "others" is a synonym for Zzzz. | |
2461 @ rules | |
2462 [import el][reorder others] | |
2463 * compare | |
2464 <1 4 | |
2465 <1 L | |
2466 <1 Ω | |
2467 | |
2468 ** test: regression test for CollationFastLatinBuilder, ICU ticket 11388 | |
2469 @ rules | |
2470 &x<<aa<<<Aa<<<AA | |
2471 % strength=secondary | |
2472 * compare | |
2473 <1 AA | |
2474 <2 Aẩ | |
2475 <2 aą | |
2476 * compare | |
2477 <1 AA | |
2478 <2 aą | |
2479 | |
2480 ** test: tailor tertiary-after a common tertiary where there is a lower one | |
2481 # Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one. | |
2482 # See ICU ticket 11448 & CLDR ticket 7222. | |
2483 @ rules | |
2484 &あ<<<x<<<y<<<z | |
2485 * compare | |
2486 <1 ぁ | |
2487 <3 あ | |
2488 <3 x | |
2489 <3 y | |
2490 <3 z | |
2491 <3 ァ | |
2492 <1 い | |
2493 | |
2494 ** test: tailor tertiary-after a below-common tertiary | |
2495 @ rules | |
2496 &ぁ<<<x<<<y<<<z | |
2497 * compare | |
2498 <1 ぁ | |
2499 <3 x | |
2500 <3 y | |
2501 <3 z | |
2502 <3 あ | |
2503 <3 ァ | |
2504 <1 い | |
2505 | |
2506 ** test: tailor tertiary-before a common tertiary where there is a lower one | |
2507 @ rules | |
2508 &[before 3]あ<<<x<<<y<<<z | |
2509 * compare | |
2510 <1 ぁ | |
2511 <3 x | |
2512 <3 y | |
2513 <3 z | |
2514 <3 あ | |
2515 <3 ァ | |
2516 <1 い | |
2517 | |
2518 ** test: tailor tertiary-before a below-common tertiary | |
2519 @ rules | |
2520 &[before 3]ぁ<<<x<<<y<<<z | |
2521 * compare | |
2522 <1 x | |
2523 <3 y | |
2524 <3 z | |
2525 <3 ぁ | |
2526 <3 あ | |
2527 <3 ァ | |
2528 <1 い | |
2529 | |
2530 ** test: reorder single scripts not groups, ICU ticket 11449 | |
2531 @ root | |
2532 % reorder Goth Latn | |
2533 * compare | |
2534 <1 4 | |
2535 <1 𐌰 # Gothic | |
2536 <1 L | |
2537 <1 Ω | |
2538 # Before ICU 55, the following reordered together with Gothic. | |
2539 <1 𐌈 # Old Italic | |
2540 <1 𐑐 # Shavian | |
OLD | NEW |