Index: source/test/testdata/collationtest.txt |
diff --git a/source/test/testdata/collationtest.txt b/source/test/testdata/collationtest.txt |
new file mode 100644 |
index 0000000000000000000000000000000000000000..d55f53188892f67d62c19083cef0963589c5806a |
--- /dev/null |
+++ b/source/test/testdata/collationtest.txt |
@@ -0,0 +1,2466 @@ |
+# Copyright (c) 2012-2014 International Business Machines |
+# Corporation and others. All Rights Reserved. |
+# |
+# This file should be in UTF-8 with a signature byte sequence ("BOM"). |
+# |
+# collationtest.txt: Collation test data. |
+# |
+# created on: 2012apr13 |
+# created by: Markus W. Scherer |
+ |
+# A line with "** test: description" is used for verbose and error output. |
+ |
+# A collator can be set with "@ root" or "@ locale language-tag", |
+# for example "@ locale de-u-co-phonebk". |
+# An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". |
+ |
+# A collator can be built with "@ rules". |
+# An "@ rules" line is followed by one or more lines with the tailoring rules. |
+ |
+# A collator can be modified with "% attribute=value". |
+ |
+# "* compare" tests the order (= or <) of the following strings. |
+# The relation can be "=" or "<" (the level of the difference is not specified) |
+# or "<1", "<2", "<c", "<3", "<4" (indicating the level of the difference). |
+ |
+# Test sections ("* compare") are terminated by |
+# definitions of new collators, changing attributes, or new test sections. |
+ |
+** test: simple CEs & expansions |
+# Many types of mappings are tested elsewhere, including via the UCA conformance tests. |
+# Here we mostly cover a few unusual mappings. |
+@ rules |
+&\x01 # most control codes are ignorable |
+<<<\u0300 # tertiary CE |
+&9<\x00 # NUL not ignorable |
+&\uA00A\uA00B=\uA002 # two long-primary CEs |
+&\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits |
+ |
+* compare |
+= \x01 |
+= \x02 |
+<3 \u0300 |
+<1 9 |
+<1 \x00 |
+= \x01\x00\x02 |
+<1 a |
+<3 a\u0300 |
+<2 a\u0308 |
+= ä |
+<1 b |
+<1 か # Hiragana Ka (U+304B) |
+<2 か\u3099 # plus voiced sound mark |
+= が # Hiragana Ga (U+304C) |
+<1 \uA00A\uA00B |
+= \uA002 |
+<1 \uA00A\uA00B\u00050004 |
+<1 \uA00A\uA00B\u00050005 |
+= \uA003 |
+<1 \uA00A\uA00B\u00050006 |
+ |
+** test: contractions |
+# Create some interesting mappings, and map some normalization-inert characters |
+# (which are not subject to canonical reordering) |
+# to some of the same CEs to check the sequence of CEs. |
+@ rules |
+ |
+# Contractions starting with 'a' should not continue with any character < U+0300 |
+# so that we can test a shortcut for that. |
+&a=ⓐ |
+&b<bz=ⓑ |
+&d<dz\u0301=ⓓ # d+z+acute |
+&z |
+<a\u0301=Ⓐ # a+acute sorts after z |
+<a\u0301\u0301=Ⓑ # a+acute+acute |
+<a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right |
+<a\u030a=Ⓓ # a+ring |
+<a\u0323=Ⓔ # a+dot below |
+<a\u0323\u0358=Ⓕ # a+dot below+dot above right |
+<a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring |
+<a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z |
+ |
+&\U0001D158=⁰ # musical notehead black (has a symbol primary) |
+<\U0001D158\U0001D165=¼ # musical quarter note |
+ |
+# deliberately missing prefix contractions: |
+# dz |
+# a\u0327 |
+# a\u0327\u0323 |
+# a\u0327\u0323b |
+ |
+&\x01 |
+<<<\U0001D165=¹ # musical stem (ccc=216) |
+<<<\U0001D16D=² # musical augmentation dot (ccc=226) |
+<<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) |
+&\u0301=❶ # acute (ccc=230) |
+&\u030a=❷ # ring (ccc=230) |
+&\u0308=❸ # diaeresis (ccc=230) |
+<<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) |
+&\u0327=❺ # cedilla (ccc=202) |
+&\u0323=❻ # dot below (ccc=220) |
+&\u0331=❼ # macron below (ccc=220) |
+<<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) |
+&\u0334=❾ # tilde overlay (ccc=1) |
+&\u0358=❿ # dot above right (ccc=232) |
+ |
+&\u0f71=① # tibetan vowel sign aa |
+&\u0f72=② # tibetan vowel sign i |
+# \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 |
+&\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) |
+ |
+** test: simple contractions |
+ |
+# Some strings are chosen to cause incremental contiguous contraction matching to |
+# go into partial matches for prefixes of contractions |
+# (where the prefixes are deliberately not also contractions). |
+# When there is no complete match, then the matching code must back out of those |
+# so that discontiguous contractions work as specified. |
+ |
+* compare |
+# contraction starter with no following text, or mismatch, or blocked |
+<1 a |
+= ⓐ |
+<1 aa |
+= ⓐⓐ |
+<1 ab |
+= ⓐb |
+<1 az |
+= ⓐz |
+ |
+* compare |
+<1 a |
+<2 a\u0308\u030a # ring blocked by diaeresis |
+= ⓐ❸❷ |
+<2 a\u0327 |
+= ⓐ❺ |
+ |
+* compare |
+<2 \u0308 |
+= ❸ |
+<2 \u0308\u030a\u0301 # acute blocked by ring |
+= ❸❷❶ |
+ |
+* compare |
+<1 \U0001D158 |
+= ⁰ |
+<1 \U0001D158\U0001D165 |
+= ¼ |
+ |
+# no discontiguous contraction because of missing prefix contraction d+z, |
+# and a starter ('z') after the 'd' |
+* compare |
+<1 dz\u0323\u0301 |
+= dz❻❶ |
+ |
+# contiguous contractions |
+* compare |
+<1 abz |
+= ⓐⓑ |
+<1 abzz |
+= ⓐⓑz |
+ |
+* compare |
+<1 a |
+<1 z |
+<1 a\u0301 |
+= Ⓐ |
+<1 a\u0301\u0301 |
+= Ⓑ |
+<1 a\u0301\u0301\u0358 |
+= Ⓒ |
+<1 a\u030a |
+= Ⓓ |
+<1 a\u0323\u0358 |
+= Ⓕ |
+<1 a\u0327\u0323\u030a # match despite missing prefix |
+= Ⓖ |
+<1 a\u0327\u0323bz |
+= Ⓗ |
+ |
+* compare |
+<2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second |
+= ❸❹ |
+ |
+* compare |
+<1 \U0001D158\U0001D165 |
+= ¼ |
+ |
+* compare |
+<3 \U0001D165\U0001D16D |
+= ³ |
+ |
+** test: discontiguous contractions |
+* compare |
+<1 a\u0327\u030a # a+ring skips cedilla |
+= Ⓓ❺ |
+<2 a\u0327\u0327\u030a # a+ring skips 2 cedillas |
+= Ⓓ❺❺ |
+<2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas |
+= Ⓓ❺❺❺ |
+<2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas |
+= Ⓓ❾❺❺ |
+<1 a\u0327\u0323 # a+dot below skips cedilla |
+= Ⓔ❺ |
+<1 a\u0323\u0301\u0358 # a+dot below+dot above right: 2-char match, then skips acute |
+= Ⓕ❶ |
+<2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay |
+= Ⓕ❾ |
+ |
+* compare |
+<2 \u0331\u0331\u0358 # macron below+dot above right skips the second macron below |
+= ❽❼ |
+ |
+* compare |
+<1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) |
+= Ⓓ❺❼❻ |
+<1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla |
+= Ⓔ❺²❷ |
+<2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas |
+= Ⓔ❺❺❷ |
+<2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla |
+= Ⓔ❺❻❷ |
+<2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla |
+= Ⓔ❾❺❷ |
+ |
+* compare |
+<1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla |
+= ¼❺ |
+<1 a\U0001D165\u0323 # a+dot below skips stem |
+= Ⓔ¹ |
+ |
+# partial contiguous match, backs up, matches discontiguous contraction |
+<1 a\u0327\u0323b |
+= Ⓔ❺b |
+<1 a\u0327\u0323ba |
+= Ⓔ❺bⓐ |
+ |
+# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks |
+* compare |
+<1 a\u0327\u0301\u0301\u0358 |
+= Ⓒ❺ |
+ |
+# FCD but not NFD |
+* compare |
+<1 a\u0f73\u0301 # a+acute skips tibetan ii |
+= Ⓐ③ |
+ |
+# FCD but the 0f71 inside the 0f73 must be skipped |
+# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 |
+* compare |
+<1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 |
+= ③① |
+ |
+** test: discontiguous contractions with nested contractions |
+* compare |
+<1 a\u0323\u0308\u0301\u0358 |
+= Ⓕ❹ |
+<2 a\u0323\u0308\u0301\u0308\u0301\u0358 |
+= Ⓕ❹❹ |
+ |
+** test: discontiguous contractions with interleaved contractions |
+* compare |
+# a+ring & cedilla & macron below+dot above right |
+<1 a\u0327\u0331\u030a\u0358 |
+= Ⓓ❺❽ |
+ |
+# a+ring & 1x..3x macron below+dot above right |
+<2 a\u0331\u030a\u0358 |
+= Ⓓ❽ |
+<2 a\u0331\u0331\u030a\u0358\u0358 |
+= Ⓓ❽❽ |
+# also skips acute |
+<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 |
+= Ⓓ❽❽❽❶ |
+ |
+# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute |
+<1 a\U0001D165\u0323\U0001D16Ddz\u0301 |
+= Ⓔ³ⓓ |
+ |
+** test: some simple string comparisons |
+@ root |
+* compare |
+# first string compares against "" |
+= \u0000 |
+< a |
+<1 b |
+<3 B |
+= \u0000B\u0000 |
+ |
+** test: compare with strength=primary |
+% strength=primary |
+* compare |
+<1 a |
+<1 b |
+= B |
+ |
+** test: compare with strength=secondary |
+% strength=secondary |
+* compare |
+<1 a |
+<1 b |
+= B |
+ |
+** test: compare with strength=tertiary |
+% strength=tertiary |
+* compare |
+<1 a |
+<1 b |
+<3 B |
+ |
+** test: compare with strength=quaternary |
+% strength=quaternary |
+* compare |
+<1 a |
+<1 b |
+<3 B |
+ |
+** test: compare with strength=identical |
+% strength=identical |
+* compare |
+<1 a |
+<1 b |
+<3 B |
+ |
+** test: côté with forwards secondary |
+@ root |
+* compare |
+<1 cote |
+<2 coté |
+<2 côte |
+<2 côté |
+ |
+** test: côté with forwards secondary vs. U+FFFE merge separator |
+# Merged sort keys: On each level, any difference in the first segment |
+# must trump any further difference. |
+* compare |
+<1 cote\uFFFEcôté |
+<2 coté\uFFFEcôte |
+<2 côte\uFFFEcoté |
+<2 côté\uFFFEcote |
+ |
+** test: côté with backwards secondary |
+% backwards=on |
+* compare |
+<1 cote |
+<2 côte |
+<2 coté |
+<2 côté |
+ |
+** test: côté with backwards secondary vs. U+FFFE merge separator |
+# Merged sort keys: On each level, any difference in the first segment |
+# must trump any further difference. |
+* compare |
+<1 cote\uFFFEcôté |
+<2 côte\uFFFEcoté |
+<2 coté\uFFFEcôte |
+<2 côté\uFFFEcote |
+ |
+** test: U+FFFE on identical level |
+@ root |
+% strength=identical |
+* compare |
+# All of these control codes are completely-ignorable, so that |
+# their low code points are compared with the merge separator. |
+# The merge separator must compare less than any other character. |
+<1 \uFFFE\u0001\u0002\u0003 |
+<i \u0001\uFFFE\u0002\u0003 |
+<i \u0001\u0002\uFFFE\u0003 |
+<i \u0001\u0002\u0003\uFFFE |
+ |
+* compare |
+# The merge separator must even compare less than U+0000. |
+<1 \uFFFE\u0000\u0000 |
+<i \u0000\uFFFE\u0000 |
+<i \u0000\u0000\uFFFE |
+ |
+** test: Hani < surrogates < U+FFFD |
+# Note: compareUTF8() treats unpaired surrogates like U+FFFD. |
+# When tested via compareUTF8(), the strings with unpaired surrogates therefore |
+# compare equal to each other and equal to the string with U+FFFD. |
+@ root |
+% strength=identical |
+* compare |
+<1 abz |
+<1 a\u4e00z |
+<1 a\U00020000z |
+<1 a\ud800z |
+<1 a\udbffz |
+<1 a\udc00z |
+<1 a\udfffz |
+<1 a\ufffdz |
+ |
+** test: script reordering |
+@ root |
+% reorder Hani Zzzz digit |
+* compare |
+<1 ? |
+<1 + |
+<1 丂 |
+<1 a |
+<1 α |
+<1 5 |
+ |
+% reorder default |
+* compare |
+<1 ? |
+<1 + |
+<1 5 |
+<1 a |
+<1 α |
+<1 丂 |
+ |
+** test: empty rules |
+@ rules |
+* compare |
+<1 a |
+<2 ä |
+<3 Ä |
+<1 b |
+ |
+** test: very simple rules |
+@ rules |
+&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z |
+% strength=quaternary |
+* compare |
+<1 a |
+= e |
+<4 q |
+<4 r |
+<1 x |
+<3 X |
+<2 y |
+<3 Y |
+<2 z |
+<3 Z |
+ |
+** test: tailoring twice before a root position: primary |
+@ rules |
+&[before 1]b<p |
+&[before 1]b<q |
+* compare |
+<1 a |
+<1 p |
+<1 q |
+<1 b |
+ |
+** test: tailoring twice before a root position: secondary |
+@ rules |
+&[before 2]ſ<<p |
+&[before 2]ſ<<q |
+* compare |
+<1 s |
+<2 p |
+<2 q |
+<2 ſ |
+ |
+# secondary-before common weight |
+@ rules |
+&[before 2]b<<p |
+&[before 2]b<<q |
+* compare |
+<1 a |
+<1 p |
+<2 q |
+<2 b |
+ |
+** test: tailoring twice before a root position: tertiary |
+@ rules |
+&[before 3]B<<<p |
+&[before 3]B<<<q |
+* compare |
+<1 b |
+<3 p |
+<3 q |
+<3 B |
+ |
+# tertiary-before common weight |
+@ rules |
+&[before 3]b<<<p |
+&[before 3]b<<<q |
+* compare |
+<1 a |
+<1 p |
+<3 q |
+<3 b |
+ |
+@ rules |
+&[before 2]b<<s |
+&[before 3]s<<<p |
+&[before 3]s<<<q |
+* compare |
+<1 a |
+<1 p |
+<3 q |
+<3 s |
+<2 b |
+ |
+** test: tailor after completely ignorable |
+@ rules |
+&\x00<<<x<<y |
+* compare |
+= \x00 |
+= \x1F |
+<3 x |
+<2 y |
+ |
+** test: secondary tailoring gaps, ICU ticket 9362 |
+@ rules |
+&[before 2]s<<'_' |
+&s<<r # secondary between s and ſ (long s) |
+&ſ<<*a-q # more than 15 between ſ and secondary CE boundary |
+&[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE |
+&[last primary ignorable]<<y<<z |
+ |
+* compare |
+<2 u |
+<2 v |
+<2 \u0332 # lowest secondary CE |
+<2 \u0308 |
+<2 y |
+<2 z |
+<1 s_ |
+<2 ss |
+<2 sr |
+<2 sſ |
+<2 sa |
+<2 sb |
+<2 sp |
+<2 sq |
+<2 sus |
+<2 svs |
+<2 rs |
+ |
+** test: tertiary tailoring gaps, ICU ticket 9362 |
+@ rules |
+&[before 3]t<<<'_' |
+&t<<<r # tertiary between t and fullwidth t |
+&ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary |
+&[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE |
+&[last secondary ignorable]<<<y<<<z |
+ |
+* compare |
+<3 u |
+<3 v |
+# Note: The root collator currently does not map any characters to tertiary CEs. |
+<3 y |
+<3 z |
+<1 t_ |
+<3 tt |
+<3 tr |
+<3 t\uFF54 # t + fullwidth t (U+FF54) |
+<3 tᵀ |
+<3 ta |
+<3 tb |
+<3 tp |
+<3 tq |
+<3 tut |
+<3 tvt |
+<3 rt |
+ |
+** test: secondary & tertiary around root character |
+@ rules |
+&[before 2]m<<r |
+&m<<s |
+&[before 3]m<<<u |
+&m<<<v |
+* compare |
+<1 l |
+<1 r |
+<2 u |
+<3 m |
+<3 v |
+<2 s |
+<1 n |
+ |
+** test: secondary & tertiary around tailored item |
+@ rules |
+&m<x |
+&[before 2]x<<r |
+&x<<s |
+&[before 3]x<<<u |
+&x<<<v |
+* compare |
+<1 m |
+<1 r |
+<2 u |
+<3 x |
+<3 v |
+<2 s |
+<1 n |
+ |
+** test: more nesting of secondary & tertiary before |
+@ rules |
+&[before 3]m<<<u |
+&[before 2]m<<r |
+&[before 3]r<<<q |
+&m<<<w |
+&m<<t |
+&[before 3]w<<<v |
+&w<<<x |
+&w<<s |
+* compare |
+<1 l |
+<1 q |
+<3 r |
+<2 u |
+<3 m |
+<3 v |
+<3 w |
+<3 x |
+<2 s |
+<2 t |
+<1 n |
+ |
+** test: case bits |
+@ rules |
+&w<x # tailored CE getting case bits |
+ =uv=uV=Uv=UV # 2 chars -> 1 CE |
+&ae=ch=cH=Ch=CH # 2 chars -> 2 CEs |
+&rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs |
+% caseFirst=lower |
+* compare |
+<1 ae |
+= ch |
+<3 cH |
+<3 Ch |
+<3 CH |
+<1 rst |
+= yz |
+<3 yZ |
+<3 Yz |
+<3 YZ |
+<1 w |
+<1 x |
+= uv |
+<3 uV |
+= Uv # mixed case on single CE cannot distinguish variations |
+<3 UV |
+ |
+** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower |
+@ rules |
+&\u0001<<<t<<<T # tertiary CEs |
+% caseFirst=lower |
+* compare |
+<1 aa |
+<3 aat |
+<3 aaT |
+<3 aA |
+<3 aAt |
+<3 ata |
+<3 aTa |
+ |
+** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper |
+% caseFirst=upper |
+* compare |
+<1 aA |
+<3 aAt |
+<3 aa |
+<3 aat |
+<3 aaT |
+<3 ata |
+<3 aTa |
+ |
+** test: reset on expansion, ICU tickets 9415 & 9593 |
+@ rules |
+&æ<x # tailor the last primary CE so that x sorts between ae and af |
+&æb=bæ # copy all reset CEs to make bæ sort the same |
+&각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 |
+&⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference |
+&l·=z # handle the pre-context for · when fetching reset CEs |
+ <<u # copy/tailor 2 CEs |
+ |
+* compare |
+<1 ae |
+<2 æ |
+<1 x |
+<1 af |
+ |
+* compare |
+<1 aeb |
+<2 æb |
+= bæ |
+ |
+* compare |
+<1 각 |
+<1 h |
+<1 갂 |
+<1 갃 |
+ |
+* compare |
+<1 · # by itself: primary CE |
+<1 l |
+<2 l· # l+middle dot has only a secondary difference from l |
+= z |
+<2 u |
+ |
+* compare |
+<1 (13) |
+<3 ⒀ # DUCET sets special tertiary weights in all CEs |
+<2 y |
+<1 (13[ |
+ |
+% alternate=shifted |
+* compare |
+<1 (13) |
+= 13 |
+<3 ⒀ |
+= y # alternate=shifted removes the tailoring difference on the last CE |
+<1 14 |
+ |
+** test: contraction inside extension, ICU ticket 9378 |
+@ rules |
+&а<<х/й # all letters are Cyrillic |
+* compare |
+<1 ай |
+<2 х |
+ |
+** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 |
+@ rules |
+&t<x &ᵀ<y # same primary weights |
+&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent |
+* compare |
+<1 q |
+<1 u |
+<1 v |
+<1 ꝗ |
+<1 t |
+<3 ᵀ |
+<1 y |
+<1 x |
+ |
+# Principle: Each rule builds on the state of preceding rules and ignores following rules. |
+ |
+** test: later rule does not affect earlier reset position, ICU ticket 10105 |
+@ rules |
+&a < u < v < w &ov < x &b < v |
+* compare |
+<1 oa |
+<1 ou |
+<1 x # CE(o) followed by CE between u and w |
+<1 ow |
+<1 ob |
+<1 ov |
+ |
+** test: later rule does not affect earlier extension (1), ICU ticket 10105 |
+@ rules |
+&a=x/b &v=b |
+% strength=secondary |
+* compare |
+<1 B |
+<1 c |
+<1 v |
+= b |
+* compare |
+<1 AB |
+= x |
+<1 ac |
+<1 av |
+= ab |
+ |
+** test: later rule does not affect earlier extension (2), ICU ticket 10105 |
+@ rules |
+&a <<< c / e &g <<< e / l |
+% strength=secondary |
+* compare |
+<1 AE |
+= c |
+<2 æ |
+<1 agl |
+= ae |
+ |
+** test: later rule does not affect earlier extension (3), ICU ticket 10105 |
+@ rules |
+&a = b / c &d = c / e |
+% strength=secondary |
+* compare |
+<1 AC # C is still only tertiary different from the original c |
+= b |
+<1 ade |
+= ac |
+ |
+** test: extension contains tailored character, ICU ticket 10105 |
+@ rules |
+&a=e &b=u/e |
+* compare |
+<1 a |
+= e |
+<1 ba |
+= be |
+= u |
+ |
+** test: add simple mappings for characters with root context |
+@ rules |
+&z=· # middle dot has a prefix mapping in the CLDR root |
+&n=и # и (U+0438) has contractions in the root |
+* compare |
+<1 l |
+<2 l· # root mapping for l|· still works |
+<1 z |
+= · |
+* compare |
+<1 n |
+= и |
+<1 И |
+<1 и\u0306 # root mapping for й=и\u0306 still works |
+= й |
+<3 Й |
+ |
+** test: add context mappings around characters with root context |
+@ rules |
+&z=·h # middle dot has a prefix mapping in the CLDR root |
+&n=ә|и # и (U+0438) has contractions in the root |
+* compare |
+<1 l |
+<2 l· # root mapping for l|· still works |
+<1 z |
+= ·h |
+* compare |
+<1 и |
+<3 И |
+<1 и\u0306 # root mapping for й=и\u0306 still works |
+= й |
+* compare |
+<1 әn |
+= әи |
+<1 әo |
+ |
+** test: many secondary CEs at the top of their range |
+@ rules |
+&[last primary ignorable]<<*\u2801-\u28ff |
+* compare |
+<2 \u0308 |
+<2 \u2801 |
+<2 \u2802 |
+<2 \u2803 |
+<2 \u2804 |
+<2 \u28fd |
+<2 \u28fe |
+<2 \u28ff |
+<1 \x20 |
+ |
+** test: many tertiary CEs at the top of their range |
+@ rules |
+&[last secondary ignorable]<<<*a-z |
+* compare |
+<3 a |
+<3 b |
+<3 c |
+<3 d |
+# e..w |
+<3 x |
+<3 y |
+<3 z |
+<2 \u0308 |
+ |
+** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 |
+@ rules |
+&a=p|x &b=px &c=op |
+* compare |
+<1 b |
+= px |
+<3 B |
+<1 c |
+= op |
+<3 C |
+* compare |
+<1 ca |
+= opx # first contraction op, then prefix p|x |
+<3 cA |
+<3 Ca |
+ |
+** test: reset position with prefix (pre-context), ICU ticket 10102 |
+@ rules |
+&a=p|x &px=y |
+* compare |
+<1 pa |
+= px |
+= y |
+<3 pA |
+<1 q |
+<1 x |
+ |
+** test: prefix+contraction together (1), ICU ticket 10071 |
+@ rules |
+&x=a|bc |
+* compare |
+<1 ab |
+<1 Abc |
+<1 abd |
+<1 ac |
+<1 aw |
+<1 ax |
+= abc |
+<3 aX |
+<3 Ax |
+<1 b |
+<1 bb |
+<1 bc |
+<3 bC |
+<3 Bc |
+<1 bd |
+ |
+** test: prefix+contraction together (2), ICU ticket 10071 |
+@ rules |
+&w=bc &x=a|b |
+* compare |
+<1 w |
+= bc |
+<3 W |
+* compare |
+<1 aw |
+<1 ax |
+= ab |
+<3 aX |
+<1 axb |
+<1 axc |
+= abc # prefix match a|b takes precedence over contraction match bc |
+<3 abC |
+<1 abd |
+<1 ay |
+ |
+** test: prefix+contraction together (3), ICU ticket 10071 |
+@ rules |
+&x=a|b &w=bc # same rules as the previous test but in reverse order; the order should not matter here |
+* compare # same "compare" sequences as previous test |
+<1 w |
+= bc |
+<3 W |
+* compare |
+<1 aw |
+<1 ax |
+= ab |
+<3 aX |
+<1 axb |
+<1 axc |
+= abc # prefix match a|b takes precedence over contraction match bc |
+<3 abC |
+<1 abd |
+<1 ay |
+ |
+** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 |
+@ rules |
+&d=ch &v=p|ci |
+* compare |
+<1 pc |
+<3 pC |
+<1 pcH |
+<1 pcI |
+<1 pd |
+= pch # no-prefix contraction ch matches |
+<3 pD |
+<1 pv |
+= pci # prefix+contraction p|ci matches |
+<3 pV |
+ |
+** test: tailor in & around compact ranges of root primaries |
+# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs |
+# which should be reliably encoded as one range in the root elements data. |
+@ rules |
+&[before 1]ᚁ<a |
+&ᚁ<b |
+&[before 1]ᚂ<c |
+&ᚂ<d |
+&[before 1]ᚚ<y |
+&ᚚ<z |
+&[before 2]ᚁ<<r |
+&ᚁ<<s |
+&[before 3]ᚚ<<<t |
+&ᚚ<<<u |
+* compare |
+<1 ᣵ # U+18F5 last Canadian Aboriginal |
+<1 a |
+<1 r |
+<2 ᚁ |
+<2 s |
+<1 b |
+<1 c |
+<1 ᚂ |
+<1 d |
+<1 ᚃ |
+<1 ᚙ |
+<1 y |
+<1 t |
+<3 ᚚ |
+<3 u |
+<1 z |
+<1 ᚠ # U+16A0 first Runic |
+ |
+** test: suppressContractions |
+@ rules |
+&z<ch<әж [suppressContractions [·cә]] |
+* compare |
+<1 ch |
+<3 cH # ch was suppressed |
+<1 l |
+<1 l· # primary difference, not secondary, because l|· was suppressed |
+<1 ә |
+<2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed |
+<1 әж |
+<3 әЖ |
+ |
+** test: Hangul & Jamo |
+@ rules |
+&L=\u1100 # first Jamo L |
+&V=\u1161 # first Jamo V |
+&T=\u11A8 # first Jamo T |
+&\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs |
+* compare |
+<1 Lv |
+<3 LV |
+= \u1100\u1161 |
+= \uAC00 |
+<1 LVt |
+<3 LVT |
+= \u1100\u1161\u11A8 |
+= \uAC00\u11A8 |
+= \uAC01 |
+<2 LVT\u0308 |
+<2 \u4E00 |
+<2 \u4E01 |
+<2 \u4E80 |
+<2 \u4EFF |
+<2 LV\u0308T |
+<1 \uAC02 |
+ |
+** test: adjust special reset positions according to previous rules, CLDR ticket 6070 |
+@ rules |
+&[last variable]<x |
+[maxVariable space] # has effect only after building, no effect on following rules |
+&[last variable]<y |
+&[before 1][first regular]<z |
+* compare |
+<1 ? # some punctuation |
+<1 x |
+<1 y |
+<1 z |
+<1 $ # some symbol |
+ |
+@ rules |
+&[last primary ignorable]<<x<<<y |
+&[last primary ignorable]<<z |
+* compare |
+<2 \u0358 |
+<2 x |
+<3 y |
+<2 z |
+<1 \x20 |
+ |
+@ rules |
+&[last secondary ignorable]<<<x |
+&[last secondary ignorable]<<<y |
+* compare |
+<3 x |
+<3 y |
+<2 \u0358 |
+ |
+@ rules |
+&[before 2][first variable]<<z |
+&[before 2][first variable]<<y |
+&[before 3][first variable]<<<x |
+&[before 3][first variable]<<<w |
+&[before 1][first variable]<v |
+&[before 2][first variable]<<u |
+&[before 3][first variable]<<<t |
+&[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary |
+* compare |
+<2 \u0358 |
+<1 s |
+<2 \uFDD1\xA0 |
+<1 t |
+<3 u |
+<2 v |
+<1 w |
+<3 x |
+<3 y |
+<2 z |
+<2 \t |
+ |
+@ rules |
+&[before 2][first regular]<<z |
+&[before 3][first regular]<<<y |
+&[before 1][first regular]<x |
+&[before 3][first regular]<<<w |
+&[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary |
+&[before 3][first regular]<<<u |
+&[before 1][first regular]<p # primary before the boundary: becomes variable |
+&[before 3][first regular]<<<t # not affected by p |
+&[last variable]<q # after p! |
+* compare |
+<1 ? |
+<1 p |
+<1 q |
+<1 t |
+<3 u |
+<3 v |
+<1 w |
+<3 x |
+<1 y |
+<3 z |
+<1 $ |
+ |
+# check that p & q are indeed variable |
+% alternate=shifted |
+* compare |
+= ? |
+= p |
+= q |
+<1 t |
+<3 u |
+<3 v |
+<1 w |
+<3 x |
+<1 y |
+<3 z |
+<1 $ |
+ |
+@ rules |
+&[before 2][first trailing]<<z |
+&[before 1][first trailing]<y |
+&[before 3][first trailing]<<<x |
+* compare |
+<1 \u4E00 # first Han, first implicit |
+<1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary |
+# Note: The root collator currently does not map any characters to the trailing first boundary primary. |
+<1 x |
+<3 y |
+<1 z |
+<2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. |
+ |
+@ rules |
+&[before 2][first primary ignorable]<<z |
+&[before 2][first primary ignorable]<<y |
+&[before 3][first primary ignorable]<<<x |
+&[before 3][first primary ignorable]<<<w |
+* compare |
+= \x01 |
+<2 w |
+<3 x |
+<3 y |
+<2 z |
+<2 \u0301 |
+ |
+@ rules |
+&[before 3][first secondary ignorable]<<<y |
+&[before 3][first secondary ignorable]<<<x |
+* compare |
+= \x01 |
+<3 x |
+<3 y |
+<2 \u0301 |
+ |
+** test: canonical closure |
+@ rules |
+&X=A &U=Â |
+* compare |
+<1 U |
+= Â |
+= A\u0302 |
+<2 Ú # U with acute |
+= U\u0301 |
+= Ấ # A with circumflex & acute |
+= Â\u0301 |
+= A\u0302\u0301 |
+<1 X |
+= A |
+<2 X\u030A # with ring above |
+= Å |
+= A\u030A |
+= \u212B # Angstrom sign |
+ |
+@ rules |
+&x=\u5140\u55C0 |
+* compare |
+<1 x |
+= \u5140\u55C0 |
+= \u5140\uFA0D |
+= \uFA0C\u55C0 |
+= \uFA0C\uFA0D # CJK compatibility characters |
+<3 X |
+ |
+# canonical closure on prefix rules, ICU ticket 9444 |
+@ rules |
+&x=ä|ŝ |
+* compare |
+<1 äs # not tailored |
+<1 äx |
+= äŝ |
+= a\u0308s\u0302 |
+= a\u0308ŝ |
+= äs\u0302 |
+<3 äX |
+ |
+** test: conjoining Jamo map to expansions |
+@ rules |
+&gg=\u1101 # Jamo Lead consonant GG |
+&nj=\u11AC # Jamo Trail consonant NJ |
+* compare |
+<1 gg\u1161nj |
+= \u1101\u1161\u11AC |
+= \uAE4C\u11AC |
+= \uAE51 |
+<3 gg\u1161nJ |
+<1 \u1100\u1100 |
+ |
+** test: canonical tail closure, ICU ticket 5913 |
+@ rules |
+&a<â |
+* compare |
+<1 a |
+<1 â # tailored |
+= a\u0302 |
+<2 a\u0323\u0302 # discontiguous contraction |
+= ạ\u0302 # equivalent |
+= ậ # equivalent |
+<1 b |
+ |
+@ rules |
+&a<ạ |
+* compare |
+<1 a |
+<1 ạ # tailored |
+= a\u0323 |
+<2 a\u0323\u0302 # contiguous contraction plus extra diacritic |
+= ạ\u0302 # equivalent |
+= ậ # equivalent |
+<1 b |
+ |
+# Tail closure should work even if there is a prefix and/or contraction. |
+@ rules |
+&a<\u5140|câ |
+# In order to find discontiguous contractions for \u5140|câ |
+# there must exist a mapping for \u5140|ca, regardless of what it maps to. |
+# (This follows from the UCA spec.) |
+&x=\u5140|ca |
+* compare |
+<1 \u5140a |
+= \uFA0Ca |
+<1 \u5140câ # tailored |
+= \uFA0Ccâ |
+= \u5140ca\u0302 |
+= \uFA0Cca\u0302 |
+<2 \u5140ca\u0323\u0302 # discontiguous contraction |
+= \uFA0Cca\u0323\u0302 |
+= \u5140cạ\u0302 |
+= \uFA0Ccạ\u0302 |
+= \u5140cậ |
+= \uFA0Ccậ |
+<1 \u5140b |
+= \uFA0Cb |
+<1 \u5140x |
+= \u5140ca |
+ |
+# Double-check that without the extra mapping there will be no discontiguous match. |
+@ rules |
+&a<\u5140|câ |
+* compare |
+<1 \u5140a |
+= \uFA0Ca |
+<1 \u5140câ # tailored |
+= \uFA0Ccâ |
+= \u5140ca\u0302 |
+= \uFA0Cca\u0302 |
+<1 \u5140b |
+= \uFA0Cb |
+<1 \u5140ca\u0323\u0302 # no discontiguous contraction |
+= \uFA0Cca\u0323\u0302 |
+= \u5140cạ\u0302 |
+= \uFA0Ccạ\u0302 |
+= \u5140cậ |
+= \uFA0Ccậ |
+ |
+@ rules |
+&a<cạ |
+* compare |
+<1 a |
+<1 cạ # tailored |
+= ca\u0323 |
+<2 ca\u0323\u0302 # contiguous contraction plus extra diacritic |
+= cạ\u0302 # equivalent |
+= cậ # equivalent |
+<1 b |
+ |
+# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI |
+# = 03C9 0313 0300 0345 |
+# ccc = 0, 230, 230, 240 |
+@ rules |
+&δ=αῳ |
+# In order to find discontiguous contractions for αῳ |
+# there must exist a mapping for αω, regardless of what it maps to. |
+# (This follows from the UCA spec.) |
+&ε=αω |
+* compare |
+<1 δ |
+= αῳ |
+= αω\u0345 |
+<2 αω\u0313\u0300\u0345 # discontiguous contraction |
+= αὠ\u0300\u0345 |
+= αὢ\u0345 |
+= αᾢ |
+<2 αω\u0300\u0313\u0345 |
+= αὼ\u0313\u0345 |
+= αῲ\u0313 # not FCD |
+<1 ε |
+= αω |
+ |
+# Double-check that without the extra mapping there will be no discontiguous match. |
+@ rules |
+&δ=αῳ |
+* compare |
+<1 αω\u0313\u0300\u0345 # no discontiguous contraction |
+= αὠ\u0300\u0345 |
+= αὢ\u0345 |
+= αᾢ |
+<2 αω\u0300\u0313\u0345 |
+= αὼ\u0313\u0345 |
+= αῲ\u0313 # not FCD |
+<1 δ |
+= αῳ |
+= αω\u0345 |
+ |
+# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. |
+# Tests code paths where the tailored string has a combining mark |
+# that does not occur in any composite's decomposition. |
+@ rules |
+&δ=αὼ\u0315 |
+* compare |
+<1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. |
+= αὠ\u0300\u0315 |
+= αὢ\u0315 |
+<1 δ |
+= αὼ\u0315 |
+= αω\u0300\u0315 |
+<2 αω\u0300\u0315\u0345 |
+= αὼ\u0315\u0345 |
+= αῲ\u0315 # not FCD |
+ |
+** test: danish a+a vs. a-umlaut, ICU ticket 9319 |
+@ rules |
+&z<aa |
+* compare |
+<1 z |
+<1 aa |
+<2 aa\u0308 |
+= aä |
+ |
+** test: Jamo L with and in prefix |
+# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). |
+@ rules |
+# Jamo Lead consonant G after G or GG |
+&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 |
+# Jamo Lead consonant GG sorts like G+G |
+&\u1100\u1100=\u1101 |
+# Note: Making G|GG and GG|GG sort the same as G|G+G |
+# would require the ability to reset on G|G+G, |
+# or we could make G-after-G equal to some secondary-CE character, |
+# and reset on a pair of those. |
+# (It does not matter much if there are at most two G in a row in real text.) |
+* compare |
+<1 \u1100 |
+<2 \u1100\u1100 # only one primary from a sequence of G lead consonants |
+= \u1101 |
+<2 \u1100\u1100\u1100 |
+= \u1101\u1100 |
+# but not = \u1100\u1101, see above |
+<1 \u1100\u1161 |
+= \uAC00 |
+<2 \u1100\u1100\u1161 |
+= \u1100\uAC00 # prefix match from the L of the LV syllable |
+= \u1101\u1161 |
+= \uAE4C |
+ |
+** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 |
+@ rules |
+# Low secondary CEs for Jamo V & T. |
+# Note: T should sort before V for proper syllable order. |
+&\u0332 # COMBINING LOW LINE (first primary ignorable) |
+<<\u1161<<\u1162 |
+ |
+# Korean Jamo lead consonant search rules, part 2: |
+# Make modern compound L jamo primary equivalent to non-compound forms. |
+ |
+# Secondary CEs for Jamo L-after-L, greater than Jamo V & T. |
+&\u0313 # COMBINING COMMA ABOVE (second primary ignorable) |
+=\u1100|\u1100 |
+=\u1103|\u1103 |
+=\u1107|\u1107 |
+=\u1109|\u1109 |
+=\u110C|\u110C |
+ |
+# Compound L Jamo map to equivalent expansions of primary+secondary CE. |
+&\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK |
+&\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT |
+&\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP |
+&\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS |
+&\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC |
+ |
+* compare |
+<1 \u1100\u1161 |
+= \uAC00 |
+<2 \u1100\u1162 |
+= \uAC1C |
+<2 \u1100\u1100\u1161 |
+= \u1100\uAC00 |
+= \u1101\u1161 |
+= \uAE4C |
+<3 \u3132\u1161 |
+ |
+** test: Hangul syllables in prefix & in the interior of a contraction |
+@ rules |
+&x=\u1100\u1161|a\u1102\u1162z |
+* compare |
+<1 \u1100\u1161x |
+= \u1100\u1161a\u1102\u1162z |
+= \u1100\u1161a\uB0B4z |
+= \uAC00a\u1102\u1162z |
+= \uAC00a\uB0B4z |
+ |
+** test: digits are unsafe-backwards when numeric=on |
+@ root |
+% numeric=on |
+* compare |
+# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". |
+# We need to back up before the identical prefix "1" and compare the full numbers. |
+<1 11b |
+<1 101a |
+ |
+** test: simple locale data test |
+@ locale de |
+* compare |
+<1 a |
+<2 ä |
+<1 ae |
+<2 æ |
+ |
+@ locale de-u-co-phonebk |
+* compare |
+<1 a |
+<1 ae |
+<2 ä |
+<2 æ |
+ |
+# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. |
+ |
+** test: DataDrivenCollationTest/TestMorePinyin |
+# Testing the primary strength. |
+@ locale zh |
+% strength=primary |
+* compare |
+< lā |
+= lĀ |
+= Lā |
+= LĀ |
+< lān |
+= lĀn |
+< lē |
+= lĒ |
+= Lē |
+= LĒ |
+< lēn |
+= lĒn |
+ |
+** test: DataDrivenCollationTest/TestLithuanian |
+# Lithuanian sort order. |
+@ locale lt |
+* compare |
+< cz |
+< č |
+< d |
+< iz |
+< j |
+< sz |
+< š |
+< t |
+< zz |
+< ž |
+ |
+** test: DataDrivenCollationTest/TestLatvian |
+# Latvian sort order. |
+@ locale lv |
+* compare |
+< cz |
+< č |
+< d |
+< gz |
+< ģ |
+< h |
+< iz |
+< j |
+< kz |
+< ķ |
+< l |
+< lz |
+< ļ |
+< m |
+< nz |
+< ņ |
+< o |
+< rz |
+< ŗ |
+< s |
+< sz |
+< š |
+< t |
+< zz |
+< ž |
+ |
+** test: DataDrivenCollationTest/TestEstonian |
+# Estonian sort order. |
+@ locale et |
+* compare |
+< sy |
+< š |
+< šy |
+< z |
+< zy |
+< ž |
+< v |
+< va |
+< w |
+< õ |
+< õy |
+< ä |
+< äy |
+< ö |
+< öy |
+< ü |
+< üy |
+< x |
+ |
+** test: DataDrivenCollationTest/TestAlbanian |
+# Albanian sort order. |
+@ locale sq |
+* compare |
+< cz |
+< ç |
+< d |
+< dz |
+< dh |
+< e |
+< ez |
+< ë |
+< f |
+< gz |
+< gj |
+< h |
+< lz |
+< ll |
+< m |
+< nz |
+< nj |
+< o |
+< rz |
+< rr |
+< s |
+< sz |
+< sh |
+< t |
+< tz |
+< th |
+< u |
+< xz |
+< xh |
+< y |
+< zz |
+< zh |
+ |
+** test: DataDrivenCollationTest/TestSimplifiedChineseOrder |
+# Sorted file has different order. |
+@ root |
+# normalization=on turned on & off automatically. |
+* compare |
+< \u5F20 |
+< \u5F20\u4E00\u8E3F |
+ |
+** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash |
+# This pretty much crashes. |
+@ root |
+* compare |
+< \u0f71\u0f72\u0f80\u0f71\u0f72 |
+< \u0f80 |
+ |
+** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems |
+# These are examples of strings that caused trouble in partial sort key testing. |
+@ locale th-TH |
+* compare |
+< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C |
+< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 |
+* compare |
+< \u0E01\u0E07\u0E01\u0E32\u0E23 |
+< \u0E01\u0E07\u0E42\u0E01\u0E49 |
+* compare |
+< \u0E01\u0E23\u0E19\u0E17\u0E32 |
+< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 |
+* compare |
+< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 |
+< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 |
+* compare |
+< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D |
+< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 |
+ |
+** test: DataDrivenCollationTest/TestJavaStyleRule |
+# java.text allows rules to start as '<<<x<<<y...' |
+# we emulate this by assuming a &[first tertiary ignorable] in this case. |
+@ rules |
+&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b |
+* compare |
+= a |
+= equal |
+< z |
+< x |
+= b # x had become the new first primary ignorable |
+< w |
+ |
+** test: DataDrivenCollationTest/TestShiftedIgnorable |
+# The UCA states that primary ignorables should be completely |
+# ignorable when following a shifted code point. |
+@ root |
+% alternate=shifted |
+% strength=quaternary |
+* compare |
+< a\u0020b |
+= a\u0020\u0300b |
+= a\u0020\u0301b |
+< a_b |
+= a_\u0300b |
+= a_\u0301b |
+< A\u0020b |
+= A\u0020\u0300b |
+= A\u0020\u0301b |
+< A_b |
+= A_\u0300b |
+= A_\u0301b |
+< a\u0301b |
+< A\u0301b |
+< a\u0300b |
+< A\u0300b |
+ |
+** test: DataDrivenCollationTest/TestNShiftedIgnorable |
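+# The UCA states that primary ignorables should be completely ignorable when following a shifted code point. |
+# Here alternate=non-ignorable, so nothing is shifted and accents after space or underscore must still make a difference. |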
+@ root |
+% alternate=non-ignorable |
+% strength=tertiary |
+* compare |
+< a\u0020b |
+< A\u0020b |
+< a\u0020\u0301b |
+< A\u0020\u0301b |
+< a\u0020\u0300b |
+< A\u0020\u0300b |
+< a_b |
+< A_b |
+< a_\u0301b |
+< A_\u0301b |
+< a_\u0300b |
+< A_\u0300b |
+< a\u0301b |
+< A\u0301b |
+< a\u0300b |
+< A\u0300b |
+ |
+** test: DataDrivenCollationTest/TestSafeSurrogates |
+# It turned out that surrogates were not skipped properly |
+# when iterating backwards if they were in the middle of a |
+# contraction. This test ensures that this is fixed. |
+@ rules |
+&a < x\ud800\udc00b |
+* compare |
+< a |
+< x\ud800\udc00b |
+ |
+** test: DataDrivenCollationTest/da_TestPrimary |
+# This test goes through primary strength cases |
+@ locale da |
+% strength=primary |
+* compare |
+< Lvi |
+< Lwi |
+* compare |
+< L\u00e4vi |
+< L\u00f6wi |
+* compare |
+< L\u00fcbeck |
+= Lybeck |
+ |
+** test: DataDrivenCollationTest/da_TestTertiary |
+# This test goes through tertiary strength cases |
+@ locale da |
+% strength=tertiary |
+* compare |
+< Luc |
+< luck |
+* compare |
+< luck |
+< L\u00fcbeck |
+* compare |
+< lybeck |
+< L\u00fcbeck |
+* compare |
+< L\u00e4vi |
+< L\u00f6we |
+* compare |
+< L\u00f6ww |
+< mast |
+ |
+* compare |
+< A/S |
+< ANDRE |
+< ANDR\u00c9 |
+< ANDREAS |
+< AS |
+< CA |
+< \u00c7A |
+< CB |
+< \u00c7C |
+< D.S.B. |
+< DA |
+< \u00d0A |
+< DB |
+< \u00d0C |
+< DSB |
+< DSC |
+< EKSTRA_ARBEJDE |
+< EKSTRABUD0 |
+< H\u00d8ST |
+< HAAG |
+< H\u00c5NDBOG |
+< HAANDV\u00c6RKSBANKEN |
+< Karl |
+< karl |
+< NIELS\u0020J\u00d8RGEN |
+< NIELS-J\u00d8RGEN |
+< NIELSEN |
+< R\u00c9E,\u0020A |
+< REE,\u0020B |
+< R\u00c9E,\u0020L |
+< REE,\u0020V |
+< SCHYTT,\u0020B |
+< SCHYTT,\u0020H |
+< SCH\u00dcTT,\u0020H |
+< SCHYTT,\u0020L |
+< SCH\u00dcTT,\u0020M |
+< SS |
+< \u00df |
+< SSA |
+< STORE\u0020VILDMOSE |
+< STOREK\u00c6R0 |
+< STORM\u0020PETERSEN |
+< STORMLY |
+< THORVALD |
+< THORVARDUR |
+< \u00feORVAR\u00d0UR |
+< THYGESEN |
+< VESTERG\u00c5RD,\u0020A |
+< VESTERGAARD,\u0020A |
+< VESTERG\u00c5RD,\u0020B |
+< \u00c6BLE |
+< \u00c4BLE |
+< \u00d8BERG |
+< \u00d6BERG |
+ |
+* compare |
+< andere |
+< chaque |
+< chemin |
+< cote |
+< cot\u00e9 |
+< c\u00f4te |
+< c\u00f4t\u00e9 |
+< \u010du\u010d\u0113t |
+< Czech |
+< hi\u0161a |
+< irdisch |
+< lie |
+< lire |
+< llama |
+< l\u00f5ug |
+< l\u00f2za |
+< lu\u010d |
+< luck |
+< L\u00fcbeck |
+< lye |
+< l\u00e4vi |
+< L\u00f6wen |
+< m\u00e0\u0161ta |
+< m\u00eer |
+< myndig |
+< M\u00e4nner |
+< m\u00f6chten |
+< pi\u00f1a |
+< pint |
+< pylon |
+< \u0161\u00e0ran |
+< savoir |
+< \u0160erb\u016bra |
+< Sietla |
+< \u015blub |
+< subtle |
+< symbol |
+< s\u00e4mtlich |
+< verkehrt |
+< vox |
+< v\u00e4ga |
+< waffle |
+< wood |
+< yen |
+< yuan |
+< yucca |
+< \u017eal |
+< \u017eena |
+< \u017den\u0113va |
+< zoo0 |
+< Zviedrija |
+< Z\u00fcrich |
+< zysk0 |
+< \u00e4ndere |
+ |
+** test: DataDrivenCollationTest/hi_TestNewRules |
+# This test goes through new rules and tests against old rules |
+@ locale hi |
+* compare |
+< कॐ |
+< कं |
+< कँ |
+< कः |
+ |
+** test: DataDrivenCollationTest/ro_TestNewRules |
+# This test goes through new rules and tests against old rules |
+@ locale ro |
+* compare |
+< xAx |
+< xă |
+< xĂ |
+< Xă |
+< XĂ |
+< xăx |
+< xĂx |
+< xâ |
+< xÂ |
+< Xâ |
+< XÂ |
+< xâx |
+< xÂx |
+< xb |
+< xIx |
+< xî |
+< xÎ |
+< Xî |
+< XÎ |
+< xîx |
+< xÎx |
+< xj |
+< xSx |
+< xș |
+= xş |
+< xȘ |
+= xŞ |
+< Xș |
+= Xş |
+< XȘ |
+= XŞ |
+< xșx |
+= xşx |
+< xȘx |
+= xŞx |
+< xT |
+< xTx |
+< xț |
+= xţ |
+< xȚ |
+= xŢ |
+< Xț |
+= Xţ |
+< XȚ |
+= XŢ |
+< xțx |
+= xţx |
+< xȚx |
+= xŢx |
+< xU |
+ |
+** test: DataDrivenCollationTest/testOffsets |
+# This tests cases where forwards and backwards iteration get different offsets |
+@ locale en |
+% strength=tertiary |
+* compare |
+< a\uD800\uDC00\uDC00 |
+< b\uD800\uDC00\uDC00 |
+* compare |
+< \u0301A\u0301\u0301 |
+< \u0301B\u0301\u0301 |
+* compare |
+< abcd\r\u0301 |
+< abce\r\u0301 |
+# TODO: test offsets in new CollationTest |
+ |
+# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. |
+ |
+** test: was ICU 52 cmsccoll/TestRedundantRules |
+@ rules |
+& a < b < c < d& [before 1] c < m |
+* compare |
+<1 a |
+<1 b |
+<1 m |
+<1 c |
+<1 d |
+ |
+@ rules |
+& a < b <<< c << d <<< e& [before 3] e <<< x |
+* compare |
+<1 a |
+<1 b |
+<3 c |
+<2 d |
+<3 x |
+<3 e |
+ |
+@ rules |
+& a < b <<< c << d <<< e <<< f < g& [before 1] g < x |
+* compare |
+<1 a |
+<1 b |
+<3 c |
+<2 d |
+<3 e |
+<3 f |
+<1 x |
+<1 g |
+ |
+@ rules |
+& a <<< b << c < d& a < m |
+* compare |
+<1 a |
+<3 b |
+<2 c |
+<1 m |
+<1 d |
+ |
+@ rules |
+&a<b<<b\u0301 &z<b |
+* compare |
+<1 a |
+<1 b\u0301 |
+<1 z |
+<1 b |
+ |
+@ rules |
+&z<m<<<q<<<m |
+* compare |
+<1 z |
+<1 q |
+<3 m |
+ |
+@ rules |
+&z<<<m<q<<<m |
+* compare |
+<1 z |
+<1 q |
+<3 m |
+ |
+@ rules |
+& a < b < c < d& r < c |
+* compare |
+<1 a |
+<1 b |
+<1 d |
+<1 r |
+<1 c |
+ |
+@ rules |
+& a < b < c < d& c < m |
+* compare |
+<1 a |
+<1 b |
+<1 c |
+<1 m |
+<1 d |
+ |
+@ rules |
+& a < b < c < d& a < m |
+* compare |
+<1 a |
+<1 m |
+<1 b |
+<1 c |
+<1 d |
+ |
+** test: was ICU 52 cmsccoll/TestExpansionSyntax |
+# The following two rules should sort the particular list of strings the same. |
+@ rules |
+&AE <<< a << b <<< c &d <<< f |
+* compare |
+<1 AE |
+<3 a |
+<2 b |
+<3 c |
+<1 d |
+<3 f |
+ |
+@ rules |
+&A <<< a / E << b / E <<< c /E &d <<< f |
+* compare |
+<1 AE |
+<3 a |
+<2 b |
+<3 c |
+<1 d |
+<3 f |
+ |
+# The following two rules should sort the particular list of strings the same. |
+@ rules |
+&AE <<< a <<< b << c << d < e < f <<< g |
+* compare |
+<1 AE |
+<3 a |
+<3 b |
+<2 c |
+<2 d |
+<1 e |
+<1 f |
+<3 g |
+ |
+@ rules |
+&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g |
+* compare |
+<1 AE |
+<3 a |
+<3 b |
+<2 c |
+<2 d |
+<1 e |
+<1 f |
+<3 g |
+ |
+# The following two rules should sort the particular list of strings the same. |
+@ rules |
+&AE <<< B <<< C / D <<< F |
+* compare |
+<1 AE |
+<3 B |
+<3 F |
+<1 AED |
+<3 C |
+ |
+@ rules |
+&A <<< B / E <<< C / ED <<< F / E |
+* compare |
+<1 AE |
+<3 B |
+<3 F |
+<1 AED |
+<3 C |
+ |
+** test: never reorder trailing primaries |
+@ root |
+% reorder Zzzz Grek |
+* compare |
+<1 L |
+<1 字 |
+<1 Ω |
+<1 \uFFFD |
+<1 \uFFFF |
+ |
+** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes |
+@ rules |
+&u=ab|cd |
+&v=b|ce |
+* compare |
+<1 abc |
+<1 abcc |
+<1 abcf |
+<1 abcd |
+= abu |
+<1 abce |
+= abv |
+ |
+# With the following rules, there is only one prefix per composite ĉ or ç, |
+# but both prefixes apply to just c in NFD form. |
+# We would get different results for composed vs. NFD input |
+# if we fell back directly from longest-prefix mappings to no-prefix mappings. |
+@ rules |
+&x=op|ĉ |
+&y=p|ç |
+* compare |
+<1 opc |
+<2 opć |
+<1 opcz |
+<1 opd |
+<1 opĉ |
+= opc\u0302 |
+= opx |
+<1 opç |
+= opc\u0327 |
+= opy |
+ |
+# The mapping is used which has the longest matching prefix for which |
+# there is also a suffix match, with the longest suffix match among several for that prefix. |
+@ rules |
+&❶=d |
+&❷=de |
+&❸=def |
+&①=c|d |
+&②=c|de |
+&③=c|def |
+&④=bc|d |
+&⑤=bc|de |
+&⑥=bc|def |
+&⑦=abc|d |
+&⑧=abc|de |
+&⑨=abc|def |
+* compare |
+<1 9aadzz |
+= 9aa❶zz |
+<1 9aadez |
+= 9aa❷z |
+<1 9aadef |
+= 9aa❸ |
+<1 9acdzz |
+= 9ac①zz |
+<1 9acdez |
+= 9ac②z |
+<1 9acdef |
+= 9ac③ |
+<1 9bcdzz |
+= 9bc④zz |
+<1 9bcdez |
+= 9bc⑤z |
+<1 9bcdef |
+= 9bc⑥ |
+<1 abcdzz |
+= abc⑦zz |
+<1 abcdez |
+= abc⑧z |
+<1 abcdef |
+= abc⑨ |
+ |
+** test: prefix + discontiguous contraction with missing prefix contraction |
+# Unfortunate terminology: The first "prefix" here is the pre-context, |
+# the second "prefix" refers to the contraction/relation string that is |
+# one shorter than the one being tested. |
+@ rules |
+&x=p|e |
+&y=p|ê |
+&z=op|ê |
+# No mapping for op|e: |
+# Discontiguous contraction matching should not match op|ê in opệ |
+# because it would have to skip the dot below and extend a match on op|e by the circumflex, |
+# but there is no match on op|e. |
+* compare |
+<1 oPe |
+<1 ope |
+= opx |
+<1 opệ |
+= opy\u0323 # y not z |
+<1 opê |
+= opz |
+ |
+# We cannot test for fallback by whether the contraction default CE32 |
+# is for another contraction. With the following rules, there is no mapping for op|e, |
+# and the fallback to prefix p has no contractions. |
+@ rules |
+&x=p|e |
+&z=op|ê |
+* compare |
+<1 oPe |
+<1 ope |
+= opx |
+<2 opệ |
+= opx\u0323\u0302 # x not z |
+<1 opê |
+= opz |
+ |
+# One more variation: Fallback to the simple code point, no shorter non-empty prefix. |
+@ rules |
+&x=e |
+&z=op|ê |
+* compare |
+<1 ope |
+= opx |
+<3 oPe |
+= oPx |
+<2 opệ |
+= opx\u0323\u0302 # x not z |
+<1 opê |
+= opz |
+ |
+** test: maxVariable via rules |
+@ rules |
+[maxVariable space][alternate shifted] |
+* compare |
+= \u0020 |
+= \u000A |
+<1 . |
+<1 ° # degree sign |
+<1 $ |
+<1 0 |
+ |
+** test: maxVariable via setting |
+@ root |
+% maxVariable=currency |
+% alternate=shifted |
+* compare |
+= \u0020 |
+= \u000A |
+= . |
+= ° # degree sign |
+= $ |
+<1 0 |
+ |
+** test: ICU4J CollationMiscTest/TestContractionClosure (ää) |
+# This tests canonical closure, but it also tests that CollationFastLatin |
+# bails out properly for contractions with combining marks. |
+# For that we need pairs of strings that remain in the Latin fastpath |
+# long enough, hence the extra "= b" lines. |
+@ rules |
+&b=\u00e4\u00e4 |
+* compare |
+<1 b |
+= \u00e4\u00e4 |
+= b |
+= a\u0308a\u0308 |
+= b |
+= \u00e4a\u0308 |
+= b |
+= a\u0308\u00e4 |
+ |
+** test: ICU4J CollationMiscTest/TestContractionClosure (Å) |
+@ rules |
+&b=\u00C5 |
+* compare |
+<1 b |
+= \u00C5 |
+= b |
+= A\u030A |
+= b |
+= \u212B |
+ |
+** test: reset-before on already-tailored characters, ICU ticket 10108 |
+@ rules |
+&a<w<<x &[before 2]x<<y |
+* compare |
+<1 a |
+<1 w |
+<2 y |
+<2 x |
+ |
+@ rules |
+&a<<w<<<x &[before 2]x<<y |
+* compare |
+<1 a |
+<2 y |
+<2 w |
+<3 x |
+ |
+@ rules |
+&a<w<x &[before 2]x<<y |
+* compare |
+<1 a |
+<1 w |
+<1 y |
+<2 x |
+ |
+@ rules |
+&a<w<<<x &[before 2]x<<y |
+* compare |
+<1 a |
+<1 y |
+<2 w |
+<3 x |
+ |
+** test: numeric collation with other settings, ICU ticket 9092 |
+@ root |
+% strength=identical |
+% caseFirst=upper |
+% numeric=on |
+* compare |
+<1 100\u0020a |
+<1 101 |
+ |
+** test: collation type fallback from unsupported type, ICU ticket 10149 |
+@ locale fr-CA-u-co-phonebk |
+# Expect the same result as with fr-CA, using backwards-secondary order. |
+# That is, we should fall back from the unsupported collation type |
+# to the locale's default collation type. |
+* compare |
+<1 cote |
+<2 côte |
+<2 coté |
+<2 côté |
+ |
+** test: @ is equivalent to [backwards 2], ICU ticket 9956 |
+@ rules |
+&b<a @ &v<<w |
+* compare |
+<1 b |
+<1 a |
+<1 cote |
+<2 côte |
+<2 coté |
+<2 côté |
+<1 v |
+<2 w |
+<1 x |
+ |
+** test: shifted+reordering, ICU ticket 9507 |
+@ root |
+% reorder Grek punct space |
+% alternate=shifted |
+% strength=quaternary |
+# Which primaries are "variable" should be determined without script reordering, |
+# and then primaries should be reordered whether they are shifted to quaternary or not. |
+* compare |
+<4 ( # punctuation |
+<4 ) |
+<4 \u0020 # space |
+<1 ` # symbol |
+<1 ^ |
+<1 $ # currency symbol |
+<1 € |
+<1 0 # numbers |
+<1 ε # Greek |
+<1 e # Latin |
+<1 e(e |
+<4 e)e |
+<4 e\u0020e |
+<4 ee |
+<3 e(E |
+<4 e)E |
+<4 e\u0020E |
+<4 eE |
+ |
+** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 |
+@ rules |
+&\u0001<<<b<<<B |
+% caseFirst=upper |
+* compare |
+<1 aaa |
+<3 aaaB |
+ |
+** test: secondary+case ignores secondary ignorables, ICU ticket 9355 |
+@ rules |
+&\u0001<<<b<<<B |
+% strength=secondary |
+% caseLevel=on |
+* compare |
+<1 a |
+= ab |
+= aB |
+ |
+** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 |
+@ rules |
+&[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 |
+* compare |
+<1 ൗx |
+<2 ൌx |
+<1 ൗy |
+<2 ൌy |
+ |
+** test: quoted apostrophe in compact syntax, ICU ticket 8204 |
+@ rules |
+&q<<*a''c |
+* compare |
+<1 d |
+<1 p |
+<1 q |
+<2 a |
+<2 \u0027 |
+<2 c |
+<1 r |
+ |
+# ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" |
+** test: locale -u- with collation keywords, ICU ticket 8260 |
+@ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 |
+* compare |
+<4 \u0020 # space is shifted, strength=quaternary |
+<1 ! # punctuation is regular |
+<1 2 |
+<1 12 # numeric sorting |
+<1 B |
+<c b # uppercase first on case level |
+<1 x\u0301\u0308 |
+<2 x\u0308\u0301 # normalization off |
+ |
+** test: locale @ with collation keywords, ICU ticket 8260 |
+@ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted |
+* compare |
+<4 $ # currency symbols are shifted, strength=quaternary |
+<1 àla |
+<2 alà # backwards secondary level |
+ |
+** test: locale -u- with script reordering, ICU ticket 8260 |
+@ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai |
+* compare |
+<1 \u0020 |
+<1 あ |
+<1 ☂ |
+<1 Ω |
+<1 丂 |
+<1 ж |
+<1 L |
+<1 4 |
+<1 Ձ |
+<1 अ |
+<1 ሄ |
+<1 ฉ |
+ |
+** test: locale @collation=type should be case-insensitive |
+@ locale de@coLLation=PhoneBook |
+* compare |
+<1 ae |
+<2 ä |
+<3 Ä |
+ |
+** test: import root search rules plus German phonebook rules, ICU ticket 8962 |
+@ locale de-u-co-search |
+* compare |
+<1 = |
+<1 ≠ |
+<1 a |
+<1 ae |
+<2 ä |
+ |
+# Once more, but with runtime builder. |
+@ rules |
+[import und-u-co-search][import de-u-co-phonebk] |
+* compare |
+<1 = |
+<1 ≠ |
+<1 a |
+<1 ae |
+<2 ä |
+ |
+# Once again, with import from "root" not "und" (as in a proper language tag). |
+@ rules |
+[import root-u-co-search][import de-u-co-phonebk] |
+* compare |
+<1 = |
+<1 ≠ |
+<1 a |
+<1 ae |
+<2 ä |
+ |
+** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998 |
+# Greek should sort Greek first. |
+@ rules |
+[import el] |
+* compare |
+<1 4 |
+<1 Ω |
+<1 L |
+ |
+# Import Greek, and then reset the reordering. |
+@ rules |
+[import el][reorder Zzzz] |
+* compare |
+<1 4 |
+<1 L |
+<1 Ω |
+ |
+# "others" is a synonym for Zzzz. |
+@ rules |
+[import el][reorder others] |
+* compare |
+<1 4 |
+<1 L |
+<1 Ω |