Index: source/test/testdata/collationtest.txt |
diff --git a/source/test/testdata/collationtest.txt b/source/test/testdata/collationtest.txt |
deleted file mode 100644 |
index 3a703cb10b6a091cb39eb33b96a4ce7b18d0ae53..0000000000000000000000000000000000000000 |
--- a/source/test/testdata/collationtest.txt |
+++ /dev/null |
@@ -1,2540 +0,0 @@ |
-# Copyright (c) 2012-2015 International Business Machines |
-# Corporation and others. All Rights Reserved. |
-# |
-# This file should be in UTF-8 with a signature byte sequence ("BOM"). |
-# |
-# collationtest.txt: Collation test data. |
-# |
-# created on: 2012apr13 |
-# created by: Markus W. Scherer |
- |
-# A line with "** test: description" is used for verbose and error output. |
- |
-# A collator can be set with "@ root" or "@ locale language-tag", |
-# for example "@ locale de-u-co-phonebk". |
-# An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". |
- |
-# A collator can be built with "@ rules". |
-# An "@ rules" line is followed by one or more lines with the tailoring rules. |
- |
-# A collator can be modified with "% attribute=value". |
- |
-# "* compare" tests the order (= or <) of the following strings. |
-# The relation can be "=" or "<" (the level of the difference is not specified) |
-# or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference). |
- |
-# Test sections ("* compare") are terminated by |
-# definitions of new collators, changing attributes, or new test sections. |
- |
-** test: simple CEs & expansions |
-# Many types of mappings are tested elsewhere, including via the UCA conformance tests. |
-# Here we mostly cover a few unusual mappings. |
-@ rules |
-&\x01 # most control codes are ignorable |
-<<<\u0300 # tertiary CE |
-&9<\x00 # NUL not ignorable |
-&\uA00A\uA00B=\uA002 # two long-primary CEs |
-&\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits |
- |
-* compare |
-= \x01 |
-= \x02 |
-<3 \u0300 |
-<1 9 |
-<1 \x00 |
-= \x01\x00\x02 |
-<1 a |
-<3 a\u0300 |
-<2 a\u0308 |
-= ä |
-<1 b |
-<1 か # Hiragana Ka (U+304B) |
-<2 か\u3099 # plus voiced sound mark |
-= が # Hiragana Ga (U+304C) |
-<1 \uA00A\uA00B |
-= \uA002 |
-<1 \uA00A\uA00B\u00050004 |
-<1 \uA00A\uA00B\u00050005 |
-= \uA003 |
-<1 \uA00A\uA00B\u00050006 |
- |
-** test: contractions |
-# Create some interesting mappings, and map some normalization-inert characters |
-# (which are not subject to canonical reordering) |
-# to some of the same CEs to check the sequence of CEs. |
-@ rules |
- |
-# Contractions starting with 'a' should not continue with any character < U+0300 |
-# so that we can test a shortcut for that. |
-&a=ⓐ |
-&b<bz=ⓑ |
-&d<dz\u0301=ⓓ # d+z+acute |
-&z |
-<a\u0301=Ⓐ # a+acute sorts after z |
-<a\u0301\u0301=Ⓑ # a+acute+acute |
-<a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right |
-<a\u030a=Ⓓ # a+ring |
-<a\u0323=Ⓔ # a+dot below |
-<a\u0323\u0358=Ⓕ # a+dot below+dot above right |
-<a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring |
-<a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z |
- |
-&\U0001D158=⁰ # musical notehead black (has a symbol primary) |
-<\U0001D158\U0001D165=¼ # musical quarter note |
- |
-# deliberately missing prefix contractions: |
-# dz |
-# a\u0327 |
-# a\u0327\u0323 |
-# a\u0327\u0323b |
- |
-&\x01 |
-<<<\U0001D165=¹ # musical stem (ccc=216) |
-<<<\U0001D16D=² # musical augmentation dot (ccc=226) |
-<<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) |
-&\u0301=❶ # acute (ccc=230) |
-&\u030a=❷ # ring (ccc=230) |
-&\u0308=❸ # diaeresis (ccc=230) |
-<<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) |
-&\u0327=❺ # cedilla (ccc=202) |
-&\u0323=❻ # dot below (ccc=220) |
-&\u0331=❼ # macron below (ccc=220) |
-<<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) |
-&\u0334=❾ # tilde overlay (ccc=1) |
-&\u0358=❿ # dot above right (ccc=232) |
- |
-&\u0f71=① # tibetan vowel sign aa |
-&\u0f72=② # tibetan vowel sign i |
-# \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 |
-&\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) |
- |
-** test: simple contractions |
- |
-# Some strings are chosen to cause incremental contiguous contraction matching to |
-# go into partial matches for prefixes of contractions |
-# (where the prefixes are deliberately not also contractions). |
-# When there is no complete match, then the matching code must back out of those |
-# so that discontiguous contractions work as specified. |
- |
-* compare |
-# contraction starter with no following text, or mismatch, or blocked |
-<1 a |
-= ⓐ |
-<1 aa |
-= ⓐⓐ |
-<1 ab |
-= ⓐb |
-<1 az |
-= ⓐz |
- |
-* compare |
-<1 a |
-<2 a\u0308\u030a # ring blocked by diaeresis |
-= ⓐ❸❷ |
-<2 a\u0327 |
-= ⓐ❺ |
- |
-* compare |
-<2 \u0308 |
-= ❸ |
-<2 \u0308\u030a\u0301 # acute blocked by ring |
-= ❸❷❶ |
- |
-* compare |
-<1 \U0001D158 |
-= ⁰ |
-<1 \U0001D158\U0001D165 |
-= ¼ |
- |
-# no discontiguous contraction because of missing prefix contraction d+z, |
-# and a starter ('z') after the 'd' |
-* compare |
-<1 dz\u0323\u0301 |
-= dz❻❶ |
- |
-# contiguous contractions |
-* compare |
-<1 abz |
-= ⓐⓑ |
-<1 abzz |
-= ⓐⓑz |
- |
-* compare |
-<1 a |
-<1 z |
-<1 a\u0301 |
-= Ⓐ |
-<1 a\u0301\u0301 |
-= Ⓑ |
-<1 a\u0301\u0301\u0358 |
-= Ⓒ |
-<1 a\u030a |
-= Ⓓ |
-<1 a\u0323\u0358 |
-= Ⓕ |
-<1 a\u0327\u0323\u030a # match despite missing prefix |
-= Ⓖ |
-<1 a\u0327\u0323bz |
-= Ⓗ |
- |
-* compare |
-<2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second |
-= ❸❹ |
- |
-* compare |
-<1 \U0001D158\U0001D165 |
-= ¼ |
- |
-* compare |
-<3 \U0001D165\U0001D16D |
-= ³ |
- |
-** test: discontiguous contractions |
-* compare |
-<1 a\u0327\u030a # a+ring skips cedilla |
-= Ⓓ❺ |
-<2 a\u0327\u0327\u030a # a+ring skips 2 cedillas |
-= Ⓓ❺❺ |
-<2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas |
-= Ⓓ❺❺❺ |
-<2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas |
-= Ⓓ❾❺❺ |
-<1 a\u0327\u0323 # a+dot below skips cedilla |
-= Ⓔ❺ |
-<1 a\u0323\u0301\u0358 # a+dot below+dot ab.r.: 2-char match, then skips acute |
-= Ⓕ❶ |
-<2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay |
-= Ⓕ❾ |
- |
-* compare |
-<2 \u0331\u0331\u0358 # macron below+dot ab.r. skips the second macron below |
-= ❽❼ |
- |
-* compare |
-<1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) |
-= Ⓓ❺❼❻ |
-<1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla |
-= Ⓔ❺²❷ |
-<2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas |
-= Ⓔ❺❺❷ |
-<2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla |
-= Ⓔ❺❻❷ |
-<2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla |
-= Ⓔ❾❺❷ |
- |
-* compare |
-<1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla |
-= ¼❺ |
-<1 a\U0001D165\u0323 # a+dot below skips stem |
-= Ⓔ¹ |
- |
-# partial contiguous match, backs up, matches discontiguous contraction |
-<1 a\u0327\u0323b |
-= Ⓔ❺b |
-<1 a\u0327\u0323ba |
-= Ⓔ❺bⓐ |
- |
-# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks |
-* compare |
-<1 a\u0327\u0301\u0301\u0358 |
-= Ⓒ❺ |
- |
-# FCD but not NFD |
-* compare |
-<1 a\u0f73\u0301 # a+acute skips tibetan ii |
-= Ⓐ③ |
- |
-# FCD but the 0f71 inside the 0f73 must be skipped |
-# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 |
-* compare |
-<1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 |
-= ③① |
- |
-** test: discontiguous contractions with nested contractions |
-* compare |
-<1 a\u0323\u0308\u0301\u0358 |
-= Ⓕ❹ |
-<2 a\u0323\u0308\u0301\u0308\u0301\u0358 |
-= Ⓕ❹❹ |
- |
-** test: discontiguous contractions with interleaved contractions |
-* compare |
-# a+ring & cedilla & macron below+dot above right |
-<1 a\u0327\u0331\u030a\u0358 |
-= Ⓓ❺❽ |
- |
-# a+ring & 1x..3x macron below+dot above right |
-<2 a\u0331\u030a\u0358 |
-= Ⓓ❽ |
-<2 a\u0331\u0331\u030a\u0358\u0358 |
-= Ⓓ❽❽ |
-# also skips acute |
-<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 |
-= Ⓓ❽❽❽❶ |
- |
-# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute |
-<1 a\U0001D165\u0323\U0001D16Ddz\u0301 |
-= Ⓔ³ⓓ |
- |
-** test: some simple string comparisons |
-@ root |
-* compare |
-# first string compares against "" |
-= \u0000 |
-< a |
-<1 b |
-<3 B |
-= \u0000B\u0000 |
- |
-** test: compare with strength=primary |
-% strength=primary |
-* compare |
-<1 a |
-<1 b |
-= B |
- |
-** test: compare with strength=secondary |
-% strength=secondary |
-* compare |
-<1 a |
-<1 b |
-= B |
- |
-** test: compare with strength=tertiary |
-% strength=tertiary |
-* compare |
-<1 a |
-<1 b |
-<3 B |
- |
-** test: compare with strength=quaternary |
-% strength=quaternary |
-* compare |
-<1 a |
-<1 b |
-<3 B |
- |
-** test: compare with strength=identical |
-% strength=identical |
-* compare |
-<1 a |
-<1 b |
-<3 B |
- |
-** test: côté with forwards secondary |
-@ root |
-* compare |
-<1 cote |
-<2 coté |
-<2 côte |
-<2 côté |
- |
-** test: côté with forwards secondary vs. U+FFFE merge separator |
-# Merged sort keys: On each level, any difference in the first segment |
-# must trump any further difference. |
-* compare |
-<1 cote\uFFFEcôté |
-<2 coté\uFFFEcôte |
-<2 côte\uFFFEcoté |
-<2 côté\uFFFEcote |
- |
-** test: côté with backwards secondary |
-% backwards=on |
-* compare |
-<1 cote |
-<2 côte |
-<2 coté |
-<2 côté |
- |
-** test: côté with backwards secondary vs. U+FFFE merge separator |
-# Merged sort keys: On each level, any difference in the first segment |
-# must trump any further difference. |
-* compare |
-<1 cote\uFFFEcôté |
-<2 côte\uFFFEcoté |
-<2 coté\uFFFEcôte |
-<2 côté\uFFFEcote |
- |
-** test: U+FFFE on identical level |
-@ root |
-% strength=identical |
-* compare |
-# All of these control codes are completely-ignorable, so that |
-# their low code points are compared with the merge separator. |
-# The merge separator must compare less than any other character. |
-<1 \uFFFE\u0001\u0002\u0003 |
-<i \u0001\uFFFE\u0002\u0003 |
-<i \u0001\u0002\uFFFE\u0003 |
-<i \u0001\u0002\u0003\uFFFE |
- |
-* compare |
-# The merge separator must even compare less than U+0000. |
-<1 \uFFFE\u0000\u0000 |
-<i \u0000\uFFFE\u0000 |
-<i \u0000\u0000\uFFFE |
- |
-** test: Hani < surrogates < U+FFFD |
-# Note: compareUTF8() treats unpaired surrogates like U+FFFD, |
-# so with that the strings with surrogates will compare equal to each other |
-# and equal to the string with U+FFFD. |
-@ root |
-% strength=identical |
-* compare |
-<1 abz |
-<1 a\u4e00z |
-<1 a\U00020000z |
-<1 a\ud800z |
-<1 a\udbffz |
-<1 a\udc00z |
-<1 a\udfffz |
-<1 a\ufffdz |
- |
-** test: script reordering |
-@ root |
-% reorder Hani Zzzz digit |
-* compare |
-<1 ? |
-<1 + |
-<1 丂 |
-<1 a |
-<1 α |
-<1 5 |
- |
-% reorder default |
-* compare |
-<1 ? |
-<1 + |
-<1 5 |
-<1 a |
-<1 α |
-<1 丂 |
- |
-** test: empty rules |
-@ rules |
-* compare |
-<1 a |
-<2 ä |
-<3 Ä |
-<1 b |
- |
-** test: very simple rules |
-@ rules |
-&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z |
-% strength=quaternary |
-* compare |
-<1 a |
-= e |
-<4 q |
-<4 r |
-<1 x |
-<3 X |
-<2 y |
-<3 Y |
-<2 z |
-<3 Z |
- |
-** test: tailoring twice before a root position: primary |
-@ rules |
-&[before 1]b<p |
-&[before 1]b<q |
-* compare |
-<1 a |
-<1 p |
-<1 q |
-<1 b |
- |
-** test: tailoring twice before a root position: secondary |
-@ rules |
-&[before 2]ſ<<p |
-&[before 2]ſ<<q |
-* compare |
-<1 s |
-<2 p |
-<2 q |
-<2 ſ |
- |
-# secondary-before common weight |
-@ rules |
-&[before 2]b<<p |
-&[before 2]b<<q |
-* compare |
-<1 a |
-<1 p |
-<2 q |
-<2 b |
- |
-** test: tailoring twice before a root position: tertiary |
-@ rules |
-&[before 3]B<<<p |
-&[before 3]B<<<q |
-* compare |
-<1 b |
-<3 p |
-<3 q |
-<3 B |
- |
-# tertiary-before common weight |
-@ rules |
-&[before 3]b<<<p |
-&[before 3]b<<<q |
-* compare |
-<1 a |
-<1 p |
-<3 q |
-<3 b |
- |
-@ rules |
-&[before 2]b<<s |
-&[before 3]s<<<p |
-&[before 3]s<<<q |
-* compare |
-<1 a |
-<1 p |
-<3 q |
-<3 s |
-<2 b |
- |
-** test: tailor after completely ignorable |
-@ rules |
-&\x00<<<x<<y |
-* compare |
-= \x00 |
-= \x1F |
-<3 x |
-<2 y |
- |
-** test: secondary tailoring gaps, ICU ticket 9362 |
-@ rules |
-&[before 2]s<<'_' |
-&s<<r # secondary between s and ſ (long s) |
-&ſ<<*a-q # more than 15 between ſ and secondary CE boundary |
-&[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE |
-&[last primary ignorable]<<y<<z |
- |
-* compare |
-<2 u |
-<2 v |
-<2 \u0332 # lowest secondary CE |
-<2 \u0308 |
-<2 y |
-<2 z |
-<1 s_ |
-<2 ss |
-<2 sr |
-<2 sſ |
-<2 sa |
-<2 sb |
-<2 sp |
-<2 sq |
-<2 sus |
-<2 svs |
-<2 rs |
- |
-** test: tertiary tailoring gaps, ICU ticket 9362 |
-@ rules |
-&[before 3]t<<<'_' |
-&t<<<r # tertiary between t and fullwidth t |
-&ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary |
-&[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE |
-&[last secondary ignorable]<<<y<<<z |
- |
-* compare |
-<3 u |
-<3 v |
-# Note: The root collator currently does not map any characters to tertiary CEs. |
-<3 y |
-<3 z |
-<1 t_ |
-<3 tt |
-<3 tr |
-<3 tｔ |
-<3 tᵀ |
-<3 ta |
-<3 tb |
-<3 tp |
-<3 tq |
-<3 tut |
-<3 tvt |
-<3 rt |
- |
-** test: secondary & tertiary around root character |
-@ rules |
-&[before 2]m<<r |
-&m<<s |
-&[before 3]m<<<u |
-&m<<<v |
-* compare |
-<1 l |
-<1 r |
-<2 u |
-<3 m |
-<3 v |
-<2 s |
-<1 n |
- |
-** test: secondary & tertiary around tailored item |
-@ rules |
-&m<x |
-&[before 2]x<<r |
-&x<<s |
-&[before 3]x<<<u |
-&x<<<v |
-* compare |
-<1 m |
-<1 r |
-<2 u |
-<3 x |
-<3 v |
-<2 s |
-<1 n |
- |
-** test: more nesting of secondary & tertiary before |
-@ rules |
-&[before 3]m<<<u |
-&[before 2]m<<r |
-&[before 3]r<<<q |
-&m<<<w |
-&m<<t |
-&[before 3]w<<<v |
-&w<<<x |
-&w<<s |
-* compare |
-<1 l |
-<1 q |
-<3 r |
-<2 u |
-<3 m |
-<3 v |
-<3 w |
-<3 x |
-<2 s |
-<2 t |
-<1 n |
- |
-** test: case bits |
-@ rules |
-&w<x # tailored CE getting case bits |
- =uv=uV=Uv=UV # 2 chars -> 1 CE |
-&ae=ch=cH=Ch=CH # 2 chars -> 2 CEs |
-&rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs |
-% caseFirst=lower |
-* compare |
-<1 ae |
-= ch |
-<3 cH |
-<3 Ch |
-<3 CH |
-<1 rst |
-= yz |
-<3 yZ |
-<3 Yz |
-<3 YZ |
-<1 w |
-<1 x |
-= uv |
-<3 uV |
-= Uv # mixed case on single CE cannot distinguish variations |
-<3 UV |
- |
-** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower |
-@ rules |
-&\u0001<<<t<<<T # tertiary CEs |
-% caseFirst=lower |
-* compare |
-<1 aa |
-<3 aat |
-<3 aaT |
-<3 aA |
-<3 aAt |
-<3 ata |
-<3 aTa |
- |
-** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper |
-% caseFirst=upper |
-* compare |
-<1 aA |
-<3 aAt |
-<3 aa |
-<3 aat |
-<3 aaT |
-<3 ata |
-<3 aTa |
- |
-** test: reset on expansion, ICU tickets 9415 & 9593 |
-@ rules |
-&æ<x # tailor the last primary CE so that x sorts between ae and af |
-&æb=bæ # copy all reset CEs to make bæ sort the same |
-&각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 |
-&⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference |
-&l·=z # handle the pre-context for · when fetching reset CEs |
- <<u # copy/tailor 2 CEs |
- |
-* compare |
-<1 ae |
-<2 æ |
-<1 x |
-<1 af |
- |
-* compare |
-<1 aeb |
-<2 æb |
-= bæ |
- |
-* compare |
-<1 각 |
-<1 h |
-<1 갂 |
-<1 갃 |
- |
-* compare |
-<1 · # by itself: primary CE |
-<1 l |
-<2 l· # l+middle dot has only a secondary difference from l |
-= z |
-<2 u |
- |
-* compare |
-<1 (13) |
-<3 ⒀ # DUCET sets special tertiary weights in all CEs |
-<2 y |
-<1 (13[ |
- |
-% alternate=shifted |
-* compare |
-<1 (13) |
-= 13 |
-<3 ⒀ |
-= y # alternate=shifted removes the tailoring difference on the last CE |
-<1 14 |
- |
-** test: contraction inside extension, ICU ticket 9378 |
-@ rules |
-&а<<х/й # all letters are Cyrillic |
-* compare |
-<1 ай |
-<2 х |
- |
-** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 |
-@ rules |
-&t<x &ᵀ<y # same primary weights |
-&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent |
-* compare |
-<1 q |
-<1 u |
-<1 v |
-<1 ꝗ |
-<1 t |
-<3 ᵀ |
-<1 y |
-<1 x |
- |
-# Principle: Each rule builds on the state of preceding rules and ignores following rules. |
- |
-** test: later rule does not affect earlier reset position, ICU ticket 10105 |
-@ rules |
-&a < u < v < w &ov < x &b < v |
-* compare |
-<1 oa |
-<1 ou |
-<1 x # CE(o) followed by CE between u and w |
-<1 ow |
-<1 ob |
-<1 ov |
- |
-** test: later rule does not affect earlier extension (1), ICU ticket 10105 |
-@ rules |
-&a=x/b &v=b |
-% strength=secondary |
-* compare |
-<1 B |
-<1 c |
-<1 v |
-= b |
-* compare |
-<1 AB |
-= x |
-<1 ac |
-<1 av |
-= ab |
- |
-** test: later rule does not affect earlier extension (2), ICU ticket 10105 |
-@ rules |
-&a <<< c / e &g <<< e / l |
-% strength=secondary |
-* compare |
-<1 AE |
-= c |
-<2 æ |
-<1 agl |
-= ae |
- |
-** test: later rule does not affect earlier extension (3), ICU ticket 10105 |
-@ rules |
-&a = b / c &d = c / e |
-% strength=secondary |
-* compare |
-<1 AC # C is still only tertiary different from the original c |
-= b |
-<1 ade |
-= ac |
- |
-** test: extension contains tailored character, ICU ticket 10105 |
-@ rules |
-&a=e &b=u/e |
-* compare |
-<1 a |
-= e |
-<1 ba |
-= be |
-= u |
- |
-** test: add simple mappings for characters with root context |
-@ rules |
-&z=· # middle dot has a prefix mapping in the CLDR root |
-&n=и # и (U+0438) has contractions in the root |
-* compare |
-<1 l |
-<2 l· # root mapping for l|· still works |
-<1 z |
-= · |
-* compare |
-<1 n |
-= и |
-<1 И |
-<1 и\u0306 # root mapping for й=и\u0306 still works |
-= й |
-<3 Й |
- |
-** test: add context mappings around characters with root context |
-@ rules |
-&z=·h # middle dot has a prefix mapping in the CLDR root |
-&n=ә|и # и (U+0438) has contractions in the root |
-* compare |
-<1 l |
-<2 l· # root mapping for l|· still works |
-<1 z |
-= ·h |
-* compare |
-<1 и |
-<3 И |
-<1 и\u0306 # root mapping for й=и\u0306 still works |
-= й |
-* compare |
-<1 әn |
-= әи |
-<1 әo |
- |
-** test: many secondary CEs at the top of their range |
-@ rules |
-&[last primary ignorable]<<*\u2801-\u28ff |
-* compare |
-<2 \u0308 |
-<2 \u2801 |
-<2 \u2802 |
-<2 \u2803 |
-<2 \u2804 |
-<2 \u28fd |
-<2 \u28fe |
-<2 \u28ff |
-<1 \x20 |
- |
-** test: many tertiary CEs at the top of their range |
-@ rules |
-&[last secondary ignorable]<<<*a-z |
-* compare |
-<3 a |
-<3 b |
-<3 c |
-<3 d |
-# e..w |
-<3 x |
-<3 y |
-<3 z |
-<2 \u0308 |
- |
-** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 |
-@ rules |
-&a=p|x &b=px &c=op |
-* compare |
-<1 b |
-= px |
-<3 B |
-<1 c |
-= op |
-<3 C |
-* compare |
-<1 ca |
-= opx # first contraction op, then prefix p|x |
-<3 cA |
-<3 Ca |
- |
-** test: reset position with prefix (pre-context), ICU ticket 10102 |
-@ rules |
-&a=p|x &px=y |
-* compare |
-<1 pa |
-= px |
-= y |
-<3 pA |
-<1 q |
-<1 x |
- |
-** test: prefix+contraction together (1), ICU ticket 10071 |
-@ rules |
-&x=a|bc |
-* compare |
-<1 ab |
-<1 Abc |
-<1 abd |
-<1 ac |
-<1 aw |
-<1 ax |
-= abc |
-<3 aX |
-<3 Ax |
-<1 b |
-<1 bb |
-<1 bc |
-<3 bC |
-<3 Bc |
-<1 bd |
- |
-** test: prefix+contraction together (2), ICU ticket 10071 |
-@ rules |
-&w=bc &x=a|b |
-* compare |
-<1 w |
-= bc |
-<3 W |
-* compare |
-<1 aw |
-<1 ax |
-= ab |
-<3 aX |
-<1 axb |
-<1 axc |
-= abc # prefix match a|b takes precedence over contraction match bc |
-<3 abC |
-<1 abd |
-<1 ay |
- |
-** test: prefix+contraction together (3), ICU ticket 10071 |
-@ rules |
-&x=a|b &w=bc # reverse order of rules as previous test, order should not matter here |
-* compare # same "compare" sequences as previous test |
-<1 w |
-= bc |
-<3 W |
-* compare |
-<1 aw |
-<1 ax |
-= ab |
-<3 aX |
-<1 axb |
-<1 axc |
-= abc # prefix match a|b takes precedence over contraction match bc |
-<3 abC |
-<1 abd |
-<1 ay |
- |
-** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 |
-@ rules |
-&d=ch &v=p|ci |
-* compare |
-<1 pc |
-<3 pC |
-<1 pcH |
-<1 pcI |
-<1 pd |
-= pch # no-prefix contraction ch matches |
-<3 pD |
-<1 pv |
-= pci # prefix+contraction p|ci matches |
-<3 pV |
- |
-** test: tailor in & around compact ranges of root primaries |
-# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs |
-# which should be reliably encoded as one range in the root elements data. |
-@ rules |
-&[before 1]ᚁ<a |
-&ᚁ<b |
-&[before 1]ᚂ<c |
-&ᚂ<d |
-&[before 1]ᚚ<y |
-&ᚚ<z |
-&[before 2]ᚁ<<r |
-&ᚁ<<s |
-&[before 3]ᚚ<<<t |
-&ᚚ<<<u |
-* compare |
-<1 ᣵ # U+18F5 last Canadian Aboriginal |
-<1 a |
-<1 r |
-<2 ᚁ |
-<2 s |
-<1 b |
-<1 c |
-<1 ᚂ |
-<1 d |
-<1 ᚃ |
-<1 ᚙ |
-<1 y |
-<1 t |
-<3 ᚚ |
-<3 u |
-<1 z |
-<1 ᚠ # U+16A0 first Runic |
- |
-** test: suppressContractions |
-@ rules |
-&z<ch<әж [suppressContractions [·cә]] |
-* compare |
-<1 ch |
-<3 cH # ch was suppressed |
-<1 l |
-<1 l· # primary difference, not secondary, because l|· was suppressed |
-<1 ә |
-<2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed |
-<1 әж |
-<3 әЖ |
- |
-** test: Hangul & Jamo |
-@ rules |
-&L=\u1100 # first Jamo L |
-&V=\u1161 # first Jamo V |
-&T=\u11A8 # first Jamo T |
-&\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs |
-* compare |
-<1 Lv |
-<3 LV |
-= \u1100\u1161 |
-= \uAC00 |
-<1 LVt |
-<3 LVT |
-= \u1100\u1161\u11A8 |
-= \uAC00\u11A8 |
-= \uAC01 |
-<2 LVT\u0308 |
-<2 \u4E00 |
-<2 \u4E01 |
-<2 \u4E80 |
-<2 \u4EFF |
-<2 LV\u0308T |
-<1 \uAC02 |
- |
-** test: adjust special reset positions according to previous rules, CLDR ticket 6070 |
-@ rules |
-&[last variable]<x |
-[maxVariable space] # has effect only after building, no effect on following rules |
-&[last variable]<y |
-&[before 1][first regular]<z |
-* compare |
-<1 ? # some punctuation |
-<1 x |
-<1 y |
-<1 z |
-<1 $ # some symbol |
- |
-@ rules |
-&[last primary ignorable]<<x<<<y |
-&[last primary ignorable]<<z |
-* compare |
-<2 \u0358 |
-<2 x |
-<3 y |
-<2 z |
-<1 \x20 |
- |
-@ rules |
-&[last secondary ignorable]<<<x |
-&[last secondary ignorable]<<<y |
-* compare |
-<3 x |
-<3 y |
-<2 \u0358 |
- |
-@ rules |
-&[before 2][first variable]<<z |
-&[before 2][first variable]<<y |
-&[before 3][first variable]<<<x |
-&[before 3][first variable]<<<w |
-&[before 1][first variable]<v |
-&[before 2][first variable]<<u |
-&[before 3][first variable]<<<t |
-&[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary |
-* compare |
-<2 \u0358 |
-<1 s |
-<2 \uFDD1\xA0 |
-<1 t |
-<3 u |
-<2 v |
-<1 w |
-<3 x |
-<3 y |
-<2 z |
-<2 \t |
- |
-@ rules |
-&[before 2][first regular]<<z |
-&[before 3][first regular]<<<y |
-&[before 1][first regular]<x |
-&[before 3][first regular]<<<w |
-&[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary |
-&[before 3][first regular]<<<u |
-&[before 1][first regular]<p # primary before the boundary: becomes variable |
-&[before 3][first regular]<<<t # not affected by p |
-&[last variable]<q # after p! |
-* compare |
-<1 ? |
-<1 p |
-<1 q |
-<1 t |
-<3 u |
-<3 v |
-<1 w |
-<3 x |
-<1 y |
-<3 z |
-<1 $ |
- |
-# check that p & q are indeed variable |
-% alternate=shifted |
-* compare |
-= ? |
-= p |
-= q |
-<1 t |
-<3 u |
-<3 v |
-<1 w |
-<3 x |
-<1 y |
-<3 z |
-<1 $ |
- |
-@ rules |
-&[before 2][first trailing]<<z |
-&[before 1][first trailing]<y |
-&[before 3][first trailing]<<<x |
-* compare |
-<1 \u4E00 # first Han, first implicit |
-<1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary |
-# Note: The root collator currently does not map any characters to the trailing first boundary primary. |
-<1 x |
-<3 y |
-<1 z |
-<2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. |
- |
-@ rules |
-&[before 2][first primary ignorable]<<z |
-&[before 2][first primary ignorable]<<y |
-&[before 3][first primary ignorable]<<<x |
-&[before 3][first primary ignorable]<<<w |
-* compare |
-= \x01 |
-<2 w |
-<3 x |
-<3 y |
-<2 z |
-<2 \u0301 |
- |
-@ rules |
-&[before 3][first secondary ignorable]<<<y |
-&[before 3][first secondary ignorable]<<<x |
-* compare |
-= \x01 |
-<3 x |
-<3 y |
-<2 \u0301 |
- |
-** test: canonical closure |
-@ rules |
-&X=A &U=Â |
-* compare |
-<1 U |
-= Â |
-= A\u0302 |
-<2 Ú # U with acute |
-= U\u0301 |
-= Ấ # A with circumflex & acute |
-= Â\u0301 |
-= A\u0302\u0301 |
-<1 X |
-= A |
-<2 X\u030A # with ring above |
-= Å |
-= A\u030A |
-= \u212B # Angstrom sign |
- |
-@ rules |
-&x=\u5140\u55C0 |
-* compare |
-<1 x |
-= \u5140\u55C0 |
-= \u5140\uFA0D |
-= \uFA0C\u55C0 |
-= \uFA0C\uFA0D # CJK compatibility characters |
-<3 X |
- |
-# canonical closure on prefix rules, ICU ticket 9444 |
-@ rules |
-&x=ä|ŝ |
-* compare |
-<1 äs # not tailored |
-<1 äx |
-= äŝ |
-= a\u0308s\u0302 |
-= a\u0308ŝ |
-= äs\u0302 |
-<3 äX |
- |
-** test: conjoining Jamo map to expansions |
-@ rules |
-&gg=\u1101 # Jamo Lead consonant GG |
-&nj=\u11AC # Jamo Trail consonant NJ |
-* compare |
-<1 gg\u1161nj |
-= \u1101\u1161\u11AC |
-= \uAE4C\u11AC |
-= \uAE51 |
-<3 gg\u1161nJ |
-<1 \u1100\u1100 |
- |
-** test: canonical tail closure, ICU ticket 5913 |
-@ rules |
-&a<â |
-* compare |
-<1 a |
-<1 â # tailored |
-= a\u0302 |
-<2 a\u0323\u0302 # discontiguous contraction |
-= ạ\u0302 # equivalent |
-= ậ # equivalent |
-<1 b |
- |
-@ rules |
-&a<ạ |
-* compare |
-<1 a |
-<1 ạ # tailored |
-= a\u0323 |
-<2 a\u0323\u0302 # contiguous contraction plus extra diacritic |
-= ạ\u0302 # equivalent |
-= ậ # equivalent |
-<1 b |
- |
-# Tail closure should work even if there is a prefix and/or contraction. |
-@ rules |
-&a<\u5140|câ |
-# In order to find discontiguous contractions for \u5140|câ |
-# there must exist a mapping for \u5140|ca, regardless of what it maps to. |
-# (This follows from the UCA spec.) |
-&x=\u5140|ca |
-* compare |
-<1 \u5140a |
-= \uFA0Ca |
-<1 \u5140câ # tailored |
-= \uFA0Ccâ |
-= \u5140ca\u0302 |
-= \uFA0Cca\u0302 |
-<2 \u5140ca\u0323\u0302 # discontiguous contraction |
-= \uFA0Cca\u0323\u0302 |
-= \u5140cạ\u0302 |
-= \uFA0Ccạ\u0302 |
-= \u5140cậ |
-= \uFA0Ccậ |
-<1 \u5140b |
-= \uFA0Cb |
-<1 \u5140x |
-= \u5140ca |
- |
-# Double-check that without the extra mapping there will be no discontiguous match. |
-@ rules |
-&a<\u5140|câ |
-* compare |
-<1 \u5140a |
-= \uFA0Ca |
-<1 \u5140câ # tailored |
-= \uFA0Ccâ |
-= \u5140ca\u0302 |
-= \uFA0Cca\u0302 |
-<1 \u5140b |
-= \uFA0Cb |
-<1 \u5140ca\u0323\u0302 # no discontiguous contraction |
-= \uFA0Cca\u0323\u0302 |
-= \u5140cạ\u0302 |
-= \uFA0Ccạ\u0302 |
-= \u5140cậ |
-= \uFA0Ccậ |
- |
-@ rules |
-&a<cạ |
-* compare |
-<1 a |
-<1 cạ # tailored |
-= ca\u0323 |
-<2 ca\u0323\u0302 # contiguous contraction plus extra diacritic |
-= cạ\u0302 # equivalent |
-= cậ # equivalent |
-<1 b |
- |
-# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI |
-# = 03C9 0313 0300 0345 |
-# ccc = 0, 230, 230, 240 |
-@ rules |
-&δ=αῳ |
-# In order to find discontiguous contractions for αῳ |
-# there must exist a mapping for αω, regardless of what it maps to. |
-# (This follows from the UCA spec.) |
-&ε=αω |
-* compare |
-<1 δ |
-= αῳ |
-= αω\u0345 |
-<2 αω\u0313\u0300\u0345 # discontiguous contraction |
-= αὠ\u0300\u0345 |
-= αὢ\u0345 |
-= αᾢ |
-<2 αω\u0300\u0313\u0345 |
-= αὼ\u0313\u0345 |
-= αῲ\u0313 # not FCD |
-<1 ε |
-= αω |
- |
-# Double-check that without the extra mapping there will be no discontiguous match. |
-@ rules |
-&δ=αῳ |
-* compare |
-<1 αω\u0313\u0300\u0345 # no discontiguous contraction |
-= αὠ\u0300\u0345 |
-= αὢ\u0345 |
-= αᾢ |
-<2 αω\u0300\u0313\u0345 |
-= αὼ\u0313\u0345 |
-= αῲ\u0313 # not FCD |
-<1 δ |
-= αῳ |
-= αω\u0345 |
- |
-# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. |
-# Tests code paths where the tailored string has a combining mark |
-# that does not occur in any composite's decomposition. |
-@ rules |
-&δ=αὼ\u0315 |
-* compare |
-<1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. |
-= αὠ\u0300\u0315 |
-= αὢ\u0315 |
-<1 δ |
-= αὼ\u0315 |
-= αω\u0300\u0315 |
-<2 αω\u0300\u0315\u0345 |
-= αὼ\u0315\u0345 |
-= αῲ\u0315 # not FCD |
- |
-** test: danish a+a vs. a-umlaut, ICU ticket 9319 |
-@ rules |
-&z<aa |
-* compare |
-<1 z |
-<1 aa |
-<2 aa\u0308 |
-= aä |
- |
-** test: Jamo L with and in prefix |
-# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). |
-@ rules |
-# Jamo Lead consonant G after G or GG |
-&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 |
-# Jamo Lead consonant GG sorts like G+G |
-&\u1100\u1100=\u1101 |
-# Note: Making G|GG and GG|GG sort the same as G|G+G |
-# would require the ability to reset on G|G+G, |
-# or we could make G-after-G equal to some secondary-CE character, |
-# and reset on a pair of those. |
-# (It does not matter much if there are at most two G in a row in real text.) |
-* compare |
-<1 \u1100 |
-<2 \u1100\u1100 # only one primary from a sequence of G lead consonants |
-= \u1101 |
-<2 \u1100\u1100\u1100 |
-= \u1101\u1100 |
-# but not = \u1100\u1101, see above |
-<1 \u1100\u1161 |
-= \uAC00 |
-<2 \u1100\u1100\u1161 |
-= \u1100\uAC00 # prefix match from the L of the LV syllable |
-= \u1101\u1161 |
-= \uAE4C |
- |
-** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 |
-@ rules |
-# Low secondary CEs for Jamo V & T. |
-# Note: T should sort before V for proper syllable order. |
-&\u0332 # COMBINING LOW LINE (first primary ignorable) |
-<<\u1161<<\u1162 |
- |
-# Korean Jamo lead consonant search rules, part 2: |
-# Make modern compound L jamo primary equivalent to non-compound forms. |
- |
-# Secondary CEs for Jamo L-after-L, greater than Jamo V & T. |
-&\u0313 # COMBINING COMMA ABOVE (second primary ignorable) |
-=\u1100|\u1100 |
-=\u1103|\u1103 |
-=\u1107|\u1107 |
-=\u1109|\u1109 |
-=\u110C|\u110C |
- |
-# Compound L Jamo map to equivalent expansions of primary+secondary CE. |
-&\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK |
-&\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT |
-&\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP |
-&\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS |
-&\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC |
- |
-* compare |
-<1 \u1100\u1161 |
-= \uAC00 |
-<2 \u1100\u1162 |
-= \uAC1C |
-<2 \u1100\u1100\u1161 |
-= \u1100\uAC00 |
-= \u1101\u1161 |
-= \uAE4C |
-<3 \u3132\u1161 |
- |
-** test: Hangul syllables in prefix & in the interior of a contraction |
-@ rules |
-&x=\u1100\u1161|a\u1102\u1162z |
-* compare |
-<1 \u1100\u1161x |
-= \u1100\u1161a\u1102\u1162z |
-= \u1100\u1161a\uB0B4z |
-= \uAC00a\u1102\u1162z |
-= \uAC00a\uB0B4z |
- |
-** test: digits are unsafe-backwards when numeric=on |
-@ root |
-% numeric=on |
-* compare |
-# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". |
-# We need to back up before the identical prefix "1" and compare the full numbers. |
-<1 11b |
-<1 101a |
- |
-** test: simple locale data test |
-@ locale de |
-* compare |
-<1 a |
-<2 ä |
-<1 ae |
-<2 æ |
- |
-@ locale de-u-co-phonebk |
-* compare |
-<1 a |
-<1 ae |
-<2 ä |
-<2 æ |
- |
-# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. |
- |
-** test: DataDrivenCollationTest/TestMorePinyin |
-# Testing the primary strength. |
-@ locale zh |
-% strength=primary |
-* compare |
-< lā |
-= lĀ |
-= Lā |
-= LĀ |
-< lān |
-= lĀn |
-< lē |
-= lĒ |
-= Lē |
-= LĒ |
-< lēn |
-= lĒn |
- |
-** test: DataDrivenCollationTest/TestLithuanian |
-# Lithuanian sort order. |
-@ locale lt |
-* compare |
-< cz |
-< č |
-< d |
-< iz |
-< j |
-< sz |
-< š |
-< t |
-< zz |
-< ž |
- |
-** test: DataDrivenCollationTest/TestLatvian |
-# Latvian sort order. |
-@ locale lv |
-* compare |
-< cz |
-< č |
-< d |
-< gz |
-< ģ |
-< h |
-< iz |
-< j |
-< kz |
-< ķ |
-< l |
-< lz |
-< ļ |
-< m |
-< nz |
-< ņ |
-< o |
-< rz |
-< ŗ |
-< s |
-< sz |
-< š |
-< t |
-< zz |
-< ž |
- |
-** test: DataDrivenCollationTest/TestEstonian |
-# Estonian sort order. |
-@ locale et |
-* compare |
-< sy |
-< š |
-< šy |
-< z |
-< zy |
-< ž |
-< v |
-< va |
-< w |
-< õ |
-< õy |
-< ä |
-< äy |
-< ö |
-< öy |
-< ü |
-< üy |
-< x |
- |
-** test: DataDrivenCollationTest/TestAlbanian |
-# Albanian sort order. |
-@ locale sq |
-* compare |
-< cz |
-< ç |
-< d |
-< dz |
-< dh |
-< e |
-< ez |
-< ë |
-< f |
-< gz |
-< gj |
-< h |
-< lz |
-< ll |
-< m |
-< nz |
-< nj |
-< o |
-< rz |
-< rr |
-< s |
-< sz |
-< sh |
-< t |
-< tz |
-< th |
-< u |
-< xz |
-< xh |
-< y |
-< zz |
-< zh |
- |
-** test: DataDrivenCollationTest/TestSimplifiedChineseOrder |
-# Sorted file has different order. |
-@ root |
-# normalization=on turned on & off automatically. |
-* compare |
-< \u5F20 |
-< \u5F20\u4E00\u8E3F |
- |
-** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash |
-# This pretty much crashes. |
-@ root |
-* compare |
-< \u0f71\u0f72\u0f80\u0f71\u0f72 |
-< \u0f80 |
- |
-** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems |
-# These are examples of strings that caused trouble in partial sort key testing. |
-@ locale th-TH |
-* compare |
-< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C |
-< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 |
-* compare |
-< \u0E01\u0E07\u0E01\u0E32\u0E23 |
-< \u0E01\u0E07\u0E42\u0E01\u0E49 |
-* compare |
-< \u0E01\u0E23\u0E19\u0E17\u0E32 |
-< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 |
-* compare |
-< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 |
-< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 |
-* compare |
-< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D |
-< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 |
- |
-** test: DataDrivenCollationTest/TestJavaStyleRule |
-# java.text allows rules to start as '<<<x<<<y...' |
-# we emulate this by assuming a &[first tertiary ignorable] in this case. |
-@ rules |
-&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b |
-* compare |
-= a |
-= equal |
-< z |
-< x |
-= b # x had become the new first primary ignorable |
-< w |
- |
-** test: DataDrivenCollationTest/TestShiftedIgnorable |
-# The UCA states that primary ignorables should be completely |
-# ignorable when following a shifted code point. |
-@ root |
-% alternate=shifted |
-% strength=quaternary |
-* compare |
-< a\u0020b |
-= a\u0020\u0300b |
-= a\u0020\u0301b |
-< a_b |
-= a_\u0300b |
-= a_\u0301b |
-< A\u0020b |
-= A\u0020\u0300b |
-= A\u0020\u0301b |
-< A_b |
-= A_\u0300b |
-= A_\u0301b |
-< a\u0301b |
-< A\u0301b |
-< a\u0300b |
-< A\u0300b |
- |
-** test: DataDrivenCollationTest/TestNShiftedIgnorable |
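-# The UCA states that primary ignorables should be completely |
-# ignorable when following a shifted code point. |
-# Here alternate=non-ignorable, so nothing is shifted: the accents that follow |
-# space or underscore are expected to remain significant (contrast with TestShiftedIgnorable). |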
-@ root |
-% alternate=non-ignorable |
-% strength=tertiary |
-* compare |
-< a\u0020b |
-< A\u0020b |
-< a\u0020\u0301b |
-< A\u0020\u0301b |
-< a\u0020\u0300b |
-< A\u0020\u0300b |
-< a_b |
-< A_b |
-< a_\u0301b |
-< A_\u0301b |
-< a_\u0300b |
-< A_\u0300b |
-< a\u0301b |
-< A\u0301b |
-< a\u0300b |
-< A\u0300b |
- |
-** test: DataDrivenCollationTest/TestSafeSurrogates |
-# It turned out that surrogates were not skipped properly |
-# when iterating backwards if they were in the middle of a |
-# contraction. This test ensures that the problem stays fixed. |
-@ rules |
-&a < x\ud800\udc00b |
-* compare |
-< a |
-< x\ud800\udc00b |
- |
-** test: DataDrivenCollationTest/da_TestPrimary |
-# This test goes through primary strength cases |
-@ locale da |
-% strength=primary |
-* compare |
-< Lvi |
-< Lwi |
-* compare |
-< L\u00e4vi |
-< L\u00f6wi |
-* compare |
-< L\u00fcbeck |
-= Lybeck |
- |
-** test: DataDrivenCollationTest/da_TestTertiary |
-# This test goes through tertiary strength cases |
-@ locale da |
-% strength=tertiary |
-* compare |
-< Luc |
-< luck |
-* compare |
-< luck |
-< L\u00fcbeck |
-* compare |
-< lybeck |
-< L\u00fcbeck |
-* compare |
-< L\u00e4vi |
-< L\u00f6we |
-* compare |
-< L\u00f6ww |
-< mast |
- |
-* compare |
-< A/S |
-< ANDRE |
-< ANDR\u00c9 |
-< ANDREAS |
-< AS |
-< CA |
-< \u00c7A |
-< CB |
-< \u00c7C |
-< D.S.B. |
-< DA |
-< \u00d0A |
-< DB |
-< \u00d0C |
-< DSB |
-< DSC |
-< EKSTRA_ARBEJDE |
-< EKSTRABUD0 |
-< H\u00d8ST |
-< HAAG |
-< H\u00c5NDBOG |
-< HAANDV\u00c6RKSBANKEN |
-< Karl |
-< karl |
-< NIELS\u0020J\u00d8RGEN |
-< NIELS-J\u00d8RGEN |
-< NIELSEN |
-< R\u00c9E,\u0020A |
-< REE,\u0020B |
-< R\u00c9E,\u0020L |
-< REE,\u0020V |
-< SCHYTT,\u0020B |
-< SCHYTT,\u0020H |
-< SCH\u00dcTT,\u0020H |
-< SCHYTT,\u0020L |
-< SCH\u00dcTT,\u0020M |
-< SS |
-< \u00df |
-< SSA |
-< STORE\u0020VILDMOSE |
-< STOREK\u00c6R0 |
-< STORM\u0020PETERSEN |
-< STORMLY |
-< THORVALD |
-< THORVARDUR |
-< \u00feORVAR\u00d0UR |
-< THYGESEN |
-< VESTERG\u00c5RD,\u0020A |
-< VESTERGAARD,\u0020A |
-< VESTERG\u00c5RD,\u0020B |
-< \u00c6BLE |
-< \u00c4BLE |
-< \u00d8BERG |
-< \u00d6BERG |
- |
-* compare |
-< andere |
-< chaque |
-< chemin |
-< cote |
-< cot\u00e9 |
-< c\u00f4te |
-< c\u00f4t\u00e9 |
-< \u010du\u010d\u0113t |
-< Czech |
-< hi\u0161a |
-< irdisch |
-< lie |
-< lire |
-< llama |
-< l\u00f5ug |
-< l\u00f2za |
-< lu\u010d |
-< luck |
-< L\u00fcbeck |
-< lye |
-< l\u00e4vi |
-< L\u00f6wen |
-< m\u00e0\u0161ta |
-< m\u00eer |
-< myndig |
-< M\u00e4nner |
-< m\u00f6chten |
-< pi\u00f1a |
-< pint |
-< pylon |
-< \u0161\u00e0ran |
-< savoir |
-< \u0160erb\u016bra |
-< Sietla |
-< \u015blub |
-< subtle |
-< symbol |
-< s\u00e4mtlich |
-< verkehrt |
-< vox |
-< v\u00e4ga |
-< waffle |
-< wood |
-< yen |
-< yuan |
-< yucca |
-< \u017eal |
-< \u017eena |
-< \u017den\u0113va |
-< zoo0 |
-< Zviedrija |
-< Z\u00fcrich |
-< zysk0 |
-< \u00e4ndere |
- |
-** test: DataDrivenCollationTest/hi_TestNewRules |
-# This test goes through new rules and tests against old rules |
-@ locale hi |
-* compare |
-< कॐ |
-< कं |
-< कँ |
-< कः |
- |
-** test: DataDrivenCollationTest/ro_TestNewRules |
-# This test goes through new rules and tests against old rules |
-@ locale ro |
-* compare |
-< xAx |
-< xă |
-< xĂ |
-< Xă |
-< XĂ |
-< xăx |
-< xĂx |
-< xâ |
-< xÂ |
-< Xâ |
-< XÂ |
-< xâx |
-< xÂx |
-< xb |
-< xIx |
-< xî |
-< xÎ |
-< Xî |
-< XÎ |
-< xîx |
-< xÎx |
-< xj |
-< xSx |
-< xș |
-= xş |
-< xȘ |
-= xŞ |
-< Xș |
-= Xş |
-< XȘ |
-= XŞ |
-< xșx |
-= xşx |
-< xȘx |
-= xŞx |
-< xT |
-< xTx |
-< xț |
-= xţ |
-< xȚ |
-= xŢ |
-< Xț |
-= Xţ |
-< XȚ |
-= XŢ |
-< xțx |
-= xţx |
-< xȚx |
-= xŢx |
-< xU |
- |
-** test: DataDrivenCollationTest/testOffsets |
-# This tests cases where forwards and backwards iteration get different offsets |
-@ locale en |
-% strength=tertiary |
-* compare |
-< a\uD800\uDC00\uDC00 |
-< b\uD800\uDC00\uDC00 |
-* compare |
-< \u0301A\u0301\u0301 |
-< \u0301B\u0301\u0301 |
-* compare |
-< abcd\r\u0301 |
-< abce\r\u0301 |
-# TODO: test offsets in new CollationTest |
- |
-# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. |
- |
-** test: was ICU 52 cmsccoll/TestRedundantRules |
-@ rules |
-& a < b < c < d& [before 1] c < m |
-* compare |
-<1 a |
-<1 b |
-<1 m |
-<1 c |
-<1 d |
- |
-@ rules |
-& a < b <<< c << d <<< e& [before 3] e <<< x |
-* compare |
-<1 a |
-<1 b |
-<3 c |
-<2 d |
-<3 x |
-<3 e |
- |
-@ rules |
-& a < b <<< c << d <<< e <<< f < g& [before 1] g < x |
-* compare |
-<1 a |
-<1 b |
-<3 c |
-<2 d |
-<3 e |
-<3 f |
-<1 x |
-<1 g |
- |
-@ rules |
-& a <<< b << c < d& a < m |
-* compare |
-<1 a |
-<3 b |
-<2 c |
-<1 m |
-<1 d |
- |
-@ rules |
-&a<b<<b\u0301 &z<b |
-* compare |
-<1 a |
-<1 b\u0301 |
-<1 z |
-<1 b |
- |
-@ rules |
-&z<m<<<q<<<m |
-* compare |
-<1 z |
-<1 q |
-<3 m |
- |
-@ rules |
-&z<<<m<q<<<m |
-* compare |
-<1 z |
-<1 q |
-<3 m |
- |
-@ rules |
-& a < b < c < d& r < c |
-* compare |
-<1 a |
-<1 b |
-<1 d |
-<1 r |
-<1 c |
- |
-@ rules |
-& a < b < c < d& c < m |
-* compare |
-<1 a |
-<1 b |
-<1 c |
-<1 m |
-<1 d |
- |
-@ rules |
-& a < b < c < d& a < m |
-* compare |
-<1 a |
-<1 m |
-<1 b |
-<1 c |
-<1 d |
- |
-** test: was ICU 52 cmsccoll/TestExpansionSyntax |
-# The following two rules should sort the particular list of strings the same. |
-@ rules |
-&AE <<< a << b <<< c &d <<< f |
-* compare |
-<1 AE |
-<3 a |
-<2 b |
-<3 c |
-<1 d |
-<3 f |
- |
-@ rules |
-&A <<< a / E << b / E <<< c /E &d <<< f |
-* compare |
-<1 AE |
-<3 a |
-<2 b |
-<3 c |
-<1 d |
-<3 f |
- |
-# The following two rules should sort the particular list of strings the same. |
-@ rules |
-&AE <<< a <<< b << c << d < e < f <<< g |
-* compare |
-<1 AE |
-<3 a |
-<3 b |
-<2 c |
-<2 d |
-<1 e |
-<1 f |
-<3 g |
- |
-@ rules |
-&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g |
-* compare |
-<1 AE |
-<3 a |
-<3 b |
-<2 c |
-<2 d |
-<1 e |
-<1 f |
-<3 g |
- |
-# The following two rules should sort the particular list of strings the same. |
-@ rules |
-&AE <<< B <<< C / D <<< F |
-* compare |
-<1 AE |
-<3 B |
-<3 F |
-<1 AED |
-<3 C |
- |
-@ rules |
-&A <<< B / E <<< C / ED <<< F / E |
-* compare |
-<1 AE |
-<3 B |
-<3 F |
-<1 AED |
-<3 C |
- |
-** test: never reorder trailing primaries |
-@ root |
-% reorder Zzzz Grek |
-* compare |
-<1 L |
-<1 字 |
-<1 Ω |
-<1 \uFFFD |
-<1 \uFFFF |
- |
-** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes |
-@ rules |
-&u=ab|cd |
-&v=b|ce |
-* compare |
-<1 abc |
-<1 abcc |
-<1 abcf |
-<1 abcd |
-= abu |
-<1 abce |
-= abv |
- |
-# With the following rules, there is only one prefix per composite ĉ or ç, |
-# but both prefixes apply to just c in NFD form. |
-# We would get different results for composed vs. NFD input |
-# if we fell back directly from longest-prefix mappings to no-prefix mappings. |
-@ rules |
-&x=op|ĉ |
-&y=p|ç |
-* compare |
-<1 opc |
-<2 opć |
-<1 opcz |
-<1 opd |
-<1 opĉ |
-= opc\u0302 |
-= opx |
-<1 opç |
-= opc\u0327 |
-= opy |
- |
-# The mapping that is used is the one with the longest matching prefix for which |
-# there is also a suffix match; among several mappings for that prefix, the longest suffix match wins. |
-@ rules |
-&❶=d |
-&❷=de |
-&❸=def |
-&①=c|d |
-&②=c|de |
-&③=c|def |
-&④=bc|d |
-&⑤=bc|de |
-&⑥=bc|def |
-&⑦=abc|d |
-&⑧=abc|de |
-&⑨=abc|def |
-* compare |
-<1 9aadzz |
-= 9aa❶zz |
-<1 9aadez |
-= 9aa❷z |
-<1 9aadef |
-= 9aa❸ |
-<1 9acdzz |
-= 9ac①zz |
-<1 9acdez |
-= 9ac②z |
-<1 9acdef |
-= 9ac③ |
-<1 9bcdzz |
-= 9bc④zz |
-<1 9bcdez |
-= 9bc⑤z |
-<1 9bcdef |
-= 9bc⑥ |
-<1 abcdzz |
-= abc⑦zz |
-<1 abcdez |
-= abc⑧z |
-<1 abcdef |
-= abc⑨ |
- |
-** test: prefix + discontiguous contraction with missing prefix contraction |
-# Unfortunate terminology: The first "prefix" here is the pre-context, |
-# the second "prefix" refers to the contraction/relation string that is |
-# one shorter than the one being tested. |
-@ rules |
-&x=p|e |
-&y=p|ê |
-&z=op|ê |
-# No mapping for op|e: |
-# Discontiguous contraction matching should not match op|ê in opệ |
-# because it would have to skip the dot below and extend a match on op|e by the circumflex, |
-# but there is no match on op|e. |
-* compare |
-<1 oPe |
-<1 ope |
-= opx |
-<1 opệ |
-= opy\u0323 # y not z |
-<1 opê |
-= opz |
- |
-# We cannot test for fallback by whether the contraction default CE32 |
-# is for another contraction. With the following rules, there is no mapping for op|e, |
-# and the fallback to prefix p has no contractions. |
-@ rules |
-&x=p|e |
-&z=op|ê |
-* compare |
-<1 oPe |
-<1 ope |
-= opx |
-<2 opệ |
-= opx\u0323\u0302 # x not z |
-<1 opê |
-= opz |
- |
-# One more variation: Fallback to the simple code point, no shorter non-empty prefix. |
-@ rules |
-&x=e |
-&z=op|ê |
-* compare |
-<1 ope |
-= opx |
-<3 oPe |
-= oPx |
-<2 opệ |
-= opx\u0323\u0302 # x not z |
-<1 opê |
-= opz |
- |
-** test: maxVariable via rules |
-@ rules |
-[maxVariable space][alternate shifted] |
-* compare |
-= \u0020 |
-= \u000A |
-<1 . |
-<1 ° # degree sign |
-<1 $ |
-<1 0 |
- |
-** test: maxVariable via setting |
-@ root |
-% maxVariable=currency |
-% alternate=shifted |
-* compare |
-= \u0020 |
-= \u000A |
-= . |
-= ° # degree sign |
-= $ |
-<1 0 |
- |
-** test: ICU4J CollationMiscTest/TestContractionClosure (ää) |
-# This tests canonical closure, but it also tests that CollationFastLatin |
-# bails out properly for contractions with combining marks. |
-# For that we need pairs of strings that remain in the Latin fastpath |
-# long enough, hence the extra "= b" lines. |
-@ rules |
-&b=\u00e4\u00e4 |
-* compare |
-<1 b |
-= \u00e4\u00e4 |
-= b |
-= a\u0308a\u0308 |
-= b |
-= \u00e4a\u0308 |
-= b |
-= a\u0308\u00e4 |
- |
-** test: ICU4J CollationMiscTest/TestContractionClosure (Å) |
-@ rules |
-&b=\u00C5 |
-* compare |
-<1 b |
-= \u00C5 |
-= b |
-= A\u030A |
-= b |
-= \u212B |
- |
-** test: reset-before on already-tailored characters, ICU ticket 10108 |
-@ rules |
-&a<w<<x &[before 2]x<<y |
-* compare |
-<1 a |
-<1 w |
-<2 y |
-<2 x |
- |
-@ rules |
-&a<<w<<<x &[before 2]x<<y |
-* compare |
-<1 a |
-<2 y |
-<2 w |
-<3 x |
- |
-@ rules |
-&a<w<x &[before 2]x<<y |
-* compare |
-<1 a |
-<1 w |
-<1 y |
-<2 x |
- |
-@ rules |
-&a<w<<<x &[before 2]x<<y |
-* compare |
-<1 a |
-<1 y |
-<2 w |
-<3 x |
- |
-** test: numeric collation with other settings, ICU ticket 9092 |
-@ root |
-% strength=identical |
-% caseFirst=upper |
-% numeric=on |
-* compare |
-<1 100\u0020a |
-<1 101 |
- |
-** test: collation type fallback from unsupported type, ICU ticket 10149 |
-@ locale fr-CA-u-co-phonebk |
-# Expect the same result as with fr-CA, using backwards-secondary order. |
-# That is, we should fall back from the unsupported collation type |
-# to the locale's default collation type. |
-* compare |
-<1 cote |
-<2 côte |
-<2 coté |
-<2 côté |
- |
-** test: @ is equivalent to [backwards 2], ICU ticket 9956 |
-@ rules |
-&b<a @ &v<<w |
-* compare |
-<1 b |
-<1 a |
-<1 cote |
-<2 côte |
-<2 coté |
-<2 côté |
-<1 v |
-<2 w |
-<1 x |
- |
-** test: shifted+reordering, ICU ticket 9507 |
-@ root |
-% reorder Grek punct space |
-% alternate=shifted |
-% strength=quaternary |
-# Which primaries are "variable" should be determined without script reordering, |
-# and then primaries should be reordered whether they are shifted to quaternary or not. |
-* compare |
-<4 ( # punctuation |
-<4 ) |
-<4 \u0020 # space |
-<1 ` # symbol |
-<1 ^ |
-<1 $ # currency symbol |
-<1 € |
-<1 0 # numbers |
-<1 ε # Greek |
-<1 e # Latin |
-<1 e(e |
-<4 e)e |
-<4 e\u0020e |
-<4 ee |
-<3 e(E |
-<4 e)E |
-<4 e\u0020E |
-<4 eE |
- |
-** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 |
-@ rules |
-&\u0001<<<b<<<B |
-% caseFirst=upper |
-* compare |
-<1 aaa |
-<3 aaaB |
- |
-** test: secondary+case ignores secondary ignorables, ICU ticket 9355 |
-@ rules |
-&\u0001<<<b<<<B |
-% strength=secondary |
-% caseLevel=on |
-* compare |
-<1 a |
-= ab |
-= aB |
- |
-** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 |
-@ rules |
-&[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 |
-* compare |
-<1 ൗx |
-<2 ൌx |
-<1 ൗy |
-<2 ൌy |
- |
-** test: quoted apostrophe in compact syntax, ICU ticket 8204 |
-@ rules |
-&q<<*a''c |
-* compare |
-<1 d |
-<1 p |
-<1 q |
-<2 a |
-<2 \u0027 |
-<2 c |
-<1 r |
- |
-# ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" |
-** test: locale -u- with collation keywords, ICU ticket 8260 |
-@ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 |
-* compare |
-<4 \u0020 # space is shifted, strength=quaternary |
-<1 ! # punctuation is regular |
-<1 2 |
-<1 12 # numeric sorting |
-<1 B |
-<c b # uppercase first on case level |
-<1 x\u0301\u0308 |
-<2 x\u0308\u0301 # normalization off |
- |
-** test: locale @ with collation keywords, ICU ticket 8260 |
-@ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted |
-* compare |
-<4 $ # currency symbols are shifted, strength=quaternary |
-<1 àla |
-<2 alà # backwards secondary level |
- |
-** test: locale -u- with script reordering, ICU ticket 8260 |
-@ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai |
-* compare |
-<1 \u0020 |
-<1 あ |
-<1 ☂ |
-<1 Ω |
-<1 丂 |
-<1 ж |
-<1 L |
-<1 4 |
-<1 Ձ |
-<1 अ |
-<1 ሄ |
-<1 ฉ |
- |
-** test: locale @collation=type should be case-insensitive |
-@ locale de@coLLation=PhoneBook |
-* compare |
-<1 ae |
-<2 ä |
-<3 Ä |
- |
-** test: import root search rules plus German phonebook rules, ICU ticket 8962 |
-@ locale de-u-co-search |
-* compare |
-<1 = |
-<1 ≠ |
-<1 a |
-<1 ae |
-<2 ä |
- |
-# Once more, but with runtime builder. |
-@ rules |
-[import und-u-co-search][import de-u-co-phonebk] |
-* compare |
-<1 = |
-<1 ≠ |
-<1 a |
-<1 ae |
-<2 ä |
- |
-# Once again, with import from "root" not "und" (as in a proper language tag). |
-@ rules |
-[import root-u-co-search][import de-u-co-phonebk] |
-* compare |
-<1 = |
-<1 ≠ |
-<1 a |
-<1 ae |
-<2 ä |
- |
-** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998 |
-# Greek should sort Greek first. |
-@ rules |
-[import el] |
-* compare |
-<1 4 |
-<1 Ω |
-<1 L |
- |
-# Import Greek, and then reset the reordering. |
-@ rules |
-[import el][reorder Zzzz] |
-* compare |
-<1 4 |
-<1 L |
-<1 Ω |
- |
-# "others" is a synonym for Zzzz. |
-@ rules |
-[import el][reorder others] |
-* compare |
-<1 4 |
-<1 L |
-<1 Ω |
- |
-** test: regression test for CollationFastLatinBuilder, ICU ticket 11388 |
-@ rules |
-&x<<aa<<<Aa<<<AA |
-% strength=secondary |
-* compare |
-<1 AA |
-<2 Aẩ |
-<2 aą |
-* compare |
-<1 AA |
-<2 aą |
- |
-** test: tailor tertiary-after a common tertiary where there is a lower one |
-# Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one. |
-# See ICU ticket 11448 & CLDR ticket 7222. |
-@ rules |
-&あ<<<x<<<y<<<z |
-* compare |
-<1 ぁ |
-<3 あ |
-<3 x |
-<3 y |
-<3 z |
-<3 ァ |
-<1 い |
- |
-** test: tailor tertiary-after a below-common tertiary |
-@ rules |
-&ぁ<<<x<<<y<<<z |
-* compare |
-<1 ぁ |
-<3 x |
-<3 y |
-<3 z |
-<3 あ |
-<3 ァ |
-<1 い |
- |
-** test: tailor tertiary-before a common tertiary where there is a lower one |
-@ rules |
-&[before 3]あ<<<x<<<y<<<z |
-* compare |
-<1 ぁ |
-<3 x |
-<3 y |
-<3 z |
-<3 あ |
-<3 ァ |
-<1 い |
- |
-** test: tailor tertiary-before a below-common tertiary |
-@ rules |
-&[before 3]ぁ<<<x<<<y<<<z |
-* compare |
-<1 x |
-<3 y |
-<3 z |
-<3 ぁ |
-<3 あ |
-<3 ァ |
-<1 い |
- |
-** test: reorder single scripts not groups, ICU ticket 11449 |
-@ root |
-% reorder Goth Latn |
-* compare |
-<1 4 |
-<1 𐌰 # Gothic |
-<1 L |
-<1 Ω |
-# Before ICU 55, the following reordered together with Gothic. |
-<1 𐌈 # Old Italic |
-<1 𐑐 # Shavian |