OLD | NEW |
| (Empty) |
1 # Copyright (c) 2001-2015 International Business Machines | |
2 # Corporation and others. All Rights Reserved. | |
3 # | |
4 # RBBI Test Data | |
5 # | |
6 # File: rbbitst.txt | |
7 # | |
8 # The format of this file looks vaguely like some kind of xml-ish markup, | |
9 # but it is NOT. The syntax is this.. | |
10 # | |
11 # <word> any following data is for word break testing | |
12 # <sent> any following data is for sentence break testing | |
13 # <line> any following data is for line break testing | |
14 # <char> any following data is for char break testing | |
15 # <locale local_name> Switch to the named locale at the next occurrence of <wo
rd>, <sent>, etc. | |
16 # <data> ... </data> test data. May span multiple lines. | |
17 # <> Break position, status == 0 | |
18 # • Break position, status == 0 (Bullet, \u2022) | |
19 # <nnn> Break position, status == nnn | |
20 # \ Escape. Normal ICU unescape applied. | |
21 # \ at end of line -> Line Continuation. Remove both the backslash and t
he new line | |
22 # | |
23 # In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended. | |
24 # In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended | |
25 # | |
26 # There are two copies of this file in the source repository, | |
27 # [ICU4C] source/test/testdata/rbbitst.txt | |
28 # [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | |
29 # | |
30 # ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sur
e they | |
31 # are merged back into ICU4C's copy of the file, lest they get overwritten late
r. | |
32 # TODO: figure out how to have a single copy of the file for use by both C and
Java. | |
33 | |
34 | |
35 ## FILTERED BREAK TESTS | |
36 | |
37 # (William Bradford, public domain. http://catalog.hathitrust.org/Record/0086512
24 ) - edited. | |
38 <locale en> | |
39 <sent> | |
40 <data>\ | |
41 •In the meantime Mr. •Weston arrived with his small ship, which he had now recov
ered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going e
ast was to meet with Mr. •Weston, took this opportunity to call him to account f
or some abuses he had to lay to his charge.•</data> | |
42 | |
43 <locale en@ss=standard> | |
44 <sent> | |
45 <data>\ | |
46 •In the meantime Mr. Weston arrived with his small ship, which he had now recove
red. •Capt. Gorges, who informed the Sgt. here that one purpose of his going eas
t was to meet with Mr. Weston, took this opportunity to call him to account for
some abuses he had to lay to his charge.•</data> | |
47 | |
48 ## END FILTERED BREAK TESTS | |
49 | |
50 <locale> | |
51 | |
52 # Temp debugging tests | |
53 <sent> | |
54 <data>•\u00c0.•</data> | |
55 | |
56 #<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8
\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u202
9•</data> | |
57 ################################################################################
######## | |
58 # | |
59 # | |
60 # G r a p h e m e C l u s t e r T e s t s | |
61 # | |
62 # | |
63 ################################################################################
########## | |
64 <char> | |
65 | |
66 <data>•a•b•c• •,•\u0666•</data> # Quick Test | |
67 <data>•\r•\r•\r\n•\r\n•\n•\r•</data> # don't break CR/LF | |
68 | |
69 # Always break after controls. Combining chars don't combine with them. | |
70 <data>•\u0003•\N{COMBINING GRAVE ACCENT}•\r•\N{COMBINING GRAVE ACCENT}•</data> | |
71 <data>•\u0085•\N{COMBINING MACRON}•A\N{COMBINING MACRON}•</data> | |
72 | |
73 # Surrogates | |
74 <data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data> | |
75 <data>•\ud800\udc00•\udbff\udfff•a•</data> | |
76 | |
77 # Extend (Combining chars) combine. | |
78 <data>•A\N{COMBINING GRAVE ACCENT}•B•</data> | |
79 <data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data> | |
80 <data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•<
/data> | |
81 | |
82 <data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304•</data> | |
83 | |
84 # Don't break Hangul Syllables | |
85 # L : \u1100 | |
86 # V : \u1161 | |
87 # T : \u11A8 | |
88 # LV : \uAC00 | |
89 # LVT : \uAC01 | |
90 | |
91 <data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT | |
92 <data>•\u1100\u1161•\u1100\u1161•</data> | |
93 <data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data> | |
94 <data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data> | |
95 <data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data> | |
96 | |
97 | |
98 | |
99 # Hindi combining chars. (An old test) | |
100 # TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters | |
101 #<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930• | |
102 #•\u0939•\u094c•\u0964•</data> | |
103 #<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</dat
a> | |
104 | |
105 | |
106 # Bug 1587. Tamil. \u0baa\u0bc1 is an Extended Grapheme Cluster | |
107 <data>•\u0baa\u0bc1•\u0baa\u0bc1•</data> | |
108 | |
109 # Regression test for bug 1889 | |
110 <data>•\u0f40\u0f7d•\u0000•\u0f7e•</data> | |
111 | |
112 | |
113 # 0xffff is a legal character, and should not stop the break iterator early. | |
114 # (Requires special casing in implementation, which is why it gets a test.) | |
115 <data>•\uffff•\uffff• •a•</data> | |
116 | |
117 # Treat Japanese Half Width voicing marks as combining | |
118 <data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data> | |
119 | |
120 ################################################################################
######## | |
121 # | |
122 # | |
123 # E x t e n d e d G r a p h e m e C l u s t e r T e s t s | |
124 # | |
125 # | |
126 ################################################################################
########## | |
127 #<xgc> | |
128 | |
129 # Plain Vanilla grapheme clusters | |
130 #<data>•a•b•c•</data> | |
131 #<data>•a\u0301\u0302• •b\u0303\u0304•</data> | |
132 | |
133 # Assorted Hindi combining marks | |
134 #<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949
• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data> | |
135 | |
136 # Thai Clusters | |
137 # $Prepend $Extend* $PrependBase $Extend*; | |
138 # | |
139 #<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02•
•</data> | |
140 | |
141 | |
142 ################################################################################
######## | |
143 # | |
144 # | |
145 # W o r d B o u n d a r y T e s t s | |
146 # | |
147 # | |
148 ################################################################################
########## | |
149 | |
150 <word> | |
151 # | |
152 # Quick sanity test | |
153 # | |
154 <data>•hello<200> •there<200> •goodbye<200></data> | |
155 <data>•hello<200> •12345<100> •,•</data> | |
156 | |
157 | |
158 # | |
159 # Test data originally in RBBIAPITest::TestFirstNextFollowing() and TestLastPrev
iousPreceding() | |
160 # | |
161 | |
162 <word> | |
163 <data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200
>?• •2.25<100></data> | |
164 | |
165 | |
166 | |
167 # | |
168 # Data originally from TestDefaultRuleBasedWordIteration() | |
169 # | |
170 <data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<20
0> •\u092f\u0939<200> •</data> | |
171 <data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0
905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data> | |
172 | |
173 #Hindi Numbers | |
174 <data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\
N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<
200>\n•</data> | |
175 | |
176 <data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.1
0<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data> | |
177 | |
178 <data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200>
•STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<1
00>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•tim
e<200> •</data> | |
179 | |
180 #Hangul | |
181 <data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u111
2\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how
<200> •are<200> •you<200> •</data> | |
182 | |
183 <data>•Hello<200>,• •how<200> •are<200> •you<200> •\uc5f0\ud569<200> •\uc7a5\ub8
5c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11
ab\u110b\u1175\u11ab<200> •</data> | |
184 | |
185 # Words containing non-BMP letters | |
186 <data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATI
CAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200
> •</data> | |
187 | |
188 # Unassigned code points | |
189 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> | |
190 | |
191 # Hiragana & Katakana stay together, but separate from each other and Latin. | |
192 # *** what to do about theoretical combos of chars? i.e. hiragana + accent | |
193 #<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINI
NG ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}
\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA
LETTER N}<400>def<200>#•</data> | |
194 | |
195 # test normalization/dictionary handling of halfwidth katakana: same dictionary
phrase in fullwidth and halfwidth | |
196 <data>•芽キャベツ<400>芽キャベツ<400></data> | |
197 | |
198 # more Japanese tests | |
199 # TODO: some script=common characters in the Hiragana and the Katakana block may
not be treated correctly | |
200 # (was formerly true for U+30FC); need to check and fix if so. | |
201 #<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>
は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | |
202 <data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも
<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | |
203 | |
204 # Testing of word boundary for dictionary word containing both kanji and kana | |
205 <data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> | |
206 | |
207 # Testing of Chinese segmentation (taken from a Chinese news article) | |
208 <data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>
到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的
<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属
意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</da
ta> | |
209 | |
210 # Words with interior formatting characters | |
211 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data
> | |
212 | |
213 # to test for bug #4097779 | |
214 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> | |
215 | |
216 # fullwidth numeric, midletter characters etc should be treated like their halfw
idth counterparts | |
217 # <data>•ISN'T<200> •19<100>日<400></data> | |
218 # why was this added with the dbbi stuff? | |
219 | |
220 # to test for bug #4098467 | |
221 # What follows is a string of Korean characters (I found it in the Yellow P
ages | |
222 # ad for the Korean Presbyterian Church of San Francisco, and I hope I tran
scribed | |
223 # it correctly), first as precomposed syllables, and then as conjoining jam
o. | |
224 # Both sequences should be semantically identical and break the same way. | |
225 # precomposed syllables... | |
226 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad
50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u11
0b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11
bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> | |
227 | |
228 # more Korean tests (Jamo not tested here, not counted as dictionary characters) | |
229 # Disable them now because we don't include a Korean dictionary. | |
230 #<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<20
0>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> | |
231 #<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd
<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200>
•\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> | |
232 | |
233 <data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</dat
a> | |
234 | |
235 <data>•\u06c9<200>\uc799\ufffa•</data> | |
236 | |
237 | |
238 # | |
239 # Try some words from other scripts. | |
240 # | |
241 | |
242 # Try some words from other scripts. | |
243 # Greek, Cyrillic, Hebrew, Arabic, Arabic-Indic digits, Georgian, Latin | |
244 # | |
245 <data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200
> •ABC<200> •</data> | |
246 | |
247 <data>•\u0301•A<200></data> | |
248 | |
249 | |
250 # | |
251 # Hindi word break tests, imported from the old RBBI tests. | |
252 # An historical note: a much earlier version of ICU break iterators had a nu
mber | |
253 # of special case rules for Hindi, which were tested by an earlier version of | |
254 # this test data. The current RBBI rules do not special case Hindi in | |
255 # any way, making this test data much less significant. | |
256 # | |
257 <data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u092
8\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u09
38\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903
<200> | |
258 •\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<20
0>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<20
0> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\
u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u09
67\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<10
0> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u
0930<200>\r•</data> | |
259 | |
260 # | |
261 # Failures from monkey tests | |
262 # | |
263 <data>•\u8527<400>\u02ba<200>\u0027\u0d42•\u00b7•\u09ea<100></data> | |
264 | |
265 # | |
266 # Jitterbug 5276 - treat Japanese half width voicing marks as Grapheme Extend | |
267 # | |
268 <data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data> | |
269 | |
270 # User guide example: | |
271 <data>•Parlez<200>-•vous<200> •français<200> •?•</data> | |
272 | |
273 # Test for #11673 | |
274 <word> | |
275 <data>•ジョージア<400> •</data> | |
276 | |
277 ################################################################################
######## | |
278 # | |
279 # | |
280 # S e n t e n c e B o u n d a r y T e s t s | |
281 # | |
282 # | |
283 ################################################################################
########## | |
284 | |
285 | |
286 # | |
287 # Test data originally from RBBI RBBITest::TestDefaultRuleBasedSentenceIteration
() | |
288 # | |
289 <sent> | |
290 | |
291 | |
292 <sent> | |
293 <data>•This\n<100></data> | |
294 <data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \ | |
295 doing? •This\n<100> costs $20,00,000. •</data> | |
296 | |
297 | |
298 # Sentence ending in a quote. | |
299 <data>•"Sentence ending with a quote." •Bye.•</data> | |
300 | |
301 # Sentence, and test data, ending without a period or other terminator. | |
302 <data>•Here is a random sentence, no ending period<100></data> | |
303 | |
304 | |
305 <data>• (This is it). •Testing the sentence iterator. •\ | |
306 "This isn't it." •Hi! \ | |
307 •This is a simple sample sentence. •(This is it.) •This is a simple sample sente
nce. •\ | |
308 "This isn't it." •\ | |
309 Hi! •This is a simple sample sentence. •It does not have to make any sense as yo
u can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura
. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don'
t rock the boat.\u2029•Because I am the daddy, that is why. | |
310 •Not on my time (el timo.)! •</data> | |
311 | |
312 <data>•Hello. •So what!!\u2029•"But now," he said, \ | |
313 "I know!" •\ | |
314 Harris thumbed down several, including "Away We Go" (which became the huge succe
ss Oklahoma!). •One species, B. anthracis, is highly virulent. | |
315 •Wolf said about Sounder:\ | |
316 "Beautifully thought-out and directed." •\ | |
317 Have you ever said, "This is where\tI shall live"? •He answered, \ | |
318 "You may not!" •Another popular saying is: "How do you do?". \n•\ | |
319 Yet another popular saying is: \ | |
320 'I'm fine thanks.' •\ | |
321 What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tall!
!\ | |
322 •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100></data> | |
323 | |
324 <data>•No breaks when . is surrounded by UPPER.Case letters. •</data> | |
325 <data>•No breaks when . is followed by Numeric .4 a.4 C.4 3.1 .•</data> | |
326 <data>•No breaks when . is followed by a lower, with possible intervening punct
.,a .$a .)a. •</data> | |
327 | |
328 # | |
329 # Sentence Breaks: no break at the boundary between CJK and other letters | |
330 # | |
331 <data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\
u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029
•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4JAVA\u821c\u8165\u7
fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d4
6\u97e5\u7751\u3002•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4
\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u85
60\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2048•He said, "I can go there."\u2
029•Bye, now.•</data> | |
332 | |
333 # | |
334 # Treat fullwidth variants of .!? the same as their | |
335 # normal counterparts | |
336 # | |
337 <data>•I know I'm right\uff0e •Right\uff1f •Right\uff01 •</data> | |
338 | |
339 | |
340 # | |
341 # Don't break sentences at boundary between CJK and digits | |
342 # | |
343 <data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e48888\u821c\u8
165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u251
0\u5d46\u97e5\u7751\u3002•Bye, now<100></data> | |
344 | |
345 # | |
346 # Breaks around '(' following a sentence TERM. (Rule 9) | |
347 # | |
348 <data>•How do you do?(•Fine). •</data> | |
349 <data>•How do you do? •(Fine). •</data> | |
350 <data>•How do you do?(•fine). •</data> | |
351 <data>•How do you do? •(fine). •</data> | |
352 | |
353 # | |
354 <data>•Hello.123<100></data> # Rule 6 | |
355 <data>•Hello?•123<100></data> | |
356 | |
357 <data>•HELLO.Bye<100></data> # Rule 7 | |
358 <data>•HELLO?•Bye<100></data> | |
359 | |
360 <data>•Hello.goodbye<100></data> #Rule 8 | |
361 <data>•Hello. •Goodbye<100></data> | |
362 <data>•Hello. goodbye<100></data> | |
363 | |
364 | |
365 | |
366 # | |
367 # test for bug #4158381: No breaks when there are no terminators around | |
368 # | |
369 <data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\
<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible
, work the same on all platforms. •</data> | |
370 <data>•Another test.\u2029•</data> | |
371 | |
372 # test for bug #4143071: Make sure sentences that end with digits | |
373 # work right | |
374 # | |
375 <data>•Today is the 27th of May, 1998. •</data> | |
376 <data>•Tomorrow with be 28 May 1998. •</data> | |
377 <data>•The day after will be the 30th.\u2029•</data> | |
378 | |
379 # test for bug #4152416: Make sure sentences ending with a capital | |
380 # letter are treated correctly | |
381 # | |
382 <data>•The type of all primitive \<code>boolean\</code> values accessed in the t
arget VM. •Calls to xxx will return an implementor of this interface. \u2029•<
/data> | |
383 | |
384 # test for bug #4152117: Make sure sentence breaking is handling | |
385 # punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS | |
386 # HERE TO MAKE SURE IT DOESN'T CROP UP] | |
387 # | |
388 <data>•Constructs a randomly generated BigInteger, uniformly distributed over th
e range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •Th
e uniformity of the distribution assumes that a fair source of random bits is pr
ovided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-n
egative biginteger. \n•Ahh abc. | |
389 •</data> | |
390 | |
391 # sentence breaks for hindi which used Devanagari script | |
392 # make sure there is sentence break after ?,danda(hindi phrase separator)
, | |
393 # fullstop followed by space. (VERY old test) | |
394 # | |
395 <data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905
\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u09
4d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\ | |
396 \u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u09
3e\n\ | |
397 <100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 m
eans "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u09
05\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u09
3f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. •</data> | |
398 | |
399 # Regression test for bug #1984, Sentence break in Arabic text. | |
400 | |
401 <data>\ | |
402 •\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645
\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\
u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0
648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u062
7\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\
u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0
623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u064
5\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\
u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u
062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u00
22\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626
\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u
062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u06
30\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645
\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u
062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u06
49\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641
\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data> | |
403 | |
404 # Try a few more of the less common sentence endings. | |
405 <data>•Hello, world\u3002 •Hello, world\u1803 •Hello, world\u2048 •Hello, world\
u203c •Let's end here. •</data> | |
406 | |
407 | |
408 | |
409 | |
410 ################################################################ | |
411 # | |
412 # | |
413 # L I N E B R E A K | |
414 # | |
415 # | |
416 ################################################################ | |
417 | |
418 <line> | |
419 # | |
420 # Test Character for each of the line break classes. | |
421 # | |
422 # 00A1;AI # INVERTED EXCLAMATION MARK ¡ | |
423 # 0041;AL # LATIN CAPITAL LETTER A | |
424 # 0009;BA # <control> | |
425 # 00B4;BB # ACUTE ACCENT | |
426 # 000C;BK # <control> | |
427 # 2014;B2 # EM DASH | |
428 # FFFC;CB # OBJECT REPLACEMENT CHARACTER | |
429 # 0029;CL # RIGHT PARENTHESIS | |
430 # 0301;CM # COMBINING ACUTE ACCENT | |
431 # 0021;EX # EXCLAMATION MARK | |
432 # 00A0;GL # NO-BREAK SPACE | |
433 # 002D;HY # HYPHEN-MINUS | |
434 # 4E00;ID # <CJK Ideograph, First> | |
435 # 2024;IN # ONE DOT LEADER | |
436 # 002C;IS # COMMA | |
437 # 000A;LF # <control> | |
438 # 0E5A;NS # THAI CHARACTER ANGKHANKHU | |
439 # 0032;NU # DIGIT TWO | |
440 # 0028;OP # LEFT PARENTHESIS | |
441 # 0025;PO # PERCENT SIGN | |
442 # 0024;PR # DOLLAR SIGN | |
443 # 0022;QU # QUOTATION MARK | |
444 # 0E01;SA # THAI CHARACTER KO KAI | |
445 # DB7F;SG # Surrogate | |
446 # 0020;SP # SPACE | |
447 # 002F;SY # SOLIDUS / | |
448 # F8FF;XX # Private Use | |
449 # 200B;ZW # ZERO WIDTH SPACE | |
450 | |
451 | |
452 # 2b Always break at end of text | |
453 | |
454 <data>• •\u00A1•</data> | |
455 <data>• •\u0041•</data> | |
456 <data>• •\u0009•</data> | |
457 <data>• •\u00B4•</data> | |
458 <data>• \u000C<100></data> # LB3C × BK | |
459 <data>• •\u2014•</data> | |
460 <data>• •\uFFFC•</data> | |
461 <data>• \u0029•</data> # LB 8 × CL | |
462 # <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: S
P CM | |
463 <data>• \u0021•</data> # LB 8 × EX | |
464 #<data>• \u00A0•</data> # LB 11b × GL TODO: fix. | |
465 <data>• •\u002D•</data> | |
466 <data>• •\u4E00•</data> | |
467 <data>• •\u2024•</data> | |
468 <data>• \u002C•</data> # LB 8 × IS | |
469 <data>• \u000A<100></data> # LB3C × ( BK | CR | LF | NL ) | |
470 <data>• •\u0E5A•</data> | |
471 <data>• •\u0032•</data> | |
472 <data>• •\u0028•</data> | |
473 <data>• •\u0025•</data> | |
474 <data>• •\u0024•</data> | |
475 <data>• •\u0022•</data> | |
476 <data>• •\u0E01•</data> | |
477 <data>• •\uDB7F•</data> | |
478 <data>• \u0020•</data> # LB4 - don't break before space. | |
479 <data>• \u002F•</data> # LB 8 × SY | |
480 <data>• •\uF8FF•</data> | |
481 <data>• \u200B•</data> # LB4 - don't break before ZW | |
482 | |
483 | |
484 # 3a Always break after hard line breaks. | |
485 # 3c Never break before hard line breaks. | |
486 | |
487 <data>• •\u00A1\u2028<100>\u00A1•</data> | |
488 <data>• •\u0041\u2028<100>\u0041•</data> | |
489 <data>• •\u0009\u2028<100>\u0009•</data> | |
490 <data>• •\u00B4\u2028<100>\u00B4•</data> | |
491 <data>• \u000C<100>\u2028<100>\u000C<100></data> | |
492 <data>• •\u2014\u2028<100>\u2014•</data> | |
493 <data>• •\uFFFC\u2028<100>\uFFFC•</data> | |
494 <data>• \u0029\u2028<100>\u0029•</data> | |
495 #<data>• \u0301\u2028<100>\u0301•</data> # TODO: fix. | |
496 <data>• \u0021\u2028<100>\u0021•</data> | |
497 #<data>• \u00A0\u2028<100>\u00A0•</data> # TODO: fix | |
498 <data>• •\u002D\u2028<100>\u002D•</data> | |
499 <data>• •\u4E00\u2028<100>\u4E00•</data> | |
500 <data>• •\u2024\u2028<100>\u2024•</data> | |
501 <data>• \u002C\u2028<100>\u002C•</data> | |
502 <data>• \u000A<100>\u2028<100>\u000A<100></data> | |
503 <data>• •\u0E5A\u2028<100>\u0E5A•</data> | |
504 <data>• •\u0032\u2028<100>\u0032•</data> | |
505 <data>• •\u0028\u2028<100>\u0028•</data> | |
506 <data>• •\u0025\u2028<100>\u0025•</data> | |
507 <data>• •\u0024\u2028<100>\u0024•</data> | |
508 <data>• •\u0022\u2028<100>\u0022•</data> | |
509 <data>• •\u0E01\u2028<100>\u0E01•</data> | |
510 <data>• •\uDB7F\u2028<100>\uDB7F•</data> | |
511 <data>• \u0020\u2028<100>\u0020•</data> | |
512 <data>• \u002F\u2028<100>\u002F•</data> | |
513 <data>• •\uF8FF\u2028<100>\uF8FF•</data> | |
514 <data>• \u200B\u2028<100>\u200B•</data> | |
515 | |
516 # User Guide example | |
517 | |
518 <data>•Parlez-•vous •français ?•</data> | |
519 | |
520 # | |
521 # Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBase
dLineIteration() | |
522 # | |
523 | |
524 <line> | |
525 | |
526 <data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•importa
nt) •sentence. | |
527 <100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\
n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data> | |
528 | |
529 <line> | |
530 <data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00
a0bar | |
531 <100>How, •are, •you? •This, •costs •$20,00,000.•</data> | |
532 | |
533 # test for bug #4068133 | |
534 # | |
535 <data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3
001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data> | |
536 | |
537 # to test for bug #4086052 | |
538 <data>•foo\u00a0bar•</data> | |
539 | |
540 # to test for bug #4097920 | |
541 <data>•dog,cat,mouse •(one)•(two)\n<100></data> | |
542 | |
543 # to test for bug #4035266 | |
544 <data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data> | |
545 | |
546 | |
547 # to test for bug #4098467 | |
548 # What follows is a string of Korean characters (I found it in the Yellow P
ages | |
549 # ad for the Korean Presbyterian Church of San Francisco, and I hope I tran
scribed | |
550 # it correctly), first as precomposed syllables, and then as conjoining jam
o. | |
551 # Both sequences should be semantically identical and break the same way. | |
552 # precomposed syllables... (I == Rich Gillam?) | |
553 # | |
554 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•
</data> | |
555 | |
556 # conjoining jamo... | |
557 <data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u1
1ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u11
00\u116d•\u1112\u116c•</data> | |
558 | |
559 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd | |
560 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> | |
561 | |
562 # Surrogate line break tests. | |
563 # | |
564 <data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data> #This line
and the following are equivalent. | |
565 <data>•\u4e01•\U00020001•\u4e02•abc •\ue000 •\U000f0001•</data> | |
566 | |
567 # Regression for bug 836 | |
568 # Note: Unicode 5.1 changed this behavior | |
569 # Unicode 5.2 changed it again, there is no break following the '(
' | |
570 <data>•AAA(AAA •</data> | |
571 | |
572 # Try some words from other scripts. | |
573 # Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin | |
574 # | |
575 <data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data> | |
576 | |
577 # | |
578 # ticket #4853: unpaired surrogates should behave like AL | |
579 # | |
580 <data>•abc\ud801xyz•</data> | |
581 | |
582 # | |
583 # Regression tests for failures that originally came from the monkey test. | |
584 # Monkey test failure lines can, with slight reformatting, be copied into th
is section | |
585 # as test cases. The error display from here is more informative. | |
586 # | |
587 <data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udc
fb•</data> | |
588 <data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0
fd0\u000a<100>\u20a3•</data> | |
589 <data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0
085<100>\u6cc4\u2024\u202f\ufffc•</data> | |
590 | |
591 # Test for #10176 (in root) | |
592 <line> | |
593 <data>•abc/•s •def•</data> | |
594 <data>•abc/\u05D9 •def•</data> | |
595 <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> | |
596 <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05D
D/\u05D9\u05D5\u05EA•</data> | |
597 | |
598 | |
599 | |
600 ################################################################################
######## | |
601 # | |
602 # | |
603 # T i t l e B o u n d a r y T e s t s | |
604 # | |
605 # | |
606 ################################################################################
########## | |
607 <title> | |
608 <data>•Here •is •a •short •sample •sentence. •And •another.•</data> | |
609 <data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. •AND •ANOTHER.•</data> | |
610 <data>• •Start •and •end •with •spaces •</data> | |
611 <data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</
data> | |
612 | |
613 <data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data> | |
614 <data>•123 •Start •with •a •number.•</data> | |
615 | |
616 <data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data> | |
617 <data>•' '' •start •with •case-•ignorable & •case-•insensitive •cha'r'a'cter•</
data> | |
618 <data>• ''•aaa' •bbb '•ccc' '•ddd''' '''•eee '''•fff''' •ggg ''•</data> | |
619 # Note: apostrophe is case-ignorable. space is not cased. | |
620 | |
621 ################################################################################
########## | |
622 # | |
623 # Thai Tests | |
624 # | |
625 ################################################################################
########## | |
626 <locale th> | |
627 <word> | |
628 # | |
629 # Test data originally from the test code source file | |
630 # // @suwit -- Thai sample data from GVT Guideline | |
631 # | |
632 <data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<20
0>\ | |
633 \u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<20
0>\ | |
634 \u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ | |
635 \u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data> | |
636 | |
637 # Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 | |
638 <data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data
> | |
639 | |
640 <data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<2
00>\ | |
641 \u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\ | |
642 \u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data> | |
643 | |
644 <line> | |
645 <data>•\u0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\ | |
646 \u0020•\u0E1B\u0E34\u0E49\u0E48•\u0E07\u0E2D•\u0E22\u0E39\u0E48•\ | |
647 \u0E43\u0E19•\u0E16\u0E49\u0E33•</data> | |
648 | |
649 # Data originally from intltest RBBITest::TestThaiLineBreak() | |
650 # | |
651 # \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that | |
652 # represents elided letters at the end of a long word. It should be bound to | |
653 # the end of the word and not treated as an independent punctuation mark. | |
654 # | |
655 # the one time where the paiyannoi occurs somewhere other than at the end | |
656 # of a word is in the Thai abbreviation for "etc.", which both begins and | |
657 # ends with a paiyannoi | |
658 # | |
659 <line> | |
660 <data>•\u0e2a\u0e16\u0e32\u0e19\u0e35\u0e2f•\ | |
661 \u0e08\u0e30•\ | |
662 \u0e23\u0e30\u0e14\u0e21•\ | |
663 \u0e40\u0e08\u0e49\u0e32•\ | |
664 \u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48•\ | |
665 \u0e2d\u0e2d\u0e01•\ | |
666 \u0e21\u0e32•\ | |
667 \u0e40\u0e23\u0e48\u0e07•\ | |
668 \u0e23\u0e30\u0e1a\u0e32\u0e22•\ | |
669 \u0e2d\u0e22\u0e48\u0e32\u0e07•\ | |
670 \u0e40\u0e15\u0e47\u0e21•\ | |
671 \u0e2f\u0e25\u0e2f•\ | |
672 \u0e17\u0e35\u0e48•\ | |
673 \u0e19\u0e31\u0e49\u0e19•</data> | |
674 | |
675 # Data originally from RBBITest::TestMixedThaiLineBreak() | |
676 # @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English chara
cters start | |
677 # | |
678 <line> | |
679 <data>•\u0E1B\u0E35•\ | |
680 \u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\ | |
681 2545 •\ | |
682 \u0E40\u0E1B\u0E47\u0E19•\ | |
683 \u0E1B\u0E35•\ | |
684 \u0E09\u0E25\u0E2D\u0E07•\ | |
685 \u0E04\u0E23\u0E1A•\ | |
686 \u0E23\u0E2D\u0E1A •\ | |
687 \"\u0E52\u0E52\u0E50 •\ | |
688 \u0E1b\u0E35\" •\ | |
689 \u0E02\u0E2d\u0E07•\ | |
690 \u0E01\u0E23\u0E38\u0E07•\ | |
691 \u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\ | |
692 (\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\ | |
693 \u0E2B\u0E23\u0E37\u0E2D •\ | |
694 Bangkok)•</data> | |
695 | |
696 # Data originally from RBBITest::TestMaiyamok() | |
697 # The Thai maiyamok character is a shorthand symbol that means "repeat the pre
vious | |
698 # word". Instead of appearing as a word unto itself, however, it's kept toget
her | |
699 # with the word before it. | |
700 # | |
701 <line> | |
702 <data>•\u0e44\u0e1b\u0e46•\ | |
703 \u0e21\u0e32\u0e46•\ | |
704 \u0e23\u0e30\u0e2b\u0e27\u0e48\u0e32\u0e07•\ | |
705 \u0e01\u0e23\u0e38\u0e07•\ | |
706 \u0e40\u0e17\u0e1e•\ | |
707 \u0e41\u0e25\u0e30•\ | |
708 \u0e40\u0e03\u0e35•\ | |
709 \u0e22\u0e07•\ | |
710 \u0e43\u0e2b\u0e21\u0e48•</data> | |
711 | |
712 # Test for #10296 | |
713 <line> | |
714 <data>•ใช•มั้ย•</data> | |
715 <data>•มั๊ยล่ะ•ที่รัก•</data> | |
716 | |
717 # Test for #10593 | |
718 <line> | |
719 <data>•เล่น•ผ่าน•ทาง•บลูทูธ•บน•อุปกรณ์•</data> | |
720 | |
721 # Test for city names #10691 | |
722 <line> | |
723 <data>•ไป•ที่•ซานฟรานซิสโก•</data> | |
724 | |
725 # Test for #10630, #10631 | |
726 <line> | |
727 <data>•แท็ก•แอปพลิเคชัน•เป็น•พิเศษ•</data> | |
728 | |
729 # Test for #11019 | |
730 <line> | |
731 <data>•เบ•เบราว์เซอร์•โพ•โพสต์•โพสท์•</data> | |
732 | |
733 # Test for #11688 | |
734 <line> | |
735 <data>•อัปเดต•อีเวนต์•</data> | |
736 | |
737 ################################################################################
########## | |
738 # | |
739 # Lao Tests | |
740 # | |
741 ################################################################################
########## | |
742 <locale en> | |
743 # Basic check for #7647 | |
744 <line> | |
745 <data>•ສະບາຍດີ•</data> | |
746 <data>•ດີ•ຂອບໃຈ•</data> | |
747 <data>•ເຈົ້າ•ເວົ້າ•ພາສາ•ອັງກິດ•ໄດ້•ບໍ່•</data> | |
748 <data>•ກະລຸນາ•ເວົ້າ•ຊ້າ•ໆ•</data> | |
749 | |
750 ################################################################################
########## | |
751 # | |
752 # Burmese/Myanmar Tests | |
753 # | |
754 ################################################################################
########## | |
755 <locale en> | |
756 # Basic sanity check for #10326 (some text from http://www.unicode.org/udhr/d/ud
hr_mya.txt) | |
757 <line> | |
758 <data>•လူ•တိုင်း•သည် •တူညီ •လွတ်လပ်•သော •ဂုဏ်•သိ•က္•ခါ•ဖြ•င့် •လည်းကောင်း၊ •</da
ta> | |
759 <data>•တူညီ•လွတ်လပ်•သော •အ•ခွ•င့်•အရေး•များ•ဖြ•င့် •လည်းကောင်း၊ •မွေး•ဖွား•လာ•သူ
များ •ဖြစ်သည်။•</data> | |
760 <data>•ထို•သူ•တို့၌ •ပိုင်းခြား •ဝေဖန်•တတ်•သော •ဉာဏ်•နှ•င့် •ကျ•င့်•ဝတ် •သိတတ်•သ
ော •စိတ်•တို့•ရှိ•ကြ၍ •</data> | |
761 <data>•ထို•သူ•တို့သည် •အချင်းချင်း •မေတ္တာ•ထား၍ •ဆက်ဆံ•ကျ•င့်•သုံး•</data> | |
762 | |
763 ################################################################################
########## | |
764 # | |
765 # Khmer Tests | |
766 # | |
767 ################################################################################
########## | |
768 | |
769 # Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 | |
770 # from the file testdata/wordsegments.txt | |
771 <locale en> | |
772 <word> | |
773 | |
774 <data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data> | |
775 <data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data> | |
776 <data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data> | |
777 #ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data> | |
778 <data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>
អាច<200>ចូល<200></data> | |
779 #ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data> | |
780 <data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់
<200>ព្រះអង្គ<200></data> | |
781 <data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data> | |
782 <data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data> | |
783 <data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></d
ata> | |
784 <data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data> | |
785 <data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data> | |
786 <data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></da
ta> | |
787 <data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data> | |
788 <data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data> | |
789 <data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data> | |
790 <data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data> | |
791 <data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200>
</data> | |
792 <data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data> | |
793 <data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data> | |
794 #អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data> | |
795 <data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data> | |
796 <data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data> | |
797 <data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data> | |
798 | |
799 | |
800 # | |
801 # Jitterbug 3671 Test Case | |
802 # | |
803 <data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data> | |
804 | |
805 # | |
806 # Trac ticket 5595 Test Case | |
807 <data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลา
ง<200>\ | |
808 ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200
>ป้า<200>เอ็ม<200>\ | |
809 ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<20
0>ไม้<200>\ | |
810 สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>
ทาง<200>หลาย<200>\ | |
811 ไมล์<200></data> | |
812 | |
813 ################################################################################
#### | |
814 # | |
815 # Tailored (locale specific) breaking. | |
816 # | |
817 ################################################################################
#### | |
818 | |
819 # Japanese line break tailoring test | |
820 | |
821 <locale ja> | |
822 <line> | |
823 <data>•\u3041•\u3043•\u3045•\u31f1•</data> | |
824 <locale en> | |
825 <line> | |
826 <data>•\u3041\u3043\u3045\u31f1•</data> | |
827 | |
828 # The following data was originally in RBBITest::TestJapaneseWordBreak() | |
829 <locale ja> | |
830 <word> | |
831 <data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u305
9<400>\u306D<400>\u3002•\u000D\u000A•</data> | |
832 | |
833 # UBreakIteratorType UBRK_WORD, Locale "ja" | |
834 # Don't break in runs of hiragana or runs of ideograph, where the latter include
s \u3005 \u3007 \u303B (cldrbug #2009). | |
835 # \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC
\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u
308B\u3002 | |
836 # modified to work with dbbi code - should verify | |
837 | |
838 <locale ja> | |
839 <word> | |
840 <data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々
<400>は<400>ワード<400>で<400>ある<400>。•</data> | |
841 | |
842 # Test for #10176 (in ja) | |
843 <line> | |
844 <data>•abc/•s •def•</data> | |
845 <data>•abc/\u05D9 •def•</data> | |
846 <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> | |
847 <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05D
D/\u05D9\u05D5\u05EA•</data> | |
848 | |
849 | |
850 <locale root> | |
851 <word> | |
852 <data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々
<400>は<400>ワード<400>で<400>ある<400>。•</data> | |
853 # The following test is for #10300 | |
854 <data>•例えば<400>オーストラリア<400>。•</data> | |
855 # The following test is for #10571 | |
856 <data>•一部<400>の<400>地域<400>では<400>、<0>ブラジル<400>、<0>インドネシア<400>、<0>オーストリア<400>、<0
>ニュージーランド<400>で<400>ある<400>。•</data> | |
857 | |
858 # UBreakIteratorType UBRK_SENTENCE, Locale "el" | |
859 # Add break after Greek question mark (cldrbug #2069). | |
860 # "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " | |
861 # "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3" | |
862 # which is "Αβ, γδ; Ε ζη; Θ ικ. Λμ νξ! Οπ, Ρς? Σ" | |
863 | |
864 <locale root> | |
865 <sent> | |
866 <data>•Αβ, γδ; Ε ζη; Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data> | |
867 | |
868 <locale el> | |
869 <sent> | |
870 <data>•Αβ, γδ; •Ε ζη; •Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data> | |
871 | |
872 # UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" | |
873 # Words don't include colon or period (cldrbug #1969). | |
874 | |
875 <locale en_US> | |
876 <word> | |
877 <data>•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.
field<200> \ | |
878 •for<200> •CS<200>-•types<200>.•</data> | |
879 <data>•\uFF92\uFF76\uFF9E<400> •</data> | |
880 | |
881 <locale en_US_POSIX> | |
882 <word> | |
883 <data>•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •s
truct<200>.•field<200> \ | |
884 •for<200> •CS<200>-•types<200>.•</data> | |
885 <data>•\u06c9<200>\uc799\ufffa•</data> | |
886 <data>•\uFF92\uFF76\uFF9E<400> •</data> | |
887 | |
888 | |
889 # UBreakIteratorType UBRK_CHARACTER, Locale "th" | |
890 # Clusters should not include spacing Thai/Lao vowels (prefix or postfix), excep
t for [SARA] AM (cldrbug #2161). | |
891 # Update: As of Unicode 6.1 root has same behavior as th for this. | |
892 # | |
893 # "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 " | |
894 # "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0
E28) " | |
895 # "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 " | |
896 # which is "กระท่อมรจนา (สุชาติ-จุฑามาศ) เด็กมีปัญหา " | |
897 | |
898 <locale th> | |
899 <char> | |
900 <data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E
32• •\ | |
901 (•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u
0E32•\u0E28•)• •\ | |
902 \u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</da
ta> | |
903 | |
904 # Finnish line breaking | |
905 # | |
906 # These rules deal with hyphens when there is a space on the leading side. | |
907 # There should be a break opportunity between the space and the hyphen, and not
after the hyphen. | |
908 # See CLDR ticket 3029. | |
909 # See ICU ticket 8151 | |
910 | |
911 <locale root> | |
912 <line> | |
913 <data>•abc •- •def •abc •-•def •abc- •def •abc-•def•</data> # With ASC
II hyphen | |
914 <data>•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def•</data> # With Uni
code u2010 hyphen | |
915 | |
916 <locale fi> | |
917 <line> | |
918 # TODO: problems with Finnish line break rules cause these two lines to fail. | |
919 #<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASC
II hyphen | |
920 #<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Uni
code u2010 hyphen | |
921 | |
922 <data>•abc •- •def •abc •-def •abc- •def •</data> # With ASCII hyphen | |
923 <data>•abc •‐ •def •abc •‐def •abc‐ •def •</data> # With Unicode u2010
hyphen | |
924 | |
925 # Test for #10176 (in fi) | |
926 <line> | |
927 <data>•abc/•s •def•</data> | |
928 <data>•abc/\u05D9 •def•</data> | |
929 <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> | |
930 <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05D
D/\u05D9\u05D5\u05EA•</data> | |
931 | |
932 ################################################################################
#### | |
933 # | |
934 # Test CSS line break variants: strict, normal, loose | |
935 # | |
936 ################################################################################
#### | |
937 | |
938 <locale ja@lb=strict> | |
939 <line> | |
940 # •no brk before 3063 •no brk before 301C•no brk btw 2026 •no
brk before FF01• | |
941 <data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u
30A2\uFF01\u0020•</data> | |
942 | |
943 <locale ja@lb=normal> | |
944 <line> | |
945 # •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •
no brk before FF01• | |
946 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•
\u30A2\uFF01\u0020•</data> | |
947 | |
948 <locale ja@lb=loose> | |
949 <line> | |
950 # •brk OK before 3063 •brk OK before 301C •brk OK btw 2026
•brk OK before FF01• | |
951 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020
•\u30A2•\uFF01\u0020•</data> | |
952 | |
953 <locale en@lb=strict> | |
954 <line> | |
955 # •no brk before 3063 •no brk before 301C•no brk btw 2026 •no
brk before FF01• | |
956 <data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u
30A2\uFF01\u0020•</data> | |
957 | |
958 <locale en@lb=normal> | |
959 <line> | |
960 # •brk OK before 3063 •no brk before 301C •no brk btw 2026 •n
o brk before FF01• | |
961 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\
u30A2\uFF01\u0020•</data> | |
962 | |
963 <locale en@lb=loose> | |
964 <line> | |
965 # •brk OK before 3063 •no brk before 301C •brk OK btw 2026 •
no brk before FF01• | |
966 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026•\u2026\u0020•
\u30A2\uFF01\u0020•</data> | |
OLD | NEW |