| OLD | NEW |
| (Empty) |
| 1 # Copyright (c) 2001-2015 International Business Machines | |
| 2 # Corporation and others. All Rights Reserved. | |
| 3 # | |
| 4 # RBBI Test Data | |
| 5 # | |
| 6 # File: rbbitst.txt | |
| 7 # | |
| 8 # The format of this file looks vaguely like some kind of xml-ish markup, | |
| 9 # but it is NOT. The syntax is this.. | |
| 10 # | |
| 11 # <word> any following data is for word break testing | |
| 12 # <sent> any following data is for sentence break testing | |
| 13 # <line> any following data is for line break testing | |
| 14 # <char> any following data is for char break testing | |
| 15 # <locale local_name> Switch to the named locale at the next occurrence of <wo
rd>, <sent>, etc. | |
| 16 # <data> ... </data> test data. May span multiple lines. | |
| 17 # <> Break position, status == 0 | |
| 18 # • Break position, status == 0 (Bullet, \u2022) | |
| 19 # <nnn> Break position, status == nnn | |
| 20 # \ Escape. Normal ICU unescape applied. | |
| 21 # \ at end of line -> Line Continuation. Remove both the backslash and t
he new line | |
| 22 # | |
| 23 # In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended. | |
| 24 # In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended | |
| 25 # | |
| 26 # There are two copies of this file in the source repository, | |
| 27 # [ICU4C] source/test/testdata/rbbitst.txt | |
| 28 # [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | |
| 29 # | |
| 30 # ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sur
e they | |
| 31 # are merged back into ICU4C's copy of the file, lest they get overwritten late
r. | |
| 32 # TODO: figure out how to have a single copy of the file for use by both C and
Java. | |
| 33 | |
| 34 | |
| 35 ## FILTERED BREAK TESTS | |
| 36 | |
| 37 # (William Bradford, public domain. http://catalog.hathitrust.org/Record/0086512
24 ) - edited. | |
| 38 <locale en> | |
| 39 <sent> | |
| 40 <data>\ | |
| 41 •In the meantime Mr. •Weston arrived with his small ship, which he had now recov
ered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going e
ast was to meet with Mr. •Weston, took this opportunity to call him to account f
or some abuses he had to lay to his charge.•</data> | |
| 42 | |
| 43 <locale en@ss=standard> | |
| 44 <sent> | |
| 45 <data>\ | |
| 46 •In the meantime Mr. Weston arrived with his small ship, which he had now recove
red. •Capt. Gorges, who informed the Sgt. here that one purpose of his going eas
t was to meet with Mr. Weston, took this opportunity to call him to account for
some abuses he had to lay to his charge.•</data> | |
| 47 | |
| 48 ## END FILTERED BREAK TESTS | |
| 49 | |
| 50 <locale> | |
| 51 | |
| 52 # Temp debugging tests | |
| 53 <sent> | |
| 54 <data>•\u00c0.•</data> | |
| 55 | |
| 56 #<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8
\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u202
9•</data> | |
| 57 ################################################################################
######## | |
| 58 # | |
| 59 # | |
| 60 # G r a p h e m e C l u s t e r T e s t s | |
| 61 # | |
| 62 # | |
| 63 ################################################################################
########## | |
| 64 <char> | |
| 65 | |
| 66 <data>•a•b•c• •,•\u0666•</data> # Quick Test | |
| 67 <data>•\r•\r•\r\n•\r\n•\n•\r•</data> # don't break CR/LF | |
| 68 | |
| 69 # Always break after controls. Combining chars don't combine with them. | |
| 70 <data>•\u0003•\N{COMBINING GRAVE ACCENT}•\r•\N{COMBINING GRAVE ACCENT}•</data> | |
| 71 <data>•\u0085•\N{COMBINING MACRON}•A\N{COMBINING MACRON}•</data> | |
| 72 | |
| 73 # Surrogates | |
| 74 <data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data> | |
| 75 <data>•\ud800\udc00•\udbff\udfff•a•</data> | |
| 76 | |
| 77 # Extend (Combining chars) combine. | |
| 78 <data>•A\N{COMBINING GRAVE ACCENT}•B•</data> | |
| 79 <data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data> | |
| 80 <data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•<
/data> | |
| 81 | |
| 82 <data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304•</data> | |
| 83 | |
| 84 # Don't break Hangul Syllables | |
| 85 # L : \u1100 | |
| 86 # V : \u1161 | |
| 87 # T : \u11A8 | |
| 88 # LV : \uAC00 | |
| 89 # LVT : \uAC01 | |
| 90 | |
| 91 <data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT | |
| 92 <data>•\u1100\u1161•\u1100\u1161•</data> | |
| 93 <data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data> | |
| 94 <data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data> | |
| 95 <data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data> | |
| 96 | |
| 97 | |
| 98 | |
| 99 # Hindi combining chars. (An old test) | |
| 100 # TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters | |
| 101 #<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930• | |
| 102 #•\u0939•\u094c•\u0964•</data> | |
| 103 #<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</dat
a> | |
| 104 | |
| 105 | |
| 106 # Bug 1587. Tamil. \u0baa\u0bc1 is an Extended Grapheme Cluster | |
| 107 <data>•\u0baa\u0bc1•\u0baa\u0bc1•</data> | |
| 108 | |
| 109 # Regression test for bug 1889 | |
| 110 <data>•\u0f40\u0f7d•\u0000•\u0f7e•</data> | |
| 111 | |
| 112 | |
| 113 # 0xffff is a legal character, and should not stop the break iterator early. | |
| 114 # (Requires special casing in implementation, which is why it gets a test.) | |
| 115 <data>•\uffff•\uffff• •a•</data> | |
| 116 | |
| 117 # Treat Japanese Half Width voicing marks as combining | |
| 118 <data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data> | |
| 119 | |
| 120 ################################################################################
######## | |
| 121 # | |
| 122 # | |
| 123 # E x t e n d e d G r a p h e m e C l u s t e r T e s t s | |
| 124 # | |
| 125 # | |
| 126 ################################################################################
########## | |
| 127 #<xgc> | |
| 128 | |
| 129 # Plain Vanilla grapheme clusters | |
| 130 #<data>•a•b•c•</data> | |
| 131 #<data>•a\u0301\u0302• •b\u0303\u0304•</data> | |
| 132 | |
| 133 # Assorted Hindi combining marks | |
| 134 #<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949
• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data> | |
| 135 | |
| 136 # Thai Clusters | |
| 137 # $Prepend $Extend* $PrependBase $Extend*; | |
| 138 # | |
| 139 #<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02•
•</data> | |
| 140 | |
| 141 | |
| 142 ################################################################################
######## | |
| 143 # | |
| 144 # | |
| 145 # W o r d B o u n d a r y T e s t s | |
| 146 # | |
| 147 # | |
| 148 ################################################################################
########## | |
| 149 | |
| 150 <word> | |
| 151 # | |
| 152 # Quick sanity test | |
| 153 # | |
| 154 <data>•hello<200> •there<200> •goodbye<200></data> | |
| 155 <data>•hello<200> •12345<100> •,•</data> | |
| 156 | |
| 157 | |
| 158 # | |
| 159 # Test data originally in RBBIAPITest::TestFirstNextFollowing() and TestLastPrev
iousPreceding() | |
| 160 # | |
| 161 | |
| 162 <word> | |
| 163 <data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200
>?• •2.25<100></data> | |
| 164 | |
| 165 | |
| 166 | |
| 167 # | |
| 168 # Data originally from TestDefaultRuleBasedWordIteration() | |
| 169 # | |
| 170 <data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<20
0> •\u092f\u0939<200> •</data> | |
| 171 <data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0
905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data> | |
| 172 | |
| 173 #Hindi Numbers | |
| 174 <data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\
N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<
200>\n•</data> | |
| 175 | |
| 176 <data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.1
0<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data> | |
| 177 | |
| 178 <data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200>
•STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<1
00>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•tim
e<200> •</data> | |
| 179 | |
| 180 #Hangul | |
| 181 <data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u111
2\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how
<200> •are<200> •you<200> •</data> | |
| 182 | |
| 183 <data>•Hello<200>,• •how<200> •are<200> •you<200> •\uc5f0\ud569<200> •\uc7a5\ub8
5c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11
ab\u110b\u1175\u11ab<200> •</data> | |
| 184 | |
| 185 # Words containing non-BMP letters | |
| 186 <data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATI
CAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200
> •</data> | |
| 187 | |
| 188 # Unassigned code points | |
| 189 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> | |
| 190 | |
| 191 # Hiragana & Katakana stay together, but separate from each other and from Latin. | |
| 192 # *** what to do about theoretical combos of chars? i.e. hiragana + accent | |
| 193 #<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINI
NG ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}
\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA
LETTER N}<400>def<200>#•</data> | |
| 194 | |
| 195 # test normalization/dictionary handling of halfwidth katakana: same dictionary
phrase in fullwidth and halfwidth | |
| 196 <data>•芽キャベツ<400>芽キャベツ<400></data> | |
| 197 | |
| 198 # more Japanese tests | |
| 199 # TODO: some script=common characters in the Hiragana and the Katakana block may
not be treated correctly | |
| 200 # (was formerly true for U+30FC); need to check and fix if so. | |
| 201 #<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>
は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | |
| 202 <data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも
<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> | |
| 203 | |
| 204 # Testing of word boundary for dictionary word containing both kanji and kana | |
| 205 <data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> | |
| 206 | |
| 207 # Testing of Chinese segmentation (taken from a Chinese news article) | |
| 208 <data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>
到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的
<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属
意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</da
ta> | |
| 209 | |
| 210 # Words with interior formatting characters | |
| 211 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data
> | |
| 212 | |
| 213 # to test for bug #4097779 | |
| 214 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> | |
| 215 | |
| 216 # fullwidth numeric, midletter characters etc should be treated like their halfw
idth counterparts | |
| 217 # <data>•ISN'T<200> •19<100>日<400></data> | |
| 218 # why was this added with the dbbi stuff? | |
| 219 | |
| 220 # to test for bug #4098467 | |
| 221 # What follows is a string of Korean characters (I found it in the Yellow P
ages | |
| 222 # ad for the Korean Presbyterian Church of San Francisco, and I hope I tran
scribed | |
| 223 # it correctly), first as precomposed syllables, and then as conjoining jam
o. | |
| 224 # Both sequences should be semantically identical and break the same way. | |
| 225 # precomposed syllables... | |
| 226 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad
50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u11
0b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11
bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> | |
| 227 | |
| 228 # more Korean tests (Jamo not tested here, not counted as dictionary characters) | |
| 229 # Disable them now because we don't include a Korean dictionary. | |
| 230 #<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<20
0>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> | |
| 231 #<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd
<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200>
•\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> | |
| 232 | |
| 233 <data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</dat
a> | |
| 234 | |
| 235 <data>•\u06c9<200>\uc799\ufffa•</data> | |
| 236 | |
| 237 | |
| 238 # | |
| 239 # Try some words from other scripts. | |
| 240 # | |
| 241 | |
| 242 # Try some words from other scripts. | |
| 243 # Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin | |
| 244 # | |
| 245 <data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200
> •ABC<200> •</data> | |
| 246 | |
| 247 <data>•\u0301•A<200></data> | |
| 248 | |
| 249 | |
| 250 # | |
| 251 # Hindi word break tests, imported from the old RBBI tests. | |
| 252 # An historical note: a much earlier version of ICU break iterators had a nu
mber | |
| 253 # of special case rules for Hindi, which were tested by an earlier version of | |
| 254 # this test data. The current RBBI rules do not special case Hindi in | |
| 255 # any way, making this test data much less significant. | |
| 256 # | |
| 257 <data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u092
8\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u09
38\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903
<200> | |
| 258 •\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<20
0>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<20
0> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\
u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u09
67\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<10
0> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u
0930<200>\r•</data> | |
| 259 | |
| 260 # | |
| 261 # Failures from monkey tests | |
| 262 # | |
| 263 <data>•\u8527<400>\u02ba<200>\u0027\u0d42•\u00b7•\u09ea<100></data> | |
| 264 | |
| 265 # | |
| 266 # Jitterbug 5276 - treat Japanese half width voicing marks as Grapheme Extend | |
| 267 # | |
| 268 <data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data> | |
| 269 | |
| 270 # User guide example: | |
| 271 <data>•Parlez<200>-•vous<200> •français<200> •?•</data> | |
| 272 | |
| 273 # Test for #11673 | |
| 274 <word> | |
| 275 <data>•ジョージア<400> •</data> | |
| 276 | |
| 277 ################################################################################
######## | |
| 278 # | |
| 279 # | |
| 280 # S e n t e n c e B o u n d a r y T e s t s | |
| 281 # | |
| 282 # | |
| 283 ################################################################################
########## | |
| 284 | |
| 285 | |
| 286 # | |
| 287 # Test data originally from RBBI RBBITest::TestDefaultRuleBasedSentenceIteration
() | |
| 288 # | |
| 289 <sent> | |
| 290 | |
| 291 | |
| 292 <sent> | |
| 293 <data>•This\n<100></data> | |
| 294 <data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \ | |
| 295 doing? •This\n<100> costs $20,00,000. •</data> | |
| 296 | |
| 297 | |
| 298 # Sentence ending in a quote. | |
| 299 <data>•"Sentence ending with a quote." •Bye.•</data> | |
| 300 | |
| 301 # Sentence, and test data, ending without a period or other terminator. | |
| 302 <data>•Here is a random sentence, no ending period<100></data> | |
| 303 | |
| 304 | |
| 305 <data>• (This is it). •Testing the sentence iterator. •\ | |
| 306 "This isn't it." •Hi! \ | |
| 307 •This is a simple sample sentence. •(This is it.) •This is a simple sample sente
nce. •\ | |
| 308 "This isn't it." •\ | |
| 309 Hi! •This is a simple sample sentence. •It does not have to make any sense as yo
u can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura
. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don'
t rock the boat.\u2029•Because I am the daddy, that is why. | |
| 310 •Not on my time (el timo.)! •</data> | |
| 311 | |
| 312 <data>•Hello. •So what!!\u2029•"But now," he said, \ | |
| 313 "I know!" •\ | |
| 314 Harris thumbed down several, including "Away We Go" (which became the huge succe
ss Oklahoma!). •One species, B. anthracis, is highly virulent. | |
| 315 •Wolf said about Sounder:\ | |
| 316 "Beautifully thought-out and directed." •\ | |
| 317 Have you ever said, "This is where\tI shall live"? •He answered, \ | |
| 318 "You may not!" •Another popular saying is: "How do you do?". \n•\ | |
| 319 Yet another popular saying is: \ | |
| 320 'I'm fine thanks.' •\ | |
| 321 What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tall!
!\ | |
| 322 •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100></data> | |
| 323 | |
| 324 <data>•No breaks when . is surrounded by UPPER.Case letters. •</data> | |
| 325 <data>•No breaks when . is followed by Numeric .4 a.4 C.4 3.1 .•</data> | |
| 326 <data>•No breaks when . is followed by a lower, with possible intervening punct
.,a .$a .)a. •</data> | |
| 327 | |
| 328 # | |
| 329 # Sentence Breaks: no break at the boundary between CJK and other letters | |
| 330 # | |
| 331 <data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\
u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029
•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4JAVA\u821c\u8165\u7
fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d4
6\u97e5\u7751\u3002•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4
\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u85
60\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2048•He said, "I can go there."\u2
029•Bye, now.•</data> | |
| 332 | |
| 333 # | |
| 334 # Treat fullwidth variants of .!? the same as their | |
| 335 # normal counterparts | |
| 336 # | |
| 337 <data>•I know I'm right\uff0e •Right\uff1f •Right\uff01 •</data> | |
| 338 | |
| 339 | |
| 340 # | |
| 341 # Don't break sentences at boundary between CJK and digits | |
| 342 # | |
| 343 <data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e48888\u821c\u8
165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u251
0\u5d46\u97e5\u7751\u3002•Bye, now<100></data> | |
| 344 | |
| 345 # | |
| 346 # Breaks around '(' following a sentence TERM. (Rule 9) | |
| 347 # | |
| 348 <data>•How do you do?(•Fine). •</data> | |
| 349 <data>•How do you do? •(Fine). •</data> | |
| 350 <data>•How do you do?(•fine). •</data> | |
| 351 <data>•How do you do? •(fine). •</data> | |
| 352 | |
| 353 # | |
| 354 <data>•Hello.123<100></data> # Rule 6 | |
| 355 <data>•Hello?•123<100></data> | |
| 356 | |
| 357 <data>•HELLO.Bye<100></data> # Rule 7 | |
| 358 <data>•HELLO?•Bye<100></data> | |
| 359 | |
| 360 <data>•Hello.goodbye<100></data> #Rule 8 | |
| 361 <data>•Hello. •Goodbye<100></data> | |
| 362 <data>•Hello. goodbye<100></data> | |
| 363 | |
| 364 | |
| 365 | |
| 366 # | |
| 367 # test for bug #4158381: No breaks when there are no terminators around | |
| 368 # | |
| 369 <data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\
<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible
, work the same on all platforms. •</data> | |
| 370 <data>•Another test.\u2029•</data> | |
| 371 | |
| 372 # test for bug #4143071: Make sure sentences that end with digits | |
| 373 # work right | |
| 374 # | |
| 375 <data>•Today is the 27th of May, 1998. •</data> | |
| 376 <data>•Tomorrow with be 28 May 1998. •</data> | |
| 377 <data>•The day after will be the 30th.\u2029•</data> | |
| 378 | |
| 379 # test for bug #4152416: Make sure sentences ending with a capital | |
| 380 # letter are treated correctly | |
| 381 # | |
| 382 <data>•The type of all primitive \<code>boolean\</code> values accessed in the t
arget VM. •Calls to xxx will return an implementor of this interface. \u2029•<
/data> | |
| 383 | |
| 384 # test for bug #4152117: Make sure sentence breaking is handling | |
| 385 # punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS | |
| 386 # HERE TO MAKE SURE IT DOESN'T CROP UP] | |
| 387 # | |
| 388 <data>•Constructs a randomly generated BigInteger, uniformly distributed over th
e range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •Th
e uniformity of the distribution assumes that a fair source of random bits is pr
ovided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-n
egative biginteger. \n•Ahh abc. | |
| 389 •</data> | |
| 390 | |
| 391 # sentence breaks for hindi which used Devanagari script | |
| 392 # make sure there is sentence break after ?,danda(hindi phrase separator)
, | |
| 393 # fullstop followed by space. (VERY old test) | |
| 394 # | |
| 395 <data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905
\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u09
4d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\ | |
| 396 \u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u09
3e\n\ | |
| 397 <100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 m
eans "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u09
05\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u09
3f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. •</data> | |
| 398 | |
| 399 # Regression test for bug #1984, Sentence break in Arabic text. | |
| 400 | |
| 401 <data>\ | |
| 402 •\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645
\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\
u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0
648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u062
7\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\
u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0
623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u064
5\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\
u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u
062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u00
22\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626
\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u
062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u06
30\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645
\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u
062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u06
49\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641
\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data> | |
| 403 | |
| 404 # Try a few more of the less common sentence endings. | |
| 405 <data>•Hello, world\u3002 •Hello, world\u1803 •Hello, world\u2048 •Hello, world\
u203c •Let's end here. •</data> | |
| 406 | |
| 407 | |
| 408 | |
| 409 | |
| 410 ################################################################ | |
| 411 # | |
| 412 # | |
| 413 # L I N E B R E A K | |
| 414 # | |
| 415 # | |
| 416 ################################################################ | |
| 417 | |
| 418 <line> | |
| 419 # | |
| 420 # Test Character for each of the line break classes. | |
| 421 # | |
| 422 # 00A1;AI # INVERTED EXCLAMATION MARK ¡ | |
| 423 # 0041;AL # LATIN CAPITAL LETTER A | |
| 424 # 0009;BA # <control> | |
| 425 # 00B4;BB # ACUTE ACCENT | |
| 426 # 000C;BK # <control> | |
| 427 # 2014;B2 # EM DASH | |
| 428 # FFFC;CB # OBJECT REPLACEMENT CHARACTER | |
| 429 # 0029;CL # RIGHT PARENTHESIS | |
| 430 # 0301;CM # COMBINING ACUTE ACCENT | |
| 431 # 0021;EX # EXCLAMATION MARK | |
| 432 # 00A0;GL # NO-BREAK SPACE | |
| 433 # 002D;HY # HYPHEN-MINUS | |
| 434 # 4E00;ID # <CJK Ideograph, First> | |
| 435 # 2024;IN # ONE DOT LEADER | |
| 436 # 002C;IS # COMMA | |
| 437 # 000A;LF # <control> | |
| 438 # 0E5A;NS # THAI CHARACTER ANGKHANKHU | |
| 439 # 0032;NU # DIGIT TWO | |
| 440 # 0028;OP # LEFT PARENTHESIS | |
| 441 # 0025;PO # PERCENT SIGN | |
| 442 # 0024;PR # DOLLAR SIGN | |
| 443 # 0022;QU # QUOTATION MARK | |
| 444 # 0E01;SA # THAI CHARACTER KO KAI | |
| 445 # DB7F;SG # Surrogate | |
| 446 # 0020;SP # SPACE | |
| 447 # 002F;SY # SOLIDUS / | |
| 448 # F8FF;XX # Private Use | |
| 449 # 200B;ZW # ZERO WIDTH SPACE | |
| 450 | |
| 451 | |
| 452 # 2b Always break at end of text | |
| 453 | |
| 454 <data>• •\u00A1•</data> | |
| 455 <data>• •\u0041•</data> | |
| 456 <data>• •\u0009•</data> | |
| 457 <data>• •\u00B4•</data> | |
| 458 <data>• \u000C<100></data> # LB3C × BK | |
| 459 <data>• •\u2014•</data> | |
| 460 <data>• •\uFFFC•</data> | |
| 461 <data>• \u0029•</data> # LB 8 × CL | |
| 462 # <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: S
P CM | |
| 463 <data>• \u0021•</data> # LB 8 × EX | |
| 464 #<data>• \u00A0•</data> # LB 11b × GL TODO: fix. | |
| 465 <data>• •\u002D•</data> | |
| 466 <data>• •\u4E00•</data> | |
| 467 <data>• •\u2024•</data> | |
| 468 <data>• \u002C•</data> # LB 8 × IS | |
| 469 <data>• \u000A<100></data> # LB3C × ( BK | CR | LF | NL ) | |
| 470 <data>• •\u0E5A•</data> | |
| 471 <data>• •\u0032•</data> | |
| 472 <data>• •\u0028•</data> | |
| 473 <data>• •\u0025•</data> | |
| 474 <data>• •\u0024•</data> | |
| 475 <data>• •\u0022•</data> | |
| 476 <data>• •\u0E01•</data> | |
| 477 <data>• •\uDB7F•</data> | |
| 478 <data>• \u0020•</data> # LB4 - don't break before space. | |
| 479 <data>• \u002F•</data> # LB 8 × SY | |
| 480 <data>• •\uF8FF•</data> | |
| 481 <data>• \u200B•</data> # LB4 - don't break before ZW | |
| 482 | |
| 483 | |
| 484 # 3a Always break after hard line breaks. | |
| 485 # 3c Never break before hard line breaks. | |
| 486 | |
| 487 <data>• •\u00A1\u2028<100>\u00A1•</data> | |
| 488 <data>• •\u0041\u2028<100>\u0041•</data> | |
| 489 <data>• •\u0009\u2028<100>\u0009•</data> | |
| 490 <data>• •\u00B4\u2028<100>\u00B4•</data> | |
| 491 <data>• \u000C<100>\u2028<100>\u000C<100></data> | |
| 492 <data>• •\u2014\u2028<100>\u2014•</data> | |
| 493 <data>• •\uFFFC\u2028<100>\uFFFC•</data> | |
| 494 <data>• \u0029\u2028<100>\u0029•</data> | |
| 495 #<data>• \u0301\u2028<100>\u0301•</data> # TODO: fix. | |
| 496 <data>• \u0021\u2028<100>\u0021•</data> | |
| 497 #<data>• \u00A0\u2028<100>\u00A0•</data> # TODO: fix | |
| 498 <data>• •\u002D\u2028<100>\u002D•</data> | |
| 499 <data>• •\u4E00\u2028<100>\u4E00•</data> | |
| 500 <data>• •\u2024\u2028<100>\u2024•</data> | |
| 501 <data>• \u002C\u2028<100>\u002C•</data> | |
| 502 <data>• \u000A<100>\u2028<100>\u000A<100></data> | |
| 503 <data>• •\u0E5A\u2028<100>\u0E5A•</data> | |
| 504 <data>• •\u0032\u2028<100>\u0032•</data> | |
| 505 <data>• •\u0028\u2028<100>\u0028•</data> | |
| 506 <data>• •\u0025\u2028<100>\u0025•</data> | |
| 507 <data>• •\u0024\u2028<100>\u0024•</data> | |
| 508 <data>• •\u0022\u2028<100>\u0022•</data> | |
| 509 <data>• •\u0E01\u2028<100>\u0E01•</data> | |
| 510 <data>• •\uDB7F\u2028<100>\uDB7F•</data> | |
| 511 <data>• \u0020\u2028<100>\u0020•</data> | |
| 512 <data>• \u002F\u2028<100>\u002F•</data> | |
| 513 <data>• •\uF8FF\u2028<100>\uF8FF•</data> | |
| 514 <data>• \u200B\u2028<100>\u200B•</data> | |
| 515 | |
| 516 # User Guide example | |
| 517 | |
| 518 <data>•Parlez-•vous •français ?•</data> | |
| 519 | |
| 520 # | |
| 521 # Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBase
dLineIteration() | |
| 522 # | |
| 523 | |
| 524 <line> | |
| 525 | |
| 526 <data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•importa
nt) •sentence. | |
| 527 <100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\
n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data> | |
| 528 | |
| 529 <line> | |
| 530 <data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00
a0bar | |
| 531 <100>How, •are, •you? •This, •costs •$20,00,000.•</data> | |
| 532 | |
| 533 # test for bug #4068133 | |
| 534 # | |
| 535 <data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3
001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data> | |
| 536 | |
| 537 # to test for bug #4086052 | |
| 538 <data>•foo\u00a0bar•</data> | |
| 539 | |
| 540 # to test for bug #4097920 | |
| 541 <data>•dog,cat,mouse •(one)•(two)\n<100></data> | |
| 542 | |
| 543 # to test for bug #4035266 | |
| 544 <data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data> | |
| 545 | |
| 546 | |
| 547 # to test for bug #4098467 | |
| 548 # What follows is a string of Korean characters (I found it in the Yellow P
ages | |
| 549 # ad for the Korean Presbyterian Church of San Francisco, and I hope I tran
scribed | |
| 550 # it correctly), first as precomposed syllables, and then as conjoining jam
o. | |
| 551 # Both sequences should be semantically identical and break the same way. | |
| 552 # precomposed syllables... (I == Rich Gillam?) | |
| 553 # | |
| 554 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•
</data> | |
| 555 | |
| 556 # conjoining jamo... | |
| 557 <data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u1
1ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u11
00\u116d•\u1112\u116c•</data> | |
| 558 | |
| 559 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd | |
| 560 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> | |
| 561 | |
| 562 # Surrogate line break tests. | |
| 563 # | |
| 564 <data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data> #This line
and the following are equivalent. | |
| 565 <data>•\u4e01•\U00020001•\u4e02•abc •\ue000 •\U000f0001•</data> | |
| 566 | |
| 567 # Regression for bug 836 | |
| 568 # Note: Unicode 5.1 changed this behavior | |
| 569 # Unicode 5.2 changed it again, there is no break following the '(
' | |
| 570 <data>•AAA(AAA •</data> | |
| 571 | |
| 572 # Try some words from other scripts. | |
| 573 # Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin | |
| 574 # | |
| 575 <data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data> | |
| 576 | |
| 577 # | |
| 578 # ticket #4853: unpaired surrogates should behave like AL | |
| 579 # | |
| 580 <data>•abc\ud801xyz•</data> | |
| 581 | |
| 582 # | |
| 583 # Regression tests for failures that originally came from the monkey test. | |
| 584 # Monkey test failure lines can, with slight reformatting, be copied into th
is section | |
| 585 # as test cases. The error display from here is more informative. | |
| 586 # | |
| 587 <data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udc
fb•</data> | |
| 588 <data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0
fd0\u000a<100>\u20a3•</data> | |
| 589 <data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0
085<100>\u6cc4\u2024\u202f\ufffc•</data> | |
| 590 | |
| 591 # Test for #10176 (in root) | |
| 592 <line> | |
| 593 <data>•abc/•s •def•</data> | |
| 594 <data>•abc/\u05D9 •def•</data> | |
| 595 <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> | |
| 596 <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05D
D/\u05D9\u05D5\u05EA•</data> | |
| 597 | |
| 598 | |
| 599 | |
| 600 ################################################################################
######## | |
| 601 # | |
| 602 # | |
| 603 # T i t l e B o u n d a r y T e s t s | |
| 604 # | |
| 605 # | |
| 606 ################################################################################
########## | |
| 607 <title> | |
| 608 <data>•Here •is •a •short •sample •sentence. •And •another.•</data> | |
| 609 <data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. •AND •ANOTHER.•</data> | |
| 610 <data>• •Start •and •end •with •spaces •</data> | |
| 611 <data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</
data> | |
| 612 | |
| 613 <data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data> | |
| 614 <data>•123 •Start •with •a •number.•</data> | |
| 615 | |
| 616 <data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data> | |
| 617 <data>•' '' •start •with •case-•ignorable & •case-•insensitive •cha'r'a'cter•</
data> | |
| 618 <data>• ''•aaa' •bbb '•ccc' '•ddd''' '''•eee '''•fff''' •ggg ''•</data> | |
| 619 # Note: apostrophe is case-ignorable. space is not cased. | |
| 620 | |
| 621 ################################################################################
########## | |
| 622 # | |
| 623 # Thai Tests | |
| 624 # | |
| 625 ################################################################################
########## | |
| 626 <locale th> | |
| 627 <word> | |
| 628 # | |
| 629 # Test data originally from the test code source file | |
| 630 # // @suwit -- Thai sample data from GVT Guideline | |
| 631 # | |
| 632 <data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<20
0>\ | |
| 633 \u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<20
0>\ | |
| 634 \u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ | |
| 635 \u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data> | |
| 636 | |
| 637 # Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 | |
| 638 <data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data
> | |
| 639 | |
| 640 <data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<2
00>\ | |
| 641 \u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\ | |
| 642 \u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data> | |
| 643 | |
| 644 <line> | |
| 645 <data>•\u0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\ | |
| 646 \u0020•\u0E1B\u0E34\u0E49\u0E48•\u0E07\u0E2D•\u0E22\u0E39\u0E48•\ | |
| 647 \u0E43\u0E19•\u0E16\u0E49\u0E33•</data> | |
| 648 | |
| 649 # Data originally from intltest RBBITest::TestThaiLineBreak() | |
| 650 # | |
| 651 # \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that | |
| 652 # represents elided letters at the end of a long word. It should be bound to | |
| 653 # the end of the word and not treated as an independent punctuation mark. | |
| 654 # | |
| 655 # the one time where the paiyannoi occurs somewhere other than at the end | |
| 656 # of a word is in the Thai abbreviation for "etc.", which both begins and | |
| 657 # ends with a paiyannoi | |
| 658 # | |
| 659 <line> | |
| 660 <data>•\u0e2a\u0e16\u0e32\u0e19\u0e35\u0e2f•\ | |
| 661 \u0e08\u0e30•\ | |
| 662 \u0e23\u0e30\u0e14\u0e21•\ | |
| 663 \u0e40\u0e08\u0e49\u0e32•\ | |
| 664 \u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48•\ | |
| 665 \u0e2d\u0e2d\u0e01•\ | |
| 666 \u0e21\u0e32•\ | |
| 667 \u0e40\u0e23\u0e48\u0e07•\ | |
| 668 \u0e23\u0e30\u0e1a\u0e32\u0e22•\ | |
| 669 \u0e2d\u0e22\u0e48\u0e32\u0e07•\ | |
| 670 \u0e40\u0e15\u0e47\u0e21•\ | |
| 671 \u0e2f\u0e25\u0e2f•\ | |
| 672 \u0e17\u0e35\u0e48•\ | |
| 673 \u0e19\u0e31\u0e49\u0e19•</data> | |
| 674 | |
| 675 # Data originally from RBBITest::TestMixedThaiLineBreak() | |
| 676 # @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English chara
cters start | |
| 677 # | |
| 678 <line> | |
| 679 <data>•\u0E1B\u0E35•\ | |
| 680 \u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\ | |
| 681 2545 •\ | |
| 682 \u0E40\u0E1B\u0E47\u0E19•\ | |
| 683 \u0E1B\u0E35•\ | |
| 684 \u0E09\u0E25\u0E2D\u0E07•\ | |
| 685 \u0E04\u0E23\u0E1A•\ | |
| 686 \u0E23\u0E2D\u0E1A •\ | |
| 687 \"\u0E52\u0E52\u0E50 •\ | |
| 688 \u0E1b\u0E35\" •\ | |
| 689 \u0E02\u0E2d\u0E07•\ | |
| 690 \u0E01\u0E23\u0E38\u0E07•\ | |
| 691 \u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\ | |
| 692 (\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\ | |
| 693 \u0E2B\u0E23\u0E37\u0E2D •\ | |
| 694 Bangkok)•</data> | |
| 695 | |
| 696 # Data originally from RBBITest::TestMaiyamok() | |
| 697 # The Thai maiyamok character is a shorthand symbol that means "repeat the pre
vious | |
| 698 # word". Instead of appearing as a word unto itself, however, it's kept toget
her | |
| 699 # with the word before it. | |
| 700 # | |
| 701 <line> | |
| 702 <data>•\u0e44\u0e1b\u0e46•\ | |
| 703 \u0e21\u0e32\u0e46•\ | |
| 704 \u0e23\u0e30\u0e2b\u0e27\u0e48\u0e32\u0e07•\ | |
| 705 \u0e01\u0e23\u0e38\u0e07•\ | |
| 706 \u0e40\u0e17\u0e1e•\ | |
| 707 \u0e41\u0e25\u0e30•\ | |
| 708 \u0e40\u0e03\u0e35•\ | |
| 709 \u0e22\u0e07•\ | |
| 710 \u0e43\u0e2b\u0e21\u0e48•</data> | |
| 711 | |
| 712 # Test for #10296 | |
| 713 <line> | |
| 714 <data>•ใช•มั้ย•</data> | |
| 715 <data>•มั๊ยล่ะ•ที่รัก•</data> | |
| 716 | |
| 717 # Test for #10593 | |
| 718 <line> | |
| 719 <data>•เล่น•ผ่าน•ทาง•บลูทูธ•บน•อุปกรณ์•</data> | |
| 720 | |
| 721 # Test for city names #10691 | |
| 722 <line> | |
| 723 <data>•ไป•ที่•ซานฟรานซิสโก•</data> | |
| 724 | |
| 725 # Test for #10630, #10631 | |
| 726 <line> | |
| 727 <data>•แท็ก•แอปพลิเคชัน•เป็น•พิเศษ•</data> | |
| 728 | |
| 729 # Test for #11019 | |
| 730 <line> | |
| 731 <data>•เบ•เบราว์เซอร์•โพ•โพสต์•โพสท์•</data> | |
| 732 | |
| 733 # Test for #11688 | |
| 734 <line> | |
| 735 <data>•อัปเดต•อีเวนต์•</data> | |
| 736 | |
| 737 ################################################################################
########## | |
| 738 # | |
| 739 # Lao Tests | |
| 740 # | |
| 741 ################################################################################
########## | |
| 742 <locale en> | |
| 743 # Basic check for #7647 | |
| 744 <line> | |
| 745 <data>•ສະບາຍດີ•</data> | |
| 746 <data>•ດີ•ຂອບໃຈ•</data> | |
| 747 <data>•ເຈົ້າ•ເວົ້າ•ພາສາ•ອັງກິດ•ໄດ້•ບໍ່•</data> | |
| 748 <data>•ກະລຸນາ•ເວົ້າ•ຊ້າ•ໆ•</data> | |
| 749 | |
| 750 ################################################################################
########## | |
| 751 # | |
| 752 # Burmese/Myanmar Tests | |
| 753 # | |
| 754 ################################################################################
########## | |
| 755 <locale en> | |
| 756 # Basic sanity check for #10326 (some text from http://www.unicode.org/udhr/d/ud
hr_mya.txt) | |
| 757 <line> | |
| 758 <data>•လူ•တိုင်း•သည် •တူညီ •လွတ်လပ်•သော •ဂုဏ်•သိ•က္•ခါ•ဖြ•င့် •လည်းကောင်း၊ •</da
ta> | |
| 759 <data>•တူညီ•လွတ်လပ်•သော •အ•ခွ•င့်•အရေး•များ•ဖြ•င့် •လည်းကောင်း၊ •မွေး•ဖွား•လာ•သူ
များ •ဖြစ်သည်။•</data> | |
| 760 <data>•ထို•သူ•တို့၌ •ပိုင်းခြား •ဝေဖန်•တတ်•သော •ဉာဏ်•နှ•င့် •ကျ•င့်•ဝတ် •သိတတ်•သ
ော •စိတ်•တို့•ရှိ•ကြ၍ •</data> | |
| 761 <data>•ထို•သူ•တို့သည် •အချင်းချင်း •မေတ္တာ•ထား၍ •ဆက်ဆံ•ကျ•င့်•သုံး•</data> | |
| 762 | |
| 763 ################################################################################
########## | |
| 764 # | |
| 765 # Khmer Tests | |
| 766 # | |
| 767 ################################################################################
########## | |
| 768 | |
| 769 # Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 | |
| 770 # from the file testdata/wordsegments.txt | |
| 771 <locale en> | |
| 772 <word> | |
| 773 | |
| 774 <data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data> | |
| 775 <data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data> | |
| 776 <data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data> | |
| 777 #ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data> | |
| 778 <data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>
អាច<200>ចូល<200></data> | |
| 779 #ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data> | |
| 780 <data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់
<200>ព្រះអង្គ<200></data> | |
| 781 <data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data> | |
| 782 <data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data> | |
| 783 <data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></d
ata> | |
| 784 <data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data> | |
| 785 <data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data> | |
| 786 <data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></da
ta> | |
| 787 <data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data> | |
| 788 <data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data> | |
| 789 <data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data> | |
| 790 <data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data> | |
| 791 <data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200>
</data> | |
| 792 <data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data> | |
| 793 <data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data> | |
| 794 #អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data> | |
| 795 <data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data> | |
| 796 <data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data> | |
| 797 <data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data> | |
| 798 | |
| 799 | |
| 800 # | |
| 801 # Jitterbug 3671 Test Case | |
| 802 # | |
| 803 <data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data> | |
| 804 | |
| 805 # | |
| 806 # Trac ticket 5595 Test Case | |
| 807 <data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลา
ง<200>\ | |
| 808 ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200
>ป้า<200>เอ็ม<200>\ | |
| 809 ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<20
0>ไม้<200>\ | |
| 810 สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>
ทาง<200>หลาย<200>\ | |
| 811 ไมล์<200></data> | |
| 812 | |
| 813 ################################################################################
#### | |
| 814 # | |
| 815 # Tailored (locale specific) breaking. | |
| 816 # | |
| 817 ################################################################################
#### | |
| 818 | |
| 819 # Japanese line break tailoring test | |
| 820 | |
| 821 <locale ja> | |
| 822 <line> | |
| 823 <data>•\u3041•\u3043•\u3045•\u31f1•</data> | |
| 824 <locale en> | |
| 825 <line> | |
| 826 <data>•\u3041\u3043\u3045\u31f1•</data> | |
| 827 | |
| 828 # The following data was originally in RBBITest::TestJapaneseWordBreak() | |
| 829 <locale ja> | |
| 830 <word> | |
| 831 <data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u305
9<400>\u306D<400>\u3002•\u000D\u000A•</data> | |
| 832 | |
| 833 # UBreakIteratorType UBRK_WORD, Locale "ja" | |
| 834 # Don't break in runs of hiragana or runs of ideograph, where the latter include
s \u3005 \u3007 \u303B (cldrbug #2009). | |
| 835 # \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC
\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u
308B\u3002 | |
| 836 # modified to work with dbbi code - should verify | |
| 837 | |
| 838 <locale ja> | |
| 839 <word> | |
| 840 <data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々
<400>は<400>ワード<400>で<400>ある<400>。•</data> | |
| 841 | |
| 842 # Test for #10176 (in ja) | |
| 843 <line> | |
| 844 <data>•abc/•s •def•</data> | |
| 845 <data>•abc/\u05D9 •def•</data> | |
| 846 <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> | |
| 847 <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05D
D/\u05D9\u05D5\u05EA•</data> | |
| 848 | |
| 849 | |
| 850 <locale root> | |
| 851 <word> | |
| 852 <data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々
<400>は<400>ワード<400>で<400>ある<400>。•</data> | |
| 853 # The following test is for #10300 | |
| 854 <data>•例えば<400>オーストラリア<400>。•</data> | |
| 855 # The following test is for #10571 | |
| 856 <data>•一部<400>の<400>地域<400>では<400>、<0>ブラジル<400>、<0>インドネシア<400>、<0>オーストリア<400>、<0
>ニュージーランド<400>で<400>ある<400>。•</data> | |
| 857 | |
| 858 # UBreakIteratorType UBRK_SENTENCE, Locale "el" | |
| 859 # Add break after Greek question mark (cldrbug #2069). | |
| 860 # "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " | |
| 861 # "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3" | |
| 862 # which is "Αβ, γδ; Ε ζη; Θ ικ. Λμ νξ! Οπ, Ρς? Σ" | |
| 863 | |
| 864 <locale root> | |
| 865 <sent> | |
| 866 <data>•Αβ, γδ; Ε ζη; Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data> | |
| 867 | |
| 868 <locale el> | |
| 869 <sent> | |
| 870 <data>•Αβ, γδ; •Ε ζη; •Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data> | |
| 871 | |
| 872 # UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" | |
| 873 # Words don't include colon or period (cldrbug #1969). | |
| 874 | |
| 875 <locale en_US> | |
| 876 <word> | |
| 877 <data>•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.
field<200> \ | |
| 878 •for<200> •CS<200>-•types<200>.•</data> | |
| 879 <data>•\uFF92\uFF76\uFF9E<400> •</data> | |
| 880 | |
| 881 <locale en_US_POSIX> | |
| 882 <word> | |
| 883 <data>•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •s
truct<200>.•field<200> \ | |
| 884 •for<200> •CS<200>-•types<200>.•</data> | |
| 885 <data>•\u06c9<200>\uc799\ufffa•</data> | |
| 886 <data>•\uFF92\uFF76\uFF9E<400> •</data> | |
| 887 | |
| 888 | |
| 889 # UBreakIteratorType UBRK_CHARACTER, Locale "th" | |
| 890 # Clusters should not include spacing Thai/Lao vowels (prefix or postfix), excep
t for [SARA] AM (cldrbug #2161). | |
| 891 # Update: As of Unicode 6.1 root has same behavior as th for this. | |
| 892 # | |
| 893 # "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 " | |
| 894 # "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0
E28) " | |
| 895 # "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 " | |
| 896 # which is "กระท่อมรจนา (สุชาติ-จุฑามาศ) เด็กมีปัญหา " | |
| 897 | |
| 898 <locale th> | |
| 899 <char> | |
| 900 <data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E
32• •\ | |
| 901 (•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u
0E32•\u0E28•)• •\ | |
| 902 \u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</da
ta> | |
| 903 | |
| 904 # Finnish line breaking | |
| 905 # | |
| 906 # These rules deal with hyphens when there is a space on the leading side. | |
| 907 # There should be a break opportunity between the space and the hyphen, and not
after the hyphen. | |
| 908 # See CLDR ticket 3029. | |
| 909 # See ICU ticket 8151 | |
| 910 | |
| 911 <locale root> | |
| 912 <line> | |
| 913 <data>•abc •- •def •abc •-•def •abc- •def •abc-•def•</data> # With ASC
II hyphen | |
| 914 <data>•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def•</data> # With Uni
code u2010 hyphen | |
| 915 | |
| 916 <locale fi> | |
| 917 <line> | |
| 918 # TODO: problems with Finnish line break rules cause these two lines to fail. | |
| 919 #<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASC
II hyphen | |
| 920 #<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Uni
code u2010 hyphen | |
| 921 | |
| 922 <data>•abc •- •def •abc •-def •abc- •def •</data> # With ASCII hyphen | |
| 923 <data>•abc •‐ •def •abc •‐def •abc‐ •def •</data> # With Unicode u2010
hyphen | |
| 924 | |
| 925 # Test for #10176 (in fi) | |
| 926 <line> | |
| 927 <data>•abc/•s •def•</data> | |
| 928 <data>•abc/\u05D9 •def•</data> | |
| 929 <data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> | |
| 930 <data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05D
D/\u05D9\u05D5\u05EA•</data> | |
| 931 | |
| 932 ################################################################################
#### | |
| 933 # | |
| 934 # Test CSS line break variants: strict, normal, loose | |
| 935 # | |
| 936 ################################################################################
#### | |
| 937 | |
| 938 <locale ja@lb=strict> | |
| 939 <line> | |
| 940 # •no brk before 3063 •no brk before 301C•no brk btw 2026 •no
brk before FF01• | |
| 941 <data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u
30A2\uFF01\u0020•</data> | |
| 942 | |
| 943 <locale ja@lb=normal> | |
| 944 <line> | |
| 945 # •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •
no brk before FF01• | |
| 946 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•
\u30A2\uFF01\u0020•</data> | |
| 947 | |
| 948 <locale ja@lb=loose> | |
| 949 <line> | |
| 950 # •brk OK before 3063 •brk OK before 301C •brk OK btw 2026
•brk OK before FF01• | |
| 951 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020
•\u30A2•\uFF01\u0020•</data> | |
| 952 | |
| 953 <locale en@lb=strict> | |
| 954 <line> | |
| 955 # •no brk before 3063 •no brk before 301C•no brk btw 2026 •no
brk before FF01• | |
| 956 <data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u
30A2\uFF01\u0020•</data> | |
| 957 | |
| 958 <locale en@lb=normal> | |
| 959 <line> | |
| 960 # •brk OK before 3063 •no brk before 301C •no brk btw 2026 •n
o brk before FF01• | |
| 961 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\
u30A2\uFF01\u0020•</data> | |
| 962 | |
| 963 <locale en@lb=loose> | |
| 964 <line> | |
| 965 # •brk OK before 3063 •no brk before 301C •brk OK btw 2026 •
no brk before FF01• | |
| 966 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026•\u2026\u0020•
\u30A2\uFF01\u0020•</data> | |
| OLD | NEW |