OLD | NEW |
(Empty) | |
| 1 # Copyright (c) 2001-2009 International Business Machines |
| 2 # Corporation and others. All Rights Reserved. |
| 3 # |
| 4 # RBBI Test Data |
| 5 # |
| 6 # File: rbbitst.txt |
| 7 # |
| 8 # The format of this file looks vaguely like some kind of xml-ish markup, |
| 9 # but it is NOT. The syntax is this.. |
| 10 # |
| 11 # <word> any following data is for word break testing |
| 12 # <sent> any following data is for sentence break testing |
| 13 # <line> any following data is for line break testing |
| 14 # <char> any following data is for char break testing |
| 15 # <locale local_name> Switch to the named locale at the next occurrence of <wo
rd>, <sent>, etc. |
| 16 # <data> ... </data> test data. May span multiple lines. |
| 17 # <> Break position, status == 0 |
| 18 # • Break position, status == 0 (Bullet, \u2022) |
| 19 # <nnn> Break position, status == nnn |
| 20 # \ Escape. Normal ICU unescape applied. |
| 21 # \ at end of line -> Line Continuation. Remove both the backslash and t
he new line |
| 22 # |
| 23 # |
| 24 |
| 25 |
| 26 # Temp debugging tests |
| 27 <line> |
| 28 <data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udc
fb•</data> |
| 29 |
| 30 ################################################################################
######## |
| 31 # |
| 32 # |
| 33 # G r a p h e m e C l u s t e r T e s t s |
| 34 # |
| 35 # |
| 36 ################################################################################
########## |
| 37 <char> |
| 38 |
| 39 <data>•a•b•c• •,•\u0666•</data> # Quick Test |
| 40 <data>•\r•\r•\r\n•\r\n•\n•\r•</data> # don't break CR/LF |
| 41 |
| 42 # Always break after controls. Combining chars don't combine with them. |
| 43 <data>•\u0003•\N{COMBINING GRAVE ACCENT}•\r•\N{COMBINING GRAVE ACCENT}•</data> |
| 44 <data>•\u0085•\N{COMBINING MACRON}•A\N{COMBINING MACRON}•</data> |
| 45 |
| 46 # Surrogates |
| 47 <data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data> |
| 48 <data>•\ud800\udc00•\udbff\udfff•a•</data> |
| 49 |
| 50 # Extend (Combining chars) combine. |
| 51 <data>•A\N{COMBINING GRAVE ACCENT}•B•</data> |
| 52 <data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data> |
| 53 <data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•<
/data> |
| 54 |
| 55 <data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304
\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u
0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u03
03\u0304\u0301\u0302\u0303\u0304•</data> |
| 56 |
| 57 # Don't break Hangul Syllables |
| 58 # L : \u1100 |
| 59 # V : \u1161 |
| 60 # T : \u11A8 |
| 61 # LV : \uAC00 |
| 62 # LVT : \uAC01 |
| 63 |
| 64 <data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT |
| 65 <data>•\u1100\u1161•\u1100\u1161•</data> |
| 66 <data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data> |
| 67 <data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data> |
| 68 <data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data> |
| 69 |
| 70 |
| 71 |
| 72 # Hindi combining chars. (An old test) |
| 73 # TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters |
| 74 #<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930• |
| 75 #•\u0939•\u094c•\u0964•</data> |
| 76 #<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</dat
a> |
| 77 |
| 78 |
| 79 # Bug 1587. Tamil. \u0baa\u0bc1 is an Extended Grapheme Cluster |
| 80 <data>•\u0baa\u0bc1•\u0baa\u0bc1•</data> |
| 81 |
| 82 # Regression test for bug 1889 |
| 83 <data>•\u0f40\u0f7d•\u0000•\u0f7e•</data> |
| 84 |
| 85 |
| 86 # 0xffff is a legal character, and should not stop the break iterator early. |
| 87 # (Requires special casing in implementation, which is why it gets a test.) |
| 88 <data>•\uffff•\uffff• •a•</data> |
| 89 |
| 90 # Treat Japanese Half Width voicing marks as combining |
| 91 <data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data> |
| 92 |
| 93 ################################################################################
######## |
| 94 # |
| 95 # |
| 96 # E x t e n d e d G r a p h e m e C l u s t e r T e s t s |
| 97 # |
| 98 # |
| 99 ################################################################################
########## |
| 100 #<xgc> |
| 101 |
| 102 # Plain Vanilla grapheme clusters |
| 103 #<data>•a•b•c•</data> |
| 104 #<data>•a\u0301\u0302• •b\u0303\u0304•</data> |
| 105 |
| 106 # Assorted Hindi combining marks |
| 107 #<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949
• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data> |
| 108 |
| 109 # Thai Clusters |
| 110 # $Prepend $Extend* $PrependBase $Extend*; |
| 111 # |
| 112 #<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02•
•</data> |
| 113 |
| 114 |
| 115 ################################################################################
######## |
| 116 # |
| 117 # |
| 118 # W o r d B o u n d a r y T e s t s |
| 119 # |
| 120 # |
| 121 ################################################################################
########## |
| 122 |
| 123 <word> |
| 124 # |
| 125 # Quick sanity test |
| 126 # |
| 127 <data>•hello<200> •there<200> •goodbye<200></data> |
| 128 <data>•hello<200> •12345<100> •,•</data> |
| 129 |
| 130 |
| 131 # |
| 132 # Test data originally in RBBIAPITest::TestFirstNextFollowing() and TestLastPrev
iousPreceding() |
| 133 # |
| 134 |
| 135 <word> |
| 136 <data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200
>?• •2.25<100></data> |
| 137 |
| 138 |
| 139 |
| 140 # |
| 141 # Data originally from TestDefaultRuleBasedWordIteration() |
| 142 # |
| 143 <data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<20
0> •\u092f\u0939<200> •</data> |
| 144 <data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0
905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data> |
| 145 |
| 146 #Hindi Numbers |
| 147 <data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\
N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<
200>\n•</data> |
| 148 |
| 149 <data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.1
0<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data> |
| 150 |
| 151 <data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200>
•STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<1
00>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•tim
e<200> •</data> |
| 152 |
| 153 #Hangul |
| 154 <data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u111
2\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how
<200> •are<200> •you<200> •</data> |
| 155 |
| 156 |
| 157 # Words containing non-BMP letters |
| 158 <data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATI
CAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200
> •</data> |
| 159 |
| 160 # Unassigned code points |
| 161 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> |
| 162 |
| 163 # Hiragana & Katakana stay together, but separates from each other and Latin. |
| 164 <data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBININ
G ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\
N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA
LETTER N}<300>def<200>#•</data> |
| 165 |
| 166 # Words with interior formatting characters |
| 167 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data
> |
| 168 |
| 169 # to test for bug #4097779 |
| 170 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> |
| 171 |
| 172 |
| 173 # to test for bug #4098467 |
| 174 # What follows is a string of Korean characters (I found it in the Yellow P
ages |
| 175 # ad for the Korean Presbyterian Church of San Francisco, and I hope I tran
scribed |
| 176 # it correctly), first as precomposed syllables, and then as conjoining jam
o. |
| 177 # Both sequences should be semantically identical and break the same way. |
| 178 # precomposed syllables... |
| 179 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad
50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u11
0b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11
bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> |
| 180 |
| 181 <data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •
</data> |
| 182 |
| 183 <data>•\u06c9\uc799\ufffa<200></data> |
| 184 |
| 185 # |
| 186 # Try some words from other scripts. |
| 187 # |
| 188 |
| 189 # Try some words from other scripts. |
| 190 # Greek, Cyrillic, Hebrew, Arabic, Arabic-Indic digits, Georgian, Latin |
| 191 # |
| 192 <data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200
> •ABC<200> •</data> |
| 193 |
| 194 <data>•\u0301•A<200></data> |
| 195 |
| 196 |
| 197 # |
| 198 # Hindi word break tests, imported from the old RBBI tests. |
| 199 # An historical note: a much earlier version of ICU break iterators had a nu
mber |
| 200 # of special case rules for Hindi, which were tested by an earlier version of |
| 201 # this test data. The current RBBI rules do not special case Hindi in |
| 202 # any way, making this test data much less significant. |
| 203 # |
| 204 <data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u092
8\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u09
38\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903
<200> |
| 205 •\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<20
0>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<20
0> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\
u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u09
67\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<10
0> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u
0930<200>\r•</data> |
| 206 |
| 207 # |
| 208 # Failures from monkey tests |
| 209 # |
| 210 <data>•\u8527<400>\u02ba<200>\u0027\u0d42•\u00b7•\u09ea<100></data> |
| 211 |
| 212 # |
| 213 # Jitterbug 5276 - treat Japanese half width voicing marks as Grapheme Extend |
| 214 # |
| 215 <data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data> |
| 216 |
| 217 ################################################################################
######## |
| 218 # |
| 219 # |
| 220 # S e n t e n c e B o u n d a r y T e s t s |
| 221 # |
| 222 # |
| 223 ################################################################################
########## |
| 224 |
| 225 |
| 226 # |
| 227 # Test data originally from RBBI RBBITest::TestDefaultRuleBasedSentenceIteration
() |
| 228 # |
| 229 <sent> |
| 230 |
| 231 |
| 232 <sent> |
| 233 <data>•This\n<100></data> |
| 234 <data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \ |
| 235 doing? •This\n<100> costs $20,00,000. •</data> |
| 236 |
| 237 |
| 238 # Sentence ending in a quote. |
| 239 <data>•"Sentence ending with a quote." •Bye.•</data> |
| 240 |
| 241 # Sentence, and test data, ending without a period or other terminator. |
| 242 <data>•Here is a random sentence, no ending period<100></data> |
| 243 |
| 244 |
| 245 <data>• (This is it). •Testing the sentence iterator. •\ |
| 246 "This isn't it." •Hi! \ |
| 247 •This is a simple sample sentence. •(This is it.) •This is a simple sample sente
nce. •\ |
| 248 "This isn't it." •\ |
| 249 Hi! •This is a simple sample sentence. •It does not have to make any sense as yo
u can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura
. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don'
t rock the boat.\u2029•Because I am the daddy, that is why. |
| 250 •Not on my time (el timo.)! •</data> |
| 251 |
| 252 <data>•Hello. •So what!!\u2029•"But now," he said, \ |
| 253 "I know!" •\ |
| 254 Harris thumbed down several, including "Away We Go" (which became the huge succe
ss Oklahoma!). •One species, B. anthracis, is highly virulent. |
| 255 •Wolf said about Sounder:\ |
| 256 "Beautifully thought-out and directed." •\ |
| 257 Have you ever said, "This is where\tI shall live"? •He answered, \ |
| 258 "You may not!" •Another popular saying is: "How do you do?". \n•\ |
| 259 Yet another popular saying is: \ |
| 260 'I'm fine thanks.' •\ |
| 261 What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tall!
!\ |
| 262 •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100></data> |
| 263 |
| 264 <data>•No breaks when . is surrounded by UPPER.Case letters. •</data> |
| 265 <data>•No breaks when . is followed by Numeric .4 a.4 C.4 3.1 .•</data> |
| 266 <data>•No breaks when . is followed by a lower, with possible intervening punct
.,a .$a .)a. •</data> |
| 267 |
| 268 # |
| 269 # Sentence Breaks: no break at the boundary between CJK and other letters |
| 270 # |
| 271 <data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\
u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029
•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4JAVA\u821c\u8165\u7
fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d4
6\u97e5\u7751\u3002•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4
\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u85
60\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2048•He said, "I can go there."\u2
029•Bye, now.•</data> |
| 272 |
| 273 # |
| 274 # Treat fullwidth variants of .!? the same as their |
| 275 # normal counterparts |
| 276 # |
| 277 <data>•I know I'm right\uff0e •Right\uff1f •Right\uff01 •</data> |
| 278 |
| 279 |
| 280 # |
| 281 # Don't break sentences at boundary between CJK and digits |
| 282 # |
| 283 <data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e48888\u821c\u8
165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u251
0\u5d46\u97e5\u7751\u3002•Bye, now<100></data> |
| 284 |
| 285 # |
| 286 # Breaks around '(' following a sentence TERM. (Rule 9) |
| 287 # |
| 288 <data>•How do you do?(•Fine). •</data> |
| 289 <data>•How do you do? •(Fine). •</data> |
| 290 <data>•How do you do?(•fine). •</data> |
| 291 <data>•How do you do? •(fine). •</data> |
| 292 |
| 293 # |
| 294 <data>•Hello.123<100></data> # Rule 6 |
| 295 <data>•Hello?•123<100></data> |
| 296 |
| 297 <data>•HELLO.Bye<100></data> # Rule 7 |
| 298 <data>•HELLO?•Bye<100></data> |
| 299 |
| 300 <data>•Hello.goodbye<100></data> #Rule 8 |
| 301 <data>•Hello. •Goodbye<100></data> |
| 302 <data>•Hello. goodbye<100></data> |
| 303 |
| 304 |
| 305 |
| 306 # |
| 307 # test for bug #4158381: No breaks when there are no terminators around |
| 308 # |
| 309 <data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\
<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible
, work the same on all platforms. •</data> |
| 310 <data>•Another test.\u2029•</data> |
| 311 |
| 312 # test for bug #4143071: Make sure sentences that end with digits |
| 313 # work right |
| 314 # |
| 315 <data>•Today is the 27th of May, 1998. •</data> |
| 316 <data>•Tomorrow with be 28 May 1998. •</data> |
| 317 <data>•The day after will be the 30th.\u2029•</data> |
| 318 |
| 319 # test for bug #4152416: Make sure sentences ending with a capital |
| 320 # letter are treated correctly |
| 321 # |
| 322 <data>•The type of all primitive \<code>boolean\</code> values accessed in the t
arget VM. •Calls to xxx will return an implementor of this interface. \u2029•<
/data> |
| 323 |
| 324 # test for bug #4152117: Make sure sentence breaking is handling |
| 325 # punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS |
| 326 # HERE TO MAKE SURE IT DOESN'T CROP UP] |
| 327 # |
| 328 <data>•Constructs a randomly generated BigInteger, uniformly distributed over th
e range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •Th
e uniformity of the distribution assumes that a fair source of random bits is pr
ovided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-n
egative biginteger. \n•Ahh abc. |
| 329 •</data> |
| 330 |
| 331 # sentence breaks for Hindi, which uses the Devanagari script |
| 332 # make sure there is sentence break after ?,danda(hindi phrase separator)
, |
| 333 # fullstop followed by space. (VERY old test) |
| 334 # |
| 335 <data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905
\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u09
4d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\ |
| 336 \u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u09
3e\n\ |
| 337 <100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 m
eans "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u09
05\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u09
3f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. •</data> |
| 338 |
| 339 # Regression test for bug #1984, Sentence break in Arabic text. |
| 340 |
| 341 <data>\ |
| 342 •\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645
\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\
u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0
648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u062
7\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\
u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0
623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u064
5\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\
u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u
062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u00
22\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626
\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u
062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u06
30\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645
\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u
062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u06
49\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641
\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data> |
| 343 |
| 344 # Try a few more of the less common sentence endings. |
| 345 <data>•Hello, world\u3002 •Hello, world\u1803 •Hello, world\u2048 •Hello, world\
u203c •Let's end here. •</data> |
| 346 |
| 347 |
| 348 |
| 349 |
| 350 ################################################################ |
| 351 # |
| 352 # |
| 353 # L I N E B R E A K |
| 354 # |
| 355 # |
| 356 ################################################################ |
| 357 |
| 358 <line> |
| 359 # |
| 360 # Test Character for each of the line break classes. |
| 361 # |
| 362 # 00A1;AI # INVERTED EXCLAMATION MARK ¡ |
| 363 # 0041;AL # LATIN CAPITAL LETTER A |
| 364 # 0009;BA # <control> |
| 365 # 00B4;BB # ACUTE ACCENT |
| 366 # 000C;BK # <control> |
| 367 # 2014;B2 # EM DASH |
| 368 # FFFC;CB # OBJECT REPLACEMENT CHARACTER |
| 369 # 0029;CL # RIGHT PARENTHESIS |
| 370 # 0301;CM # COMBINING ACUTE ACCENT |
| 371 # 0021;EX # EXCLAMATION MARK |
| 372 # 00A0;GL # NO-BREAK SPACE |
| 373 # 002D;HY # HYPHEN-MINUS |
| 374 # 4E00;ID # <CJK Ideograph, First> |
| 375 # 2024;IN # ONE DOT LEADER |
| 376 # 002C;IS # COMMA |
| 377 # 000A;LF # <control> |
| 378 # 0E5A;NS # THAI CHARACTER ANGKHANKHU |
| 379 # 0032;NU # DIGIT TWO |
| 380 # 0028;OP # LEFT PARENTHESIS |
| 381 # 0025;PO # PERCENT SIGN |
| 382 # 0024;PR # DOLLAR SIGN |
| 383 # 0022;QU # QUOTATION MARK |
| 384 # 0E01;SA # THAI CHARACTER KO KAI |
| 385 # DB7F;SG # Surrogate |
| 386 # 0020;SP # SPACE |
| 387 # 002F;SY # SOLIDUS / |
| 388 # F8FF;XX # Private Use |
| 389 # 200B;ZW # ZERO WIDTH SPACE |
| 390 |
| 391 |
| 392 # 2b Always break at end of text |
| 393 |
| 394 <data>• •\u00A1•</data> |
| 395 <data>• •\u0041•</data> |
| 396 <data>• •\u0009•</data> |
| 397 <data>• •\u00B4•</data> |
| 398 <data>• \u000C<100></data> # LB3C × BK |
| 399 <data>• •\u2014•</data> |
| 400 <data>• •\uFFFC•</data> |
| 401 <data>• \u0029•</data> # LB 8 × CL |
| 402 # <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: S
P CM |
| 403 <data>• \u0021•</data> # LB 8 × EX |
| 404 #<data>• \u00A0•</data> # LB 11b × GL TODO: fix. |
| 405 <data>• •\u002D•</data> |
| 406 <data>• •\u4E00•</data> |
| 407 <data>• •\u2024•</data> |
| 408 <data>• \u002C•</data> # LB 8 × IS |
| 409 <data>• \u000A<100></data> # LB3C × ( BK | CR | LF | NL ) |
| 410 <data>• •\u0E5A•</data> |
| 411 <data>• •\u0032•</data> |
| 412 <data>• •\u0028•</data> |
| 413 <data>• •\u0025•</data> |
| 414 <data>• •\u0024•</data> |
| 415 <data>• •\u0022•</data> |
| 416 <data>• •\u0E01•</data> |
| 417 <data>• •\uDB7F•</data> |
| 418 <data>• \u0020•</data> # LB4 - don't break before space. |
| 419 <data>• \u002F•</data> # LB 8 × SY |
| 420 <data>• •\uF8FF•</data> |
| 421 <data>• \u200B•</data> # LB4 - don't break before ZW |
| 422 |
| 423 |
| 424 # 3a Always break after hard line breaks. |
| 425 # 3c Never break before hard line breaks. |
| 426 |
| 427 <data>• •\u00A1\u2028<100>\u00A1•</data> |
| 428 <data>• •\u0041\u2028<100>\u0041•</data> |
| 429 <data>• •\u0009\u2028<100>\u0009•</data> |
| 430 <data>• •\u00B4\u2028<100>\u00B4•</data> |
| 431 <data>• \u000C<100>\u2028<100>\u000C<100></data> |
| 432 <data>• •\u2014\u2028<100>\u2014•</data> |
| 433 <data>• •\uFFFC\u2028<100>\uFFFC•</data> |
| 434 <data>• \u0029\u2028<100>\u0029•</data> |
| 435 #<data>• \u0301\u2028<100>\u0301•</data> # TODO: fix. |
| 436 <data>• \u0021\u2028<100>\u0021•</data> |
| 437 #<data>• \u00A0\u2028<100>\u00A0•</data> # TODO: fix |
| 438 <data>• •\u002D\u2028<100>\u002D•</data> |
| 439 <data>• •\u4E00\u2028<100>\u4E00•</data> |
| 440 <data>• •\u2024\u2028<100>\u2024•</data> |
| 441 <data>• \u002C\u2028<100>\u002C•</data> |
| 442 <data>• \u000A<100>\u2028<100>\u000A<100></data> |
| 443 <data>• •\u0E5A\u2028<100>\u0E5A•</data> |
| 444 <data>• •\u0032\u2028<100>\u0032•</data> |
| 445 <data>• •\u0028\u2028<100>\u0028•</data> |
| 446 <data>• •\u0025\u2028<100>\u0025•</data> |
| 447 <data>• •\u0024\u2028<100>\u0024•</data> |
| 448 <data>• •\u0022\u2028<100>\u0022•</data> |
| 449 <data>• •\u0E01\u2028<100>\u0E01•</data> |
| 450 <data>• •\uDB7F\u2028<100>\uDB7F•</data> |
| 451 <data>• \u0020\u2028<100>\u0020•</data> |
| 452 <data>• \u002F\u2028<100>\u002F•</data> |
| 453 <data>• •\uF8FF\u2028<100>\uF8FF•</data> |
| 454 <data>• \u200B\u2028<100>\u200B•</data> |
| 455 |
| 456 |
| 457 # |
| 458 # Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBase
dLineIteration() |
| 459 # |
| 460 |
| 461 <line> |
| 462 |
| 463 <data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•importa
nt) •sentence. |
| 464 <100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\
n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data> |
| 465 |
| 466 <line> |
| 467 <data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00
a0bar |
| 468 <100>How, •are, •you? •This, •costs •$20,00,000.•</data> |
| 469 |
| 470 # test for bug #4068133 |
| 471 # |
| 472 <data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3
001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data> |
| 473 |
| 474 # to test for bug #4086052 |
| 475 <data>•foo\u00a0bar•</data> |
| 476 |
| 477 # to test for bug #4097920 |
| 478 <data>•dog,cat,mouse •(one)•(two)\n<100></data> |
| 479 |
| 480 # to test for bug #4035266 |
| 481 <data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data> |
| 482 |
| 483 |
| 484 # to test for bug #4098467 |
| 485 # What follows is a string of Korean characters (I found it in the Yellow P
ages |
| 486 # ad for the Korean Presbyterian Church of San Francisco, and I hope I tran
scribed |
| 487 # it correctly), first as precomposed syllables, and then as conjoining jam
o. |
| 488 # Both sequences should be semantically identical and break the same way. |
| 489 # precomposed syllables... (I == Rich Gillam?) |
| 490 # |
| 491 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•
</data> |
| 492 |
| 493 # conjoining jamo... |
| 494 # TODO: rules update needed |
| 495 #<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u
11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u
1100\u116d•\u1112\u116c•</data> |
| 496 |
| 497 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd |
| 498 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> |
| 499 |
| 500 # Surrogate line break tests. |
| 501 # |
| 502 <data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data> |
| 503 |
| 504 # Regression for bug 836 |
| 505 # Note: Unicode 5.1 changed this behavior |
| 506 # Unicode 5.2 changed it again, there is no break following the '(
' |
| 507 <data>•AAA(AAA •</data> |
| 508 |
| 509 # Try some words from other scripts. |
| 510 # Greek, Cyrillic, Hebrew, Arabic, Arabic-Indic digits, Georgian, Latin |
| 511 # |
| 512 <data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data> |
| 513 |
| 514 # |
| 515 # ticket #4853: unpaired surrogates should behave like AL |
| 516 # |
| 517 <data>•abc\ud801xyz•</data> |
| 518 |
| 519 # |
| 520 # Regression tests for failures that originally came from the monkey test. |
| 521 # Monkey test failure lines can, with slight reformatting, be copied into th
is section |
| 522 # as test cases. The error display from here is more informative. |
| 523 # |
| 524 <data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udc
fb•</data> |
| 525 <data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0
fd0\u000a<100>\u20a3•</data> |
| 526 <data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0
085<100>\u6cc4\u2024\u202f\ufffc•</data> |
| 527 |
| 528 |
| 529 ################################################################################
######## |
| 530 # |
| 531 # |
| 532 # T i t l e B o u n d a r y T e s t s |
| 533 # |
| 534 # |
| 535 ################################################################################
########## |
| 536 <title> |
| 537 <data>•Here •is •a •short •sample •sentence. •And •another.•</data> |
| 538 <data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. •AND •ANOTHER.•</data> |
| 539 <data>• •Start •and •end •with •spaces •</data> |
| 540 <data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</
data> |
| 541 |
| 542 <data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data> |
| 543 <data>•123 •Start •with •a •number.•</data> |
| 544 |
| 545 <data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data> |
| 546 |
| 547 |
| 548 ################################################################################
########## |
| 549 # |
| 550 # Thai Tests |
| 551 # |
| 552 ################################################################################
########## |
| 553 <locale th> |
| 554 <word> |
| 555 # |
| 556 # Test data originally from the test code source file |
| 557 # // @suwit -- Thai sample data from GVT Guideline |
| 558 # |
| 559 <data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<20
0>\ |
| 560 \u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<20
0>\ |
| 561 \u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ |
| 562 \u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data> |
| 563 |
| 564 # |
| 565 # Jitterbug 3671 Test Case |
| 566 # |
| 567 <data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data> |
| 568 |
| 569 # |
| 570 # Trac ticket 5595 Test Case |
| 571 <data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลา
ง<200>\ |
| 572 ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200
>ป้า<200>เอ็ม<200>\ |
| 573 ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<20
0>ไม้<200>\ |
| 574 สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>
ทาง<200>หลาย<200>\ |
| 575 ไมล์<200></data> |
| 576 |
| 577 |
OLD | NEW |