OLD | NEW |
| (Empty) |
1 # | |
2 # Copyright (C) 2002-2013, International Business Machines Corporation | |
3 # and others. All Rights Reserved. | |
4 # | |
5 # file: word_ja.txt | |
6 # | |
7 # ICU Word Break Rules | |
8 # See Unicode Standard Annex #29. | |
9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 | |
10 # | |
11 # Note: Updates to word.txt will usually need to be merged into | |
12 # word_POSIX.txt also. | |
13 | |
14 ############################################################################## | |
15 # | |
16 # Character class definitions from TR 29 | |
17 # | |
18 ############################################################################## | |
19 | |
20 !!chain; | |
21 | |
22 | |
23 # | |
24 # Character Class Definitions. | |
25 # | |
26 | |
27 $CR = [\p{Word_Break = CR}]; | |
28 $LF = [\p{Word_Break = LF}]; | |
29 $Newline = [\p{Word_Break = Newline}]; | |
30 $Extend = [\p{Word_Break = Extend}]; | |
31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | |
32 $Format = [\p{Word_Break = Format}]; | |
33 $Katakana = [\p{Word_Break = Katakana}]; | |
34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | |
35 $ALetter = [\p{Word_Break = ALetter}]; | |
36 $Single_Quote = [\p{Word_Break = Single_Quote}]; | |
37 $Double_Quote = [\p{Word_Break = Double_Quote}]; | |
38 # Move the two full stop characters (U+002E and U+FF0E) from $MidNumLet to | |
39 # $MidNum so that a hostname breaks into its components, at the cost of | |
40 # also breaking 'e.g.' and 'i.e.'. | |
41 # $MidNumLet is used in rules 6/7 (the rules of interest) and in rules 11/12. | |
42 # Because it is OR'd with $MidNum in rules 11/12, those rules are unaffected, | |
43 # while rules 6/7 revert to the older behavior that is wanted here. | |
44 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; | |
45 $MidLetter = [\p{Word_Break = MidLetter}]; | |
46 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; | |
47 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; # includes fullwidth digits | |
48 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | |
49 | |
50 $Han = [:Han:]; | |
51 $Hiragana = [:Hiragana:]; | |
52 | |
53 | |
54 # Dictionary character set, for triggering language-based break engines. Currently | |
55 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | |
56 # 5.0 or later as the definition of Complex_Context was corrected to include all | |
57 # characters requiring dictionary break. | |
58 | |
59 $Control = [\p{Grapheme_Cluster_Break = Control}]; | |
60 $HangulSyllable = [\uac00-\ud7a3]; | |
61 $ComplexContext = [:LineBreak = Complex_Context:]; | |
62 $KanaKanji = [$Han $Hiragana $Katakana]; # NOTE(review): not referenced by any rule visible in this file | |
63 $dictionary = [$ComplexContext]; | |
64 | |
65 $ALetterPlus = [$ALetter [$ComplexContext-$Extend-$Control]]; # ALetter plus Complex_Context minus Extend and Control | |
66 | |
67 | |
68 # | |
69 # Rule 4: ignore Format and Extend characters, | |
70 # except when they appear at the beginning of a region of text. | |
71 # Each ...Ex variable below is its base class followed by any run of Extend/Format characters. | |
72 # TODO: check if handling of katakana in dictionary makes rules incorrect/void | |
73 $KatakanaEx = $Katakana ($Extend | $Format)*; | |
74 $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*; | |
75 $ALetterEx = $ALetterPlus ($Extend | $Format)*; | |
76 $Single_QuoteEx = $Single_Quote ($Extend | $Format)*; | |
77 $Double_QuoteEx = $Double_Quote ($Extend | $Format)*; | |
78 $MidNumLetEx = $MidNumLet ($Extend | $Format)*; | |
79 $MidLetterEx = $MidLetter ($Extend | $Format)*; | |
80 $MidNumEx = $MidNum ($Extend | $Format)*; | |
81 $NumericEx = $Numeric ($Extend | $Format)*; | |
82 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; | |
83 $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*; | |
84 | |
85 $Ideographic = [\p{Ideographic} [\u3005 \u3007 \u303B]]; | |
86 $HiraganaEx = $Hiragana ($Extend | $Format)*; | |
87 $IdeographicEx = $Ideographic ($Extend | $Format)*; | |
88 | |
89 ## ------------------------------------------------- | |
90 | |
91 !!forward; | |
92 | |
93 | |
94 # Rule 3 - CR x LF | |
95 # | |
96 $CR $LF; | |
97 | |
98 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning | |
99 # of a region of Text. The rule here comes into play when the start of text | |
100 # begins with a group of Format chars, or with a "word" consisting of a single | |
101 # char that is not in any of the listed word break categories followed by | |
102 # format char(s), or is not a CJK dictionary character. | |
103 [^$CR $LF $Newline]? ($Extend | $Format)+; | |
104 | |
105 $NumericEx {100}; | |
106 $ALetterEx {200}; | |
107 $HangulSyllable {200}; | |
108 $Hebrew_LetterEx{200}; | |
109 $KatakanaEx {400}; # note: these status values override those from rule 5 | |
110 $HiraganaEx {400}; # by virtue of being numerically larger. | |
111 $IdeographicEx {400}; # | |
112 | |
113 # | |
114 # rule 5 | |
115 # Do not break between most letters. | |
116 # | |
117 ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; | |
118 | |
119 # rule 6 and 7 | |
120 ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; | |
121 | |
122 # rule 7a | |
123 $Hebrew_LetterEx $Single_QuoteEx {200}; | |
124 | |
125 # rule 7b and 7c | |
126 $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; | |
127 | |
128 # rule 8 | |
129 | |
130 $NumericEx $NumericEx {100}; | |
131 | |
132 # rule 9 | |
133 | |
134 ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; | |
135 | |
136 # rule 10 | |
137 | |
138 $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; | |
139 | |
140 # rule 11 and 12 | |
141 | |
142 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; | |
143 | |
144 # rule 13 | |
145 # to be consistent with $KanaKanji $KanaKanji, changed | |
146 # from 300 to 400. | |
147 # See also TestRuleStatus in intltest/rbbiapts.cpp | |
148 $KatakanaEx $KatakanaEx {400}; | |
149 $HiraganaEx $HiraganaEx {400}; | |
150 $IdeographicEx $IdeographicEx {400}; | |
151 | |
152 # rule 13a/b - do not break between letters, digits or Katakana and ExtendNumLet | |
153 # Every class uses its ...Ex form so trailing Extend/Format characters stay attached. | |
154 $ALetterEx $ExtendNumLetEx {200}; # (13a) | |
155 $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) | |
156 $NumericEx $ExtendNumLetEx {100}; # (13a) | |
157 $KatakanaEx $ExtendNumLetEx {400}; # (13a) | |
158 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) | |
159 | |
160 $ExtendNumLetEx $ALetterEx {200}; # (13b) | |
161 $ExtendNumLetEx $Hebrew_LetterEx {200}; # (13b) | |
162 $ExtendNumLetEx $NumericEx {100}; # (13b) | |
163 $ExtendNumLetEx $KatakanaEx {400}; # (13b) | |
164 | |
165 # rule 13c - do not break between two Regional_Indicator characters | |
166 | |
167 $Regional_IndicatorEx $Regional_IndicatorEx; | |
168 | |
169 ## ------------------------------------------------- | |
170 | |
171 !!reverse; | |
172 # Mirror images of the forward ...Ex variables: the Format/Extend run comes before the base class. | |
173 $BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter; | |
174 $BackALetterEx = ($Format | $Extend)* $ALetterPlus; | |
175 $BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote; | |
176 $BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote; | |
177 $BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; | |
178 $BackNumericEx = ($Format | $Extend)* $Numeric; | |
179 $BackMidNumEx = ($Format | $Extend)* $MidNum; | |
180 $BackMidLetterEx = ($Format | $Extend)* $MidLetter; | |
181 $BackKatakanaEx = ($Format | $Extend)* $Katakana; | |
182 $BackHiraganaEx = ($Format | $Extend)* $Hiragana; | |
183 $BackIdeographicEx = ($Format | $Extend)* $Ideographic; | |
184 $BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet; | |
185 $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator; | |
186 | |
187 # rule 3 - the forward CR LF pair, written in reverse order | |
188 $LF $CR; | |
189 | |
190 # rule 4 - Format/Extend run, optionally followed by one character that is not CR, LF or Newline | |
191 ($Format | $Extend)* [^$CR $LF $Newline]?; | |
192 | |
193 # rule 5 | |
194 | |
195 ($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx); | |
196 | |
197 # rule 6 and 7 | |
198 | |
199 ($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx); | |
200 | |
201 # rule 7a | |
202 $BackSingle_QuoteEx $BackHebrew_LetterEx; | |
203 | |
204 # Rule 7b and 7c | |
205 $BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx; | |
206 | |
207 # rule 8 | |
208 | |
209 $BackNumericEx $BackNumericEx; | |
210 | |
211 # rule 9 | |
212 | |
213 $BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx); | |
214 | |
215 # rule 10 | |
216 | |
217 ($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx; | |
218 | |
219 # rule 11 and 12 | |
220 | |
221 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx; | |
222 | |
223 # rule 13 | |
224 | |
225 $BackKatakanaEx $BackKatakanaEx; | |
226 $BackHiraganaEx $BackHiraganaEx; | |
227 $BackIdeographicEx $BackIdeographicEx; | |
228 | |
229 # rules 13 a/b | |
230 # | |
231 $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); | |
232 ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; | |
233 | |
234 # rule 13c | |
235 | |
236 $BackRegional_IndicatorEx $BackRegional_IndicatorEx; | |
237 | |
238 ## ------------------------------------------------- | |
239 | |
240 !!safe_reverse; | |
241 | |
242 # rule 4 | |
243 ($Extend | $Format)+ .?; | |
244 | |
245 # rule 6 | |
246 ($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); | |
247 | |
248 # rule 7b | |
249 $Double_Quote $BackHebrew_LetterEx; | |
250 | |
251 | |
252 # rule 11 | |
253 ($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; | |
254 | |
255 # For dictionary-based break | |
256 $dictionary $dictionary; | |
257 | |
258 ## ------------------------------------------------- | |
259 | |
260 !!safe_forward; | |
261 | |
262 # rule 4 - a run of Extend/Format characters, optionally followed by any one character | |
263 ($Extend | $Format)+ .?; | |
264 | |
265 # rule 6 - MidLetter, MidNumLet or Single_Quote followed by a letter | |
266 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); | |
267 | |
268 # rule 7b - Double_Quote followed by a Hebrew letter | |
269 $Double_QuoteEx $Hebrew_LetterEx; | |
270 | |
271 # rule 11 - MidNum, MidNumLet or Single_Quote followed by a digit | |
272 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; | |
273 | |
274 # For dictionary-based break: two adjacent dictionary characters | |
275 $dictionary $dictionary; | |
OLD | NEW |