OLD | NEW |
1 # | 1 # |
2 # Copyright (C) 2002-2010, International Business Machines Corporation | 2 # Copyright (C) 2002-2010, International Business Machines Corporation |
3 # and others. All Rights Reserved. | 3 # and others. All Rights Reserved. |
4 # | 4 # |
5 # file: word.txt | 5 # file: word.txt |
6 # | 6 # |
7 # ICU Word Break Rules | 7 # ICU Word Break Rules |
8 # See Unicode Standard Annex #29. | 8 # See Unicode Standard Annex #29. |
9 # These rules are based on UAX-29 Revision 16 for Unicode 6.0 | 9 # These rules are based on UAX-29 Revision 16 for Unicode 6.0 |
10 # | 10 # |
(...skipping 11 matching lines...) Expand all Loading... |
22 | 22 |
23 # | 23 # |
24 # Character Class Definitions. | 24 # Character Class Definitions. |
25 # | 25 # |
26 | 26 |
27 $CR = [\p{Word_Break = CR}]; | 27 $CR = [\p{Word_Break = CR}]; |
28 $LF = [\p{Word_Break = LF}]; | 28 $LF = [\p{Word_Break = LF}]; |
29 $Newline = [\p{Word_Break = Newline}]; | 29 $Newline = [\p{Word_Break = Newline}]; |
30 $Extend = [\p{Word_Break = Extend}]; | 30 $Extend = [\p{Word_Break = Extend}]; |
31 $Format = [\p{Word_Break = Format}]; | 31 $Format = [\p{Word_Break = Format}]; |
| 32 $Hiragana = [:Hiragana:]; |
32 $Katakana = [\p{Word_Break = Katakana}]; | 33 $Katakana = [\p{Word_Break = Katakana}]; |
| 34 $Han = [:Han:]; |
33 $ALetter = [\p{Word_Break = ALetter}]; | 35 $ALetter = [\p{Word_Break = ALetter}]; |
34 $MidNumLet = [\p{Word_Break = MidNumLet}]; | 36 # Remove two full stop characters from $MidNumLet and add them to $MidNum |
| 37 # to break a hostname into its components at the cost of breaking |
| 38 # 'e.g.' and 'i.e.' as well. |
| 39 # $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. |
| 40 # Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected |
| 41 # while rules 6/7 are reverted to the old behavior we want. |
| 42 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; |
35 $MidLetter = [\p{Word_Break = MidLetter}]; | 43 $MidLetter = [\p{Word_Break = MidLetter}]; |
36 $MidNum = [\p{Word_Break = MidNum}]; | 44 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; |
37 $Numeric = [\p{Word_Break = Numeric}]; | 45 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits |
38 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | 46 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
39 | 47 |
| 48 # Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'. |
| 49 $HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]]; |
| 50 # U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by |
| 51 # the current rule 6/7. |
| 52 $HebrewMidLet = [\u0022]; |
40 | 53 |
41 # Dictionary character set, for triggering language-based break engines. Currently | 54 # Dictionary character set, for triggering language-based break engines. Currently |
42 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | 55 # limited to LineBreak=Complex_Context and CJK. Note that this set only works |
43 # 5.0 or later as the definition of Complex_Context was corrected to include all | 56 # in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all |
44 # characters requiring dictionary break. | 57 # characters requiring dictionary break. |
45 | 58 |
46 $dictionary = [:LineBreak = Complex_Context:]; | |
47 $Control = [\p{Grapheme_Cluster_Break = Control}]; | 59 $Control = [\p{Grapheme_Cluster_Break = Control}]; |
48 $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not | 60 $HangulSyllable = [\uac00-\ud7a3]; |
49 # include the dictionary characters. | 61 $ComplexContext = [:LineBreak = Complex_Context:]; |
| 62 $KanaKanji = [$Han $Hiragana $Katakana]; |
| 63 $dictionaryCJK = [$KanaKanji $HangulSyllable]; |
| 64 $dictionary = [$ComplexContext $dictionaryCJK]; |
| 65 |
| 66 # leave CJK scripts out of ALetterPlus |
| 67 $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; |
| 68 |
50 | 69 |
51 # | 70 # |
52 # Rules 4 Ignore Format and Extend characters, | 71 # Rules 4 Ignore Format and Extend characters, |
53 # except when they appear at the beginning of a region of text. | 72 # except when they appear at the beginning of a region of text. |
54 # | 73 # |
| 74 # TODO: check if handling of katakana in dictionary makes rules incorrect/void. |
55 $KatakanaEx = $Katakana ($Extend | $Format)*; | 75 $KatakanaEx = $Katakana ($Extend | $Format)*; |
56 $ALetterEx = $ALetterPlus ($Extend | $Format)*; | 76 $ALetterEx = $ALetterPlus ($Extend | $Format)*; |
57 $MidNumLetEx = $MidNumLet ($Extend | $Format)*; | 77 $MidNumLetEx = $MidNumLet ($Extend | $Format)*; |
58 $MidLetterEx = $MidLetter ($Extend | $Format)*; | 78 $MidLetterEx = $MidLetter ($Extend | $Format)*; |
59 $MidNumEx = $MidNum ($Extend | $Format)*; | 79 $MidNumEx = $MidNum ($Extend | $Format)*; |
60 $NumericEx = $Numeric ($Extend | $Format)*; | 80 $NumericEx = $Numeric ($Extend | $Format)*; |
61 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; | 81 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; |
| 82 $HebrewLetEx = $HebrewLet ($Extend | $Format)*; |
62 | 83 |
63 $Hiragana = [\p{script=Hiragana}]; | |
64 $Ideographic = [\p{Ideographic}]; | 84 $Ideographic = [\p{Ideographic}]; |
65 $HiraganaEx = $Hiragana ($Extend | $Format)*; | 85 $HiraganaEx = $Hiragana ($Extend | $Format)*; |
66 $IdeographicEx = $Ideographic ($Extend | $Format)*; | 86 $IdeographicEx = $Ideographic ($Extend | $Format)*; |
67 | 87 |
68 ## ------------------------------------------------- | 88 ## ------------------------------------------------- |
69 | 89 |
70 !!forward; | 90 !!forward; |
71 | 91 |
72 | 92 |
73 # Rule 3 - CR x LF | 93 # Rule 3 - CR x LF |
74 # | 94 # |
75 $CR $LF; | 95 $CR $LF; |
76 | 96 |
77 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning | 97 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning |
78 # of a region of Text. The rule here comes into play when the start of text | 98 # of a region of Text. The rule here comes into play when the start of text |
79 # begins with a group of Format chars, or with a "word" consisting of a single | 99 # begins with a group of Format chars, or with a "word" consisting of a single |
80 # char that is not in any of the listed word break categories followed by | 100 # char that is not in any of the listed word break categories followed by |
81 # format char(s). | 101 # format char(s). |
82 [^$CR $LF $Newline]? ($Extend | $Format)+; | 102 # format char(s), or is not a CJK dictionary character. |
| 103 [^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+; |
83 | 104 |
84 $NumericEx {100}; | 105 $NumericEx {100}; |
85 $ALetterEx {200}; | 106 $ALetterEx {200}; |
86 $KatakanaEx {300}; # note: these status values override those from rule 5 | 107 $HangulSyllable {200}; |
87 $HiraganaEx {300}; # by virtue of being numerically larger. | 108 $KatakanaEx {400}; #originally 300 |
| 109 $HiraganaEx {400}; #originally 300 |
88 $IdeographicEx {400}; # | 110 $IdeographicEx {400}; # |
89 | 111 |
90 # | 112 # |
91 # rule 5 | 113 # rule 5 |
92 # Do not break between most letters. | 114 # Do not break between most letters. |
93 # | 115 # |
94 $ALetterEx $ALetterEx {200}; | 116 $ALetterEx $ALetterEx {200}; |
95 | 117 |
96 # rule 6 and 7 | 118 # rule 6 and 7 |
97 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; | 119 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; |
98 | 120 |
| 121 # Chrome addition |
| 122 $HebrewLetEx $HebrewMidLet $HebrewLetEx {200}; |
| 123 |
99 # rule 8 | 124 # rule 8 |
100 | 125 |
101 $NumericEx $NumericEx {100}; | 126 $NumericEx $NumericEx {100}; |
102 | 127 |
103 # rule 9 | 128 # rule 9 |
104 | 129 |
105 $ALetterEx $NumericEx {200}; | 130 $ALetterEx $NumericEx {200}; |
106 | 131 |
107 # rule 10 | 132 # rule 10 |
108 | 133 |
109 $NumericEx $ALetterEx {200}; | 134 $NumericEx $ALetterEx {200}; |
110 | 135 |
111 # rule 11 and 12 | 136 # rule 11 and 12 |
112 | 137 |
113 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; | 138 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; |
114 | 139 |
115 # rule 13 | 140 # rule 13 |
116 | 141 |
117 $KatakanaEx $KatakanaEx {300}; | 142 # To be consistent with '$KanaKanji $KanaKanji', changed |
| 143 # from 300 to 400. |
| 144 # See also TestRuleStatus in intltest/rbbiapts.cpp |
| 145 $KatakanaEx $KatakanaEx {400}; |
118 | 146 |
119 # rule 13a/b | 147 # rule 13a/b |
120 | 148 |
121 $ALetterEx $ExtendNumLetEx {200}; # (13a) | 149 $ALetterEx $ExtendNumLetEx {200}; # (13a) |
122 $NumericEx $ExtendNumLetEx {100}; # (13a) | 150 $NumericEx $ExtendNumLetEx {100}; # (13a) |
123 $KatakanaEx $ExtendNumLetEx {300}; # (13a) | 151 $KatakanaEx $ExtendNumLetEx {400}; # (13a) |
124 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) | 152 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) |
125 | 153 |
126 $ExtendNumLetEx $ALetterEx {200}; # (13b) | 154 $ExtendNumLetEx $ALetterEx {200}; # (13b) |
127 $ExtendNumLetEx $NumericEx {100}; # (13b) | 155 $ExtendNumLetEx $NumericEx {100}; # (13b) |
128 $ExtendNumLetEx $KatakanaEx {300}; # (13b) | 156 $ExtendNumLetEx $KatakanaEx {400}; # (13b) |
129 | 157 |
| 158 # special handling for CJK characters: chain for later dictionary segmentation |
| 159 $HangulSyllable $HangulSyllable {200}; |
| 160 $KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found |
130 | 161 |
131 | 162 |
132 ## ------------------------------------------------- | 163 ## ------------------------------------------------- |
133 | 164 |
134 !!reverse; | 165 !!reverse; |
135 | 166 |
136 $BackALetterEx = ($Format | $Extend)* $ALetterPlus; | 167 $BackALetterEx = ($Format | $Extend)* $ALetterPlus; |
137 $BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; | 168 $BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; |
138 $BackNumericEx = ($Format | $Extend)* $Numeric; | 169 $BackNumericEx = ($Format | $Extend)* $Numeric; |
139 $BackMidNumEx = ($Format | $Extend)* $MidNum; | 170 $BackMidNumEx = ($Format | $Extend)* $MidNum; |
140 $BackMidLetterEx = ($Format | $Extend)* $MidLetter; | 171 $BackMidLetterEx = ($Format | $Extend)* $MidLetter; |
141 $BackKatakanaEx = ($Format | $Extend)* $Katakana; | 172 $BackKatakanaEx = ($Format | $Extend)* $Katakana; |
| 173 $BackHiraganaEx = ($Extend | $Format)* $Hiragana; |
142 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; | 174 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; |
| 175 $BackHebrewLetEx = ($Format | $Extend)* $HebrewLet; |
| 176 |
143 | 177 |
144 # rule 3 | 178 # rule 3 |
145 $LF $CR; | 179 $LF $CR; |
146 | 180 |
147 # rule 4 | 181 # rule 4 |
148 ($Format | $Extend)* [^$CR $LF $Newline]?; | 182 ($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?; |
149 | 183 |
150 # rule 5 | 184 # rule 5 |
151 | 185 |
152 $BackALetterEx $BackALetterEx; | 186 $BackALetterEx $BackALetterEx; |
153 | 187 |
154 # rule 6 and 7 | 188 # rule 6 and 7 |
155 | 189 |
156 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx; | 190 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx; |
157 | 191 |
| 192 # Chrome addition |
| 193 $BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx; |
158 | 194 |
159 # rule 8 | 195 # rule 8 |
160 | 196 |
161 $BackNumericEx $BackNumericEx; | 197 $BackNumericEx $BackNumericEx; |
162 | 198 |
163 # rule 9 | 199 # rule 9 |
164 | 200 |
165 $BackNumericEx $BackALetterEx; | 201 $BackNumericEx $BackALetterEx; |
166 | 202 |
167 # rule 10 | 203 # rule 10 |
168 | 204 |
169 $BackALetterEx $BackNumericEx; | 205 $BackALetterEx $BackNumericEx; |
170 | 206 |
171 # rule 11 and 12 | 207 # rule 11 and 12 |
172 | 208 |
173 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx; | 209 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx; |
174 | 210 |
175 # rule 13 | 211 # rule 13 |
176 | 212 |
177 $BackKatakanaEx $BackKatakanaEx; | 213 $BackKatakanaEx $BackKatakanaEx; |
178 | 214 |
179 # rules 13 a/b | 215 # rules 13 a/b |
180 # | 216 # |
181 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackEx
tendNumLetEx); | 217 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackEx
tendNumLetEx); |
182 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; | 218 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; |
183 | 219 |
| 220 # special handling for CJK characters: chain for later dictionary segmentation |
| 221 $HangulSyllable $HangulSyllable; |
| 222 $KanaKanji $KanaKanji; #different rule status if both kanji and kana found |
| 223 |
184 ## ------------------------------------------------- | 224 ## ------------------------------------------------- |
185 | 225 |
186 !!safe_reverse; | 226 !!safe_reverse; |
187 | 227 |
188 # rule 3 | 228 # rule 3 |
189 ($Extend | $Format)+ .?; | 229 ($Extend | $Format)+ .?; |
190 | 230 |
191 # rule 6 | 231 # rule 6 |
192 ($MidLetter | $MidNumLet) $BackALetterEx; | 232 ($MidLetter | $MidNumLet) $BackALetterEx; |
193 | 233 |
(...skipping 11 matching lines...) Expand all Loading... |
205 ($Extend | $Format)+ .?; | 245 ($Extend | $Format)+ .?; |
206 | 246 |
207 # rule 6 | 247 # rule 6 |
208 ($MidLetterEx | $MidNumLetEx) $ALetterEx; | 248 ($MidLetterEx | $MidNumLetEx) $ALetterEx; |
209 | 249 |
210 # rule 11 | 250 # rule 11 |
211 ($MidNumEx | $MidNumLetEx) $NumericEx; | 251 ($MidNumEx | $MidNumLetEx) $NumericEx; |
212 | 252 |
213 # For dictionary-based break | 253 # For dictionary-based break |
214 $dictionary $dictionary; | 254 $dictionary $dictionary; |
OLD | NEW |