OLD | NEW |
(Empty) | |
| 1 # |
| 2 # Copyright (C) 2002-2010, International Business Machines Corporation |
| 3 # and others. All Rights Reserved. |
| 4 # |
| 5 # file: word.txt |
| 6 # |
| 7 # ICU Word Break Rules |
| 8 # See Unicode Standard Annex #29. |
| 9 # These rules are based on UAX-29 Revision 16 for Unicode 6.0 |
| 10 # |
| 11 # Note: Updates to word.txt will usually need to be merged into |
| 12 # word_POSIX.txt and word_ja.txt also. |
| 13 |
| 14 ############################################################################## |
| 15 # |
| 16 # Character class definitions from TR 29 |
| 17 # |
| 18 ############################################################################## |
| 19 |
| 20 !!chain; |
| 21 |
| 22 |
| 23 # |
| 24 # Character Class Definitions. |
| 25 # Each set below selects the characters having one Word_Break property value. |
| 26 |
| 27 $CR = [\p{Word_Break = CR}]; |
| 28 $LF = [\p{Word_Break = LF}]; |
| 29 $Newline = [\p{Word_Break = Newline}]; |
| 30 $Extend = [\p{Word_Break = Extend}]; |
| 31 $Format = [\p{Word_Break = Format}]; |
| 32 $Katakana = [\p{Word_Break = Katakana}]; |
| 33 $ALetter = [\p{Word_Break = ALetter}]; |
| 34 $MidNumLet = [\p{Word_Break = MidNumLet}]; |
| 35 $MidLetter = [\p{Word_Break = MidLetter}]; |
| 36 $MidNum = [\p{Word_Break = MidNum}]; |
| 37 $Numeric = [\p{Word_Break = Numeric}]; |
| 38 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
| 39 |
| 40 |
| 41 # Dictionary character set, for triggering language-based break engines. Currently |
| 42 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
| 43 # 5.0 or later as the definition of Complex_Context was corrected to include all |
| 44 # characters requiring dictionary break. |
| 45 |
| 46 $dictionary = [:LineBreak = Complex_Context:]; |
| 47 $Control = [\p{Grapheme_Cluster_Break = Control}]; |
| 48 $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not |
| 49 # include the dictionary characters. Extend and Control characters are removed from the added set. |
| 50 |
| 51 # |
| 52 # Rule 4: Ignore Format and Extend characters, |
| 53 # except when they appear at the beginning of a region of text. |
| 54 # Each ...Ex variable is its base class followed by any run of Extend or Format characters. |
| 55 $KatakanaEx = $Katakana ($Extend | $Format)*; |
| 56 $ALetterEx = $ALetterPlus ($Extend | $Format)*; |
| 57 $MidNumLetEx = $MidNumLet ($Extend | $Format)*; |
| 58 $MidLetterEx = $MidLetter ($Extend | $Format)*; |
| 59 $MidNumEx = $MidNum ($Extend | $Format)*; |
| 60 $NumericEx = $Numeric ($Extend | $Format)*; |
| 61 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; |
| 62 |
| 63 $Hiragana = [\p{script=Hiragana}]; |
| 64 $Ideographic = [\p{Ideographic}]; |
| 65 $HiraganaEx = $Hiragana ($Extend | $Format)*; |
| 66 $IdeographicEx = $Ideographic ($Extend | $Format)*; |
| 67 |
| 68 ## ------------------------------------------------- |
| 69 |
| 70 !!forward; |
| 71 |
| 72 |
| 73 # Rule 3 - CR x LF |
| 74 # |
| 75 $CR $LF; |
| 76 |
| 77 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning |
| 78 # of a region of Text. The rule here comes into play when the start of text |
| 79 # begins with a group of Format chars, or with a "word" consisting of a single |
| 80 # char that is not in any of the listed word break categories followed by |
| 81 # format char(s). |
| 82 [^$CR $LF $Newline]? ($Extend | $Format)+; |
| 83 |
| 84 $NumericEx {100}; |
| 85 $ALetterEx {200}; |
| 86 $KatakanaEx {300}; # note: these status values override those from rule 5 |
| 87 $HiraganaEx {300}; # by virtue of being numerically larger. |
| 88 $IdeographicEx {400}; # (the {N} tag after a rule is its status value) |
| 89 |
| 90 # |
| 91 # rule 5 - ALetter x ALetter |
| 92 # Do not break between most letters. |
| 93 # |
| 94 $ALetterEx $ALetterEx {200}; |
| 95 |
| 96 # rule 6 and 7 - no break on either side of a MidLetter or MidNumLet that sits between two letters |
| 97 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; |
| 98 |
| 99 # rule 8 - Numeric x Numeric |
| 100 |
| 101 $NumericEx $NumericEx {100}; |
| 102 |
| 103 # rule 9 - ALetter x Numeric |
| 104 |
| 105 $ALetterEx $NumericEx {200}; |
| 106 |
| 107 # rule 10 - Numeric x ALetter |
| 108 |
| 109 $NumericEx $ALetterEx {200}; |
| 110 |
| 111 # rule 11 and 12 - no break on either side of a MidNum or MidNumLet that sits between two Numerics |
| 112 |
| 113 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; |
| 114 |
| 115 # rule 13 - Katakana x Katakana |
| 116 |
| 117 $KatakanaEx $KatakanaEx {300}; |
| 118 |
| 119 # rule 13a/b - 13a: (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet; 13b: ExtendNumLet x (ALetter | Numeric | Katakana) |
| 120 |
| 121 $ALetterEx $ExtendNumLetEx {200}; # (13a) |
| 122 $NumericEx $ExtendNumLetEx {100}; # (13a) |
| 123 $KatakanaEx $ExtendNumLetEx {300}; # (13a) |
| 124 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) |
| 125 |
| 126 $ExtendNumLetEx $ALetterEx {200}; # (13b) |
| 127 $ExtendNumLetEx $NumericEx {100}; # (13b) |
| 128 $ExtendNumLetEx $KatakanaEx {300}; # (13b) |
| 129 |
| 130 |
| 131 |
| 132 ## ------------------------------------------------- |
| 133 |
| 134 !!reverse; |
| 135 |
| 136 $BackALetterEx = ($Format | $Extend)* $ALetterPlus; |
| 137 $BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; |
| 138 $BackNumericEx = ($Format | $Extend)* $Numeric; |
| 139 $BackMidNumEx = ($Format | $Extend)* $MidNum; |
| 140 $BackMidLetterEx = ($Format | $Extend)* $MidLetter; |
| 141 $BackKatakanaEx = ($Format | $Extend)* $Katakana; |
| 142 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; |
| 143 |
| 144 # rule 3 - CR x LF; operands are written in the opposite order from the forward rule |
| 145 $LF $CR; |
| 146 |
| 147 # rule 4 - a run of Format/Extend characters, then optionally one character that is not CR, LF or Newline |
| 148 ($Format | $Extend)* [^$CR $LF $Newline]?; |
| 149 |
| 150 # rule 5 - ALetter x ALetter |
| 151 |
| 152 $BackALetterEx $BackALetterEx; |
| 153 |
| 154 # rule 6 and 7 - a MidLetter or MidNumLet between two letters |
| 155 |
| 156 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx; |
| 157 |
| 158 |
| 159 # rule 8 - Numeric x Numeric |
| 160 |
| 161 $BackNumericEx $BackNumericEx; |
| 162 |
| 163 # rule 9 - ALetter x Numeric, operands in the opposite order from the forward rule |
| 164 |
| 165 $BackNumericEx $BackALetterEx; |
| 166 |
| 167 # rule 10 - Numeric x ALetter, operands in the opposite order from the forward rule |
| 168 |
| 169 $BackALetterEx $BackNumericEx; |
| 170 |
| 171 # rule 11 and 12 - a MidNum or MidNumLet between two Numerics |
| 172 |
| 173 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx; |
| 174 |
| 175 # rule 13 - Katakana x Katakana |
| 176 |
| 177 $BackKatakanaEx $BackKatakanaEx; |
| 178 |
| 179 # rules 13 a/b |
| 180 # First rule: ExtendNumLet followed by ALetter, Numeric, Katakana or ExtendNumLet. Second rule: ALetter, Numeric or Katakana followed by ExtendNumLet. |
| 181 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); |
| 182 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; |
| 183 |
| 184 ## ------------------------------------------------- |
| 185 |
| 186 !!safe_reverse; |
| 187 |
| 188 # rule 4 - a run of Extend/Format characters, optionally followed by any one character |
| 189 ($Extend | $Format)+ .?; |
| 190 |
| 191 # rule 6 - MidLetter or MidNumLet followed by a $BackALetterEx sequence |
| 192 ($MidLetter | $MidNumLet) $BackALetterEx; |
| 193 |
| 194 # rule 11 - MidNum or MidNumLet followed by a $BackNumericEx sequence |
| 195 ($MidNum | $MidNumLet) $BackNumericEx; |
| 196 |
| 197 # For dictionary-based break: two adjacent dictionary characters |
| 198 $dictionary $dictionary; |
| 199 |
| 200 ## ------------------------------------------------- |
| 201 |
| 202 !!safe_forward; |
| 203 |
| 204 # rule 4 - a run of Extend/Format characters, optionally followed by any one character |
| 205 ($Extend | $Format)+ .?; |
| 206 |
| 207 # rule 6 - MidLetter or MidNumLet followed by a letter |
| 208 ($MidLetterEx | $MidNumLetEx) $ALetterEx; |
| 209 |
| 210 # rule 11 - MidNum or MidNumLet followed by a Numeric |
| 211 ($MidNumEx | $MidNumLetEx) $NumericEx; |
| 212 |
| 213 # For dictionary-based break: two adjacent dictionary characters |
| 214 $dictionary $dictionary; |
OLD | NEW |