OLD | NEW |
1 # | 1 # |
2 # Copyright (C) 2002-2013, International Business Machines Corporation | 2 # Copyright (C) 2002-2013, International Business Machines Corporation |
3 # and others. All Rights Reserved. | 3 # and others. All Rights Reserved. |
4 # | 4 # |
5 # file: word.txt | 5 # file: word.txt |
6 # | 6 # |
7 # ICU Word Break Rules | 7 # ICU Word Break Rules |
8 # See Unicode Standard Annex #29. | 8 # See Unicode Standard Annex #29. |
9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 | 9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 |
10 # | 10 # |
(...skipping 17 matching lines...) Expand all Loading... |
28 $LF = [\p{Word_Break = LF}]; | 28 $LF = [\p{Word_Break = LF}]; |
29 $Newline = [\p{Word_Break = Newline}]; | 29 $Newline = [\p{Word_Break = Newline}]; |
30 $Extend = [\p{Word_Break = Extend}]; | 30 $Extend = [\p{Word_Break = Extend}]; |
31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | 31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; |
32 $Format = [\p{Word_Break = Format}]; | 32 $Format = [\p{Word_Break = Format}]; |
33 $Katakana = [\p{Word_Break = Katakana}]; | 33 $Katakana = [\p{Word_Break = Katakana}]; |
34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | 34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; |
35 $ALetter = [\p{Word_Break = ALetter}]; | 35 $ALetter = [\p{Word_Break = ALetter}]; |
36 $Single_Quote = [\p{Word_Break = Single_Quote}]; | 36 $Single_Quote = [\p{Word_Break = Single_Quote}]; |
37 $Double_Quote = [\p{Word_Break = Double_Quote}]; | 37 $Double_Quote = [\p{Word_Break = Double_Quote}]; |
38 $MidNumLet = [\p{Word_Break = MidNumLet}]; | 38 # Remove the two full stop characters (U+002E and U+FF0E) from $MidNumLet and |
| 39 # add them to $MidNum so that a hostname is broken into its components, at the |
| 40 # cost of also breaking 'e.g.' and 'i.e.'. |
| 41 # $MidNumLet is used in rules 6/7 (the rules of interest here) and in rules 11/12. |
| 42 # Because it is OR'd with $MidNum in rules 11/12, those rules are not affected, |
| 43 # while rules 6/7 revert to the old behavior we want. |
| 44 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; |
39 $MidLetter = [\p{Word_Break = MidLetter}]; | 45 $MidLetter = [\p{Word_Break = MidLetter}]; |
40 $MidNum = [\p{Word_Break = MidNum}]; | 46 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; |
41 $Numeric = [\p{Word_Break = Numeric}]; | 47 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits |
42 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | 48 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
43 | 49 |
44 $Han = [:Han:]; | 50 $Han = [:Han:]; |
45 $Hiragana = [:Hiragana:]; | 51 $Hiragana = [:Hiragana:]; |
46 | 52 |
47 | 53 |
48 # Dictionary character set, for triggering language-based break engines. Currently | 54 # Dictionary character set, for triggering language-based break engines. Currently |
49 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | 55 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
50 # 5.0 or later as the definition of Complex_Context was corrected to include all | 56 # 5.0 or later as the definition of Complex_Context was corrected to include all |
51 # characters requiring dictionary break. | 57 # characters requiring dictionary break. |
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
266 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); | 272 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); |
267 | 273 |
268 # rule 7b | 274 # rule 7b |
269 $Double_QuoteEx $Hebrew_LetterEx; | 275 $Double_QuoteEx $Hebrew_LetterEx; |
270 | 276 |
271 # rule 11 | 277 # rule 11 |
272 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; | 278 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; |
273 | 279 |
274 # For dictionary-based break | 280 # For dictionary-based break |
275 $dictionary $dictionary; | 281 $dictionary $dictionary; |
OLD | NEW |