OLD | NEW |
1 # | 1 # |
2 # Copyright (C) 2002-2013, International Business Machines Corporation | 2 # Copyright (C) 2002-2013, International Business Machines Corporation |
3 # and others. All Rights Reserved. | 3 # and others. All Rights Reserved. |
4 # | 4 # |
5 # file: word.txt | 5 # file: word.txt |
6 # | 6 # |
7 # ICU Word Break Rules | 7 # ICU Word Break Rules |
8 # See Unicode Standard Annex #29. | 8 # See Unicode Standard Annex #29. |
9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 | 9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 |
10 # | 10 # |
(...skipping 17 matching lines...) Expand all Loading... |
28 $LF = [\p{Word_Break = LF}]; | 28 $LF = [\p{Word_Break = LF}]; |
29 $Newline = [\p{Word_Break = Newline}]; | 29 $Newline = [\p{Word_Break = Newline}]; |
30 $Extend = [\p{Word_Break = Extend}]; | 30 $Extend = [\p{Word_Break = Extend}]; |
31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | 31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; |
32 $Format = [\p{Word_Break = Format}]; | 32 $Format = [\p{Word_Break = Format}]; |
33 $Katakana = [\p{Word_Break = Katakana}]; | 33 $Katakana = [\p{Word_Break = Katakana}]; |
34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | 34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; |
35 $ALetter = [\p{Word_Break = ALetter}]; | 35 $ALetter = [\p{Word_Break = ALetter}]; |
36 $Single_Quote = [\p{Word_Break = Single_Quote}]; | 36 $Single_Quote = [\p{Word_Break = Single_Quote}]; |
37 $Double_Quote = [\p{Word_Break = Double_Quote}]; | 37 $Double_Quote = [\p{Word_Break = Double_Quote}]; |
38 # Remove two full stop characters from $MidNumLet and add them to $MidNum | 38 $MidNumLet = [\p{Word_Break = MidNumLet}]; |
39 # to break a hostname into its components at the cost of breaking | |
40 # 'e.g.' and 'i.e.' as well. | |
41 # $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. | |
42 # Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected | |
43 # while rules 6/7 are reverted to the old behavior we want. | |
44 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; | |
45 $MidLetter = [\p{Word_Break = MidLetter}]; | 39 $MidLetter = [\p{Word_Break = MidLetter}]; |
46 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; | 40 $MidNum = [\p{Word_Break = MidNum}]; |
47 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits | 41 $Numeric = [\p{Word_Break = Numeric}]; |
48 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | 42 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
49 | 43 |
50 $Han = [:Han:]; | 44 $Han = [:Han:]; |
51 $Hiragana = [:Hiragana:]; | 45 $Hiragana = [:Hiragana:]; |
52 | 46 |
53 | 47 |
54 # Dictionary character set, for triggering language-based break engines. Currently | 48 # Dictionary character set, for triggering language-based break engines. Currently |
55 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | 49 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
56 # 5.0 or later as the definition of Complex_Context was corrected to include all | 50 # 5.0 or later as the definition of Complex_Context was corrected to include all |
57 # characters requiring dictionary break. | 51 # characters requiring dictionary break. |
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
272 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); | 266 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); |
273 | 267 |
274 # rule 7b | 268 # rule 7b |
275 $Double_QuoteEx $Hebrew_LetterEx; | 269 $Double_QuoteEx $Hebrew_LetterEx; |
276 | 270 |
277 # rule 11 | 271 # rule 11 |
278 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; | 272 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; |
279 | 273 |
280 # For dictionary-based break | 274 # For dictionary-based break |
281 $dictionary $dictionary; | 275 $dictionary $dictionary; |
OLD | NEW |