| OLD | NEW |
| 1 # | 1 # |
| 2 # Copyright (C) 2002-2013, International Business Machines Corporation | 2 # Copyright (C) 2002-2013, International Business Machines Corporation |
| 3 # and others. All Rights Reserved. | 3 # and others. All Rights Reserved. |
| 4 # | 4 # |
| 5 # file: word.txt | 5 # file: word.txt |
| 6 # | 6 # |
| 7 # ICU Word Break Rules | 7 # ICU Word Break Rules |
| 8 # See Unicode Standard Annex #29. | 8 # See Unicode Standard Annex #29. |
| 9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 | 9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 |
| 10 # | 10 # |
| (...skipping 17 matching lines...) Expand all Loading... |
| 28 $LF = [\p{Word_Break = LF}]; | 28 $LF = [\p{Word_Break = LF}]; |
| 29 $Newline = [\p{Word_Break = Newline}]; | 29 $Newline = [\p{Word_Break = Newline}]; |
| 30 $Extend = [\p{Word_Break = Extend}]; | 30 $Extend = [\p{Word_Break = Extend}]; |
| 31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | 31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; |
| 32 $Format = [\p{Word_Break = Format}]; | 32 $Format = [\p{Word_Break = Format}]; |
| 33 $Katakana = [\p{Word_Break = Katakana}]; | 33 $Katakana = [\p{Word_Break = Katakana}]; |
| 34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | 34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; |
| 35 $ALetter = [\p{Word_Break = ALetter}]; | 35 $ALetter = [\p{Word_Break = ALetter}]; |
| 36 $Single_Quote = [\p{Word_Break = Single_Quote}]; | 36 $Single_Quote = [\p{Word_Break = Single_Quote}]; |
| 37 $Double_Quote = [\p{Word_Break = Double_Quote}]; | 37 $Double_Quote = [\p{Word_Break = Double_Quote}]; |
| 38 $MidNumLet = [\p{Word_Break = MidNumLet}]; | 38 # Remove two full stop characters from $MidNumLet and add them to $MidNum |
| 39 # to break a hostname into its components at the cost of breaking |
| 40 # 'e.g.' and 'i.e.' as well. |
| 41 # $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. |
| 42 # Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected |
| 43 # while rules 6/7 are reverted to the old behavior we want. |
| 44 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; |
| 39 $MidLetter = [\p{Word_Break = MidLetter}]; | 45 $MidLetter = [\p{Word_Break = MidLetter}]; |
| 40 $MidNum = [\p{Word_Break = MidNum}]; | 46 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; |
| 41 $Numeric = [\p{Word_Break = Numeric}]; | 47 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits |
| 42 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | 48 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
| 43 | 49 |
| 44 $Han = [:Han:]; | 50 $Han = [:Han:]; |
| 45 $Hiragana = [:Hiragana:]; | 51 $Hiragana = [:Hiragana:]; |
| 46 | 52 |
| 47 | 53 |
| 48 # Dictionary character set, for triggering language-based break engines. Currently | 54 # Dictionary character set, for triggering language-based break engines. Currently |
| 49 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | 55 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
| 50 # 5.0 or later as the definition of Complex_Context was corrected to include all | 56 # 5.0 or later as the definition of Complex_Context was corrected to include all |
| 51 # characters requiring dictionary break. | 57 # characters requiring dictionary break. |
| (...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 266 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); | 272 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); |
| 267 | 273 |
| 268 # rule 7b | 274 # rule 7b |
| 269 $Double_QuoteEx $Hebrew_LetterEx; | 275 $Double_QuoteEx $Hebrew_LetterEx; |
| 270 | 276 |
| 271 # rule 11 | 277 # rule 11 |
| 272 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; | 278 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; |
| 273 | 279 |
| 274 # For dictionary-based break | 280 # For dictionary-based break |
| 275 $dictionary $dictionary; | 281 $dictionary $dictionary; |
| OLD | NEW |