| OLD | NEW |
| 1 # | 1 # |
| 2 # Copyright (C) 2002-2013, International Business Machines Corporation | 2 # Copyright (C) 2002-2013, International Business Machines Corporation |
| 3 # and others. All Rights Reserved. | 3 # and others. All Rights Reserved. |
| 4 # | 4 # |
| 5 # file: word.txt | 5 # file: word.txt |
| 6 # | 6 # |
| 7 # ICU Word Break Rules | 7 # ICU Word Break Rules |
| 8 # See Unicode Standard Annex #29. | 8 # See Unicode Standard Annex #29. |
| 9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 | 9 # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 |
| 10 # | 10 # |
| (...skipping 17 matching lines...) Expand all Loading... |
| 28 $LF = [\p{Word_Break = LF}]; | 28 $LF = [\p{Word_Break = LF}]; |
| 29 $Newline = [\p{Word_Break = Newline}]; | 29 $Newline = [\p{Word_Break = Newline}]; |
| 30 $Extend = [\p{Word_Break = Extend}]; | 30 $Extend = [\p{Word_Break = Extend}]; |
| 31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | 31 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; |
| 32 $Format = [\p{Word_Break = Format}]; | 32 $Format = [\p{Word_Break = Format}]; |
| 33 $Katakana = [\p{Word_Break = Katakana}]; | 33 $Katakana = [\p{Word_Break = Katakana}]; |
| 34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | 34 $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; |
| 35 $ALetter = [\p{Word_Break = ALetter}]; | 35 $ALetter = [\p{Word_Break = ALetter}]; |
| 36 $Single_Quote = [\p{Word_Break = Single_Quote}]; | 36 $Single_Quote = [\p{Word_Break = Single_Quote}]; |
| 37 $Double_Quote = [\p{Word_Break = Double_Quote}]; | 37 $Double_Quote = [\p{Word_Break = Double_Quote}]; |
| 38 # Remove two full stop characters from $MidNumLet and add them to $MidNum | 38 $MidNumLet = [\p{Word_Break = MidNumLet}]; |
| 39 # to break a hostname into its components at the cost of breaking | |
| 40 # 'e.g.' and 'i.e.' as well. | |
| 41 # $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. | |
| 42 # Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected | |
| 43 # while rules 6/7 are reverted to the old behavior we want. | |
| 44 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; | |
| 45 $MidLetter = [\p{Word_Break = MidLetter}]; | 39 $MidLetter = [\p{Word_Break = MidLetter}]; |
| 46 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; | 40 $MidNum = [\p{Word_Break = MidNum}]; |
| 47 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits | 41 $Numeric = [\p{Word_Break = Numeric}]; |
| 48 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | 42 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
| 49 | 43 |
| 50 $Han = [:Han:]; | 44 $Han = [:Han:]; |
| 51 $Hiragana = [:Hiragana:]; | 45 $Hiragana = [:Hiragana:]; |
| 52 | 46 |
| 53 | 47 |
| 54 # Dictionary character set, for triggering language-based break engines. Currently | 48 # Dictionary character set, for triggering language-based break engines. Currently |
| 55 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | 49 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
| 56 # 5.0 or later as the definition of Complex_Context was corrected to include all | 50 # 5.0 or later as the definition of Complex_Context was corrected to include all |
| 57 # characters requiring dictionary break. | 51 # characters requiring dictionary break. |
| (...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 272 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); | 266 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); |
| 273 | 267 |
| 274 # rule 7b | 268 # rule 7b |
| 275 $Double_QuoteEx $Hebrew_LetterEx; | 269 $Double_QuoteEx $Hebrew_LetterEx; |
| 276 | 270 |
| 277 # rule 11 | 271 # rule 11 |
| 278 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; | 272 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; |
| 279 | 273 |
| 280 # For dictionary-based break | 274 # For dictionary-based break |
| 281 $dictionary $dictionary; | 275 $dictionary $dictionary; |
| OLD | NEW |