OLD | NEW |
1 diff --git a/source/data/brkitr/word.txt b/source/data/brkitr/word.txt | 1 diff --git a/source/data/brkitr/rules/word.txt b/source/data/brkitr/rules/word.txt |
2 index f89a2fe..c74da4c 100644 | 2 index aa7c47c..9c93dd5 100644 |
3 --- a/source/data/brkitr/word.txt | 3 --- a/source/data/brkitr/rules/word.txt |
4 +++ b/source/data/brkitr/word.txt | 4 +++ b/source/data/brkitr/rules/word.txt |
5 @@ -35,10 +35,16 @@ $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | 5 @@ -39,10 +39,16 @@ $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; |
6 $ALetter = [\p{Word_Break = ALetter}]; | 6 $ALetter = [\p{Word_Break = ALetter}]; |
7 $Single_Quote = [\p{Word_Break = Single_Quote}]; | 7 $Single_Quote = [\p{Word_Break = Single_Quote}]; |
8 $Double_Quote = [\p{Word_Break = Double_Quote}]; | 8 $Double_Quote = [\p{Word_Break = Double_Quote}]; |
9 -$MidNumLet = [\p{Word_Break = MidNumLet}]; | 9 -$MidNumLet = [\p{Word_Break = MidNumLet}]; |
10 +# Remove two full stop characters from $MidNumLet and add them to $MidNum | 10 +# Remove two full stop characters from $MidNumLet and add them to $MidNum |
11 +# to break a hostname into its components at the cost of breaking | 11 +# to break a hostname into its components at the cost of breaking |
12 +# 'e.g.' and 'i.e.' as well. | 12 +# 'e.g.' and 'i.e.' as well. |
13 +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. | 13 +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. |
14 +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected | 14 +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected |
15 +# while rules 6/7 are reverted to the old behavior we want. | 15 +# while rules 6/7 are reverted to the old behavior we want. |
16 +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; | 16 +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; |
17 $MidLetter = [\p{Word_Break = MidLetter}]; | 17 $MidLetter = [\p{Word_Break = MidLetter}]; |
18 -$MidNum = [\p{Word_Break = MidNum}]; | 18 -$MidNum = [\p{Word_Break = MidNum}]; |
19 -$Numeric = [\p{Word_Break = Numeric}]; | 19 -$Numeric = [\p{Word_Break = Numeric}]; |
20 +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; | 20 +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; |
21 +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits | 21 +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits |
22 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | 22 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
23 | 23 $E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC]; |
24 $Han = [:Han:]; | 24 $E_Modifier = [\p{Word_Break = EM}]; |
OLD | NEW |