| Index: icu46/source/data/brkitr/word.txt
|
| ===================================================================
|
| --- icu46/source/data/brkitr/word.txt (revision 68397)
|
| +++ icu46/source/data/brkitr/word.txt (working copy)
|
| @@ -29,29 +29,49 @@
|
| $Newline = [\p{Word_Break = Newline}];
|
| $Extend = [\p{Word_Break = Extend}];
|
| $Format = [\p{Word_Break = Format}];
|
| +$Hiragana = [:Hiragana:];
|
| $Katakana = [\p{Word_Break = Katakana}];
|
| +$Han = [:Han:];
|
| $ALetter = [\p{Word_Break = ALetter}];
|
| -$MidNumLet = [\p{Word_Break = MidNumLet}];
|
| +# Remove two full stop characters from $MidNumLet and add them to $MidNum
|
| +# to break a hostname into its components at the cost of breaking
|
| +# 'e.g.' and 'i.e.' as well.
|
| +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
|
| +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
|
| +# while rules 6/7 are reverted to the old behavior we want.
|
| +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
| $MidLetter = [\p{Word_Break = MidLetter}];
|
| -$MidNum = [\p{Word_Break = MidNum}];
|
| -$Numeric = [\p{Word_Break = Numeric}];
|
| +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
| +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
|
| $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|
|
| +# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'.
|
| +$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]];
|
| +# U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by
|
| +# the current rule 6/7.
|
| +$HebrewMidLet = [\u0022];
|
|
|
| # Dictionary character set, for triggering language-based break engines. Currently
|
| -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
| -# 5.0 or later as the definition of Complex_Context was corrected to include all
|
| +# limited to LineBreak=Complex_Context and CJK. Note that this set only works
|
| +# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all
|
| # characters requiring dictionary break.
|
|
|
| -$dictionary = [:LineBreak = Complex_Context:];
|
| $Control = [\p{Grapheme_Cluster_Break = Control}];
|
| -$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
|
| - # include the dictionary characters.
|
| +$HangulSyllable = [\uac00-\ud7a3];
|
| +$ComplexContext = [:LineBreak = Complex_Context:];
|
| +$KanaKanji = [$Han $Hiragana $Katakana];
|
| +$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
| +$dictionary = [$ComplexContext $dictionaryCJK];
|
|
|
| +# leave CJK scripts out of ALetterPlus
|
| +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
| +
|
| +
|
| #
|
| # Rules 4 Ignore Format and Extend characters,
|
| # except when they appear at the beginning of a region of text.
|
| #
|
| +# TODO: check if handling of katakana in dictionary makes rules incorrect/void.
|
| $KatakanaEx = $Katakana ($Extend | $Format)*;
|
| $ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
| $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
| @@ -59,8 +79,8 @@
|
| $MidNumEx = $MidNum ($Extend | $Format)*;
|
| $NumericEx = $Numeric ($Extend | $Format)*;
|
| $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
| +$HebrewLetEx = $HebrewLet ($Extend | $Format)*;
|
|
|
| -$Hiragana = [\p{script=Hiragana}];
|
| $Ideographic = [\p{Ideographic}];
|
| $HiraganaEx = $Hiragana ($Extend | $Format)*;
|
| $IdeographicEx = $Ideographic ($Extend | $Format)*;
|
| @@ -79,12 +99,14 @@
|
| # begins with a group of Format chars, or with a "word" consisting of a single
|
| # char that is not in any of the listed word break categories followed by
|
| # format char(s).
|
| -[^$CR $LF $Newline]? ($Extend | $Format)+;
|
| +# The leading single char must also not be a CJK dictionary character.
|
| +[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;
|
|
|
| $NumericEx {100};
|
| $ALetterEx {200};
|
| -$KatakanaEx {300}; # note: these status values override those from rule 5
|
| -$HiraganaEx {300}; # by virtual of being numerically larger.
|
| +$HangulSyllable {200};
|
| +$KatakanaEx {400}; #originally 300
|
| +$HiraganaEx {400}; #originally 300
|
| $IdeographicEx {400}; #
|
|
|
| #
|
| @@ -96,6 +118,9 @@
|
| # rule 6 and 7
|
| $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
|
|
| +# Chrome addition: do not break 'HebrewLetter U+0022 HebrewLetter'.
|
| +$HebrewLetEx $HebrewMidLet $HebrewLetEx {200};
|
| +
|
| # rule 8
|
|
|
| $NumericEx $NumericEx {100};
|
| @@ -114,21 +139,27 @@
|
|
|
| # rule 13
|
|
|
| -$KatakanaEx $KatakanaEx {300};
|
| +# To be consistent with '$KanaKanji $KanaKanji', changed
|
| +# from 300 to 400.
|
| +# See also TestRuleStatus in intltest/rbbiapts.cpp
|
| +$KatakanaEx $KatakanaEx {400};
|
|
|
| # rule 13a/b
|
|
|
| $ALetterEx $ExtendNumLetEx {200}; # (13a)
|
| $NumericEx $ExtendNumLetEx {100}; # (13a)
|
| -$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
| +$KatakanaEx $ExtendNumLetEx {400}; # (13a)
|
| $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
|
|
| $ExtendNumLetEx $ALetterEx {200}; # (13b)
|
| $ExtendNumLetEx $NumericEx {100}; # (13b)
|
| -$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
| -
|
| +$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
|
|
| +# special handling for CJK characters: chain for later dictionary segmentation
|
| +$HangulSyllable $HangulSyllable {200};
|
| +$KanaKanji $KanaKanji {400}; # same {400} status as the kana and ideograph rules above
|
|
|
| +
|
| ## -------------------------------------------------
|
|
|
| !!reverse;
|
| @@ -139,13 +170,16 @@
|
| $BackMidNumEx = ($Format | $Extend)* $MidNum;
|
| $BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
| $BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
| +$BackHiraganaEx = ($Extend | $Format)* $Hiragana;
|
| $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
|
| +$BackHebrewLetEx = ($Format | $Extend)* $HebrewLet;
|
|
|
| +
|
| # rule 3
|
| $LF $CR;
|
|
|
| # rule 4
|
| -($Format | $Extend)* [^$CR $LF $Newline]?;
|
| +($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;
|
|
|
| # rule 5
|
|
|
| @@ -155,6 +189,8 @@
|
|
|
| $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
|
|
|
| +# Chrome addition: do not break 'HebrewLetter U+0022 HebrewLetter' (reverse direction).
|
| +$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx;
|
|
|
| # rule 8
|
|
|
| @@ -181,6 +217,10 @@
|
| $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
| ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
|
|
| +# special handling for CJK characters: chain for later dictionary segmentation
|
| +$HangulSyllable $HangulSyllable;
|
| +$KanaKanji $KanaKanji; # mirrors the forward CJK chaining rule (reverse rules carry no status)
|
| +
|
| ## -------------------------------------------------
|
|
|
| !!safe_reverse;
|
|
|