Index: icu46/source/data/brkitr/word.txt |
=================================================================== |
--- icu46/source/data/brkitr/word.txt (revision 68397) |
+++ icu46/source/data/brkitr/word.txt (working copy) |
@@ -29,29 +29,49 @@ |
$Newline = [\p{Word_Break = Newline}]; |
$Extend = [\p{Word_Break = Extend}]; |
$Format = [\p{Word_Break = Format}]; |
+$Hiragana = [:Hiragana:]; |
$Katakana = [\p{Word_Break = Katakana}]; |
+$Han = [:Han:]; |
$ALetter = [\p{Word_Break = ALetter}]; |
-$MidNumLet = [\p{Word_Break = MidNumLet}]; |
+# Move the two full stop characters (U+002E and U+FF0E) from $MidNumLet to |
+# $MidNum so that a hostname is broken into its components, at the cost of |
+# also breaking 'e.g.' and 'i.e.'. |
+# $MidNumLet is used in rules 6/7 (the rules of interest) and in rules 11/12. |
+# Because it is OR'd with $MidNum in rules 11/12, those rules are unaffected, |
+# while rules 6/7 revert to the old behavior we want. |
+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; |
$MidLetter = [\p{Word_Break = MidLetter}]; |
-$MidNum = [\p{Word_Break = MidNum}]; |
-$Numeric = [\p{Word_Break = Numeric}]; |
+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; |
+$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits |
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
+# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'. |
+$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]]; |
+# U+05F3 is ALetter and U+05F4 is MidLetter, so both are already covered by |
+# the current rules 6/7; only U+0022 needs a dedicated set here. |
+$HebrewMidLet = [\u0022]; |
# Dictionary character set, for triggering language-based break engines. Currently |
-# limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
-# 5.0 or later as the definition of Complex_Context was corrected to include all |
+# limited to LineBreak=Complex_Context and CJK. Note that this set only works |
+# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all |
# characters requiring dictionary break. |
-$dictionary = [:LineBreak = Complex_Context:]; |
$Control = [\p{Grapheme_Cluster_Break = Control}]; |
-$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not |
- # include the dictionary characters. |
+$HangulSyllable = [\uac00-\ud7a3]; |
+$ComplexContext = [:LineBreak = Complex_Context:]; |
+$KanaKanji = [$Han $Hiragana $Katakana]; |
+$dictionaryCJK = [$KanaKanji $HangulSyllable]; |
+$dictionary = [$ComplexContext $dictionaryCJK]; |
+# leave CJK scripts out of ALetterPlus |
+$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; |
+ |
+ |
# |
# Rules 4 Ignore Format and Extend characters, |
# except when they appear at the beginning of a region of text. |
# |
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void. |
$KatakanaEx = $Katakana ($Extend | $Format)*; |
$ALetterEx = $ALetterPlus ($Extend | $Format)*; |
$MidNumLetEx = $MidNumLet ($Extend | $Format)*; |
@@ -59,8 +79,8 @@ |
$MidNumEx = $MidNum ($Extend | $Format)*; |
$NumericEx = $Numeric ($Extend | $Format)*; |
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; |
+$HebrewLetEx = $HebrewLet ($Extend | $Format)*; |
-$Hiragana = [\p{script=Hiragana}]; |
$Ideographic = [\p{Ideographic}]; |
$HiraganaEx = $Hiragana ($Extend | $Format)*; |
$IdeographicEx = $Ideographic ($Extend | $Format)*; |
@@ -79,12 +99,14 @@ |
# begins with a group of Format chars, or with a "word" consisting of a single |
# char that is not in any of the listed word break categories followed by |
# format char(s). |
-[^$CR $LF $Newline]? ($Extend | $Format)+; |
+ # Chrome addition: that single char must also not be a CJK dictionary character. |
+[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+; |
$NumericEx {100}; |
$ALetterEx {200}; |
-$KatakanaEx {300}; # note: these status values override those from rule 5 |
-$HiraganaEx {300}; # by virtual of being numerically larger. |
+$HangulSyllable {200}; |
+$KatakanaEx {400}; #originally 300 |
+$HiraganaEx {400}; #originally 300 |
$IdeographicEx {400}; # |
# |
@@ -96,6 +118,9 @@ |
# rule 6 and 7 |
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; |
+# Chrome addition: do not break 'HebrewLetter U+0022 HebrewLetter'. |
+$HebrewLetEx $HebrewMidLet $HebrewLetEx {200}; |
+ |
# rule 8 |
$NumericEx $NumericEx {100}; |
@@ -114,21 +139,27 @@ |
# rule 13 |
-$KatakanaEx $KatakanaEx {300}; |
+# To be consistent with '$KanaKanji $KanaKanji', changed |
+# from 300 to 400. |
+# See also TestRuleStatus in intltest/rbbiapts.cpp |
+$KatakanaEx $KatakanaEx {400}; |
# rule 13a/b |
$ALetterEx $ExtendNumLetEx {200}; # (13a) |
$NumericEx $ExtendNumLetEx {100}; # (13a) |
-$KatakanaEx $ExtendNumLetEx {300}; # (13a) |
+$KatakanaEx $ExtendNumLetEx {400}; # (13a) |
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) |
$ExtendNumLetEx $ALetterEx {200}; # (13b) |
$ExtendNumLetEx $NumericEx {100}; # (13b) |
-$ExtendNumLetEx $KatakanaEx {300}; # (13b) |
- |
+$ExtendNumLetEx $KatakanaEx {400}; # (13b) |
+# special handling for CJK characters: chain for later dictionary segmentation |
+$HangulSyllable $HangulSyllable {200}; |
+$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found |
+ |
## ------------------------------------------------- |
!!reverse; |
@@ -139,13 +170,16 @@ |
$BackMidNumEx = ($Format | $Extend)* $MidNum; |
$BackMidLetterEx = ($Format | $Extend)* $MidLetter; |
$BackKatakanaEx = ($Format | $Extend)* $Katakana; |
+$BackHiraganaEx = ($Extend | $Format)* $Hiragana; |
$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; |
+$BackHebrewLetEx = ($Format | $Extend)* $HebrewLet; |
+ |
# rule 3 |
$LF $CR; |
# rule 4 |
-($Format | $Extend)* [^$CR $LF $Newline]?; |
+($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?; |
# rule 5 |
@@ -155,6 +189,8 @@ |
$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx; |
+# Chrome addition: do not break 'HebrewLetter U+0022 HebrewLetter' (reverse). |
+$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx; |
# rule 8 |
@@ -181,6 +217,10 @@ |
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); |
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; |
+# special handling for CJK characters: chain for later dictionary segmentation |
+$HangulSyllable $HangulSyllable; |
+$KanaKanji $KanaKanji; #different rule status if both kanji and kana found |
+ |
## ------------------------------------------------- |
!!safe_reverse; |