| Index: source/data/brkitr/word_ja.txt
|
| diff --git a/source/data/brkitr/word.txt b/source/data/brkitr/word_ja.txt
|
| similarity index 88%
|
| copy from source/data/brkitr/word.txt
|
| copy to source/data/brkitr/word_ja.txt
|
| index f89a2fe74526fc0e521f942b5f50957377a1a040..fb77507c8bf9185a7312fe38c261ea87f7b7d803 100644
|
| --- a/source/data/brkitr/word.txt
|
| +++ b/source/data/brkitr/word_ja.txt
|
| @@ -2,7 +2,7 @@
|
| # Copyright (C) 2002-2013, International Business Machines Corporation
|
| # and others. All Rights Reserved.
|
| #
|
| -# file: word.txt
|
| +# file: word_ja.txt
|
| #
|
| # ICU Word Break Rules
|
| # See Unicode Standard Annex #29.
|
| @@ -35,10 +35,16 @@ $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
| $ALetter = [\p{Word_Break = ALetter}];
|
| $Single_Quote = [\p{Word_Break = Single_Quote}];
|
| $Double_Quote = [\p{Word_Break = Double_Quote}];
|
| -$MidNumLet = [\p{Word_Break = MidNumLet}];
|
| +# Move the two full stop characters (U+002E FULL STOP and U+FF0E FULLWIDTH
|
| +# FULL STOP) from $MidNumLet to $MidNum so that a hostname is broken into its
|
| +# components, at the cost of also breaking 'e.g.' and 'i.e.'.
|
| +# $MidNumLet is used in rules 6/7 (the rules of interest) and in rules 11/12.
|
| +# Because it is OR'd with $MidNum in rules 11/12, those rules are unaffected,
|
| +# while rules 6/7 revert to the old behavior we want.
|
| +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
| $MidLetter = [\p{Word_Break = MidLetter}];
|
| -$MidNum = [\p{Word_Break = MidNum}];
|
| -$Numeric = [\p{Word_Break = Numeric}];
|
| +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
| +$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; # includes fullwidth digits (U+FF10..U+FF19)
|
| $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|
|
| $Han = [:Han:];
|
| @@ -50,15 +56,13 @@ $Hiragana = [:Hiragana:];
|
| # 5.0 or later as the definition of Complex_Context was corrected to include all
|
| # characters requiring dictionary break.
|
|
|
| -$Control = [\p{Grapheme_Cluster_Break = Control}];
|
| +$Control = [\p{Grapheme_Cluster_Break = Control}];
|
| $HangulSyllable = [\uac00-\ud7a3];
|
| $ComplexContext = [:LineBreak = Complex_Context:];
|
| $KanaKanji = [$Han $Hiragana $Katakana];
|
| -$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
| -$dictionary = [$ComplexContext $dictionaryCJK];
|
| +$dictionary = [$ComplexContext];
|
|
|
| -# leave CJK scripts out of ALetterPlus
|
| -$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
| +$ALetterPlus = [$ALetter [$ComplexContext-$Extend-$Control]];
|
|
|
|
|
| #
|
| @@ -78,7 +82,7 @@ $NumericEx = $Numeric ($Extend | $Format)*;
|
| $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
| $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
|
|
| -$Ideographic = [\p{Ideographic}];
|
| +$Ideographic = [\p{Ideographic} [\u3005 \u3007 \u303B]];
|
| $HiraganaEx = $Hiragana ($Extend | $Format)*;
|
| $IdeographicEx = $Ideographic ($Extend | $Format)*;
|
|
|
| @@ -142,6 +146,8 @@ $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
| # from 300 to 400.
|
| # See also TestRuleStatus in intltest/rbbiapts.cpp
|
| $KatakanaEx $KatakanaEx {400};
|
| +$HiraganaEx $HiraganaEx {400};
|
| +$IdeographicEx $IdeographicEx {400};
|
|
|
| # rule 13a/b
|
|
|
| @@ -160,11 +166,6 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
|
|
| $Regional_IndicatorEx $Regional_IndicatorEx;
|
|
|
| -# special handling for CJK characters: chain for later dictionary segmentation
|
| -$HangulSyllable $HangulSyllable {200};
|
| -$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
| -
|
| -
|
| ## -------------------------------------------------
|
|
|
| !!reverse;
|
| @@ -179,6 +180,7 @@ $BackMidNumEx = ($Format | $Extend)* $MidNum;
|
| $BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
| $BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
| $BackHiraganaEx = ($Format | $Extend)* $Hiragana;
|
| +$BackIdeographicEx = ($Format | $Extend)* $Ideographic;
|
| $BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
|
| $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
|
|
|
| @@ -221,6 +223,8 @@ $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNum
|
| # rule 13
|
|
|
| $BackKatakanaEx $BackKatakanaEx;
|
| +$BackHiraganaEx $BackHiraganaEx;
|
| +$BackIdeographicEx $BackIdeographicEx;
|
|
|
| # rules 13 a/b
|
| #
|
| @@ -231,10 +235,6 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
|
|
|
| $BackRegional_IndicatorEx $BackRegional_IndicatorEx;
|
|
|
| -# special handling for CJK characters: chain for later dictionary segmentation
|
| -$HangulSyllable $HangulSyllable;
|
| -$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
| -
|
| ## -------------------------------------------------
|
|
|
| !!safe_reverse;
|
|
|