| Index: source/data/brkitr/word_ja.txt
|
| diff --git a/source/data/brkitr/word_ja.txt b/source/data/brkitr/word_ja.txt
|
| deleted file mode 100644
|
| index fb77507c8bf9185a7312fe38c261ea87f7b7d803..0000000000000000000000000000000000000000
|
| --- a/source/data/brkitr/word_ja.txt
|
| +++ /dev/null
|
| @@ -1,275 +0,0 @@
|
| -#
|
| -# Copyright (C) 2002-2013, International Business Machines Corporation
|
| -# and others. All Rights Reserved.
|
| -#
|
| -# file: word_ja.txt
|
| -#
|
| -# ICU Word Break Rules
|
| -# See Unicode Standard Annex #29.
|
| -# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
|
| -#
|
| -# Note: Updates to word.txt will usually need to be merged into
|
| -# word_POSIX.txt also.
|
| -
|
| -##############################################################################
|
| -#
|
| -# Character class definitions from TR 29
|
| -#
|
| -##############################################################################
|
| -
|
| -!!chain;
|
| -
|
| -
|
| -#
|
| -# Character Class Definitions.
|
| -#
|
| -
|
| -$CR = [\p{Word_Break = CR}];
|
| -$LF = [\p{Word_Break = LF}];
|
| -$Newline = [\p{Word_Break = Newline}];
|
| -$Extend = [\p{Word_Break = Extend}];
|
| -$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
| -$Format = [\p{Word_Break = Format}];
|
| -$Katakana = [\p{Word_Break = Katakana}];
|
| -$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
| -$ALetter = [\p{Word_Break = ALetter}];
|
| -$Single_Quote = [\p{Word_Break = Single_Quote}];
|
| -$Double_Quote = [\p{Word_Break = Double_Quote}];
|
| -# Remove two full stop characters from $MidNumLet and add them to $MidNum
|
| -# to break a hostname into its components at the cost of breaking
|
| -# 'e.g.' and 'i.e.' as well.
|
| -# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
|
| -# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
|
| -# while rules 6/7 are reverted to the old behavior we want.
|
| -$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
| -$MidLetter = [\p{Word_Break = MidLetter}];
|
| -$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
| -$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
|
| -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
| -
|
| -$Han = [:Han:];
|
| -$Hiragana = [:Hiragana:];
|
| -
|
| -
|
| -# Dictionary character set, for triggering language-based break engines. Currently
|
| -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
| -# 5.0 or later as the definition of Complex_Context was corrected to include all
|
| -# characters requiring dictionary break.
|
| -
|
| -$Control = [\p{Grapheme_Cluster_Break = Control}];
|
| -$HangulSyllable = [\uac00-\ud7a3];
|
| -$ComplexContext = [:LineBreak = Complex_Context:];
|
| -$KanaKanji = [$Han $Hiragana $Katakana];
|
| -$dictionary = [$ComplexContext];
|
| -
|
| -$ALetterPlus = [$ALetter [$ComplexContext-$Extend-$Control]];
|
| -
|
| -
|
| -#
|
| -# Rules 4 Ignore Format and Extend characters,
|
| -# except when they appear at the beginning of a region of text.
|
| -#
|
| -# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
| -$KatakanaEx = $Katakana ($Extend | $Format)*;
|
| -$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
|
| -$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
| -$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
|
| -$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
|
| -$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
| -$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
| -$MidNumEx = $MidNum ($Extend | $Format)*;
|
| -$NumericEx = $Numeric ($Extend | $Format)*;
|
| -$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
| -$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
| -
|
| -$Ideographic = [\p{Ideographic} [\u3005 \u3007 \u303B]];
|
| -$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
| -$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
| -
|
| -## -------------------------------------------------
|
| -
|
| -!!forward;
|
| -
|
| -
|
| -# Rule 3 - CR x LF
|
| -#
|
| -$CR $LF;
|
| -
|
| -# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
| -# of a region of Text. The rule here comes into play when the start of text
|
| -# begins with a group of Format chars, or with a "word" consisting of a single
|
| -# char that is not in any of the listed word break categories followed by
|
| -# format char(s), or is not a CJK dictionary character.
|
| -[^$CR $LF $Newline]? ($Extend | $Format)+;
|
| -
|
| -$NumericEx {100};
|
| -$ALetterEx {200};
|
| -$HangulSyllable {200};
|
| -$Hebrew_LetterEx{200};
|
| -$KatakanaEx {400}; # note: these status values override those from rule 5
|
| -$HiraganaEx {400}; # by virtue of being numerically larger.
|
| -$IdeographicEx {400}; #
|
| -
|
| -#
|
| -# rule 5
|
| -# Do not break between most letters.
|
| -#
|
| -($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
|
| -
|
| -# rule 6 and 7
|
| -($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
|
| -
|
| -# rule 7a
|
| -$Hebrew_LetterEx $Single_QuoteEx {200};
|
| -
|
| -# rule 7b and 7c
|
| -$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
|
| -
|
| -# rule 8
|
| -
|
| -$NumericEx $NumericEx {100};
|
| -
|
| -# rule 9
|
| -
|
| -($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
|
| -
|
| -# rule 10
|
| -
|
| -$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
|
| -
|
| -# rule 11 and 12
|
| -
|
| -$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
| -
|
| -# rule 13
|
| -# to be consistent with $KanaKanji $KanaKanhi, changed
|
| -# from 300 to 400.
|
| -# See also TestRuleStatus in intltest/rbbiapts.cpp
|
| -$KatakanaEx $KatakanaEx {400};
|
| -$HiraganaEx $HiraganaEx {400};
|
| -$IdeographicEx $IdeographicEx {400};
|
| -
|
| -# rule 13a/b
|
| -
|
| -$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
| -$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
|
| -$NumericEx $ExtendNumLetEx {100}; # (13a)
|
| -$KatakanaEx $ExtendNumLetEx {400}; # (13a)
|
| -$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
| -
|
| -$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
| -$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
|
| -$ExtendNumLetEx $NumericEx {100}; # (13b)
|
| -$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
| -
|
| -# rule 13c
|
| -
|
| -$Regional_IndicatorEx $Regional_IndicatorEx;
|
| -
|
| -## -------------------------------------------------
|
| -
|
| -!!reverse;
|
| -
|
| -$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
|
| -$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
|
| -$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
|
| -$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
|
| -$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
|
| -$BackNumericEx = ($Format | $Extend)* $Numeric;
|
| -$BackMidNumEx = ($Format | $Extend)* $MidNum;
|
| -$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
| -$BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
| -$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
|
| -$BackIdeographicEx = ($Format | $Extend)* $Ideographic;
|
| -$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
|
| -$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
|
| -
|
| -# rule 3
|
| -$LF $CR;
|
| -
|
| -# rule 4
|
| -($Format | $Extend)* [^$CR $LF $Newline]?;
|
| -
|
| -# rule 5
|
| -
|
| -($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
|
| -
|
| -# rule 6 and 7
|
| -
|
| -($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
|
| -
|
| -# rule 7a
|
| -$BackSingle_QuoteEx $BackHebrew_LetterEx;
|
| -
|
| -# Rule 7b and 7c
|
| -$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
|
| -
|
| -# rule 8
|
| -
|
| -$BackNumericEx $BackNumericEx;
|
| -
|
| -# rule 9
|
| -
|
| -$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
|
| -
|
| -# rule 10
|
| -
|
| -($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
|
| -
|
| -# rule 11 and 12
|
| -
|
| -$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
|
| -
|
| -# rule 13
|
| -
|
| -$BackKatakanaEx $BackKatakanaEx;
|
| -$BackHiraganaEx $BackHiraganaEx;
|
| -$BackIdeographicEx $BackIdeographicEx;
|
| -
|
| -# rules 13 a/b
|
| -#
|
| -$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
| -($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
| -
|
| -# rule 13c
|
| -
|
| -$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
|
| -
|
| -## -------------------------------------------------
|
| -
|
| -!!safe_reverse;
|
| -
|
| -# rule 3
|
| -($Extend | $Format)+ .?;
|
| -
|
| -# rule 6
|
| -($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
|
| -
|
| -# rule 7b
|
| -$Double_Quote $BackHebrew_LetterEx;
|
| -
|
| -
|
| -# rule 11
|
| -($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
|
| -
|
| -# For dictionary-based break
|
| -$dictionary $dictionary;
|
| -
|
| -## -------------------------------------------------
|
| -
|
| -!!safe_forward;
|
| -
|
| -# rule 4
|
| -($Extend | $Format)+ .?;
|
| -
|
| -# rule 6
|
| -($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
|
| -
|
| -# rule 7b
|
| -$Double_QuoteEx $Hebrew_LetterEx;
|
| -
|
| -# rule 11
|
| -($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
|
| -
|
| -# For dictionary-based break
|
| -$dictionary $dictionary;
|
|
|