icu46/source/data/brkitr/word.txt - Issue 6349014: ...

Side by Side Diff: icu46/source/data/brkitr/word.txt

Issue 6349014: ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 9 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 #	1 #

2 # Copyright (C) 2002-2010, International Business Machines Corporation	2 # Copyright (C) 2002-2010, International Business Machines Corporation

3 # and others. All Rights Reserved.	3 # and others. All Rights Reserved.

4 #	4 #

5 # file: word.txt	5 # file: word.txt

6 #	6 #

7 # ICU Word Break Rules	7 # ICU Word Break Rules

8 # See Unicode Standard Annex #29.	8 # See Unicode Standard Annex #29.

9 # These rules are based on UAX-29 Revision 16 for Unicode 6.0	9 # These rules are based on UAX-29 Revision 16 for Unicode 6.0

10 #	10 #

(...skipping 11 matching lines...) Expand all Loading...
22	22

23 #	23 #

24 # Character Class Definitions.	24 # Character Class Definitions.

25 #	25 #

26	26

27 $CR = [\p{Word_Break = CR}];	27 $CR = [\p{Word_Break = CR}];

28 $LF = [\p{Word_Break = LF}];	28 $LF = [\p{Word_Break = LF}];

29 $Newline = [\p{Word_Break = Newline}];	29 $Newline = [\p{Word_Break = Newline}];

30 $Extend = [\p{Word_Break = Extend}];	30 $Extend = [\p{Word_Break = Extend}];

31 $Format = [\p{Word_Break = Format}];	31 $Format = [\p{Word_Break = Format}];

	32 $Hiragana = [:Hiragana:];

32 $Katakana = [\p{Word_Break = Katakana}];	33 $Katakana = [\p{Word_Break = Katakana}];

	34 $Han = [:Han:];

33 $ALetter = [\p{Word_Break = ALetter}];	35 $ALetter = [\p{Word_Break = ALetter}];

34 $MidNumLet = [\p{Word_Break = MidNumLet}];	36 # Remove two full stop characters from $MidNumLet and add them to $MidNum

	37 # to break a hostname into its components at the cost of breaking

	38 # 'e.g.' and 'i.e.' as well.

	39 # $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.

	40 # Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected

	41 # while rules 6/7 are reverted to the old behavior we want.

	42 $MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];

35 $MidLetter = [\p{Word_Break = MidLetter}];	43 $MidLetter = [\p{Word_Break = MidLetter}];

36 $MidNum = [\p{Word_Break = MidNum}];	44 $MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];

37 $Numeric = [\p{Word_Break = Numeric}];	45 $Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits

38 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];	46 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

39	47

	48 # Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'.

	49 $HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]];

	50 # U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by

	51 # the current rule 6/7.

	52 $HebrewMidLet = [\u0022];

40	53

41 # Dictionary character set, for triggering language-based break engines. Currently	54 # Dictionary character set, for triggering language-based break engines. Currently

42 # limited to LineBreak=Complex_Context. Note that this set only works in Unicode	55 # limited to LineBreak=Complex_Context and CJK. Note that this set only works

43 # 5.0 or later as the definition of Complex_Context was corrected to include all	56 # in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all

44 # characters requiring dictionary break.	57 # characters requiring dictionary break.

45	58

46 $dictionary = [:LineBreak = Complex_Context:];

47 $Control = [\p{Grapheme_Cluster_Break = Control}];	59 $Control = [\p{Grapheme_Cluster_Break = Control}];

48 $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not	60 $HangulSyllable = [\uac00-\ud7a3];

49 # include the dictionary characters.	61 $ComplexContext = [:LineBreak = Complex_Context:];

	62 $KanaKanji = [$Han $Hiragana $Katakana];

	63 $dictionaryCJK = [$KanaKanji $HangulSyllable];

	64 $dictionary = [$ComplexContext $dictionaryCJK];

	65

	66 # leave CJK scripts out of ALetterPlus

	67 $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];

	68

50	69

51 #	70 #

52 # Rules 4 Ignore Format and Extend characters,	71 # Rules 4 Ignore Format and Extend characters,

53 # except when they appear at the beginning of a region of text.	72 # except when they appear at the beginning of a region of text.

54 #	73 #

	74 # TODO: check if handling of katakana in dictionary makes rules incorrect/void.

55 $KatakanaEx = $Katakana ($Extend \| $Format)*;	75 $KatakanaEx = $Katakana ($Extend \| $Format)*;

56 $ALetterEx = $ALetterPlus ($Extend \| $Format)*;	76 $ALetterEx = $ALetterPlus ($Extend \| $Format)*;

57 $MidNumLetEx = $MidNumLet ($Extend \| $Format)*;	77 $MidNumLetEx = $MidNumLet ($Extend \| $Format)*;

58 $MidLetterEx = $MidLetter ($Extend \| $Format)*;	78 $MidLetterEx = $MidLetter ($Extend \| $Format)*;

59 $MidNumEx = $MidNum ($Extend \| $Format)*;	79 $MidNumEx = $MidNum ($Extend \| $Format)*;

60 $NumericEx = $Numeric ($Extend \| $Format)*;	80 $NumericEx = $Numeric ($Extend \| $Format)*;

61 $ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;	81 $ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;

	82 $HebrewLetEx = $HebrewLet ($Extend \| $Format)*;

62	83

63 $Hiragana = [\p{script=Hiragana}];

64 $Ideographic = [\p{Ideographic}];	84 $Ideographic = [\p{Ideographic}];

65 $HiraganaEx = $Hiragana ($Extend \| $Format)*;	85 $HiraganaEx = $Hiragana ($Extend \| $Format)*;

66 $IdeographicEx = $Ideographic ($Extend \| $Format)*;	86 $IdeographicEx = $Ideographic ($Extend \| $Format)*;

67	87

68 ## -------------------------------------------------	88 ## -------------------------------------------------

69	89

70 !!forward;	90 !!forward;

71	91

72	92

73 # Rule 3 - CR x LF	93 # Rule 3 - CR x LF

74 #	94 #

75 $CR $LF;	95 $CR $LF;

76	96

77 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning	97 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning

78 # of a region of Text. The rule here comes into play when the start of text	98 # of a region of Text. The rule here comes into play when the start of text

79 # begins with a group of Format chars, or with a "word" consisting of a single	99 # begins with a group of Format chars, or with a "word" consisting of a single

80 # char that is not in any of the listed word break categories followed by	100 # char that is not in any of the listed word break categories followed by

81 # format char(s).	101 # format char(s).

82 [^$CR $LF $Newline]? ($Extend \| $Format)+;	102 # format char(s), or is not a CJK dictionary character.

	103 [^$CR $LF $Newline $dictionaryCJK]? ($Extend \| $Format)+;

83	104

84 $NumericEx {100};	105 $NumericEx {100};

85 $ALetterEx {200};	106 $ALetterEx {200};

86 $KatakanaEx {300}; # note: these status values override those from rule 5	107 $HangulSyllable {200};

87 $HiraganaEx {300}; # by virtue of being numerically larger.	108 $KatakanaEx {400}; #originally 300

	109 $HiraganaEx {400}; #originally 300

88 $IdeographicEx {400}; #	110 $IdeographicEx {400}; #

89	111

90 #	112 #

91 # rule 5	113 # rule 5

92 # Do not break between most letters.	114 # Do not break between most letters.

93 #	115 #

94 $ALetterEx $ALetterEx {200};	116 $ALetterEx $ALetterEx {200};

95	117

96 # rule 6 and 7	118 # rule 6 and 7

97 $ALetterEx ($MidLetterEx \| $MidNumLetEx) $ALetterEx {200};	119 $ALetterEx ($MidLetterEx \| $MidNumLetEx) $ALetterEx {200};

98	120

	121 # Chrome addition

	122 $HebrewLetEx $HebrewMidLet $HebrewLetEx {200};

	123

99 # rule 8	124 # rule 8

100	125

101 $NumericEx $NumericEx {100};	126 $NumericEx $NumericEx {100};

102	127

103 # rule 9	128 # rule 9

104	129

105 $ALetterEx $NumericEx {200};	130 $ALetterEx $NumericEx {200};

106	131

107 # rule 10	132 # rule 10

108	133

109 $NumericEx $ALetterEx {200};	134 $NumericEx $ALetterEx {200};

110	135

111 # rule 11 and 12	136 # rule 11 and 12

112	137

113 $NumericEx ($MidNumEx \| $MidNumLetEx) $NumericEx {100};	138 $NumericEx ($MidNumEx \| $MidNumLetEx) $NumericEx {100};

114	139

115 # rule 13	140 # rule 13

116	141

117 $KatakanaEx $KatakanaEx {300};	142 # To be consistent with '$KanaKanji $KanaKanji', changed

	143 # from 300 to 400.

	144 # See also TestRuleStatus in intltest/rbbiapts.cpp

	145 $KatakanaEx $KatakanaEx {400};

118	146

119 # rule 13a/b	147 # rule 13a/b

120	148

121 $ALetterEx $ExtendNumLetEx {200}; # (13a)	149 $ALetterEx $ExtendNumLetEx {200}; # (13a)

122 $NumericEx $ExtendNumLetEx {100}; # (13a)	150 $NumericEx $ExtendNumLetEx {100}; # (13a)

123 $KatakanaEx $ExtendNumLetEx {300}; # (13a)	151 $KatakanaEx $ExtendNumLetEx {400}; # (13a)

124 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)	152 $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)

125	153

126 $ExtendNumLetEx $ALetterEx {200}; # (13b)	154 $ExtendNumLetEx $ALetterEx {200}; # (13b)

127 $ExtendNumLetEx $NumericEx {100}; # (13b)	155 $ExtendNumLetEx $NumericEx {100}; # (13b)

128 $ExtendNumLetEx $KatakanaEx {300}; # (13b)	156 $ExtendNumLetEx $KatakanaEx {400}; # (13b)

129	157

	158 # special handling for CJK characters: chain for later dictionary segmentation

	159 $HangulSyllable $HangulSyllable {200};

	160 $KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found

130	161

131	162

132 ## -------------------------------------------------	163 ## -------------------------------------------------

133	164

134 !!reverse;	165 !!reverse;

135	166

136 $BackALetterEx = ($Format \| $Extend)* $ALetterPlus;	167 $BackALetterEx = ($Format \| $Extend)* $ALetterPlus;

137 $BackMidNumLetEx = ($Format \| $Extend)* $MidNumLet;	168 $BackMidNumLetEx = ($Format \| $Extend)* $MidNumLet;

138 $BackNumericEx = ($Format \| $Extend)* $Numeric;	169 $BackNumericEx = ($Format \| $Extend)* $Numeric;

139 $BackMidNumEx = ($Format \| $Extend)* $MidNum;	170 $BackMidNumEx = ($Format \| $Extend)* $MidNum;

140 $BackMidLetterEx = ($Format \| $Extend)* $MidLetter;	171 $BackMidLetterEx = ($Format \| $Extend)* $MidLetter;

141 $BackKatakanaEx = ($Format \| $Extend)* $Katakana;	172 $BackKatakanaEx = ($Format \| $Extend)* $Katakana;

	173 $BackHiraganaEx = ($Extend \| $Format)* $Hiragana;

142 $BackExtendNumLetEx= ($Format \| $Extend)* $ExtendNumLet;	174 $BackExtendNumLetEx= ($Format \| $Extend)* $ExtendNumLet;

	175 $BackHebrewLetEx = ($Format \| $Extend)* $HebrewLet;

	176

143	177

144 # rule 3	178 # rule 3

145 $LF $CR;	179 $LF $CR;

146	180

147 # rule 4	181 # rule 4

148 ($Format \| $Extend)* [^$CR $LF $Newline]?;	182 ($Format \| $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;

149	183

150 # rule 5	184 # rule 5

151	185

152 $BackALetterEx $BackALetterEx;	186 $BackALetterEx $BackALetterEx;

153	187

154 # rule 6 and 7	188 # rule 6 and 7

155	189

156 $BackALetterEx ($BackMidLetterEx \| $BackMidNumLetEx) $BackALetterEx;	190 $BackALetterEx ($BackMidLetterEx \| $BackMidNumLetEx) $BackALetterEx;

157	191

	192 # Chrome addition

	193 $BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx;

158	194

159 # rule 8	195 # rule 8

160	196

161 $BackNumericEx $BackNumericEx;	197 $BackNumericEx $BackNumericEx;

162	198

163 # rule 9	199 # rule 9

164	200

165 $BackNumericEx $BackALetterEx;	201 $BackNumericEx $BackALetterEx;

166	202

167 # rule 10	203 # rule 10

168	204

169 $BackALetterEx $BackNumericEx;	205 $BackALetterEx $BackNumericEx;

170	206

171 # rule 11 and 12	207 # rule 11 and 12

172	208

173 $BackNumericEx ($BackMidNumEx \| $BackMidNumLetEx) $BackNumericEx;	209 $BackNumericEx ($BackMidNumEx \| $BackMidNumLetEx) $BackNumericEx;

174	210

175 # rule 13	211 # rule 13

176	212

177 $BackKatakanaEx $BackKatakanaEx;	213 $BackKatakanaEx $BackKatakanaEx;

178	214

179 # rules 13 a/b	215 # rules 13 a/b

180 #	216 #

181 $BackExtendNumLetEx ($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx \| $BackExtendNumLetEx);	217 $BackExtendNumLetEx ($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx \| $BackExtendNumLetEx);

182 ($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx) $BackExtendNumLetEx;	218 ($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx) $BackExtendNumLetEx;

183	219

	220 # special handling for CJK characters: chain for later dictionary segmentation

	221 $HangulSyllable $HangulSyllable;

	222 $KanaKanji $KanaKanji; #different rule status if both kanji and kana found

	223

184 ## -------------------------------------------------	224 ## -------------------------------------------------

185	225

186 !!safe_reverse;	226 !!safe_reverse;

187	227

188 # rule 3	228 # rule 3

189 ($Extend \| $Format)+ .?;	229 ($Extend \| $Format)+ .?;

190	230

191 # rule 6	231 # rule 6

192 ($MidLetter \| $MidNumLet) $BackALetterEx;	232 ($MidLetter \| $MidNumLet) $BackALetterEx;

193	233

(...skipping 11 matching lines...) Expand all Loading...
205 ($Extend \| $Format)+ .?;	245 ($Extend \| $Format)+ .?;

206	246

207 # rule 6	247 # rule 6

208 ($MidLetterEx \| $MidNumLetEx) $ALetterEx;	248 ($MidLetterEx \| $MidNumLetEx) $ALetterEx;

209	249

210 # rule 11	250 # rule 11

211 ($MidNumEx \| $MidNumLetEx) $NumericEx;	251 ($MidNumEx \| $MidNumLetEx) $NumericEx;

212	252

213 # For dictionary-based break	253 # For dictionary-based break

214 $dictionary $dictionary;	254 $dictionary $dictionary;

OLD	NEW

« no previous file with comments | « icu46/source/data/brkitr/line.txt ('k') | no next file » | no next file with comments »