chrome/renderer/spellchecker/spellcheck_worditerator.cc - Issue 3112015: Customize Hebrew spellcheck word break iterator...

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 3112015: Customize Hebrew spellcheck word break iterator... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Implements a custom word iterator used for our spellchecker.	5 // Implements a custom word iterator used for our spellchecker.

6	6

7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"	7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

8	8

9 #include <map>	9 #include <map>

10 #include <string>	10 #include <string>

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
53 // This template only changes the forward-iteration rules. So, calling	53 // This template only changes the forward-iteration rules. So, calling

54 // ubrk_prev() returns the same results as the original template.	54 // ubrk_prev() returns the same results as the original template.

55 static const char kRuleTemplate[] =	55 static const char kRuleTemplate[] =

56 "!!chain;"	56 "!!chain;"

57 "$CR = [\\p{Word_Break = CR}];"	57 "$CR = [\\p{Word_Break = CR}];"

58 "$LF = [\\p{Word_Break = LF}];"	58 "$LF = [\\p{Word_Break = LF}];"

59 "$Newline = [\\p{Word_Break = Newline}];"	59 "$Newline = [\\p{Word_Break = Newline}];"

60 "$Extend = [\\p{Word_Break = Extend}];"	60 "$Extend = [\\p{Word_Break = Extend}];"

61 "$Format = [\\p{Word_Break = Format}];"	61 "$Format = [\\p{Word_Break = Format}];"

62 "$Katakana = [\\p{Word_Break = Katakana}];"	62 "$Katakana = [\\p{Word_Break = Katakana}];"

	63 // Not all the characters in a given script are ALetter.

	64 // For instance, U+05F4 is MidLetter. So, this may be

	65 // better, but it leads to an empty set error in Thai.

	66 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"

63 "$ALetter = [\\p{script=%s}];"	67 "$ALetter = [\\p{script=%s}];"

64 "$MidNumLet = [\\p{Word_Break = MidNumLet}];"	68 "$MidNumLet = [\\p{Word_Break = MidNumLet}];"

65 "$MidLetter = [\\p{Word_Break = MidLetter}];"	69 "$MidLetter = [\\p{Word_Break = MidLetter}%s];"

66 "$MidNum = [\\p{Word_Break = MidNum}];"	70 "$MidNum = [\\p{Word_Break = MidNum}];"

67 "$Numeric = [\\p{Word_Break = Numeric}];"	71 "$Numeric = [\\p{Word_Break = Numeric}];"

68 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"	72 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"

69	73

70 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; "	74 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; "

71 "%s"	75 "%s" // ALetterPlus

72	76

73 "$KatakanaEx = $Katakana ($Extend \| $Format)*;"	77 "$KatakanaEx = $Katakana ($Extend \| $Format)*;"

74 "$ALetterEx = $ALetterPlus ($Extend \| $Format)*;"	78 "$ALetterEx = $ALetterPlus ($Extend \| $Format)*;"

75 "$MidNumLetEx = $MidNumLet ($Extend \| $Format)*;"	79 "$MidNumLetEx = $MidNumLet ($Extend \| $Format)*;"

76 "$MidLetterEx = $MidLetter ($Extend \| $Format)*;"	80 "$MidLetterEx = $MidLetter ($Extend \| $Format)*;"

77 "$MidNumEx = $MidNum ($Extend \| $Format)*;"	81 "$MidNumEx = $MidNum ($Extend \| $Format)*;"

78 "$NumericEx = $Numeric ($Extend \| $Format)*;"	82 "$NumericEx = $Numeric ($Extend \| $Format)*;"

79 "$ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;"	83 "$ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;"

80	84

81 "$Hiragana = [\\p{script=Hiragana}];"	85 "$Hiragana = [\\p{script=Hiragana}];"

82 "$Ideographic = [\\p{Ideographic}];"	86 "$Ideographic = [\\p{Ideographic}];"

83 "$HiraganaEx = $Hiragana ($Extend \| $Format)*;"	87 "$HiraganaEx = $Hiragana ($Extend \| $Format)*;"

84 "$IdeographicEx = $Ideographic ($Extend \| $Format)*;"	88 "$IdeographicEx = $Ideographic ($Extend \| $Format)*;"

85	89

86 "!!forward;"	90 "!!forward;"

87 "$CR $LF;"	91 "$CR $LF;"

88 "[^$CR $LF $Newline]? ($Extend \| $Format)+;"	92 "[^$CR $LF $Newline]? ($Extend \| $Format)+;"

89 "$ALetterEx {200};"	93 "$ALetterEx {200};"

90 "$ALetterEx $ALetterEx {200};"	94 "$ALetterEx $ALetterEx {200};"

91 "%s"	95 "%s" // (Allow\|Disallow) Contraction

92	96

93 "!!reverse;"	97 "!!reverse;"

94 "$BackALetterEx = ($Format \| $Extend)* $ALetterPlus;"	98 "$BackALetterEx = ($Format \| $Extend)* $ALetterPlus;"

95 "$BackMidNumLetEx = ($Format \| $Extend)* $MidNumLet;"	99 "$BackMidNumLetEx = ($Format \| $Extend)* $MidNumLet;"

96 "$BackNumericEx = ($Format \| $Extend)* $Numeric;"	100 "$BackNumericEx = ($Format \| $Extend)* $Numeric;"

97 "$BackMidNumEx = ($Format \| $Extend)* $MidNum;"	101 "$BackMidNumEx = ($Format \| $Extend)* $MidNum;"

98 "$BackMidLetterEx = ($Format \| $Extend)* $MidLetter;"	102 "$BackMidLetterEx = ($Format \| $Extend)* $MidLetter;"

99 "$BackKatakanaEx = ($Format \| $Extend)* $Katakana;"	103 "$BackKatakanaEx = ($Format \| $Extend)* $Katakana;"

100 "$BackExtendNumLetEx= ($Format \| $Extend)* $ExtendNumLet;"	104 "$BackExtendNumLetEx= ($Format \| $Extend)* $ExtendNumLet;"

101 "$LF $CR;"	105 "$LF $CR;"

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
143 aletter = "Latin";	147 aletter = "Latin";

144	148

145 const char kWithDictionary[] =	149 const char kWithDictionary[] =

146 "$dictionary = [:LineBreak = Complex_Context:];"	150 "$dictionary = [:LineBreak = Complex_Context:];"

147 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];";	151 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];";

148 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;";	152 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;";

149 const char* aletter_plus = kWithoutDictionary;	153 const char* aletter_plus = kWithoutDictionary;

150 if (script_code_ == USCRIPT_HANGUL \|\| script_code_ == USCRIPT_THAI)	154 if (script_code_ == USCRIPT_HANGUL \|\| script_code_ == USCRIPT_THAI)

151 aletter_plus = kWithDictionary;	155 aletter_plus = kWithDictionary;

152	156

	157 const char kMidLetterExtra[] = "";

	158 // For Hebrew, treat single/double quotation marks as MidLetter.

	159 const char kMidLetterExtraHebrew[] = "\"'";

	160 const char* midletter_extra = kMidLetterExtra;

	161 if (script_code_ == USCRIPT_HEBREW)

	162 midletter_extra = kMidLetterExtraHebrew;

	163

153 // Create two custom rule-sets: one allows contraction and the other does not.	164 // Create two custom rule-sets: one allows contraction and the other does not.

154 // We save these strings in UTF-16 so we can use them without conversions. (ICU	165 // We save these strings in UTF-16 so we can use them without conversions. (ICU

155 // needs UTF-16 strings.)	166 // needs UTF-16 strings.)

156 const char kAllowContraction[] =	167 const char kAllowContraction[] =

157 "$ALetterEx ($MidLetterEx \| $MidNumLetEx) $ALetterEx {200};";	168 "$ALetterEx ($MidLetterEx \| $MidNumLetEx) $ALetterEx {200};";

158 const char kDisallowContraction[] = "";	169 const char kDisallowContraction[] = "";

159	170

160 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,	171 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,

161 aletter, aletter_plus, kAllowContraction));	172 aletter, midletter_extra, aletter_plus, kAllowContraction));

162 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,	173 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,

163 aletter, aletter_plus, kDisallowContraction));	174 aletter, midletter_extra, aletter_plus, kDisallowContraction));

164 }	175 }

165	176

166 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {	177 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {

167 // Call the language-specific function if necessary.	178 // Call the language-specific function if necessary.

168 // Otherwise, we call the default one.	179 // Otherwise, we call the default one.

169 switch (script_code_) {	180 switch (script_code_) {

170 case USCRIPT_ARABIC:	181 case USCRIPT_ARABIC:

171 return OutputArabic(c, output);	182 return OutputArabic(c, output);

172	183

173 case USCRIPT_HANGUL:	184 case USCRIPT_HANGUL:

(...skipping 64 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
238 output->push_back(t);	249 output->push_back(t);

239 return true;	250 return true;

240 }	251 }

241	252

242 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {	253 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {

243 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds	254 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds

244 // to prevent our Hebrew dictionary from marking a Hebrew word including	255 // to prevent our Hebrew dictionary from marking a Hebrew word including

245 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check	256 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check

246 // niqquds manually and filter them out since their script codes are	257 // niqquds manually and filter them out since their script codes are

247 // USCRIPT_HEBREW.)	258 // USCRIPT_HEBREW.)

248 if (0x05D0 <= c && c <= 0x05EA)	259 // Pass through ASCII single/double quotation marks and Hebrew Geresh and

	260 // Gershayim.

	261 if ((0x05D0 <= c && c <= 0x05EA) \|\| c == 0x22 \|\| c == 0x27 \|\|

	262 c == 0x05F4 \|\| c == 0x05F3)

249 output->push_back(c);	263 output->push_back(c);

250 return true;	264 return true;

251 }	265 }

252	266

253 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {	267 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {

254 // Check the script code of this character and output only if it is the one	268 // Check the script code of this character and output only if it is the one

255 // used by the spellchecker language.	269 // used by the spellchecker language.

256 UErrorCode status = U_ZERO_ERROR;	270 UErrorCode status = U_ZERO_ERROR;

257 UScriptCode script_code = uscript_getScript(c, &status);	271 UScriptCode script_code = uscript_getScript(c, &status);

258 if (script_code == script_code_ \|\| script_code == USCRIPT_COMMON)	272 if (script_code == script_code_ \|\| script_code == USCRIPT_COMMON)

(...skipping 100 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
359 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)	373 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)

360 return false;	374 return false;

361	375

362 // Copy the normalized text to the output.	376 // Copy the normalized text to the output.

363 icu::StringCharacterIterator it(output);	377 icu::StringCharacterIterator it(output);

364 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())	378 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())

365 attribute_->OutputChar(c, output_string);	379 attribute_->OutputChar(c, output_string);

366	380

367 return !output_string->empty();	381 return !output_string->empty();

368 }	382 }

OLD	NEW

« no previous file with comments | « no previous file | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc » ('j') | no next file with comments »