Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(75)

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 3112015: Customize Hebrew spellcheck word break iterator... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Implements a custom word iterator used for our spellchecker. 5 // Implements a custom word iterator used for our spellchecker.
6 6
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
8 8
9 #include <map> 9 #include <map>
10 #include <string> 10 #include <string>
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
53 // This template only changes the forward-iteration rules. So, calling 53 // This template only changes the forward-iteration rules. So, calling
54 // ubrk_prev() returns the same results as the original template. 54 // ubrk_prev() returns the same results as the original template.
55 static const char kRuleTemplate[] = 55 static const char kRuleTemplate[] =
56 "!!chain;" 56 "!!chain;"
57 "$CR = [\\p{Word_Break = CR}];" 57 "$CR = [\\p{Word_Break = CR}];"
58 "$LF = [\\p{Word_Break = LF}];" 58 "$LF = [\\p{Word_Break = LF}];"
59 "$Newline = [\\p{Word_Break = Newline}];" 59 "$Newline = [\\p{Word_Break = Newline}];"
60 "$Extend = [\\p{Word_Break = Extend}];" 60 "$Extend = [\\p{Word_Break = Extend}];"
61 "$Format = [\\p{Word_Break = Format}];" 61 "$Format = [\\p{Word_Break = Format}];"
62 "$Katakana = [\\p{Word_Break = Katakana}];" 62 "$Katakana = [\\p{Word_Break = Katakana}];"
63 // Not all the characters in a given script are ALetter.
64 // For instance, U+05F4 is MidLetter. So, this may be
65 // better, but it leads to an empty set error in Thai.
66 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
63 "$ALetter = [\\p{script=%s}];" 67 "$ALetter = [\\p{script=%s}];"
64 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" 68 "$MidNumLet = [\\p{Word_Break = MidNumLet}];"
65 "$MidLetter = [\\p{Word_Break = MidLetter}];" 69 "$MidLetter = [\\p{Word_Break = MidLetter}%s];"
66 "$MidNum = [\\p{Word_Break = MidNum}];" 70 "$MidNum = [\\p{Word_Break = MidNum}];"
67 "$Numeric = [\\p{Word_Break = Numeric}];" 71 "$Numeric = [\\p{Word_Break = Numeric}];"
68 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" 72 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
69 73
70 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " 74 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; "
71 "%s" 75 "%s" // ALetterPlus
72 76
73 "$KatakanaEx = $Katakana ($Extend | $Format)*;" 77 "$KatakanaEx = $Katakana ($Extend | $Format)*;"
74 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" 78 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;"
75 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" 79 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;"
76 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" 80 "$MidLetterEx = $MidLetter ($Extend | $Format)*;"
77 "$MidNumEx = $MidNum ($Extend | $Format)*;" 81 "$MidNumEx = $MidNum ($Extend | $Format)*;"
78 "$NumericEx = $Numeric ($Extend | $Format)*;" 82 "$NumericEx = $Numeric ($Extend | $Format)*;"
79 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" 83 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;"
80 84
81 "$Hiragana = [\\p{script=Hiragana}];" 85 "$Hiragana = [\\p{script=Hiragana}];"
82 "$Ideographic = [\\p{Ideographic}];" 86 "$Ideographic = [\\p{Ideographic}];"
83 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" 87 "$HiraganaEx = $Hiragana ($Extend | $Format)*;"
84 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" 88 "$IdeographicEx = $Ideographic ($Extend | $Format)*;"
85 89
86 "!!forward;" 90 "!!forward;"
87 "$CR $LF;" 91 "$CR $LF;"
88 "[^$CR $LF $Newline]? ($Extend | $Format)+;" 92 "[^$CR $LF $Newline]? ($Extend | $Format)+;"
89 "$ALetterEx {200};" 93 "$ALetterEx {200};"
90 "$ALetterEx $ALetterEx {200};" 94 "$ALetterEx $ALetterEx {200};"
91 "%s" 95 "%s" // (Allow|Disallow) Contraction
92 96
93 "!!reverse;" 97 "!!reverse;"
94 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" 98 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;"
95 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" 99 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;"
96 "$BackNumericEx = ($Format | $Extend)* $Numeric;" 100 "$BackNumericEx = ($Format | $Extend)* $Numeric;"
97 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" 101 "$BackMidNumEx = ($Format | $Extend)* $MidNum;"
98 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" 102 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;"
99 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" 103 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;"
100 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" 104 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;"
101 "$LF $CR;" 105 "$LF $CR;"
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
143 aletter = "Latin"; 147 aletter = "Latin";
144 148
145 const char kWithDictionary[] = 149 const char kWithDictionary[] =
146 "$dictionary = [:LineBreak = Complex_Context:];" 150 "$dictionary = [:LineBreak = Complex_Context:];"
147 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; 151 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];";
148 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; 152 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;";
149 const char* aletter_plus = kWithoutDictionary; 153 const char* aletter_plus = kWithoutDictionary;
150 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) 154 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
151 aletter_plus = kWithDictionary; 155 aletter_plus = kWithDictionary;
152 156
157 const char kMidLetterExtra[] = "";
158 // For Hebrew, treat single/double quotation marks as MidLetter.
159 const char kMidLetterExtraHebrew[] = "\"'";
160 const char* midletter_extra = kMidLetterExtra;
161 if (script_code_ == USCRIPT_HEBREW)
162 midletter_extra = kMidLetterExtraHebrew;
163
153 // Create two custom rule-sets: one allows contraction and the other does not. 164 // Create two custom rule-sets: one allows contraction and the other does not.
154 // We save these strings in UTF-16 so we can use them without conversions. (ICU 165 // We save these strings in UTF-16 so we can use them without conversions. (ICU
155 // needs UTF-16 strings.) 166 // needs UTF-16 strings.)
156 const char kAllowContraction[] = 167 const char kAllowContraction[] =
157 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; 168 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};";
158 const char kDisallowContraction[] = ""; 169 const char kDisallowContraction[] = "";
159 170
160 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, 171 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,
161 aletter, aletter_plus, kAllowContraction)); 172 aletter, midletter_extra, aletter_plus, kAllowContraction));
162 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, 173 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,
163 aletter, aletter_plus, kDisallowContraction)); 174 aletter, midletter_extra, aletter_plus, kDisallowContraction));
164 } 175 }
165 176
166 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { 177 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {
167 // Call the language-specific function if necessary. 178 // Call the language-specific function if necessary.
168 // Otherwise, we call the default one. 179 // Otherwise, we call the default one.
169 switch (script_code_) { 180 switch (script_code_) {
170 case USCRIPT_ARABIC: 181 case USCRIPT_ARABIC:
171 return OutputArabic(c, output); 182 return OutputArabic(c, output);
172 183
173 case USCRIPT_HANGUL: 184 case USCRIPT_HANGUL:
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
238 output->push_back(t); 249 output->push_back(t);
239 return true; 250 return true;
240 } 251 }
241 252
242 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { 253 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
243 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds 254 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
244 // to prevent our Hebrew dictionary from marking a Hebrew word including 255 // to prevent our Hebrew dictionary from marking a Hebrew word including
245 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check 256 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
246 // niqquds manually and filter them out since their script codes are 257 // niqquds manually and filter them out since their script codes are
247 // USCRIPT_HEBREW.) 258 // USCRIPT_HEBREW.)
248 if (0x05D0 <= c && c <= 0x05EA) 259 // Pass through ASCII single/double quotation marks and Hebrew Geresh and
260 // Gershayim.
261 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||
262 c == 0x05F4 || c == 0x05F3)
249 output->push_back(c); 263 output->push_back(c);
250 return true; 264 return true;
251 } 265 }
252 266
253 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { 267 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {
254 // Check the script code of this character and output only if it is the one 268 // Check the script code of this character and output only if it is the one
255 // used by the spellchecker language. 269 // used by the spellchecker language.
256 UErrorCode status = U_ZERO_ERROR; 270 UErrorCode status = U_ZERO_ERROR;
257 UScriptCode script_code = uscript_getScript(c, &status); 271 UScriptCode script_code = uscript_getScript(c, &status);
258 if (script_code == script_code_ || script_code == USCRIPT_COMMON) 272 if (script_code == script_code_ || script_code == USCRIPT_COMMON)
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
359 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) 373 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
360 return false; 374 return false;
361 375
362 // Copy the normalized text to the output. 376 // Copy the normalized text to the output.
363 icu::StringCharacterIterator it(output); 377 icu::StringCharacterIterator it(output);
364 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) 378 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
365 attribute_->OutputChar(c, output_string); 379 attribute_->OutputChar(c, output_string);
366 380
367 return !output_string->empty(); 381 return !output_string->empty();
368 } 382 }
OLDNEW
« no previous file with comments | « no previous file | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698