OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
6 | 6 |
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
8 | 8 |
9 #include <map> | 9 #include <map> |
10 #include <string> | 10 #include <string> |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
53 // This template only changes the forward-iteration rules. So, calling | 53 // This template only changes the forward-iteration rules. So, calling |
54 // ubrk_prev() returns the same results as the original template. | 54 // ubrk_prev() returns the same results as the original template. |
55 static const char kRuleTemplate[] = | 55 static const char kRuleTemplate[] = |
56 "!!chain;" | 56 "!!chain;" |
57 "$CR = [\\p{Word_Break = CR}];" | 57 "$CR = [\\p{Word_Break = CR}];" |
58 "$LF = [\\p{Word_Break = LF}];" | 58 "$LF = [\\p{Word_Break = LF}];" |
59 "$Newline = [\\p{Word_Break = Newline}];" | 59 "$Newline = [\\p{Word_Break = Newline}];" |
60 "$Extend = [\\p{Word_Break = Extend}];" | 60 "$Extend = [\\p{Word_Break = Extend}];" |
61 "$Format = [\\p{Word_Break = Format}];" | 61 "$Format = [\\p{Word_Break = Format}];" |
62 "$Katakana = [\\p{Word_Break = Katakana}];" | 62 "$Katakana = [\\p{Word_Break = Katakana}];" |
| 63 // Not all the characters in a given script are ALetter. |
| 64 // For instance, U+05F4 is MidLetter. So, this may be |
| 65 // better, but it leads to an empty set error in Thai. |
| 66 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" |
63 "$ALetter = [\\p{script=%s}];" | 67 "$ALetter = [\\p{script=%s}];" |
64 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" | 68 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" |
65 "$MidLetter = [\\p{Word_Break = MidLetter}];" | 69 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" |
66 "$MidNum = [\\p{Word_Break = MidNum}];" | 70 "$MidNum = [\\p{Word_Break = MidNum}];" |
67 "$Numeric = [\\p{Word_Break = Numeric}];" | 71 "$Numeric = [\\p{Word_Break = Numeric}];" |
68 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | 72 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" |
69 | 73 |
70 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " | 74 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " |
71 "%s" | 75 "%s" // ALetterPlus |
72 | 76 |
73 "$KatakanaEx = $Katakana ($Extend | $Format)*;" | 77 "$KatakanaEx = $Katakana ($Extend | $Format)*;" |
74 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" | 78 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" |
75 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" | 79 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" |
76 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" | 80 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" |
77 "$MidNumEx = $MidNum ($Extend | $Format)*;" | 81 "$MidNumEx = $MidNum ($Extend | $Format)*;" |
78 "$NumericEx = $Numeric ($Extend | $Format)*;" | 82 "$NumericEx = $Numeric ($Extend | $Format)*;" |
79 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" | 83 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" |
80 | 84 |
81 "$Hiragana = [\\p{script=Hiragana}];" | 85 "$Hiragana = [\\p{script=Hiragana}];" |
82 "$Ideographic = [\\p{Ideographic}];" | 86 "$Ideographic = [\\p{Ideographic}];" |
83 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" | 87 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" |
84 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" | 88 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" |
85 | 89 |
86 "!!forward;" | 90 "!!forward;" |
87 "$CR $LF;" | 91 "$CR $LF;" |
88 "[^$CR $LF $Newline]? ($Extend | $Format)+;" | 92 "[^$CR $LF $Newline]? ($Extend | $Format)+;" |
89 "$ALetterEx {200};" | 93 "$ALetterEx {200};" |
90 "$ALetterEx $ALetterEx {200};" | 94 "$ALetterEx $ALetterEx {200};" |
91 "%s" | 95 "%s" // (Allow|Disallow) Contraction |
92 | 96 |
93 "!!reverse;" | 97 "!!reverse;" |
94 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" | 98 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" |
95 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" | 99 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" |
96 "$BackNumericEx = ($Format | $Extend)* $Numeric;" | 100 "$BackNumericEx = ($Format | $Extend)* $Numeric;" |
97 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" | 101 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" |
98 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" | 102 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" |
99 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" | 103 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" |
100 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" | 104 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" |
101 "$LF $CR;" | 105 "$LF $CR;" |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
143 aletter = "Latin"; | 147 aletter = "Latin"; |
144 | 148 |
145 const char kWithDictionary[] = | 149 const char kWithDictionary[] = |
146 "$dictionary = [:LineBreak = Complex_Context:];" | 150 "$dictionary = [:LineBreak = Complex_Context:];" |
147 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; | 151 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; |
148 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; | 152 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; |
149 const char* aletter_plus = kWithoutDictionary; | 153 const char* aletter_plus = kWithoutDictionary; |
150 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) | 154 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) |
151 aletter_plus = kWithDictionary; | 155 aletter_plus = kWithDictionary; |
152 | 156 |
| 157 const char kMidLetterExtra[] = ""; |
| 158 // For Hebrew, treat single/double quotation marks as MidLetter. |
| 159 const char kMidLetterExtraHebrew[] = "\"'"; |
| 160 const char* midletter_extra = kMidLetterExtra; |
| 161 if (script_code_ == USCRIPT_HEBREW) |
| 162 midletter_extra = kMidLetterExtraHebrew; |
| 163 |
153 // Create two custom rule-sets: one allows contraction and the other does not. | 164 // Create two custom rule-sets: one allows contraction and the other does not. |
154 // We save these strings in UTF-16 so we can use it without conversions. (ICU | 165 // We save these strings in UTF-16 so we can use it without conversions. (ICU |
155 // needs UTF-16 strings.) | 166 // needs UTF-16 strings.) |
156 const char kAllowContraction[] = | 167 const char kAllowContraction[] = |
157 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; | 168 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; |
158 const char kDisallowContraction[] = ""; | 169 const char kDisallowContraction[] = ""; |
159 | 170 |
160 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, | 171 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, |
161 aletter, aletter_plus, kAllowContraction)); | 172 aletter, midletter_extra, aletter_plus, kAllowContraction)); |
162 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, | 173 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, |
163 aletter, aletter_plus, kDisallowContraction)); | 174 aletter, midletter_extra, aletter_plus, kDisallowContraction)); |
164 } | 175 } |
165 | 176 |
166 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { | 177 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { |
167 // Call the language-specific function if necessary. | 178 // Call the language-specific function if necessary. |
168 // Otherwise, we call the default one. | 179 // Otherwise, we call the default one. |
169 switch (script_code_) { | 180 switch (script_code_) { |
170 case USCRIPT_ARABIC: | 181 case USCRIPT_ARABIC: |
171 return OutputArabic(c, output); | 182 return OutputArabic(c, output); |
172 | 183 |
173 case USCRIPT_HANGUL: | 184 case USCRIPT_HANGUL: |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
238 output->push_back(t); | 249 output->push_back(t); |
239 return true; | 250 return true; |
240 } | 251 } |
241 | 252 |
242 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { | 253 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { |
243 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds | 254 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds |
244 // to prevent our Hebrew dictionary from marking a Hebrew word including | 255 // to prevent our Hebrew dictionary from marking a Hebrew word including |
245 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check | 256 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check |
246 // niqquds manually and filter them out since their script codes are | 257 // niqquds manually and filter them out since their script codes are |
247 // USCRIPT_HEBREW.) | 258 // USCRIPT_HEBREW.) |
248 if (0x05D0 <= c && c <= 0x05EA) | 259 // Pass through ASCII single/double quotation marks and Hebrew Geresh and |
| 260 // Gershayim. |
| 261 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || |
| 262 c == 0x05F4 || c == 0x05F3) |
249 output->push_back(c); | 263 output->push_back(c); |
250 return true; | 264 return true; |
251 } | 265 } |
252 | 266 |
253 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { | 267 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { |
254 // Check the script code of this character and output only if it is the one | 268 // Check the script code of this character and output only if it is the one |
255 // used by the spellchecker language. | 269 // used by the spellchecker language. |
256 UErrorCode status = U_ZERO_ERROR; | 270 UErrorCode status = U_ZERO_ERROR; |
257 UScriptCode script_code = uscript_getScript(c, &status); | 271 UScriptCode script_code = uscript_getScript(c, &status); |
258 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | 272 if (script_code == script_code_ || script_code == USCRIPT_COMMON) |
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
359 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 373 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
360 return false; | 374 return false; |
361 | 375 |
362 // Copy the normalized text to the output. | 376 // Copy the normalized text to the output. |
363 icu::StringCharacterIterator it(output); | 377 icu::StringCharacterIterator it(output); |
364 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 378 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
365 attribute_->OutputChar(c, output_string); | 379 attribute_->OutputChar(c, output_string); |
366 | 380 |
367 return !output_string->empty(); | 381 return !output_string->empty(); |
368 } | 382 } |
OLD | NEW |