| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
| 6 | 6 |
| 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
| 8 | 8 |
| 9 #include <map> | 9 #include <map> |
| 10 #include <string> | 10 #include <string> |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 53 // This template only changes the forward-iteration rules. So, calling | 53 // This template only changes the forward-iteration rules. So, calling |
| 54 // ubrk_prev() returns the same results as the original template. | 54 // ubrk_prev() returns the same results as the original template. |
| 55 static const char kRuleTemplate[] = | 55 static const char kRuleTemplate[] = |
| 56 "!!chain;" | 56 "!!chain;" |
| 57 "$CR = [\\p{Word_Break = CR}];" | 57 "$CR = [\\p{Word_Break = CR}];" |
| 58 "$LF = [\\p{Word_Break = LF}];" | 58 "$LF = [\\p{Word_Break = LF}];" |
| 59 "$Newline = [\\p{Word_Break = Newline}];" | 59 "$Newline = [\\p{Word_Break = Newline}];" |
| 60 "$Extend = [\\p{Word_Break = Extend}];" | 60 "$Extend = [\\p{Word_Break = Extend}];" |
| 61 "$Format = [\\p{Word_Break = Format}];" | 61 "$Format = [\\p{Word_Break = Format}];" |
| 62 "$Katakana = [\\p{Word_Break = Katakana}];" | 62 "$Katakana = [\\p{Word_Break = Katakana}];" |
| 63 // Not all the characters in a given script are ALetter. |
| 64 // For instance, U+05F4 is MidLetter. So, this may be |
| 65 // better, but it leads to an empty set error in Thai. |
| 66 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" |
| 63 "$ALetter = [\\p{script=%s}];" | 67 "$ALetter = [\\p{script=%s}];" |
| 64 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" | 68 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" |
| 65 "$MidLetter = [\\p{Word_Break = MidLetter}];" | 69 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" |
| 66 "$MidNum = [\\p{Word_Break = MidNum}];" | 70 "$MidNum = [\\p{Word_Break = MidNum}];" |
| 67 "$Numeric = [\\p{Word_Break = Numeric}];" | 71 "$Numeric = [\\p{Word_Break = Numeric}];" |
| 68 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | 72 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" |
| 69 | 73 |
| 70 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " | 74 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " |
| 71 "%s" | 75 "%s" // ALetterPlus |
| 72 | 76 |
| 73 "$KatakanaEx = $Katakana ($Extend | $Format)*;" | 77 "$KatakanaEx = $Katakana ($Extend | $Format)*;" |
| 74 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" | 78 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" |
| 75 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" | 79 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" |
| 76 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" | 80 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" |
| 77 "$MidNumEx = $MidNum ($Extend | $Format)*;" | 81 "$MidNumEx = $MidNum ($Extend | $Format)*;" |
| 78 "$NumericEx = $Numeric ($Extend | $Format)*;" | 82 "$NumericEx = $Numeric ($Extend | $Format)*;" |
| 79 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" | 83 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" |
| 80 | 84 |
| 81 "$Hiragana = [\\p{script=Hiragana}];" | 85 "$Hiragana = [\\p{script=Hiragana}];" |
| 82 "$Ideographic = [\\p{Ideographic}];" | 86 "$Ideographic = [\\p{Ideographic}];" |
| 83 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" | 87 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" |
| 84 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" | 88 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" |
| 85 | 89 |
| 86 "!!forward;" | 90 "!!forward;" |
| 87 "$CR $LF;" | 91 "$CR $LF;" |
| 88 "[^$CR $LF $Newline]? ($Extend | $Format)+;" | 92 "[^$CR $LF $Newline]? ($Extend | $Format)+;" |
| 89 "$ALetterEx {200};" | 93 "$ALetterEx {200};" |
| 90 "$ALetterEx $ALetterEx {200};" | 94 "$ALetterEx $ALetterEx {200};" |
| 91 "%s" | 95 "%s" // (Allow|Disallow) Contraction |
| 92 | 96 |
| 93 "!!reverse;" | 97 "!!reverse;" |
| 94 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" | 98 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" |
| 95 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" | 99 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" |
| 96 "$BackNumericEx = ($Format | $Extend)* $Numeric;" | 100 "$BackNumericEx = ($Format | $Extend)* $Numeric;" |
| 97 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" | 101 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" |
| 98 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" | 102 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" |
| 99 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" | 103 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" |
| 100 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" | 104 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" |
| 101 "$LF $CR;" | 105 "$LF $CR;" |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 143 aletter = "Latin"; | 147 aletter = "Latin"; |
| 144 | 148 |
| 145 const char kWithDictionary[] = | 149 const char kWithDictionary[] = |
| 146 "$dictionary = [:LineBreak = Complex_Context:];" | 150 "$dictionary = [:LineBreak = Complex_Context:];" |
| 147 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; | 151 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; |
| 148 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; | 152 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; |
| 149 const char* aletter_plus = kWithoutDictionary; | 153 const char* aletter_plus = kWithoutDictionary; |
| 150 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) | 154 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) |
| 151 aletter_plus = kWithDictionary; | 155 aletter_plus = kWithDictionary; |
| 152 | 156 |
| 157 const char kMidLetterExtra[] = ""; |
| 158 // For Hebrew, treat single/double quotation marks as MidLetter. |
| 159 const char kMidLetterExtraHebrew[] = "\"'"; |
| 160 const char* midletter_extra = kMidLetterExtra; |
| 161 if (script_code_ == USCRIPT_HEBREW) |
| 162 midletter_extra = kMidLetterExtraHebrew; |
| 163 |
| 153 // Create two custom rule-sets: one allows contraction and the other does not. | 164 // Create two custom rule-sets: one allows contraction and the other does not. |
| 154 // We save these strings in UTF-16 so we can use it without conversions. (ICU | 165 // We save these strings in UTF-16 so we can use it without conversions. (ICU |
| 155 // needs UTF-16 strings.) | 166 // needs UTF-16 strings.) |
| 156 const char kAllowContraction[] = | 167 const char kAllowContraction[] = |
| 157 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; | 168 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; |
| 158 const char kDisallowContraction[] = ""; | 169 const char kDisallowContraction[] = ""; |
| 159 | 170 |
| 160 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, | 171 ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, |
| 161 aletter, aletter_plus, kAllowContraction)); | 172 aletter, midletter_extra, aletter_plus, kAllowContraction)); |
| 162 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, | 173 ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, |
| 163 aletter, aletter_plus, kDisallowContraction)); | 174 aletter, midletter_extra, aletter_plus, kDisallowContraction)); |
| 164 } | 175 } |
| 165 | 176 |
| 166 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { | 177 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { |
| 167 // Call the language-specific function if necessary. | 178 // Call the language-specific function if necessary. |
| 168 // Otherwise, we call the default one. | 179 // Otherwise, we call the default one. |
| 169 switch (script_code_) { | 180 switch (script_code_) { |
| 170 case USCRIPT_ARABIC: | 181 case USCRIPT_ARABIC: |
| 171 return OutputArabic(c, output); | 182 return OutputArabic(c, output); |
| 172 | 183 |
| 173 case USCRIPT_HANGUL: | 184 case USCRIPT_HANGUL: |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 238 output->push_back(t); | 249 output->push_back(t); |
| 239 return true; | 250 return true; |
| 240 } | 251 } |
| 241 | 252 |
| 242 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { | 253 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { |
| 243 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds | 254 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds |
| 244 // to prevent our Hebrew dictionary from marking a Hebrew word including | 255 // to prevent our Hebrew dictionary from marking a Hebrew word including |
| 245 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check | 256 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check |
| 246 // niqquds manually and filter them out since their script codes are | 257 // niqquds manually and filter them out since their script codes are |
| 247 // USCRIPT_HEBREW.) | 258 // USCRIPT_HEBREW.) |
| 248 if (0x05D0 <= c && c <= 0x05EA) | 259 // Also pass through the ASCII double/single quotation marks (0x22, 0x27) and |
| 260 // the Hebrew Gershayim/Geresh punctuation marks (U+05F4, U+05F3). |
| 261 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || |
| 262 c == 0x05F4 || c == 0x05F3) |
| 249 output->push_back(c); | 263 output->push_back(c); |
| 250 return true; | 264 return true; |
| 251 } | 265 } |
| 252 | 266 |
| 253 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { | 267 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const { |
| 254 // Check the script code of this character and output only if it is the one | 268 // Check the script code of this character and output only if it is the one |
| 255 // used by the spellchecker language. | 269 // used by the spellchecker language. |
| 256 UErrorCode status = U_ZERO_ERROR; | 270 UErrorCode status = U_ZERO_ERROR; |
| 257 UScriptCode script_code = uscript_getScript(c, &status); | 271 UScriptCode script_code = uscript_getScript(c, &status); |
| 258 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | 272 if (script_code == script_code_ || script_code == USCRIPT_COMMON) |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 359 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 373 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
| 360 return false; | 374 return false; |
| 361 | 375 |
| 362 // Copy the normalized text to the output. | 376 // Copy the normalized text to the output. |
| 363 icu::StringCharacterIterator it(output); | 377 icu::StringCharacterIterator it(output); |
| 364 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 378 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
| 365 attribute_->OutputChar(c, output_string); | 379 attribute_->OutputChar(c, output_string); |
| 366 | 380 |
| 367 return !output_string->empty(); | 381 return !output_string->empty(); |
| 368 } | 382 } |
| OLD | NEW |