| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // Implements a custom word iterator used for our spellchecker. | |
| 6 | |
| 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | |
| 8 | |
| 9 #include <map> | |
| 10 #include <memory> | |
| 11 #include <string> | |
| 12 #include <utility> | |
| 13 | |
| 14 #include "base/i18n/break_iterator.h" | |
| 15 #include "base/logging.h" | |
| 16 #include "base/macros.h" | |
| 17 #include "base/strings/stringprintf.h" | |
| 18 #include "base/strings/utf_string_conversions.h" | |
| 19 #include "chrome/renderer/spellchecker/spellcheck.h" | |
| 20 #include "third_party/icu/source/common/unicode/normlzr.h" | |
| 21 #include "third_party/icu/source/common/unicode/schriter.h" | |
| 22 #include "third_party/icu/source/common/unicode/uscript.h" | |
| 23 #include "third_party/icu/source/i18n/unicode/ulocdata.h" | |
| 24 | |
| 25 using base::i18n::BreakIterator; | |
| 26 | |
| 27 // SpellcheckCharAttribute implementation: | |
| 28 | |
| 29 SpellcheckCharAttribute::SpellcheckCharAttribute() | |
| 30 : script_code_(USCRIPT_LATIN) { | |
| 31 } | |
| 32 | |
| 33 SpellcheckCharAttribute::~SpellcheckCharAttribute() { | |
| 34 } | |
| 35 | |
| 36 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { | |
| 37 CreateRuleSets(language); | |
| 38 } | |
| 39 | |
| 40 base::string16 SpellcheckCharAttribute::GetRuleSet( | |
| 41 bool allow_contraction) const { | |
| 42 return allow_contraction ? | |
| 43 ruleset_allow_contraction_ : ruleset_disallow_contraction_; | |
| 44 } | |
| 45 | |
| 46 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { | |
| 47 // The template for our custom rule sets, which is based on the word-break | |
| 48 // rules of ICU 4.0: | |
| 49 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/b
rkitr/word.txt>. | |
| 50 // The major differences from the original one are listed below: | |
| 51 // * It discards comments in the original rules. | |
| 52 // * It discards characters not needed by our spellchecker (e.g. numbers, | |
| 53 // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on). | |
| 54 // * It allows customization of the $ALetter value (i.e. word characters). | |
| 55 // * It allows customization of the $ALetterPlus value (i.e. whether or not to | |
| 56 // use the dictionary data). | |
| 57 // * It allows choosing whether or not to split a text at contraction | |
| 58 // characters. | |
| 59 // This template only changes the forward-iteration rules. So, calling | |
| 60 // ubrk_prev() returns the same results as the original template. | |
| 61 static const char kRuleTemplate[] = | |
| 62 "!!chain;" | |
| 63 "$CR = [\\p{Word_Break = CR}];" | |
| 64 "$LF = [\\p{Word_Break = LF}];" | |
| 65 "$Newline = [\\p{Word_Break = Newline}];" | |
| 66 "$Extend = [\\p{Word_Break = Extend}];" | |
| 67 "$Format = [\\p{Word_Break = Format}];" | |
| 68 "$Katakana = [\\p{Word_Break = Katakana}];" | |
| 69 // Not all the characters in a given script are ALetter. | |
| 70 // For instance, U+05F4 is MidLetter. So, this may be | |
| 71 // better, but it leads to an empty set error in Thai. | |
| 72 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" | |
| 73 "$ALetter = [\\p{script=%s}%s];" | |
| 74 // U+0027 (single quote/apostrophe) is not in MidNumLet any more | |
| 75 // in UAX 29 rev 21 or later. For our purpose, U+0027 | |
| 76 // has to be treated as MidNumLet. ( http://crbug.com/364072 ) | |
| 77 "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];" | |
| 78 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" | |
| 79 "$MidNum = [\\p{Word_Break = MidNum}];" | |
| 80 "$Numeric = [\\p{Word_Break = Numeric}];" | |
| 81 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | |
| 82 | |
| 83 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " | |
| 84 "%s" // ALetterPlus | |
| 85 | |
| 86 "$KatakanaEx = $Katakana ($Extend | $Format)*;" | |
| 87 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" | |
| 88 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" | |
| 89 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" | |
| 90 "$MidNumEx = $MidNum ($Extend | $Format)*;" | |
| 91 "$NumericEx = $Numeric ($Extend | $Format)*;" | |
| 92 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" | |
| 93 | |
| 94 "$Hiragana = [\\p{script=Hiragana}];" | |
| 95 "$Ideographic = [\\p{Ideographic}];" | |
| 96 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" | |
| 97 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" | |
| 98 | |
| 99 "!!forward;" | |
| 100 "$CR $LF;" | |
| 101 "[^$CR $LF $Newline]? ($Extend | $Format)+;" | |
| 102 "$ALetterEx {200};" | |
| 103 "$ALetterEx $ALetterEx {200};" | |
| 104 "%s" // (Allow|Disallow) Contraction | |
| 105 | |
| 106 "!!reverse;" | |
| 107 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" | |
| 108 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" | |
| 109 "$BackNumericEx = ($Format | $Extend)* $Numeric;" | |
| 110 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" | |
| 111 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" | |
| 112 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" | |
| 113 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" | |
| 114 "$LF $CR;" | |
| 115 "($Format | $Extend)* [^$CR $LF $Newline]?;" | |
| 116 "$BackALetterEx $BackALetterEx;" | |
| 117 "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;" | |
| 118 "$BackNumericEx $BackNumericEx;" | |
| 119 "$BackNumericEx $BackALetterEx;" | |
| 120 "$BackALetterEx $BackNumericEx;" | |
| 121 "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;" | |
| 122 "$BackKatakanaEx $BackKatakanaEx;" | |
| 123 "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |" | |
| 124 " $BackKatakanaEx | $BackExtendNumLetEx);" | |
| 125 "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)" | |
| 126 " $BackExtendNumLetEx;" | |
| 127 | |
| 128 "!!safe_reverse;" | |
| 129 "($Extend | $Format)+ .?;" | |
| 130 "($MidLetter | $MidNumLet) $BackALetterEx;" | |
| 131 "($MidNum | $MidNumLet) $BackNumericEx;" | |
| 132 | |
| 133 "!!safe_forward;" | |
| 134 "($Extend | $Format)+ .?;" | |
| 135 "($MidLetterEx | $MidNumLetEx) $ALetterEx;" | |
| 136 "($MidNumEx | $MidNumLetEx) $NumericEx;"; | |
| 137 | |
| 138 // Retrieve the script codes used by the given language from ICU. When the | |
| 139 // given language consists of two or more scripts, we just use the first | |
| 140 // script. The size of returned script codes is always < 8. Therefore, we use | |
| 141 // an array of size 8 so we can include all script codes without insufficient | |
| 142 // buffer errors. | |
| 143 UErrorCode error = U_ZERO_ERROR; | |
| 144 UScriptCode script_code[8]; | |
| 145 int scripts = uscript_getCode(language.c_str(), script_code, | |
| 146 arraysize(script_code), &error); | |
| 147 if (U_SUCCESS(error) && scripts >= 1) | |
| 148 script_code_ = script_code[0]; | |
| 149 | |
| 150 // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary | |
| 151 // only for the languages which need it (i.e. Korean and Thai) to prevent ICU | |
| 152 // from returning dictionary words (i.e. Korean or Thai words) for languages | |
| 153 // which don't need them. | |
| 154 const char* aletter = uscript_getName(script_code_); | |
| 155 if (!aletter) | |
| 156 aletter = "Latin"; | |
| 157 | |
| 158 const char kWithDictionary[] = | |
| 159 "$dictionary = [:LineBreak = Complex_Context:];" | |
| 160 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; | |
| 161 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; | |
| 162 const char* aletter_plus = kWithoutDictionary; | |
| 163 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || | |
| 164 script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) | |
| 165 aletter_plus = kWithDictionary; | |
| 166 | |
| 167 // Treat numbers as word characters except for Arabic and Hebrew. | |
| 168 const char* aletter_extra = " [0123456789]"; | |
| 169 if (script_code_ == USCRIPT_HEBREW) | |
| 170 aletter_extra = ""; | |
| 171 else if (script_code_ == USCRIPT_ARABIC) | |
| 172 // When "script=Arabic", it does not include tatweel, which is | |
| 173 // "script=Common" so add it back. Otherwise, it creates unwanted | |
| 174 // word breaks. | |
| 175 aletter_extra = " [\\u0640]"; | |
| 176 | |
| 177 const char kMidLetterExtra[] = ""; | |
| 178 // For Hebrew, treat single/double quoation marks as MidLetter. | |
| 179 const char kMidLetterExtraHebrew[] = "\"'"; | |
| 180 const char* midletter_extra = kMidLetterExtra; | |
| 181 if (script_code_ == USCRIPT_HEBREW) | |
| 182 midletter_extra = kMidLetterExtraHebrew; | |
| 183 | |
| 184 // Create two custom rule-sets: one allows contraction and the other does not. | |
| 185 // We save these strings in UTF-16 so we can use it without conversions. (ICU | |
| 186 // needs UTF-16 strings.) | |
| 187 const char kAllowContraction[] = | |
| 188 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; | |
| 189 const char kDisallowContraction[] = ""; | |
| 190 | |
| 191 ruleset_allow_contraction_ = base::ASCIIToUTF16( | |
| 192 base::StringPrintf(kRuleTemplate, | |
| 193 aletter, | |
| 194 aletter_extra, | |
| 195 midletter_extra, | |
| 196 aletter_plus, | |
| 197 kAllowContraction)); | |
| 198 ruleset_disallow_contraction_ = base::ASCIIToUTF16( | |
| 199 base::StringPrintf(kRuleTemplate, | |
| 200 aletter, | |
| 201 aletter_extra, | |
| 202 midletter_extra, | |
| 203 aletter_plus, | |
| 204 kDisallowContraction)); | |
| 205 } | |
| 206 | |
| 207 bool SpellcheckCharAttribute::OutputChar(UChar c, | |
| 208 base::string16* output) const { | |
| 209 // Call the language-specific function if necessary. | |
| 210 // Otherwise, we call the default one. | |
| 211 switch (script_code_) { | |
| 212 case USCRIPT_ARABIC: | |
| 213 return OutputArabic(c, output); | |
| 214 | |
| 215 case USCRIPT_HANGUL: | |
| 216 return OutputHangul(c, output); | |
| 217 | |
| 218 case USCRIPT_HEBREW: | |
| 219 return OutputHebrew(c, output); | |
| 220 | |
| 221 default: | |
| 222 return OutputDefault(c, output); | |
| 223 } | |
| 224 } | |
| 225 | |
| 226 bool SpellcheckCharAttribute::OutputArabic(UChar c, | |
| 227 base::string16* output) const { | |
| 228 // Include non-Arabic characters (which should trigger a spelling error) | |
| 229 // and Arabic characters excluding vowel marks and class "Lm". | |
| 230 // We filter the latter because, while they are "letters", they are | |
| 231 // optional and so don't affect the correctness of the rest of the word. | |
| 232 if (!(0x0600 <= c && c <= 0x06FF) || (u_isalpha(c) && c != 0x0640)) | |
| 233 output->push_back(c); | |
| 234 return true; | |
| 235 } | |
| 236 | |
| 237 bool SpellcheckCharAttribute::OutputHangul(UChar c, | |
| 238 base::string16* output) const { | |
| 239 // Decompose a Hangul character to a Hangul vowel and consonants used by our | |
| 240 // spellchecker. A Hangul character of Unicode is a ligature consisting of a | |
| 241 // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G", | |
| 242 // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as | |
| 243 // a point of a cubic linear space consisting of (first consonant, vowel, last | |
| 244 // consonant). Therefore, we can compose a Hangul character from a vowel and | |
| 245 // two consonants with linear composition: | |
| 246 // character = 0xAC00 + | |
| 247 // (first consonant - 0x1100) * 28 * 21 + | |
| 248 // (vowel - 0x1161) * 28 + | |
| 249 // (last consonant - 0x11A7); | |
| 250 // We can also decompose a Hangul character with linear decomposition: | |
| 251 // first consonant = (character - 0xAC00) / 28 / 21; | |
| 252 // vowel = (character - 0xAC00) / 28 % 21; | |
| 253 // last consonant = (character - 0xAC00) % 28; | |
| 254 // This code is copied from Unicode Standard Annex #15 | |
| 255 // <http://unicode.org/reports/tr15> and added some comments. | |
| 256 const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters. | |
| 257 const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants. | |
| 258 const int kVBase = 0x1161; // U+1161: the top of Hangul vowels. | |
| 259 const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants. | |
| 260 const int kLCount = 19; // The number of Hangul first consonants. | |
| 261 const int kVCount = 21; // The number of Hangul vowels. | |
| 262 const int kTCount = 28; // The number of Hangul last consonants. | |
| 263 const int kNCount = kVCount * kTCount; | |
| 264 const int kSCount = kLCount * kNCount; | |
| 265 | |
| 266 int index = c - kSBase; | |
| 267 if (index < 0 || index >= kSBase + kSCount) { | |
| 268 // This is not a Hangul syllable. Call the default output function since we | |
| 269 // should output this character when it is a Hangul syllable. | |
| 270 return OutputDefault(c, output); | |
| 271 } | |
| 272 | |
| 273 // This is a Hangul character. Decompose this characters into Hangul vowels | |
| 274 // and consonants. | |
| 275 int l = kLBase + index / kNCount; | |
| 276 int v = kVBase + (index % kNCount) / kTCount; | |
| 277 int t = kTBase + index % kTCount; | |
| 278 output->push_back(l); | |
| 279 output->push_back(v); | |
| 280 if (t != kTBase) | |
| 281 output->push_back(t); | |
| 282 return true; | |
| 283 } | |
| 284 | |
| 285 bool SpellcheckCharAttribute::OutputHebrew(UChar c, | |
| 286 base::string16* output) const { | |
| 287 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds | |
| 288 // to prevent our Hebrew dictionary from marking a Hebrew word including | |
| 289 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check | |
| 290 // niqquds manually and filter them out since their script codes are | |
| 291 // USCRIPT_HEBREW.) | |
| 292 // Pass through ASCII single/double quotation marks and Hebrew Geresh and | |
| 293 // Gershayim. | |
| 294 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || | |
| 295 c == 0x05F4 || c == 0x05F3) | |
| 296 output->push_back(c); | |
| 297 return true; | |
| 298 } | |
| 299 | |
| 300 bool SpellcheckCharAttribute::OutputDefault(UChar c, | |
| 301 base::string16* output) const { | |
| 302 // Check the script code of this character and output only if it is the one | |
| 303 // used by the spellchecker language. | |
| 304 UErrorCode status = U_ZERO_ERROR; | |
| 305 UScriptCode script_code = uscript_getScript(c, &status); | |
| 306 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | |
| 307 output->push_back(c); | |
| 308 return true; | |
| 309 } | |
| 310 | |
| 311 // SpellcheckWordIterator implementation: | |
| 312 | |
| 313 SpellcheckWordIterator::SpellcheckWordIterator() | |
| 314 : text_(NULL), | |
| 315 attribute_(NULL), | |
| 316 iterator_() { | |
| 317 } | |
| 318 | |
| 319 SpellcheckWordIterator::~SpellcheckWordIterator() { | |
| 320 Reset(); | |
| 321 } | |
| 322 | |
| 323 bool SpellcheckWordIterator::Initialize( | |
| 324 const SpellcheckCharAttribute* attribute, | |
| 325 bool allow_contraction) { | |
| 326 // Create a custom ICU break iterator with empty text used in this object. (We | |
| 327 // allow setting text later so we can re-use this iterator.) | |
| 328 DCHECK(attribute); | |
| 329 const base::string16 rule(attribute->GetRuleSet(allow_contraction)); | |
| 330 | |
| 331 // If there is no rule set, the attributes were invalid. | |
| 332 if (rule.empty()) | |
| 333 return false; | |
| 334 | |
| 335 std::unique_ptr<BreakIterator> iterator( | |
| 336 new BreakIterator(base::string16(), rule)); | |
| 337 if (!iterator->Init()) { | |
| 338 // Since we're not passing in any text, the only reason this could fail | |
| 339 // is if we fail to parse the rules. Since the rules are hardcoded, | |
| 340 // that would be a bug in this class. | |
| 341 NOTREACHED() << "failed to open iterator (broken rules)"; | |
| 342 return false; | |
| 343 } | |
| 344 iterator_ = std::move(iterator); | |
| 345 | |
| 346 // Set the character attributes so we can normalize the words extracted by | |
| 347 // this iterator. | |
| 348 attribute_ = attribute; | |
| 349 return true; | |
| 350 } | |
| 351 | |
| 352 bool SpellcheckWordIterator::IsInitialized() const { | |
| 353 // Return true iff we have an iterator. | |
| 354 return !!iterator_; | |
| 355 } | |
| 356 | |
| 357 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | |
| 358 DCHECK(!!iterator_); | |
| 359 | |
| 360 // Set the text to be split by this iterator. | |
| 361 if (!iterator_->SetText(text, length)) { | |
| 362 LOG(ERROR) << "failed to set text"; | |
| 363 return false; | |
| 364 } | |
| 365 | |
| 366 text_ = text; | |
| 367 return true; | |
| 368 } | |
| 369 | |
| 370 SpellcheckWordIterator::WordIteratorStatus SpellcheckWordIterator::GetNextWord( | |
| 371 base::string16* word_string, | |
| 372 int* word_start, | |
| 373 int* word_length) { | |
| 374 DCHECK(!!text_); | |
| 375 | |
| 376 word_string->clear(); | |
| 377 *word_start = 0; | |
| 378 *word_length = 0; | |
| 379 | |
| 380 if (!text_) { | |
| 381 return IS_END_OF_TEXT; | |
| 382 } | |
| 383 | |
| 384 // Find a word that can be checked for spelling or a character that can be | |
| 385 // skipped over. Rather than moving past a skippable character this returns | |
| 386 // IS_SKIPPABLE and defers handling the character to the calling function. | |
| 387 while (iterator_->Advance()) { | |
| 388 const size_t start = iterator_->prev(); | |
| 389 const size_t length = iterator_->pos() - start; | |
| 390 switch (iterator_->GetWordBreakStatus()) { | |
| 391 case BreakIterator::IS_WORD_BREAK: { | |
| 392 if (Normalize(start, length, word_string)) { | |
| 393 *word_start = start; | |
| 394 *word_length = length; | |
| 395 return IS_WORD; | |
| 396 } | |
| 397 break; | |
| 398 } | |
| 399 case BreakIterator::IS_SKIPPABLE_WORD: { | |
| 400 *word_string = iterator_->GetString(); | |
| 401 *word_start = start; | |
| 402 *word_length = length; | |
| 403 return IS_SKIPPABLE; | |
| 404 } | |
| 405 // |iterator_| is RULE_BASED so the break status should never be | |
| 406 // IS_LINE_OR_CHAR_BREAK. | |
| 407 case BreakIterator::IS_LINE_OR_CHAR_BREAK: { | |
| 408 NOTREACHED(); | |
| 409 break; | |
| 410 } | |
| 411 } | |
| 412 } | |
| 413 | |
| 414 // There aren't any more words in the given text. | |
| 415 return IS_END_OF_TEXT; | |
| 416 } | |
| 417 | |
| 418 void SpellcheckWordIterator::Reset() { | |
| 419 iterator_.reset(); | |
| 420 } | |
| 421 | |
| 422 bool SpellcheckWordIterator::Normalize(int input_start, | |
| 423 int input_length, | |
| 424 base::string16* output_string) const { | |
| 425 // We use NFKC (Normalization Form, Compatible decomposition, followed by | |
| 426 // canonical Composition) defined in Unicode Standard Annex #15 to normalize | |
| 427 // this token because it it the most suitable normalization algorithm for our | |
| 428 // spellchecker. Nevertheless, it is not a perfect algorithm for our | |
| 429 // spellchecker and we need manual normalization as well. The normalized | |
| 430 // text does not have to be NUL-terminated since its characters are copied to | |
| 431 // string16, which adds a NUL character when we need. | |
| 432 icu::UnicodeString input(FALSE, &text_[input_start], input_length); | |
| 433 UErrorCode status = U_ZERO_ERROR; | |
| 434 icu::UnicodeString output; | |
| 435 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | |
| 436 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | |
| 437 return false; | |
| 438 | |
| 439 // Copy the normalized text to the output. | |
| 440 icu::StringCharacterIterator it(output); | |
| 441 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | |
| 442 attribute_->OutputChar(c, output_string); | |
| 443 | |
| 444 return !output_string->empty(); | |
| 445 } | |
| OLD | NEW |