| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
| 6 | 6 |
| 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
| 8 | 8 |
| 9 #include <map> | 9 #include <map> |
| 10 #include <string> | 10 #include <string> |
| 11 | 11 |
| 12 #include "base/basictypes.h" | 12 #include "base/basictypes.h" |
| 13 #include "base/i18n/break_iterator.h" |
| 13 #include "base/logging.h" | 14 #include "base/logging.h" |
| 14 #include "base/strings/stringprintf.h" | 15 #include "base/strings/stringprintf.h" |
| 15 #include "base/strings/utf_string_conversions.h" | 16 #include "base/strings/utf_string_conversions.h" |
| 16 #include "chrome/renderer/spellchecker/spellcheck.h" | 17 #include "chrome/renderer/spellchecker/spellcheck.h" |
| 17 #include "third_party/icu/source/common/unicode/normlzr.h" | 18 #include "third_party/icu/source/common/unicode/normlzr.h" |
| 18 #include "third_party/icu/source/common/unicode/schriter.h" | 19 #include "third_party/icu/source/common/unicode/schriter.h" |
| 19 #include "third_party/icu/source/common/unicode/uscript.h" | 20 #include "third_party/icu/source/common/unicode/uscript.h" |
| 20 #include "third_party/icu/source/i18n/unicode/ulocdata.h" | 21 #include "third_party/icu/source/i18n/unicode/ulocdata.h" |
| 21 | 22 |
| 22 // SpellcheckCharAttribute implementation: | 23 // SpellcheckCharAttribute implementation: |
| (...skipping 269 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 292 UScriptCode script_code = uscript_getScript(c, &status); | 293 UScriptCode script_code = uscript_getScript(c, &status); |
| 293 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | 294 if (script_code == script_code_ || script_code == USCRIPT_COMMON) |
| 294 output->push_back(c); | 295 output->push_back(c); |
| 295 return true; | 296 return true; |
| 296 } | 297 } |
| 297 | 298 |
| 298 // SpellcheckWordIterator implementation: | 299 // SpellcheckWordIterator implementation: |
| 299 | 300 |
| 300 SpellcheckWordIterator::SpellcheckWordIterator() | 301 SpellcheckWordIterator::SpellcheckWordIterator() |
| 301 : text_(NULL), | 302 : text_(NULL), |
| 302 length_(0), | |
| 303 position_(UBRK_DONE), | |
| 304 attribute_(NULL), | 303 attribute_(NULL), |
| 305 iterator_(NULL) { | 304 iterator_() { |
| 306 } | 305 } |
| 307 | 306 |
| 308 SpellcheckWordIterator::~SpellcheckWordIterator() { | 307 SpellcheckWordIterator::~SpellcheckWordIterator() { |
| 309 Reset(); | 308 Reset(); |
| 310 } | 309 } |
| 311 | 310 |
| 312 bool SpellcheckWordIterator::Initialize( | 311 bool SpellcheckWordIterator::Initialize( |
| 313 const SpellcheckCharAttribute* attribute, | 312 const SpellcheckCharAttribute* attribute, |
| 314 bool allow_contraction) { | 313 bool allow_contraction) { |
| 315 // Create a custom ICU break iterator with empty text used in this object. (We | 314 // Create a custom ICU break iterator with empty text used in this object. (We |
| 316 // allow setting text later so we can re-use this iterator.) | 315 // allow setting text later so we can re-use this iterator.) |
| 317 DCHECK(attribute); | 316 DCHECK(attribute); |
| 318 UErrorCode open_status = U_ZERO_ERROR; | 317 const base::string16 rule(attribute->GetRuleSet(allow_contraction)); |
| 319 UParseError parse_status; | |
| 320 base::string16 rule(attribute->GetRuleSet(allow_contraction)); | |
| 321 | 318 |
| 322 // If there is no rule set, the attributes were invalid. | 319 // If there is no rule set, the attributes were invalid. |
| 323 if (rule.empty()) | 320 if (rule.empty()) |
| 324 return false; | 321 return false; |
| 325 | 322 |
| 326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, | 323 scoped_ptr<base::i18n::BreakIterator> iterator( |
| 327 &parse_status, &open_status); | 324 new base::i18n::BreakIterator(base::string16(), rule)); |
| 328 if (U_FAILURE(open_status)) | 325 if (!iterator->Init()) { |
| 326 // Since we're not passing in any text, the only reason this could fail |
| 327 // is if we fail to parse the rules. Since the rules are hardcoded, |
| 328 // that would be a bug in this class. |
| 329 NOTREACHED() << "failed to open iterator (broken rules)"; |
| 329 return false; | 330 return false; |
| 331 } |
| 332 iterator_ = iterator.Pass(); |
| 330 | 333 |
| 331 // Set the character attributes so we can normalize the words extracted by | 334 // Set the character attributes so we can normalize the words extracted by |
| 332 // this iterator. | 335 // this iterator. |
| 333 attribute_ = attribute; | 336 attribute_ = attribute; |
| 334 return true; | 337 return true; |
| 335 } | 338 } |
| 336 | 339 |
| 337 bool SpellcheckWordIterator::IsInitialized() const { | 340 bool SpellcheckWordIterator::IsInitialized() const { |
| 338 // Return true if we have an ICU custom iterator. | 341 // Return true iff we have an iterator. |
| 339 return !!iterator_; | 342 return !!iterator_; |
| 340 } | 343 } |
| 341 | 344 |
| 342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | 345 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { |
| 343 DCHECK(!!iterator_); | 346 DCHECK(!!iterator_); |
| 344 | 347 |
| 345 // Set the text to be split by this iterator. | 348 // Set the text to be split by this iterator. |
| 346 UErrorCode status = U_ZERO_ERROR; | 349 if (!iterator_->SetText(text, length)) { |
| 347 ubrk_setText(iterator_, text, length, &status); | 350 LOG(ERROR) << "failed to set text"; |
| 348 if (U_FAILURE(status)) | |
| 349 return false; | 351 return false; |
| 350 | 352 } |
| 351 // Retrieve the position to the first word in this text. We return false if | |
| 352 // this text does not have any words. (For example, the input text consists | |
| 353 // only of Chinese characters while the spellchecker language is English.) | |
| 354 position_ = ubrk_first(iterator_); | |
| 355 if (position_ == UBRK_DONE) | |
| 356 return false; | |
| 357 | 353 |
| 358 text_ = text; | 354 text_ = text; |
| 359 length_ = static_cast<int>(length); | |
| 360 return true; | 355 return true; |
| 361 } | 356 } |
| 362 | 357 |
| 363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, | 358 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, |
| 364 int* word_start, | 359 int* word_start, |
| 365 int* word_length) { | 360 int* word_length) { |
| 366 DCHECK(!!text_ && length_ > 0); | 361 DCHECK(!!text_); |
| 367 | 362 |
| 368 word_string->clear(); | 363 word_string->clear(); |
| 369 *word_start = 0; | 364 *word_start = 0; |
| 370 *word_length = 0; | 365 *word_length = 0; |
| 371 | 366 |
| 372 if (!text_ || position_ == UBRK_DONE) | 367 if (!text_) { |
| 373 return false; | 368 return false; |
| 369 } |
| 374 | 370 |
| 375 // Find a word that can be checked for spelling. Our rule sets filter out | 371 // Find a word that can be checked for spelling. Our rule sets filter out |
| 376 // invalid words (e.g. numbers and characters not supported by the | 372 // invalid words (e.g. numbers and characters not supported by the |
| 377 // spellchecker language) so this ubrk_getRuleStatus() call returns | 373 // spellchecker language) so this ubrk_getRuleStatus() call returns |
| 378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such | 374 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such |
| 379 // words until we can find a valid word or reach the end of the input string. | 375 // words until we can find a valid word or reach the end of the input string. |
| 380 int next = ubrk_next(iterator_); | 376 while (iterator_->Advance()) { |
| 381 while (next != UBRK_DONE) { | 377 const size_t start = iterator_->prev(); |
| 382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { | 378 const size_t length = iterator_->pos() - start; |
| 383 if (Normalize(position_, next - position_, word_string)) { | 379 if (iterator_->IsWord()) { |
| 384 *word_start = position_; | 380 if (Normalize(start, length, word_string)) { |
| 385 *word_length = next - position_; | 381 *word_start = start; |
| 386 position_ = next; | 382 *word_length = length; |
| 387 return true; | 383 return true; |
| 388 } | 384 } |
| 389 } | 385 } |
| 390 position_ = next; | |
| 391 next = ubrk_next(iterator_); | |
| 392 } | 386 } |
| 393 | 387 |
| 394 // There aren't any more words in the given text. Set the position to | 388 // There aren't any more words in the given text. |
| 395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function | |
| 396 // is called. | |
| 397 position_ = UBRK_DONE; | |
| 398 return false; | 389 return false; |
| 399 } | 390 } |
| 400 | 391 |
| 401 void SpellcheckWordIterator::Reset() { | 392 void SpellcheckWordIterator::Reset() { |
| 402 if (iterator_) { | 393 iterator_.reset(); |
| 403 ubrk_close(iterator_); | |
| 404 iterator_ = NULL; | |
| 405 } | |
| 406 } | 394 } |
| 407 | 395 |
| 408 bool SpellcheckWordIterator::Normalize(int input_start, | 396 bool SpellcheckWordIterator::Normalize(int input_start, |
| 409 int input_length, | 397 int input_length, |
| 410 base::string16* output_string) const { | 398 base::string16* output_string) const { |
| 411 // We use NFKC (Normalization Form, Compatible decomposition, followed by | 399 // We use NFKC (Normalization Form, Compatible decomposition, followed by |
| 412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize | 400 // canonical Composition) defined in Unicode Standard Annex #15 to normalize |
| 413 // this token because it is the most suitable normalization algorithm for our | 401 // this token because it is the most suitable normalization algorithm for our |
| 414 // spellchecker. Nevertheless, it is not a perfect algorithm for our | 402 // spellchecker. Nevertheless, it is not a perfect algorithm for our |
| 415 // spellchecker and we need manual normalization as well. The normalized | 403 // spellchecker and we need manual normalization as well. The normalized |
| 416 // text does not have to be NUL-terminated since its characters are copied to | 404 // text does not have to be NUL-terminated since its characters are copied to |
| 417 // string16, which adds a NUL character when needed. | 405 // string16, which adds a NUL character when needed. |
| 418 icu::UnicodeString input(FALSE, &text_[input_start], input_length); | 406 icu::UnicodeString input(FALSE, &text_[input_start], input_length); |
| 419 UErrorCode status = U_ZERO_ERROR; | 407 UErrorCode status = U_ZERO_ERROR; |
| 420 icu::UnicodeString output; | 408 icu::UnicodeString output; |
| 421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | 409 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); |
| 422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 410 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
| 423 return false; | 411 return false; |
| 424 | 412 |
| 425 // Copy the normalized text to the output. | 413 // Copy the normalized text to the output. |
| 426 icu::StringCharacterIterator it(output); | 414 icu::StringCharacterIterator it(output); |
| 427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 415 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
| 428 attribute_->OutputChar(c, output_string); | 416 attribute_->OutputChar(c, output_string); |
| 429 | 417 |
| 430 return !output_string->empty(); | 418 return !output_string->empty(); |
| 431 } | 419 } |
| OLD | NEW |