Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
| 6 | 6 |
| 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
| 8 | 8 |
| 9 #include <map> | 9 #include <map> |
| 10 #include <string> | 10 #include <string> |
| 11 | 11 |
| 12 #include "base/basictypes.h" | 12 #include "base/basictypes.h" |
| 13 #include "base/i18n/break_iterator.h" | |
| 13 #include "base/logging.h" | 14 #include "base/logging.h" |
| 15 #include "base/memory/scoped_ptr.h" | |
|
groby-ooo-7-16
2014/05/08 17:52:00
No need to include scoped_ptr.h - transitive dependency [...]
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
Done.
| |
| 14 #include "base/strings/stringprintf.h" | 16 #include "base/strings/stringprintf.h" |
| 15 #include "base/strings/utf_string_conversions.h" | 17 #include "base/strings/utf_string_conversions.h" |
| 16 #include "chrome/renderer/spellchecker/spellcheck.h" | 18 #include "chrome/renderer/spellchecker/spellcheck.h" |
| 17 #include "third_party/icu/source/common/unicode/normlzr.h" | 19 #include "third_party/icu/source/common/unicode/normlzr.h" |
| 18 #include "third_party/icu/source/common/unicode/schriter.h" | 20 #include "third_party/icu/source/common/unicode/schriter.h" |
| 19 #include "third_party/icu/source/common/unicode/uscript.h" | 21 #include "third_party/icu/source/common/unicode/uscript.h" |
| 20 #include "third_party/icu/source/i18n/unicode/ulocdata.h" | 22 #include "third_party/icu/source/i18n/unicode/ulocdata.h" |
| 21 | 23 |
| 22 // SpellcheckCharAttribute implementation: | 24 // SpellcheckCharAttribute implementation: |
| 23 | 25 |
| (...skipping 268 matching lines...) | |
| 292 UScriptCode script_code = uscript_getScript(c, &status); | 294 UScriptCode script_code = uscript_getScript(c, &status); |
| 293 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | 295 if (script_code == script_code_ || script_code == USCRIPT_COMMON) |
| 294 output->push_back(c); | 296 output->push_back(c); |
| 295 return true; | 297 return true; |
| 296 } | 298 } |
| 297 | 299 |
| 298 // SpellcheckWordIterator implementation: | 300 // SpellcheckWordIterator implementation: |
| 299 | 301 |
| 300 SpellcheckWordIterator::SpellcheckWordIterator() | 302 SpellcheckWordIterator::SpellcheckWordIterator() |
| 301 : text_(NULL), | 303 : text_(NULL), |
| 302 length_(0), | |
| 303 position_(UBRK_DONE), | |
| 304 attribute_(NULL), | 304 attribute_(NULL), |
| 305 iterator_(NULL) { | 305 iterator_() { |
| 306 } | 306 } |
| 307 | 307 |
| 308 SpellcheckWordIterator::~SpellcheckWordIterator() { | 308 SpellcheckWordIterator::~SpellcheckWordIterator() { |
| 309 Reset(); | 309 Reset(); |
| 310 } | 310 } |
| 311 | 311 |
| 312 bool SpellcheckWordIterator::Initialize( | 312 bool SpellcheckWordIterator::Initialize( |
| 313 const SpellcheckCharAttribute* attribute, | 313 const SpellcheckCharAttribute* attribute, |
| 314 bool allow_contraction) { | 314 bool allow_contraction) { |
| 315 // Create a custom ICU break iterator with empty text used in this object. (We | 315 // Create a custom ICU break iterator with empty text used in this object. (We |
| 316 // allow setting text later so we can re-use this iterator.) | 316 // allow setting text later so we can re-use this iterator.) |
| 317 DCHECK(attribute); | 317 DCHECK(attribute); |
| 318 UErrorCode open_status = U_ZERO_ERROR; | 318 const base::string16 rule(attribute->GetRuleSet(allow_contraction)); |
| 319 UParseError parse_status; | |
| 320 base::string16 rule(attribute->GetRuleSet(allow_contraction)); | |
| 321 | 319 |
| 322 // If there is no rule set, the attributes were invalid. | 320 // If there is no rule set, the attributes were invalid. |
| 323 if (rule.empty()) | 321 if (rule.empty()) |
| 324 return false; | 322 return false; |
| 325 | 323 |
| 326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, | 324 scoped_ptr<base::i18n::BreakIterator> iterator( |
| 327 &parse_status, &open_status); | 325 new base::i18n::BreakIterator(base::string16(), rule)); |
| 328 if (U_FAILURE(open_status)) | 326 if (!iterator->Init()) { |
| 327 NOTREACHED() << "failed to open iterator"; | |
|
groby-ooo-7-16
2014/05/08 17:52:00
Why NOTREACHED? If BreakIterator::Init truly can't [...]
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
I'll add a comment here. It can't fail *in this case* [...]
| |
| 329 return false; | 328 return false; |
| 329 } | |
| 330 iterator_ = iterator.Pass(); | |
| 330 | 331 |
| 331 // Set the character attributes so we can normalize the words extracted by | 332 // Set the character attributes so we can normalize the words extracted by |
| 332 // this iterator. | 333 // this iterator. |
| 333 attribute_ = attribute; | 334 attribute_ = attribute; |
| 334 return true; | 335 return true; |
| 335 } | 336 } |
| 336 | 337 |
| 337 bool SpellcheckWordIterator::IsInitialized() const { | 338 bool SpellcheckWordIterator::IsInitialized() const { |
| 338 // Return true if we have an ICU custom iterator. | 339 // Return true iff we have an iterator. |
| 339 return !!iterator_; | 340 return !!iterator_; |
| 340 } | 341 } |
| 341 | 342 |
| 342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | 343 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { |
| 343 DCHECK(!!iterator_); | 344 DCHECK(!!iterator_); |
| 344 | 345 |
| 345 // Set the text to be split by this iterator. | 346 // Set the text to be split by this iterator. |
| 346 UErrorCode status = U_ZERO_ERROR; | 347 if (!iterator_->SetText(text, length)) { |
| 347 ubrk_setText(iterator_, text, length, &status); | 348 NOTREACHED() << "failed to set text"; |
|
groby-ooo-7-16
2014/05/08 17:52:00
See above.
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
Done.
| |
| 348 if (U_FAILURE(status)) | |
| 349 return false; | 349 return false; |
| 350 } | |
| 350 | 351 |
| 351 // Retrieve the position to the first word in this text. We return false if | 352 // Return false if this text does not have any words. (For example, the |
| 352 // this text does not have any words. (For example, The input text consists | 353 // input text consists only of Chinese characters while the spellchecker |
| 353 // only of Chinese characters while the spellchecker language is English.) | 354 // language is English.) |
| 354 position_ = ubrk_first(iterator_); | 355 if (!iterator_->IsValid()) |
| 355 if (position_ == UBRK_DONE) | |
| 356 return false; | 356 return false; |
| 357 | 357 |
| 358 text_ = text; | 358 text_ = text; |
| 359 length_ = static_cast<int>(length); | |
| 360 return true; | 359 return true; |
| 361 } | 360 } |
| 362 | 361 |
| 363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, | 362 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, |
| 364 int* word_start, | 363 int* word_start, |
| 365 int* word_length) { | 364 int* word_length) { |
| 366 DCHECK(!!text_ && length_ > 0); | 365 DCHECK(!!text_); |
| 367 | 366 |
| 368 word_string->clear(); | 367 word_string->clear(); |
| 369 *word_start = 0; | 368 *word_start = 0; |
| 370 *word_length = 0; | 369 *word_length = 0; |
| 371 | 370 |
| 372 if (!text_ || position_ == UBRK_DONE) | 371 if (!text_ || !iterator_->IsValid()) |
| 373 return false; | 372 return false; |
| 374 | 373 |
| 375 // Find a word that can be checked for spelling. Our rule sets filter out | 374 // Find a word that can be checked for spelling. Our rule sets filter out |
| 376 // invalid words (e.g. numbers and characters not supported by the | 375 // invalid words (e.g. numbers and characters not supported by the |
| 377 // spellchecker language) so this ubrk_getRuleStatus() call returns | 376 // spellchecker language) so this ubrk_getRuleStatus() call returns |
| 378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such | 377 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such |
| 379 // words until we can find a valid word or reach the end of the input string. | 378 // words until we can find a valid word or reach the end of the input string. |
| 380 int next = ubrk_next(iterator_); | 379 while (iterator_->Advance()) { |
| 381 while (next != UBRK_DONE) { | 380 const size_t start = iterator_->prev(); |
| 382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { | 381 const size_t length = iterator_->pos() - start; |
| 383 if (Normalize(position_, next - position_, word_string)) { | 382 if (iterator_->IsWord()) { |
| 384 *word_start = position_; | 383 if (Normalize(start, length, word_string)) { |
| 385 *word_length = next - position_; | 384 *word_start = start; |
| 386 position_ = next; | 385 *word_length = length; |
| 387 return true; | 386 return true; |
| 388 } | 387 } |
| 389 } | 388 } |
| 390 position_ = next; | |
| 391 next = ubrk_next(iterator_); | |
| 392 } | 389 } |
| 393 | 390 |
| 394 // There aren't any more words in the given text. Set the position to | 391 // There aren't any more words in the given text. |
| 395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function | |
| 396 // is called. | |
| 397 position_ = UBRK_DONE; | |
| 398 return false; | 392 return false; |
| 399 } | 393 } |
| 400 | 394 |
| 401 void SpellcheckWordIterator::Reset() { | 395 void SpellcheckWordIterator::Reset() { |
| 402 if (iterator_) { | 396 iterator_.reset(0); |
|
groby-ooo-7-16
2014/05/08 17:52:00
Just reset() - more idiomatic.
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
Done.
| |
| 403 ubrk_close(iterator_); | |
| 404 iterator_ = NULL; | |
| 405 } | |
| 406 } | 397 } |
| 407 | 398 |
| 408 bool SpellcheckWordIterator::Normalize(int input_start, | 399 bool SpellcheckWordIterator::Normalize(int input_start, |
| 409 int input_length, | 400 int input_length, |
| 410 base::string16* output_string) const { | 401 base::string16* output_string) const { |
| 411 // We use NFKC (Normalization Form, Compatible decomposition, followed by | 402 // We use NFKC (Normalization Form, Compatible decomposition, followed by |
| 412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize | 403 // canonical Composition) defined in Unicode Standard Annex #15 to normalize |
| 413 // this token because it it the most suitable normalization algorithm for our | 404 // this token because it it the most suitable normalization algorithm for our |
| 414 // spellchecker. Nevertheless, it is not a perfect algorithm for our | 405 // spellchecker. Nevertheless, it is not a perfect algorithm for our |
| 415 // spellchecker and we need manual normalization as well. The normalized | 406 // spellchecker and we need manual normalization as well. The normalized |
| 416 // text does not have to be NUL-terminated since its characters are copied to | 407 // text does not have to be NUL-terminated since its characters are copied to |
| 417 // string16, which adds a NUL character when we need. | 408 // string16, which adds a NUL character when we need. |
| 418 icu::UnicodeString input(FALSE, &text_[input_start], input_length); | 409 icu::UnicodeString input(FALSE, &text_[input_start], input_length); |
| 419 UErrorCode status = U_ZERO_ERROR; | 410 UErrorCode status = U_ZERO_ERROR; |
| 420 icu::UnicodeString output; | 411 icu::UnicodeString output; |
| 421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | 412 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); |
| 422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 413 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
| 423 return false; | 414 return false; |
| 424 | 415 |
| 425 // Copy the normalized text to the output. | 416 // Copy the normalized text to the output. |
| 426 icu::StringCharacterIterator it(output); | 417 icu::StringCharacterIterator it(output); |
| 427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 418 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
| 428 attribute_->OutputChar(c, output_string); | 419 attribute_->OutputChar(c, output_string); |
| 429 | 420 |
| 430 return !output_string->empty(); | 421 return !output_string->empty(); |
| 431 } | 422 } |
| OLD | NEW |