OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
6 | 6 |
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
8 | 8 |
9 #include <map> | 9 #include <map> |
10 #include <string> | 10 #include <string> |
11 | 11 |
12 #include "base/basictypes.h" | 12 #include "base/basictypes.h" |
13 #include "base/i18n/break_iterator.h" | |
13 #include "base/logging.h" | 14 #include "base/logging.h" |
15 #include "base/memory/scoped_ptr.h" | |
groby-ooo-7-16
2014/05/08 17:52:00
No need to include scoped_ptr.h - transitive depen
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
Done.
| |
14 #include "base/strings/stringprintf.h" | 16 #include "base/strings/stringprintf.h" |
15 #include "base/strings/utf_string_conversions.h" | 17 #include "base/strings/utf_string_conversions.h" |
16 #include "chrome/renderer/spellchecker/spellcheck.h" | 18 #include "chrome/renderer/spellchecker/spellcheck.h" |
17 #include "third_party/icu/source/common/unicode/normlzr.h" | 19 #include "third_party/icu/source/common/unicode/normlzr.h" |
18 #include "third_party/icu/source/common/unicode/schriter.h" | 20 #include "third_party/icu/source/common/unicode/schriter.h" |
19 #include "third_party/icu/source/common/unicode/uscript.h" | 21 #include "third_party/icu/source/common/unicode/uscript.h" |
20 #include "third_party/icu/source/i18n/unicode/ulocdata.h" | 22 #include "third_party/icu/source/i18n/unicode/ulocdata.h" |
21 | 23 |
22 // SpellcheckCharAttribute implementation: | 24 // SpellcheckCharAttribute implementation: |
23 | 25 |
(...skipping 268 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
292 UScriptCode script_code = uscript_getScript(c, &status); | 294 UScriptCode script_code = uscript_getScript(c, &status); |
293 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | 295 if (script_code == script_code_ || script_code == USCRIPT_COMMON) |
294 output->push_back(c); | 296 output->push_back(c); |
295 return true; | 297 return true; |
296 } | 298 } |
297 | 299 |
298 // SpellcheckWordIterator implementation: | 300 // SpellcheckWordIterator implementation: |
299 | 301 |
300 SpellcheckWordIterator::SpellcheckWordIterator() | 302 SpellcheckWordIterator::SpellcheckWordIterator() |
301 : text_(NULL), | 303 : text_(NULL), |
302 length_(0), | |
303 position_(UBRK_DONE), | |
304 attribute_(NULL), | 304 attribute_(NULL), |
305 iterator_(NULL) { | 305 iterator_() { |
306 } | 306 } |
307 | 307 |
308 SpellcheckWordIterator::~SpellcheckWordIterator() { | 308 SpellcheckWordIterator::~SpellcheckWordIterator() { |
309 Reset(); | 309 Reset(); |
310 } | 310 } |
311 | 311 |
312 bool SpellcheckWordIterator::Initialize( | 312 bool SpellcheckWordIterator::Initialize( |
313 const SpellcheckCharAttribute* attribute, | 313 const SpellcheckCharAttribute* attribute, |
314 bool allow_contraction) { | 314 bool allow_contraction) { |
315 // Create a custom ICU break iterator with empty text used in this object. (We | 315 // Create a custom ICU break iterator with empty text used in this object. (We |
316 // allow setting text later so we can re-use this iterator.) | 316 // allow setting text later so we can re-use this iterator.) |
317 DCHECK(attribute); | 317 DCHECK(attribute); |
318 UErrorCode open_status = U_ZERO_ERROR; | 318 const base::string16 rule(attribute->GetRuleSet(allow_contraction)); |
319 UParseError parse_status; | |
320 base::string16 rule(attribute->GetRuleSet(allow_contraction)); | |
321 | 319 |
322 // If there is no rule set, the attributes were invalid. | 320 // If there is no rule set, the attributes were invalid. |
323 if (rule.empty()) | 321 if (rule.empty()) |
324 return false; | 322 return false; |
325 | 323 |
326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, | 324 scoped_ptr<base::i18n::BreakIterator> iterator( |
327 &parse_status, &open_status); | 325 new base::i18n::BreakIterator(base::string16(), rule)); |
328 if (U_FAILURE(open_status)) | 326 if (!iterator->Init()) { |
327 NOTREACHED() << "failed to open iterator"; | |
groby-ooo-7-16
2014/05/08 17:52:00
Why NOTREACHED? If BreakIterator::Init truly can't
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
I'll add a comment here. It can't fail *in this ca
| |
329 return false; | 328 return false; |
329 } | |
330 iterator_ = iterator.Pass(); | |
330 | 331 |
331 // Set the character attributes so we can normalize the words extracted by | 332 // Set the character attributes so we can normalize the words extracted by |
332 // this iterator. | 333 // this iterator. |
333 attribute_ = attribute; | 334 attribute_ = attribute; |
334 return true; | 335 return true; |
335 } | 336 } |
336 | 337 |
337 bool SpellcheckWordIterator::IsInitialized() const { | 338 bool SpellcheckWordIterator::IsInitialized() const { |
338 // Return true if we have an ICU custom iterator. | 339 // Return true iff we have an iterator. |
339 return !!iterator_; | 340 return !!iterator_; |
340 } | 341 } |
341 | 342 |
342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | 343 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { |
343 DCHECK(!!iterator_); | 344 DCHECK(!!iterator_); |
344 | 345 |
345 // Set the text to be split by this iterator. | 346 // Set the text to be split by this iterator. |
346 UErrorCode status = U_ZERO_ERROR; | 347 if (!iterator_->SetText(text, length)) { |
347 ubrk_setText(iterator_, text, length, &status); | 348 NOTREACHED() << "failed to set text"; |
groby-ooo-7-16
2014/05/08 17:52:00
See above.
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
Done.
| |
348 if (U_FAILURE(status)) | |
349 return false; | 349 return false; |
350 } | |
350 | 351 |
351 // Retrieve the position to the first word in this text. We return false if | 352 // Return false if this text does not have any words. (For example, the |
352 // this text does not have any words. (For example, The input text consists | 353 // input text consists only of Chinese characters while the spellchecker |
353 // only of Chinese characters while the spellchecker language is English.) | 354 // language is English.) |
354 position_ = ubrk_first(iterator_); | 355 if (!iterator_->IsValid()) |
355 if (position_ == UBRK_DONE) | |
356 return false; | 356 return false; |
357 | 357 |
358 text_ = text; | 358 text_ = text; |
359 length_ = static_cast<int>(length); | |
360 return true; | 359 return true; |
361 } | 360 } |
362 | 361 |
363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, | 362 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, |
364 int* word_start, | 363 int* word_start, |
365 int* word_length) { | 364 int* word_length) { |
366 DCHECK(!!text_ && length_ > 0); | 365 DCHECK(!!text_); |
367 | 366 |
368 word_string->clear(); | 367 word_string->clear(); |
369 *word_start = 0; | 368 *word_start = 0; |
370 *word_length = 0; | 369 *word_length = 0; |
371 | 370 |
372 if (!text_ || position_ == UBRK_DONE) | 371 if (!text_ || !iterator_->IsValid()) |
373 return false; | 372 return false; |
374 | 373 |
375 // Find a word that can be checked for spelling. Our rule sets filter out | 374 // Find a word that can be checked for spelling. Our rule sets filter out |
376 // invalid words (e.g. numbers and characters not supported by the | 375 // invalid words (e.g. numbers and characters not supported by the |
377 // spellchecker language) so this ubrk_getRuleStatus() call returns | 376 // spellchecker language) so this ubrk_getRuleStatus() call returns |
378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such | 377 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such |
379 // words until we can find a valid word or reach the end of the input string. | 378 // words until we can find a valid word or reach the end of the input string. |
380 int next = ubrk_next(iterator_); | 379 while (iterator_->Advance()) { |
381 while (next != UBRK_DONE) { | 380 const size_t start = iterator_->prev(); |
382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { | 381 const size_t length = iterator_->pos() - start; |
383 if (Normalize(position_, next - position_, word_string)) { | 382 if (iterator_->IsWord()) { |
384 *word_start = position_; | 383 if (Normalize(start, length, word_string)) { |
385 *word_length = next - position_; | 384 *word_start = start; |
386 position_ = next; | 385 *word_length = length; |
387 return true; | 386 return true; |
388 } | 387 } |
389 } | 388 } |
390 position_ = next; | |
391 next = ubrk_next(iterator_); | |
392 } | 389 } |
393 | 390 |
394 // There aren't any more words in the given text. Set the position to | 391 // There aren't any more words in the given text. |
395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function | |
396 // is called. | |
397 position_ = UBRK_DONE; | |
398 return false; | 392 return false; |
399 } | 393 } |
400 | 394 |
401 void SpellcheckWordIterator::Reset() { | 395 void SpellcheckWordIterator::Reset() { |
402 if (iterator_) { | 396 iterator_.reset(0); |
groby-ooo-7-16
2014/05/08 17:52:00
Just reset() - more idiomatic.
Andrew Hayden (chromium.org)
2014/05/09 15:16:16
Done.
| |
403 ubrk_close(iterator_); | |
404 iterator_ = NULL; | |
405 } | |
406 } | 397 } |
407 | 398 |
408 bool SpellcheckWordIterator::Normalize(int input_start, | 399 bool SpellcheckWordIterator::Normalize(int input_start, |
409 int input_length, | 400 int input_length, |
410 base::string16* output_string) const { | 401 base::string16* output_string) const { |
411 // We use NFKC (Normalization Form, Compatible decomposition, followed by | 402 // We use NFKC (Normalization Form, Compatible decomposition, followed by |
412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize | 403 // canonical Composition) defined in Unicode Standard Annex #15 to normalize |
413 // this token because it is the most suitable normalization algorithm for our | 404 // this token because it is the most suitable normalization algorithm for our |
414 // spellchecker. Nevertheless, it is not a perfect algorithm for our | 405 // spellchecker. Nevertheless, it is not a perfect algorithm for our |
415 // spellchecker and we need manual normalization as well. The normalized | 406 // spellchecker and we need manual normalization as well. The normalized |
416 // text does not have to be NUL-terminated since its characters are copied to | 407 // text does not have to be NUL-terminated since its characters are copied to |
417 // string16, which adds a NUL character when needed. | 408 // string16, which adds a NUL character when needed. |
418 icu::UnicodeString input(FALSE, &text_[input_start], input_length); | 409 icu::UnicodeString input(FALSE, &text_[input_start], input_length); |
419 UErrorCode status = U_ZERO_ERROR; | 410 UErrorCode status = U_ZERO_ERROR; |
420 icu::UnicodeString output; | 411 icu::UnicodeString output; |
421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | 412 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); |
422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 413 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
423 return false; | 414 return false; |
424 | 415 |
425 // Copy the normalized text to the output. | 416 // Copy the normalized text to the output. |
426 icu::StringCharacterIterator it(output); | 417 icu::StringCharacterIterator it(output); |
427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 418 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
428 attribute_->OutputChar(c, output_string); | 419 attribute_->OutputChar(c, output_string); |
429 | 420 |
430 return !output_string->empty(); | 421 return !output_string->empty(); |
431 } | 422 } |
OLD | NEW |