OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
6 | 6 |
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
8 | 8 |
9 #include <map> | 9 #include <map> |
10 #include <string> | 10 #include <string> |
11 | 11 |
12 #include "base/basictypes.h" | 12 #include "base/basictypes.h" |
13 #include "base/i18n/break_iterator.h" | |
13 #include "base/logging.h" | 14 #include "base/logging.h" |
14 #include "base/strings/stringprintf.h" | 15 #include "base/strings/stringprintf.h" |
15 #include "base/strings/utf_string_conversions.h" | 16 #include "base/strings/utf_string_conversions.h" |
16 #include "chrome/renderer/spellchecker/spellcheck.h" | 17 #include "chrome/renderer/spellchecker/spellcheck.h" |
17 #include "third_party/icu/source/common/unicode/normlzr.h" | 18 #include "third_party/icu/source/common/unicode/normlzr.h" |
18 #include "third_party/icu/source/common/unicode/schriter.h" | 19 #include "third_party/icu/source/common/unicode/schriter.h" |
19 #include "third_party/icu/source/common/unicode/uscript.h" | 20 #include "third_party/icu/source/common/unicode/uscript.h" |
20 #include "third_party/icu/source/i18n/unicode/ulocdata.h" | 21 #include "third_party/icu/source/i18n/unicode/ulocdata.h" |
21 | 22 |
22 // SpellcheckCharAttribute implementation: | 23 // SpellcheckCharAttribute implementation: |
(...skipping 269 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
292 UScriptCode script_code = uscript_getScript(c, &status); | 293 UScriptCode script_code = uscript_getScript(c, &status); |
293 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | 294 if (script_code == script_code_ || script_code == USCRIPT_COMMON) |
294 output->push_back(c); | 295 output->push_back(c); |
295 return true; | 296 return true; |
296 } | 297 } |
297 | 298 |
298 // SpellcheckWordIterator implementation: | 299 // SpellcheckWordIterator implementation: |
299 | 300 |
300 SpellcheckWordIterator::SpellcheckWordIterator() | 301 SpellcheckWordIterator::SpellcheckWordIterator() |
301 : text_(NULL), | 302 : text_(NULL), |
302 length_(0), | |
303 position_(UBRK_DONE), | |
304 attribute_(NULL), | 303 attribute_(NULL), |
305 iterator_(NULL) { | 304 iterator_() { |
306 } | 305 } |
307 | 306 |
308 SpellcheckWordIterator::~SpellcheckWordIterator() { | 307 SpellcheckWordIterator::~SpellcheckWordIterator() { |
309 Reset(); | 308 Reset(); |
310 } | 309 } |
311 | 310 |
312 bool SpellcheckWordIterator::Initialize( | 311 bool SpellcheckWordIterator::Initialize( |
313 const SpellcheckCharAttribute* attribute, | 312 const SpellcheckCharAttribute* attribute, |
314 bool allow_contraction) { | 313 bool allow_contraction) { |
315 // Create a custom ICU break iterator with empty text used in this object. (We | 314 // Create a custom ICU break iterator with empty text used in this object. (We |
316 // allow setting text later so we can re-use this iterator.) | 315 // allow setting text later so we can re-use this iterator.) |
317 DCHECK(attribute); | 316 DCHECK(attribute); |
318 UErrorCode open_status = U_ZERO_ERROR; | 317 const base::string16 rule(attribute->GetRuleSet(allow_contraction)); |
319 UParseError parse_status; | |
320 base::string16 rule(attribute->GetRuleSet(allow_contraction)); | |
321 | 318 |
322 // If there is no rule set, the attributes were invalid. | 319 // If there is no rule set, the attributes were invalid. |
323 if (rule.empty()) | 320 if (rule.empty()) |
324 return false; | 321 return false; |
325 | 322 |
326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, | 323 scoped_ptr<base::i18n::BreakIterator> iterator( |
327 &parse_status, &open_status); | 324 new base::i18n::BreakIterator(base::string16(), rule)); |
328 if (U_FAILURE(open_status)) | 325 if (!iterator->Init()) { |
326 // Since we're not passing in any text, the only reason this could fail | |
327 // is if we fail to parse the rules. Since the rules are hardcoded, | |
328 // that would be a bug in this class. | |
329 NOTREACHED() << "failed to open iterator (broken rules)"; | |
329 return false; | 330 return false; |
331 } | |
332 iterator_ = iterator.Pass(); | |
330 | 333 |
331 // Set the character attributes so we can normalize the words extracted by | 334 // Set the character attributes so we can normalize the words extracted by |
332 // this iterator. | 335 // this iterator. |
333 attribute_ = attribute; | 336 attribute_ = attribute; |
334 return true; | 337 return true; |
335 } | 338 } |
336 | 339 |
337 bool SpellcheckWordIterator::IsInitialized() const { | 340 bool SpellcheckWordIterator::IsInitialized() const { |
338 // Return true if we have an ICU custom iterator. | 341 // Return true iff we have an iterator. |
339 return !!iterator_; | 342 return !!iterator_; |
340 } | 343 } |
341 | 344 |
342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | 345 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { |
343 DCHECK(!!iterator_); | 346 DCHECK(!!iterator_); |
344 | 347 |
345 // Set the text to be split by this iterator. | 348 // Set the text to be split by this iterator. |
346 UErrorCode status = U_ZERO_ERROR; | 349 if (!iterator_->SetText(text, length)) { |
347 ubrk_setText(iterator_, text, length, &status); | 350 LOG(ERROR) << "failed to set text"; |
348 if (U_FAILURE(status)) | |
349 return false; | 351 return false; |
352 } | |
350 | 353 |
351 // Retrieve the position to the first word in this text. We return false if | 354 // Return false if this text does not have any words. (For example, the |
Andrew Hayden (chromium.org)
2014/05/12 13:19:37
This turns out to be useless now. I asked the ICU
groby-ooo-7-16
2014/05/12 20:14:49
Hm. As long as the tests pass, I suppose we're goo
| |
352 // this text does not have any words. (For example, The input text consists | 355 // input text consists only of Chinese characters while the spellchecker |
353 // only of Chinese characters while the spellchecker language is English.) | 356 // language is English.) |
354 position_ = ubrk_first(iterator_); | 357 if (!iterator_->IsValid()) |
355 if (position_ == UBRK_DONE) | |
356 return false; | 358 return false; |
357 | 359 |
358 text_ = text; | 360 text_ = text; |
groby-ooo-7-16
2014/05/12 20:14:49
I _think_ all references to _text can go. There's
| |
359 length_ = static_cast<int>(length); | |
360 return true; | 361 return true; |
361 } | 362 } |
362 | 363 |
363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, | 364 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, |
364 int* word_start, | 365 int* word_start, |
365 int* word_length) { | 366 int* word_length) { |
366 DCHECK(!!text_ && length_ > 0); | 367 DCHECK(!!text_); |
367 | 368 |
368 word_string->clear(); | 369 word_string->clear(); |
369 *word_start = 0; | 370 *word_start = 0; |
370 *word_length = 0; | 371 *word_length = 0; |
371 | 372 |
372 if (!text_ || position_ == UBRK_DONE) | 373 if (!text_ || !iterator_->IsValid()) |
Andrew Hayden (chromium.org)
2014/05/12 13:19:37
Similarly, here, the only time that IsValid() will
groby-ooo-7-16
2014/05/12 20:14:49
Since this is documented API behavior via the Brea
| |
373 return false; | 374 return false; |
374 | 375 |
375 // Find a word that can be checked for spelling. Our rule sets filter out | 376 // Find a word that can be checked for spelling. Our rule sets filter out |
376 // invalid words (e.g. numbers and characters not supported by the | 377 // invalid words (e.g. numbers and characters not supported by the |
377 // spellchecker language) so this ubrk_getRuleStatus() call returns | 378 // spellchecker language) so IsWord() is false when this iterator finds |
378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such | 379 // an invalid word. So, we skip such |
379 // words until we can find a valid word or reach the end of the input string. | 380 // words until we can find a valid word or reach the end of the input string. |
380 int next = ubrk_next(iterator_); | 381 while (iterator_->Advance()) { |
381 while (next != UBRK_DONE) { | 382 const size_t start = iterator_->prev(); |
382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { | 383 const size_t length = iterator_->pos() - start; |
383 if (Normalize(position_, next - position_, word_string)) { | 384 if (iterator_->IsWord()) { |
384 *word_start = position_; | 385 if (Normalize(start, length, word_string)) { |
385 *word_length = next - position_; | 386 *word_start = start; |
386 position_ = next; | 387 *word_length = length; |
387 return true; | 388 return true; |
388 } | 389 } |
389 } | 390 } |
390 position_ = next; | |
391 next = ubrk_next(iterator_); | |
392 } | 391 } |
393 | 392 |
394 // There aren't any more words in the given text. Set the position to | 393 // There aren't any more words in the given text. |
395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function | |
396 // is called. | |
397 position_ = UBRK_DONE; | |
398 return false; | 394 return false; |
399 } | 395 } |
400 | 396 |
401 void SpellcheckWordIterator::Reset() { | 397 void SpellcheckWordIterator::Reset() { |
402 if (iterator_) { | 398 iterator_.reset(); |
403 ubrk_close(iterator_); | |
404 iterator_ = NULL; | |
405 } | |
406 } | 399 } |
407 | 400 |
408 bool SpellcheckWordIterator::Normalize(int input_start, | 401 bool SpellcheckWordIterator::Normalize(int input_start, |
409 int input_length, | 402 int input_length, |
410 base::string16* output_string) const { | 403 base::string16* output_string) const { |
411 // We use NFKC (Normalization Form, Compatible decomposition, followed by | 404 // We use NFKC (Normalization Form, Compatible decomposition, followed by |
412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize | 405 // canonical Composition) defined in Unicode Standard Annex #15 to normalize |
413 // this token because it is the most suitable normalization algorithm for our | 406 // this token because it is the most suitable normalization algorithm for our |
414 // spellchecker. Nevertheless, it is not a perfect algorithm for our | 407 // spellchecker. Nevertheless, it is not a perfect algorithm for our |
415 // spellchecker and we need manual normalization as well. The normalized | 408 // spellchecker and we need manual normalization as well. The normalized |
416 // text does not have to be NUL-terminated since its characters are copied to | 409 // text does not have to be NUL-terminated since its characters are copied to |
417 // string16, which adds a NUL character when needed. | 410 // string16, which adds a NUL character when needed. |
418 icu::UnicodeString input(FALSE, &text_[input_start], input_length); | 411 icu::UnicodeString input(FALSE, &text_[input_start], input_length); |
419 UErrorCode status = U_ZERO_ERROR; | 412 UErrorCode status = U_ZERO_ERROR; |
420 icu::UnicodeString output; | 413 icu::UnicodeString output; |
421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | 414 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); |
422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 415 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
423 return false; | 416 return false; |
424 | 417 |
425 // Copy the normalized text to the output. | 418 // Copy the normalized text to the output. |
426 icu::StringCharacterIterator it(output); | 419 icu::StringCharacterIterator it(output); |
427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 420 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
428 attribute_->OutputChar(c, output_string); | 421 attribute_->OutputChar(c, output_string); |
429 | 422 |
430 return !output_string->empty(); | 423 return !output_string->empty(); |
431 } | 424 } |
OLD | NEW |