Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(361)

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@refactoring_icu_usage
Patch Set: Simplify Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Implements a custom word iterator used for our spellchecker. 5 // Implements a custom word iterator used for our spellchecker.
6 6
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
8 8
9 #include <map> 9 #include <map>
10 #include <string> 10 #include <string>
11 11
12 #include "base/basictypes.h" 12 #include "base/basictypes.h"
13 #include "base/i18n/break_iterator.h"
13 #include "base/logging.h" 14 #include "base/logging.h"
15 #include "base/memory/scoped_ptr.h"
groby-ooo-7-16 2014/05/08 17:52:00 No need to include scoped_ptr.h - transitive depen
Andrew Hayden (chromium.org) 2014/05/09 15:16:16 Done.
14 #include "base/strings/stringprintf.h" 16 #include "base/strings/stringprintf.h"
15 #include "base/strings/utf_string_conversions.h" 17 #include "base/strings/utf_string_conversions.h"
16 #include "chrome/renderer/spellchecker/spellcheck.h" 18 #include "chrome/renderer/spellchecker/spellcheck.h"
17 #include "third_party/icu/source/common/unicode/normlzr.h" 19 #include "third_party/icu/source/common/unicode/normlzr.h"
18 #include "third_party/icu/source/common/unicode/schriter.h" 20 #include "third_party/icu/source/common/unicode/schriter.h"
19 #include "third_party/icu/source/common/unicode/uscript.h" 21 #include "third_party/icu/source/common/unicode/uscript.h"
20 #include "third_party/icu/source/i18n/unicode/ulocdata.h" 22 #include "third_party/icu/source/i18n/unicode/ulocdata.h"
21 23
22 // SpellcheckCharAttribute implementation: 24 // SpellcheckCharAttribute implementation:
23 25
(...skipping 268 matching lines...) Expand 10 before | Expand all | Expand 10 after
292 UScriptCode script_code = uscript_getScript(c, &status); 294 UScriptCode script_code = uscript_getScript(c, &status);
293 if (script_code == script_code_ || script_code == USCRIPT_COMMON) 295 if (script_code == script_code_ || script_code == USCRIPT_COMMON)
294 output->push_back(c); 296 output->push_back(c);
295 return true; 297 return true;
296 } 298 }
297 299
298 // SpellcheckWordIterator implementation: 300 // SpellcheckWordIterator implementation:
299 301
300 SpellcheckWordIterator::SpellcheckWordIterator() 302 SpellcheckWordIterator::SpellcheckWordIterator()
301 : text_(NULL), 303 : text_(NULL),
302 length_(0),
303 position_(UBRK_DONE),
304 attribute_(NULL), 304 attribute_(NULL),
305 iterator_(NULL) { 305 iterator_() {
306 } 306 }
307 307
308 SpellcheckWordIterator::~SpellcheckWordIterator() { 308 SpellcheckWordIterator::~SpellcheckWordIterator() {
309 Reset(); 309 Reset();
310 } 310 }
311 311
312 bool SpellcheckWordIterator::Initialize( 312 bool SpellcheckWordIterator::Initialize(
313 const SpellcheckCharAttribute* attribute, 313 const SpellcheckCharAttribute* attribute,
314 bool allow_contraction) { 314 bool allow_contraction) {
315 // Create a custom ICU break iterator with empty text used in this object. (We 315 // Create a custom ICU break iterator with empty text used in this object. (We
316 // allow setting text later so we can re-use this iterator.) 316 // allow setting text later so we can re-use this iterator.)
317 DCHECK(attribute); 317 DCHECK(attribute);
318 UErrorCode open_status = U_ZERO_ERROR; 318 const base::string16 rule(attribute->GetRuleSet(allow_contraction));
319 UParseError parse_status;
320 base::string16 rule(attribute->GetRuleSet(allow_contraction));
321 319
322 // If there is no rule set, the attributes were invalid. 320 // If there is no rule set, the attributes were invalid.
323 if (rule.empty()) 321 if (rule.empty())
324 return false; 322 return false;
325 323
326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, 324 scoped_ptr<base::i18n::BreakIterator> iterator(
327 &parse_status, &open_status); 325 new base::i18n::BreakIterator(base::string16(), rule));
328 if (U_FAILURE(open_status)) 326 if (!iterator->Init()) {
327 NOTREACHED() << "failed to open iterator";
groby-ooo-7-16 2014/05/08 17:52:00 Why NOTREACHED? If BreakIterator::Init truly can't
Andrew Hayden (chromium.org) 2014/05/09 15:16:16 I'll add a comment here. It can't fail *in this ca
329 return false; 328 return false;
329 }
330 iterator_ = iterator.Pass();
330 331
331 // Set the character attributes so we can normalize the words extracted by 332 // Set the character attributes so we can normalize the words extracted by
332 // this iterator. 333 // this iterator.
333 attribute_ = attribute; 334 attribute_ = attribute;
334 return true; 335 return true;
335 } 336 }
336 337
337 bool SpellcheckWordIterator::IsInitialized() const { 338 bool SpellcheckWordIterator::IsInitialized() const {
338 // Return true if we have an ICU custom iterator. 339 // Return true iff we have an iterator.
339 return !!iterator_; 340 return !!iterator_;
340 } 341 }
341 342
342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { 343 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {
343 DCHECK(!!iterator_); 344 DCHECK(!!iterator_);
344 345
345 // Set the text to be split by this iterator. 346 // Set the text to be split by this iterator.
346 UErrorCode status = U_ZERO_ERROR; 347 if (!iterator_->SetText(text, length)) {
347 ubrk_setText(iterator_, text, length, &status); 348 NOTREACHED() << "failed to set text";
groby-ooo-7-16 2014/05/08 17:52:00 See above.
Andrew Hayden (chromium.org) 2014/05/09 15:16:16 Done.
348 if (U_FAILURE(status))
349 return false; 349 return false;
350 }
350 351
351 // Retrieve the position to the first word in this text. We return false if 352 // Return false if this text does not have any words. (For example, the
352 // this text does not have any words. (For example, The input text consists 353 // input text consists only of Chinese characters while the spellchecker
353 // only of Chinese characters while the spellchecker language is English.) 354 // language is English.)
354 position_ = ubrk_first(iterator_); 355 if (!iterator_->IsValid())
355 if (position_ == UBRK_DONE)
356 return false; 356 return false;
357 357
358 text_ = text; 358 text_ = text;
359 length_ = static_cast<int>(length);
360 return true; 359 return true;
361 } 360 }
362 361
363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, 362 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,
364 int* word_start, 363 int* word_start,
365 int* word_length) { 364 int* word_length) {
366 DCHECK(!!text_ && length_ > 0); 365 DCHECK(!!text_);
367 366
368 word_string->clear(); 367 word_string->clear();
369 *word_start = 0; 368 *word_start = 0;
370 *word_length = 0; 369 *word_length = 0;
371 370
372 if (!text_ || position_ == UBRK_DONE) 371 if (!text_ || !iterator_->IsValid())
373 return false; 372 return false;
374 373
375 // Find a word that can be checked for spelling. Our rule sets filter out 374 // Find a word that can be checked for spelling. Our rule sets filter out
376 // invalid words (e.g. numbers and characters not supported by the 375 // invalid words (e.g. numbers and characters not supported by the
377 // spellchecker language) so this ubrk_getRuleStatus() call returns 376 // spellchecker language) so this ubrk_getRuleStatus() call returns
378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such 377 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
379 // words until we can find a valid word or reach the end of the input string. 378 // words until we can find a valid word or reach the end of the input string.
380 int next = ubrk_next(iterator_); 379 while (iterator_->Advance()) {
381 while (next != UBRK_DONE) { 380 const size_t start = iterator_->prev();
382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { 381 const size_t length = iterator_->pos() - start;
383 if (Normalize(position_, next - position_, word_string)) { 382 if (iterator_->IsWord()) {
384 *word_start = position_; 383 if (Normalize(start, length, word_string)) {
385 *word_length = next - position_; 384 *word_start = start;
386 position_ = next; 385 *word_length = length;
387 return true; 386 return true;
388 } 387 }
389 } 388 }
390 position_ = next;
391 next = ubrk_next(iterator_);
392 } 389 }
393 390
394 // There aren't any more words in the given text. Set the position to 391 // There aren't any more words in the given text.
395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function
396 // is called.
397 position_ = UBRK_DONE;
398 return false; 392 return false;
399 } 393 }
400 394
401 void SpellcheckWordIterator::Reset() { 395 void SpellcheckWordIterator::Reset() {
402 if (iterator_) { 396 iterator_.reset(0);
groby-ooo-7-16 2014/05/08 17:52:00 Just reset() - more idiomatic.
Andrew Hayden (chromium.org) 2014/05/09 15:16:16 Done.
403 ubrk_close(iterator_);
404 iterator_ = NULL;
405 }
406 } 397 }
407 398
408 bool SpellcheckWordIterator::Normalize(int input_start, 399 bool SpellcheckWordIterator::Normalize(int input_start,
409 int input_length, 400 int input_length,
410 base::string16* output_string) const { 401 base::string16* output_string) const {
411 // We use NFKC (Normalization Form, Compatible decomposition, followed by 402 // We use NFKC (Normalization Form, Compatible decomposition, followed by
412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize 403 // canonical Composition) defined in Unicode Standard Annex #15 to normalize
413 // this token because it is the most suitable normalization algorithm for our 404 // this token because it is the most suitable normalization algorithm for our
414 // spellchecker. Nevertheless, it is not a perfect algorithm for our 405 // spellchecker. Nevertheless, it is not a perfect algorithm for our
415 // spellchecker and we need manual normalization as well. The normalized 406 // spellchecker and we need manual normalization as well. The normalized
416 // text does not have to be NUL-terminated since its characters are copied to 407 // text does not have to be NUL-terminated since its characters are copied to
417 // string16, which adds a NUL character when we need. 408 // string16, which adds a NUL character when we need.
418 icu::UnicodeString input(FALSE, &text_[input_start], input_length); 409 icu::UnicodeString input(FALSE, &text_[input_start], input_length);
419 UErrorCode status = U_ZERO_ERROR; 410 UErrorCode status = U_ZERO_ERROR;
420 icu::UnicodeString output; 411 icu::UnicodeString output;
421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); 412 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) 413 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
423 return false; 414 return false;
424 415
425 // Copy the normalized text to the output. 416 // Copy the normalized text to the output.
426 icu::StringCharacterIterator it(output); 417 icu::StringCharacterIterator it(output);
427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) 418 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
428 attribute_->OutputChar(c, output_string); 419 attribute_->OutputChar(c, output_string);
429 420
430 return !output_string->empty(); 421 return !output_string->empty();
431 } 422 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698