Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(56)

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@refactoring_icu_usage
Patch Set: jungshik@ comments Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Implements a custom word iterator used for our spellchecker. 5 // Implements a custom word iterator used for our spellchecker.
6 6
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
8 8
9 #include <map> 9 #include <map>
10 #include <string> 10 #include <string>
11 11
12 #include "base/basictypes.h" 12 #include "base/basictypes.h"
13 #include "base/i18n/break_iterator.h"
13 #include "base/logging.h" 14 #include "base/logging.h"
14 #include "base/strings/stringprintf.h" 15 #include "base/strings/stringprintf.h"
15 #include "base/strings/utf_string_conversions.h" 16 #include "base/strings/utf_string_conversions.h"
16 #include "chrome/renderer/spellchecker/spellcheck.h" 17 #include "chrome/renderer/spellchecker/spellcheck.h"
17 #include "third_party/icu/source/common/unicode/normlzr.h" 18 #include "third_party/icu/source/common/unicode/normlzr.h"
18 #include "third_party/icu/source/common/unicode/schriter.h" 19 #include "third_party/icu/source/common/unicode/schriter.h"
19 #include "third_party/icu/source/common/unicode/uscript.h" 20 #include "third_party/icu/source/common/unicode/uscript.h"
20 #include "third_party/icu/source/i18n/unicode/ulocdata.h" 21 #include "third_party/icu/source/i18n/unicode/ulocdata.h"
21 22
22 // SpellcheckCharAttribute implementation: 23 // SpellcheckCharAttribute implementation:
(...skipping 269 matching lines...) Expand 10 before | Expand all | Expand 10 after
292 UScriptCode script_code = uscript_getScript(c, &status); 293 UScriptCode script_code = uscript_getScript(c, &status);
293 if (script_code == script_code_ || script_code == USCRIPT_COMMON) 294 if (script_code == script_code_ || script_code == USCRIPT_COMMON)
294 output->push_back(c); 295 output->push_back(c);
295 return true; 296 return true;
296 } 297 }
297 298
298 // SpellcheckWordIterator implementation: 299 // SpellcheckWordIterator implementation:
299 300
300 SpellcheckWordIterator::SpellcheckWordIterator() 301 SpellcheckWordIterator::SpellcheckWordIterator()
301 : text_(NULL), 302 : text_(NULL),
302 length_(0),
303 position_(UBRK_DONE),
304 attribute_(NULL), 303 attribute_(NULL),
305 iterator_(NULL) { 304 iterator_() {
306 } 305 }
307 306
308 SpellcheckWordIterator::~SpellcheckWordIterator() { 307 SpellcheckWordIterator::~SpellcheckWordIterator() {
309 Reset(); 308 Reset();
310 } 309 }
311 310
312 bool SpellcheckWordIterator::Initialize( 311 bool SpellcheckWordIterator::Initialize(
313 const SpellcheckCharAttribute* attribute, 312 const SpellcheckCharAttribute* attribute,
314 bool allow_contraction) { 313 bool allow_contraction) {
315 // Create a custom ICU break iterator with empty text used in this object. (We 314 // Create a custom ICU break iterator with empty text used in this object. (We
316 // allow setting text later so we can re-use this iterator.) 315 // allow setting text later so we can re-use this iterator.)
317 DCHECK(attribute); 316 DCHECK(attribute);
318 UErrorCode open_status = U_ZERO_ERROR; 317 const base::string16 rule(attribute->GetRuleSet(allow_contraction));
319 UParseError parse_status;
320 base::string16 rule(attribute->GetRuleSet(allow_contraction));
321 318
322 // If there is no rule set, the attributes were invalid. 319 // If there is no rule set, the attributes were invalid.
323 if (rule.empty()) 320 if (rule.empty())
324 return false; 321 return false;
325 322
326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0, 323 scoped_ptr<base::i18n::BreakIterator> iterator(
327 &parse_status, &open_status); 324 new base::i18n::BreakIterator(base::string16(), rule));
328 if (U_FAILURE(open_status)) 325 if (!iterator->Init()) {
326 // Since we're not passing in any text, the only reason this could fail
327 // is if we fail to parse the rules. Since the rules are hardcoded,
328 // that would be a bug in this class.
329 NOTREACHED() << "failed to open iterator (broken rules)";
329 return false; 330 return false;
331 }
332 iterator_ = iterator.Pass();
330 333
331 // Set the character attributes so we can normalize the words extracted by 334 // Set the character attributes so we can normalize the words extracted by
332 // this iterator. 335 // this iterator.
333 attribute_ = attribute; 336 attribute_ = attribute;
334 return true; 337 return true;
335 } 338 }
336 339
337 bool SpellcheckWordIterator::IsInitialized() const { 340 bool SpellcheckWordIterator::IsInitialized() const {
338 // Return true if we have an ICU custom iterator. 341 // Return true iff we have an iterator.
339 return !!iterator_; 342 return !!iterator_;
340 } 343 }
341 344
342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { 345 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {
343 DCHECK(!!iterator_); 346 DCHECK(!!iterator_);
344 347
345 // Set the text to be split by this iterator. 348 // Set the text to be split by this iterator.
346 UErrorCode status = U_ZERO_ERROR; 349 if (!iterator_->SetText(text, length)) {
347 ubrk_setText(iterator_, text, length, &status); 350 LOG(ERROR) << "failed to set text";
348 if (U_FAILURE(status))
349 return false; 351 return false;
350 352 }
351 // Retrieve the position to the first word in this text. We return false if
352 // this text does not have any words. (For example, The input text consists
353 // only of Chinese characters while the spellchecker language is English.)
354 position_ = ubrk_first(iterator_);
355 if (position_ == UBRK_DONE)
356 return false;
357 353
358 text_ = text; 354 text_ = text;
359 length_ = static_cast<int>(length);
360 return true; 355 return true;
361 } 356 }
362 357
363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, 358 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,
364 int* word_start, 359 int* word_start,
365 int* word_length) { 360 int* word_length) {
366 DCHECK(!!text_ && length_ > 0); 361 DCHECK(!!text_);
367 362
368 word_string->clear(); 363 word_string->clear();
369 *word_start = 0; 364 *word_start = 0;
370 *word_length = 0; 365 *word_length = 0;
371 366
372 if (!text_ || position_ == UBRK_DONE) 367 if (!text_) {
373 return false; 368 return false;
369 }
374 370
375 // Find a word that can be checked for spelling. Our rule sets filter out 371 // Find a word that can be checked for spelling. Our rule sets filter out
376 // invalid words (e.g. numbers and characters not supported by the 372 // invalid words (e.g. numbers and characters not supported by the
377 // spellchecker language) so this ubrk_getRuleStatus() call returns 373 // spellchecker language) so this ubrk_getRuleStatus() call returns
378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such 374 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
379 // words until we can find a valid word or reach the end of the input string. 375 // words until we can find a valid word or reach the end of the input string.
380 int next = ubrk_next(iterator_); 376 while (iterator_->Advance()) {
381 while (next != UBRK_DONE) { 377 const size_t start = iterator_->prev();
382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) { 378 const size_t length = iterator_->pos() - start;
383 if (Normalize(position_, next - position_, word_string)) { 379 if (iterator_->IsWord()) {
384 *word_start = position_; 380 if (Normalize(start, length, word_string)) {
385 *word_length = next - position_; 381 *word_start = start;
386 position_ = next; 382 *word_length = length;
387 return true; 383 return true;
388 } 384 }
389 } 385 }
390 position_ = next;
391 next = ubrk_next(iterator_);
392 } 386 }
393 387
394 // There aren't any more words in the given text. Set the position to 388 // There aren't any more words in the given text.
395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function
396 // is called.
397 position_ = UBRK_DONE;
398 return false; 389 return false;
399 } 390 }
400 391
401 void SpellcheckWordIterator::Reset() { 392 void SpellcheckWordIterator::Reset() {
402 if (iterator_) { 393 iterator_.reset();
403 ubrk_close(iterator_);
404 iterator_ = NULL;
405 }
406 } 394 }
407 395
408 bool SpellcheckWordIterator::Normalize(int input_start, 396 bool SpellcheckWordIterator::Normalize(int input_start,
409 int input_length, 397 int input_length,
410 base::string16* output_string) const { 398 base::string16* output_string) const {
411 // We use NFKC (Normalization Form, Compatible decomposition, followed by 399 // We use NFKC (Normalization Form, Compatible decomposition, followed by
412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize 400 // canonical Composition) defined in Unicode Standard Annex #15 to normalize
413 // this token because it is the most suitable normalization algorithm for our 401 // this token because it is the most suitable normalization algorithm for our
414 // spellchecker. Nevertheless, it is not a perfect algorithm for our 402 // spellchecker. Nevertheless, it is not a perfect algorithm for our
415 // spellchecker and we need manual normalization as well. The normalized 403 // spellchecker and we need manual normalization as well. The normalized
416 // text does not have to be NUL-terminated since its characters are copied to 404 // text does not have to be NUL-terminated since its characters are copied to
417 // string16, which adds a NUL character when needed. 405 // string16, which adds a NUL character when needed.
418 icu::UnicodeString input(FALSE, &text_[input_start], input_length); 406 icu::UnicodeString input(FALSE, &text_[input_start], input_length);
419 UErrorCode status = U_ZERO_ERROR; 407 UErrorCode status = U_ZERO_ERROR;
420 icu::UnicodeString output; 408 icu::UnicodeString output;
421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); 409 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) 410 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
423 return false; 411 return false;
424 412
425 // Copy the normalized text to the output. 413 // Copy the normalized text to the output.
426 icu::StringCharacterIterator it(output); 414 icu::StringCharacterIterator it(output);
427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) 415 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
428 attribute_->OutputChar(c, output_string); 416 attribute_->OutputChar(c, output_string);
429 417
430 return !output_string->empty(); 418 return !output_string->empty();
431 } 419 }
OLDNEW
« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698