chrome/renderer/spellchecker/spellcheck_worditerator.cc - Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@refactoring_icu_usage

Patch Set: jungshik@ comments Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Implements a custom word iterator used for our spellchecker.	5 // Implements a custom word iterator used for our spellchecker.

6	6

7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"	7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

8	8

9 #include <map>	9 #include <map>

10 #include <string>	10 #include <string>

11	11

12 #include "base/basictypes.h"	12 #include "base/basictypes.h"

	13 #include "base/i18n/break_iterator.h"

13 #include "base/logging.h"	14 #include "base/logging.h"

14 #include "base/strings/stringprintf.h"	15 #include "base/strings/stringprintf.h"

15 #include "base/strings/utf_string_conversions.h"	16 #include "base/strings/utf_string_conversions.h"

16 #include "chrome/renderer/spellchecker/spellcheck.h"	17 #include "chrome/renderer/spellchecker/spellcheck.h"

17 #include "third_party/icu/source/common/unicode/normlzr.h"	18 #include "third_party/icu/source/common/unicode/normlzr.h"

18 #include "third_party/icu/source/common/unicode/schriter.h"	19 #include "third_party/icu/source/common/unicode/schriter.h"

19 #include "third_party/icu/source/common/unicode/uscript.h"	20 #include "third_party/icu/source/common/unicode/uscript.h"

20 #include "third_party/icu/source/i18n/unicode/ulocdata.h"	21 #include "third_party/icu/source/i18n/unicode/ulocdata.h"

21	22

22 // SpellcheckCharAttribute implementation:	23 // SpellcheckCharAttribute implementation:

(...skipping 269 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
292 UScriptCode script_code = uscript_getScript(c, &status);	293 UScriptCode script_code = uscript_getScript(c, &status);

293 if (script_code == script_code_ || script_code == USCRIPT_COMMON)	294 if (script_code == script_code_ || script_code == USCRIPT_COMMON)

294 output->push_back(c);	295 output->push_back(c);

295 return true;	296 return true;

296 }	297 }

297	298

298 // SpellcheckWordIterator implementation:	299 // SpellcheckWordIterator implementation:

299	300

300 SpellcheckWordIterator::SpellcheckWordIterator()	301 SpellcheckWordIterator::SpellcheckWordIterator()

301 : text_(NULL),	302 : text_(NULL),

302 length_(0),

303 position_(UBRK_DONE),

304 attribute_(NULL),	303 attribute_(NULL),

305 iterator_(NULL) {	304 iterator_() {

306 }	305 }

307	306

308 SpellcheckWordIterator::~SpellcheckWordIterator() {	307 SpellcheckWordIterator::~SpellcheckWordIterator() {

309 Reset();	308 Reset();

310 }	309 }

311	310

312 bool SpellcheckWordIterator::Initialize(	311 bool SpellcheckWordIterator::Initialize(

313 const SpellcheckCharAttribute* attribute,	312 const SpellcheckCharAttribute* attribute,

314 bool allow_contraction) {	313 bool allow_contraction) {

315 // Create a custom ICU break iterator with empty text used in this object. (We	314 // Create a custom ICU break iterator with empty text used in this object. (We

316 // allow setting text later so we can re-use this iterator.)	315 // allow setting text later so we can re-use this iterator.)

317 DCHECK(attribute);	316 DCHECK(attribute);

318 UErrorCode open_status = U_ZERO_ERROR;	317 const base::string16 rule(attribute->GetRuleSet(allow_contraction));

319 UParseError parse_status;

320 base::string16 rule(attribute->GetRuleSet(allow_contraction));

321	318

322 // If there is no rule set, the attributes were invalid.	319 // If there is no rule set, the attributes were invalid.

323 if (rule.empty())	320 if (rule.empty())

324 return false;	321 return false;

325	322

326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,	323 scoped_ptr<base::i18n::BreakIterator> iterator(

327 &parse_status, &open_status);	324 new base::i18n::BreakIterator(base::string16(), rule));

328 if (U_FAILURE(open_status))	325 if (!iterator->Init()) {

	326 // Since we're not passing in any text, the only reason this could fail

	327 // is if we fail to parse the rules. Since the rules are hardcoded,

	328 // that would be a bug in this class.

	329 NOTREACHED() << "failed to open iterator (broken rules)";

329 return false;	330 return false;

	331 }

	332 iterator_ = iterator.Pass();

330	333

331 // Set the character attributes so we can normalize the words extracted by	334 // Set the character attributes so we can normalize the words extracted by

332 // this iterator.	335 // this iterator.

333 attribute_ = attribute;	336 attribute_ = attribute;

334 return true;	337 return true;

335 }	338 }

336	339

337 bool SpellcheckWordIterator::IsInitialized() const {	340 bool SpellcheckWordIterator::IsInitialized() const {

338 // Return true if we have an ICU custom iterator.	341 // Return true iff we have an iterator.

339 return !!iterator_;	342 return !!iterator_;

340 }	343 }

341	344

342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {	345 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {

343 DCHECK(!!iterator_);	346 DCHECK(!!iterator_);

344	347

345 // Set the text to be split by this iterator.	348 // Set the text to be split by this iterator.

346 UErrorCode status = U_ZERO_ERROR;	349 if (!iterator_->SetText(text, length)) {

347 ubrk_setText(iterator_, text, length, &status);	350 LOG(ERROR) << "failed to set text";

348 if (U_FAILURE(status))

349 return false;	351 return false;

350	352 }

351 // Retrieve the position to the first word in this text. We return false if

352 // this text does not have any words. (For example, The input text consists

353 // only of Chinese characters while the spellchecker language is English.)

354 position_ = ubrk_first(iterator_);

355 if (position_ == UBRK_DONE)

356 return false;

357	353

358 text_ = text;	354 text_ = text;

359 length_ = static_cast<int>(length);

360 return true;	355 return true;

361 }	356 }

362	357

363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,	358 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,

364 int* word_start,	359 int* word_start,

365 int* word_length) {	360 int* word_length) {

366 DCHECK(!!text_ && length_ > 0);	361 DCHECK(!!text_);

367	362

368 word_string->clear();	363 word_string->clear();

369 *word_start = 0;	364 *word_start = 0;

370 *word_length = 0;	365 *word_length = 0;

371	366

372 if (!text_ || position_ == UBRK_DONE)	367 if (!text_) {

373 return false;	368 return false;

	369 }

374	370

375 // Find a word that can be checked for spelling. Our rule sets filter out	371 // Find a word that can be checked for spelling. Our rule sets filter out

376 // invalid words (e.g. numbers and characters not supported by the	372 // invalid words (e.g. numbers and characters not supported by the

377 // spellchecker language) so this ubrk_getRuleStatus() call returns	373 // spellchecker language) so this ubrk_getRuleStatus() call returns

378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such	374 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such

379 // words until we can find a valid word or reach the end of the input string.	375 // words until we can find a valid word or reach the end of the input string.

380 int next = ubrk_next(iterator_);	376 while (iterator_->Advance()) {

381 while (next != UBRK_DONE) {	377 const size_t start = iterator_->prev();

382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {	378 const size_t length = iterator_->pos() - start;

383 if (Normalize(position_, next - position_, word_string)) {	379 if (iterator_->IsWord()) {

384 *word_start = position_;	380 if (Normalize(start, length, word_string)) {

385 *word_length = next - position_;	381 *word_start = start;

386 position_ = next;	382 *word_length = length;

387 return true;	383 return true;

388 }	384 }

389 }	385 }

390 position_ = next;

391 next = ubrk_next(iterator_);

392 }	386 }

393	387

394 // There aren't any more words in the given text. Set the position to	388 // There aren't any more words in the given text.

395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function

396 // is called.

397 position_ = UBRK_DONE;

398 return false;	389 return false;

399 }	390 }

400	391

401 void SpellcheckWordIterator::Reset() {	392 void SpellcheckWordIterator::Reset() {

402 if (iterator_) {	393 iterator_.reset();

403 ubrk_close(iterator_);

404 iterator_ = NULL;

405 }

406 }	394 }

407	395

408 bool SpellcheckWordIterator::Normalize(int input_start,	396 bool SpellcheckWordIterator::Normalize(int input_start,

409 int input_length,	397 int input_length,

410 base::string16* output_string) const {	398 base::string16* output_string) const {

411 // We use NFKC (Normalization Form, Compatible decomposition, followed by	399 // We use NFKC (Normalization Form, Compatible decomposition, followed by

412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize	400 // canonical Composition) defined in Unicode Standard Annex #15 to normalize

413 // this token because it is the most suitable normalization algorithm for our	401 // this token because it is the most suitable normalization algorithm for our

414 // spellchecker. Nevertheless, it is not a perfect algorithm for our	402 // spellchecker. Nevertheless, it is not a perfect algorithm for our

415 // spellchecker and we need manual normalization as well. The normalized	403 // spellchecker and we need manual normalization as well. The normalized

416 // text does not have to be NUL-terminated since its characters are copied to	404 // text does not have to be NUL-terminated since its characters are copied to

417 // string16, which adds a NUL character when we need.	405 // string16, which adds a NUL character when we need.

418 icu::UnicodeString input(FALSE, &text_[input_start], input_length);	406 icu::UnicodeString input(FALSE, &text_[input_start], input_length);

419 UErrorCode status = U_ZERO_ERROR;	407 UErrorCode status = U_ZERO_ERROR;

420 icu::UnicodeString output;	408 icu::UnicodeString output;

421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);	409 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)	410 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)

423 return false;	411 return false;

424	412

425 // Copy the normalized text to the output.	413 // Copy the normalized text to the output.

426 icu::StringCharacterIterator it(output);	414 icu::StringCharacterIterator it(output);

427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())	415 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())

428 attribute_->OutputChar(c, output_string);	416 attribute_->OutputChar(c, output_string);

429	417

430 return !output_string->empty();	418 return !output_string->empty();

431 }	419 }

OLD	NEW

« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | no next file » | no next file with comments »