chrome/renderer/spellchecker/spellcheck_worditerator.cc - Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@refactoring_icu_usage

Patch Set: Rebase Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Implements a custom word iterator used for our spellchecker.	5 // Implements a custom word iterator used for our spellchecker.

6	6

7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"	7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

8	8

9 #include <map>	9 #include <map>

10 #include <string>	10 #include <string>

11	11

12 #include "base/basictypes.h"	12 #include "base/basictypes.h"

	13 #include "base/i18n/break_iterator.h"

13 #include "base/logging.h"	14 #include "base/logging.h"

14 #include "base/strings/stringprintf.h"	15 #include "base/strings/stringprintf.h"

15 #include "base/strings/utf_string_conversions.h"	16 #include "base/strings/utf_string_conversions.h"

16 #include "chrome/renderer/spellchecker/spellcheck.h"	17 #include "chrome/renderer/spellchecker/spellcheck.h"

17 #include "third_party/icu/source/common/unicode/normlzr.h"	18 #include "third_party/icu/source/common/unicode/normlzr.h"

18 #include "third_party/icu/source/common/unicode/schriter.h"	19 #include "third_party/icu/source/common/unicode/schriter.h"

19 #include "third_party/icu/source/common/unicode/uscript.h"	20 #include "third_party/icu/source/common/unicode/uscript.h"

20 #include "third_party/icu/source/i18n/unicode/ulocdata.h"	21 #include "third_party/icu/source/i18n/unicode/ulocdata.h"

21	22

22 // SpellcheckCharAttribute implementation:	23 // SpellcheckCharAttribute implementation:

(...skipping 269 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
292 UScriptCode script_code = uscript_getScript(c, &status);	293 UScriptCode script_code = uscript_getScript(c, &status);

293 if (script_code == script_code_ \|\| script_code == USCRIPT_COMMON)	294 if (script_code == script_code_ \|\| script_code == USCRIPT_COMMON)

294 output->push_back(c);	295 output->push_back(c);

295 return true;	296 return true;

296 }	297 }

297	298

298 // SpellcheckWordIterator implementation:	299 // SpellcheckWordIterator implementation:

299	300

300 SpellcheckWordIterator::SpellcheckWordIterator()	301 SpellcheckWordIterator::SpellcheckWordIterator()

301 : text_(NULL),	302 : text_(NULL),

302 length_(0),

303 position_(UBRK_DONE),

304 attribute_(NULL),	303 attribute_(NULL),

305 iterator_(NULL) {	304 iterator_() {

306 }	305 }

307	306

308 SpellcheckWordIterator::~SpellcheckWordIterator() {	307 SpellcheckWordIterator::~SpellcheckWordIterator() {

309 Reset();	308 Reset();

310 }	309 }

311	310

312 bool SpellcheckWordIterator::Initialize(	311 bool SpellcheckWordIterator::Initialize(

313 const SpellcheckCharAttribute* attribute,	312 const SpellcheckCharAttribute* attribute,

314 bool allow_contraction) {	313 bool allow_contraction) {

315 // Create a custom ICU break iterator with empty text used in this object. (We	314 // Create a custom ICU break iterator with empty text used in this object. (We

316 // allow setting text later so we can re-use this iterator.)	315 // allow setting text later so we can re-use this iterator.)

317 DCHECK(attribute);	316 DCHECK(attribute);

318 UErrorCode open_status = U_ZERO_ERROR;	317 const base::string16 rule(attribute->GetRuleSet(allow_contraction));

319 UParseError parse_status;

320 base::string16 rule(attribute->GetRuleSet(allow_contraction));

321	318

322 // If there is no rule set, the attributes were invalid.	319 // If there is no rule set, the attributes were invalid.

323 if (rule.empty())	320 if (rule.empty())

324 return false;	321 return false;

325	322

326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,	323 scoped_ptr<base::i18n::BreakIterator> iterator(

327 &parse_status, &open_status);	324 new base::i18n::BreakIterator(base::string16(), rule));

328 if (U_FAILURE(open_status))	325 if (!iterator->Init()) {

	326 // Since we're not passing in any text, the only reason this could fail

	327 // is if we fail to parse the rules. Since the rules are hardcoded,

	328 // that would be a bug in this class.

	329 NOTREACHED() << "failed to open iterator (broken rules)";

329 return false;	330 return false;

	331 }

	332 iterator_ = iterator.Pass();

330	333

331 // Set the character attributes so we can normalize the words extracted by	334 // Set the character attributes so we can normalize the words extracted by

332 // this iterator.	335 // this iterator.

333 attribute_ = attribute;	336 attribute_ = attribute;

334 return true;	337 return true;

335 }	338 }

336	339

337 bool SpellcheckWordIterator::IsInitialized() const {	340 bool SpellcheckWordIterator::IsInitialized() const {

338 // Return true if we have an ICU custom iterator.	341 // Return true iff we have an iterator.

339 return !!iterator_;	342 return !!iterator_;

340 }	343 }

341	344

342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {	345 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {

343 DCHECK(!!iterator_);	346 DCHECK(!!iterator_);

344	347

345 // Set the text to be split by this iterator.	348 // Set the text to be split by this iterator.

346 UErrorCode status = U_ZERO_ERROR;	349 if (!iterator_->SetText(text, length)) {

347 ubrk_setText(iterator_, text, length, &status);	350 LOG(ERROR) << "failed to set text";

348 if (U_FAILURE(status))

349 return false;	351 return false;

	352 }

350	353

351 // Retrieve the position to the first word in this text. We return false if	354 // Return false if this text does not have any words. (For example, the
	Andrew Hayden (chromium.org) 2014/05/12 13:19:37 This turns out to be useless now. I asked the ICU mailing list if calling ubrk_first (which is what happens after setting the text) can ever return a non-zero value, and the answer is no. IsValid is based on this, so it will always - ALWAYS - be true after setting text or calling init. This comment has been incorrect since it was put here, as far as I can tell, and is useless. The later loop on Advance() will function correctly in all cases. We should take this out, as there's no way to do this detection correctly without iterating over the whole string in advance. groby-ooo-7-16 2014/05/12 20:14:49 On 2014/05/12 13:19:37, Andrew Hayden wrote: > This turns out to be useless now. I asked the ICU mailing list if calling > ubrk_first (which is what happens after setting the text) can ever return a > non-zero value, and the answer is no. IsValid is based on this, so it will > always - ALWAYS - be true after setting text or calling init. This comment has > been incorrect since it was put here, as far as I can tell, and is useless. The > later loop on Advance() will function correctly in all cases. We should take > this out, as there's no way to do this detection correctly without iterating > over the whole string in advance. Hm. As long as the tests pass, I suppose we're good. There's never been a test for the "false" case, so they should still pass ;) (Too bad SetText can still fail... I'd love to make this void. Oh well)
352 // this text does not have any words. (For example, The input text consists	355 // input text consists only of Chinese characters while the spellchecker

353 // only of Chinese characters while the spellchecker language is English.)	356 // language is English.)

354 position_ = ubrk_first(iterator_);	357 if (!iterator_->IsValid())

355 if (position_ == UBRK_DONE)

356 return false;	358 return false;

357	359

358 text_ = text;	360 text_ = text;
	groby-ooo-7-16 2014/05/12 20:14:49 I _think_ all references to _text can go. There's no reason to keep that alive any more, either.
359 length_ = static_cast<int>(length);

360 return true;	361 return true;

361 }	362 }

362	363

363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,	364 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,

364 int* word_start,	365 int* word_start,

365 int* word_length) {	366 int* word_length) {

366 DCHECK(!!text_ && length_ > 0);	367 DCHECK(!!text_);

367	368

368 word_string->clear();	369 word_string->clear();

369 *word_start = 0;	370 *word_start = 0;

370 *word_length = 0;	371 *word_length = 0;

371	372

372 if (!text_ \|\| position_ == UBRK_DONE)	373 if (!text_ \|\| !iterator_->IsValid())
	Andrew Hayden (chromium.org) 2014/05/12 13:19:37 Similarly, here, the only time that IsValid() will be false is after we get through the entire string. The while loop below already guards on Advance() returning true, which takes care of it properly. This check is completely unnecessary. groby-ooo-7-16 2014/05/12 20:14:49 On 2014/05/12 13:19:37, Andrew Hayden wrote: > Similarly, here, the only time that IsValid() will be false is after we get > through the entire string. The while loop below already guards on Advance() > returning true, which takes care of it properly. This check is completely > unnecessary. Since this is documented API behavior via the BreakIterator unittests, yeah, I'd say this can be removed.
373 return false;	374 return false;

374	375

375 // Find a word that can be checked for spelling. Our rule sets filter out	376 // Find a word that can be checked for spelling. Our rule sets filter out

376 // invalid words (e.g. numbers and characters not supported by the	377 // invalid words (e.g. numbers and characters not supported by the

377 // spellchecker language) so this ubrk_getRuleStatus() call returns	378 // spellchecker language) so this ubrk_getRuleStatus() call returns

378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such	379 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such

379 // words until we can find a valid word or reach the end of the input string.	380 // words until we can find a valid word or reach the end of the input string.

380 int next = ubrk_next(iterator_);	381 while (iterator_->Advance()) {

381 while (next != UBRK_DONE) {	382 const size_t start = iterator_->prev();

382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {	383 const size_t length = iterator_->pos() - start;

383 if (Normalize(position_, next - position_, word_string)) {	384 if (iterator_->IsWord()) {

384 *word_start = position_;	385 if (Normalize(start, length, word_string)) {

385 *word_length = next - position_;	386 *word_start = start;

386 position_ = next;	387 *word_length = length;

387 return true;	388 return true;

388 }	389 }

389 }	390 }

390 position_ = next;

391 next = ubrk_next(iterator_);

392 }	391 }

393	392

394 // There aren't any more words in the given text. Set the position to	393 // There aren't any more words in the given text.

395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function

396 // is called.

397 position_ = UBRK_DONE;

398 return false;	394 return false;

399 }	395 }

400	396

401 void SpellcheckWordIterator::Reset() {	397 void SpellcheckWordIterator::Reset() {

402 if (iterator_) {	398 iterator_.reset();

403 ubrk_close(iterator_);

404 iterator_ = NULL;

405 }

406 }	399 }

407	400

408 bool SpellcheckWordIterator::Normalize(int input_start,	401 bool SpellcheckWordIterator::Normalize(int input_start,

409 int input_length,	402 int input_length,

410 base::string16* output_string) const {	403 base::string16* output_string) const {

411 // We use NFKC (Normalization Form, Compatible decomposition, followed by	404 // We use NFKC (Normalization Form, Compatible decomposition, followed by

412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize	405 // canonical Composition) defined in Unicode Standard Annex #15 to normalize

413 // this token because it is the most suitable normalization algorithm for our	406 // this token because it is the most suitable normalization algorithm for our

414 // spellchecker. Nevertheless, it is not a perfect algorithm for our	407 // spellchecker. Nevertheless, it is not a perfect algorithm for our

415 // spellchecker and we need manual normalization as well. The normalized	408 // spellchecker and we need manual normalization as well. The normalized

416 // text does not have to be NUL-terminated since its characters are copied to	409 // text does not have to be NUL-terminated since its characters are copied to

417 // string16, which adds a NUL character when we need.	410 // string16, which adds a NUL character when we need.

418 icu::UnicodeString input(FALSE, &text_[input_start], input_length);	411 icu::UnicodeString input(FALSE, &text_[input_start], input_length);

419 UErrorCode status = U_ZERO_ERROR;	412 UErrorCode status = U_ZERO_ERROR;

420 icu::UnicodeString output;	413 icu::UnicodeString output;

421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);	414 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)	415 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)

423 return false;	416 return false;

424	417

425 // Copy the normalized text to the output.	418 // Copy the normalized text to the output.

426 icu::StringCharacterIterator it(output);	419 icu::StringCharacterIterator it(output);

427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())	420 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())

428 attribute_->OutputChar(c, output_string);	421 attribute_->OutputChar(c, output_string);

429	422

430 return !output_string->empty();	423 return !output_string->empty();

431 }	424 }

OLD	NEW

« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | no next file » | no next file with comments »