chrome/renderer/spellchecker/spellcheck_worditerator.cc - Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 270203003: Refactor code to avoid direct dependency upon ICU: spellcheck_worditerator (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@refactoring_icu_usage

Patch Set: Simplify Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Implements a custom word iterator used for our spellchecker.	5 // Implements a custom word iterator used for our spellchecker.

6	6

7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"	7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

8	8

9 #include <map>	9 #include <map>

10 #include <string>	10 #include <string>

11	11

12 #include "base/basictypes.h"	12 #include "base/basictypes.h"

	13 #include "base/i18n/break_iterator.h"

13 #include "base/logging.h"	14 #include "base/logging.h"

	15 #include "base/memory/scoped_ptr.h"
	groby-ooo-7-16 2014/05/08 17:52:00: No need to include scoped_ptr.h - transitive dependency. Andrew Hayden (chromium.org) 2014/05/09 15:16:16 (in reply to groby, 2014/05/08 17:52:00): Done.
14 #include "base/strings/stringprintf.h"	16 #include "base/strings/stringprintf.h"

15 #include "base/strings/utf_string_conversions.h"	17 #include "base/strings/utf_string_conversions.h"

16 #include "chrome/renderer/spellchecker/spellcheck.h"	18 #include "chrome/renderer/spellchecker/spellcheck.h"

17 #include "third_party/icu/source/common/unicode/normlzr.h"	19 #include "third_party/icu/source/common/unicode/normlzr.h"

18 #include "third_party/icu/source/common/unicode/schriter.h"	20 #include "third_party/icu/source/common/unicode/schriter.h"

19 #include "third_party/icu/source/common/unicode/uscript.h"	21 #include "third_party/icu/source/common/unicode/uscript.h"

20 #include "third_party/icu/source/i18n/unicode/ulocdata.h"	22 #include "third_party/icu/source/i18n/unicode/ulocdata.h"

21	23

22 // SpellcheckCharAttribute implementation:	24 // SpellcheckCharAttribute implementation:

23	25

(...skipping 268 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
292 UScriptCode script_code = uscript_getScript(c, &status);	294 UScriptCode script_code = uscript_getScript(c, &status);

293 if (script_code == script_code_ || script_code == USCRIPT_COMMON)	295 if (script_code == script_code_ || script_code == USCRIPT_COMMON)

294 output->push_back(c);	296 output->push_back(c);

295 return true;	297 return true;

296 }	298 }

297	299

298 // SpellcheckWordIterator implementation:	300 // SpellcheckWordIterator implementation:

299	301

300 SpellcheckWordIterator::SpellcheckWordIterator()	302 SpellcheckWordIterator::SpellcheckWordIterator()

301 : text_(NULL),	303 : text_(NULL),

302 length_(0),

303 position_(UBRK_DONE),

304 attribute_(NULL),	304 attribute_(NULL),

305 iterator_(NULL) {	305 iterator_() {

306 }	306 }

307	307

308 SpellcheckWordIterator::~SpellcheckWordIterator() {	308 SpellcheckWordIterator::~SpellcheckWordIterator() {

309 Reset();	309 Reset();

310 }	310 }

311	311

312 bool SpellcheckWordIterator::Initialize(	312 bool SpellcheckWordIterator::Initialize(

313 const SpellcheckCharAttribute* attribute,	313 const SpellcheckCharAttribute* attribute,

314 bool allow_contraction) {	314 bool allow_contraction) {

315 // Create a custom ICU break iterator with empty text used in this object. (We	315 // Create a custom ICU break iterator with empty text used in this object. (We

316 // allow setting text later so we can re-use this iterator.)	316 // allow setting text later so we can re-use this iterator.)

317 DCHECK(attribute);	317 DCHECK(attribute);

318 UErrorCode open_status = U_ZERO_ERROR;	318 const base::string16 rule(attribute->GetRuleSet(allow_contraction));

319 UParseError parse_status;

320 base::string16 rule(attribute->GetRuleSet(allow_contraction));

321	319

322 // If there is no rule set, the attributes were invalid.	320 // If there is no rule set, the attributes were invalid.

323 if (rule.empty())	321 if (rule.empty())

324 return false;	322 return false;

325	323

326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,	324 scoped_ptr<base::i18n::BreakIterator> iterator(

327 &parse_status, &open_status);	325 new base::i18n::BreakIterator(base::string16(), rule));

328 if (U_FAILURE(open_status))	326 if (!iterator->Init()) {

	327 NOTREACHED() << "failed to open iterator";
	groby-ooo-7-16 2014/05/08 17:52:00: Why NOTREACHED? If BreakIterator::Init truly can't fail, just have it there. Or better, have it not return an error. If it can fail, this is not unreachable. Andrew Hayden (chromium.org) 2014/05/09 15:16:16 (in reply to groby, 2014/05/08 17:52:00): I'll add a comment here. It can't fail in this case* unless the rules fail to parse, which would mean (since they're hardcoded) that there was a serious code error in this class.
329 return false;	328 return false;

	329 }

	330 iterator_ = iterator.Pass();

330	331

331 // Set the character attributes so we can normalize the words extracted by	332 // Set the character attributes so we can normalize the words extracted by

332 // this iterator.	333 // this iterator.

333 attribute_ = attribute;	334 attribute_ = attribute;

334 return true;	335 return true;

335 }	336 }

336	337

337 bool SpellcheckWordIterator::IsInitialized() const {	338 bool SpellcheckWordIterator::IsInitialized() const {

338 // Return true if we have an ICU custom iterator.	339 // Return true iff we have an iterator.

339 return !!iterator_;	340 return !!iterator_;

340 }	341 }

341	342

342 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {	343 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {

343 DCHECK(!!iterator_);	344 DCHECK(!!iterator_);

344	345

345 // Set the text to be split by this iterator.	346 // Set the text to be split by this iterator.

346 UErrorCode status = U_ZERO_ERROR;	347 if (!iterator_->SetText(text, length)) {

347 ubrk_setText(iterator_, text, length, &status);	348 NOTREACHED() << "failed to set text";
	groby-ooo-7-16 2014/05/08 17:52:00: See above. Andrew Hayden (chromium.org) 2014/05/09 15:16:16 (in reply to groby, 2014/05/08 17:52:00): Done.
348 if (U_FAILURE(status))

349 return false;	349 return false;

	350 }

350	351

351 // Retrieve the position to the first word in this text. We return false if	352 // Return false if this text does not have any words. (For example, the

352 // this text does not have any words. (For example, The input text consists	353 // input text consists only of Chinese characters while the spellchecker

353 // only of Chinese characters while the spellchecker language is English.)	354 // language is English.)

354 position_ = ubrk_first(iterator_);	355 if (!iterator_->IsValid())

355 if (position_ == UBRK_DONE)

356 return false;	356 return false;

357	357

358 text_ = text;	358 text_ = text;

359 length_ = static_cast<int>(length);

360 return true;	359 return true;

361 }	360 }

362	361

363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,	362 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,

364 int* word_start,	363 int* word_start,

365 int* word_length) {	364 int* word_length) {

366 DCHECK(!!text_ && length_ > 0);	365 DCHECK(!!text_);

367	366

368 word_string->clear();	367 word_string->clear();

369 *word_start = 0;	368 *word_start = 0;

370 *word_length = 0;	369 *word_length = 0;

371	370

372 if (!text_ || position_ == UBRK_DONE)	371 if (!text_ || !iterator_->IsValid())

373 return false;	372 return false;

374	373

375 // Find a word that can be checked for spelling. Our rule sets filter out	374 // Find a word that can be checked for spelling. Our rule sets filter out

376 // invalid words (e.g. numbers and characters not supported by the	375 // invalid words (e.g. numbers and characters not supported by the

377 // spellchecker language) so this ubrk_getRuleStatus() call returns	376 // spellchecker language) so this ubrk_getRuleStatus() call returns

378 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such	377 // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such

379 // words until we can find a valid word or reach the end of the input string.	378 // words until we can find a valid word or reach the end of the input string.

380 int next = ubrk_next(iterator_);	379 while (iterator_->Advance()) {

381 while (next != UBRK_DONE) {	380 const size_t start = iterator_->prev();

382 if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {	381 const size_t length = iterator_->pos() - start;

383 if (Normalize(position_, next - position_, word_string)) {	382 if (iterator_->IsWord()) {

384 *word_start = position_;	383 if (Normalize(start, length, word_string)) {

385 *word_length = next - position_;	384 *word_start = start;

386 position_ = next;	385 *word_length = length;

387 return true;	386 return true;

388 }	387 }

389 }	388 }

390 position_ = next;

391 next = ubrk_next(iterator_);

392 }	389 }

393	390

394 // There aren't any more words in the given text. Set the position to	391 // There aren't any more words in the given text.

395 // UBRK_DONE to prevent from calling ubrk_next() next time when this function

396 // is called.

397 position_ = UBRK_DONE;

398 return false;	392 return false;

399 }	393 }

400	394

401 void SpellcheckWordIterator::Reset() {	395 void SpellcheckWordIterator::Reset() {

402 if (iterator_) {	396 iterator_.reset(0);
	groby-ooo-7-16 2014/05/08 17:52:00: Just reset() - more idiomatic. Andrew Hayden (chromium.org) 2014/05/09 15:16:16 (in reply to groby, 2014/05/08 17:52:00): Done.
403 ubrk_close(iterator_);

404 iterator_ = NULL;

405 }

406 }	397 }

407	398

408 bool SpellcheckWordIterator::Normalize(int input_start,	399 bool SpellcheckWordIterator::Normalize(int input_start,

409 int input_length,	400 int input_length,

410 base::string16* output_string) const {	401 base::string16* output_string) const {

411 // We use NFKC (Normalization Form, Compatible decomposition, followed by	402 // We use NFKC (Normalization Form, Compatible decomposition, followed by

412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize	403 // canonical Composition) defined in Unicode Standard Annex #15 to normalize

413 // this token because it is the most suitable normalization algorithm for our	404 // this token because it is the most suitable normalization algorithm for our

414 // spellchecker. Nevertheless, it is not a perfect algorithm for our	405 // spellchecker. Nevertheless, it is not a perfect algorithm for our

415 // spellchecker and we need manual normalization as well. The normalized	406 // spellchecker and we need manual normalization as well. The normalized

416 // text does not have to be NUL-terminated since its characters are copied to	407 // text does not have to be NUL-terminated since its characters are copied to

417 // string16, which adds a NUL character when we need.	408 // string16, which adds a NUL character when we need.

418 icu::UnicodeString input(FALSE, &text_[input_start], input_length);	409 icu::UnicodeString input(FALSE, &text_[input_start], input_length);

419 UErrorCode status = U_ZERO_ERROR;	410 UErrorCode status = U_ZERO_ERROR;

420 icu::UnicodeString output;	411 icu::UnicodeString output;

421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);	412 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)	413 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)

423 return false;	414 return false;

424	415

425 // Copy the normalized text to the output.	416 // Copy the normalized text to the output.

426 icu::StringCharacterIterator it(output);	417 icu::StringCharacterIterator it(output);

427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())	418 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())

428 attribute_->OutputChar(c, output_string);	419 attribute_->OutputChar(c, output_string);

429	420

430 return !output_string->empty();	421 return !output_string->empty();

431 }	422 }

OLD	NEW

« chrome/renderer/spellchecker/spellcheck_worditerator.h ('K') | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | no next file » | no next file with comments »