chrome/renderer/spellchecker/spellcheck_worditerator.cc - Issue 105493002: Use base namespace for string16 in chrome/renderer.

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 105493002: Use base namespace for string16 in chrome/renderer. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Implements a custom word iterator used for our spellchecker.	5 // Implements a custom word iterator used for our spellchecker.

6	6

7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"	7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

8	8

9 #include <map>	9 #include <map>

10 #include <string>	10 #include <string>

(...skipping 14 matching lines...) Expand all Loading...
25 : script_code_(USCRIPT_LATIN) {	25 : script_code_(USCRIPT_LATIN) {

26 }	26 }

27	27

28 SpellcheckCharAttribute::~SpellcheckCharAttribute() {	28 SpellcheckCharAttribute::~SpellcheckCharAttribute() {

29 }	29 }

30	30

31 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {	31 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {

32 CreateRuleSets(language);	32 CreateRuleSets(language);

33 }	33 }

34	34

35 string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const {	35 base::string16 SpellcheckCharAttribute::GetRuleSet(

	36 bool allow_contraction) const {

36 return allow_contraction ?	37 return allow_contraction ?

37 ruleset_allow_contraction_ : ruleset_disallow_contraction_;	38 ruleset_allow_contraction_ : ruleset_disallow_contraction_;

38 }	39 }

39	40

40 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {	41 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {

41 // The template for our custom rule sets, which is based on the word-break	42 // The template for our custom rule sets, which is based on the word-break

42 // rules of ICU 4.0:	43 // rules of ICU 4.0:

43 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.	44 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.

44 // The major differences from the original one are listed below:	45 // The major differences from the original one are listed below:

45 // * It discards comments in the original rules.	46 // * It discards comments in the original rules.

(...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
182 kAllowContraction));	183 kAllowContraction));

183 ruleset_disallow_contraction_ = ASCIIToUTF16(	184 ruleset_disallow_contraction_ = ASCIIToUTF16(

184 base::StringPrintf(kRuleTemplate,	185 base::StringPrintf(kRuleTemplate,

185 aletter,	186 aletter,

186 aletter_extra,	187 aletter_extra,

187 midletter_extra,	188 midletter_extra,

188 aletter_plus,	189 aletter_plus,

189 kDisallowContraction));	190 kDisallowContraction));

190 }	191 }

191	192

192 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {	193 bool SpellcheckCharAttribute::OutputChar(UChar c,

	194 base::string16* output) const {

193 // Call the language-specific function if necessary.	195 // Call the language-specific function if necessary.

194 // Otherwise, we call the default one.	196 // Otherwise, we call the default one.

195 switch (script_code_) {	197 switch (script_code_) {

196 case USCRIPT_ARABIC:	198 case USCRIPT_ARABIC:

197 return OutputArabic(c, output);	199 return OutputArabic(c, output);

198	200

199 case USCRIPT_HANGUL:	201 case USCRIPT_HANGUL:

200 return OutputHangul(c, output);	202 return OutputHangul(c, output);

201	203

202 case USCRIPT_HEBREW:	204 case USCRIPT_HEBREW:

203 return OutputHebrew(c, output);	205 return OutputHebrew(c, output);

204	206

205 default:	207 default:

206 return OutputDefault(c, output);	208 return OutputDefault(c, output);

207 }	209 }

208 }	210 }

209	211

210 bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const {	212 bool SpellcheckCharAttribute::OutputArabic(UChar c,

	213 base::string16* output) const {

211 // Discard characters not from Arabic alphabets. We also discard vowel marks	214 // Discard characters not from Arabic alphabets. We also discard vowel marks

212 // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from	215 // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from

213 // marking an Arabic word including vowel marks as misspelled. (We need to	216 // marking an Arabic word including vowel marks as misspelled. (We need to

214 // check these vowel marks manually and filter them out since their script	217 // check these vowel marks manually and filter them out since their script

215 // codes are USCRIPT_ARABIC.)	218 // codes are USCRIPT_ARABIC.)

216 if (0x0621 <= c && c <= 0x064D)	219 if (0x0621 <= c && c <= 0x064D)

217 output->push_back(c);	220 output->push_back(c);

218 return true;	221 return true;

219 }	222 }

220	223

221 bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {	224 bool SpellcheckCharAttribute::OutputHangul(UChar c,

	225 base::string16* output) const {

222 // Decompose a Hangul character to a Hangul vowel and consonants used by our	226 // Decompose a Hangul character to a Hangul vowel and consonants used by our

223 // spellchecker. A Hangul character of Unicode is a ligature consisting of a	227 // spellchecker. A Hangul character of Unicode is a ligature consisting of a

224 // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",	228 // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",

225 // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as	229 // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as

226 // a point of a cubic linear space consisting of (first consonant, vowel, last	230 // a point of a cubic linear space consisting of (first consonant, vowel, last

227 // consonant). Therefore, we can compose a Hangul character from a vowel and	231 // consonant). Therefore, we can compose a Hangul character from a vowel and

228 // two consonants with linear composition:	232 // two consonants with linear composition:

229 // character = 0xAC00 +	233 // character = 0xAC00 +

230 // (first consonant - 0x1100) * 28 * 21 +	234 // (first consonant - 0x1100) * 28 * 21 +

231 // (vowel - 0x1161) * 28 +	235 // (vowel - 0x1161) * 28 +

(...skipping 26 matching lines...) Expand all Loading...
258 int l = kLBase + index / kNCount;	262 int l = kLBase + index / kNCount;

259 int v = kVBase + (index % kNCount) / kTCount;	263 int v = kVBase + (index % kNCount) / kTCount;

260 int t = kTBase + index % kTCount;	264 int t = kTBase + index % kTCount;

261 output->push_back(l);	265 output->push_back(l);

262 output->push_back(v);	266 output->push_back(v);

263 if (t != kTBase)	267 if (t != kTBase)

264 output->push_back(t);	268 output->push_back(t);

265 return true;	269 return true;

266 }	270 }

267	271

268 bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {	272 bool SpellcheckCharAttribute::OutputHebrew(UChar c,

	273 base::string16* output) const {

269 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds	274 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds

270 // to prevent our Hebrew dictionary from marking a Hebrew word including	275 // to prevent our Hebrew dictionary from marking a Hebrew word including

271 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check	276 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check

272 // niqquds manually and filter them out since their script codes are	277 // niqquds manually and filter them out since their script codes are

273 // USCRIPT_HEBREW.)	278 // USCRIPT_HEBREW.)

274 // Pass through ASCII single/double quotation marks and Hebrew Geresh and	279 // Pass through ASCII single/double quotation marks and Hebrew Geresh and

275 // Gershayim.	280 // Gershayim.

276 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||	281 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||

277 c == 0x05F4 || c == 0x05F3)	282 c == 0x05F4 || c == 0x05F3)

278 output->push_back(c);	283 output->push_back(c);

279 return true;	284 return true;

280 }	285 }

281	286

282 bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {	287 bool SpellcheckCharAttribute::OutputDefault(UChar c,

	288 base::string16* output) const {

283 // Check the script code of this character and output only if it is the one	289 // Check the script code of this character and output only if it is the one

284 // used by the spellchecker language.	290 // used by the spellchecker language.

285 UErrorCode status = U_ZERO_ERROR;	291 UErrorCode status = U_ZERO_ERROR;

286 UScriptCode script_code = uscript_getScript(c, &status);	292 UScriptCode script_code = uscript_getScript(c, &status);

287 if (script_code == script_code_ || script_code == USCRIPT_COMMON)	293 if (script_code == script_code_ || script_code == USCRIPT_COMMON)

288 output->push_back(c);	294 output->push_back(c);

289 return true;	295 return true;

290 }	296 }

291	297

292 // SpellcheckWordIterator implementation:	298 // SpellcheckWordIterator implementation:

(...skipping 11 matching lines...) Expand all Loading...
304 }	310 }

305	311

306 bool SpellcheckWordIterator::Initialize(	312 bool SpellcheckWordIterator::Initialize(

307 const SpellcheckCharAttribute* attribute,	313 const SpellcheckCharAttribute* attribute,

308 bool allow_contraction) {	314 bool allow_contraction) {

309 // Create a custom ICU break iterator with empty text used in this object. (We	315 // Create a custom ICU break iterator with empty text used in this object. (We

310 // allow setting text later so we can re-use this iterator.)	316 // allow setting text later so we can re-use this iterator.)

311 DCHECK(attribute);	317 DCHECK(attribute);

312 UErrorCode open_status = U_ZERO_ERROR;	318 UErrorCode open_status = U_ZERO_ERROR;

313 UParseError parse_status;	319 UParseError parse_status;

314 string16 rule(attribute->GetRuleSet(allow_contraction));	320 base::string16 rule(attribute->GetRuleSet(allow_contraction));

315	321

316 // If there is no rule set, the attributes were invalid.	322 // If there is no rule set, the attributes were invalid.

317 if (rule.empty())	323 if (rule.empty())

318 return false;	324 return false;

319	325

320 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,	326 iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,

321 &parse_status, &open_status);	327 &parse_status, &open_status);

322 if (U_FAILURE(open_status))	328 if (U_FAILURE(open_status))

323 return false;	329 return false;

324	330

(...skipping 22 matching lines...) Expand all Loading...
347 // only of Chinese characters while the spellchecker language is English.)	353 // only of Chinese characters while the spellchecker language is English.)

348 position_ = ubrk_first(iterator_);	354 position_ = ubrk_first(iterator_);

349 if (position_ == UBRK_DONE)	355 if (position_ == UBRK_DONE)

350 return false;	356 return false;

351	357

352 text_ = text;	358 text_ = text;

353 length_ = static_cast<int>(length);	359 length_ = static_cast<int>(length);

354 return true;	360 return true;

355 }	361 }

356	362

357 bool SpellcheckWordIterator::GetNextWord(string16* word_string,	363 bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,

358 int* word_start,	364 int* word_start,

359 int* word_length) {	365 int* word_length) {

360 DCHECK(!!text_ && length_ > 0);	366 DCHECK(!!text_ && length_ > 0);

361	367

362 word_string->clear();	368 word_string->clear();

363 *word_start = 0;	369 *word_start = 0;

364 *word_length = 0;	370 *word_length = 0;

365	371

366 if (!text_ || position_ == UBRK_DONE)	372 if (!text_ || position_ == UBRK_DONE)

367 return false;	373 return false;

(...skipping 26 matching lines...) Expand all Loading...
394	400

395 void SpellcheckWordIterator::Reset() {	401 void SpellcheckWordIterator::Reset() {

396 if (iterator_) {	402 if (iterator_) {

397 ubrk_close(iterator_);	403 ubrk_close(iterator_);

398 iterator_ = NULL;	404 iterator_ = NULL;

399 }	405 }

400 }	406 }

401	407

402 bool SpellcheckWordIterator::Normalize(int input_start,	408 bool SpellcheckWordIterator::Normalize(int input_start,

403 int input_length,	409 int input_length,

404 string16* output_string) const {	410 base::string16* output_string) const {

405 // We use NFKC (Normalization Form, Compatible decomposition, followed by	411 // We use NFKC (Normalization Form, Compatible decomposition, followed by

406 // canonical Composition) defined in Unicode Standard Annex #15 to normalize	412 // canonical Composition) defined in Unicode Standard Annex #15 to normalize

407 // this token because it is the most suitable normalization algorithm for our	413 // this token because it is the most suitable normalization algorithm for our

408 // spellchecker. Nevertheless, it is not a perfect algorithm for our	414 // spellchecker. Nevertheless, it is not a perfect algorithm for our

409 // spellchecker and we need manual normalization as well. The normalized	415 // spellchecker and we need manual normalization as well. The normalized

410 // text does not have to be NUL-terminated since its characters are copied to	416 // text does not have to be NUL-terminated since its characters are copied to

411 // string16, which adds a NUL character when we need.	417 // string16, which adds a NUL character when we need.

412 icu::UnicodeString input(FALSE, &text_[input_start], input_length);	418 icu::UnicodeString input(FALSE, &text_[input_start], input_length);

413 UErrorCode status = U_ZERO_ERROR;	419 UErrorCode status = U_ZERO_ERROR;

414 icu::UnicodeString output;	420 icu::UnicodeString output;

415 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);	421 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

416 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)	422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)

417 return false;	423 return false;

418	424

419 // Copy the normalized text to the output.	425 // Copy the normalized text to the output.

420 icu::StringCharacterIterator it(output);	426 icu::StringCharacterIterator it(output);

421 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())	427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())

422 attribute_->OutputChar(c, output_string);	428 attribute_->OutputChar(c, output_string);

423	429

424 return !output_string->empty();	430 return !output_string->empty();

425 }	431 }

OLD	NEW