chrome/renderer/spellchecker/spellcheck_worditerator.cc - Issue 2159283003: [WIP][DO NOT LAND] Componentize spellcheck

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator.cc

Issue 2159283003: [WIP][DO NOT LAND] Componentize spellcheck Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.h ('k') | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 // Implements a custom word iterator used for our spellchecker.

6

7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

8

9 #include <map>

10 #include <memory>

11 #include <string>

12 #include <utility>

13

14 #include "base/i18n/break_iterator.h"

15 #include "base/logging.h"

16 #include "base/macros.h"

17 #include "base/strings/stringprintf.h"

18 #include "base/strings/utf_string_conversions.h"

19 #include "chrome/renderer/spellchecker/spellcheck.h"

20 #include "third_party/icu/source/common/unicode/normlzr.h"

21 #include "third_party/icu/source/common/unicode/schriter.h"

22 #include "third_party/icu/source/common/unicode/uscript.h"

23 #include "third_party/icu/source/i18n/unicode/ulocdata.h"

24

25 using base::i18n::BreakIterator;

26

27 // SpellcheckCharAttribute implementation:

28

29 SpellcheckCharAttribute::SpellcheckCharAttribute()

30 : script_code_(USCRIPT_LATIN) {

31 }

32

33 SpellcheckCharAttribute::~SpellcheckCharAttribute() {

34 }

35

36 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {

37 CreateRuleSets(language);

38 }

39

40 base::string16 SpellcheckCharAttribute::GetRuleSet(

41 bool allow_contraction) const {

42 return allow_contraction ?

43 ruleset_allow_contraction_ : ruleset_disallow_contraction_;

44 }

45

46 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {

47 // The template for our custom rule sets, which is based on the word-break

48 // rules of ICU 4.0:

49 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/b rkitr/word.txt>.

50 // The major differences from the original one are listed below:

51 // * It discards comments in the original rules.

52 // * It discards characters not needed by our spellchecker (e.g. numbers,

53 // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).

54 // * It allows customization of the $ALetter value (i.e. word characters).

55 // * It allows customization of the $ALetterPlus value (i.e. whether or not to

56 // use the dictionary data).

57 // * It allows choosing whether or not to split a text at contraction

58 // characters.

59 // This template only changes the forward-iteration rules. So, calling

60 // ubrk_prev() returns the same results as the original template.

61 static const char kRuleTemplate[] =

62 "!!chain;"

63 "$CR = [\\p{Word_Break = CR}];"

64 "$LF = [\\p{Word_Break = LF}];"

65 "$Newline = [\\p{Word_Break = Newline}];"

66 "$Extend = [\\p{Word_Break = Extend}];"

67 "$Format = [\\p{Word_Break = Format}];"

68 "$Katakana = [\\p{Word_Break = Katakana}];"

69 // Not all the characters in a given script are ALetter.

70 // For instance, U+05F4 is MidLetter. So, this may be

71 // better, but it leads to an empty set error in Thai.

72 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"

73 "$ALetter = [\\p{script=%s}%s];"

74 // U+0027 (single quote/apostrophe) is not in MidNumLet any more

75 // in UAX 29 rev 21 or later. For our purpose, U+0027

76 // has to be treated as MidNumLet. ( http://crbug.com/364072 )

77 "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];"

78 "$MidLetter = [\\p{Word_Break = MidLetter}%s];"

79 "$MidNum = [\\p{Word_Break = MidNum}];"

80 "$Numeric = [\\p{Word_Break = Numeric}];"

81 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"

82

83 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; "

84 "%s" // ALetterPlus

85

86 "$KatakanaEx = $Katakana ($Extend \| $Format)*;"

87 "$ALetterEx = $ALetterPlus ($Extend \| $Format)*;"

88 "$MidNumLetEx = $MidNumLet ($Extend \| $Format)*;"

89 "$MidLetterEx = $MidLetter ($Extend \| $Format)*;"

90 "$MidNumEx = $MidNum ($Extend \| $Format)*;"

91 "$NumericEx = $Numeric ($Extend \| $Format)*;"

92 "$ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;"

93

94 "$Hiragana = [\\p{script=Hiragana}];"

95 "$Ideographic = [\\p{Ideographic}];"

96 "$HiraganaEx = $Hiragana ($Extend \| $Format)*;"

97 "$IdeographicEx = $Ideographic ($Extend \| $Format)*;"

98

99 "!!forward;"

100 "$CR $LF;"

101 "[^$CR $LF $Newline]? ($Extend \| $Format)+;"

102 "$ALetterEx {200};"

103 "$ALetterEx $ALetterEx {200};"

104 "%s" // (Allow\|Disallow) Contraction

105

106 "!!reverse;"

107 "$BackALetterEx = ($Format \| $Extend)* $ALetterPlus;"

108 "$BackMidNumLetEx = ($Format \| $Extend)* $MidNumLet;"

109 "$BackNumericEx = ($Format \| $Extend)* $Numeric;"

110 "$BackMidNumEx = ($Format \| $Extend)* $MidNum;"

111 "$BackMidLetterEx = ($Format \| $Extend)* $MidLetter;"

112 "$BackKatakanaEx = ($Format \| $Extend)* $Katakana;"

113 "$BackExtendNumLetEx= ($Format \| $Extend)* $ExtendNumLet;"

114 "$LF $CR;"

115 "($Format \| $Extend)* [^$CR $LF $Newline]?;"

116 "$BackALetterEx $BackALetterEx;"

117 "$BackALetterEx ($BackMidLetterEx \| $BackMidNumLetEx) $BackALetterEx;"

118 "$BackNumericEx $BackNumericEx;"

119 "$BackNumericEx $BackALetterEx;"

120 "$BackALetterEx $BackNumericEx;"

121 "$BackNumericEx ($BackMidNumEx \| $BackMidNumLetEx) $BackNumericEx;"

122 "$BackKatakanaEx $BackKatakanaEx;"

123 "$BackExtendNumLetEx ($BackALetterEx \| $BackNumericEx \|"

124 " $BackKatakanaEx \| $BackExtendNumLetEx);"

125 "($BackALetterEx \| $BackNumericEx \| $BackKatakanaEx)"

126 " $BackExtendNumLetEx;"

127

128 "!!safe_reverse;"

129 "($Extend \| $Format)+ .?;"

130 "($MidLetter \| $MidNumLet) $BackALetterEx;"

131 "($MidNum \| $MidNumLet) $BackNumericEx;"

132

133 "!!safe_forward;"

134 "($Extend \| $Format)+ .?;"

135 "($MidLetterEx \| $MidNumLetEx) $ALetterEx;"

136 "($MidNumEx \| $MidNumLetEx) $NumericEx;";

137

138 // Retrieve the script codes used by the given language from ICU. When the

139 // given language consists of two or more scripts, we just use the first

140 // script. The size of returned script codes is always < 8. Therefore, we use

141 // an array of size 8 so we can include all script codes without insufficient

142 // buffer errors.

143 UErrorCode error = U_ZERO_ERROR;

144 UScriptCode script_code[8];

145 int scripts = uscript_getCode(language.c_str(), script_code,

146 arraysize(script_code), &error);

147 if (U_SUCCESS(error) && scripts >= 1)

148 script_code_ = script_code[0];

149

150 // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary

151 // only for the languages which need it (i.e. Korean and Thai) to prevent ICU

152 // from returning dictionary words (i.e. Korean or Thai words) for languages

153 // which don't need them.

154 const char* aletter = uscript_getName(script_code_);

155 if (!aletter)

156 aletter = "Latin";

157

158 const char kWithDictionary[] =

159 "$dictionary = [:LineBreak = Complex_Context:];"

160 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];";

161 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;";

162 const char* aletter_plus = kWithoutDictionary;

163 if (script_code_ == USCRIPT_HANGUL \|\| script_code_ == USCRIPT_THAI \|\|

164 script_code_ == USCRIPT_LAO \|\| script_code_ == USCRIPT_KHMER)

165 aletter_plus = kWithDictionary;

166

167 // Treat numbers as word characters except for Arabic and Hebrew.

168 const char* aletter_extra = " [0123456789]";

169 if (script_code_ == USCRIPT_HEBREW)

170 aletter_extra = "";

171 else if (script_code_ == USCRIPT_ARABIC)

172 // When "script=Arabic", it does not include tatweel, which is

173 // "script=Common" so add it back. Otherwise, it creates unwanted

174 // word breaks.

175 aletter_extra = " [\\u0640]";

176

177 const char kMidLetterExtra[] = "";

178 // For Hebrew, treat single/double quoation marks as MidLetter.

179 const char kMidLetterExtraHebrew[] = "\"'";

180 const char* midletter_extra = kMidLetterExtra;

181 if (script_code_ == USCRIPT_HEBREW)

182 midletter_extra = kMidLetterExtraHebrew;

183

184 // Create two custom rule-sets: one allows contraction and the other does not.

185 // We save these strings in UTF-16 so we can use it without conversions. (ICU

186 // needs UTF-16 strings.)

187 const char kAllowContraction[] =

188 "$ALetterEx ($MidLetterEx \| $MidNumLetEx) $ALetterEx {200};";

189 const char kDisallowContraction[] = "";

190

191 ruleset_allow_contraction_ = base::ASCIIToUTF16(

192 base::StringPrintf(kRuleTemplate,

193 aletter,

194 aletter_extra,

195 midletter_extra,

196 aletter_plus,

197 kAllowContraction));

198 ruleset_disallow_contraction_ = base::ASCIIToUTF16(

199 base::StringPrintf(kRuleTemplate,

200 aletter,

201 aletter_extra,

202 midletter_extra,

203 aletter_plus,

204 kDisallowContraction));

205 }

206

207 bool SpellcheckCharAttribute::OutputChar(UChar c,

208 base::string16* output) const {

209 // Call the language-specific function if necessary.

210 // Otherwise, we call the default one.

211 switch (script_code_) {

212 case USCRIPT_ARABIC:

213 return OutputArabic(c, output);

214

215 case USCRIPT_HANGUL:

216 return OutputHangul(c, output);

217

218 case USCRIPT_HEBREW:

219 return OutputHebrew(c, output);

220

221 default:

222 return OutputDefault(c, output);

223 }

224 }

225

226 bool SpellcheckCharAttribute::OutputArabic(UChar c,

227 base::string16* output) const {

228 // Include non-Arabic characters (which should trigger a spelling error)

229 // and Arabic characters excluding vowel marks and class "Lm".

230 // We filter the latter because, while they are "letters", they are

231 // optional and so don't affect the correctness of the rest of the word.

232 if (!(0x0600 <= c && c <= 0x06FF) \|\| (u_isalpha(c) && c != 0x0640))

233 output->push_back(c);

234 return true;

235 }

236

237 bool SpellcheckCharAttribute::OutputHangul(UChar c,

238 base::string16* output) const {

239 // Decompose a Hangul character to a Hangul vowel and consonants used by our

240 // spellchecker. A Hangul character of Unicode is a ligature consisting of a

241 // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",

242 // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as

243 // a point of a cubic linear space consisting of (first consonant, vowel, last

244 // consonant). Therefore, we can compose a Hangul character from a vowel and

245 // two consonants with linear composition:

246 // character = 0xAC00 +

247 // (first consonant - 0x1100) * 28 * 21 +

248 // (vowel - 0x1161) * 28 +

249 // (last consonant - 0x11A7);

250 // We can also decompose a Hangul character with linear decomposition:

251 // first consonant = (character - 0xAC00) / 28 / 21;

252 // vowel = (character - 0xAC00) / 28 % 21;

253 // last consonant = (character - 0xAC00) % 28;

254 // This code is copied from Unicode Standard Annex #15

255 // <http://unicode.org/reports/tr15> and added some comments.

256 const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters.

257 const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants.

258 const int kVBase = 0x1161; // U+1161: the top of Hangul vowels.

259 const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants.

260 const int kLCount = 19; // The number of Hangul first consonants.

261 const int kVCount = 21; // The number of Hangul vowels.

262 const int kTCount = 28; // The number of Hangul last consonants.

263 const int kNCount = kVCount * kTCount;

264 const int kSCount = kLCount * kNCount;

265

266 int index = c - kSBase;

267 if (index < 0 \|\| index >= kSBase + kSCount) {

268 // This is not a Hangul syllable. Call the default output function since we

269 // should output this character when it is a Hangul syllable.

270 return OutputDefault(c, output);

271 }

272

273 // This is a Hangul character. Decompose this characters into Hangul vowels

274 // and consonants.

275 int l = kLBase + index / kNCount;

276 int v = kVBase + (index % kNCount) / kTCount;

277 int t = kTBase + index % kTCount;

278 output->push_back(l);

279 output->push_back(v);

280 if (t != kTBase)

281 output->push_back(t);

282 return true;

283 }

284

285 bool SpellcheckCharAttribute::OutputHebrew(UChar c,

286 base::string16* output) const {

287 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds

288 // to prevent our Hebrew dictionary from marking a Hebrew word including

289 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check

290 // niqquds manually and filter them out since their script codes are

291 // USCRIPT_HEBREW.)

292 // Pass through ASCII single/double quotation marks and Hebrew Geresh and

293 // Gershayim.

294 if ((0x05D0 <= c && c <= 0x05EA) \|\| c == 0x22 \|\| c == 0x27 \|\|

295 c == 0x05F4 \|\| c == 0x05F3)

296 output->push_back(c);

297 return true;

298 }

299

300 bool SpellcheckCharAttribute::OutputDefault(UChar c,

301 base::string16* output) const {

302 // Check the script code of this character and output only if it is the one

303 // used by the spellchecker language.

304 UErrorCode status = U_ZERO_ERROR;

305 UScriptCode script_code = uscript_getScript(c, &status);

306 if (script_code == script_code_ \|\| script_code == USCRIPT_COMMON)

307 output->push_back(c);

308 return true;

309 }

310

311 // SpellcheckWordIterator implementation:

312

313 SpellcheckWordIterator::SpellcheckWordIterator()

314 : text_(NULL),

315 attribute_(NULL),

316 iterator_() {

317 }

318

319 SpellcheckWordIterator::~SpellcheckWordIterator() {

320 Reset();

321 }

322

323 bool SpellcheckWordIterator::Initialize(

324 const SpellcheckCharAttribute* attribute,

325 bool allow_contraction) {

326 // Create a custom ICU break iterator with empty text used in this object. (We

327 // allow setting text later so we can re-use this iterator.)

328 DCHECK(attribute);

329 const base::string16 rule(attribute->GetRuleSet(allow_contraction));

330

331 // If there is no rule set, the attributes were invalid.

332 if (rule.empty())

333 return false;

334

335 std::unique_ptr<BreakIterator> iterator(

336 new BreakIterator(base::string16(), rule));

337 if (!iterator->Init()) {

338 // Since we're not passing in any text, the only reason this could fail

339 // is if we fail to parse the rules. Since the rules are hardcoded,

340 // that would be a bug in this class.

341 NOTREACHED() << "failed to open iterator (broken rules)";

342 return false;

343 }

344 iterator_ = std::move(iterator);

345

346 // Set the character attributes so we can normalize the words extracted by

347 // this iterator.

348 attribute_ = attribute;

349 return true;

350 }

351

352 bool SpellcheckWordIterator::IsInitialized() const {

353 // Return true iff we have an iterator.

354 return !!iterator_;

355 }

356

357 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {

358 DCHECK(!!iterator_);

359

360 // Set the text to be split by this iterator.

361 if (!iterator_->SetText(text, length)) {

362 LOG(ERROR) << "failed to set text";

363 return false;

364 }

365

366 text_ = text;

367 return true;

368 }

369

370 SpellcheckWordIterator::WordIteratorStatus SpellcheckWordIterator::GetNextWord(

371 base::string16* word_string,

372 int* word_start,

373 int* word_length) {

374 DCHECK(!!text_);

375

376 word_string->clear();

377 *word_start = 0;

378 *word_length = 0;

379

380 if (!text_) {

381 return IS_END_OF_TEXT;

382 }

383

384 // Find a word that can be checked for spelling or a character that can be

385 // skipped over. Rather than moving past a skippable character this returns

386 // IS_SKIPPABLE and defers handling the character to the calling function.

387 while (iterator_->Advance()) {

388 const size_t start = iterator_->prev();

389 const size_t length = iterator_->pos() - start;

390 switch (iterator_->GetWordBreakStatus()) {

391 case BreakIterator::IS_WORD_BREAK: {

392 if (Normalize(start, length, word_string)) {

393 *word_start = start;

394 *word_length = length;

395 return IS_WORD;

396 }

397 break;

398 }

399 case BreakIterator::IS_SKIPPABLE_WORD: {

400 *word_string = iterator_->GetString();

401 *word_start = start;

402 *word_length = length;

403 return IS_SKIPPABLE;

404 }

405 // \|iterator_\| is RULE_BASED so the break status should never be

406 // IS_LINE_OR_CHAR_BREAK.

407 case BreakIterator::IS_LINE_OR_CHAR_BREAK: {

408 NOTREACHED();

409 break;

410 }

411 }

412 }

413

414 // There aren't any more words in the given text.

415 return IS_END_OF_TEXT;

416 }

417

418 void SpellcheckWordIterator::Reset() {

419 iterator_.reset();

420 }

421

422 bool SpellcheckWordIterator::Normalize(int input_start,

423 int input_length,

424 base::string16* output_string) const {

425 // We use NFKC (Normalization Form, Compatible decomposition, followed by

426 // canonical Composition) defined in Unicode Standard Annex #15 to normalize

427 // this token because it it the most suitable normalization algorithm for our

428 // spellchecker. Nevertheless, it is not a perfect algorithm for our

429 // spellchecker and we need manual normalization as well. The normalized

430 // text does not have to be NUL-terminated since its characters are copied to

431 // string16, which adds a NUL character when we need.

432 icu::UnicodeString input(FALSE, &text_[input_start], input_length);

433 UErrorCode status = U_ZERO_ERROR;

434 icu::UnicodeString output;

435 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

436 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)

437 return false;

438

439 // Copy the normalized text to the output.

440 icu::StringCharacterIterator it(output);

441 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())

442 attribute_->OutputChar(c, output_string);

443

444 return !output_string->empty();

445 }

OLD	NEW