OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // Implements a custom word iterator used for our spellchecker. | |
6 | |
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | |
8 | |
9 #include <map> | |
10 #include <memory> | |
11 #include <string> | |
12 #include <utility> | |
13 | |
14 #include "base/i18n/break_iterator.h" | |
15 #include "base/logging.h" | |
16 #include "base/macros.h" | |
17 #include "base/strings/stringprintf.h" | |
18 #include "base/strings/utf_string_conversions.h" | |
19 #include "chrome/renderer/spellchecker/spellcheck.h" | |
20 #include "third_party/icu/source/common/unicode/normlzr.h" | |
21 #include "third_party/icu/source/common/unicode/schriter.h" | |
22 #include "third_party/icu/source/common/unicode/uscript.h" | |
23 #include "third_party/icu/source/i18n/unicode/ulocdata.h" | |
24 | |
25 using base::i18n::BreakIterator; | |
26 | |
27 // SpellcheckCharAttribute implementation: | |
28 | |
29 SpellcheckCharAttribute::SpellcheckCharAttribute() | |
30 : script_code_(USCRIPT_LATIN) { | |
31 } | |
32 | |
33 SpellcheckCharAttribute::~SpellcheckCharAttribute() { | |
34 } | |
35 | |
36 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { | |
37 CreateRuleSets(language); | |
38 } | |
39 | |
40 base::string16 SpellcheckCharAttribute::GetRuleSet( | |
41 bool allow_contraction) const { | |
42 return allow_contraction ? | |
43 ruleset_allow_contraction_ : ruleset_disallow_contraction_; | |
44 } | |
45 | |
46 void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { | |
47 // The template for our custom rule sets, which is based on the word-break | |
48 // rules of ICU 4.0: | |
49 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/b
rkitr/word.txt>. | |
50 // The major differences from the original one are listed below: | |
51 // * It discards comments in the original rules. | |
52 // * It discards characters not needed by our spellchecker (e.g. numbers, | |
53 // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on). | |
54 // * It allows customization of the $ALetter value (i.e. word characters). | |
55 // * It allows customization of the $ALetterPlus value (i.e. whether or not to | |
56 // use the dictionary data). | |
57 // * It allows choosing whether or not to split a text at contraction | |
58 // characters. | |
59 // This template only changes the forward-iteration rules. So, calling | |
60 // ubrk_prev() returns the same results as the original template. | |
61 static const char kRuleTemplate[] = | |
62 "!!chain;" | |
63 "$CR = [\\p{Word_Break = CR}];" | |
64 "$LF = [\\p{Word_Break = LF}];" | |
65 "$Newline = [\\p{Word_Break = Newline}];" | |
66 "$Extend = [\\p{Word_Break = Extend}];" | |
67 "$Format = [\\p{Word_Break = Format}];" | |
68 "$Katakana = [\\p{Word_Break = Katakana}];" | |
69 // Not all the characters in a given script are ALetter. | |
70 // For instance, U+05F4 is MidLetter. So, this may be | |
71 // better, but it leads to an empty set error in Thai. | |
72 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" | |
73 "$ALetter = [\\p{script=%s}%s];" | |
74 // U+0027 (single quote/apostrophe) is not in MidNumLet any more | |
75 // in UAX 29 rev 21 or later. For our purpose, U+0027 | |
76 // has to be treated as MidNumLet. ( http://crbug.com/364072 ) | |
77 "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];" | |
78 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" | |
79 "$MidNum = [\\p{Word_Break = MidNum}];" | |
80 "$Numeric = [\\p{Word_Break = Numeric}];" | |
81 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | |
82 | |
83 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " | |
84 "%s" // ALetterPlus | |
85 | |
86 "$KatakanaEx = $Katakana ($Extend | $Format)*;" | |
87 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" | |
88 "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" | |
89 "$MidLetterEx = $MidLetter ($Extend | $Format)*;" | |
90 "$MidNumEx = $MidNum ($Extend | $Format)*;" | |
91 "$NumericEx = $Numeric ($Extend | $Format)*;" | |
92 "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" | |
93 | |
94 "$Hiragana = [\\p{script=Hiragana}];" | |
95 "$Ideographic = [\\p{Ideographic}];" | |
96 "$HiraganaEx = $Hiragana ($Extend | $Format)*;" | |
97 "$IdeographicEx = $Ideographic ($Extend | $Format)*;" | |
98 | |
99 "!!forward;" | |
100 "$CR $LF;" | |
101 "[^$CR $LF $Newline]? ($Extend | $Format)+;" | |
102 "$ALetterEx {200};" | |
103 "$ALetterEx $ALetterEx {200};" | |
104 "%s" // (Allow|Disallow) Contraction | |
105 | |
106 "!!reverse;" | |
107 "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" | |
108 "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" | |
109 "$BackNumericEx = ($Format | $Extend)* $Numeric;" | |
110 "$BackMidNumEx = ($Format | $Extend)* $MidNum;" | |
111 "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" | |
112 "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" | |
113 "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" | |
114 "$LF $CR;" | |
115 "($Format | $Extend)* [^$CR $LF $Newline]?;" | |
116 "$BackALetterEx $BackALetterEx;" | |
117 "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;" | |
118 "$BackNumericEx $BackNumericEx;" | |
119 "$BackNumericEx $BackALetterEx;" | |
120 "$BackALetterEx $BackNumericEx;" | |
121 "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;" | |
122 "$BackKatakanaEx $BackKatakanaEx;" | |
123 "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |" | |
124 " $BackKatakanaEx | $BackExtendNumLetEx);" | |
125 "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)" | |
126 " $BackExtendNumLetEx;" | |
127 | |
128 "!!safe_reverse;" | |
129 "($Extend | $Format)+ .?;" | |
130 "($MidLetter | $MidNumLet) $BackALetterEx;" | |
131 "($MidNum | $MidNumLet) $BackNumericEx;" | |
132 | |
133 "!!safe_forward;" | |
134 "($Extend | $Format)+ .?;" | |
135 "($MidLetterEx | $MidNumLetEx) $ALetterEx;" | |
136 "($MidNumEx | $MidNumLetEx) $NumericEx;"; | |
137 | |
138 // Retrieve the script codes used by the given language from ICU. When the | |
139 // given language consists of two or more scripts, we just use the first | |
140 // script. The size of returned script codes is always < 8. Therefore, we use | |
141 // an array of size 8 so we can include all script codes without insufficient | |
142 // buffer errors. | |
143 UErrorCode error = U_ZERO_ERROR; | |
144 UScriptCode script_code[8]; | |
145 int scripts = uscript_getCode(language.c_str(), script_code, | |
146 arraysize(script_code), &error); | |
147 if (U_SUCCESS(error) && scripts >= 1) | |
148 script_code_ = script_code[0]; | |
149 | |
150 // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary | |
151 // only for the languages which need it (i.e. Korean and Thai) to prevent ICU | |
152 // from returning dictionary words (i.e. Korean or Thai words) for languages | |
153 // which don't need them. | |
154 const char* aletter = uscript_getName(script_code_); | |
155 if (!aletter) | |
156 aletter = "Latin"; | |
157 | |
158 const char kWithDictionary[] = | |
159 "$dictionary = [:LineBreak = Complex_Context:];" | |
160 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; | |
161 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; | |
162 const char* aletter_plus = kWithoutDictionary; | |
163 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || | |
164 script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) | |
165 aletter_plus = kWithDictionary; | |
166 | |
167 // Treat numbers as word characters except for Arabic and Hebrew. | |
168 const char* aletter_extra = " [0123456789]"; | |
169 if (script_code_ == USCRIPT_HEBREW) | |
170 aletter_extra = ""; | |
171 else if (script_code_ == USCRIPT_ARABIC) | |
172 // When "script=Arabic", it does not include tatweel, which is | |
173 // "script=Common" so add it back. Otherwise, it creates unwanted | |
174 // word breaks. | |
175 aletter_extra = " [\\u0640]"; | |
176 | |
177 const char kMidLetterExtra[] = ""; | |
178 // For Hebrew, treat single/double quoation marks as MidLetter. | |
179 const char kMidLetterExtraHebrew[] = "\"'"; | |
180 const char* midletter_extra = kMidLetterExtra; | |
181 if (script_code_ == USCRIPT_HEBREW) | |
182 midletter_extra = kMidLetterExtraHebrew; | |
183 | |
184 // Create two custom rule-sets: one allows contraction and the other does not. | |
185 // We save these strings in UTF-16 so we can use it without conversions. (ICU | |
186 // needs UTF-16 strings.) | |
187 const char kAllowContraction[] = | |
188 "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; | |
189 const char kDisallowContraction[] = ""; | |
190 | |
191 ruleset_allow_contraction_ = base::ASCIIToUTF16( | |
192 base::StringPrintf(kRuleTemplate, | |
193 aletter, | |
194 aletter_extra, | |
195 midletter_extra, | |
196 aletter_plus, | |
197 kAllowContraction)); | |
198 ruleset_disallow_contraction_ = base::ASCIIToUTF16( | |
199 base::StringPrintf(kRuleTemplate, | |
200 aletter, | |
201 aletter_extra, | |
202 midletter_extra, | |
203 aletter_plus, | |
204 kDisallowContraction)); | |
205 } | |
206 | |
207 bool SpellcheckCharAttribute::OutputChar(UChar c, | |
208 base::string16* output) const { | |
209 // Call the language-specific function if necessary. | |
210 // Otherwise, we call the default one. | |
211 switch (script_code_) { | |
212 case USCRIPT_ARABIC: | |
213 return OutputArabic(c, output); | |
214 | |
215 case USCRIPT_HANGUL: | |
216 return OutputHangul(c, output); | |
217 | |
218 case USCRIPT_HEBREW: | |
219 return OutputHebrew(c, output); | |
220 | |
221 default: | |
222 return OutputDefault(c, output); | |
223 } | |
224 } | |
225 | |
226 bool SpellcheckCharAttribute::OutputArabic(UChar c, | |
227 base::string16* output) const { | |
228 // Include non-Arabic characters (which should trigger a spelling error) | |
229 // and Arabic characters excluding vowel marks and class "Lm". | |
230 // We filter the latter because, while they are "letters", they are | |
231 // optional and so don't affect the correctness of the rest of the word. | |
232 if (!(0x0600 <= c && c <= 0x06FF) || (u_isalpha(c) && c != 0x0640)) | |
233 output->push_back(c); | |
234 return true; | |
235 } | |
236 | |
237 bool SpellcheckCharAttribute::OutputHangul(UChar c, | |
238 base::string16* output) const { | |
239 // Decompose a Hangul character to a Hangul vowel and consonants used by our | |
240 // spellchecker. A Hangul character of Unicode is a ligature consisting of a | |
241 // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G", | |
242 // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as | |
243 // a point of a cubic linear space consisting of (first consonant, vowel, last | |
244 // consonant). Therefore, we can compose a Hangul character from a vowel and | |
245 // two consonants with linear composition: | |
246 // character = 0xAC00 + | |
247 // (first consonant - 0x1100) * 28 * 21 + | |
248 // (vowel - 0x1161) * 28 + | |
249 // (last consonant - 0x11A7); | |
250 // We can also decompose a Hangul character with linear decomposition: | |
251 // first consonant = (character - 0xAC00) / 28 / 21; | |
252 // vowel = (character - 0xAC00) / 28 % 21; | |
253 // last consonant = (character - 0xAC00) % 28; | |
254 // This code is copied from Unicode Standard Annex #15 | |
255 // <http://unicode.org/reports/tr15> and added some comments. | |
256 const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters. | |
257 const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants. | |
258 const int kVBase = 0x1161; // U+1161: the top of Hangul vowels. | |
259 const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants. | |
260 const int kLCount = 19; // The number of Hangul first consonants. | |
261 const int kVCount = 21; // The number of Hangul vowels. | |
262 const int kTCount = 28; // The number of Hangul last consonants. | |
263 const int kNCount = kVCount * kTCount; | |
264 const int kSCount = kLCount * kNCount; | |
265 | |
266 int index = c - kSBase; | |
267 if (index < 0 || index >= kSBase + kSCount) { | |
268 // This is not a Hangul syllable. Call the default output function since we | |
269 // should output this character when it is a Hangul syllable. | |
270 return OutputDefault(c, output); | |
271 } | |
272 | |
273 // This is a Hangul character. Decompose this characters into Hangul vowels | |
274 // and consonants. | |
275 int l = kLBase + index / kNCount; | |
276 int v = kVBase + (index % kNCount) / kTCount; | |
277 int t = kTBase + index % kTCount; | |
278 output->push_back(l); | |
279 output->push_back(v); | |
280 if (t != kTBase) | |
281 output->push_back(t); | |
282 return true; | |
283 } | |
284 | |
285 bool SpellcheckCharAttribute::OutputHebrew(UChar c, | |
286 base::string16* output) const { | |
287 // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds | |
288 // to prevent our Hebrew dictionary from marking a Hebrew word including | |
289 // niqquds as misspelled. (Same as Arabic vowel marks, we need to check | |
290 // niqquds manually and filter them out since their script codes are | |
291 // USCRIPT_HEBREW.) | |
292 // Pass through ASCII single/double quotation marks and Hebrew Geresh and | |
293 // Gershayim. | |
294 if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || | |
295 c == 0x05F4 || c == 0x05F3) | |
296 output->push_back(c); | |
297 return true; | |
298 } | |
299 | |
300 bool SpellcheckCharAttribute::OutputDefault(UChar c, | |
301 base::string16* output) const { | |
302 // Check the script code of this character and output only if it is the one | |
303 // used by the spellchecker language. | |
304 UErrorCode status = U_ZERO_ERROR; | |
305 UScriptCode script_code = uscript_getScript(c, &status); | |
306 if (script_code == script_code_ || script_code == USCRIPT_COMMON) | |
307 output->push_back(c); | |
308 return true; | |
309 } | |
310 | |
311 // SpellcheckWordIterator implementation: | |
312 | |
313 SpellcheckWordIterator::SpellcheckWordIterator() | |
314 : text_(NULL), | |
315 attribute_(NULL), | |
316 iterator_() { | |
317 } | |
318 | |
319 SpellcheckWordIterator::~SpellcheckWordIterator() { | |
320 Reset(); | |
321 } | |
322 | |
323 bool SpellcheckWordIterator::Initialize( | |
324 const SpellcheckCharAttribute* attribute, | |
325 bool allow_contraction) { | |
326 // Create a custom ICU break iterator with empty text used in this object. (We | |
327 // allow setting text later so we can re-use this iterator.) | |
328 DCHECK(attribute); | |
329 const base::string16 rule(attribute->GetRuleSet(allow_contraction)); | |
330 | |
331 // If there is no rule set, the attributes were invalid. | |
332 if (rule.empty()) | |
333 return false; | |
334 | |
335 std::unique_ptr<BreakIterator> iterator( | |
336 new BreakIterator(base::string16(), rule)); | |
337 if (!iterator->Init()) { | |
338 // Since we're not passing in any text, the only reason this could fail | |
339 // is if we fail to parse the rules. Since the rules are hardcoded, | |
340 // that would be a bug in this class. | |
341 NOTREACHED() << "failed to open iterator (broken rules)"; | |
342 return false; | |
343 } | |
344 iterator_ = std::move(iterator); | |
345 | |
346 // Set the character attributes so we can normalize the words extracted by | |
347 // this iterator. | |
348 attribute_ = attribute; | |
349 return true; | |
350 } | |
351 | |
352 bool SpellcheckWordIterator::IsInitialized() const { | |
353 // Return true iff we have an iterator. | |
354 return !!iterator_; | |
355 } | |
356 | |
357 bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | |
358 DCHECK(!!iterator_); | |
359 | |
360 // Set the text to be split by this iterator. | |
361 if (!iterator_->SetText(text, length)) { | |
362 LOG(ERROR) << "failed to set text"; | |
363 return false; | |
364 } | |
365 | |
366 text_ = text; | |
367 return true; | |
368 } | |
369 | |
370 SpellcheckWordIterator::WordIteratorStatus SpellcheckWordIterator::GetNextWord( | |
371 base::string16* word_string, | |
372 int* word_start, | |
373 int* word_length) { | |
374 DCHECK(!!text_); | |
375 | |
376 word_string->clear(); | |
377 *word_start = 0; | |
378 *word_length = 0; | |
379 | |
380 if (!text_) { | |
381 return IS_END_OF_TEXT; | |
382 } | |
383 | |
384 // Find a word that can be checked for spelling or a character that can be | |
385 // skipped over. Rather than moving past a skippable character this returns | |
386 // IS_SKIPPABLE and defers handling the character to the calling function. | |
387 while (iterator_->Advance()) { | |
388 const size_t start = iterator_->prev(); | |
389 const size_t length = iterator_->pos() - start; | |
390 switch (iterator_->GetWordBreakStatus()) { | |
391 case BreakIterator::IS_WORD_BREAK: { | |
392 if (Normalize(start, length, word_string)) { | |
393 *word_start = start; | |
394 *word_length = length; | |
395 return IS_WORD; | |
396 } | |
397 break; | |
398 } | |
399 case BreakIterator::IS_SKIPPABLE_WORD: { | |
400 *word_string = iterator_->GetString(); | |
401 *word_start = start; | |
402 *word_length = length; | |
403 return IS_SKIPPABLE; | |
404 } | |
405 // |iterator_| is RULE_BASED so the break status should never be | |
406 // IS_LINE_OR_CHAR_BREAK. | |
407 case BreakIterator::IS_LINE_OR_CHAR_BREAK: { | |
408 NOTREACHED(); | |
409 break; | |
410 } | |
411 } | |
412 } | |
413 | |
414 // There aren't any more words in the given text. | |
415 return IS_END_OF_TEXT; | |
416 } | |
417 | |
418 void SpellcheckWordIterator::Reset() { | |
419 iterator_.reset(); | |
420 } | |
421 | |
422 bool SpellcheckWordIterator::Normalize(int input_start, | |
423 int input_length, | |
424 base::string16* output_string) const { | |
425 // We use NFKC (Normalization Form, Compatible decomposition, followed by | |
426 // canonical Composition) defined in Unicode Standard Annex #15 to normalize | |
427 // this token because it it the most suitable normalization algorithm for our | |
428 // spellchecker. Nevertheless, it is not a perfect algorithm for our | |
429 // spellchecker and we need manual normalization as well. The normalized | |
430 // text does not have to be NUL-terminated since its characters are copied to | |
431 // string16, which adds a NUL character when we need. | |
432 icu::UnicodeString input(FALSE, &text_[input_start], input_length); | |
433 UErrorCode status = U_ZERO_ERROR; | |
434 icu::UnicodeString output; | |
435 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | |
436 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | |
437 return false; | |
438 | |
439 // Copy the normalized text to the output. | |
440 icu::StringCharacterIterator it(output); | |
441 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | |
442 attribute_->OutputChar(c, output_string); | |
443 | |
444 return !output_string->empty(); | |
445 } | |
OLD | NEW |