OLD | NEW |
| (Empty) |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // Defines an iterator class that enumerates words supported by our spellchecker | |
6 // from multi-language text. This class is used for filtering out characters | |
7 // not supported by our spellchecker. | |
8 | |
9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | |
10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | |
11 | |
12 #include <stddef.h> | |
13 | |
14 #include <memory> | |
15 #include <string> | |
16 | |
17 #include "base/macros.h" | |
18 #include "base/strings/string16.h" | |
19 #include "third_party/icu/source/common/unicode/uscript.h" | |
20 | |
21 namespace base { | |
22 namespace i18n { | |
23 class BreakIterator; | |
24 } // namespace i18n | |
25 } // namespace base | |
26 | |
27 // A class which encapsulates language-specific operations used by | |
28 // SpellcheckWordIterator. When we set the spellchecker language, this class | |
29 // creates rule sets that filter out the characters not supported by the | |
30 // spellchecker. (Please read the comment in the SpellcheckWordIterator class | |
31 // about how to use this class.) | |
32 class SpellcheckCharAttribute { | |
33 public: | |
34 SpellcheckCharAttribute(); | |
35 ~SpellcheckCharAttribute(); | |
36 | |
37 // Sets the language of the spellchecker. When this function is called with an | |
38 // ISO language code, this function creates the custom rule-sets used by | |
39 // the ICU break iterator so it can extract only words used by the language. | |
40 // GetRuleSet() returns the rule-sets created in this function. | |
41 void SetDefaultLanguage(const std::string& language); | |
42 | |
43 // Returns a custom rule-set string used by the ICU break iterator. This class | |
44 // has two rule-sets, one splits a contraction and the other does not, so we | |
45 // can split a concatenated word (e.g. "seven-year-old") into words (e.g. | |
46 // "seven", "year", and "old") and check their spellings. The result string is | |
47 // encoded in UTF-16 since ICU needs UTF-16 strings. | |
48 base::string16 GetRuleSet(bool allow_contraction) const; | |
49 | |
50 // Outputs a character only if it is a word character. (Please read the | |
51 // comments in CreateRuleSets() for why we need this function.) | |
52 bool OutputChar(UChar c, base::string16* output) const; | |
53 | |
54 private: | |
55 // Creates the rule-sets that return words possibly used by the given | |
56 // language. Unfortunately, these rule-sets are not perfect and have some | |
57 // false-positives. For example, they return combined accent marks even though | |
58 // we need English words only. We call OutputChar() to filter out such | |
59 // false-positive characters. | |
60 void CreateRuleSets(const std::string& language); | |
61 | |
62 // Outputs a character only if it is one used by the given language. These | |
63 // functions are called from OutputChar(). | |
64 bool OutputArabic(UChar c, base::string16* output) const; | |
65 bool OutputHangul(UChar c, base::string16* output) const; | |
66 bool OutputHebrew(UChar c, base::string16* output) const; | |
67 bool OutputDefault(UChar c, base::string16* output) const; | |
68 | |
69 // The custom rule-set strings used by ICU break iterator. Since it is not so | |
70 // easy to create custom rule-sets from an ISO language code, this class | |
71 // saves these rule-set strings created when we set the language. | |
72 base::string16 ruleset_allow_contraction_; | |
73 base::string16 ruleset_disallow_contraction_; | |
74 | |
75 // The script code used by this language. | |
76 UScriptCode script_code_; | |
77 | |
78 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | |
79 }; | |
80 | |
81 // A class which extracts words that can be checked for spelling from a | |
82 // multi-language string. The ICU word-break iterator does not discard some | |
83 // punctuation characters attached to a word. For example, when we set a word | |
84 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does | |
85 // it discard characters not used by the language. For example, it returns | |
86 // Russian words even though we need English words only. To extract only the | |
87 // words whose spellings our spellchecker can check, this class uses custom | |
88 // rule-sets created by the SpellcheckCharAttribute class. Also, this class | |
89 // normalizes extracted words so our spellchecker can check the spellings of | |
90 // words that include ligatures, combined characters, full-width characters, | |
91 // etc. This class uses UTF-16 strings as its input and output strings since | |
92 // UTF-16 is the native encoding of ICU and avoids unnecessary conversions | |
93 // when changing the encoding of this string for our spellchecker. (Chrome can | |
94 // use two or more spellcheckers and we cannot assume their encodings.) | |
95 // The following snippet is an example that extracts words with this class. | |
96 // | |
97 // // Creates the language-specific attributes for US English. | |
98 // SpellcheckCharAttribute attribute; | |
99 // attribute.SetDefaultLanguage("en-US"); | |
100 // | |
101 // // Set up a SpellcheckWordIterator object which extracts English words, | |
102 // // and retrieve them. | |
103 // SpellcheckWordIterator iterator; | |
104 // base::string16 text(base::UTF8ToUTF16("this is a test.")); | |
105 // iterator.Initialize(&attribute, true); | |
106 // iterator.SetText(text.c_str(), text.length()); | |
107 // | |
108 // base::string16 word; | |
109 // int offset; | |
110 // int length; | |
111 // while (iterator.GetNextWord(&word, &offset, &length) != SpellcheckWordIterator::IS_END_OF_TEXT) { | |
112 // ... | |
113 // } | |
114 // | |
115 class SpellcheckWordIterator { | |
116 public: | |
117 enum WordIteratorStatus { | |
118 // The end of a sequence of text that the iterator recognizes as characters | |
119 // that can form a word. | |
120 IS_WORD, | |
121 // Non-word characters that the iterator can skip past, such as punctuation, | |
122 // whitespace, and characters from another character set. | |
123 IS_SKIPPABLE, | |
124 // The end of the text that the iterator is going over. | |
125 IS_END_OF_TEXT | |
126 }; | |
127 | |
128 SpellcheckWordIterator(); | |
129 ~SpellcheckWordIterator(); | |
130 | |
131 // Initializes a word-iterator object with the language-specific attribute. If | |
132 // we need to split contractions and concatenated words, call this function | |
133 // with its 'allow_contraction' parameter false. (This function uses lots of | |
134 // temporary memory to compile a custom word-break rule into an automaton.) | |
135 bool Initialize(const SpellcheckCharAttribute* attribute, | |
136 bool allow_contraction); | |
137 | |
138 // Returns whether this word iterator is initialized. | |
139 bool IsInitialized() const; | |
140 | |
141 // Set text to be iterated. (This text does not have to be NULL-terminated.) | |
142 // This function also resets internal state so we can reuse this iterator | |
143 // without calling Initialize(). | |
144 bool SetText(const base::char16* text, size_t length); | |
145 | |
146 // Advances |iterator_| through |text_| and gets the current status of the | |
147 // word iterator within |text_|: | |
148 // | |
149 // - Returns IS_WORD if the iterator just found the end of a sequence of word | |
150 // characters and it was able to normalize the sequence. This stores the | |
151 // normalized string into |word_string| and stores the position and length | |
152 // into |word_start| and |word_length| respectively. Keep in mind that | |
153 // since this function normalizes the output word, the length of | |
154 // |word_string| may be different from the |word_length|. Therefore, when | |
155 // we call functions that change the input text, such as | |
156 // string16::replace(), we need to use |word_start| and |word_length| as | |
157 // listed in the following snippet: | |
158 // | |
159 // while (iterator.GetNextWord(&word, &offset, &length) != SpellcheckWordIterator::IS_END_OF_TEXT) | |
160 // text.replace(offset, length, word); | |
161 // | |
162 // - Returns IS_SKIPPABLE if the iterator just found a character that the | |
163 // iterator can skip past such as punctuation, whitespace, and characters | |
164 // from another character set. This stores the character, position, and | |
165 // length into |word_string|, |word_start|, and |word_length| respectively. | |
166 // | |
167 // - Returns IS_END_OF_TEXT if the iterator has reached the end of |text_|. | |
168 SpellcheckWordIterator::WordIteratorStatus | |
169 GetNextWord(base::string16* word_string, int* word_start, int* word_length); | |
170 | |
171 // Releases all the resources attached to this object. | |
172 void Reset(); | |
173 | |
174 private: | |
175 // Normalizes a non-terminated string returned from an ICU word-break | |
176 // iterator. A word returned from an ICU break iterator may include characters | |
177 // not supported by our spellchecker, e.g. ligatures, combining characters, | |
178 // full-width letters, etc. This function replaces such characters with | |
179 // alternative characters supported by our spellchecker. This function also | |
180 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive | |
181 // characters. | |
182 bool Normalize(int input_start, | |
183 int input_length, | |
184 base::string16* output_string) const; | |
185 | |
186 // The pointer to the input string from which we are extracting words. | |
187 const base::char16* text_; | |
188 | |
189 // The language-specific attributes used for filtering out non-word | |
190 // characters. | |
191 const SpellcheckCharAttribute* attribute_; | |
192 | |
193 // The break iterator. | |
194 std::unique_ptr<base::i18n::BreakIterator> iterator_; | |
195 | |
196 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | |
197 }; | |
198 | |
199 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | |
200 | |
OLD | NEW |