| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // Defines an iterator class that enumerates words supported by our spellchecker | |
| 6 // from multi-language text. This class is used for filtering out characters | |
| 7 // not supported by our spellchecker. | |
| 8 | |
| 9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | |
| 10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | |
| 11 | |
| 12 #include <stddef.h> | |
| 13 | |
| 14 #include <memory> | |
| 15 #include <string> | |
| 16 | |
| 17 #include "base/macros.h" | |
| 18 #include "base/strings/string16.h" | |
| 19 #include "third_party/icu/source/common/unicode/uscript.h" | |
| 20 | |
| 21 namespace base { | |
| 22 namespace i18n { | |
| 23 class BreakIterator; | |
| 24 } // namespace i18n | |
| 25 } // namespace base | |
| 26 | |
| 27 // A class which encapsulates language-specific operations used by | |
| 28 // SpellcheckWordIterator. When we set the spellchecker language, this class | |
| 29 // creates rule sets that filter out the characters not supported by the | |
| 30 // spellchecker. (Please read the comment in the SpellcheckWordIterator class | |
| 31 // about how to use this class.) | |
| 32 class SpellcheckCharAttribute { | |
| 33 public: | |
| 34 SpellcheckCharAttribute(); | |
| 35 ~SpellcheckCharAttribute(); | |
| 36 | |
| 37 // Sets the language of the spellchecker. When this function is called with an | |
| 38 // ISO language code, this function creates the custom rule-sets used by | |
| 39 // the ICU break iterator so it can extract only words used by the language. | |
| 40 // GetRuleSet() returns the rule-sets created in this function. | |
| 41 void SetDefaultLanguage(const std::string& language); | |
| 42 | |
| 43 // Returns a custom rule-set string used by the ICU break iterator. This class | |
| 44 // has two rule-sets, one splits a contraction and the other does not, so we | |
| 45 // can split a concatenated word (e.g. "seven-year-old") into words (e.g. | |
| 46 // "seven", "year", and "old") and check their spellings. The result string is | |
| 47 // encoded in UTF-16 since ICU needs UTF-16 strings. | |
| 48 base::string16 GetRuleSet(bool allow_contraction) const; | |
| 49 | |
| 50 // Outputs a character only if it is a word character. (Please read the | |
| 51 // comments on CreateRuleSets() for why we need this function.) | |
| 52 bool OutputChar(UChar c, base::string16* output) const; | |
| 53 | |
| 54 private: | |
| 55 // Creates the rule-sets that return words possibly used by the given | |
| 56 // language. Unfortunately, these rule-sets are not perfect and have some | |
| 57 // false-positives. For example, they return combined accent marks even though | |
| 58 // we need English words only. We call OutputChar() to filter out such | |
| 59 // false-positive characters. | |
| 60 void CreateRuleSets(const std::string& language); | |
| 61 | |
| 62 // Outputs a character only if it is one used by the given language. These | |
| 63 // functions are called from OutputChar(). | |
| 64 bool OutputArabic(UChar c, base::string16* output) const; | |
| 65 bool OutputHangul(UChar c, base::string16* output) const; | |
| 66 bool OutputHebrew(UChar c, base::string16* output) const; | |
| 67 bool OutputDefault(UChar c, base::string16* output) const; | |
| 68 | |
| 69 // The custom rule-set strings used by the ICU break iterator. Since it is not | |
| 70 // so easy to create custom rule-sets from an ISO language code, this class | |
| 71 // saves the rule-set strings created when we set the language. | |
| 72 base::string16 ruleset_allow_contraction_; | |
| 73 base::string16 ruleset_disallow_contraction_; | |
| 74 | |
| 75 // The script code used by this language. | |
| 76 UScriptCode script_code_; | |
| 77 | |
| 78 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | |
| 79 }; | |
| 80 | |
| 81 // A class which extracts words that can be checked for spelling from a | |
| 82 // multi-language string. The ICU word-break iterator does not discard some | |
| 83 // punctuation characters attached to a word. For example, when we set a word | |
| 84 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does | |
| 85 // it discard characters not used by the language. For example, it returns | |
| 86 // Russian words even though we need English words only. To extract only the | |
| 87 // words whose spellings our spellchecker can check, this class uses custom | |
| 88 // rule-sets created by the SpellcheckCharAttribute class. Also, this class | |
| 89 // normalizes extracted words so our spellchecker can check the spellings of | |
| 90 // words that include ligatures, combined characters, full-width characters, | |
| 91 // etc. This class uses UTF-16 strings as its input and output strings since | |
| 92 // UTF-16 is the native encoding of ICU and this avoids unnecessary conversions | |
| 93 // when changing the encoding of this string for our spellchecker. (Chrome can | |
| 94 // use two or more spellcheckers and we cannot assume their encodings.) | |
| 95 // The following snippet is an example that extracts words with this class. | |
| 96 // | |
| 97 // // Creates the language-specific attributes for US English. | |
| 98 // SpellcheckCharAttribute attribute; | |
| 99 // attribute.SetDefaultLanguage("en-US"); | |
| 100 // | |
| 101 // // Set up a SpellcheckWordIterator object which extracts English words, | |
| 102 // // and retrieve them. | |
| 103 // SpellcheckWordIterator iterator; | |
| 104 // base::string16 text(base::UTF8ToUTF16("this is a test.")); | |
| 105 // iterator.Initialize(&attribute, true); | |
| 106 // iterator.SetText(text.c_str(), text.length()); | |
| 107 // | |
| 108 // base::string16 word; | |
| 109 // int offset; | |
| 110 // int length; | |
| 111 // while (iterator.GetNextWord(&word, &offset, &length) != SpellcheckWordIterator::IS_END_OF_TEXT) { | |
| 112 // ... | |
| 113 // } | |
| 114 // | |
| 115 class SpellcheckWordIterator { | |
| 116 public: | |
| 117 enum WordIteratorStatus { | |
| 118 // The end of a sequence of text that the iterator recognizes as characters | |
| 119 // that can form a word. | |
| 120 IS_WORD, | |
| 121 // Non-word characters that the iterator can skip past, such as punctuation, | |
| 122 // whitespace, and characters from another character set. | |
| 123 IS_SKIPPABLE, | |
| 124 // The end of the text that the iterator is going over. | |
| 125 IS_END_OF_TEXT | |
| 126 }; | |
| 127 | |
| 128 SpellcheckWordIterator(); | |
| 129 ~SpellcheckWordIterator(); | |
| 130 | |
| 131 // Initializes a word-iterator object with the language-specific attribute. If | |
| 132 // we need to split contractions and concatenated words, call this function | |
| 133 // with its 'allow_contraction' parameter false. (This function uses lots of | |
| 134 // temporary memory to compile a custom word-break rule into an automaton.) | |
| 135 bool Initialize(const SpellcheckCharAttribute* attribute, | |
| 136 bool allow_contraction); | |
| 137 | |
| 138 // Returns whether this word iterator is initialized. | |
| 139 bool IsInitialized() const; | |
| 140 | |
| 141 // Sets the text to be iterated. (This text does not have to be | |
| 142 // NULL-terminated.) This function also resets internal state so we can reuse | |
| 143 // this iterator without calling Initialize(). | |
| 144 bool SetText(const base::char16* text, size_t length); | |
| 145 | |
| 146 // Advances |iterator_| through |text_| and gets the current status of the | |
| 147 // word iterator within |text_|: | |
| 148 // | |
| 149 // - Returns IS_WORD if the iterator just found the end of a sequence of word | |
| 150 // characters and it was able to normalize the sequence. This stores the | |
| 151 // normalized string into |word_string| and stores the position and length | |
| 152 // into |word_start| and |word_length| respectively. Keep in mind that | |
| 153 // since this function normalizes the output word, the length of | |
| 154 // |word_string| may be different from the |word_length|. Therefore, when | |
| 155 // we call functions that change the input text, such as | |
| 156 // string16::replace(), we need to use |word_start| and |word_length| as | |
| 157 // listed in the following snippet: | |
| 158 // | |
| 159 // if (iterator.GetNextWord(&word, &offset, &length) == IS_WORD) | |
| 160 // text.replace(offset, length, word); | |
| 161 // | |
| 162 // - Returns IS_SKIPPABLE if the iterator just found a character that the | |
| 163 // iterator can skip past such as punctuation, whitespace, and characters | |
| 164 // from another character set. This stores the character, position, and | |
| 165 // length into |word_string|, |word_start|, and |word_length| respectively. | |
| 166 // | |
| 167 // - Returns IS_END_OF_TEXT if the iterator has reached the end of |text_|. | |
| 168 SpellcheckWordIterator::WordIteratorStatus | |
| 169 GetNextWord(base::string16* word_string, int* word_start, int* word_length); | |
| 170 | |
| 171 // Releases all the resources attached to this object. | |
| 172 void Reset(); | |
| 173 | |
| 174 private: | |
| 175 // Normalizes a non-terminated string returned from an ICU word-break | |
| 176 // iterator. A word returned from an ICU break iterator may include characters | |
| 177 // not supported by our spellchecker, e.g. ligatures, combining characters, | |
| 178 // full-width letters, etc. This function replaces such characters with | |
| 179 // alternative characters supported by our spellchecker. This function also | |
| 180 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive | |
| 181 // characters. | |
| 182 bool Normalize(int input_start, | |
| 183 int input_length, | |
| 184 base::string16* output_string) const; | |
| 185 | |
| 186 // The pointer to the input string from which we are extracting words. | |
| 187 const base::char16* text_; | |
| 188 | |
| 189 // The language-specific attributes used for filtering out non-word | |
| 190 // characters. | |
| 191 const SpellcheckCharAttribute* attribute_; | |
| 192 | |
| 193 // The break iterator. | |
| 194 std::unique_ptr<base::i18n::BreakIterator> iterator_; | |
| 195 | |
| 196 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | |
| 197 }; | |
| 198 | |
| 199 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | |
| 200 | |
| OLD | NEW |