| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Defines an iterator class that enumerates words supported by our spellchecker | 5 // Defines an iterator class that enumerates words supported by our spellchecker |
| 6 // from multi-language text. This class is used for filtering out characters | 6 // from multi-language text. This class is used for filtering out characters |
| 7 // not supported by our spellchecker. | 7 // not supported by our spellchecker. |
| 8 | 8 |
| 9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | 9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
| 10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | 10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
| (...skipping 19 matching lines...) Expand all Loading... |
| 30 // ISO language code, this function creates the custom rule-sets used by | 30 // ISO language code, this function creates the custom rule-sets used by |
| 31 // the ICU break iterator so it can extract only words used by the language. | 31 // the ICU break iterator so it can extract only words used by the language. |
| 32 // GetRuleSet() returns the rule-sets created in this function. | 32 // GetRuleSet() returns the rule-sets created in this function. |
| 33 void SetDefaultLanguage(const std::string& language); | 33 void SetDefaultLanguage(const std::string& language); |
| 34 | 34 |
| 35 // Returns a custom rule-set string used by the ICU break iterator. This class | 35 // Returns a custom rule-set string used by the ICU break iterator. This class |
| 36 // has two rule-sets, one splits a contraction and the other does not, so we | 36 // has two rule-sets, one splits a contraction and the other does not, so we |
| 37 // can split a concatenated word (e.g. "seven-year-old") into words (e.g. | 37 // can split a concatenated word (e.g. "seven-year-old") into words (e.g. |
| 38 // "seven", "year", and "old") and check their spellings. The result string is | 38 // "seven", "year", and "old") and check their spellings. The result string is |
| 39 // encoded in UTF-16 since ICU needs UTF-16 strings. | 39 // encoded in UTF-16 since ICU needs UTF-16 strings. |
| 40 string16 GetRuleSet(bool allow_contraction) const; | 40 base::string16 GetRuleSet(bool allow_contraction) const; |
| 41 | 41 |
| 42 // Outputs a character only if it is a word character. (Please read the | 42 // Outputs a character only if it is a word character. (Please read the |
| 43 // comments in CreateRuleSets() why we need this function.) | 43 // comments in CreateRuleSets() why we need this function.) |
| 44 bool OutputChar(UChar c, string16* output) const; | 44 bool OutputChar(UChar c, base::string16* output) const; |
| 45 | 45 |
| 46 private: | 46 private: |
| 47 // Creates the rule-sets that return words possibly used by the given | 47 // Creates the rule-sets that return words possibly used by the given |
| 48 // language. Unfortunately, these rule-sets are not perfect and have some | 48 // language. Unfortunately, these rule-sets are not perfect and have some |
| 49 // false-positives. For example, they return combined accent marks even though | 49 // false-positives. For example, they return combined accent marks even though |
| 50 // we need English words only. We call OutputChar() to filter out such | 50 // we need English words only. We call OutputChar() to filter out such |
| 51 // false-positive characters. | 51 // false-positive characters. |
| 52 void CreateRuleSets(const std::string& language); | 52 void CreateRuleSets(const std::string& language); |
| 53 | 53 |
| 54 // Outputs a character only if it is one used by the given language. These | 54 // Outputs a character only if it is one used by the given language. These |
| 55 // functions are called from OutputChar(). | 55 // functions are called from OutputChar(). |
| 56 bool OutputArabic(UChar c, string16* output) const; | 56 bool OutputArabic(UChar c, base::string16* output) const; |
| 57 bool OutputHangul(UChar c, string16* output) const; | 57 bool OutputHangul(UChar c, base::string16* output) const; |
| 58 bool OutputHebrew(UChar c, string16* output) const; | 58 bool OutputHebrew(UChar c, base::string16* output) const; |
| 59 bool OutputDefault(UChar c, string16* output) const; | 59 bool OutputDefault(UChar c, base::string16* output) const; |
| 60 | 60 |
| 61 // The custom rule-set strings used by ICU break iterator. Since it is not so | 61 // The custom rule-set strings used by ICU break iterator. Since it is not so |
| 62 // easy to create custom rule-sets from an ISO language code, this class | 62 // easy to create custom rule-sets from an ISO language code, this class |
| 63 // saves these rule-set strings created when we set the language. | 63 // saves these rule-set strings created when we set the language. |
| 64 string16 ruleset_allow_contraction_; | 64 base::string16 ruleset_allow_contraction_; |
| 65 string16 ruleset_disallow_contraction_; | 65 base::string16 ruleset_disallow_contraction_; |
| 66 | 66 |
| 67 // The script code used by this language. | 67 // The script code used by this language. |
| 68 UScriptCode script_code_; | 68 UScriptCode script_code_; |
| 69 | 69 |
| 70 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | 70 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); |
| 71 }; | 71 }; |
| 72 | 72 |
| 73 // A class which extracts words that can be checked for spelling from a | 73 // A class which extracts words that can be checked for spelling from a |
| 74 // multi-language string. The ICU word-break iterator does not discard some | 74 // multi-language string. The ICU word-break iterator does not discard some |
| 75 // punctuation characters attached to a word. For example, when we set a word | 75 // punctuation characters attached to a word. For example, when we set a word |
| (...skipping 10 matching lines...) Expand all Loading... |
| 86 // use two or more spellcheckers and we cannot assume their encodings.) | 86 // use two or more spellcheckers and we cannot assume their encodings.) |
| 87 // The following snippet is an example that extracts words with this class. | 87 // The following snippet is an example that extracts words with this class. |
| 88 // | 88 // |
| 89 // // Creates the language-specific attributes for US English. | 89 // // Creates the language-specific attributes for US English. |
| 90 // SpellcheckCharAttribute attribute; | 90 // SpellcheckCharAttribute attribute; |
| 91 // attribute.SetDefaultLanguage("en-US"); | 91 // attribute.SetDefaultLanguage("en-US"); |
| 92 // | 92 // |
| 93 // // Set up a SpellcheckWordIterator object which extracts English words, | 93 // // Set up a SpellcheckWordIterator object which extracts English words, |
| 94 // // and retrieve them. | 94 // // and retrieve them. |
| 95 // SpellcheckWordIterator iterator; | 95 // SpellcheckWordIterator iterator; |
| 96 // string16 text(UTF8ToUTF16("this is a test.")); | 96 // base::string16 text(UTF8ToUTF16("this is a test.")); |
| 97 // iterator.Initialize(&attribute, true); | 97 // iterator.Initialize(&attribute, true); |
| 98 // iterator.SetText(text.c_str(), text.length()); | 98 // iterator.SetText(text.c_str(), text.length()); |
| 99 // | 99 // |
| 100 // string16 word; | 100 // base::string16 word; |
| 101 // int offset; | 101 // int offset; |
| 102 // int length; | 102 // int length; |
| 103 // while (iterator.GetNextWord(&word, &offset, &length)) { | 103 // while (iterator.GetNextWord(&word, &offset, &length)) { |
| 104 // ... | 104 // ... |
| 105 // } | 105 // } |
| 106 // | 106 // |
| 107 class SpellcheckWordIterator { | 107 class SpellcheckWordIterator { |
| 108 public: | 108 public: |
| 109 SpellcheckWordIterator(); | 109 SpellcheckWordIterator(); |
| 110 ~SpellcheckWordIterator(); | 110 ~SpellcheckWordIterator(); |
| (...skipping 16 matching lines...) Expand all Loading... |
| 127 // Retrieves a word (or a contraction), stores its copy to 'word_string', and | 127 // Retrieves a word (or a contraction), stores its copy to 'word_string', and |
| 128 // stores the position and the length of the input word to 'word_start' and | 128 // stores the position and the length of the input word to 'word_start' and |
| 129 // 'word_length'. Since this function normalizes the output word, the length | 129 // 'word_length'. Since this function normalizes the output word, the length |
| 130 // of 'word_string' may differ from 'word_length'. Therefore, when we call | 130 // of 'word_string' may differ from 'word_length'. Therefore, when we call |
| 131 // functions that change the input text, such as string16::replace(), we need | 131 // functions that change the input text, such as string16::replace(), we need |
| 132 // to use 'word_start' and 'word_length' as listed in the following snippet. | 132 // to use 'word_start' and 'word_length' as listed in the following snippet. |
| 133 // | 133 // |
| 134 // while(iterator.GetNextWord(&word, &offset, &length)) | 134 // while(iterator.GetNextWord(&word, &offset, &length)) |
| 135 // text.replace(offset, length, word); | 135 // text.replace(offset, length, word); |
| 136 // | 136 // |
| 137 bool GetNextWord(string16* word_string, | 137 bool GetNextWord(base::string16* word_string, |
| 138 int* word_start, | 138 int* word_start, |
| 139 int* word_length); | 139 int* word_length); |
| 140 | 140 |
| 141 // Releases all the resources attached to this object. | 141 // Releases all the resources attached to this object. |
| 142 void Reset(); | 142 void Reset(); |
| 143 | 143 |
| 144 private: | 144 private: |
| 145 // Normalizes a non-terminated string returned from an ICU word-break | 145 // Normalizes a non-terminated string returned from an ICU word-break |
| 146 // iterator. A word returned from an ICU break iterator may include characters | 146 // iterator. A word returned from an ICU break iterator may include characters |
| 147 // not supported by our spellchecker, e.g. ligatures, combining characters, | 147 // not supported by our spellchecker, e.g. ligatures, combining characters, |
| 148 // full-width letters, etc. This function replaces such characters with | 148 // full-width letters, etc. This function replaces such characters with |
| 149 // alternative characters supported by our spellchecker. This function also | 149 // alternative characters supported by our spellchecker. This function also |
| 150 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive | 150 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive |
| 151 // characters. | 151 // characters. |
| 152 bool Normalize(int input_start, | 152 bool Normalize(int input_start, |
| 153 int input_length, | 153 int input_length, |
| 154 string16* output_string) const; | 154 base::string16* output_string) const; |
| 155 | 155 |
| 156 // The pointer to the input string from which we are extracting words. | 156 // The pointer to the input string from which we are extracting words. |
| 157 const char16* text_; | 157 const char16* text_; |
| 158 | 158 |
| 159 // The length of the original string. | 159 // The length of the original string. |
| 160 int length_; | 160 int length_; |
| 161 | 161 |
| 162 // The current position in the original string. | 162 // The current position in the original string. |
| 163 int position_; | 163 int position_; |
| 164 | 164 |
| 165 // The language-specific attributes used for filtering out non-word | 165 // The language-specific attributes used for filtering out non-word |
| 166 // characters. | 166 // characters. |
| 167 const SpellcheckCharAttribute* attribute_; | 167 const SpellcheckCharAttribute* attribute_; |
| 168 | 168 |
| 169 // The ICU break iterator. | 169 // The ICU break iterator. |
| 170 UBreakIterator* iterator_; | 170 UBreakIterator* iterator_; |
| 171 | 171 |
| 172 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | 172 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); |
| 173 }; | 173 }; |
| 174 | 174 |
| 175 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | 175 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
| 176 | 176 |
| OLD | NEW |