OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Defines an iterator class that enumerates words supported by our spellchecker | 5 // Defines an iterator class that enumerates words supported by our spellchecker |
6 // from multi-language text. This class is used for filtering out characters | 6 // from multi-language text. This class is used for filtering out characters |
7 // not supported by our spellchecker. | 7 // not supported by our spellchecker. |
8 | 8 |
9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | 9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | 10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
(...skipping 19 matching lines...) Expand all Loading... |
30 // ISO language code, this function creates the custom rule-sets used by | 30 // ISO language code, this function creates the custom rule-sets used by |
31 // the ICU break iterator so it can extract only words used by the language. | 31 // the ICU break iterator so it can extract only words used by the language. |
32 // GetRuleSet() returns the rule-sets created in this function. | 32 // GetRuleSet() returns the rule-sets created in this function. |
33 void SetDefaultLanguage(const std::string& language); | 33 void SetDefaultLanguage(const std::string& language); |
34 | 34 |
35 // Returns a custom rule-set string used by the ICU break iterator. This class | 35 // Returns a custom rule-set string used by the ICU break iterator. This class |
36 // has two rule-sets, one splits a contraction and the other does not, so we | 36 // has two rule-sets, one splits a contraction and the other does not, so we |
37 // can split a concatenated word (e.g. "seven-year-old") into words (e.g. | 37 // can split a concatenated word (e.g. "seven-year-old") into words (e.g. |
38 // "seven", "year", and "old") and check their spellings. The result string is | 38 // "seven", "year", and "old") and check their spellings. The result string is |
39 // encoded in UTF-16 since ICU needs UTF-16 strings. | 39 // encoded in UTF-16 since ICU needs UTF-16 strings. |
40 string16 GetRuleSet(bool allow_contraction) const; | 40 base::string16 GetRuleSet(bool allow_contraction) const; |
41 | 41 |
42 // Outputs a character only if it is a word character. (Please read the | 42 // Outputs a character only if it is a word character. (Please read the |
43 // comments on CreateRuleSets() for why we need this function.) | 43 // comments on CreateRuleSets() for why we need this function.) |
44 bool OutputChar(UChar c, string16* output) const; | 44 bool OutputChar(UChar c, base::string16* output) const; |
45 | 45 |
46 private: | 46 private: |
47 // Creates the rule-sets that return words possibly used by the given | 47 // Creates the rule-sets that return words possibly used by the given |
48 // language. Unfortunately, these rule-sets are not perfect and have some | 48 // language. Unfortunately, these rule-sets are not perfect and have some |
49 // false-positives. For example, they return combined accent marks even though | 49 // false-positives. For example, they return combined accent marks even though |
50 // we need English words only. We call OutputChar() to filter out such | 50 // we need English words only. We call OutputChar() to filter out such |
51 // false-positive characters. | 51 // false-positive characters. |
52 void CreateRuleSets(const std::string& language); | 52 void CreateRuleSets(const std::string& language); |
53 | 53 |
54 // Outputs a character only if it is one used by the given language. These | 54 // Outputs a character only if it is one used by the given language. These |
55 // functions are called from OutputChar(). | 55 // functions are called from OutputChar(). |
56 bool OutputArabic(UChar c, string16* output) const; | 56 bool OutputArabic(UChar c, base::string16* output) const; |
57 bool OutputHangul(UChar c, string16* output) const; | 57 bool OutputHangul(UChar c, base::string16* output) const; |
58 bool OutputHebrew(UChar c, string16* output) const; | 58 bool OutputHebrew(UChar c, base::string16* output) const; |
59 bool OutputDefault(UChar c, string16* output) const; | 59 bool OutputDefault(UChar c, base::string16* output) const; |
60 | 60 |
61 // The custom rule-set strings used by ICU break iterator. Since it is not so | 61 // The custom rule-set strings used by ICU break iterator. Since it is not so |
62 // easy to create custom rule-sets from an ISO language code, this class | 62 // easy to create custom rule-sets from an ISO language code, this class |
63 // saves these rule-set strings created when we set the language. | 63 // saves these rule-set strings created when we set the language. |
64 string16 ruleset_allow_contraction_; | 64 base::string16 ruleset_allow_contraction_; |
65 string16 ruleset_disallow_contraction_; | 65 base::string16 ruleset_disallow_contraction_; |
66 | 66 |
67 // The script code used by this language. | 67 // The script code used by this language. |
68 UScriptCode script_code_; | 68 UScriptCode script_code_; |
69 | 69 |
70 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | 70 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); |
71 }; | 71 }; |
72 | 72 |
73 // A class which extracts words that can be checked for spelling from a | 73 // A class which extracts words that can be checked for spelling from a |
74 // multi-language string. The ICU word-break iterator does not discard some | 74 // multi-language string. The ICU word-break iterator does not discard some |
75 // punctuation characters attached to a word. For example, when we set a word | 75 // punctuation characters attached to a word. For example, when we set a word |
(...skipping 10 matching lines...) Expand all Loading... |
86 // use two or more spellcheckers and we cannot assume their encodings.) | 86 // use two or more spellcheckers and we cannot assume their encodings.) |
87 // The following snippet is an example that extracts words with this class. | 87 // The following snippet is an example that extracts words with this class. |
88 // | 88 // |
89 // // Creates the language-specific attributes for US English. | 89 // // Creates the language-specific attributes for US English. |
90 // SpellcheckCharAttribute attribute; | 90 // SpellcheckCharAttribute attribute; |
91 // attribute.SetDefaultLanguage("en-US"); | 91 // attribute.SetDefaultLanguage("en-US"); |
92 // | 92 // |
93 // // Set up a SpellcheckWordIterator object which extracts English words, | 93 // // Set up a SpellcheckWordIterator object which extracts English words, |
94 // // and retrieve them. | 94 // // and retrieve them. |
95 // SpellcheckWordIterator iterator; | 95 // SpellcheckWordIterator iterator; |
96 // string16 text(UTF8ToUTF16("this is a test.")); | 96 // base::string16 text(UTF8ToUTF16("this is a test.")); |
97 // iterator.Initialize(&attribute, true); | 97 // iterator.Initialize(&attribute, true); |
98 // iterator.SetText(text.c_str(), text.length()); | 98 // iterator.SetText(text.c_str(), text.length()); |
99 // | 99 // |
100 // string16 word; | 100 // base::string16 word; |
101 // int offset; | 101 // int offset; |
102 // int length; | 102 // int length; |
103 // while (iterator.GetNextWord(&word, &offset, &length)) { | 103 // while (iterator.GetNextWord(&word, &offset, &length)) { |
104 // ... | 104 // ... |
105 // } | 105 // } |
106 // | 106 // |
107 class SpellcheckWordIterator { | 107 class SpellcheckWordIterator { |
108 public: | 108 public: |
109 SpellcheckWordIterator(); | 109 SpellcheckWordIterator(); |
110 ~SpellcheckWordIterator(); | 110 ~SpellcheckWordIterator(); |
(...skipping 16 matching lines...) Expand all Loading... |
127 // Retrieves a word (or a contraction), stores its copy to 'word_string', and | 127 // Retrieves a word (or a contraction), stores its copy to 'word_string', and |
128 // stores the position and the length of the input word to 'word_start' and | 128 // stores the position and the length of the input word to 'word_start' and |
129 // 'word_length'. Since this function normalizes the output word, the length | 129 // 'word_length'. Since this function normalizes the output word, the length |
130 // of 'word_string' may be different from 'word_length'. Therefore, when we | 130 // of 'word_string' may be different from 'word_length'. Therefore, when we |
131 // call functions that change the input text, such as string16::replace(), we | 131 // call functions that change the input text, such as string16::replace(), we |
132 // need to use 'word_start' and 'word_length' as in the following snippet. | 132 // need to use 'word_start' and 'word_length' as in the following snippet. |
133 // | 133 // |
134 // while(iterator.GetNextWord(&word, &offset, &length)) | 134 // while(iterator.GetNextWord(&word, &offset, &length)) |
135 // text.replace(offset, length, word); | 135 // text.replace(offset, length, word); |
136 // | 136 // |
137 bool GetNextWord(string16* word_string, | 137 bool GetNextWord(base::string16* word_string, |
138 int* word_start, | 138 int* word_start, |
139 int* word_length); | 139 int* word_length); |
140 | 140 |
141 // Releases all the resources attached to this object. | 141 // Releases all the resources attached to this object. |
142 void Reset(); | 142 void Reset(); |
143 | 143 |
144 private: | 144 private: |
145 // Normalizes a non-terminated string returned from an ICU word-break | 145 // Normalizes a non-terminated string returned from an ICU word-break |
146 // iterator. A word returned from an ICU break iterator may include characters | 146 // iterator. A word returned from an ICU break iterator may include characters |
147 // not supported by our spellchecker, e.g. ligatures, combining characters, | 147 // not supported by our spellchecker, e.g. ligatures, combining characters, |
148 // full-width letters, etc. This function replaces such characters with | 148 // full-width letters, etc. This function replaces such characters with |
149 // alternative characters supported by our spellchecker. This function also | 149 // alternative characters supported by our spellchecker. This function also |
150 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive | 150 // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive |
151 // characters. | 151 // characters. |
152 bool Normalize(int input_start, | 152 bool Normalize(int input_start, |
153 int input_length, | 153 int input_length, |
154 string16* output_string) const; | 154 base::string16* output_string) const; |
155 | 155 |
156 // The pointer to the input string from which we are extracting words. | 156 // The pointer to the input string from which we are extracting words. |
157 const char16* text_; | 157 const char16* text_; |
158 | 158 |
159 // The length of the original string. | 159 // The length of the original string. |
160 int length_; | 160 int length_; |
161 | 161 |
162 // The current position in the original string. | 162 // The current position in the original string. |
163 int position_; | 163 int position_; |
164 | 164 |
165 // The language-specific attributes used for filtering out non-word | 165 // The language-specific attributes used for filtering out non-word |
166 // characters. | 166 // characters. |
167 const SpellcheckCharAttribute* attribute_; | 167 const SpellcheckCharAttribute* attribute_; |
168 | 168 |
169 // The ICU break iterator. | 169 // The ICU break iterator. |
170 UBreakIterator* iterator_; | 170 UBreakIterator* iterator_; |
171 | 171 |
172 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | 172 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); |
173 }; | 173 }; |
174 | 174 |
175 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | 175 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ |
176 | 176 |
OLD | NEW |