OLD | NEW |
| (Empty) |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/spellcheck_worditerator.h" | |
6 | |
7 #include <map> | |
8 #include <string> | |
9 | |
10 #include "base/basictypes.h" | |
11 #include "base/string_util.h" | |
12 #include "chrome/browser/spellchecker.h" | |
13 | |
14 #include "third_party/icu/public/common/unicode/normlzr.h" | |
15 #include "third_party/icu/public/common/unicode/schriter.h" | |
16 #include "third_party/icu/public/common/unicode/uchar.h" | |
17 #include "third_party/icu/public/common/unicode/uscript.h" | |
18 #include "third_party/icu/public/common/unicode/uset.h" | |
19 #include "third_party/icu/public/i18n/unicode/ulocdata.h" | |
20 | |
21 SpellcheckCharAttribute::SpellcheckCharAttribute() { | |
22 InitializeScriptTable(); | |
23 | |
24 // Even though many dictionaries treats numbers and contractions as words and | |
25 // treats USCRIPT_COMMON characters as word characters, the | |
26 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word | |
27 // characters to strictly-distinguish contraction characters from word | |
28 // characters. | |
29 SetWordScript(USCRIPT_COMMON, false); | |
30 | |
31 // Initialize the table of characters used for contractions. | |
32 // This array consists of the 'Midletter' and 'MidNumLet' characters of the | |
33 // word-break property list provided by Unicode, Inc.: | |
34 // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt | |
35 static const UChar32 kMidLetters[] = { | |
36 L'\x003A', // MidLetter # COLON | |
37 L'\x00B7', // MidLetter # MIDDLE DOT | |
38 L'\x0387', // MidLetter # GREEK ANO TELEIA | |
39 L'\x05F4', // MidLetter # HEBREW PUNCTUATION GERSHAYIM | |
40 L'\x2027', // MidLetter # HYPHENATION POINT | |
41 L'\xFE13', // MidLetter # PRESENTATION FORM FOR VERTICAL COLON | |
42 L'\xFE55', // MidLetter # SMALL COLON | |
43 L'\xFF1A', // MidLetter # FULLWIDTH COLON | |
44 L'\x0027', // MidNumLet # APOSTROPHE | |
45 L'\x002E', // MidNumLet # FULL STOP | |
46 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK | |
47 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK | |
48 L'\x2024', // MidNumLet # ONE DOT LEADER | |
49 L'\xFE52', // MidNumLet # SMALL FULL STOP | |
50 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE | |
51 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP | |
52 }; | |
53 for (size_t i = 0; i < arraysize(kMidLetters); ++i) | |
54 middle_letters_[kMidLetters[i]] = true; | |
55 } | |
56 | |
57 SpellcheckCharAttribute::~SpellcheckCharAttribute() { | |
58 } | |
59 | |
60 // Sets the default language for this object. | |
61 // This function retrieves the exemplar set to set up the default character | |
62 // attributes. | |
63 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { | |
64 UErrorCode status = U_ZERO_ERROR; | |
65 ULocaleData* locale_data = ulocdata_open(language.c_str(), &status); | |
66 if (U_FAILURE(status)) | |
67 return; | |
68 | |
69 // Retrieves the exemplar set of the given language and update the | |
70 // character-attribute table to treat its characters as word characters. | |
71 USet* exemplar_set = uset_open(1, 0); | |
72 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, | |
73 &status); | |
74 ulocdata_close(locale_data); | |
75 if (U_SUCCESS(status)) { | |
76 int length = uset_size(exemplar_set); | |
77 for (int i = 0; i < length; ++i) { | |
78 UChar32 character = uset_charAt(exemplar_set, i); | |
79 SetWordScript(GetScriptCode(character), true); | |
80 } | |
81 | |
82 // Many languages use combining characters to input their characters from | |
83 // keyboards. On the other hand, this exemplar set does not always include | |
84 // combining characters for such languages. | |
85 // To treat such combining characters as word characters, we decompose | |
86 // this exemplar set and treat the decomposed characters as word characters. | |
87 icu::UnicodeString composed; | |
88 for (int i = 0; i < length; ++i) | |
89 composed.append(uset_charAt(exemplar_set, i)); | |
90 | |
91 icu::UnicodeString decomposed; | |
92 icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status); | |
93 if (U_SUCCESS(status)) { | |
94 icu::StringCharacterIterator iterator(decomposed); | |
95 UChar32 character = iterator.first32(); | |
96 while (character != icu::CharacterIterator::DONE) { | |
97 SetWordScript(GetScriptCode(character), true); | |
98 character = iterator.next32(); | |
99 } | |
100 } | |
101 } | |
102 uset_close(exemplar_set); | |
103 } | |
104 | |
105 // Returns whether or not the given character is a character used by the | |
106 // selected dictionary. | |
107 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { | |
108 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); | |
109 } | |
110 | |
111 // Returns whether or not the given character is a character used by | |
112 // contractions. | |
113 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { | |
114 std::map<UChar32, bool>::const_iterator iterator; | |
115 iterator = middle_letters_.find(character); | |
116 if (iterator == middle_letters_.end()) | |
117 return false; | |
118 return iterator->second; | |
119 } | |
120 | |
121 // Initializes the mapping table. | |
122 void SpellcheckCharAttribute::InitializeScriptTable() { | |
123 for (size_t i = 0; i < arraysize(script_attributes_); ++i) | |
124 script_attributes_[i] = false; | |
125 } | |
126 | |
127 // Retrieves the ICU script code. | |
128 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { | |
129 UErrorCode status = U_ZERO_ERROR; | |
130 UScriptCode script_code = uscript_getScript(character, &status); | |
131 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; | |
132 } | |
133 | |
134 // Updates the mapping table from an ICU script code to its attribute, i.e. | |
135 // whether not a script is used by the selected dictionary. | |
136 void SpellcheckCharAttribute::SetWordScript(const int script_code, | |
137 bool in_use) { | |
138 if (script_code < 0 || | |
139 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) | |
140 return; | |
141 script_attributes_[script_code] = in_use; | |
142 } | |
143 | |
144 // Returns whether or not the given script is used by the selected | |
145 // dictionary. | |
146 bool SpellcheckCharAttribute::IsWordScript( | |
147 const UScriptCode script_code) const { | |
148 if (script_code < 0 || | |
149 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) | |
150 return false; | |
151 return script_attributes_[script_code]; | |
152 } | |
153 | |
154 SpellcheckWordIterator::SpellcheckWordIterator() | |
155 : word_(NULL), | |
156 length_(0), | |
157 position_(0), | |
158 allow_contraction_(false), | |
159 attribute_(NULL) { | |
160 } | |
161 | |
162 SpellcheckWordIterator::~SpellcheckWordIterator() { | |
163 } | |
164 | |
165 // Initialize a word-iterator object. | |
166 void SpellcheckWordIterator::Initialize( | |
167 const SpellcheckCharAttribute* attribute, | |
168 const char16* word, | |
169 size_t length, | |
170 bool allow_contraction) { | |
171 word_ = word; | |
172 position_ = 0; | |
173 length_ = static_cast<int>(length); | |
174 allow_contraction_ = allow_contraction; | |
175 attribute_ = attribute; | |
176 } | |
177 | |
178 // Retrieves a word (or a contraction). | |
179 // When a contraction is enclosed with contraction characters (e.g. 'isn't', | |
180 // 'rock'n'roll'), we should discard the beginning and the end of the | |
181 // contraction but we should never split the contraction. | |
182 // To handle this case easily, we should firstly extract a segment consisting | |
183 // of word characters and contraction characters, and discard contraction | |
184 // characters at the beginning and the end of the extracted segment. | |
185 bool SpellcheckWordIterator::GetNextWord(string16* word_string, | |
186 int* word_start, | |
187 int* word_length) { | |
188 word_string->clear(); | |
189 *word_start = 0; | |
190 *word_length = 0; | |
191 while (position_ < length_) { | |
192 int segment_start = 0; | |
193 int segment_end = 0; | |
194 GetSegment(&segment_start, &segment_end); | |
195 TrimSegment(segment_start, segment_end, word_start, word_length); | |
196 if (*word_length > 0) | |
197 return Normalize(*word_start, *word_length, word_string); | |
198 } | |
199 | |
200 return false; | |
201 } | |
202 | |
203 // Retrieves a segment consisting of word characters (and contraction | |
204 // characters if the |allow_contraction_| value is true). | |
205 // When the current position refers to a non-word character, this function | |
206 // returns a non-empty segment consisting of the character itself. In this | |
207 // case, the TrimSegment() function discards the character and returns an | |
208 // empty word (i.e. |word_length| == 0). | |
209 void SpellcheckWordIterator::GetSegment(int* segment_start, | |
210 int* segment_end) { | |
211 int position = position_; | |
212 while (position < length_) { | |
213 UChar32 character; | |
214 U16_NEXT(word_, position, length_, character); | |
215 if (!attribute_->IsWordChar(character)) { | |
216 if (!allow_contraction_ || !attribute_->IsContractionChar(character)) | |
217 break; | |
218 } | |
219 } | |
220 *segment_start = position_; | |
221 *segment_end = position; | |
222 position_ = position; | |
223 } | |
224 | |
225 // Discards non-word characters at the beginning and the end of the given | |
226 // segment. | |
227 void SpellcheckWordIterator::TrimSegment(int segment_start, | |
228 int segment_end, | |
229 int* word_start, | |
230 int* word_length) const { | |
231 while (segment_start < segment_end) { | |
232 UChar32 character; | |
233 int segment_next = segment_start; | |
234 U16_NEXT(word_, segment_next, segment_end, character); | |
235 if (attribute_->IsWordChar(character)) { | |
236 *word_start = segment_start; | |
237 break; | |
238 } | |
239 segment_start = segment_next; | |
240 } | |
241 while (segment_end >= segment_start) { | |
242 UChar32 character; | |
243 int segment_prev = segment_end; | |
244 U16_PREV(word_, segment_start, segment_prev, character); | |
245 if (attribute_->IsWordChar(character)) { | |
246 *word_length = segment_end - segment_start; | |
247 break; | |
248 } | |
249 segment_end = segment_prev; | |
250 } | |
251 } | |
252 | |
253 // Normalizes a non-terminated string into its canonical form so that | |
254 // a spellchecker object can check spellings of words which contain ligatures, | |
255 // full-width letters, etc. | |
256 // USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but | |
257 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, | |
258 // etc. For its details, please read the script table in | |
259 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". | |
260 bool SpellcheckWordIterator::Normalize(int input_start, | |
261 int input_length, | |
262 string16* output_string) const { | |
263 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" | |
264 // does not only write NFKD and NFKC can compose ligatures into their ASCII | |
265 // alternatives, but also write NFKC keeps accents of characters. | |
266 // Therefore, NFKC seems to be the best option for hunspell. | |
267 icu::UnicodeString input(FALSE, &word_[input_start], input_length); | |
268 UErrorCode status = U_ZERO_ERROR; | |
269 icu::UnicodeString output; | |
270 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | |
271 if (U_SUCCESS(status)) | |
272 output_string->assign(output.getTerminatedBuffer()); | |
273 return status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING; | |
274 } | |
OLD | NEW |