Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(252)

Side by Side Diff: chrome/browser/spellcheck_worditerator.cc

Issue 395007: Move Mac to using renderer spellchecker. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: ui test fix Created 11 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « chrome/browser/spellcheck_worditerator.h ('k') | chrome/browser/spellchecker.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/spellcheck_worditerator.h"
6
7 #include <map>
8 #include <string>
9
10 #include "base/basictypes.h"
11 #include "base/string_util.h"
12 #include "chrome/browser/spellchecker.h"
13
14 #include "third_party/icu/public/common/unicode/normlzr.h"
15 #include "third_party/icu/public/common/unicode/schriter.h"
16 #include "third_party/icu/public/common/unicode/uchar.h"
17 #include "third_party/icu/public/common/unicode/uscript.h"
18 #include "third_party/icu/public/common/unicode/uset.h"
19 #include "third_party/icu/public/i18n/unicode/ulocdata.h"
20
21 SpellcheckCharAttribute::SpellcheckCharAttribute() {
22 InitializeScriptTable();
23
24 // Even though many dictionaries treats numbers and contractions as words and
25 // treats USCRIPT_COMMON characters as word characters, the
26 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word
27 // characters to strictly-distinguish contraction characters from word
28 // characters.
29 SetWordScript(USCRIPT_COMMON, false);
30
31 // Initialize the table of characters used for contractions.
32 // This array consists of the 'Midletter' and 'MidNumLet' characters of the
33 // word-break property list provided by Unicode, Inc.:
34 // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
35 static const UChar32 kMidLetters[] = {
36 L'\x003A', // MidLetter # COLON
37 L'\x00B7', // MidLetter # MIDDLE DOT
38 L'\x0387', // MidLetter # GREEK ANO TELEIA
39 L'\x05F4', // MidLetter # HEBREW PUNCTUATION GERSHAYIM
40 L'\x2027', // MidLetter # HYPHENATION POINT
41 L'\xFE13', // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
42 L'\xFE55', // MidLetter # SMALL COLON
43 L'\xFF1A', // MidLetter # FULLWIDTH COLON
44 L'\x0027', // MidNumLet # APOSTROPHE
45 L'\x002E', // MidNumLet # FULL STOP
46 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK
47 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK
48 L'\x2024', // MidNumLet # ONE DOT LEADER
49 L'\xFE52', // MidNumLet # SMALL FULL STOP
50 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE
51 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP
52 };
53 for (size_t i = 0; i < arraysize(kMidLetters); ++i)
54 middle_letters_[kMidLetters[i]] = true;
55 }
56
57 SpellcheckCharAttribute::~SpellcheckCharAttribute() {
58 }
59
60 // Sets the default language for this object.
61 // This function retrieves the exemplar set to set up the default character
62 // attributes.
63 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
64 UErrorCode status = U_ZERO_ERROR;
65 ULocaleData* locale_data = ulocdata_open(language.c_str(), &status);
66 if (U_FAILURE(status))
67 return;
68
69 // Retrieves the exemplar set of the given language and update the
70 // character-attribute table to treat its characters as word characters.
71 USet* exemplar_set = uset_open(1, 0);
72 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
73 &status);
74 ulocdata_close(locale_data);
75 if (U_SUCCESS(status)) {
76 int length = uset_size(exemplar_set);
77 for (int i = 0; i < length; ++i) {
78 UChar32 character = uset_charAt(exemplar_set, i);
79 SetWordScript(GetScriptCode(character), true);
80 }
81
82 // Many languages use combining characters to input their characters from
83 // keyboards. On the other hand, this exemplar set does not always include
84 // combining characters for such languages.
85 // To treat such combining characters as word characters, we decompose
86 // this exemplar set and treat the decomposed characters as word characters.
87 icu::UnicodeString composed;
88 for (int i = 0; i < length; ++i)
89 composed.append(uset_charAt(exemplar_set, i));
90
91 icu::UnicodeString decomposed;
92 icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status);
93 if (U_SUCCESS(status)) {
94 icu::StringCharacterIterator iterator(decomposed);
95 UChar32 character = iterator.first32();
96 while (character != icu::CharacterIterator::DONE) {
97 SetWordScript(GetScriptCode(character), true);
98 character = iterator.next32();
99 }
100 }
101 }
102 uset_close(exemplar_set);
103 }
104
105 // Returns whether or not the given character is a character used by the
106 // selected dictionary.
107 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
108 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);
109 }
110
111 // Returns whether or not the given character is a character used by
112 // contractions.
113 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
114 std::map<UChar32, bool>::const_iterator iterator;
115 iterator = middle_letters_.find(character);
116 if (iterator == middle_letters_.end())
117 return false;
118 return iterator->second;
119 }
120
121 // Initializes the mapping table.
122 void SpellcheckCharAttribute::InitializeScriptTable() {
123 for (size_t i = 0; i < arraysize(script_attributes_); ++i)
124 script_attributes_[i] = false;
125 }
126
127 // Retrieves the ICU script code.
128 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
129 UErrorCode status = U_ZERO_ERROR;
130 UScriptCode script_code = uscript_getScript(character, &status);
131 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;
132 }
133
134 // Updates the mapping table from an ICU script code to its attribute, i.e.
135 // whether not a script is used by the selected dictionary.
136 void SpellcheckCharAttribute::SetWordScript(const int script_code,
137 bool in_use) {
138 if (script_code < 0 ||
139 static_cast<size_t>(script_code) >= arraysize(script_attributes_))
140 return;
141 script_attributes_[script_code] = in_use;
142 }
143
144 // Returns whether or not the given script is used by the selected
145 // dictionary.
146 bool SpellcheckCharAttribute::IsWordScript(
147 const UScriptCode script_code) const {
148 if (script_code < 0 ||
149 static_cast<size_t>(script_code) >= arraysize(script_attributes_))
150 return false;
151 return script_attributes_[script_code];
152 }
153
154 SpellcheckWordIterator::SpellcheckWordIterator()
155 : word_(NULL),
156 length_(0),
157 position_(0),
158 allow_contraction_(false),
159 attribute_(NULL) {
160 }
161
162 SpellcheckWordIterator::~SpellcheckWordIterator() {
163 }
164
165 // Initialize a word-iterator object.
166 void SpellcheckWordIterator::Initialize(
167 const SpellcheckCharAttribute* attribute,
168 const char16* word,
169 size_t length,
170 bool allow_contraction) {
171 word_ = word;
172 position_ = 0;
173 length_ = static_cast<int>(length);
174 allow_contraction_ = allow_contraction;
175 attribute_ = attribute;
176 }
177
178 // Retrieves a word (or a contraction).
179 // When a contraction is enclosed with contraction characters (e.g. 'isn't',
180 // 'rock'n'roll'), we should discard the beginning and the end of the
181 // contraction but we should never split the contraction.
182 // To handle this case easily, we should firstly extract a segment consisting
183 // of word characters and contraction characters, and discard contraction
184 // characters at the beginning and the end of the extracted segment.
185 bool SpellcheckWordIterator::GetNextWord(string16* word_string,
186 int* word_start,
187 int* word_length) {
188 word_string->clear();
189 *word_start = 0;
190 *word_length = 0;
191 while (position_ < length_) {
192 int segment_start = 0;
193 int segment_end = 0;
194 GetSegment(&segment_start, &segment_end);
195 TrimSegment(segment_start, segment_end, word_start, word_length);
196 if (*word_length > 0)
197 return Normalize(*word_start, *word_length, word_string);
198 }
199
200 return false;
201 }
202
203 // Retrieves a segment consisting of word characters (and contraction
204 // characters if the |allow_contraction_| value is true).
205 // When the current position refers to a non-word character, this function
206 // returns a non-empty segment consisting of the character itself. In this
207 // case, the TrimSegment() function discards the character and returns an
208 // empty word (i.e. |word_length| == 0).
209 void SpellcheckWordIterator::GetSegment(int* segment_start,
210 int* segment_end) {
211 int position = position_;
212 while (position < length_) {
213 UChar32 character;
214 U16_NEXT(word_, position, length_, character);
215 if (!attribute_->IsWordChar(character)) {
216 if (!allow_contraction_ || !attribute_->IsContractionChar(character))
217 break;
218 }
219 }
220 *segment_start = position_;
221 *segment_end = position;
222 position_ = position;
223 }
224
225 // Discards non-word characters at the beginning and the end of the given
226 // segment.
227 void SpellcheckWordIterator::TrimSegment(int segment_start,
228 int segment_end,
229 int* word_start,
230 int* word_length) const {
231 while (segment_start < segment_end) {
232 UChar32 character;
233 int segment_next = segment_start;
234 U16_NEXT(word_, segment_next, segment_end, character);
235 if (attribute_->IsWordChar(character)) {
236 *word_start = segment_start;
237 break;
238 }
239 segment_start = segment_next;
240 }
241 while (segment_end >= segment_start) {
242 UChar32 character;
243 int segment_prev = segment_end;
244 U16_PREV(word_, segment_start, segment_prev, character);
245 if (attribute_->IsWordChar(character)) {
246 *word_length = segment_end - segment_start;
247 break;
248 }
249 segment_end = segment_prev;
250 }
251 }
252
253 // Normalizes a non-terminated string into its canonical form so that
254 // a spellchecker object can check spellings of words which contain ligatures,
255 // full-width letters, etc.
256 // USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
257 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
258 // etc. For its details, please read the script table in
259 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
260 bool SpellcheckWordIterator::Normalize(int input_start,
261 int input_length,
262 string16* output_string) const {
263 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
264 // does not only write NFKD and NFKC can compose ligatures into their ASCII
265 // alternatives, but also write NFKC keeps accents of characters.
266 // Therefore, NFKC seems to be the best option for hunspell.
267 icu::UnicodeString input(FALSE, &word_[input_start], input_length);
268 UErrorCode status = U_ZERO_ERROR;
269 icu::UnicodeString output;
270 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
271 if (U_SUCCESS(status))
272 output_string->assign(output.getTerminatedBuffer());
273 return status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING;
274 }
OLDNEW
« no previous file with comments | « chrome/browser/spellcheck_worditerator.h ('k') | chrome/browser/spellchecker.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698