OLD | NEW |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/spellcheck_worditerator.h" | 5 #include "chrome/browser/spellcheck_worditerator.h" |
6 | 6 |
7 #include <map> | 7 #include <map> |
8 #include <string> | 8 #include <string> |
9 | 9 |
10 #include "base/basictypes.h" | 10 #include "base/basictypes.h" |
(...skipping 30 matching lines...) Expand all Loading... |
41 L'\xFF1A', // MidLetter # FULLWIDTH COLON | 41 L'\xFF1A', // MidLetter # FULLWIDTH COLON |
42 L'\x0027', // MidNumLet # APOSTROPHE | 42 L'\x0027', // MidNumLet # APOSTROPHE |
43 L'\x002E', // MidNumLet # FULL STOP | 43 L'\x002E', // MidNumLet # FULL STOP |
44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK | 44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK |
45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK | 45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK |
46 L'\x2024', // MidNumLet # ONE DOT LEADER | 46 L'\x2024', // MidNumLet # ONE DOT LEADER |
47 L'\xFE52', // MidNumLet # SMALL FULL STOP | 47 L'\xFE52', // MidNumLet # SMALL FULL STOP |
48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE | 48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE |
49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP | 49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP |
50 }; | 50 }; |
51 for (int i = 0; i < arraysize(kMidLetters); i++) | 51 for (size_t i = 0; i < arraysize(kMidLetters); ++i) |
52 middle_letters_[kMidLetters[i]] = true; | 52 middle_letters_[kMidLetters[i]] = true; |
53 } | 53 } |
54 | 54 |
55 SpellcheckCharAttribute::~SpellcheckCharAttribute() { | 55 SpellcheckCharAttribute::~SpellcheckCharAttribute() { |
56 } | 56 } |
57 | 57 |
58 // Sets the default language for this object. | 58 // Sets the default language for this object. |
59 // This function retrieves the exemplar set to set up the default character | 59 // This function retrieves the exemplar set to set up the default character |
60 // attributes. | 60 // attributes. |
61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) { | 61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) { |
62 // Retrieves the locale data of the given language. | 62 // Retrieves the locale data of the given language. |
63 std::string language_encoded; | 63 std::string language_encoded; |
64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP, | 64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP, |
65 &language_encoded); | 65 &language_encoded); |
66 UErrorCode status = U_ZERO_ERROR; | 66 UErrorCode status = U_ZERO_ERROR; |
67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status); | 67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status); |
68 if (U_FAILURE(status)) | 68 if (U_FAILURE(status)) |
69 return; | 69 return; |
70 | 70 |
71 // Retrieves the exemplar set of the given language and updates the | 71 // Retrieves the exemplar set of the given language and updates the |
72 // character-attribute table to treat its characters as word characters. | 72 // character-attribute table to treat its characters as word characters. |
73 USet* exemplar_set = uset_open(1, 0); | 73 USet* exemplar_set = uset_open(1, 0); |
74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, | 74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, |
75 &status); | 75 &status); |
76 ulocdata_close(locale_data); | 76 ulocdata_close(locale_data); |
77 if (U_SUCCESS(status)) { | 77 if (U_SUCCESS(status)) { |
78 int length = uset_size(exemplar_set); | 78 int length = uset_size(exemplar_set); |
79 for (int i = 0; i < length; i++) { | 79 for (int i = 0; i < length; ++i) { |
80 UChar32 character = uset_charAt(exemplar_set, i); | 80 UChar32 character = uset_charAt(exemplar_set, i); |
81 SetWordScript(GetScriptCode(character), true); | 81 SetWordScript(GetScriptCode(character), true); |
82 } | 82 } |
83 } | 83 } |
84 uset_close(exemplar_set); | 84 uset_close(exemplar_set); |
85 } | 85 } |
86 | 86 |
87 // Returns whether or not the given character is a character used by the | 87 // Returns whether or not the given character is a character used by the |
88 // selected dictionary. | 88 // selected dictionary. |
89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { | 89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { |
90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); | 90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); |
91 } | 91 } |
92 | 92 |
93 // Returns whether or not the given character is a character used by | 93 // Returns whether or not the given character is a character used by |
94 // contractions. | 94 // contractions. |
95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { | 95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { |
96 std::map<UChar32, bool>::const_iterator iterator; | 96 std::map<UChar32, bool>::const_iterator iterator; |
97 iterator = middle_letters_.find(character); | 97 iterator = middle_letters_.find(character); |
98 if (iterator == middle_letters_.end()) | 98 if (iterator == middle_letters_.end()) |
99 return false; | 99 return false; |
100 return iterator->second; | 100 return iterator->second; |
101 } | 101 } |
102 | 102 |
103 // Initializes the mapping table. | 103 // Initializes the mapping table. |
104 void SpellcheckCharAttribute::InitializeScriptTable() { | 104 void SpellcheckCharAttribute::InitializeScriptTable() { |
105 for (int i = 0; i < arraysize(script_attributes_); i++) | 105 for (size_t i = 0; i < arraysize(script_attributes_); ++i) |
106 script_attributes_[i] = false; | 106 script_attributes_[i] = false; |
107 } | 107 } |
108 | 108 |
109 // Retrieves the ICU script code. | 109 // Retrieves the ICU script code. |
110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { | 110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { |
111 UErrorCode status = U_ZERO_ERROR; | 111 UErrorCode status = U_ZERO_ERROR; |
112 UScriptCode script_code = uscript_getScript(character, &status); | 112 UScriptCode script_code = uscript_getScript(character, &status); |
113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; | 113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; |
114 } | 114 } |
115 | 115 |
116 // Updates the mapping table from an ICU script code to its attribute, i.e. | 116 // Updates the mapping table from an ICU script code to its attribute, i.e. |
117 // whether or not a script is used by the selected dictionary. | 117 // whether or not a script is used by the selected dictionary. |
118 void SpellcheckCharAttribute::SetWordScript(const int script_code, | 118 void SpellcheckCharAttribute::SetWordScript(const int script_code, |
119 bool in_use) { | 119 bool in_use) { |
120 if (script_code < 0 || script_code >= arraysize(script_attributes_)) | 120 if (script_code < 0 || |
| 121 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) |
121 return; | 122 return; |
122 script_attributes_[script_code] = in_use; | 123 script_attributes_[script_code] = in_use; |
123 } | 124 } |
124 | 125 |
125 // Returns whether or not the given script is used by the selected | 126 // Returns whether or not the given script is used by the selected |
126 // dictionary. | 127 // dictionary. |
127 bool SpellcheckCharAttribute::IsWordScript( | 128 bool SpellcheckCharAttribute::IsWordScript( |
128 const UScriptCode script_code) const { | 129 const UScriptCode script_code) const { |
129 if (script_code < 0 || script_code >= arraysize(script_attributes_)) | 130 if (script_code < 0 || |
| 131 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) |
130 return false; | 132 return false; |
131 return script_attributes_[script_code]; | 133 return script_attributes_[script_code]; |
132 } | 134 } |
133 | 135 |
134 SpellcheckWordIterator::SpellcheckWordIterator() | 136 SpellcheckWordIterator::SpellcheckWordIterator() |
135 : word_(NULL), | 137 : word_(NULL), |
| 138 length_(0), |
136 position_(0), | 139 position_(0), |
137 length_(0), | |
138 allow_contraction_(false), | 140 allow_contraction_(false), |
139 attribute_(NULL) { | 141 attribute_(NULL) { |
140 } | 142 } |
141 | 143 |
142 SpellcheckWordIterator::~SpellcheckWordIterator() { | 144 SpellcheckWordIterator::~SpellcheckWordIterator() { |
143 } | 145 } |
144 | 146 |
145 // Initialize a word-iterator object. | 147 // Initialize a word-iterator object. |
146 void SpellcheckWordIterator::Initialize( | 148 void SpellcheckWordIterator::Initialize( |
147 const SpellcheckCharAttribute* attribute, | 149 const SpellcheckCharAttribute* attribute, |
148 const wchar_t* word, | 150 const char16* word, |
149 size_t length, | 151 size_t length, |
150 bool allow_contraction) { | 152 bool allow_contraction) { |
151 word_ = word; | 153 word_ = word; |
152 position_ = 0; | 154 position_ = 0; |
153 length_ = static_cast<int>(length); | 155 length_ = static_cast<int>(length); |
154 allow_contraction_ = allow_contraction; | 156 allow_contraction_ = allow_contraction; |
155 attribute_ = attribute; | 157 attribute_ = attribute; |
156 } | 158 } |
157 | 159 |
158 // Retrieves a word (or a contraction). | 160 // Retrieves a word (or a contraction). |
159 // When a contraction is enclosed with contraction characters (e.g. 'isn't', | 161 // When a contraction is enclosed with contraction characters (e.g. 'isn't', |
160 // 'rock'n'roll'), we should discard the beginning and the end of the | 162 // 'rock'n'roll'), we should discard the beginning and the end of the |
161 // contraction but we should never split the contraction. | 163 // contraction but we should never split the contraction. |
162 // To handle this case easily, we should first extract a segment consisting | 164 // To handle this case easily, we should first extract a segment consisting |
163 // of word characters and contraction characters, and discard contraction | 165 // of word characters and contraction characters, and discard contraction |
164 // characters at the beginning and the end of the extracted segment. | 166 // characters at the beginning and the end of the extracted segment. |
165 bool SpellcheckWordIterator::GetNextWord(std::wstring* word_string, | 167 bool SpellcheckWordIterator::GetNextWord(string16* word_string, |
166 int* word_start, | 168 int* word_start, |
167 int* word_length) { | 169 int* word_length) { |
168 word_string->empty(); | 170 word_string->empty(); |
169 *word_start = 0; | 171 *word_start = 0; |
170 *word_length = 0; | 172 *word_length = 0; |
171 while (position_ < length_) { | 173 while (position_ < length_) { |
172 int segment_start = 0; | 174 int segment_start = 0; |
173 int segment_end = 0; | 175 int segment_end = 0; |
174 GetSegment(&segment_start, &segment_end); | 176 GetSegment(&segment_start, &segment_end); |
175 TrimSegment(segment_start, segment_end, word_start, word_length); | 177 TrimSegment(segment_start, segment_end, word_start, word_length); |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
232 | 234 |
233 // Normalizes a non-terminated string into its canonical form so that | 235 // Normalizes a non-terminated string into its canonical form so that |
234 // a spellchecker object can check spellings of words which contain ligatures, | 236 // a spellchecker object can check spellings of words which contain ligatures, |
235 // full-width letters, etc. | 237 // full-width letters, etc. |
236 // USCRIPT_LATIN consists not only of US-ASCII and ISO/IEC 8859-1, but | 238 // USCRIPT_LATIN consists not only of US-ASCII and ISO/IEC 8859-1, but |
237 // also of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth Latin, | 239 // also of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth Latin, |
238 // etc. For its details, please read the script table in | 240 // etc. For its details, please read the script table in |
239 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". | 241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". |
240 bool SpellcheckWordIterator::Normalize(int input_start, | 242 bool SpellcheckWordIterator::Normalize(int input_start, |
241 int input_length, | 243 int input_length, |
242 std::wstring* output_string) const { | 244 string16* output_string) const { |
243 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" | 245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" |
244 // states not only that NFKD and NFKC can convert ligatures into their ASCII | 246 // states not only that NFKD and NFKC can convert ligatures into their ASCII |
245 // alternatives, but also that NFKC keeps accents of characters. | 247 // alternatives, but also that NFKC keeps accents of characters. |
246 // Therefore, NFKC seems to be the best option for hunspell. | 248 // Therefore, NFKC seems to be the best option for hunspell. |
247 // With NFKC normalization, the length of the output string is mostly | 249 // With NFKC normalization, the length of the output string is mostly |
248 // equal to that of the input string. (One exception is ligatures.) | 250 // equal to that of the input string. (One exception is ligatures.) |
249 // To avoid always calling the unorm_normalize() function twice, | 251 // To avoid always calling the unorm_normalize() function twice, |
250 // we temporarily allocate |input_length| + 1 characters to the output string | 252 // we temporarily allocate |input_length| + 1 characters to the output string |
251 // and call the function with it. We re-allocate the output string | 253 // and call the function with it. We re-allocate the output string |
252 // only if it cannot store the normalized string, i.e. the output string is | 254 // only if it cannot store the normalized string, i.e. the output string is |
253 // longer than the input one. | 255 // longer than the input one. |
254 const wchar_t* input_string = &word_[input_start]; | 256 const char16* input_string = &word_[input_start]; |
255 UErrorCode error_code = U_ZERO_ERROR; | 257 UErrorCode error_code = U_ZERO_ERROR; |
256 int output_length = input_length + 1; | 258 int output_length = input_length + 1; |
257 wchar_t *output_buffer = WriteInto(output_string, output_length); | 259 char16* output_buffer = WriteInto(output_string, output_length); |
258 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, | 260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, |
259 output_buffer, output_length, &error_code); | 261 output_buffer, output_length, &error_code); |
260 if (error_code == U_BUFFER_OVERFLOW_ERROR) { | 262 if (error_code == U_BUFFER_OVERFLOW_ERROR) { |
261 error_code = U_ZERO_ERROR; | 263 error_code = U_ZERO_ERROR; |
262 output_buffer = WriteInto(output_string, ++output_length); | 264 output_buffer = WriteInto(output_string, ++output_length); |
263 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, | 265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, |
264 output_buffer, output_length, &error_code); | 266 output_buffer, output_length, &error_code); |
265 } | 267 } |
266 return (error_code == U_ZERO_ERROR); | 268 return (error_code == U_ZERO_ERROR); |
267 } | 269 } |
268 | 270 |
OLD | NEW |