Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(376)

Side by Side Diff: chrome/browser/spellcheck_worditerator.cc

Issue 14408: Port the spell checker to posix. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « chrome/browser/spellcheck_worditerator.h ('k') | chrome/browser/spellchecker.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/spellcheck_worditerator.h" 5 #include "chrome/browser/spellcheck_worditerator.h"
6 6
7 #include <map> 7 #include <map>
8 #include <string> 8 #include <string>
9 9
10 #include "base/basictypes.h" 10 #include "base/basictypes.h"
(...skipping 30 matching lines...) Expand all
41 L'\xFF1A', // MidLetter # FULLWIDTH COLON 41 L'\xFF1A', // MidLetter # FULLWIDTH COLON
42 L'\x0027', // MidNumLet # APOSTROPHE 42 L'\x0027', // MidNumLet # APOSTROPHE
43 L'\x002E', // MidNumLet # FULL STOP 43 L'\x002E', // MidNumLet # FULL STOP
44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK 44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK
45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK 45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK
46 L'\x2024', // MidNumLet # ONE DOT LEADER 46 L'\x2024', // MidNumLet # ONE DOT LEADER
47 L'\xFE52', // MidNumLet # SMALL FULL STOP 47 L'\xFE52', // MidNumLet # SMALL FULL STOP
48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE 48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE
49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP 49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP
50 }; 50 };
51 for (int i = 0; i < arraysize(kMidLetters); i++) 51 for (size_t i = 0; i < arraysize(kMidLetters); ++i)
52 middle_letters_[kMidLetters[i]] = true; 52 middle_letters_[kMidLetters[i]] = true;
53 } 53 }
54 54
55 SpellcheckCharAttribute::~SpellcheckCharAttribute() { 55 SpellcheckCharAttribute::~SpellcheckCharAttribute() {
56 } 56 }
57 57
58 // Sets the default language for this object. 58 // Sets the default language for this object.
59 // This function retrieves the exemplar set to set up the default character 59 // This function retrieves the exemplar set to set up the default character
60 // attributes. 60 // attributes.
61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) { 61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
62 // Retrieves the locale data of the given language. 62 // Retrieves the locale data of the given language.
63 std::string language_encoded; 63 std::string language_encoded;
64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP, 64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,
65 &language_encoded); 65 &language_encoded);
66 UErrorCode status = U_ZERO_ERROR; 66 UErrorCode status = U_ZERO_ERROR;
67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status); 67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);
68 if (U_FAILURE(status)) 68 if (U_FAILURE(status))
69 return; 69 return;
70 70
71 // Retrieves the exemplar set of the given language and update the 71 // Retrieves the exemplar set of the given language and update the
72 // character-attribute table to treat its characters as word characters. 72 // character-attribute table to treat its characters as word characters.
73 USet* exemplar_set = uset_open(1, 0); 73 USet* exemplar_set = uset_open(1, 0);
74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, 74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
75 &status); 75 &status);
76 ulocdata_close(locale_data); 76 ulocdata_close(locale_data);
77 if (U_SUCCESS(status)) { 77 if (U_SUCCESS(status)) {
78 int length = uset_size(exemplar_set); 78 int length = uset_size(exemplar_set);
79 for (int i = 0; i < length; i++) { 79 for (int i = 0; i < length; ++i) {
80 UChar32 character = uset_charAt(exemplar_set, i); 80 UChar32 character = uset_charAt(exemplar_set, i);
81 SetWordScript(GetScriptCode(character), true); 81 SetWordScript(GetScriptCode(character), true);
82 } 82 }
83 } 83 }
84 uset_close(exemplar_set); 84 uset_close(exemplar_set);
85 } 85 }
86 86
87 // Returns whether or not the given character is a character used by the 87 // Returns whether or not the given character is a character used by the
88 // selected dictionary. 88 // selected dictionary.
89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { 89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); 90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);
91 } 91 }
92 92
93 // Returns whether or not the given character is a character used by 93 // Returns whether or not the given character is a character used by
94 // contractions. 94 // contractions.
95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { 95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
96 std::map<UChar32, bool>::const_iterator iterator; 96 std::map<UChar32, bool>::const_iterator iterator;
97 iterator = middle_letters_.find(character); 97 iterator = middle_letters_.find(character);
98 if (iterator == middle_letters_.end()) 98 if (iterator == middle_letters_.end())
99 return false; 99 return false;
100 return iterator->second; 100 return iterator->second;
101 } 101 }
102 102
103 // Initializes the mapping table. 103 // Initializes the mapping table.
104 void SpellcheckCharAttribute::InitializeScriptTable() { 104 void SpellcheckCharAttribute::InitializeScriptTable() {
105 for (int i = 0; i < arraysize(script_attributes_); i++) 105 for (size_t i = 0; i < arraysize(script_attributes_); ++i)
106 script_attributes_[i] = false; 106 script_attributes_[i] = false;
107 } 107 }
108 108
109 // Retrieves the ICU script code. 109 // Retrieves the ICU script code.
110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { 110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
111 UErrorCode status = U_ZERO_ERROR; 111 UErrorCode status = U_ZERO_ERROR;
112 UScriptCode script_code = uscript_getScript(character, &status); 112 UScriptCode script_code = uscript_getScript(character, &status);
113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; 113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;
114 } 114 }
115 115
116 // Updates the mapping table from an ICU script code to its attribute, i.e. 116 // Updates the mapping table from an ICU script code to its attribute, i.e.
117 // whether or not a script is used by the selected dictionary. 117 // whether or not a script is used by the selected dictionary.
118 void SpellcheckCharAttribute::SetWordScript(const int script_code, 118 void SpellcheckCharAttribute::SetWordScript(const int script_code,
119 bool in_use) { 119 bool in_use) {
120 if (script_code < 0 || script_code >= arraysize(script_attributes_)) 120 if (script_code < 0 ||
121 static_cast<size_t>(script_code) >= arraysize(script_attributes_))
121 return; 122 return;
122 script_attributes_[script_code] = in_use; 123 script_attributes_[script_code] = in_use;
123 } 124 }
124 125
125 // Returns whether or not the given script is used by the selected 126 // Returns whether or not the given script is used by the selected
126 // dictionary. 127 // dictionary.
127 bool SpellcheckCharAttribute::IsWordScript( 128 bool SpellcheckCharAttribute::IsWordScript(
128 const UScriptCode script_code) const { 129 const UScriptCode script_code) const {
129 if (script_code < 0 || script_code >= arraysize(script_attributes_)) 130 if (script_code < 0 ||
131 static_cast<size_t>(script_code) >= arraysize(script_attributes_))
130 return false; 132 return false;
131 return script_attributes_[script_code]; 133 return script_attributes_[script_code];
132 } 134 }
133 135
134 SpellcheckWordIterator::SpellcheckWordIterator() 136 SpellcheckWordIterator::SpellcheckWordIterator()
135 : word_(NULL), 137 : word_(NULL),
138 length_(0),
136 position_(0), 139 position_(0),
137 length_(0),
138 allow_contraction_(false), 140 allow_contraction_(false),
139 attribute_(NULL) { 141 attribute_(NULL) {
140 } 142 }
141 143
142 SpellcheckWordIterator::~SpellcheckWordIterator() { 144 SpellcheckWordIterator::~SpellcheckWordIterator() {
143 } 145 }
144 146
145 // Initialize a word-iterator object. 147 // Initialize a word-iterator object.
146 void SpellcheckWordIterator::Initialize( 148 void SpellcheckWordIterator::Initialize(
147 const SpellcheckCharAttribute* attribute, 149 const SpellcheckCharAttribute* attribute,
148 const wchar_t* word, 150 const char16* word,
149 size_t length, 151 size_t length,
150 bool allow_contraction) { 152 bool allow_contraction) {
151 word_ = word; 153 word_ = word;
152 position_ = 0; 154 position_ = 0;
153 length_ = static_cast<int>(length); 155 length_ = static_cast<int>(length);
154 allow_contraction_ = allow_contraction; 156 allow_contraction_ = allow_contraction;
155 attribute_ = attribute; 157 attribute_ = attribute;
156 } 158 }
157 159
158 // Retrieves a word (or a contraction). 160 // Retrieves a word (or a contraction).
159 // When a contraction is enclosed with contraction characters (e.g. 'isn't', 161 // When a contraction is enclosed with contraction characters (e.g. 'isn't',
160 // 'rock'n'roll'), we should discard the beginning and the end of the 162 // 'rock'n'roll'), we should discard the beginning and the end of the
161 // contraction but we should never split the contraction. 163 // contraction but we should never split the contraction.
162 // To handle this case easily, we should firstly extract a segment consisting 164 // To handle this case easily, we should firstly extract a segment consisting
163 // of word characters and contraction characters, and discard contraction 165 // of word characters and contraction characters, and discard contraction
164 // characters at the beginning and the end of the extracted segment. 166 // characters at the beginning and the end of the extracted segment.
165 bool SpellcheckWordIterator::GetNextWord(std::wstring* word_string, 167 bool SpellcheckWordIterator::GetNextWord(string16* word_string,
166 int* word_start, 168 int* word_start,
167 int* word_length) { 169 int* word_length) {
168 word_string->empty(); 170 word_string->empty();
169 *word_start = 0; 171 *word_start = 0;
170 *word_length = 0; 172 *word_length = 0;
171 while (position_ < length_) { 173 while (position_ < length_) {
172 int segment_start = 0; 174 int segment_start = 0;
173 int segment_end = 0; 175 int segment_end = 0;
174 GetSegment(&segment_start, &segment_end); 176 GetSegment(&segment_start, &segment_end);
175 TrimSegment(segment_start, segment_end, word_start, word_length); 177 TrimSegment(segment_start, segment_end, word_start, word_length);
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
232 234
233 // Normalizes a non-terminated string into its canonical form so that 235 // Normalizes a non-terminated string into its canonical form so that
234 // a spellchecker object can check spellings of words which contain ligatures, 236 // a spellchecker object can check spellings of words which contain ligatures,
235 // full-width letters, etc. 237 // full-width letters, etc.
236 // USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but 238 // USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but
237 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, 239 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
238 // etc. For its details, please read the script table in 240 // etc. For its details, please read the script table in
239 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". 241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
240 bool SpellcheckWordIterator::Normalize(int input_start, 242 bool SpellcheckWordIterator::Normalize(int input_start,
241 int input_length, 243 int input_length,
242 std::wstring* output_string) const { 244 string16* output_string) const {
243 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" 245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
244 // not only says that NFKD and NFKC can compose ligatures into their ASCII 246 // not only says that NFKD and NFKC can compose ligatures into their ASCII
245 // alternatives, but also says that NFKC keeps accents of characters. 247 // alternatives, but also says that NFKC keeps accents of characters.
246 // Therefore, NFKC seems to be the best option for hunspell. 248 // Therefore, NFKC seems to be the best option for hunspell.
247 // When using NFKC for normalization, the length of the output string is mostly 249 // When using NFKC for normalization, the length of the output string is mostly
248 // equal to the one of the input string. (One exception is ligatures.) 250 // equal to the one of the input string. (One exception is ligatures.)
249 // To avoid always calling the unorm_normalize() function twice, 251 // To avoid always calling the unorm_normalize() function twice,
250 // we temporarily allocate |input_length| + 1 characters to the output string 252 // we temporarily allocate |input_length| + 1 characters to the output string
251 // and call the function with it. We re-allocate the output string 253 // and call the function with it. We re-allocate the output string
252 // only if it cannot store the normalized string, i.e. the output string is 254 // only if it cannot store the normalized string, i.e. the output string is
253 // longer than the input one. 255 // longer than the input one.
254 const wchar_t* input_string = &word_[input_start]; 256 const char16* input_string = &word_[input_start];
255 UErrorCode error_code = U_ZERO_ERROR; 257 UErrorCode error_code = U_ZERO_ERROR;
256 int output_length = input_length + 1; 258 int output_length = input_length + 1;
257 wchar_t *output_buffer = WriteInto(output_string, output_length); 259 char16* output_buffer = WriteInto(output_string, output_length);
258 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, 260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
259 output_buffer, output_length, &error_code); 261 output_buffer, output_length, &error_code);
260 if (error_code == U_BUFFER_OVERFLOW_ERROR) { 262 if (error_code == U_BUFFER_OVERFLOW_ERROR) {
261 error_code = U_ZERO_ERROR; 263 error_code = U_ZERO_ERROR;
262 output_buffer = WriteInto(output_string, ++output_length); 264 output_buffer = WriteInto(output_string, ++output_length);
263 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, 265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
264 output_buffer, output_length, &error_code); 266 output_buffer, output_length, &error_code);
265 } 267 }
266 return (error_code == U_ZERO_ERROR); 268 return (error_code == U_ZERO_ERROR);
267 } 269 }
268 270
OLDNEW
« no previous file with comments | « chrome/browser/spellcheck_worditerator.h ('k') | chrome/browser/spellchecker.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698