| OLD | NEW |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/spellcheck_worditerator.h" | 5 #include "chrome/browser/spellcheck_worditerator.h" |
| 6 | 6 |
| 7 #include <map> | 7 #include <map> |
| 8 #include <string> | 8 #include <string> |
| 9 | 9 |
| 10 #include "base/basictypes.h" | 10 #include "base/basictypes.h" |
| (...skipping 30 matching lines...) Expand all Loading... |
| 41 L'\xFF1A', // MidLetter # FULLWIDTH COLON | 41 L'\xFF1A', // MidLetter # FULLWIDTH COLON |
| 42 L'\x0027', // MidNumLet # APOSTROPHE | 42 L'\x0027', // MidNumLet # APOSTROPHE |
| 43 L'\x002E', // MidNumLet # FULL STOP | 43 L'\x002E', // MidNumLet # FULL STOP |
| 44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK | 44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK |
| 45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK | 45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK |
| 46 L'\x2024', // MidNumLet # ONE DOT LEADER | 46 L'\x2024', // MidNumLet # ONE DOT LEADER |
| 47 L'\xFE52', // MidNumLet # SMALL FULL STOP | 47 L'\xFE52', // MidNumLet # SMALL FULL STOP |
| 48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE | 48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE |
| 49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP | 49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP |
| 50 }; | 50 }; |
| 51 for (int i = 0; i < arraysize(kMidLetters); i++) | 51 for (size_t i = 0; i < arraysize(kMidLetters); ++i) |
| 52 middle_letters_[kMidLetters[i]] = true; | 52 middle_letters_[kMidLetters[i]] = true; |
| 53 } | 53 } |
| 54 | 54 |
| 55 SpellcheckCharAttribute::~SpellcheckCharAttribute() { | 55 SpellcheckCharAttribute::~SpellcheckCharAttribute() { |
| 56 } | 56 } |
| 57 | 57 |
| 58 // Sets the default language for this object. | 58 // Sets the default language for this object. |
| 59 // This function retrieves the exemplar set to set up the default character | 59 // This function retrieves the exemplar set to set up the default character |
| 60 // attributes. | 60 // attributes. |
| 61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) { | 61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) { |
| 62 // Retrieves the locale data of the given language. | 62 // Retrieves the locale data of the given language. |
| 63 std::string language_encoded; | 63 std::string language_encoded; |
| 64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP, | 64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP, |
| 65 &language_encoded); | 65 &language_encoded); |
| 66 UErrorCode status = U_ZERO_ERROR; | 66 UErrorCode status = U_ZERO_ERROR; |
| 67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status); | 67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status); |
| 68 if (U_FAILURE(status)) | 68 if (U_FAILURE(status)) |
| 69 return; | 69 return; |
| 70 | 70 |
| 71 // Retrieves the exemplar set of the given language and updates the | 71 // Retrieves the exemplar set of the given language and updates the |
| 72 // character-attribute table to treat its characters as word characters. | 72 // character-attribute table to treat its characters as word characters. |
| 73 USet* exemplar_set = uset_open(1, 0); | 73 USet* exemplar_set = uset_open(1, 0); |
| 74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, | 74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, |
| 75 &status); | 75 &status); |
| 76 ulocdata_close(locale_data); | 76 ulocdata_close(locale_data); |
| 77 if (U_SUCCESS(status)) { | 77 if (U_SUCCESS(status)) { |
| 78 int length = uset_size(exemplar_set); | 78 int length = uset_size(exemplar_set); |
| 79 for (int i = 0; i < length; i++) { | 79 for (int i = 0; i < length; ++i) { |
| 80 UChar32 character = uset_charAt(exemplar_set, i); | 80 UChar32 character = uset_charAt(exemplar_set, i); |
| 81 SetWordScript(GetScriptCode(character), true); | 81 SetWordScript(GetScriptCode(character), true); |
| 82 } | 82 } |
| 83 } | 83 } |
| 84 uset_close(exemplar_set); | 84 uset_close(exemplar_set); |
| 85 } | 85 } |
| 86 | 86 |
| 87 // Returns whether or not the given character is a character used by the | 87 // Returns whether or not the given character is a character used by the |
| 88 // selected dictionary. | 88 // selected dictionary. |
| 89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { | 89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { |
| 90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); | 90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); |
| 91 } | 91 } |
| 92 | 92 |
| 93 // Returns whether or not the given character is a character used by | 93 // Returns whether or not the given character is a character used by |
| 94 // contractions. | 94 // contractions. |
| 95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { | 95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { |
| 96 std::map<UChar32, bool>::const_iterator iterator; | 96 std::map<UChar32, bool>::const_iterator iterator; |
| 97 iterator = middle_letters_.find(character); | 97 iterator = middle_letters_.find(character); |
| 98 if (iterator == middle_letters_.end()) | 98 if (iterator == middle_letters_.end()) |
| 99 return false; | 99 return false; |
| 100 return iterator->second; | 100 return iterator->second; |
| 101 } | 101 } |
| 102 | 102 |
| 103 // Initializes the mapping table. | 103 // Initializes the mapping table. |
| 104 void SpellcheckCharAttribute::InitializeScriptTable() { | 104 void SpellcheckCharAttribute::InitializeScriptTable() { |
| 105 for (int i = 0; i < arraysize(script_attributes_); i++) | 105 for (size_t i = 0; i < arraysize(script_attributes_); ++i) |
| 106 script_attributes_[i] = false; | 106 script_attributes_[i] = false; |
| 107 } | 107 } |
| 108 | 108 |
| 109 // Retrieves the ICU script code. | 109 // Retrieves the ICU script code. |
| 110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { | 110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { |
| 111 UErrorCode status = U_ZERO_ERROR; | 111 UErrorCode status = U_ZERO_ERROR; |
| 112 UScriptCode script_code = uscript_getScript(character, &status); | 112 UScriptCode script_code = uscript_getScript(character, &status); |
| 113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; | 113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; |
| 114 } | 114 } |
| 115 | 115 |
| 116 // Updates the mapping table from an ICU script code to its attribute, i.e. | 116 // Updates the mapping table from an ICU script code to its attribute, i.e. |
| 117 // whether or not a script is used by the selected dictionary. | 117 // whether or not a script is used by the selected dictionary. |
| 118 void SpellcheckCharAttribute::SetWordScript(const int script_code, | 118 void SpellcheckCharAttribute::SetWordScript(const int script_code, |
| 119 bool in_use) { | 119 bool in_use) { |
| 120 if (script_code < 0 || script_code >= arraysize(script_attributes_)) | 120 if (script_code < 0 || |
| 121 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) |
| 121 return; | 122 return; |
| 122 script_attributes_[script_code] = in_use; | 123 script_attributes_[script_code] = in_use; |
| 123 } | 124 } |
| 124 | 125 |
| 125 // Returns whether or not the given script is used by the selected | 126 // Returns whether or not the given script is used by the selected |
| 126 // dictionary. | 127 // dictionary. |
| 127 bool SpellcheckCharAttribute::IsWordScript( | 128 bool SpellcheckCharAttribute::IsWordScript( |
| 128 const UScriptCode script_code) const { | 129 const UScriptCode script_code) const { |
| 129 if (script_code < 0 || script_code >= arraysize(script_attributes_)) | 130 if (script_code < 0 || |
| 131 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) |
| 130 return false; | 132 return false; |
| 131 return script_attributes_[script_code]; | 133 return script_attributes_[script_code]; |
| 132 } | 134 } |
| 133 | 135 |
| 134 SpellcheckWordIterator::SpellcheckWordIterator() | 136 SpellcheckWordIterator::SpellcheckWordIterator() |
| 135 : word_(NULL), | 137 : word_(NULL), |
| 138 length_(0), |
| 136 position_(0), | 139 position_(0), |
| 137 length_(0), | |
| 138 allow_contraction_(false), | 140 allow_contraction_(false), |
| 139 attribute_(NULL) { | 141 attribute_(NULL) { |
| 140 } | 142 } |
| 141 | 143 |
| 142 SpellcheckWordIterator::~SpellcheckWordIterator() { | 144 SpellcheckWordIterator::~SpellcheckWordIterator() { |
| 143 } | 145 } |
| 144 | 146 |
| 145 // Initialize a word-iterator object. | 147 // Initialize a word-iterator object. |
| 146 void SpellcheckWordIterator::Initialize( | 148 void SpellcheckWordIterator::Initialize( |
| 147 const SpellcheckCharAttribute* attribute, | 149 const SpellcheckCharAttribute* attribute, |
| 148 const wchar_t* word, | 150 const char16* word, |
| 149 size_t length, | 151 size_t length, |
| 150 bool allow_contraction) { | 152 bool allow_contraction) { |
| 151 word_ = word; | 153 word_ = word; |
| 152 position_ = 0; | 154 position_ = 0; |
| 153 length_ = static_cast<int>(length); | 155 length_ = static_cast<int>(length); |
| 154 allow_contraction_ = allow_contraction; | 156 allow_contraction_ = allow_contraction; |
| 155 attribute_ = attribute; | 157 attribute_ = attribute; |
| 156 } | 158 } |
| 157 | 159 |
| 158 // Retrieves a word (or a contraction). | 160 // Retrieves a word (or a contraction). |
| 159 // When a contraction is enclosed with contraction characters (e.g. 'isn't', | 161 // When a contraction is enclosed with contraction characters (e.g. 'isn't', |
| 160 // 'rock'n'roll'), we should discard the beginning and the end of the | 162 // 'rock'n'roll'), we should discard the beginning and the end of the |
| 161 // contraction but we should never split the contraction. | 163 // contraction but we should never split the contraction. |
| 162 // To handle this case easily, we should firstly extract a segment consisting | 164 // To handle this case easily, we should firstly extract a segment consisting |
| 163 // of word characters and contraction characters, and discard contraction | 165 // of word characters and contraction characters, and discard contraction |
| 164 // characters at the beginning and the end of the extracted segment. | 166 // characters at the beginning and the end of the extracted segment. |
| 165 bool SpellcheckWordIterator::GetNextWord(std::wstring* word_string, | 167 bool SpellcheckWordIterator::GetNextWord(string16* word_string, |
| 166 int* word_start, | 168 int* word_start, |
| 167 int* word_length) { | 169 int* word_length) { |
| 168 word_string->empty(); | 170 word_string->empty(); |
| 169 *word_start = 0; | 171 *word_start = 0; |
| 170 *word_length = 0; | 172 *word_length = 0; |
| 171 while (position_ < length_) { | 173 while (position_ < length_) { |
| 172 int segment_start = 0; | 174 int segment_start = 0; |
| 173 int segment_end = 0; | 175 int segment_end = 0; |
| 174 GetSegment(&segment_start, &segment_end); | 176 GetSegment(&segment_start, &segment_end); |
| 175 TrimSegment(segment_start, segment_end, word_start, word_length); | 177 TrimSegment(segment_start, segment_end, word_start, word_length); |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 232 | 234 |
| 233 // Normalizes a non-terminated string into its canonical form so that | 235 // Normalizes a non-terminated string into its canonical form so that |
| 234 // a spellchecker object can check spellings of words which contain ligatures, | 236 // a spellchecker object can check spellings of words which contain ligatures, |
| 235 // full-width letters, etc. | 237 // full-width letters, etc. |
| 236 // USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but | 238 // USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but |
| 237 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, | 239 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, |
| 238 // etc. For its details, please read the script table in | 240 // etc. For its details, please read the script table in |
| 239 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". | 241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". |
| 240 bool SpellcheckWordIterator::Normalize(int input_start, | 242 bool SpellcheckWordIterator::Normalize(int input_start, |
| 241 int input_length, | 243 int input_length, |
| 242 std::wstring* output_string) const { | 244 string16* output_string) const { |
| 243 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" | 245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" |
| 244 // states not only that NFKD and NFKC can compose ligatures into their ASCII | 246 // states not only that NFKD and NFKC can compose ligatures into their ASCII |
| 245 // alternatives, but also that NFKC keeps accents of characters. | 247 // alternatives, but also that NFKC keeps accents of characters. |
| 246 // Therefore, NFKC seems to be the best option for hunspell. | 248 // Therefore, NFKC seems to be the best option for hunspell. |
| 247 // To use NFKC for normalization, the length of the output string is mostly | 249 // To use NFKC for normalization, the length of the output string is mostly |
| 248 // equal to the one of the input string. (One exception is ligatures.) | 250 // equal to the one of the input string. (One exception is ligatures.) |
| 249 // To avoid always calling the unorm_normalize() function twice, | 251 // To avoid always calling the unorm_normalize() function twice, |
| 250 // we temporarily allocate |input_length| + 1 characters to the output string | 252 // we temporarily allocate |input_length| + 1 characters to the output string |
| 251 // and call the function with it. We re-allocate the output string | 253 // and call the function with it. We re-allocate the output string |
| 252 // only if it cannot store the normalized string, i.e. the output string is | 254 // only if it cannot store the normalized string, i.e. the output string is |
| 253 // longer than the input one. | 255 // longer than the input one. |
| 254 const wchar_t* input_string = &word_[input_start]; | 256 const char16* input_string = &word_[input_start]; |
| 255 UErrorCode error_code = U_ZERO_ERROR; | 257 UErrorCode error_code = U_ZERO_ERROR; |
| 256 int output_length = input_length + 1; | 258 int output_length = input_length + 1; |
| 257 wchar_t *output_buffer = WriteInto(output_string, output_length); | 259 char16* output_buffer = WriteInto(output_string, output_length); |
| 258 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, | 260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, |
| 259 output_buffer, output_length, &error_code); | 261 output_buffer, output_length, &error_code); |
| 260 if (error_code == U_BUFFER_OVERFLOW_ERROR) { | 262 if (error_code == U_BUFFER_OVERFLOW_ERROR) { |
| 261 error_code = U_ZERO_ERROR; | 263 error_code = U_ZERO_ERROR; |
| 262 output_buffer = WriteInto(output_string, ++output_length); | 264 output_buffer = WriteInto(output_string, ++output_length); |
| 263 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, | 265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, |
| 264 output_buffer, output_length, &error_code); | 266 output_buffer, output_length, &error_code); |
| 265 } | 267 } |
| 266 return (error_code == U_ZERO_ERROR); | 268 return (error_code == U_ZERO_ERROR); |
| 267 } | 269 } |
| 268 | 270 |
| OLD | NEW |