| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/browser/spellcheck_worditerator.h" | |
| 6 | |
| 7 #include <map> | |
| 8 #include <string> | |
| 9 | |
| 10 #include "base/basictypes.h" | |
| 11 #include "base/string_util.h" | |
| 12 #include "chrome/browser/spellchecker.h" | |
| 13 | |
| 14 #include "third_party/icu/public/common/unicode/normlzr.h" | |
| 15 #include "third_party/icu/public/common/unicode/schriter.h" | |
| 16 #include "third_party/icu/public/common/unicode/uchar.h" | |
| 17 #include "third_party/icu/public/common/unicode/uscript.h" | |
| 18 #include "third_party/icu/public/common/unicode/uset.h" | |
| 19 #include "third_party/icu/public/i18n/unicode/ulocdata.h" | |
| 20 | |
| 21 SpellcheckCharAttribute::SpellcheckCharAttribute() { | |
| 22 InitializeScriptTable(); | |
| 23 | |
| 24 // Even though many dictionaries treats numbers and contractions as words and | |
| 25 // treats USCRIPT_COMMON characters as word characters, the | |
| 26 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word | |
| 27 // characters to strictly-distinguish contraction characters from word | |
| 28 // characters. | |
| 29 SetWordScript(USCRIPT_COMMON, false); | |
| 30 | |
| 31 // Initialize the table of characters used for contractions. | |
| 32 // This array consists of the 'Midletter' and 'MidNumLet' characters of the | |
| 33 // word-break property list provided by Unicode, Inc.: | |
| 34 // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt | |
| 35 static const UChar32 kMidLetters[] = { | |
| 36 L'\x003A', // MidLetter # COLON | |
| 37 L'\x00B7', // MidLetter # MIDDLE DOT | |
| 38 L'\x0387', // MidLetter # GREEK ANO TELEIA | |
| 39 L'\x05F4', // MidLetter # HEBREW PUNCTUATION GERSHAYIM | |
| 40 L'\x2027', // MidLetter # HYPHENATION POINT | |
| 41 L'\xFE13', // MidLetter # PRESENTATION FORM FOR VERTICAL COLON | |
| 42 L'\xFE55', // MidLetter # SMALL COLON | |
| 43 L'\xFF1A', // MidLetter # FULLWIDTH COLON | |
| 44 L'\x0027', // MidNumLet # APOSTROPHE | |
| 45 L'\x002E', // MidNumLet # FULL STOP | |
| 46 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK | |
| 47 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK | |
| 48 L'\x2024', // MidNumLet # ONE DOT LEADER | |
| 49 L'\xFE52', // MidNumLet # SMALL FULL STOP | |
| 50 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE | |
| 51 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP | |
| 52 }; | |
| 53 for (size_t i = 0; i < arraysize(kMidLetters); ++i) | |
| 54 middle_letters_[kMidLetters[i]] = true; | |
| 55 } | |
| 56 | |
| 57 SpellcheckCharAttribute::~SpellcheckCharAttribute() { | |
| 58 } | |
| 59 | |
| 60 // Sets the default language for this object. | |
| 61 // This function retrieves the exemplar set to set up the default character | |
| 62 // attributes. | |
| 63 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { | |
| 64 UErrorCode status = U_ZERO_ERROR; | |
| 65 ULocaleData* locale_data = ulocdata_open(language.c_str(), &status); | |
| 66 if (U_FAILURE(status)) | |
| 67 return; | |
| 68 | |
| 69 // Retrieves the exemplar set of the given language and update the | |
| 70 // character-attribute table to treat its characters as word characters. | |
| 71 USet* exemplar_set = uset_open(1, 0); | |
| 72 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, | |
| 73 &status); | |
| 74 ulocdata_close(locale_data); | |
| 75 if (U_SUCCESS(status)) { | |
| 76 int length = uset_size(exemplar_set); | |
| 77 for (int i = 0; i < length; ++i) { | |
| 78 UChar32 character = uset_charAt(exemplar_set, i); | |
| 79 SetWordScript(GetScriptCode(character), true); | |
| 80 } | |
| 81 | |
| 82 // Many languages use combining characters to input their characters from | |
| 83 // keyboards. On the other hand, this exemplar set does not always include | |
| 84 // combining characters for such languages. | |
| 85 // To treat such combining characters as word characters, we decompose | |
| 86 // this exemplar set and treat the decomposed characters as word characters. | |
| 87 icu::UnicodeString composed; | |
| 88 for (int i = 0; i < length; ++i) | |
| 89 composed.append(uset_charAt(exemplar_set, i)); | |
| 90 | |
| 91 icu::UnicodeString decomposed; | |
| 92 icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status); | |
| 93 if (U_SUCCESS(status)) { | |
| 94 icu::StringCharacterIterator iterator(decomposed); | |
| 95 UChar32 character = iterator.first32(); | |
| 96 while (character != icu::CharacterIterator::DONE) { | |
| 97 SetWordScript(GetScriptCode(character), true); | |
| 98 character = iterator.next32(); | |
| 99 } | |
| 100 } | |
| 101 } | |
| 102 uset_close(exemplar_set); | |
| 103 } | |
| 104 | |
| 105 // Returns whether or not the given character is a character used by the | |
| 106 // selected dictionary. | |
| 107 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { | |
| 108 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); | |
| 109 } | |
| 110 | |
| 111 // Returns whether or not the given character is a character used by | |
| 112 // contractions. | |
| 113 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const { | |
| 114 std::map<UChar32, bool>::const_iterator iterator; | |
| 115 iterator = middle_letters_.find(character); | |
| 116 if (iterator == middle_letters_.end()) | |
| 117 return false; | |
| 118 return iterator->second; | |
| 119 } | |
| 120 | |
| 121 // Initializes the mapping table. | |
| 122 void SpellcheckCharAttribute::InitializeScriptTable() { | |
| 123 for (size_t i = 0; i < arraysize(script_attributes_); ++i) | |
| 124 script_attributes_[i] = false; | |
| 125 } | |
| 126 | |
| 127 // Retrieves the ICU script code. | |
| 128 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const { | |
| 129 UErrorCode status = U_ZERO_ERROR; | |
| 130 UScriptCode script_code = uscript_getScript(character, &status); | |
| 131 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE; | |
| 132 } | |
| 133 | |
| 134 // Updates the mapping table from an ICU script code to its attribute, i.e. | |
| 135 // whether not a script is used by the selected dictionary. | |
| 136 void SpellcheckCharAttribute::SetWordScript(const int script_code, | |
| 137 bool in_use) { | |
| 138 if (script_code < 0 || | |
| 139 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) | |
| 140 return; | |
| 141 script_attributes_[script_code] = in_use; | |
| 142 } | |
| 143 | |
| 144 // Returns whether or not the given script is used by the selected | |
| 145 // dictionary. | |
| 146 bool SpellcheckCharAttribute::IsWordScript( | |
| 147 const UScriptCode script_code) const { | |
| 148 if (script_code < 0 || | |
| 149 static_cast<size_t>(script_code) >= arraysize(script_attributes_)) | |
| 150 return false; | |
| 151 return script_attributes_[script_code]; | |
| 152 } | |
| 153 | |
| 154 SpellcheckWordIterator::SpellcheckWordIterator() | |
| 155 : word_(NULL), | |
| 156 length_(0), | |
| 157 position_(0), | |
| 158 allow_contraction_(false), | |
| 159 attribute_(NULL) { | |
| 160 } | |
| 161 | |
| 162 SpellcheckWordIterator::~SpellcheckWordIterator() { | |
| 163 } | |
| 164 | |
| 165 // Initialize a word-iterator object. | |
| 166 void SpellcheckWordIterator::Initialize( | |
| 167 const SpellcheckCharAttribute* attribute, | |
| 168 const char16* word, | |
| 169 size_t length, | |
| 170 bool allow_contraction) { | |
| 171 word_ = word; | |
| 172 position_ = 0; | |
| 173 length_ = static_cast<int>(length); | |
| 174 allow_contraction_ = allow_contraction; | |
| 175 attribute_ = attribute; | |
| 176 } | |
| 177 | |
| 178 // Retrieves a word (or a contraction). | |
| 179 // When a contraction is enclosed with contraction characters (e.g. 'isn't', | |
| 180 // 'rock'n'roll'), we should discard the beginning and the end of the | |
| 181 // contraction but we should never split the contraction. | |
| 182 // To handle this case easily, we should firstly extract a segment consisting | |
| 183 // of word characters and contraction characters, and discard contraction | |
| 184 // characters at the beginning and the end of the extracted segment. | |
| 185 bool SpellcheckWordIterator::GetNextWord(string16* word_string, | |
| 186 int* word_start, | |
| 187 int* word_length) { | |
| 188 word_string->clear(); | |
| 189 *word_start = 0; | |
| 190 *word_length = 0; | |
| 191 while (position_ < length_) { | |
| 192 int segment_start = 0; | |
| 193 int segment_end = 0; | |
| 194 GetSegment(&segment_start, &segment_end); | |
| 195 TrimSegment(segment_start, segment_end, word_start, word_length); | |
| 196 if (*word_length > 0) | |
| 197 return Normalize(*word_start, *word_length, word_string); | |
| 198 } | |
| 199 | |
| 200 return false; | |
| 201 } | |
| 202 | |
| 203 // Retrieves a segment consisting of word characters (and contraction | |
| 204 // characters if the |allow_contraction_| value is true). | |
| 205 // When the current position refers to a non-word character, this function | |
| 206 // returns a non-empty segment consisting of the character itself. In this | |
| 207 // case, the TrimSegment() function discards the character and returns an | |
| 208 // empty word (i.e. |word_length| == 0). | |
| 209 void SpellcheckWordIterator::GetSegment(int* segment_start, | |
| 210 int* segment_end) { | |
| 211 int position = position_; | |
| 212 while (position < length_) { | |
| 213 UChar32 character; | |
| 214 U16_NEXT(word_, position, length_, character); | |
| 215 if (!attribute_->IsWordChar(character)) { | |
| 216 if (!allow_contraction_ || !attribute_->IsContractionChar(character)) | |
| 217 break; | |
| 218 } | |
| 219 } | |
| 220 *segment_start = position_; | |
| 221 *segment_end = position; | |
| 222 position_ = position; | |
| 223 } | |
| 224 | |
| 225 // Discards non-word characters at the beginning and the end of the given | |
| 226 // segment. | |
| 227 void SpellcheckWordIterator::TrimSegment(int segment_start, | |
| 228 int segment_end, | |
| 229 int* word_start, | |
| 230 int* word_length) const { | |
| 231 while (segment_start < segment_end) { | |
| 232 UChar32 character; | |
| 233 int segment_next = segment_start; | |
| 234 U16_NEXT(word_, segment_next, segment_end, character); | |
| 235 if (attribute_->IsWordChar(character)) { | |
| 236 *word_start = segment_start; | |
| 237 break; | |
| 238 } | |
| 239 segment_start = segment_next; | |
| 240 } | |
| 241 while (segment_end >= segment_start) { | |
| 242 UChar32 character; | |
| 243 int segment_prev = segment_end; | |
| 244 U16_PREV(word_, segment_start, segment_prev, character); | |
| 245 if (attribute_->IsWordChar(character)) { | |
| 246 *word_length = segment_end - segment_start; | |
| 247 break; | |
| 248 } | |
| 249 segment_end = segment_prev; | |
| 250 } | |
| 251 } | |
| 252 | |
| 253 // Normalizes a non-terminated string into its canonical form so that | |
| 254 // a spellchecker object can check spellings of words which contain ligatures, | |
| 255 // full-width letters, etc. | |
| 256 // USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but | |
| 257 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, | |
| 258 // etc. For its details, please read the script table in | |
| 259 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". | |
| 260 bool SpellcheckWordIterator::Normalize(int input_start, | |
| 261 int input_length, | |
| 262 string16* output_string) const { | |
| 263 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" | |
| 264 // does not only write NFKD and NFKC can compose ligatures into their ASCII | |
| 265 // alternatives, but also write NFKC keeps accents of characters. | |
| 266 // Therefore, NFKC seems to be the best option for hunspell. | |
| 267 icu::UnicodeString input(FALSE, &word_[input_start], input_length); | |
| 268 UErrorCode status = U_ZERO_ERROR; | |
| 269 icu::UnicodeString output; | |
| 270 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | |
| 271 if (U_SUCCESS(status)) | |
| 272 output_string->assign(output.getTerminatedBuffer()); | |
| 273 return status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING; | |
| 274 } | |
| OLD | NEW |