| OLD | NEW |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/browser/spellcheck_worditerator.h" | 5 #include "chrome/browser/spellcheck_worditerator.h" |
| 6 | 6 |
| 7 #include <map> | 7 #include <map> |
| 8 #include <string> | 8 #include <string> |
| 9 | 9 |
| 10 #include "base/basictypes.h" | 10 #include "base/basictypes.h" |
| 11 #include "base/string_util.h" | 11 #include "base/string_util.h" |
| 12 | 12 |
| 13 #include "third_party/icu38/public/common/unicode/normlzr.h" |
| 14 #include "third_party/icu38/public/common/unicode/schriter.h" |
| 13 #include "third_party/icu38/public/common/unicode/uchar.h" | 15 #include "third_party/icu38/public/common/unicode/uchar.h" |
| 14 #include "third_party/icu38/public/common/unicode/unorm.h" | |
| 15 #include "third_party/icu38/public/common/unicode/uscript.h" | 16 #include "third_party/icu38/public/common/unicode/uscript.h" |
| 16 #include "third_party/icu38/public/common/unicode/uset.h" | 17 #include "third_party/icu38/public/common/unicode/uset.h" |
| 17 #include "third_party/icu38/public/i18n/unicode/ulocdata.h" | 18 #include "third_party/icu38/public/i18n/unicode/ulocdata.h" |
| 18 | 19 |
| 19 SpellcheckCharAttribute::SpellcheckCharAttribute() { | 20 SpellcheckCharAttribute::SpellcheckCharAttribute() { |
| 20 InitializeScriptTable(); | 21 InitializeScriptTable(); |
| 21 | 22 |
| 22 // Even though many dictionaries treats numbers and contractions as words and | 23 // Even though many dictionaries treats numbers and contractions as words and |
| 23 // treats USCRIPT_COMMON characters as word characters, the | 24 // treats USCRIPT_COMMON characters as word characters, the |
| 24 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word | 25 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 73 USet* exemplar_set = uset_open(1, 0); | 74 USet* exemplar_set = uset_open(1, 0); |
| 74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, | 75 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, |
| 75 &status); | 76 &status); |
| 76 ulocdata_close(locale_data); | 77 ulocdata_close(locale_data); |
| 77 if (U_SUCCESS(status)) { | 78 if (U_SUCCESS(status)) { |
| 78 int length = uset_size(exemplar_set); | 79 int length = uset_size(exemplar_set); |
| 79 for (int i = 0; i < length; ++i) { | 80 for (int i = 0; i < length; ++i) { |
| 80 UChar32 character = uset_charAt(exemplar_set, i); | 81 UChar32 character = uset_charAt(exemplar_set, i); |
| 81 SetWordScript(GetScriptCode(character), true); | 82 SetWordScript(GetScriptCode(character), true); |
| 82 } | 83 } |
| 84 |
| 85 // Many languages use combining characters to input their characters from |
| 86 // keyboards. On the other hand, this exemplar set does not always include |
| 87 // combining characters for such languages. |
| 88 // To treat such combining characters as word characters, we decompose |
| 89 // this exemplar set and treat the decomposed characters as word characters. |
| 90 UnicodeString composed; |
| 91 for (int i = 0; i < length; ++i) |
| 92 composed.append(uset_charAt(exemplar_set, i)); |
| 93 |
| 94 UnicodeString decomposed; |
| 95 Normalizer::decompose(composed, FALSE, 0, decomposed, status); |
| 96 if (U_SUCCESS(status)) { |
| 97 StringCharacterIterator iterator(decomposed); |
| 98 UChar32 character = iterator.first32(); |
| 99 while (character != CharacterIterator::DONE) { |
| 100 SetWordScript(GetScriptCode(character), true); |
| 101 character = iterator.next32(); |
| 102 } |
| 103 } |
| 83 } | 104 } |
| 84 uset_close(exemplar_set); | 105 uset_close(exemplar_set); |
| 85 } | 106 } |
| 86 | 107 |
| 87 // Returns whether or not the given character is a character used by the | 108 // Returns whether or not the given character is a character used by the |
| 88 // selected dictionary. | 109 // selected dictionary. |
| 89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { | 110 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { |
| 90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); | 111 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); |
| 91 } | 112 } |
| 92 | 113 |
| (...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 239 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, | 260 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, |
| 240 // etc. For its details, please read the script table in | 261 // etc. For its details, please read the script table in |
| 241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". | 262 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". |
| 242 bool SpellcheckWordIterator::Normalize(int input_start, | 263 bool SpellcheckWordIterator::Normalize(int input_start, |
| 243 int input_length, | 264 int input_length, |
| 244 string16* output_string) const { | 265 string16* output_string) const { |
| 245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" | 266 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" |
| 246 // does not only write NFKD and NFKC can compose ligatures into their ASCII | 267 // does not only write NFKD and NFKC can compose ligatures into their ASCII |
| 247 // alternatives, but also write NFKC keeps accents of characters. | 268 // alternatives, but also write NFKC keeps accents of characters. |
| 248 // Therefore, NFKC seems to be the best option for hunspell. | 269 // Therefore, NFKC seems to be the best option for hunspell. |
| 249 // To use NKFC for normalization, the length of the output string is mostly | 270 UnicodeString input(FALSE, &word_[input_start], input_length); |
| 250 // equal to the one of the input string. (One exception is ligatures.) | 271 UErrorCode status = U_ZERO_ERROR; |
| 251 // To avoid the unorm_normalize() function from being called always twice, | 272 UnicodeString output; |
| 252 // we temporarily allocate |input_length| + 1 characters to the output string | 273 Normalizer::normalize(input, UNORM_NFKC, 0, output, status); |
| 253 // and call the function with it. We re-allocate the output string | 274 if (U_SUCCESS(status)) |
| 254 // only if it cannot store the normalized string, i.e. the output string is | 275 output_string->assign(output.getTerminatedBuffer()); |
| 255 // longer than the input one. | 276 return (status == U_ZERO_ERROR); |
| 256 const char16* input_string = &word_[input_start]; | |
| 257 UErrorCode error_code = U_ZERO_ERROR; | |
| 258 int output_length = input_length + 1; | |
| 259 char16* output_buffer = WriteInto(output_string, output_length); | |
| 260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, | |
| 261 output_buffer, output_length, &error_code); | |
| 262 if (error_code == U_BUFFER_OVERFLOW_ERROR) { | |
| 263 error_code = U_ZERO_ERROR; | |
| 264 output_buffer = WriteInto(output_string, ++output_length); | |
| 265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, | |
| 266 output_buffer, output_length, &error_code); | |
| 267 } | |
| 268 return (error_code == U_ZERO_ERROR); | |
| 269 } | 277 } |
| 270 | 278 |
| OLD | NEW |