Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(415)

Side by Side Diff: chrome/browser/spellcheck_worditerator.cc

Issue 21079: Fix for Issue 6431 "Two issues about Vietnamese Spell-Checker".... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/spellcheck_worditerator.h" 5 #include "chrome/browser/spellcheck_worditerator.h"
6 6
7 #include <map> 7 #include <map>
8 #include <string> 8 #include <string>
9 9
10 #include "base/basictypes.h" 10 #include "base/basictypes.h"
11 #include "base/string_util.h" 11 #include "base/string_util.h"
12 12
13 #include "third_party/icu38/public/common/unicode/normlzr.h"
14 #include "third_party/icu38/public/common/unicode/schriter.h"
13 #include "third_party/icu38/public/common/unicode/uchar.h" 15 #include "third_party/icu38/public/common/unicode/uchar.h"
14 #include "third_party/icu38/public/common/unicode/unorm.h"
15 #include "third_party/icu38/public/common/unicode/uscript.h" 16 #include "third_party/icu38/public/common/unicode/uscript.h"
16 #include "third_party/icu38/public/common/unicode/uset.h" 17 #include "third_party/icu38/public/common/unicode/uset.h"
17 #include "third_party/icu38/public/i18n/unicode/ulocdata.h" 18 #include "third_party/icu38/public/i18n/unicode/ulocdata.h"
18 19
19 SpellcheckCharAttribute::SpellcheckCharAttribute() { 20 SpellcheckCharAttribute::SpellcheckCharAttribute() {
20 InitializeScriptTable(); 21 InitializeScriptTable();
21 22
22 // Even though many dictionaries treat numbers and contractions as words and 23 // Even though many dictionaries treat numbers and contractions as words and
23 // treat USCRIPT_COMMON characters as word characters, the 24 // treat USCRIPT_COMMON characters as word characters, the
24 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word 25 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
73 USet* exemplar_set = uset_open(1, 0); 74 USet* exemplar_set = uset_open(1, 0);
74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD, 75 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
75 &status); 76 &status);
76 ulocdata_close(locale_data); 77 ulocdata_close(locale_data);
77 if (U_SUCCESS(status)) { 78 if (U_SUCCESS(status)) {
78 int length = uset_size(exemplar_set); 79 int length = uset_size(exemplar_set);
79 for (int i = 0; i < length; ++i) { 80 for (int i = 0; i < length; ++i) {
80 UChar32 character = uset_charAt(exemplar_set, i); 81 UChar32 character = uset_charAt(exemplar_set, i);
81 SetWordScript(GetScriptCode(character), true); 82 SetWordScript(GetScriptCode(character), true);
82 } 83 }
84
85 // Many languages use combining characters to input their characters from
86 // keyboards. On the other hand, this exemplar set does not always include
87 // combining characters for such languages.
88 // To treat such combining characters as word characters, we decompose
89 // this exemplar set and treat the decomposed characters as word characters.
90 UnicodeString composed;
91 for (int i = 0; i < length; ++i)
92 composed.append(uset_charAt(exemplar_set, i));
93
94 UnicodeString decomposed;
95 Normalizer::decompose(composed, FALSE, 0, decomposed, status);
96 if (U_SUCCESS(status)) {
97 StringCharacterIterator iterator(decomposed);
98 UChar32 character = iterator.first32();
99 while (character != CharacterIterator::DONE) {
100 SetWordScript(GetScriptCode(character), true);
101 character = iterator.next32();
102 }
103 }
83 } 104 }
84 uset_close(exemplar_set); 105 uset_close(exemplar_set);
85 } 106 }
86 107
87 // Returns whether or not the given character is a character used by the 108 // Returns whether or not the given character is a character used by the
88 // selected dictionary. 109 // selected dictionary.
89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const { 110 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character); 111 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);
91 } 112 }
92 113
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after
239 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, 260 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
240 // etc. For its details, please read the script table in 261 // etc. For its details, please read the script table in
241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt". 262 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
242 bool SpellcheckWordIterator::Normalize(int input_start, 263 bool SpellcheckWordIterator::Normalize(int input_start,
243 int input_length, 264 int input_length,
244 string16* output_string) const { 265 string16* output_string) const {
245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/" 266 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
246 // not only states that NFKD and NFKC can convert ligatures into their ASCII 267 // not only states that NFKD and NFKC can convert ligatures into their ASCII
247 // alternatives, but also states that NFKC keeps the accents of characters. 268 // alternatives, but also states that NFKC keeps the accents of characters.
248 // Therefore, NFKC seems to be the best option for hunspell. 269 // Therefore, NFKC seems to be the best option for hunspell.
249 // To use NFKC for normalization, the length of the output string is mostly 270 UnicodeString input(FALSE, &word_[input_start], input_length);
250 // equal to the one of the input string. (One exception is ligatures.) 271 UErrorCode status = U_ZERO_ERROR;
251 // To avoid the unorm_normalize() function from being called always twice, 272 UnicodeString output;
252 // we temporarily allocate |input_length| + 1 characters to the output string 273 Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
253 // and call the function with it. We re-allocate the output string 274 if (U_SUCCESS(status))
254 // only if it cannot store the normalized string, i.e. the output string is 275 output_string->assign(output.getTerminatedBuffer());
255 // longer than the input one. 276 return (status == U_ZERO_ERROR);
256 const char16* input_string = &word_[input_start];
257 UErrorCode error_code = U_ZERO_ERROR;
258 int output_length = input_length + 1;
259 char16* output_buffer = WriteInto(output_string, output_length);
260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
261 output_buffer, output_length, &error_code);
262 if (error_code == U_BUFFER_OVERFLOW_ERROR) {
263 error_code = U_ZERO_ERROR;
264 output_buffer = WriteInto(output_string, ++output_length);
265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
266 output_buffer, output_length, &error_code);
267 }
268 return (error_code == U_ZERO_ERROR);
269 } 277 }
270 278
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698