chrome/browser/spellcheck_worditerator.cc - Issue 21079: Fix for Issue 6431 "Two issues about Vietnamese Spell-Checker"....

Side by Side Diff: chrome/browser/spellcheck_worditerator.cc

Issue 21079: Fix for Issue 6431 "Two issues about Vietnamese Spell-Checker".... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/spellcheck_worditerator.h"	5 #include "chrome/browser/spellcheck_worditerator.h"

6	6

7 #include <map>	7 #include <map>

8 #include <string>	8 #include <string>

9	9

10 #include "base/basictypes.h"	10 #include "base/basictypes.h"

11 #include "base/string_util.h"	11 #include "base/string_util.h"

12	12

	13 #include "third_party/icu38/public/common/unicode/normlzr.h"

	14 #include "third_party/icu38/public/common/unicode/schriter.h"

13 #include "third_party/icu38/public/common/unicode/uchar.h"	15 #include "third_party/icu38/public/common/unicode/uchar.h"

14 #include "third_party/icu38/public/common/unicode/unorm.h"

15 #include "third_party/icu38/public/common/unicode/uscript.h"	16 #include "third_party/icu38/public/common/unicode/uscript.h"

16 #include "third_party/icu38/public/common/unicode/uset.h"	17 #include "third_party/icu38/public/common/unicode/uset.h"

17 #include "third_party/icu38/public/i18n/unicode/ulocdata.h"	18 #include "third_party/icu38/public/i18n/unicode/ulocdata.h"

18	19

19 SpellcheckCharAttribute::SpellcheckCharAttribute() {	20 SpellcheckCharAttribute::SpellcheckCharAttribute() {

20 InitializeScriptTable();	21 InitializeScriptTable();

21	22

22 // Even though many dictionaries treat numbers and contractions as words and	23 // Even though many dictionaries treat numbers and contractions as words and

23 // treat USCRIPT_COMMON characters as word characters, the	24 // treat USCRIPT_COMMON characters as word characters, the

24 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word	25 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word

(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
73 USet* exemplar_set = uset_open(1, 0);	74 USet* exemplar_set = uset_open(1, 0);

74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,	75 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,

75 &status);	76 &status);

76 ulocdata_close(locale_data);	77 ulocdata_close(locale_data);

77 if (U_SUCCESS(status)) {	78 if (U_SUCCESS(status)) {

78 int length = uset_size(exemplar_set);	79 int length = uset_size(exemplar_set);

79 for (int i = 0; i < length; ++i) {	80 for (int i = 0; i < length; ++i) {

80 UChar32 character = uset_charAt(exemplar_set, i);	81 UChar32 character = uset_charAt(exemplar_set, i);

81 SetWordScript(GetScriptCode(character), true);	82 SetWordScript(GetScriptCode(character), true);

82 }	83 }

	84

	85 // Many languages use combining characters to input their characters from

	86 // keyboards. On the other hand, this exemplar set does not always include

	87 // combining characters for such languages.

	88 // To treat such combining characters as word characters, we decompose

	89 // this exemplar set and treat the decomposed characters as word characters.

	90 UnicodeString composed;

	91 for (int i = 0; i < length; ++i)

	92 composed.append(uset_charAt(exemplar_set, i));

	93

	94 UnicodeString decomposed;

	95 Normalizer::decompose(composed, FALSE, 0, decomposed, status);

	96 if (U_SUCCESS(status)) {

	97 StringCharacterIterator iterator(decomposed);

	98 UChar32 character = iterator.first32();

	99 while (character != CharacterIterator::DONE) {

	100 SetWordScript(GetScriptCode(character), true);

	101 character = iterator.next32();

	102 }

	103 }

83 }	104 }

84 uset_close(exemplar_set);	105 uset_close(exemplar_set);

85 }	106 }

86	107

87 // Returns whether or not the given character is a character used by the	108 // Returns whether or not the given character is a character used by the

88 // selected dictionary.	109 // selected dictionary.

89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {	110 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {

90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);	111 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);

91 }	112 }

92	113

(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
239 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,	260 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,

240 // etc. For its details, please read the script table in	261 // etc. For its details, please read the script table in

241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".	262 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".

242 bool SpellcheckWordIterator::Normalize(int input_start,	263 bool SpellcheckWordIterator::Normalize(int input_start,

243 int input_length,	264 int input_length,

244 string16* output_string) const {	265 string16* output_string) const {

245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"	266 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"

246 // states not only that NFKD and NFKC can convert ligatures into their ASCII	267 // states not only that NFKD and NFKC can convert ligatures into their ASCII

247 // alternatives, but also that NFKC keeps the accents of characters.	268 // alternatives, but also that NFKC keeps the accents of characters.

248 // Therefore, NFKC seems to be the best option for hunspell.	269 // Therefore, NFKC seems to be the best option for hunspell.

249 // To use NFKC for normalization, the length of the output string is mostly	270 UnicodeString input(FALSE, &word_[input_start], input_length);

250 // equal to the one of the input string. (One exception is ligatures.)	271 UErrorCode status = U_ZERO_ERROR;

251 // To avoid the unorm_normalize() function from being called always twice,	272 UnicodeString output;

252 // we temporarily allocate |input_length| + 1 characters to the output string	273 Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

253 // and call the function with it. We re-allocate the output string	274 if (U_SUCCESS(status))

254 // only if it cannot store the normalized string, i.e. the output string is	275 output_string->assign(output.getTerminatedBuffer());

255 // longer than the input one.	276 return (status == U_ZERO_ERROR);

256 const char16* input_string = &word_[input_start];

257 UErrorCode error_code = U_ZERO_ERROR;

258 int output_length = input_length + 1;

259 char16* output_buffer = WriteInto(output_string, output_length);

260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,

261 output_buffer, output_length, &error_code);

262 if (error_code == U_BUFFER_OVERFLOW_ERROR) {

263 error_code = U_ZERO_ERROR;

264 output_buffer = WriteInto(output_string, ++output_length);

265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,

266 output_buffer, output_length, &error_code);

267 }

268 return (error_code == U_ZERO_ERROR);

269 }	277 }

270	278

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »