chrome/browser/spellcheck_worditerator.cc - Issue 14408: Port the spell checker to posix.

Side by Side Diff: chrome/browser/spellcheck_worditerator.cc

Issue 14408: Port the spell checker to posix. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/spellcheck_worditerator.h"	5 #include "chrome/browser/spellcheck_worditerator.h"

6	6

7 #include <map>	7 #include <map>

8 #include <string>	8 #include <string>

9	9

10 #include "base/basictypes.h"	10 #include "base/basictypes.h"

(...skipping 30 matching lines...) Expand all Loading...
41 L'\xFF1A', // MidLetter # FULLWIDTH COLON	41 L'\xFF1A', // MidLetter # FULLWIDTH COLON

42 L'\x0027', // MidNumLet # APOSTROPHE	42 L'\x0027', // MidNumLet # APOSTROPHE

43 L'\x002E', // MidNumLet # FULL STOP	43 L'\x002E', // MidNumLet # FULL STOP

44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK	44 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK

45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK	45 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK

46 L'\x2024', // MidNumLet # ONE DOT LEADER	46 L'\x2024', // MidNumLet # ONE DOT LEADER

47 L'\xFE52', // MidNumLet # SMALL FULL STOP	47 L'\xFE52', // MidNumLet # SMALL FULL STOP

48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE	48 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE

49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP	49 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP

50 };	50 };

51 for (int i = 0; i < arraysize(kMidLetters); i++)	51 for (size_t i = 0; i < arraysize(kMidLetters); ++i)

52 middle_letters_[kMidLetters[i]] = true;	52 middle_letters_[kMidLetters[i]] = true;

53 }	53 }

54	54

55 SpellcheckCharAttribute::~SpellcheckCharAttribute() {	55 SpellcheckCharAttribute::~SpellcheckCharAttribute() {

56 }	56 }

57	57

58 // Sets the default language for this object.	58 // Sets the default language for this object.

59 // This function retrieves the exemplar set to set up the default character	59 // This function retrieves the exemplar set to set up the default character

60 // attributes.	60 // attributes.

61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {	61 void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {

62 // Retrieves the locale data of the given language.	62 // Retrieves the locale data of the given language.

63 std::string language_encoded;	63 std::string language_encoded;

64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,	64 WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,

65 &language_encoded);	65 &language_encoded);

66 UErrorCode status = U_ZERO_ERROR;	66 UErrorCode status = U_ZERO_ERROR;

67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);	67 ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);

68 if (U_FAILURE(status))	68 if (U_FAILURE(status))

69 return;	69 return;

70	70

71 // Retrieves the exemplar set of the given language and update the	71 // Retrieves the exemplar set of the given language and update the

72 // character-attribute table to treat its characters as word characters.	72 // character-attribute table to treat its characters as word characters.

73 USet* exemplar_set = uset_open(1, 0);	73 USet* exemplar_set = uset_open(1, 0);

74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,	74 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,

75 &status);	75 &status);

76 ulocdata_close(locale_data);	76 ulocdata_close(locale_data);

77 if (U_SUCCESS(status)) {	77 if (U_SUCCESS(status)) {

78 int length = uset_size(exemplar_set);	78 int length = uset_size(exemplar_set);

79 for (int i = 0; i < length; i++) {	79 for (int i = 0; i < length; ++i) {

80 UChar32 character = uset_charAt(exemplar_set, i);	80 UChar32 character = uset_charAt(exemplar_set, i);

81 SetWordScript(GetScriptCode(character), true);	81 SetWordScript(GetScriptCode(character), true);

82 }	82 }

83 }	83 }

84 uset_close(exemplar_set);	84 uset_close(exemplar_set);

85 }	85 }

86	86

87 // Returns whether or not the given character is a character used by the	87 // Returns whether or not the given character is a character used by the

88 // selected dictionary.	88 // selected dictionary.

89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {	89 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {

90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);	90 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);

91 }	91 }

92	92

93 // Returns whether or not the given character is a character used by	93 // Returns whether or not the given character is a character used by

94 // contractions.	94 // contractions.

95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {	95 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {

96 std::map<UChar32, bool>::const_iterator iterator;	96 std::map<UChar32, bool>::const_iterator iterator;

97 iterator = middle_letters_.find(character);	97 iterator = middle_letters_.find(character);

98 if (iterator == middle_letters_.end())	98 if (iterator == middle_letters_.end())

99 return false;	99 return false;

100 return iterator->second;	100 return iterator->second;

101 }	101 }

102	102

103 // Initializes the mapping table.	103 // Initializes the mapping table.

104 void SpellcheckCharAttribute::InitializeScriptTable() {	104 void SpellcheckCharAttribute::InitializeScriptTable() {

105 for (int i = 0; i < arraysize(script_attributes_); i++)	105 for (size_t i = 0; i < arraysize(script_attributes_); ++i)

106 script_attributes_[i] = false;	106 script_attributes_[i] = false;

107 }	107 }

108	108

109 // Retrieves the ICU script code.	109 // Retrieves the ICU script code.

110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {	110 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {

111 UErrorCode status = U_ZERO_ERROR;	111 UErrorCode status = U_ZERO_ERROR;

112 UScriptCode script_code = uscript_getScript(character, &status);	112 UScriptCode script_code = uscript_getScript(character, &status);

113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;	113 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;

114 }	114 }

115	115

116 // Updates the mapping table from an ICU script code to its attribute, i.e.	116 // Updates the mapping table from an ICU script code to its attribute, i.e.

117 // whether or not a script is used by the selected dictionary.	117 // whether or not a script is used by the selected dictionary.

118 void SpellcheckCharAttribute::SetWordScript(const int script_code,	118 void SpellcheckCharAttribute::SetWordScript(const int script_code,

119 bool in_use) {	119 bool in_use) {

120 if (script_code < 0 || script_code >= arraysize(script_attributes_))	120 if (script_code < 0 ||

	121 static_cast<size_t>(script_code) >= arraysize(script_attributes_))

121 return;	122 return;

122 script_attributes_[script_code] = in_use;	123 script_attributes_[script_code] = in_use;

123 }	124 }

124	125

125 // Returns whether or not the given script is used by the selected	126 // Returns whether or not the given script is used by the selected

126 // dictionary.	127 // dictionary.

127 bool SpellcheckCharAttribute::IsWordScript(	128 bool SpellcheckCharAttribute::IsWordScript(

128 const UScriptCode script_code) const {	129 const UScriptCode script_code) const {

129 if (script_code < 0 || script_code >= arraysize(script_attributes_))	130 if (script_code < 0 ||

	131 static_cast<size_t>(script_code) >= arraysize(script_attributes_))

130 return false;	132 return false;

131 return script_attributes_[script_code];	133 return script_attributes_[script_code];

132 }	134 }

133	135

134 SpellcheckWordIterator::SpellcheckWordIterator()	136 SpellcheckWordIterator::SpellcheckWordIterator()

135 : word_(NULL),	137 : word_(NULL),

	138 length_(0),

136 position_(0),	139 position_(0),

137 length_(0),

138 allow_contraction_(false),	140 allow_contraction_(false),

139 attribute_(NULL) {	141 attribute_(NULL) {

140 }	142 }

141	143

142 SpellcheckWordIterator::~SpellcheckWordIterator() {	144 SpellcheckWordIterator::~SpellcheckWordIterator() {

143 }	145 }

144	146

145 // Initialize a word-iterator object.	147 // Initialize a word-iterator object.

146 void SpellcheckWordIterator::Initialize(	148 void SpellcheckWordIterator::Initialize(

147 const SpellcheckCharAttribute* attribute,	149 const SpellcheckCharAttribute* attribute,

148 const wchar_t* word,	150 const char16* word,

149 size_t length,	151 size_t length,

150 bool allow_contraction) {	152 bool allow_contraction) {

151 word_ = word;	153 word_ = word;

152 position_ = 0;	154 position_ = 0;

153 length_ = static_cast<int>(length);	155 length_ = static_cast<int>(length);

154 allow_contraction_ = allow_contraction;	156 allow_contraction_ = allow_contraction;

155 attribute_ = attribute;	157 attribute_ = attribute;

156 }	158 }

157	159

158 // Retrieves a word (or a contraction).	160 // Retrieves a word (or a contraction).

159 // When a contraction is enclosed with contraction characters (e.g. 'isn't',	161 // When a contraction is enclosed with contraction characters (e.g. 'isn't',

160 // 'rock'n'roll'), we should discard the beginning and the end of the	162 // 'rock'n'roll'), we should discard the beginning and the end of the

161 // contraction but we should never split the contraction.	163 // contraction but we should never split the contraction.

162 // To handle this case easily, we should firstly extract a segment consisting	164 // To handle this case easily, we should firstly extract a segment consisting

163 // of word characters and contraction characters, and discard contraction	165 // of word characters and contraction characters, and discard contraction

164 // characters at the beginning and the end of the extracted segment.	166 // characters at the beginning and the end of the extracted segment.

165 bool SpellcheckWordIterator::GetNextWord(std::wstring* word_string,	167 bool SpellcheckWordIterator::GetNextWord(string16* word_string,

166 int* word_start,	168 int* word_start,

167 int* word_length) {	169 int* word_length) {

168 word_string->empty();	170 word_string->empty();

169 *word_start = 0;	171 *word_start = 0;

170 *word_length = 0;	172 *word_length = 0;

171 while (position_ < length_) {	173 while (position_ < length_) {

172 int segment_start = 0;	174 int segment_start = 0;

173 int segment_end = 0;	175 int segment_end = 0;

174 GetSegment(&segment_start, &segment_end);	176 GetSegment(&segment_start, &segment_end);

175 TrimSegment(segment_start, segment_end, word_start, word_length);	177 TrimSegment(segment_start, segment_end, word_start, word_length);

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
232	234

233 // Normalizes a non-terminated string into its canonical form so that	235 // Normalizes a non-terminated string into its canonical form so that

234 // a spellchecker object can check spellings of words which contain ligatures,	236 // a spellchecker object can check spellings of words which contain ligatures,

235 // full-width letters, etc.	237 // full-width letters, etc.

236 // USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but	238 // USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1, but

237 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,	239 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,

238 // etc. For its details, please read the script table in	240 // etc. For its details, please read the script table in

239 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".	241 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".

240 bool SpellcheckWordIterator::Normalize(int input_start,	242 bool SpellcheckWordIterator::Normalize(int input_start,

241 int input_length,	243 int input_length,

242 std::wstring* output_string) const {	244 string16* output_string) const {

243 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"	245 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"

244 // does not only write NFKD and NFKC can compose ligatures into their ASCII	246 // does not only write NFKD and NFKC can compose ligatures into their ASCII

245 // alternatives, but also write NFKC keeps accents of characters.	247 // alternatives, but also write NFKC keeps accents of characters.

246 // Therefore, NFKC seems to be the best option for hunspell.	248 // Therefore, NFKC seems to be the best option for hunspell.

247 // To use NFKC for normalization, the length of the output string is mostly	249 // To use NFKC for normalization, the length of the output string is mostly

248 // equal to the one of the input string. (One exception is ligatures.)	250 // equal to the one of the input string. (One exception is ligatures.)

249 // To avoid the unorm_normalize() function from being called always twice,	251 // To avoid the unorm_normalize() function from being called always twice,

250 // we temporarily allocate |input_length| + 1 characters to the output string	252 // we temporarily allocate |input_length| + 1 characters to the output string

251 // and call the function with it. We re-allocate the output string	253 // and call the function with it. We re-allocate the output string

252 // only if it cannot store the normalized string, i.e. the output string is	254 // only if it cannot store the normalized string, i.e. the output string is

253 // longer than the input one.	255 // longer than the input one.

254 const wchar_t* input_string = &word_[input_start];	256 const char16* input_string = &word_[input_start];

255 UErrorCode error_code = U_ZERO_ERROR;	257 UErrorCode error_code = U_ZERO_ERROR;

256 int output_length = input_length + 1;	258 int output_length = input_length + 1;

257 wchar_t *output_buffer = WriteInto(output_string, output_length);	259 char16* output_buffer = WriteInto(output_string, output_length);

258 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,	260 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,

259 output_buffer, output_length, &error_code);	261 output_buffer, output_length, &error_code);

260 if (error_code == U_BUFFER_OVERFLOW_ERROR) {	262 if (error_code == U_BUFFER_OVERFLOW_ERROR) {

261 error_code = U_ZERO_ERROR;	263 error_code = U_ZERO_ERROR;

262 output_buffer = WriteInto(output_string, ++output_length);	264 output_buffer = WriteInto(output_string, ++output_length);

263 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,	265 output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,

264 output_buffer, output_length, &error_code);	266 output_buffer, output_length, &error_code);

265 }	267 }

266 return (error_code == U_ZERO_ERROR);	268 return (error_code == U_ZERO_ERROR);

267 }	269 }

268	270

OLD	NEW

« no previous file with comments | « chrome/browser/spellcheck_worditerator.h ('k') | chrome/browser/spellchecker.h » ('j') | no next file with comments »