chrome/browser/spellcheck_worditerator.cc - Issue 395007: Move Mac to using renderer spellchecker.

Side by Side Diff: chrome/browser/spellcheck_worditerator.cc

Issue 395007: Move Mac to using renderer spellchecker. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: ui test fix Created 11 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/browser/spellcheck_worditerator.h"

6

7 #include <map>

8 #include <string>

9

10 #include "base/basictypes.h"

11 #include "base/string_util.h"

12 #include "chrome/browser/spellchecker.h"

13

14 #include "third_party/icu/public/common/unicode/normlzr.h"

15 #include "third_party/icu/public/common/unicode/schriter.h"

16 #include "third_party/icu/public/common/unicode/uchar.h"

17 #include "third_party/icu/public/common/unicode/uscript.h"

18 #include "third_party/icu/public/common/unicode/uset.h"

19 #include "third_party/icu/public/i18n/unicode/ulocdata.h"

20

21 SpellcheckCharAttribute::SpellcheckCharAttribute() {

22 InitializeScriptTable();

23

24 // Even though many dictionaries treats numbers and contractions as words and

25 // treats USCRIPT_COMMON characters as word characters, the

26 // SpellcheckWordIterator class treats USCRIPT_COMMON characters as non-word

27 // characters to strictly-distinguish contraction characters from word

28 // characters.

29 SetWordScript(USCRIPT_COMMON, false);

30

31 // Initialize the table of characters used for contractions.

32 // This array consists of the 'Midletter' and 'MidNumLet' characters of the

33 // word-break property list provided by Unicode, Inc.:

34 // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt

35 static const UChar32 kMidLetters[] = {

36 L'\x003A', // MidLetter # COLON

37 L'\x00B7', // MidLetter # MIDDLE DOT

38 L'\x0387', // MidLetter # GREEK ANO TELEIA

39 L'\x05F4', // MidLetter # HEBREW PUNCTUATION GERSHAYIM

40 L'\x2027', // MidLetter # HYPHENATION POINT

41 L'\xFE13', // MidLetter # PRESENTATION FORM FOR VERTICAL COLON

42 L'\xFE55', // MidLetter # SMALL COLON

43 L'\xFF1A', // MidLetter # FULLWIDTH COLON

44 L'\x0027', // MidNumLet # APOSTROPHE

45 L'\x002E', // MidNumLet # FULL STOP

46 L'\x2018', // MidNumLet # LEFT SINGLE QUOTATION MARK

47 L'\x2019', // MidNumLet # RIGHT SINGLE QUOTATION MARK

48 L'\x2024', // MidNumLet # ONE DOT LEADER

49 L'\xFE52', // MidNumLet # SMALL FULL STOP

50 L'\xFF07', // MidNumLet # FULLWIDTH APOSTROPHE

51 L'\xFF0E', // MidNumLet # FULLWIDTH FULL STOP

52 };

53 for (size_t i = 0; i < arraysize(kMidLetters); ++i)

54 middle_letters_[kMidLetters[i]] = true;

55 }

56

57 SpellcheckCharAttribute::~SpellcheckCharAttribute() {

58 }

59

60 // Sets the default language for this object.

61 // This function retrieves the exemplar set to set up the default character

62 // attributes.

63 void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {

64 UErrorCode status = U_ZERO_ERROR;

65 ULocaleData* locale_data = ulocdata_open(language.c_str(), &status);

66 if (U_FAILURE(status))

67 return;

68

69 // Retrieves the exemplar set of the given language and update the

70 // character-attribute table to treat its characters as word characters.

71 USet* exemplar_set = uset_open(1, 0);

72 ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,

73 &status);

74 ulocdata_close(locale_data);

75 if (U_SUCCESS(status)) {

76 int length = uset_size(exemplar_set);

77 for (int i = 0; i < length; ++i) {

78 UChar32 character = uset_charAt(exemplar_set, i);

79 SetWordScript(GetScriptCode(character), true);

80 }

81

82 // Many languages use combining characters to input their characters from

83 // keyboards. On the other hand, this exemplar set does not always include

84 // combining characters for such languages.

85 // To treat such combining characters as word characters, we decompose

86 // this exemplar set and treat the decomposed characters as word characters.

87 icu::UnicodeString composed;

88 for (int i = 0; i < length; ++i)

89 composed.append(uset_charAt(exemplar_set, i));

90

91 icu::UnicodeString decomposed;

92 icu::Normalizer::decompose(composed, FALSE, 0, decomposed, status);

93 if (U_SUCCESS(status)) {

94 icu::StringCharacterIterator iterator(decomposed);

95 UChar32 character = iterator.first32();

96 while (character != icu::CharacterIterator::DONE) {

97 SetWordScript(GetScriptCode(character), true);

98 character = iterator.next32();

99 }

100 }

101 }

102 uset_close(exemplar_set);

103 }

104

105 // Returns whether or not the given character is a character used by the

106 // selected dictionary.

107 bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {

108 return IsWordScript(GetScriptCode(character)) && !u_isdigit(character);

109 }

110

111 // Returns whether or not the given character is a character used by

112 // contractions.

113 bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {

114 std::map<UChar32, bool>::const_iterator iterator;

115 iterator = middle_letters_.find(character);

116 if (iterator == middle_letters_.end())

117 return false;

118 return iterator->second;

119 }

120

121 // Initializes the mapping table.

122 void SpellcheckCharAttribute::InitializeScriptTable() {

123 for (size_t i = 0; i < arraysize(script_attributes_); ++i)

124 script_attributes_[i] = false;

125 }

126

127 // Retrieves the ICU script code.

128 UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {

129 UErrorCode status = U_ZERO_ERROR;

130 UScriptCode script_code = uscript_getScript(character, &status);

131 return U_SUCCESS(status) ? script_code : USCRIPT_INVALID_CODE;

132 }

133

134 // Updates the mapping table from an ICU script code to its attribute, i.e.

135 // whether not a script is used by the selected dictionary.

136 void SpellcheckCharAttribute::SetWordScript(const int script_code,

137 bool in_use) {

138 if (script_code < 0 \|\|

139 static_cast<size_t>(script_code) >= arraysize(script_attributes_))

140 return;

141 script_attributes_[script_code] = in_use;

142 }

143

144 // Returns whether or not the given script is used by the selected

145 // dictionary.

146 bool SpellcheckCharAttribute::IsWordScript(

147 const UScriptCode script_code) const {

148 if (script_code < 0 \|\|

149 static_cast<size_t>(script_code) >= arraysize(script_attributes_))

150 return false;

151 return script_attributes_[script_code];

152 }

153

154 SpellcheckWordIterator::SpellcheckWordIterator()

155 : word_(NULL),

156 length_(0),

157 position_(0),

158 allow_contraction_(false),

159 attribute_(NULL) {

160 }

161

162 SpellcheckWordIterator::~SpellcheckWordIterator() {

163 }

164

165 // Initialize a word-iterator object.

166 void SpellcheckWordIterator::Initialize(

167 const SpellcheckCharAttribute* attribute,

168 const char16* word,

169 size_t length,

170 bool allow_contraction) {

171 word_ = word;

172 position_ = 0;

173 length_ = static_cast<int>(length);

174 allow_contraction_ = allow_contraction;

175 attribute_ = attribute;

176 }

177

178 // Retrieves a word (or a contraction).

179 // When a contraction is enclosed with contraction characters (e.g. 'isn't',

180 // 'rock'n'roll'), we should discard the beginning and the end of the

181 // contraction but we should never split the contraction.

182 // To handle this case easily, we should firstly extract a segment consisting

183 // of word characters and contraction characters, and discard contraction

184 // characters at the beginning and the end of the extracted segment.

185 bool SpellcheckWordIterator::GetNextWord(string16* word_string,

186 int* word_start,

187 int* word_length) {

188 word_string->clear();

189 *word_start = 0;

190 *word_length = 0;

191 while (position_ < length_) {

192 int segment_start = 0;

193 int segment_end = 0;

194 GetSegment(&segment_start, &segment_end);

195 TrimSegment(segment_start, segment_end, word_start, word_length);

196 if (*word_length > 0)

197 return Normalize(word_start, word_length, word_string);

198 }

199

200 return false;

201 }

202

203 // Retrieves a segment consisting of word characters (and contraction

204 // characters if the \|allow_contraction_\| value is true).

205 // When the current position refers to a non-word character, this function

206 // returns a non-empty segment consisting of the character itself. In this

207 // case, the TrimSegment() function discards the character and returns an

208 // empty word (i.e. \|word_length\| == 0).

209 void SpellcheckWordIterator::GetSegment(int* segment_start,

210 int* segment_end) {

211 int position = position_;

212 while (position < length_) {

213 UChar32 character;

214 U16_NEXT(word_, position, length_, character);

215 if (!attribute_->IsWordChar(character)) {

216 if (!allow_contraction_ \|\| !attribute_->IsContractionChar(character))

217 break;

218 }

219 }

220 *segment_start = position_;

221 *segment_end = position;

222 position_ = position;

223 }

224

225 // Discards non-word characters at the beginning and the end of the given

226 // segment.

227 void SpellcheckWordIterator::TrimSegment(int segment_start,

228 int segment_end,

229 int* word_start,

230 int* word_length) const {

231 while (segment_start < segment_end) {

232 UChar32 character;

233 int segment_next = segment_start;

234 U16_NEXT(word_, segment_next, segment_end, character);

235 if (attribute_->IsWordChar(character)) {

236 *word_start = segment_start;

237 break;

238 }

239 segment_start = segment_next;

240 }

241 while (segment_end >= segment_start) {

242 UChar32 character;

243 int segment_prev = segment_end;

244 U16_PREV(word_, segment_start, segment_prev, character);

245 if (attribute_->IsWordChar(character)) {

246 *word_length = segment_end - segment_start;

247 break;

248 }

249 segment_end = segment_prev;

250 }

251 }

252

253 // Normalizes a non-terminated string into its canonical form so that

254 // a spellchecker object can check spellings of words which contain ligatures,

255 // full-width letters, etc.

256 // USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but

257 // also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,

258 // etc. For its details, please read the script table in

259 // "http://www.unicode.org/Public/UNIDATA/Scripts.txt".

260 bool SpellcheckWordIterator::Normalize(int input_start,

261 int input_length,

262 string16* output_string) const {

263 // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"

264 // does not only write NFKD and NFKC can compose ligatures into their ASCII

265 // alternatives, but also write NFKC keeps accents of characters.

266 // Therefore, NFKC seems to be the best option for hunspell.

267 icu::UnicodeString input(FALSE, &word_[input_start], input_length);

268 UErrorCode status = U_ZERO_ERROR;

269 icu::UnicodeString output;

270 icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);

271 if (U_SUCCESS(status))

272 output_string->assign(output.getTerminatedBuffer());

273 return status == U_ZERO_ERROR \|\| status == U_STRING_NOT_TERMINATED_WARNING;

274 }

OLD	NEW

« no previous file with comments | « chrome/browser/spellcheck_worditerator.h ('k') | chrome/browser/spellchecker.h » ('j') | no next file with comments »