chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc - Issue 2198143002: Componentize spellcheck [3]: move renderer/ files to component.

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc

Issue 2198143002: Componentize spellcheck [3]: move renderer/ files to component. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: fix formatting Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.cc ('k') | chrome/renderer/spellchecker/spelling_engine.h » ('j') | components/spellcheck/renderer/spellcheck_multilingual_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include <stddef.h>

6

7 #include <string>

8 #include <vector>

9

10 #include "base/format_macros.h"

11 #include "base/i18n/break_iterator.h"

12 #include "base/macros.h"

13 #include "base/strings/string_split.h"

14 #include "base/strings/stringprintf.h"

15 #include "base/strings/utf_string_conversions.h"

16 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

17 #include "testing/gtest/include/gtest/gtest.h"

18

19 using base::i18n::BreakIterator;

20 using WordIteratorStatus = SpellcheckWordIterator::WordIteratorStatus;

21

22 namespace {

23

// One row of the SplitWord table: the language whose rules are loaded, how
// contractions are handled, and the words expected out of the shared input.
struct TestCase {
  // Language code passed to SpellcheckCharAttribute::SetDefaultLanguage().
  const char* language;

  // Forwarded as the second argument of SpellcheckWordIterator::Initialize().
  bool allow_contraction;

  // Expected words, in order, separated by single spaces.
  const wchar_t* expected_words;
};

29

30 base::string16 GetRulesForLanguage(const std::string& language) {

31 SpellcheckCharAttribute attribute;

32 attribute.SetDefaultLanguage(language);

33 return attribute.GetRuleSet(true);

34 }

35

36 WordIteratorStatus GetNextNonSkippableWord(SpellcheckWordIterator* iterator,

37 base::string16* word_string,

38 int* word_start,

39 int* word_length) {

40 WordIteratorStatus status = SpellcheckWordIterator::IS_SKIPPABLE;

41 while (status == SpellcheckWordIterator::IS_SKIPPABLE)

42 status = iterator->GetNextWord(word_string, word_start, word_length);

43 return status;

44 }

45

46 } // namespace

47

48 // Tests whether or not our SpellcheckWordIterator can extract words used by the

49 // specified language from a multi-language text.

50 TEST(SpellcheckWordIteratorTest, SplitWord) {

51 // An input text. This text includes words of several languages. (Some words

52 // are not separated with whitespace characters.) Our SpellcheckWordIterator

53 // should extract the words used by the specified language from this text and

54 // normalize them so our spell-checker can check their spellings. If

55 // characters are found that are not from the specified language the test

56 // skips them.

57 const wchar_t kTestText[] =

58 // Graphic characters

59 L"!@#$%^&*()"

60 // Latin (including a contraction character and a ligature).

61 L"hello:hello a\xFB03x"

62 // Greek

63 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"

64 // Cyrillic

65 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"

66 L"\x0443\x0439\x0442\x0435"

67 // Hebrew (including niqquds)

68 L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "

69 // Hebrew words with U+0027 and U+05F3

70 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "

71 // Hebrew words with U+0022 and U+05F4

72 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "

73 // Hebrew words enclosed with ASCII quotes.

74 L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"

75 // Arabic (including vowel marks)

76 L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627\x0645\x064f "

77 L"\x0639\x064e\x0644\x064e\x064a\x0652\x0643\x064f\x0645\x0652 "

78 // Farsi/Persian (including vowel marks)

79 // Make sure \u064b - \u0652 are removed.

80 L"\x0647\x0634\x064e\x0631\x062d "

81 L"\x0647\x062e\x0648\x0627\x0647 "

82 L"\x0650\x062f\x0631\x062f "

83 L"\x0631\x0645\x0627\x0646\x0652 "

84 L"\x0633\x0631\x0651 "

85 L"\x0646\x0646\x064e\x062c\x064f\x0633 "

86 L"\x0627\x0644\x062d\x0645\x062f "

87 // Also make sure that class "Lm" (the \u0640) is filtered out too.

88 L"\x062c\x062c\x0640\x062c\x062c"

89 // Hindi

90 L"\x0930\x093E\x091C\x0927\x093E\x0928"

91 // Thai

92 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"

93 L"\x0e23\x0e31\x0e1a"

94 // Hiraganas

95 L"\x3053\x3093\x306B\x3061\x306F"

96 // CJKV ideographs

97 L"\x4F60\x597D"

98 // Hangul Syllables

99 L"\xC548\xB155\xD558\xC138\xC694"

100 // Full-width latin : Hello

101 L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "

102 L"e.g.,";

103

104 // The languages and expected results used in this test.

105 static const TestCase kTestCases[] = {

106 {

107 // English (keep contraction words)

108 "en-US", true, L"hello:hello affix Hello e.g"

109 }, {

110 // English (split contraction words)

111 "en-US", false, L"hello hello affix Hello e g"

112 }, {

113 // Greek

114 "el-GR", true,

115 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"

116 }, {

117 // Russian

118 "ru-RU", true,

119 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"

120 L"\x0443\x0439\x0442\x0435"

121 }, {

122 // Hebrew

123 "he-IL", true,

124 L"\x05e9\x05dc\x05d5\x05dd "

125 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "

126 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "

127 L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"

128 }, {

129 // Arabic

130 "ar", true,

131 L"\x0627\x0644\x0633\x0644\x0627\x0645 "

132 L"\x0639\x0644\x064a\x0643\x0645 "

133 // Farsi/Persian

134 L"\x0647\x0634\x0631\x062d "

135 L"\x0647\x062e\x0648\x0627\x0647 "

136 L"\x062f\x0631\x062f "

137 L"\x0631\x0645\x0627\x0646 "

138 L"\x0633\x0631 "

139 L"\x0646\x0646\x062c\x0633 "

140 L"\x0627\x0644\x062d\x0645\x062f "

141 L"\x062c\x062c\x062c\x062c"

142 }, {

143 // Hindi

144 "hi-IN", true,

145 L"\x0930\x093E\x091C\x0927\x093E\x0928"

146 }, {

147 // Thai

148 "th-TH", true,

149 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"

150 L"\x0e23\x0e31\x0e1a"

151 }, {

152 // Korean

153 "ko-KR", true,

154 L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"

155 L"\x1109\x1166\x110b\x116d"

156 },

157 };

158

159 for (size_t i = 0; i < arraysize(kTestCases); ++i) {

160 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,

161 kTestCases[i].language));

162

163 SpellcheckCharAttribute attributes;

164 attributes.SetDefaultLanguage(kTestCases[i].language);

165

166 base::string16 input(base::WideToUTF16(kTestText));

167 SpellcheckWordIterator iterator;

168 EXPECT_TRUE(iterator.Initialize(&attributes,

169 kTestCases[i].allow_contraction));

170 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));

171

172 std::vector<base::string16> expected_words = base::SplitString(

173 base::WideToUTF16(kTestCases[i].expected_words),

174 base::string16(1, ' '), base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);

175

176 base::string16 actual_word;

177 int actual_start, actual_len;

178 size_t index = 0;

179 for (SpellcheckWordIterator::WordIteratorStatus status =

180 iterator.GetNextWord(&actual_word, &actual_start, &actual_len);

181 status != SpellcheckWordIterator::IS_END_OF_TEXT;

182 status =

183 iterator.GetNextWord(&actual_word, &actual_start, &actual_len)) {

184 if (status == SpellcheckWordIterator::WordIteratorStatus::IS_SKIPPABLE)

185 continue;

186

187 EXPECT_TRUE(index < expected_words.size());

188 if (index < expected_words.size())

189 EXPECT_EQ(expected_words[index], actual_word);

190 ++index;

191 }

192 }

193 }

194

195 // Tests whether our SpellcheckWordIterator extracts an empty word without

196 // getting stuck in an infinite loop when inputting a Khmer text. (This is a

197 // regression test for Issue 46278.)

198 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {

199 SpellcheckCharAttribute attributes;

200 attributes.SetDefaultLanguage("en-US");

201

202 const wchar_t kTestText[] = L"\x1791\x17c1\x002e";

203 base::string16 input(base::WideToUTF16(kTestText));

204

205 SpellcheckWordIterator iterator;

206 EXPECT_TRUE(iterator.Initialize(&attributes, true));

207 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));

208

209 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following

210 // iterator.GetNextWord() calls get stuck in an infinite loop. Therefore, this

211 // test succeeds if this call returns without timeouts.

212 base::string16 actual_word;

213 int actual_start, actual_len;

214 WordIteratorStatus status = GetNextNonSkippableWord(

215 &iterator, &actual_word, &actual_start, &actual_len);

216

217 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_END_OF_TEXT, status);

218 EXPECT_EQ(0, actual_start);

219 EXPECT_EQ(0, actual_len);

220 }

221

222 // Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters

223 // on LTR languages. On the other hand, it should not treat ASCII numbers as

224 // word characters on RTL languages because they change the text direction from

225 // RTL to LTR.

226 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {

227 // A set of a language, a dummy word, and a text direction used in this test.

228 // For each language, this test splits a dummy word, which consists of ASCII

229 // numbers and an alphabet of the language, into words. When ASCII numbers are

230 // treated as word characters, the split word becomes equal to the dummy word.

231 // Otherwise, the split word does not include ASCII numbers.

232 static const struct {

233 const char* language;

234 const wchar_t* text;

235 bool left_to_right;

236 } kTestCases[] = {

237 {

238 // English

239 "en-US", L"0123456789" L"a", true,

240 }, {

241 // Greek

242 "el-GR", L"0123456789" L"\x03B1", true,

243 }, {

244 // Russian

245 "ru-RU", L"0123456789" L"\x0430", true,

246 }, {

247 // Hebrew

248 "he-IL", L"0123456789" L"\x05D0", false,

249 }, {

250 // Arabic

251 "ar", L"0123456789" L"\x0627", false,

252 }, {

253 // Hindi

254 "hi-IN", L"0123456789" L"\x0905", true,

255 }, {

256 // Thai

257 "th-TH", L"0123456789" L"\x0e01", true,

258 }, {

259 // Korean

260 "ko-KR", L"0123456789" L"\x1100\x1161", true,

261 },

262 };

263

264 for (size_t i = 0; i < arraysize(kTestCases); ++i) {

265 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,

266 kTestCases[i].language));

267

268 SpellcheckCharAttribute attributes;

269 attributes.SetDefaultLanguage(kTestCases[i].language);

270

271 base::string16 input_word(base::WideToUTF16(kTestCases[i].text));

272 SpellcheckWordIterator iterator;

273 EXPECT_TRUE(iterator.Initialize(&attributes, true));

274 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));

275

276 base::string16 actual_word;

277 int actual_start, actual_len;

278 WordIteratorStatus status = GetNextNonSkippableWord(

279 &iterator, &actual_word, &actual_start, &actual_len);

280

281 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);

282 if (kTestCases[i].left_to_right)

283 EXPECT_EQ(input_word, actual_word);

284 else

285 EXPECT_NE(input_word, actual_word);

286 }

287 }

288

289 // Verify SpellcheckWordIterator treats typographical apostrophe as a part of

290 // the word.

291 TEST(SpellcheckWordIteratorTest, TypographicalApostropheIsPartOfWord) {

292 static const struct {

293 const char* language;

294 const wchar_t* input;

295 const wchar_t* expected;

296 } kTestCases[] = {

297 // Typewriter apostrophe:

298 {"en-AU", L"you're", L"you're"},

299 {"en-CA", L"you're", L"you're"},

300 {"en-GB", L"you're", L"you're"},

301 {"en-US", L"you're", L"you're"},

302 {"en-US", L"!!!!you're", L"you're"},

303 // Typographical apostrophe:

304 {"en-AU", L"you\x2019re", L"you\x2019re"},

305 {"en-CA", L"you\x2019re", L"you\x2019re"},

306 {"en-GB", L"you\x2019re", L"you\x2019re"},

307 {"en-US", L"you\x2019re", L"you\x2019re"},

308 {"en-US", L"....you\x2019re", L"you\x2019re"},

309 };

310

311 for (size_t i = 0; i < arraysize(kTestCases); ++i) {

312 SpellcheckCharAttribute attributes;

313 attributes.SetDefaultLanguage(kTestCases[i].language);

314

315 base::string16 input_word(base::WideToUTF16(kTestCases[i].input));

316 base::string16 expected_word(base::WideToUTF16(kTestCases[i].expected));

317 SpellcheckWordIterator iterator;

318 EXPECT_TRUE(iterator.Initialize(&attributes, true));

319 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));

320

321 base::string16 actual_word;

322 int actual_start, actual_len;

323 WordIteratorStatus status = GetNextNonSkippableWord(

324 &iterator, &actual_word, &actual_start, &actual_len);

325

326 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);

327 EXPECT_EQ(expected_word, actual_word);

328 EXPECT_LE(0, actual_start);

329 EXPECT_EQ(expected_word.length(),

330 static_cast<base::string16::size_type>(actual_len));

331 }

332 }

333

334 TEST(SpellcheckWordIteratorTest, Initialization) {

335 // Test initialization works when a default language is set.

336 {

337 SpellcheckCharAttribute attributes;

338 attributes.SetDefaultLanguage("en-US");

339

340 SpellcheckWordIterator iterator;

341 EXPECT_TRUE(iterator.Initialize(&attributes, true));

342 }

343

344 // Test initialization fails when no default language is set.

345 {

346 SpellcheckCharAttribute attributes;

347

348 SpellcheckWordIterator iterator;

349 EXPECT_FALSE(iterator.Initialize(&attributes, true));

350 }

351 }

352

353 // This test uses English rules to check that different character set

354 // combinations properly find word breaks and skippable characters.

355 TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) {

356 // A string containing the English word "foo", followed by two Khmer

357 // characters, the English word "Can", and then two Russian characters and

358 // punctuation.

359 base::string16 text(

360 base::WideToUTF16(L"foo \x1791\x17C1 Can \x041C\x0438..."));

361 BreakIterator iter(text, GetRulesForLanguage("en-US"));

362 ASSERT_TRUE(iter.Init());

363

364 EXPECT_TRUE(iter.Advance());

365 // Finds "foo".

366 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());

367 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

368 EXPECT_TRUE(iter.Advance());

369 // Finds the space and then the Khmer characters.

370 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

371 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

372 EXPECT_TRUE(iter.Advance());

373 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());

374 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

375 EXPECT_TRUE(iter.Advance());

376 // Finds the next space and "Can".

377 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

378 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

379 EXPECT_TRUE(iter.Advance());

380 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());

381 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

382 EXPECT_TRUE(iter.Advance());

383 // Finds the next space and each Russian character.

384 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

385 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

386 EXPECT_TRUE(iter.Advance());

387 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());

388 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

389 EXPECT_TRUE(iter.Advance());

390 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());

391 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

392 EXPECT_TRUE(iter.Advance());

393 // Finds the periods at the end.

394 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

395 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

396 EXPECT_TRUE(iter.Advance());

397 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

398 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

399 EXPECT_TRUE(iter.Advance());

400 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

401 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

402 EXPECT_FALSE(iter.Advance());

403 }

404

405 // This test uses Russian rules to check that different character set

406 // combinations properly find word breaks and skippable characters.

407 TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) {

408 // A string containing punctuation followed by two Russian characters, the

409 // English word "Can", and then two Khmer characters.

410 base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17C1 "));

411 BreakIterator iter(text, GetRulesForLanguage("ru-RU"));

412 ASSERT_TRUE(iter.Init());

413

414 EXPECT_TRUE(iter.Advance());

415 // Finds the period and semicolon.

416 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

417 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

418 EXPECT_TRUE(iter.Advance());

419 EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString());

420 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

421 EXPECT_TRUE(iter.Advance());

422 // Finds all the Russian characters.

423 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());

424 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

425 EXPECT_TRUE(iter.Advance());

426 // Finds the space and each character in "Can".

427 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

428 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

429 EXPECT_TRUE(iter.Advance());

430 EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString());

431 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

432 EXPECT_TRUE(iter.Advance());

433 EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString());

434 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

435 EXPECT_TRUE(iter.Advance());

436 EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString());

437 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

438 EXPECT_TRUE(iter.Advance());

439 // Finds the next space, the Khmer characters, and the last two spaces.

440 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

441 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

442 EXPECT_TRUE(iter.Advance());

443 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());

444 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

445 EXPECT_TRUE(iter.Advance());

446 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

447 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

448 EXPECT_TRUE(iter.Advance());

449 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

450 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

451 EXPECT_FALSE(iter.Advance());

452 }

453

454 // This test uses Khmer rules to check that different character set combinations

455 // properly find word breaks and skippable characters. Khmer does not use spaces

456 // between words and uses a dictionary to determine word breaks instead.

457 TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) {

458 // A string containing two Russian characters followed by two, three, and

459 // two-character Khmer words, and then English characters and punctuation.

460 base::string16 text(base::WideToUTF16(

461 L"\x041C\x0438 \x178F\x17BE\x179B\x17C4\x1780\x1798\x1780zoo. ,"));

462 BreakIterator iter(text, GetRulesForLanguage("km"));

463 ASSERT_TRUE(iter.Init());

464

465 EXPECT_TRUE(iter.Advance());

466 // Finds each Russian character and the space.

467 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());

468 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

469 EXPECT_TRUE(iter.Advance());

470 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());

471 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

472 EXPECT_TRUE(iter.Advance());

473 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

474 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

475 EXPECT_TRUE(iter.Advance());

476 // Finds the first two-character Khmer word.

477 EXPECT_EQ(base::WideToUTF16(L"\x178F\x17BE"), iter.GetString());

478 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

479 EXPECT_TRUE(iter.Advance());

480 // Finds the three-character Khmer word and then the next two-character word.

481 // Note: Technically these are two different Khmer words so the Khmer language

482 // rule should find a break between them but due to the heuristic/statistical

483 // nature of the Khmer word breaker it does not.

484 EXPECT_EQ(base::WideToUTF16(L"\x179B\x17C4\x1780\x1798\x1780"),

485 iter.GetString());

486 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

487 EXPECT_TRUE(iter.Advance());

488 // Finds each character in "zoo".

489 EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString());

490 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

491 EXPECT_TRUE(iter.Advance());

492 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());

493 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

494 EXPECT_TRUE(iter.Advance());

495 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());

496 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

497 EXPECT_TRUE(iter.Advance());

498 // Finds the period, space, and comma.

499 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

500 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

501 EXPECT_TRUE(iter.Advance());

502 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

503 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

504 EXPECT_TRUE(iter.Advance());

505 EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString());

506 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

507 EXPECT_FALSE(iter.Advance());

508 }

OLD	NEW