Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(110)

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc

Issue 2159283003: [WIP][DO NOT LAND] Componentize spellcheck Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <stddef.h>
6
7 #include <string>
8 #include <vector>
9
10 #include "base/format_macros.h"
11 #include "base/i18n/break_iterator.h"
12 #include "base/macros.h"
13 #include "base/strings/string_split.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
17 #include "testing/gtest/include/gtest/gtest.h"
18
19 using base::i18n::BreakIterator;
20 using WordIteratorStatus = SpellcheckWordIterator::WordIteratorStatus;
21
22 namespace {
23
24 struct TestCase {  // One SplitWord scenario: a language and the words expected for it.
25 const char* language;  // Language code handed to SpellcheckCharAttribute::SetDefaultLanguage().
26 bool allow_contraction;  // Forwarded as the second argument of SpellcheckWordIterator::Initialize().
27 const wchar_t* expected_words;  // Expected words in order, separated by single spaces (split on ' ' by the test).
28 };
29
30 base::string16 GetRulesForLanguage(const std::string& language) {
31 SpellcheckCharAttribute attribute;
32 attribute.SetDefaultLanguage(language);
33 return attribute.GetRuleSet(true);
34 }
35
36 WordIteratorStatus GetNextNonSkippableWord(SpellcheckWordIterator* iterator,
37 base::string16* word_string,
38 int* word_start,
39 int* word_length) {
40 WordIteratorStatus status = SpellcheckWordIterator::IS_SKIPPABLE;
41 while (status == SpellcheckWordIterator::IS_SKIPPABLE)
42 status = iterator->GetNextWord(word_string, word_start, word_length);
43 return status;
44 }
45
46 } // namespace
47
48 // Tests whether or not our SpellcheckWordIterator can extract words used by the
49 // specified language from a multi-language text.
50 TEST(SpellcheckWordIteratorTest, SplitWord) {
51 // An input text. This text includes words of several languages. (Some words
52 // are not separated with whitespace characters.) Our SpellcheckWordIterator
53 // should extract the words used by the specified language from this text and
54 // normalize them so our spell-checker can check their spellings. If
55 // characters are found that are not from the specified language the test
56 // skips them.
57 const wchar_t kTestText[] =
58 // Graphic characters
59 L"!@#$%^&*()"
60 // Latin (including a contraction character and a ligature).
61 L"hello:hello a\xFB03x"
62 // Greek
63 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
64 // Cyrillic
65 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
66 L"\x0443\x0439\x0442\x0435"
67 // Hebrew (including niqquds)
68 L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
69 // Hebrew words with U+0027 and U+05F3
70 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
71 // Hebrew words with U+0022 and U+05F4
72 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
73 // Hebrew words enclosed with ASCII quotes.
74 L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
75 // Arabic (including vowel marks)
76 L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627\x0645\x064f "
77 L"\x0639\x064e\x0644\x064e\x064a\x0652\x0643\x064f\x0645\x0652 "
78 // Farsi/Persian (including vowel marks)
79 // Make sure \u064b - \u0652 are removed.
80 L"\x0647\x0634\x064e\x0631\x062d "
81 L"\x0647\x062e\x0648\x0627\x0647 "
82 L"\x0650\x062f\x0631\x062f "
83 L"\x0631\x0645\x0627\x0646\x0652 "
84 L"\x0633\x0631\x0651 "
85 L"\x0646\x0646\x064e\x062c\x064f\x0633 "
86 L"\x0627\x0644\x062d\x0645\x062f "
87 // Also make sure that class "Lm" (the \u0640) is filtered out too.
88 L"\x062c\x062c\x0640\x062c\x062c"
89 // Hindi
90 L"\x0930\x093E\x091C\x0927\x093E\x0928"
91 // Thai
92 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
93 L"\x0e23\x0e31\x0e1a"
94 // Hiraganas
95 L"\x3053\x3093\x306B\x3061\x306F"
96 // CJKV ideographs
97 L"\x4F60\x597D"
98 // Hangul Syllables
99 L"\xC548\xB155\xD558\xC138\xC694"
100 // Full-width latin : Hello
101 L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
102 L"e.g.,";
103
104 // The languages and expected results used in this test.
105 static const TestCase kTestCases[] = {
106 {
107 // English (keep contraction words)
108 "en-US", true, L"hello:hello affix Hello e.g"
109 }, {
110 // English (split contraction words)
111 "en-US", false, L"hello hello affix Hello e g"
112 }, {
113 // Greek
114 "el-GR", true,
115 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
116 }, {
117 // Russian
118 "ru-RU", true,
119 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
120 L"\x0443\x0439\x0442\x0435"
121 }, {
122 // Hebrew
123 "he-IL", true,
124 L"\x05e9\x05dc\x05d5\x05dd "
125 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
126 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
127 L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
128 }, {
129 // Arabic
130 "ar", true,
131 L"\x0627\x0644\x0633\x0644\x0627\x0645 "
132 L"\x0639\x0644\x064a\x0643\x0645 "
133 // Farsi/Persian
134 L"\x0647\x0634\x0631\x062d "
135 L"\x0647\x062e\x0648\x0627\x0647 "
136 L"\x062f\x0631\x062f "
137 L"\x0631\x0645\x0627\x0646 "
138 L"\x0633\x0631 "
139 L"\x0646\x0646\x062c\x0633 "
140 L"\x0627\x0644\x062d\x0645\x062f "
141 L"\x062c\x062c\x062c\x062c"
142 }, {
143 // Hindi
144 "hi-IN", true,
145 L"\x0930\x093E\x091C\x0927\x093E\x0928"
146 }, {
147 // Thai
148 "th-TH", true,
149 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
150 L"\x0e23\x0e31\x0e1a"
151 }, {
152 // Korean
153 "ko-KR", true,
154 L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
155 L"\x1109\x1166\x110b\x116d"
156 },
157 };
158
159 for (size_t i = 0; i < arraysize(kTestCases); ++i) {
160 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
161 kTestCases[i].language));
162
163 SpellcheckCharAttribute attributes;
164 attributes.SetDefaultLanguage(kTestCases[i].language);
165
166 base::string16 input(base::WideToUTF16(kTestText));
167 SpellcheckWordIterator iterator;
168 EXPECT_TRUE(iterator.Initialize(&attributes,
169 kTestCases[i].allow_contraction));
170 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
171
172 std::vector<base::string16> expected_words = base::SplitString(
173 base::WideToUTF16(kTestCases[i].expected_words),
174 base::string16(1, ' '), base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
175
176 base::string16 actual_word;
177 int actual_start, actual_len;
178 size_t index = 0;
179 for (SpellcheckWordIterator::WordIteratorStatus status =
180 iterator.GetNextWord(&actual_word, &actual_start, &actual_len);
181 status != SpellcheckWordIterator::IS_END_OF_TEXT;
182 status =
183 iterator.GetNextWord(&actual_word, &actual_start, &actual_len)) {
184 if (status == SpellcheckWordIterator::WordIteratorStatus::IS_SKIPPABLE)
185 continue;
186
187 EXPECT_TRUE(index < expected_words.size());
188 if (index < expected_words.size())
189 EXPECT_EQ(expected_words[index], actual_word);
190 ++index;
191 }
192 }
193 }
194
195 // Tests whether our SpellcheckWordIterator extracts an empty word without
196 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
197 // regression test for Issue 46278.)
198 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
199 SpellcheckCharAttribute attributes;
200 attributes.SetDefaultLanguage("en-US");
201
202 const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
203 base::string16 input(base::WideToUTF16(kTestText));
204
205 SpellcheckWordIterator iterator;
206 EXPECT_TRUE(iterator.Initialize(&attributes, true));
207 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
208
209 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
210 // iterator.GetNextWord() calls get stuck in an infinite loop. Therefore, this
211 // test succeeds if this call returns without timeouts.
212 base::string16 actual_word;
213 int actual_start, actual_len;
214 WordIteratorStatus status = GetNextNonSkippableWord(
215 &iterator, &actual_word, &actual_start, &actual_len);
216
217 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_END_OF_TEXT, status);
218 EXPECT_EQ(0, actual_start);
219 EXPECT_EQ(0, actual_len);
220 }
221
222 // Verify our SpellcheckWordIterator can treat ASCII numbers as word characters
223 // on LTR languages. On the other hand, it should not treat ASCII numbers as
224 // word characters on RTL languages because they change the text direction from
225 // RTL to LTR.
226 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
227 // A set of a language, a dummy word, and a text direction used in this test.
228 // For each language, this test splits a dummy word, which consists of ASCII
229 // numbers and an alphabet of the language, into words. When ASCII numbers are
230 // treated as word characters, the split word becomes equal to the dummy word.
231 // Otherwise, the split word does not include ASCII numbers.
232 static const struct {
233 const char* language;
234 const wchar_t* text;
235 bool left_to_right;
236 } kTestCases[] = {
237 {
238 // English
239 "en-US", L"0123456789" L"a", true,
240 }, {
241 // Greek
242 "el-GR", L"0123456789" L"\x03B1", true,
243 }, {
244 // Russian
245 "ru-RU", L"0123456789" L"\x0430", true,
246 }, {
247 // Hebrew
248 "he-IL", L"0123456789" L"\x05D0", false,
249 }, {
250 // Arabic
251 "ar", L"0123456789" L"\x0627", false,
252 }, {
253 // Hindi
254 "hi-IN", L"0123456789" L"\x0905", true,
255 }, {
256 // Thai
257 "th-TH", L"0123456789" L"\x0e01", true,
258 }, {
259 // Korean
260 "ko-KR", L"0123456789" L"\x1100\x1161", true,
261 },
262 };
263
264 for (size_t i = 0; i < arraysize(kTestCases); ++i) {
265 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
266 kTestCases[i].language));
267
268 SpellcheckCharAttribute attributes;
269 attributes.SetDefaultLanguage(kTestCases[i].language);
270
271 base::string16 input_word(base::WideToUTF16(kTestCases[i].text));
272 SpellcheckWordIterator iterator;
273 EXPECT_TRUE(iterator.Initialize(&attributes, true));
274 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
275
276 base::string16 actual_word;
277 int actual_start, actual_len;
278 WordIteratorStatus status = GetNextNonSkippableWord(
279 &iterator, &actual_word, &actual_start, &actual_len);
280
281 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);
282 if (kTestCases[i].left_to_right)
283 EXPECT_EQ(input_word, actual_word);
284 else
285 EXPECT_NE(input_word, actual_word);
286 }
287 }
288
289 // Verify SpellcheckWordIterator treats typographical apostrophe as a part of
290 // the word.
291 TEST(SpellcheckWordIteratorTest, TypographicalApostropheIsPartOfWord) {
292 static const struct {
293 const char* language;
294 const wchar_t* input;
295 const wchar_t* expected;
296 } kTestCases[] = {
297 // Typewriter apostrophe:
298 {"en-AU", L"you're", L"you're"},
299 {"en-CA", L"you're", L"you're"},
300 {"en-GB", L"you're", L"you're"},
301 {"en-US", L"you're", L"you're"},
302 {"en-US", L"!!!!you're", L"you're"},
303 // Typographical apostrophe:
304 {"en-AU", L"you\x2019re", L"you\x2019re"},
305 {"en-CA", L"you\x2019re", L"you\x2019re"},
306 {"en-GB", L"you\x2019re", L"you\x2019re"},
307 {"en-US", L"you\x2019re", L"you\x2019re"},
308 {"en-US", L"....you\x2019re", L"you\x2019re"},
309 };
310
311 for (size_t i = 0; i < arraysize(kTestCases); ++i) {
312 SpellcheckCharAttribute attributes;
313 attributes.SetDefaultLanguage(kTestCases[i].language);
314
315 base::string16 input_word(base::WideToUTF16(kTestCases[i].input));
316 base::string16 expected_word(base::WideToUTF16(kTestCases[i].expected));
317 SpellcheckWordIterator iterator;
318 EXPECT_TRUE(iterator.Initialize(&attributes, true));
319 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
320
321 base::string16 actual_word;
322 int actual_start, actual_len;
323 WordIteratorStatus status = GetNextNonSkippableWord(
324 &iterator, &actual_word, &actual_start, &actual_len);
325
326 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);
327 EXPECT_EQ(expected_word, actual_word);
328 EXPECT_LE(0, actual_start);
329 EXPECT_EQ(expected_word.length(),
330 static_cast<base::string16::size_type>(actual_len));
331 }
332 }
333
334 TEST(SpellcheckWordIteratorTest, Initialization) {
335 // Test initialization works when a default language is set.
336 {
337 SpellcheckCharAttribute attributes;
338 attributes.SetDefaultLanguage("en-US");
339
340 SpellcheckWordIterator iterator;
341 EXPECT_TRUE(iterator.Initialize(&attributes, true));
342 }
343
344 // Test initialization fails when no default language is set.
345 {
346 SpellcheckCharAttribute attributes;
347
348 SpellcheckWordIterator iterator;
349 EXPECT_FALSE(iterator.Initialize(&attributes, true));
350 }
351 }
352
353 // This test uses English rules to check that different character set
354 // combinations properly find word breaks and skippable characters.
355 TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) {
356 // A string containing the English word "foo", followed by two Khmer
357 // characters, the English word "Can", and then two Russian characters and
358 // punctuation.
359 base::string16 text(
360 base::WideToUTF16(L"foo \x1791\x17C1 Can \x041C\x0438..."));
361 BreakIterator iter(text, GetRulesForLanguage("en-US"));
362 ASSERT_TRUE(iter.Init());
363
364 EXPECT_TRUE(iter.Advance());
365 // Finds "foo".
366 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());
367 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
368 EXPECT_TRUE(iter.Advance());
369 // Finds the space and then the Khmer characters.
370 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
371 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
372 EXPECT_TRUE(iter.Advance());
373 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
374 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
375 EXPECT_TRUE(iter.Advance());
376 // Finds the next space and "Can".
377 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
378 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
379 EXPECT_TRUE(iter.Advance());
380 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());
381 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
382 EXPECT_TRUE(iter.Advance());
383 // Finds the next space and each Russian character.
384 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
385 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
386 EXPECT_TRUE(iter.Advance());
387 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());
388 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
389 EXPECT_TRUE(iter.Advance());
390 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());
391 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
392 EXPECT_TRUE(iter.Advance());
393 // Finds the periods at the end.
394 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
395 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
396 EXPECT_TRUE(iter.Advance());
397 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
398 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
399 EXPECT_TRUE(iter.Advance());
400 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
401 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
402 EXPECT_FALSE(iter.Advance());
403 }
404
405 // This test uses Russian rules to check that different character set
406 // combinations properly find word breaks and skippable characters.
407 TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) {
408 // A string containing punctuation followed by two Russian characters, the
409 // English word "Can", and then two Khmer characters.
410 base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17C1 "));
411 BreakIterator iter(text, GetRulesForLanguage("ru-RU"));
412 ASSERT_TRUE(iter.Init());
413
414 EXPECT_TRUE(iter.Advance());
415 // Finds the period and semicolon.
416 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
417 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
418 EXPECT_TRUE(iter.Advance());
419 EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString());
420 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
421 EXPECT_TRUE(iter.Advance());
422 // Finds all the Russian characters.
423 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());
424 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
425 EXPECT_TRUE(iter.Advance());
426 // Finds the space and each character in "Can".
427 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
428 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
429 EXPECT_TRUE(iter.Advance());
430 EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString());
431 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
432 EXPECT_TRUE(iter.Advance());
433 EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString());
434 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
435 EXPECT_TRUE(iter.Advance());
436 EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString());
437 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
438 EXPECT_TRUE(iter.Advance());
439 // Finds the next space, the Khmer characters, and the last two spaces.
440 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
441 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
442 EXPECT_TRUE(iter.Advance());
443 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
444 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
445 EXPECT_TRUE(iter.Advance());
446 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
447 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
448 EXPECT_TRUE(iter.Advance());
449 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
450 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
451 EXPECT_FALSE(iter.Advance());
452 }
453
454 // This test uses Khmer rules to check that different character set combinations
455 // properly find word breaks and skippable characters. Khmer does not use spaces
456 // between words and uses a dictionary to determine word breaks instead.
457 TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) {
458 // A string containing two Russian characters followed by two, three, and
459 // two-character Khmer words, and then English characters and punctuation.
460 base::string16 text(base::WideToUTF16(
461 L"\x041C\x0438 \x178F\x17BE\x179B\x17C4\x1780\x1798\x1780zoo. ,"));
462 BreakIterator iter(text, GetRulesForLanguage("km"));
463 ASSERT_TRUE(iter.Init());
464
465 EXPECT_TRUE(iter.Advance());
466 // Finds each Russian character and the space.
467 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());
468 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
469 EXPECT_TRUE(iter.Advance());
470 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());
471 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
472 EXPECT_TRUE(iter.Advance());
473 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
474 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
475 EXPECT_TRUE(iter.Advance());
476 // Finds the first two-character Khmer word.
477 EXPECT_EQ(base::WideToUTF16(L"\x178F\x17BE"), iter.GetString());
478 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
479 EXPECT_TRUE(iter.Advance());
480 // Finds the three-character Khmer word and then the next two-character word.
481 // Note: Technically these are two different Khmer words so the Khmer language
482 // rule should find a break between them but due to the heuristic/statistical
483 // nature of the Khmer word breaker it does not.
484 EXPECT_EQ(base::WideToUTF16(L"\x179B\x17C4\x1780\x1798\x1780"),
485 iter.GetString());
486 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
487 EXPECT_TRUE(iter.Advance());
488 // Finds each character in "zoo".
489 EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString());
490 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
491 EXPECT_TRUE(iter.Advance());
492 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());
493 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
494 EXPECT_TRUE(iter.Advance());
495 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());
496 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
497 EXPECT_TRUE(iter.Advance());
498 // Finds the period, space, and comma.
499 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
500 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
501 EXPECT_TRUE(iter.Advance());
502 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
503 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
504 EXPECT_TRUE(iter.Advance());
505 EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString());
506 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
507 EXPECT_FALSE(iter.Advance());
508 }
OLDNEW
« no previous file with comments | « chrome/renderer/spellchecker/spellcheck_worditerator.cc ('k') | chrome/renderer/spellchecker/spelling_engine.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698