| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include <stddef.h> | |
| 6 | |
| 7 #include <string> | |
| 8 #include <vector> | |
| 9 | |
| 10 #include "base/format_macros.h" | |
| 11 #include "base/i18n/break_iterator.h" | |
| 12 #include "base/macros.h" | |
| 13 #include "base/strings/string_split.h" | |
| 14 #include "base/strings/stringprintf.h" | |
| 15 #include "base/strings/utf_string_conversions.h" | |
| 16 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | |
| 17 #include "testing/gtest/include/gtest/gtest.h" | |
| 18 | |
| 19 using base::i18n::BreakIterator; | |
| 20 using WordIteratorStatus = SpellcheckWordIterator::WordIteratorStatus; | |
| 21 | |
| 22 namespace { | |
| 23 | |
// One row of the SplitWord test table.
struct TestCase {
  // Language code handed to SpellcheckCharAttribute::SetDefaultLanguage().
  const char* language;

  // Second argument forwarded to SpellcheckWordIterator::Initialize().
  bool allow_contraction;

  // Words the iterator is expected to return, in order, separated by a single
  // space (the test splits this string on ' ').
  const wchar_t* expected_words;
};
| 29 | |
| 30 base::string16 GetRulesForLanguage(const std::string& language) { | |
| 31 SpellcheckCharAttribute attribute; | |
| 32 attribute.SetDefaultLanguage(language); | |
| 33 return attribute.GetRuleSet(true); | |
| 34 } | |
| 35 | |
| 36 WordIteratorStatus GetNextNonSkippableWord(SpellcheckWordIterator* iterator, | |
| 37 base::string16* word_string, | |
| 38 int* word_start, | |
| 39 int* word_length) { | |
| 40 WordIteratorStatus status = SpellcheckWordIterator::IS_SKIPPABLE; | |
| 41 while (status == SpellcheckWordIterator::IS_SKIPPABLE) | |
| 42 status = iterator->GetNextWord(word_string, word_start, word_length); | |
| 43 return status; | |
| 44 } | |
| 45 | |
| 46 } // namespace | |
| 47 | |
| 48 // Tests whether or not our SpellcheckWordIterator can extract words used by the | |
| 49 // specified language from a multi-language text. | |
| 50 TEST(SpellcheckWordIteratorTest, SplitWord) { | |
| 51 // An input text. This text includes words of several languages. (Some words | |
| 52 // are not separated with whitespace characters.) Our SpellcheckWordIterator | |
| 53 // should extract the words used by the specified language from this text and | |
| 54 // normalize them so our spell-checker can check their spellings. If | |
| 55 // characters are found that are not from the specified language the test | |
| 56 // skips them. | |
| 57 const wchar_t kTestText[] = | |
| 58 // Graphic characters | |
| 59 L"!@#$%^&*()" | |
| 60 // Latin (including a contraction character and a ligature). | |
| 61 L"hello:hello a\xFB03x" | |
| 62 // Greek | |
| 63 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5" | |
| 64 // Cyrillic | |
| 65 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" | |
| 66 L"\x0443\x0439\x0442\x0435" | |
| 67 // Hebrew (including niqquds) | |
| 68 L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd " | |
| 69 // Hebrew words with U+0027 and U+05F3 | |
| 70 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 " | |
| 71 // Hebrew words with U+0022 and U+05F4 | |
| 72 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc " | |
| 73 // Hebrew words enclosed with ASCII quotes. | |
| 74 L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'" | |
| 75 // Arabic (including vowel marks) | |
| 76 L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627\x0645\x064f " | |
| 77 L"\x0639\x064e\x0644\x064e\x064a\x0652\x0643\x064f\x0645\x0652 " | |
| 78 // Farsi/Persian (including vowel marks) | |
| 79 // Make sure \u064b - \u0652 are removed. | |
| 80 L"\x0647\x0634\x064e\x0631\x062d " | |
| 81 L"\x0647\x062e\x0648\x0627\x0647 " | |
| 82 L"\x0650\x062f\x0631\x062f " | |
| 83 L"\x0631\x0645\x0627\x0646\x0652 " | |
| 84 L"\x0633\x0631\x0651 " | |
| 85 L"\x0646\x0646\x064e\x062c\x064f\x0633 " | |
| 86 L"\x0627\x0644\x062d\x0645\x062f " | |
| 87 // Also make sure that class "Lm" (the \u0640) is filtered out too. | |
| 88 L"\x062c\x062c\x0640\x062c\x062c" | |
| 89 // Hindi | |
| 90 L"\x0930\x093E\x091C\x0927\x093E\x0928" | |
| 91 // Thai | |
| 92 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04" | |
| 93 L"\x0e23\x0e31\x0e1a" | |
| 94 // Hiraganas | |
| 95 L"\x3053\x3093\x306B\x3061\x306F" | |
| 96 // CJKV ideographs | |
| 97 L"\x4F60\x597D" | |
| 98 // Hangul Syllables | |
| 99 L"\xC548\xB155\xD558\xC138\xC694" | |
| 100 // Full-width latin : Hello | |
| 101 L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F " | |
| 102 L"e.g.,"; | |
| 103 | |
| 104 // The languages and expected results used in this test. | |
| 105 static const TestCase kTestCases[] = { | |
| 106 { | |
| 107 // English (keep contraction words) | |
| 108 "en-US", true, L"hello:hello affix Hello e.g" | |
| 109 }, { | |
| 110 // English (split contraction words) | |
| 111 "en-US", false, L"hello hello affix Hello e g" | |
| 112 }, { | |
| 113 // Greek | |
| 114 "el-GR", true, | |
| 115 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5" | |
| 116 }, { | |
| 117 // Russian | |
| 118 "ru-RU", true, | |
| 119 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" | |
| 120 L"\x0443\x0439\x0442\x0435" | |
| 121 }, { | |
| 122 // Hebrew | |
| 123 "he-IL", true, | |
| 124 L"\x05e9\x05dc\x05d5\x05dd " | |
| 125 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 " | |
| 126 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc " | |
| 127 L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5" | |
| 128 }, { | |
| 129 // Arabic | |
| 130 "ar", true, | |
| 131 L"\x0627\x0644\x0633\x0644\x0627\x0645 " | |
| 132 L"\x0639\x0644\x064a\x0643\x0645 " | |
| 133 // Farsi/Persian | |
| 134 L"\x0647\x0634\x0631\x062d " | |
| 135 L"\x0647\x062e\x0648\x0627\x0647 " | |
| 136 L"\x062f\x0631\x062f " | |
| 137 L"\x0631\x0645\x0627\x0646 " | |
| 138 L"\x0633\x0631 " | |
| 139 L"\x0646\x0646\x062c\x0633 " | |
| 140 L"\x0627\x0644\x062d\x0645\x062f " | |
| 141 L"\x062c\x062c\x062c\x062c" | |
| 142 }, { | |
| 143 // Hindi | |
| 144 "hi-IN", true, | |
| 145 L"\x0930\x093E\x091C\x0927\x093E\x0928" | |
| 146 }, { | |
| 147 // Thai | |
| 148 "th-TH", true, | |
| 149 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04" | |
| 150 L"\x0e23\x0e31\x0e1a" | |
| 151 }, { | |
| 152 // Korean | |
| 153 "ko-KR", true, | |
| 154 L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161" | |
| 155 L"\x1109\x1166\x110b\x116d" | |
| 156 }, | |
| 157 }; | |
| 158 | |
| 159 for (size_t i = 0; i < arraysize(kTestCases); ++i) { | |
| 160 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i, | |
| 161 kTestCases[i].language)); | |
| 162 | |
| 163 SpellcheckCharAttribute attributes; | |
| 164 attributes.SetDefaultLanguage(kTestCases[i].language); | |
| 165 | |
| 166 base::string16 input(base::WideToUTF16(kTestText)); | |
| 167 SpellcheckWordIterator iterator; | |
| 168 EXPECT_TRUE(iterator.Initialize(&attributes, | |
| 169 kTestCases[i].allow_contraction)); | |
| 170 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length())); | |
| 171 | |
| 172 std::vector<base::string16> expected_words = base::SplitString( | |
| 173 base::WideToUTF16(kTestCases[i].expected_words), | |
| 174 base::string16(1, ' '), base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); | |
| 175 | |
| 176 base::string16 actual_word; | |
| 177 int actual_start, actual_len; | |
| 178 size_t index = 0; | |
| 179 for (SpellcheckWordIterator::WordIteratorStatus status = | |
| 180 iterator.GetNextWord(&actual_word, &actual_start, &actual_len); | |
| 181 status != SpellcheckWordIterator::IS_END_OF_TEXT; | |
| 182 status = | |
| 183 iterator.GetNextWord(&actual_word, &actual_start, &actual_len)) { | |
| 184 if (status == SpellcheckWordIterator::WordIteratorStatus::IS_SKIPPABLE) | |
| 185 continue; | |
| 186 | |
| 187 EXPECT_TRUE(index < expected_words.size()); | |
| 188 if (index < expected_words.size()) | |
| 189 EXPECT_EQ(expected_words[index], actual_word); | |
| 190 ++index; | |
| 191 } | |
| 192 } | |
| 193 } | |
| 194 | |
| 195 // Tests whether our SpellcheckWordIterator extracts an empty word without | |
| 196 // getting stuck in an infinite loop when inputting a Khmer text. (This is a | |
| 197 // regression test for Issue 46278.) | |
| 198 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) { | |
| 199 SpellcheckCharAttribute attributes; | |
| 200 attributes.SetDefaultLanguage("en-US"); | |
| 201 | |
| 202 const wchar_t kTestText[] = L"\x1791\x17c1\x002e"; | |
| 203 base::string16 input(base::WideToUTF16(kTestText)); | |
| 204 | |
| 205 SpellcheckWordIterator iterator; | |
| 206 EXPECT_TRUE(iterator.Initialize(&attributes, true)); | |
| 207 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length())); | |
| 208 | |
| 209 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following | |
| 210 // iterator.GetNextWord() calls get stuck in an infinite loop. Therefore, this | |
| 211 // test succeeds if this call returns without timeouts. | |
| 212 base::string16 actual_word; | |
| 213 int actual_start, actual_len; | |
| 214 WordIteratorStatus status = GetNextNonSkippableWord( | |
| 215 &iterator, &actual_word, &actual_start, &actual_len); | |
| 216 | |
| 217 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_END_OF_TEXT, status); | |
| 218 EXPECT_EQ(0, actual_start); | |
| 219 EXPECT_EQ(0, actual_len); | |
| 220 } | |
| 221 | |
| 222 // Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters | |
| 223 // on LTR languages. On the other hand, it should not treat ASCII numbers as | |
| 224 // word characters on RTL languages because they change the text direction from | |
| 225 // RTL to LTR. | |
| 226 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) { | |
| 227 // A set of a language, a dummy word, and a text direction used in this test. | |
| 228 // For each language, this test splits a dummy word, which consists of ASCII | |
| 229 // numbers and an alphabet of the language, into words. When ASCII numbers are | |
| 230 // treated as word characters, the split word becomes equal to the dummy word. | |
| 231 // Otherwise, the split word does not include ASCII numbers. | |
| 232 static const struct { | |
| 233 const char* language; | |
| 234 const wchar_t* text; | |
| 235 bool left_to_right; | |
| 236 } kTestCases[] = { | |
| 237 { | |
| 238 // English | |
| 239 "en-US", L"0123456789" L"a", true, | |
| 240 }, { | |
| 241 // Greek | |
| 242 "el-GR", L"0123456789" L"\x03B1", true, | |
| 243 }, { | |
| 244 // Russian | |
| 245 "ru-RU", L"0123456789" L"\x0430", true, | |
| 246 }, { | |
| 247 // Hebrew | |
| 248 "he-IL", L"0123456789" L"\x05D0", false, | |
| 249 }, { | |
| 250 // Arabic | |
| 251 "ar", L"0123456789" L"\x0627", false, | |
| 252 }, { | |
| 253 // Hindi | |
| 254 "hi-IN", L"0123456789" L"\x0905", true, | |
| 255 }, { | |
| 256 // Thai | |
| 257 "th-TH", L"0123456789" L"\x0e01", true, | |
| 258 }, { | |
| 259 // Korean | |
| 260 "ko-KR", L"0123456789" L"\x1100\x1161", true, | |
| 261 }, | |
| 262 }; | |
| 263 | |
| 264 for (size_t i = 0; i < arraysize(kTestCases); ++i) { | |
| 265 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i, | |
| 266 kTestCases[i].language)); | |
| 267 | |
| 268 SpellcheckCharAttribute attributes; | |
| 269 attributes.SetDefaultLanguage(kTestCases[i].language); | |
| 270 | |
| 271 base::string16 input_word(base::WideToUTF16(kTestCases[i].text)); | |
| 272 SpellcheckWordIterator iterator; | |
| 273 EXPECT_TRUE(iterator.Initialize(&attributes, true)); | |
| 274 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length())); | |
| 275 | |
| 276 base::string16 actual_word; | |
| 277 int actual_start, actual_len; | |
| 278 WordIteratorStatus status = GetNextNonSkippableWord( | |
| 279 &iterator, &actual_word, &actual_start, &actual_len); | |
| 280 | |
| 281 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status); | |
| 282 if (kTestCases[i].left_to_right) | |
| 283 EXPECT_EQ(input_word, actual_word); | |
| 284 else | |
| 285 EXPECT_NE(input_word, actual_word); | |
| 286 } | |
| 287 } | |
| 288 | |
| 289 // Verify SpellcheckWordIterator treats typographical apostrophe as a part of | |
| 290 // the word. | |
| 291 TEST(SpellcheckWordIteratorTest, TypographicalApostropheIsPartOfWord) { | |
| 292 static const struct { | |
| 293 const char* language; | |
| 294 const wchar_t* input; | |
| 295 const wchar_t* expected; | |
| 296 } kTestCases[] = { | |
| 297 // Typewriter apostrophe: | |
| 298 {"en-AU", L"you're", L"you're"}, | |
| 299 {"en-CA", L"you're", L"you're"}, | |
| 300 {"en-GB", L"you're", L"you're"}, | |
| 301 {"en-US", L"you're", L"you're"}, | |
| 302 {"en-US", L"!!!!you're", L"you're"}, | |
| 303 // Typographical apostrophe: | |
| 304 {"en-AU", L"you\x2019re", L"you\x2019re"}, | |
| 305 {"en-CA", L"you\x2019re", L"you\x2019re"}, | |
| 306 {"en-GB", L"you\x2019re", L"you\x2019re"}, | |
| 307 {"en-US", L"you\x2019re", L"you\x2019re"}, | |
| 308 {"en-US", L"....you\x2019re", L"you\x2019re"}, | |
| 309 }; | |
| 310 | |
| 311 for (size_t i = 0; i < arraysize(kTestCases); ++i) { | |
| 312 SpellcheckCharAttribute attributes; | |
| 313 attributes.SetDefaultLanguage(kTestCases[i].language); | |
| 314 | |
| 315 base::string16 input_word(base::WideToUTF16(kTestCases[i].input)); | |
| 316 base::string16 expected_word(base::WideToUTF16(kTestCases[i].expected)); | |
| 317 SpellcheckWordIterator iterator; | |
| 318 EXPECT_TRUE(iterator.Initialize(&attributes, true)); | |
| 319 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length())); | |
| 320 | |
| 321 base::string16 actual_word; | |
| 322 int actual_start, actual_len; | |
| 323 WordIteratorStatus status = GetNextNonSkippableWord( | |
| 324 &iterator, &actual_word, &actual_start, &actual_len); | |
| 325 | |
| 326 EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status); | |
| 327 EXPECT_EQ(expected_word, actual_word); | |
| 328 EXPECT_LE(0, actual_start); | |
| 329 EXPECT_EQ(expected_word.length(), | |
| 330 static_cast<base::string16::size_type>(actual_len)); | |
| 331 } | |
| 332 } | |
| 333 | |
| 334 TEST(SpellcheckWordIteratorTest, Initialization) { | |
| 335 // Test initialization works when a default language is set. | |
| 336 { | |
| 337 SpellcheckCharAttribute attributes; | |
| 338 attributes.SetDefaultLanguage("en-US"); | |
| 339 | |
| 340 SpellcheckWordIterator iterator; | |
| 341 EXPECT_TRUE(iterator.Initialize(&attributes, true)); | |
| 342 } | |
| 343 | |
| 344 // Test initialization fails when no default language is set. | |
| 345 { | |
| 346 SpellcheckCharAttribute attributes; | |
| 347 | |
| 348 SpellcheckWordIterator iterator; | |
| 349 EXPECT_FALSE(iterator.Initialize(&attributes, true)); | |
| 350 } | |
| 351 } | |
| 352 | |
| 353 // This test uses English rules to check that different character set | |
| 354 // combinations properly find word breaks and skippable characters. | |
| 355 TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) { | |
| 356 // A string containing the English word "foo", followed by two Khmer | |
| 357 // characters, the English word "Can", and then two Russian characters and | |
| 358 // punctuation. | |
| 359 base::string16 text( | |
| 360 base::WideToUTF16(L"foo \x1791\x17C1 Can \x041C\x0438...")); | |
| 361 BreakIterator iter(text, GetRulesForLanguage("en-US")); | |
| 362 ASSERT_TRUE(iter.Init()); | |
| 363 | |
| 364 EXPECT_TRUE(iter.Advance()); | |
| 365 // Finds "foo". | |
| 366 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString()); | |
| 367 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 368 EXPECT_TRUE(iter.Advance()); | |
| 369 // Finds the space and then the Khmer characters. | |
| 370 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 371 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 372 EXPECT_TRUE(iter.Advance()); | |
| 373 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString()); | |
| 374 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 375 EXPECT_TRUE(iter.Advance()); | |
| 376 // Finds the next space and "Can". | |
| 377 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 378 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 379 EXPECT_TRUE(iter.Advance()); | |
| 380 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString()); | |
| 381 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 382 EXPECT_TRUE(iter.Advance()); | |
| 383 // Finds the next space and each Russian character. | |
| 384 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 385 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 386 EXPECT_TRUE(iter.Advance()); | |
| 387 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString()); | |
| 388 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 389 EXPECT_TRUE(iter.Advance()); | |
| 390 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString()); | |
| 391 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 392 EXPECT_TRUE(iter.Advance()); | |
| 393 // Finds the periods at the end. | |
| 394 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 395 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 396 EXPECT_TRUE(iter.Advance()); | |
| 397 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 398 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 399 EXPECT_TRUE(iter.Advance()); | |
| 400 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 401 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 402 EXPECT_FALSE(iter.Advance()); | |
| 403 } | |
| 404 | |
| 405 // This test uses Russian rules to check that different character set | |
| 406 // combinations properly find word breaks and skippable characters. | |
| 407 TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) { | |
| 408 // A string containing punctuation followed by two Russian characters, the | |
| 409 // English word "Can", and then two Khmer characters. | |
| 410 base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17C1 ")); | |
| 411 BreakIterator iter(text, GetRulesForLanguage("ru-RU")); | |
| 412 ASSERT_TRUE(iter.Init()); | |
| 413 | |
| 414 EXPECT_TRUE(iter.Advance()); | |
| 415 // Finds the period and semicolon. | |
| 416 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 417 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 418 EXPECT_TRUE(iter.Advance()); | |
| 419 EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString()); | |
| 420 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 421 EXPECT_TRUE(iter.Advance()); | |
| 422 // Finds all the Russian characters. | |
| 423 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString()); | |
| 424 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 425 EXPECT_TRUE(iter.Advance()); | |
| 426 // Finds the space and each character in "Can". | |
| 427 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 428 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 429 EXPECT_TRUE(iter.Advance()); | |
| 430 EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString()); | |
| 431 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 432 EXPECT_TRUE(iter.Advance()); | |
| 433 EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString()); | |
| 434 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 435 EXPECT_TRUE(iter.Advance()); | |
| 436 EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString()); | |
| 437 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 438 EXPECT_TRUE(iter.Advance()); | |
| 439 // Finds the next space, the Khmer characters, and the last two spaces. | |
| 440 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 441 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 442 EXPECT_TRUE(iter.Advance()); | |
| 443 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString()); | |
| 444 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 445 EXPECT_TRUE(iter.Advance()); | |
| 446 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 447 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 448 EXPECT_TRUE(iter.Advance()); | |
| 449 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 450 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 451 EXPECT_FALSE(iter.Advance()); | |
| 452 } | |
| 453 | |
| 454 // This test uses Khmer rules to check that different character set combinations | |
| 455 // properly find word breaks and skippable characters. Khmer does not use spaces | |
| 456 // between words and uses a dictionary to determine word breaks instead. | |
| 457 TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) { | |
| 458 // A string containing two Russian characters followed by two, three, and | |
| 459 // two-character Khmer words, and then English characters and punctuation. | |
| 460 base::string16 text(base::WideToUTF16( | |
| 461 L"\x041C\x0438 \x178F\x17BE\x179B\x17C4\x1780\x1798\x1780zoo. ,")); | |
| 462 BreakIterator iter(text, GetRulesForLanguage("km")); | |
| 463 ASSERT_TRUE(iter.Init()); | |
| 464 | |
| 465 EXPECT_TRUE(iter.Advance()); | |
| 466 // Finds each Russian character and the space. | |
| 467 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString()); | |
| 468 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 469 EXPECT_TRUE(iter.Advance()); | |
| 470 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString()); | |
| 471 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 472 EXPECT_TRUE(iter.Advance()); | |
| 473 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 474 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 475 EXPECT_TRUE(iter.Advance()); | |
| 476 // Finds the first two-character Khmer word. | |
| 477 EXPECT_EQ(base::WideToUTF16(L"\x178F\x17BE"), iter.GetString()); | |
| 478 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 479 EXPECT_TRUE(iter.Advance()); | |
| 480 // Finds the three-character Khmer word and then the next two-character word. | |
| 481 // Note: Technically these are two different Khmer words so the Khmer language | |
| 482 // rule should find a break between them but due to the heuristic/statistical | |
| 483 // nature of the Khmer word breaker it does not. | |
| 484 EXPECT_EQ(base::WideToUTF16(L"\x179B\x17C4\x1780\x1798\x1780"), | |
| 485 iter.GetString()); | |
| 486 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 487 EXPECT_TRUE(iter.Advance()); | |
| 488 // Finds each character in "zoo". | |
| 489 EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString()); | |
| 490 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 491 EXPECT_TRUE(iter.Advance()); | |
| 492 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString()); | |
| 493 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 494 EXPECT_TRUE(iter.Advance()); | |
| 495 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString()); | |
| 496 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 497 EXPECT_TRUE(iter.Advance()); | |
| 498 // Finds the period, space, and comma. | |
| 499 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 500 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 501 EXPECT_TRUE(iter.Advance()); | |
| 502 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 503 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 504 EXPECT_TRUE(iter.Advance()); | |
| 505 EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString()); | |
| 506 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 507 EXPECT_FALSE(iter.Advance()); | |
| 508 } | |
| OLD | NEW |