Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include <string> | 5 #include <string> |
| 6 #include <vector> | 6 #include <vector> |
| 7 | 7 |
| 8 #include "base/format_macros.h" | 8 #include "base/format_macros.h" |
| 9 #include "base/i18n/break_iterator.h" | |
| 9 #include "base/strings/string_split.h" | 10 #include "base/strings/string_split.h" |
| 10 #include "base/strings/stringprintf.h" | 11 #include "base/strings/stringprintf.h" |
| 11 #include "base/strings/utf_string_conversions.h" | 12 #include "base/strings/utf_string_conversions.h" |
| 12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 13 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
| 13 #include "testing/gtest/include/gtest/gtest.h" | 14 #include "testing/gtest/include/gtest/gtest.h" |
| 14 | 15 |
| 16 using base::i18n::BreakIterator; | |
| 17 | |
| 15 namespace { | 18 namespace { |
| 16 | 19 |
| 17 struct TestCase { | 20 struct TestCase { |
| 18 const char* language; | 21 const char* language; |
| 19 bool allow_contraction; | 22 bool allow_contraction; |
| 20 const wchar_t* expected_words; | 23 const wchar_t* expected_words; |
| 21 }; | 24 }; |
| 22 | 25 |
| 26 base::string16 GetRulesForLanguage(const std::string& language) { | |
| 27 SpellcheckCharAttribute attribute; | |
| 28 attribute.SetDefaultLanguage(language); | |
| 29 return attribute.GetRuleSet(true); | |
| 30 } | |
| 31 | |
| 23 } // namespace | 32 } // namespace |
| 24 | 33 |
| 25 // Tests whether or not our SpellcheckWordIterator can extract only words used | 34 // Tests whether or not our SpellcheckWordIterator can extract only words used |
| 26 // by the specified language from a multi-language text. | 35 // by the specified language from a multi-language text. |
| 27 TEST(SpellcheckWordIteratorTest, SplitWord) { | 36 TEST(SpellcheckWordIteratorTest, SplitWord) { |
| 28 // An input text. This text includes words of several languages. (Some words | 37 // An input text. This text includes words of several languages. (Some words |
| 29 // are not separated with whitespace characters.) Our SpellcheckWordIterator | 38 // are not separated with whitespace characters.) Our SpellcheckWordIterator |
| 30 // should extract only the words used by the specified language from this text | 39 // should extract only the words used by the specified language from this text |
| 31 // and normalize them so our spell-checker can check their spellings. | 40 // and normalize them so our spell-checker can check their spellings. |
| 32 const wchar_t kTestText[] = | 41 const wchar_t kTestText[] = |
| (...skipping 255 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 288 } | 297 } |
| 289 | 298 |
| 290 // Test initialization fails when no default language is set. | 299 // Test initialization fails when no default language is set. |
| 291 { | 300 { |
| 292 SpellcheckCharAttribute attributes; | 301 SpellcheckCharAttribute attributes; |
| 293 | 302 |
| 294 SpellcheckWordIterator iterator; | 303 SpellcheckWordIterator iterator; |
| 295 EXPECT_FALSE(iterator.Initialize(&attributes, true)); | 304 EXPECT_FALSE(iterator.Initialize(&attributes, true)); |
| 296 } | 305 } |
| 297 } | 306 } |
| 307 | |
| 308 // This test uses English rules to check that different character set | |
| 309 // combinations properly find word breaks and skippable characters. | |
| 310 TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) { | |
| 311 // A string containing the English word "foo", followed by two Khmer | |
| 312 // characters, the English word "Can", and then two Russian characters and | |
| 313 // punctuation. | |
| 314 base::string16 text( | |
| 315 base::WideToUTF16(L"foo \x1791\x17C1 Can \x041C\x0438...")); | |
| 316 BreakIterator iter(text, GetRulesForLanguage("en-US")); | |
| 317 ASSERT_TRUE(iter.Init()); | |
| 318 | |
| 319 EXPECT_TRUE(iter.Advance()); | |
| 320 // Finds "foo". | |
| 321 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString()); | |
| 322 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 323 EXPECT_TRUE(iter.Advance()); | |
| 324 // Finds the space and then the Khmer characters. | |
| 325 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 326 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 327 EXPECT_TRUE(iter.Advance()); | |
| 328 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString()); | |
| 329 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 330 EXPECT_TRUE(iter.Advance()); | |
| 331 // Finds the next space and "Can". | |
| 332 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 333 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 334 EXPECT_TRUE(iter.Advance()); | |
| 335 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString()); | |
| 336 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 337 EXPECT_TRUE(iter.Advance()); | |
| 338 // Finds the next space and each Russian character. | |
| 339 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 340 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 341 EXPECT_TRUE(iter.Advance()); | |
| 342 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString()); | |
| 343 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 344 EXPECT_TRUE(iter.Advance()); | |
| 345 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString()); | |
| 346 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 347 EXPECT_TRUE(iter.Advance()); | |
| 348 // Finds the periods at the end. | |
| 349 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 350 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 351 EXPECT_TRUE(iter.Advance()); | |
| 352 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 353 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 354 EXPECT_TRUE(iter.Advance()); | |
| 355 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 356 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 357 EXPECT_FALSE(iter.Advance()); | |
| 358 } | |
| 359 | |
| 360 // This test uses Russian rules to check that different character set | |
| 361 // combinations properly find word breaks and skippable characters. | |
| 362 TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) { | |
| 363 // A string containing punctuation followed by two Russian characters, the | |
| 364 // English word "Can", and then two Khmer characters. | |
| 365 base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17C1  ")); | |
| 366 BreakIterator iter(text, GetRulesForLanguage("ru-RU")); | |
| 367 ASSERT_TRUE(iter.Init()); | |
| 368 | |
| 369 EXPECT_TRUE(iter.Advance()); | |
| 370 // Finds the period and semicolon. | |
| 371 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 372 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 373 EXPECT_TRUE(iter.Advance()); | |
| 374 EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString()); | |
| 375 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 376 EXPECT_TRUE(iter.Advance()); | |
| 377 // Finds all the Russian characters. | |
| 378 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString()); | |
| 379 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 380 EXPECT_TRUE(iter.Advance()); | |
| 381 // Finds the space and each character in "Can". | |
| 382 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 383 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 384 EXPECT_TRUE(iter.Advance()); | |
| 385 EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString()); | |
| 386 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 387 EXPECT_TRUE(iter.Advance()); | |
| 388 EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString()); | |
| 389 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 390 EXPECT_TRUE(iter.Advance()); | |
| 391 EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString()); | |
| 392 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 393 EXPECT_TRUE(iter.Advance()); | |
| 394 // Finds the next space, the Khmer characters, and the last two spaces. | |
| 395 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 396 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 397 EXPECT_TRUE(iter.Advance()); | |
| 398 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString()); | |
| 399 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 400 EXPECT_TRUE(iter.Advance()); | |
| 401 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 402 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 403 EXPECT_TRUE(iter.Advance()); | |
| 404 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 405 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 406 EXPECT_FALSE(iter.Advance()); | |
| 407 } | |
| 408 | |
| 409 // This test uses Khmer rules to check that different character set combinations | |
| 410 // properly find word breaks and skippable characters. | |
| 411 TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) { | |
| 412 // A string containing two Russian characters followed by two, three, and two | |
| 413 // Khmer characters, and then English characters and punctuation. | |
| 414 base::string16 text(base::WideToUTF16( | |
| 415 L"\x041C\x0438 \x178F\x17BE \x179B\x17C4\x1780 \x1798\x1780zoo. ,")); | |
|
jungshik at Google
2015/08/12 16:56:52
Ick. Sorry it's not clear to you (and for a post-l
| |
| 416 BreakIterator iter(text, GetRulesForLanguage("km")); | |
| 417 ASSERT_TRUE(iter.Init()); | |
| 418 | |
| 419 EXPECT_TRUE(iter.Advance()); | |
| 420 // Finds each Russian character and the space. | |
| 421 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString()); | |
| 422 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 423 EXPECT_TRUE(iter.Advance()); | |
| 424 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString()); | |
| 425 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 426 EXPECT_TRUE(iter.Advance()); | |
| 427 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 428 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 429 EXPECT_TRUE(iter.Advance()); | |
| 430 // Finds the first two Khmer characters and the space. | |
| 431 EXPECT_EQ(base::WideToUTF16(L"\x178F\x17BE"), iter.GetString()); | |
| 432 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 433 EXPECT_TRUE(iter.Advance()); | |
| 434 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 435 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 436 EXPECT_TRUE(iter.Advance()); | |
| 437 // Finds the next three Khmer characters and the space. | |
| 438 EXPECT_EQ(base::WideToUTF16(L"\x179B\x17C4\x1780"), iter.GetString()); | |
| 439 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 440 EXPECT_TRUE(iter.Advance()); | |
| 441 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 442 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 443 EXPECT_TRUE(iter.Advance()); | |
| 444 // Finds the last two Khmer characters. | |
| 445 EXPECT_EQ(base::WideToUTF16(L"\x1798\x1780"), iter.GetString()); | |
| 446 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); | |
| 447 EXPECT_TRUE(iter.Advance()); | |
| 448 // Finds each character in "zoo". | |
| 449 EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString()); | |
| 450 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 451 EXPECT_TRUE(iter.Advance()); | |
| 452 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString()); | |
| 453 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 454 EXPECT_TRUE(iter.Advance()); | |
| 455 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString()); | |
| 456 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 457 EXPECT_TRUE(iter.Advance()); | |
| 458 // Finds the period, space, and comma. | |
| 459 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); | |
| 460 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 461 EXPECT_TRUE(iter.Advance()); | |
| 462 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); | |
| 463 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 464 EXPECT_TRUE(iter.Advance()); | |
| 465 EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString()); | |
| 466 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); | |
| 467 EXPECT_FALSE(iter.Advance()); | |
| 468 } | |
| OLD | NEW |