Chromium Code Reviews| Index: chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc |
| diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc |
| index 08809ded8e7905613cfbdafbd361a970d7d5a333..cdaa9d010674d938237a58835065b673a95808a5 100644 |
| --- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc |
| +++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc |
| @@ -6,12 +6,15 @@ |
| #include <vector> |
| #include "base/format_macros.h" |
| +#include "base/i18n/break_iterator.h" |
| #include "base/strings/string_split.h" |
| #include "base/strings/stringprintf.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
| #include "testing/gtest/include/gtest/gtest.h" |
| +using base::i18n::BreakIterator; |
| + |
| namespace { |
| struct TestCase { |
| @@ -20,6 +23,12 @@ struct TestCase { |
| const wchar_t* expected_words; |
| }; |
| +base::string16 GetRulesForLanguage(const std::string& language) { |
| + SpellcheckCharAttribute attribute; |
| + attribute.SetDefaultLanguage(language); |
| + return attribute.GetRuleSet(true); |
| +} |
| + |
| } // namespace |
| // Tests whether or not our SpellcheckWordIterator can extract only words used |
| @@ -295,3 +304,149 @@ TEST(SpellcheckWordIteratorTest, Initialization) { |
| EXPECT_FALSE(iterator.Initialize(&attributes, true)); |
| } |
| } |
| + |
| +// This test uses English rules to check that different character set |
| +// combinations properly find word breaks and skippable characters. |
| +TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) { |
| + // The string "foo ទេ Can Ми..." which contains English, Khmer, and Russian |
| + // characters, in that order. |
| + base::string16 text( |
| + base::WideToUTF16(L"foo \x1791\x17c1 Can \x041C\x0438...")); |
| + BreakIterator iter(text, GetRulesForLanguage("en-US")); |
| + ASSERT_TRUE(iter.Init()); |
| + |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds "foo". |
| + EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the space and then the Khmer characters. |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::WideToUTF16(L"\x1791\x17c1"), iter.GetString()); |
|
jungshik at Google
2015/08/11 21:43:50
Interesting. Even if Khmer is not treated as eithe
jungshik at Google
2015/08/11 22:22:07
I figured out why Khmer is not split up here. That
Julius
2015/08/12 01:22:21
Acknowledged.
Julius
2015/08/12 01:22:21
Acknowledged.
|
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the next space and "Can". |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the next space and each Russian character. |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the periods at the end. |
| + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_FALSE(iter.Advance()); |
| +} |
| + |
| +// This test uses Russian rules to check that different character set |
| +// combinations properly find word breaks and skippable characters. |
| +TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) { |
| + // The string ".;Ми Can ទេ " which contains Russian, English, and Khmer |
| + // characters, in that order. |
| + base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17c1 ")); |
| + BreakIterator iter(text, GetRulesForLanguage("ru-RU")); |
| + ASSERT_TRUE(iter.Init()); |
| + |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the period and semicolon. |
| + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds all the Russian characters. |
| + EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the space and each character in "Can". |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the next space, the Khmer characters, and the last two spaces. |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::WideToUTF16(L"\x1791\x17c1"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_FALSE(iter.Advance()); |
| +} |
| + |
| +// This test uses Khmer rules to check that different character set combinations |
| +// properly find word breaks and skippable characters. |
| +TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) { |
| + // The string "Ми ទេzoo. ," which contains Russian, Khmer, and English |
| + // characters, in that order. |
|
jungshik at Google
2015/08/11 21:43:50
A Khmer example can be made more interesting. If y
Julius
2015/08/12 01:22:21
Swapped the Khmer text in this case with your sugg
|
| + base::string16 text(base::WideToUTF16(L"\x041C\x0438 \x1791\x17c1zoo. ,")); |
| + BreakIterator iter(text, GetRulesForLanguage("km")); |
| + ASSERT_TRUE(iter.Init()); |
| + |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds each Russian character and the space. |
| + EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds all the Khmer characters. |
| + EXPECT_EQ(base::WideToUTF16(L"\x1791\x17c1"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds each character in "zoo". |
| + EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + // Finds the period, space, and comma. |
| + EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_TRUE(iter.Advance()); |
| + EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString()); |
| + EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD); |
| + EXPECT_FALSE(iter.Advance()); |
| +} |