| Index: chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
|
| diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
|
| deleted file mode 100644
|
| index ff4c4673877593a81abc77a03e5e95f3b5f00fec..0000000000000000000000000000000000000000
|
| --- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
|
| +++ /dev/null
|
| @@ -1,508 +0,0 @@
|
| -// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
| -// Use of this source code is governed by a BSD-style license that can be
|
| -// found in the LICENSE file.
|
| -
|
| -#include <stddef.h>
|
| -
|
| -#include <string>
|
| -#include <vector>
|
| -
|
| -#include "base/format_macros.h"
|
| -#include "base/i18n/break_iterator.h"
|
| -#include "base/macros.h"
|
| -#include "base/strings/string_split.h"
|
| -#include "base/strings/stringprintf.h"
|
| -#include "base/strings/utf_string_conversions.h"
|
| -#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
|
| -#include "testing/gtest/include/gtest/gtest.h"
|
| -
|
| -using base::i18n::BreakIterator;
|
| -using WordIteratorStatus = SpellcheckWordIterator::WordIteratorStatus;
|
| -
|
| -namespace {
|
| -
|
| -struct TestCase {
|
| - const char* language;
|
| - bool allow_contraction;
|
| - const wchar_t* expected_words;
|
| -};
|
| -
|
| -base::string16 GetRulesForLanguage(const std::string& language) {
|
| - SpellcheckCharAttribute attribute;
|
| - attribute.SetDefaultLanguage(language);
|
| - return attribute.GetRuleSet(true);
|
| -}
|
| -
|
| -WordIteratorStatus GetNextNonSkippableWord(SpellcheckWordIterator* iterator,
|
| - base::string16* word_string,
|
| - int* word_start,
|
| - int* word_length) {
|
| - WordIteratorStatus status = SpellcheckWordIterator::IS_SKIPPABLE;
|
| - while (status == SpellcheckWordIterator::IS_SKIPPABLE)
|
| - status = iterator->GetNextWord(word_string, word_start, word_length);
|
| - return status;
|
| -}
|
| -
|
| -} // namespace
|
| -
|
| -// Tests whether or not our SpellcheckWordIterator can extract words used by the
|
| -// specified language from a multi-language text.
|
| -TEST(SpellcheckWordIteratorTest, SplitWord) {
|
| - // An input text. This text includes words of several languages. (Some words
|
| - // are not separated with whitespace characters.) Our SpellcheckWordIterator
|
| - // should extract the words used by the specified language from this text and
|
| - // normalize them so our spell-checker can check their spellings. If
|
| - // characters are found that are not from the specified language the test
|
| - // skips them.
|
| - const wchar_t kTestText[] =
|
| - // Graphic characters
|
| - L"!@#$%^&*()"
|
| - // Latin (including a contraction character and a ligature).
|
| - L"hello:hello a\xFB03x"
|
| - // Greek
|
| - L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
|
| - // Cyrillic
|
| - L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
|
| - L"\x0443\x0439\x0442\x0435"
|
| - // Hebrew (including niqquds)
|
| - L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
|
| - // Hebrew words with U+0027 and U+05F3
|
| - L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
|
| - // Hebrew words with U+0022 and U+05F4
|
| - L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
|
| - // Hebrew words enclosed with ASCII quotes.
|
| - L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
|
| - // Arabic (including vowel marks)
|
| - L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627\x0645\x064f "
|
| - L"\x0639\x064e\x0644\x064e\x064a\x0652\x0643\x064f\x0645\x0652 "
|
| - // Farsi/Persian (including vowel marks)
|
| - // Make sure \u064b - \u0652 are removed.
|
| - L"\x0647\x0634\x064e\x0631\x062d "
|
| - L"\x0647\x062e\x0648\x0627\x0647 "
|
| - L"\x0650\x062f\x0631\x062f "
|
| - L"\x0631\x0645\x0627\x0646\x0652 "
|
| - L"\x0633\x0631\x0651 "
|
| - L"\x0646\x0646\x064e\x062c\x064f\x0633 "
|
| - L"\x0627\x0644\x062d\x0645\x062f "
|
| - // Also make sure that class "Lm" (the \u0640) is filtered out too.
|
| - L"\x062c\x062c\x0640\x062c\x062c"
|
| - // Hindi
|
| - L"\x0930\x093E\x091C\x0927\x093E\x0928"
|
| - // Thai
|
| - L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
|
| - L"\x0e23\x0e31\x0e1a"
|
| - // Hiraganas
|
| - L"\x3053\x3093\x306B\x3061\x306F"
|
| - // CJKV ideographs
|
| - L"\x4F60\x597D"
|
| - // Hangul Syllables
|
| - L"\xC548\xB155\xD558\xC138\xC694"
|
| - // Full-width latin : Hello
|
| - L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
|
| - L"e.g.,";
|
| -
|
| - // The languages and expected results used in this test.
|
| - static const TestCase kTestCases[] = {
|
| - {
|
| - // English (keep contraction words)
|
| - "en-US", true, L"hello:hello affix Hello e.g"
|
| - }, {
|
| - // English (split contraction words)
|
| - "en-US", false, L"hello hello affix Hello e g"
|
| - }, {
|
| - // Greek
|
| - "el-GR", true,
|
| - L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
|
| - }, {
|
| - // Russian
|
| - "ru-RU", true,
|
| - L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
|
| - L"\x0443\x0439\x0442\x0435"
|
| - }, {
|
| - // Hebrew
|
| - "he-IL", true,
|
| - L"\x05e9\x05dc\x05d5\x05dd "
|
| - L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
|
| - L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
|
| - L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
|
| - }, {
|
| - // Arabic
|
| - "ar", true,
|
| - L"\x0627\x0644\x0633\x0644\x0627\x0645 "
|
| - L"\x0639\x0644\x064a\x0643\x0645 "
|
| - // Farsi/Persian
|
| - L"\x0647\x0634\x0631\x062d "
|
| - L"\x0647\x062e\x0648\x0627\x0647 "
|
| - L"\x062f\x0631\x062f "
|
| - L"\x0631\x0645\x0627\x0646 "
|
| - L"\x0633\x0631 "
|
| - L"\x0646\x0646\x062c\x0633 "
|
| - L"\x0627\x0644\x062d\x0645\x062f "
|
| - L"\x062c\x062c\x062c\x062c"
|
| - }, {
|
| - // Hindi
|
| - "hi-IN", true,
|
| - L"\x0930\x093E\x091C\x0927\x093E\x0928"
|
| - }, {
|
| - // Thai
|
| - "th-TH", true,
|
| - L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
|
| - L"\x0e23\x0e31\x0e1a"
|
| - }, {
|
| - // Korean
|
| - "ko-KR", true,
|
| - L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
|
| - L"\x1109\x1166\x110b\x116d"
|
| - },
|
| - };
|
| -
|
| - for (size_t i = 0; i < arraysize(kTestCases); ++i) {
|
| - SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
|
| - kTestCases[i].language));
|
| -
|
| - SpellcheckCharAttribute attributes;
|
| - attributes.SetDefaultLanguage(kTestCases[i].language);
|
| -
|
| - base::string16 input(base::WideToUTF16(kTestText));
|
| - SpellcheckWordIterator iterator;
|
| - EXPECT_TRUE(iterator.Initialize(&attributes,
|
| - kTestCases[i].allow_contraction));
|
| - EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
|
| -
|
| - std::vector<base::string16> expected_words = base::SplitString(
|
| - base::WideToUTF16(kTestCases[i].expected_words),
|
| - base::string16(1, ' '), base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
|
| -
|
| - base::string16 actual_word;
|
| - int actual_start, actual_len;
|
| - size_t index = 0;
|
| - for (SpellcheckWordIterator::WordIteratorStatus status =
|
| - iterator.GetNextWord(&actual_word, &actual_start, &actual_len);
|
| - status != SpellcheckWordIterator::IS_END_OF_TEXT;
|
| - status =
|
| - iterator.GetNextWord(&actual_word, &actual_start, &actual_len)) {
|
| - if (status == SpellcheckWordIterator::WordIteratorStatus::IS_SKIPPABLE)
|
| - continue;
|
| -
|
| - EXPECT_TRUE(index < expected_words.size());
|
| - if (index < expected_words.size())
|
| - EXPECT_EQ(expected_words[index], actual_word);
|
| - ++index;
|
| - }
|
| - }
|
| -}
|
| -
|
| -// Tests whether our SpellcheckWordIterator extracts an empty word without
|
| -// getting stuck in an infinite loop when inputting a Khmer text. (This is a
|
| -// regression test for Issue 46278.)
|
| -TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
|
| - SpellcheckCharAttribute attributes;
|
| - attributes.SetDefaultLanguage("en-US");
|
| -
|
| - const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
|
| - base::string16 input(base::WideToUTF16(kTestText));
|
| -
|
| - SpellcheckWordIterator iterator;
|
| - EXPECT_TRUE(iterator.Initialize(&attributes, true));
|
| - EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
|
| -
|
| - // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
|
| - // iterator.GetNextWord() calls get stuck in an infinite loop. Therefore, this
|
| - // test succeeds if this call returns without timeouts.
|
| - base::string16 actual_word;
|
| - int actual_start, actual_len;
|
| - WordIteratorStatus status = GetNextNonSkippableWord(
|
| - &iterator, &actual_word, &actual_start, &actual_len);
|
| -
|
| - EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_END_OF_TEXT, status);
|
| - EXPECT_EQ(0, actual_start);
|
| - EXPECT_EQ(0, actual_len);
|
| -}
|
| -
|
| -// Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters
|
| -// on LTR languages. On the other hand, it should not treat ASCII numbers as
|
| -// word characters on RTL languages because they change the text direction from
|
| -// RTL to LTR.
|
| -TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
|
| - // A set of a language, a dummy word, and a text direction used in this test.
|
| - // For each language, this test splits a dummy word, which consists of ASCII
|
| - // numbers and an alphabet of the language, into words. When ASCII numbers are
|
| - // treated as word characters, the split word becomes equal to the dummy word.
|
| - // Otherwise, the split word does not include ASCII numbers.
|
| - static const struct {
|
| - const char* language;
|
| - const wchar_t* text;
|
| - bool left_to_right;
|
| - } kTestCases[] = {
|
| - {
|
| - // English
|
| - "en-US", L"0123456789" L"a", true,
|
| - }, {
|
| - // Greek
|
| - "el-GR", L"0123456789" L"\x03B1", true,
|
| - }, {
|
| - // Russian
|
| - "ru-RU", L"0123456789" L"\x0430", true,
|
| - }, {
|
| - // Hebrew
|
| - "he-IL", L"0123456789" L"\x05D0", false,
|
| - }, {
|
| - // Arabic
|
| - "ar", L"0123456789" L"\x0627", false,
|
| - }, {
|
| - // Hindi
|
| - "hi-IN", L"0123456789" L"\x0905", true,
|
| - }, {
|
| - // Thai
|
| - "th-TH", L"0123456789" L"\x0e01", true,
|
| - }, {
|
| - // Korean
|
| - "ko-KR", L"0123456789" L"\x1100\x1161", true,
|
| - },
|
| - };
|
| -
|
| - for (size_t i = 0; i < arraysize(kTestCases); ++i) {
|
| - SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
|
| - kTestCases[i].language));
|
| -
|
| - SpellcheckCharAttribute attributes;
|
| - attributes.SetDefaultLanguage(kTestCases[i].language);
|
| -
|
| - base::string16 input_word(base::WideToUTF16(kTestCases[i].text));
|
| - SpellcheckWordIterator iterator;
|
| - EXPECT_TRUE(iterator.Initialize(&attributes, true));
|
| - EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
|
| -
|
| - base::string16 actual_word;
|
| - int actual_start, actual_len;
|
| - WordIteratorStatus status = GetNextNonSkippableWord(
|
| - &iterator, &actual_word, &actual_start, &actual_len);
|
| -
|
| - EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);
|
| - if (kTestCases[i].left_to_right)
|
| - EXPECT_EQ(input_word, actual_word);
|
| - else
|
| - EXPECT_NE(input_word, actual_word);
|
| - }
|
| -}
|
| -
|
| -// Verify SpellcheckWordIterator treats typographical apostrophe as a part of
|
| -// the word.
|
| -TEST(SpellcheckWordIteratorTest, TypographicalApostropheIsPartOfWord) {
|
| - static const struct {
|
| - const char* language;
|
| - const wchar_t* input;
|
| - const wchar_t* expected;
|
| - } kTestCases[] = {
|
| - // Typewriter apostrophe:
|
| - {"en-AU", L"you're", L"you're"},
|
| - {"en-CA", L"you're", L"you're"},
|
| - {"en-GB", L"you're", L"you're"},
|
| - {"en-US", L"you're", L"you're"},
|
| - {"en-US", L"!!!!you're", L"you're"},
|
| - // Typographical apostrophe:
|
| - {"en-AU", L"you\x2019re", L"you\x2019re"},
|
| - {"en-CA", L"you\x2019re", L"you\x2019re"},
|
| - {"en-GB", L"you\x2019re", L"you\x2019re"},
|
| - {"en-US", L"you\x2019re", L"you\x2019re"},
|
| - {"en-US", L"....you\x2019re", L"you\x2019re"},
|
| - };
|
| -
|
| - for (size_t i = 0; i < arraysize(kTestCases); ++i) {
|
| - SpellcheckCharAttribute attributes;
|
| - attributes.SetDefaultLanguage(kTestCases[i].language);
|
| -
|
| - base::string16 input_word(base::WideToUTF16(kTestCases[i].input));
|
| - base::string16 expected_word(base::WideToUTF16(kTestCases[i].expected));
|
| - SpellcheckWordIterator iterator;
|
| - EXPECT_TRUE(iterator.Initialize(&attributes, true));
|
| - EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
|
| -
|
| - base::string16 actual_word;
|
| - int actual_start, actual_len;
|
| - WordIteratorStatus status = GetNextNonSkippableWord(
|
| - &iterator, &actual_word, &actual_start, &actual_len);
|
| -
|
| - EXPECT_EQ(SpellcheckWordIterator::WordIteratorStatus::IS_WORD, status);
|
| - EXPECT_EQ(expected_word, actual_word);
|
| - EXPECT_LE(0, actual_start);
|
| - EXPECT_EQ(expected_word.length(),
|
| - static_cast<base::string16::size_type>(actual_len));
|
| - }
|
| -}
|
| -
|
| -TEST(SpellcheckWordIteratorTest, Initialization) {
|
| - // Test initialization works when a default language is set.
|
| - {
|
| - SpellcheckCharAttribute attributes;
|
| - attributes.SetDefaultLanguage("en-US");
|
| -
|
| - SpellcheckWordIterator iterator;
|
| - EXPECT_TRUE(iterator.Initialize(&attributes, true));
|
| - }
|
| -
|
| - // Test initialization fails when no default language is set.
|
| - {
|
| - SpellcheckCharAttribute attributes;
|
| -
|
| - SpellcheckWordIterator iterator;
|
| - EXPECT_FALSE(iterator.Initialize(&attributes, true));
|
| - }
|
| -}
|
| -
|
| -// This test uses English rules to check that different character set
|
| -// combinations properly find word breaks and skippable characters.
|
| -TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) {
|
| - // A string containing the English word "foo", followed by two Khmer
|
| - // characters, the English word "Can", and then two Russian characters and
|
| - // punctuation.
|
| - base::string16 text(
|
| - base::WideToUTF16(L"foo \x1791\x17C1 Can \x041C\x0438..."));
|
| - BreakIterator iter(text, GetRulesForLanguage("en-US"));
|
| - ASSERT_TRUE(iter.Init());
|
| -
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds "foo".
|
| - EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the space and then the Khmer characters.
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the next space and "Can".
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the next space and each Russian character.
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the periods at the end.
|
| - EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_FALSE(iter.Advance());
|
| -}
|
| -
|
| -// This test uses Russian rules to check that different character set
|
| -// combinations properly find word breaks and skippable characters.
|
| -TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) {
|
| - // A string containing punctuation followed by two Russian characters, the
|
| - // English word "Can", and then two Khmer characters.
|
| - base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17C1 "));
|
| - BreakIterator iter(text, GetRulesForLanguage("ru-RU"));
|
| - ASSERT_TRUE(iter.Init());
|
| -
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the period and semicolon.
|
| - EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds all the Russian characters.
|
| - EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the space and each character in "Can".
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the next space, the Khmer characters, and the last two spaces.
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_FALSE(iter.Advance());
|
| -}
|
| -
|
| -// This test uses Khmer rules to check that different character set combinations
|
| -// properly find word breaks and skippable characters. Khmer does not use spaces
|
| -// between words and uses a dictionary to determine word breaks instead.
|
| -TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) {
|
| - // A string containing two Russian characters followed by two, three, and
|
| - // two-character Khmer words, and then English characters and punctuation.
|
| - base::string16 text(base::WideToUTF16(
|
| - L"\x041C\x0438 \x178F\x17BE\x179B\x17C4\x1780\x1798\x1780zoo. ,"));
|
| - BreakIterator iter(text, GetRulesForLanguage("km"));
|
| - ASSERT_TRUE(iter.Init());
|
| -
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds each Russian character and the space.
|
| - EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the first two-character Khmer word.
|
| - EXPECT_EQ(base::WideToUTF16(L"\x178F\x17BE"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the three-character Khmer word and then the next two-character word.
|
| - // Note: Technically these are two different Khmer words so the Khmer language
|
| - // rule should find a break between them but due to the heuristic/statistical
|
| - // nature of the Khmer word breaker it does not.
|
| - EXPECT_EQ(base::WideToUTF16(L"\x179B\x17C4\x1780\x1798\x1780"),
|
| - iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds each character in "zoo".
|
| - EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - // Finds the period, space, and comma.
|
| - EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_TRUE(iter.Advance());
|
| - EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString());
|
| - EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
|
| - EXPECT_FALSE(iter.Advance());
|
| -}
|
|
|