chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc - Issue 1272683002: Creates BreakIterator::GetWordBreakStatus.

Side by Side Diff: chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc

Issue 1272683002: Creates BreakIterator::GetWordBreakStatus. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Comment clarifications and using EXPECT_EQ. Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include <string>	5 #include <string>

6 #include <vector>	6 #include <vector>

7	7

8 #include "base/format_macros.h"	8 #include "base/format_macros.h"

	9 #include "base/i18n/break_iterator.h"

9 #include "base/strings/string_split.h"	10 #include "base/strings/string_split.h"

10 #include "base/strings/stringprintf.h"	11 #include "base/strings/stringprintf.h"

11 #include "base/strings/utf_string_conversions.h"	12 #include "base/strings/utf_string_conversions.h"

12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"	13 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"

13 #include "testing/gtest/include/gtest/gtest.h"	14 #include "testing/gtest/include/gtest/gtest.h"

14	15

	16 using base::i18n::BreakIterator;

	17

15 namespace {	18 namespace {

16	19

17 struct TestCase {	20 struct TestCase {

18 const char* language;	21 const char* language;

19 bool allow_contraction;	22 bool allow_contraction;

20 const wchar_t* expected_words;	23 const wchar_t* expected_words;

21 };	24 };

22	25

	26 base::string16 GetRulesForLanguage(const std::string& language) {

	27 SpellcheckCharAttribute attribute;

	28 attribute.SetDefaultLanguage(language);

	29 return attribute.GetRuleSet(true);

	30 }

	31

23 } // namespace	32 } // namespace

24	33

25 // Tests whether or not our SpellcheckWordIterator can extract only words used	34 // Tests whether or not our SpellcheckWordIterator can extract only words used

26 // by the specified language from a multi-language text.	35 // by the specified language from a multi-language text.

27 TEST(SpellcheckWordIteratorTest, SplitWord) {	36 TEST(SpellcheckWordIteratorTest, SplitWord) {

28 // An input text. This text includes words of several languages. (Some words	37 // An input text. This text includes words of several languages. (Some words

29 // are not separated with whitespace characters.) Our SpellcheckWordIterator	38 // are not separated with whitespace characters.) Our SpellcheckWordIterator

30 // should extract only the words used by the specified language from this text	39 // should extract only the words used by the specified language from this text

31 // and normalize them so our spell-checker can check their spellings.	40 // and normalize them so our spell-checker can check their spellings.

32 const wchar_t kTestText[] =	41 const wchar_t kTestText[] =

(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
288 }	297 }

289	298

290 // Test initialization fails when no default language is set.	299 // Test initialization fails when no default language is set.

291 {	300 {

292 SpellcheckCharAttribute attributes;	301 SpellcheckCharAttribute attributes;

293	302

294 SpellcheckWordIterator iterator;	303 SpellcheckWordIterator iterator;

295 EXPECT_FALSE(iterator.Initialize(&attributes, true));	304 EXPECT_FALSE(iterator.Initialize(&attributes, true));

296 }	305 }

297 }	306 }

	307

	308 // This test uses English rules to check that different character set

	309 // combinations properly find word breaks and skippable characters.

	310 TEST(SpellcheckWordIteratorTest, FindSkippableWordsEnglish) {

	311 // The string "foo ទេ Can Ми..." which contains English, Khmer, and Russian

	312 // characters, in that order.

	313 base::string16 text(

	314 base::WideToUTF16(L"foo \x1791\x17c1 Can \x041C\x0438..."));

	315 BreakIterator iter(text, GetRulesForLanguage("en-US"));

	316 ASSERT_TRUE(iter.Init());

	317

	318 EXPECT_TRUE(iter.Advance());

	319 // Finds "foo".

	320 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());

	321 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

	322 EXPECT_TRUE(iter.Advance());

	323 // Finds the space and then the Khmer characters.

	324 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	325 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	326 EXPECT_TRUE(iter.Advance());

	327 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17c1"), iter.GetString());
	jungshik at Google 2015/08/11 21:43:50: Interesting. Even if Khmer is not treated as either ALetter or ALetterPlus, the w-b-iterator (word-break iterator) still does not break them apart (perhaps because it's a single grapheme... the rules do not have that info?).
	jungshik at Google 2015/08/11 22:22:07 (replying to the 21:43:50 comment above): I figured out why Khmer is not split up here. That's because Khmer uses a dictionary for word (as well as line) breaking, and it's handled outside the non-dictionary cases. Our custom rules do not change the following lines: `# For dictionary-based break` followed by `$dictionary $dictionary;`
	Julius 2015/08/12 01:22:21 (replying to jungshik's 22:22:07 comment): Acknowledged.
	Julius 2015/08/12 01:22:21 (replying to jungshik's 21:43:50 comment): Acknowledged.
	328 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	329 EXPECT_TRUE(iter.Advance());

	330 // Finds the next space and "Can".

	331 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	332 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	333 EXPECT_TRUE(iter.Advance());

	334 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());

	335 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

	336 EXPECT_TRUE(iter.Advance());

	337 // Finds the next space and each Russian character.

	338 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	339 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	340 EXPECT_TRUE(iter.Advance());

	341 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());

	342 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	343 EXPECT_TRUE(iter.Advance());

	344 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());

	345 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	346 EXPECT_TRUE(iter.Advance());

	347 // Finds the periods at the end.

	348 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

	349 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	350 EXPECT_TRUE(iter.Advance());

	351 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

	352 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	353 EXPECT_TRUE(iter.Advance());

	354 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

	355 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	356 EXPECT_FALSE(iter.Advance());

	357 }

	358

	359 // This test uses Russian rules to check that different character set

	360 // combinations properly find word breaks and skippable characters.

	361 TEST(SpellcheckWordIteratorTest, FindSkippableWordsRussian) {

	362 // The string ".;Ми Can ទេ " which contains Russian, English, and Khmer

	363 // characters, in that order.

	364 base::string16 text(base::WideToUTF16(L".;\x041C\x0438 Can \x1791\x17c1 "));

	365 BreakIterator iter(text, GetRulesForLanguage("ru-RU"));

	366 ASSERT_TRUE(iter.Init());

	367

	368 EXPECT_TRUE(iter.Advance());

	369 // Finds the period and semicolon.

	370 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

	371 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	372 EXPECT_TRUE(iter.Advance());

	373 EXPECT_EQ(base::UTF8ToUTF16(";"), iter.GetString());

	374 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	375 EXPECT_TRUE(iter.Advance());

	376 // Finds all the Russian characters.

	377 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());

	378 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

	379 EXPECT_TRUE(iter.Advance());

	380 // Finds the space and each character in "Can".

	381 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	382 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	383 EXPECT_TRUE(iter.Advance());

	384 EXPECT_EQ(base::UTF8ToUTF16("C"), iter.GetString());

	385 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	386 EXPECT_TRUE(iter.Advance());

	387 EXPECT_EQ(base::UTF8ToUTF16("a"), iter.GetString());

	388 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	389 EXPECT_TRUE(iter.Advance());

	390 EXPECT_EQ(base::UTF8ToUTF16("n"), iter.GetString());

	391 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	392 EXPECT_TRUE(iter.Advance());

	393 // Finds the next space, the Khmer characters, and the last two spaces.

	394 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	395 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	396 EXPECT_TRUE(iter.Advance());

	397 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17c1"), iter.GetString());

	398 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	399 EXPECT_TRUE(iter.Advance());

	400 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	401 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	402 EXPECT_TRUE(iter.Advance());

	403 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	404 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	405 EXPECT_FALSE(iter.Advance());

	406 }

	407

	408 // This test uses Khmer rules to check that different character set combinations

	409 // properly find word breaks and skippable characters.

	410 TEST(SpellcheckWordIteratorTest, FindSkippableWordsKhmer) {

	411 // The string "Ми ទេzoo. ," which contains Russian, Khmer, and English

	412 // characters, in that order.
	jungshik at Google 2015/08/11 21:43:50: A Khmer example can be made more interesting if you take an example from ICU's Khmer break iterator tests. I took the following example from the first line in the Khmer section of third_party/icu/source/test/testdata/rbbitst.txt: U+178F U+17BE <word break> U+179B U+17C4 U+1780 <word break> U+1798 U+1780
	Julius 2015/08/12 01:22:21 (replying to the comment above): Swapped the Khmer text in this case with your suggested text.
	413 base::string16 text(base::WideToUTF16(L"\x041C\x0438 \x1791\x17c1zoo. ,"));

	414 BreakIterator iter(text, GetRulesForLanguage("km"));

	415 ASSERT_TRUE(iter.Init());

	416

	417 EXPECT_TRUE(iter.Advance());

	418 // Finds each Russian character and the space.

	419 EXPECT_EQ(base::WideToUTF16(L"\x041C"), iter.GetString());

	420 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	421 EXPECT_TRUE(iter.Advance());

	422 EXPECT_EQ(base::WideToUTF16(L"\x0438"), iter.GetString());

	423 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	424 EXPECT_TRUE(iter.Advance());

	425 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	426 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	427 EXPECT_TRUE(iter.Advance());

	428 // Finds all the Khmer characters.

	429 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17c1"), iter.GetString());

	430 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);

	431 EXPECT_TRUE(iter.Advance());

	432 // Finds each character in "zoo".

	433 EXPECT_EQ(base::UTF8ToUTF16("z"), iter.GetString());

	434 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	435 EXPECT_TRUE(iter.Advance());

	436 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());

	437 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	438 EXPECT_TRUE(iter.Advance());

	439 EXPECT_EQ(base::UTF8ToUTF16("o"), iter.GetString());

	440 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	441 EXPECT_TRUE(iter.Advance());

	442 // Finds the period, space, and comma.

	443 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());

	444 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	445 EXPECT_TRUE(iter.Advance());

	446 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());

	447 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	448 EXPECT_TRUE(iter.Advance());

	449 EXPECT_EQ(base::UTF8ToUTF16(","), iter.GetString());

	450 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);

	451 EXPECT_FALSE(iter.Advance());

	452 }

OLD	NEW

« base/i18n/break_iterator_unittest.cc ('K') | « base/i18n/break_iterator_unittest.cc ('k') | no next file » | no next file with comments »