Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(129)

Side by Side Diff: javatests/org/chromium/distiller/StringUtilTest.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: rewrite tests Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « javatests/org/chromium/distiller/DocumentTitleGetterTest.java ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import com.google.gwt.regexp.shared.RegExp; 7 import com.google.gwt.regexp.shared.RegExp;
8 import java.util.ArrayList;
9 import java.util.List;
8 10
9 public class StringUtilTest extends JsTestCase { 11 public class StringUtilTest extends JsTestCase {
12 public void testFastWordCounter() {
13 List<StringUtil.WordCounter> counters = new ArrayList();
14 counters.add(new StringUtil.FastWordCounter());
15 counters.add(new StringUtil.LetterWordCounter());
16 counters.add(new StringUtil.FullWordCounter());
17
18 for(StringUtil.WordCounter counter: counters) {
19 assertEquals(0, counter.count(""));
20 assertEquals(0, counter.count(" -@# ';]"));
21 assertEquals(1, counter.count("word"));
22 assertEquals(1, counter.count("b'fore"));
23 assertEquals(1, counter.count(" _word.under_score_ "));
24 assertEquals(2, counter.count(" \ttwo\nwords"));
25 assertEquals(2, counter.count(" \ttwo @^@^&(@#$([][;;\nwords"));
26 // Norwegian
27 assertEquals(5, counter.count("dør når på svært dårlig"));
28 assertEquals(5, counter.count("svært få dør av blåbærsyltetøy"));
29 // Greek
30 assertEquals(11, counter.count(
31 "Παρέμβαση των ΗΠΑ για τα τεχνητά νησιά που κατασκευάζει η Κίνα"));
32 // Arabic
33 assertEquals(6, counter.count("زلزال بقوة 8.5 درجات يضرب اليابان"));
34 // Tibetan
35 assertEquals(1, counter.count("༧གོང་ས་མཆོག་གི་ནང་གི་ངོ་སྤྲོད་ཀྱི་གསུང་ཆོས་ལེགས་གྲུབ།"));
36 // Thai
37 assertEquals(3, counter.count("โซลาร์ อิมพัลส์ทู เหินฟ้าข้ามมหาสมุทร "));
38 }
39 }
40
41 public void testLetterWordCounter() {
42 List<StringUtil.WordCounter> counters = new ArrayList();
43 counters.add(new StringUtil.LetterWordCounter());
44 counters.add(new StringUtil.FullWordCounter());
45
46 for(StringUtil.WordCounter counter: counters) {
47 // Hangul uses space as word delimiter like English.
48 assertEquals(1, counter.count("어"));
49 assertEquals(2, counter.count("한국어 단어"));
50 assertEquals(5, counter.count("한 국 어 단 어"));
51 assertEquals(8, counter.count(
52 "예비군 훈련장 총기 난사범 최모씨의 군복에서 발견된 유서."));
53 }
54 }
55
56 public void testFullWordCounter() {
57 StringUtil.WordCounter counter = new StringUtil.FullWordCounter();
58 // One Chinese sentence, or a series of Japanese glyphs should not be treated
59 // as a single word.
60 assertTrue(counter.count("一個中文句子不應該當成一個字") > 1); // zh-Hant
61 assertTrue(counter.count("中国和马来西亚使用简体字") > 1); // zh-Hans
62 assertTrue(counter.count("ファイナルファンタジー") > 1); // Katakana
63 assertTrue(counter.count("いってらっしゃい") > 1); // Hiragana
64 assertTrue(counter.count("仏仮駅辺") > 1); // Kanji
65 // However, treating each Chinese/Japanese glyph as a word is also wrong.
66 assertTrue(counter.count("一個中文句子不應該當成一個字") < 14);
67 assertTrue(counter.count("中国和马来西亚使用简体字") < 12);
68 assertTrue(counter.count("ファイナルファンタジー") < 11);
69 assertTrue(counter.count("いってらっしゃい") < 8);
70 assertTrue(counter.count("仏仮駅辺") < 4);
71 // Even if they are separated by spaces.
72 assertTrue(counter.count("一 個 中 文 句 子 不 應 該 當 成 一 個 字") < 14);
73 assertTrue(counter.count("中 国 和 马 来 西 亚 使 用 简 体 字") < 12);
74 assertTrue(counter.count("フ ァ イ ナ ル フ ァ ン タ ジ ー") < 11);
75 assertTrue(counter.count("い っ て ら っ し ゃ い") < 8);
76 assertTrue(counter.count("仏 仮 駅 辺") < 4);
77
78 assertEquals(1, counter.count("字"));
79 assertEquals(1, counter.count("が"));
80
81 // Mixing ASCII words and Chinese/Japanese glyphs
82 assertEquals(2, counter.count("word字"));
83 assertEquals(2, counter.count("word 字"));
84 }
85
86 public void testSelectWordCounter() {
87 StringUtil.WordCounter counter;
88
89 counter = StringUtil.selectWordCounter("abc");
90 assertTrue(counter instanceof StringUtil.FastWordCounter);
91
92 counter = StringUtil.selectWordCounter("어");
93 assertTrue(counter instanceof StringUtil.LetterWordCounter);
94
95 counter = StringUtil.selectWordCounter("字");
96 assertTrue(counter instanceof StringUtil.FullWordCounter);
97 }
98
10 public void testCountWords() { 99 public void testCountWords() {
11 assertEquals(0, StringUtil.countWords("")); 100 StringUtil.setWordCounter("");
12 assertEquals(0, StringUtil.countWords(" -@# ';]")); 101 assertEquals(2, StringUtil.countWords("two words"));
13 assertEquals(1, StringUtil.countWords("word")); 102 assertEquals(0, StringUtil.countWords("어"));
14 assertEquals(1, StringUtil.countWords("b'fore")); 103 StringUtil.setWordCounter("어");
15 assertEquals(1, StringUtil.countWords(" _word.under_score_ ")); 104 assertEquals(1, StringUtil.countWords("어"));
16 assertEquals(2, StringUtil.countWords(" \ttwo\nwords")); 105 assertEquals(0, StringUtil.countWords("字"));
17 assertEquals(2, StringUtil.countWords(" \ttwo @^@^&(@#$([][;;\nwords")); 106 StringUtil.setWordCounter("字");
18 assertEquals(5, StringUtil.countWords("dør når på svært dårlig")); 107 assertEquals(1, StringUtil.countWords("字"));
19 assertEquals(5, StringUtil.countWords("svært få dør av blåbærsyltetøy")); 108 // Make sure the internal WordCounter is restored to FullWordCounter in the end.
20 } 109 }
21 110
22 public void testIsWhitespace() { 111 public void testIsWhitespace() {
23 assertTrue(StringUtil.isWhitespace(' ')); 112 assertTrue(StringUtil.isWhitespace(' '));
24 assertTrue(StringUtil.isWhitespace('\t')); 113 assertTrue(StringUtil.isWhitespace('\t'));
25 assertTrue(StringUtil.isWhitespace('\n')); 114 assertTrue(StringUtil.isWhitespace('\n'));
26 assertTrue(StringUtil.isWhitespace('\u00a0')); 115 assertTrue(StringUtil.isWhitespace('\u00a0'));
27 assertFalse(StringUtil.isWhitespace('a')); 116 assertFalse(StringUtil.isWhitespace('a'));
28 assertFalse(StringUtil.isWhitespace('$')); 117 assertFalse(StringUtil.isWhitespace('$'));
29 assertFalse(StringUtil.isWhitespace('_')); 118 assertFalse(StringUtil.isWhitespace('_'));
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 assertEquals(-1, StringUtil.toNumber("'8_")); 196 assertEquals(-1, StringUtil.toNumber("'8_"));
108 assertEquals(-1, StringUtil.toNumber("")); 197 assertEquals(-1, StringUtil.toNumber(""));
109 assertEquals(-1, StringUtil.toNumber(" ")); 198 assertEquals(-1, StringUtil.toNumber(" "));
110 assertEquals(-1, StringUtil.toNumber("\u00a0\u0460")); 199 assertEquals(-1, StringUtil.toNumber("\u00a0\u0460"));
111 assertEquals(-1, StringUtil.toNumber("abc")); 200 assertEquals(-1, StringUtil.toNumber("abc"));
112 assertEquals(-1, StringUtil.toNumber("$")); 201 assertEquals(-1, StringUtil.toNumber("$"));
113 assertEquals(-1, StringUtil.toNumber("_")); 202 assertEquals(-1, StringUtil.toNumber("_"));
114 } 203 }
115 204
116 } 205 }
OLDNEW
« no previous file with comments | « javatests/org/chromium/distiller/DocumentTitleGetterTest.java ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698