Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(433)

Side by Side Diff: javatests/org/chromium/distiller/StringUtilTest.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: use java interface Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import com.google.gwt.regexp.shared.RegExp; 7 import com.google.gwt.regexp.shared.RegExp;
8 8
9 public class StringUtilTest extends JsTestCase { 9 public class StringUtilTest extends JsTestCase {
10 public void testCountWords() { 10 public void testCountWords() {
11 assertEquals(0, StringUtil.countWords("")); 11 assertEquals(0, StringUtil.countWords(""));
12 assertEquals(0, StringUtil.countWords(" -@# ';]")); 12 assertEquals(0, StringUtil.countWords(" -@# ';]"));
13 assertEquals(1, StringUtil.countWords("word")); 13 assertEquals(1, StringUtil.countWords("word"));
14 assertEquals(1, StringUtil.countWords("b'fore")); 14 assertEquals(1, StringUtil.countWords("b'fore"));
15 assertEquals(1, StringUtil.countWords(" _word.under_score_ ")); 15 assertEquals(1, StringUtil.countWords(" _word.under_score_ "));
16 assertEquals(2, StringUtil.countWords(" \ttwo\nwords")); 16 assertEquals(2, StringUtil.countWords(" \ttwo\nwords"));
17 assertEquals(2, StringUtil.countWords(" \ttwo @^@^&(@#$([][;;\nwords")); 17 assertEquals(2, StringUtil.countWords(" \ttwo @^@^&(@#$([][;;\nwords"));
18 assertEquals(5, StringUtil.countWords("dør når på svært dårlig")); 18 assertEquals(5, StringUtil.countWords("dør når på svært dårlig"));
19 assertEquals(5, StringUtil.countWords("svært få dør av blåbærsyltetøy")) ; 19 assertEquals(5, StringUtil.countWords("svært få dør av blåbærsyltetøy")) ;
20
21 // One Chinese sentence, or a series of Japanese glyphs should not be treated
22 // as a single word.
23 assertTrue(StringUtil.countWords("一個中文句子不應該當成一個字") > 1); // zh-Hant
24 assertTrue(StringUtil.countWords("中国和马来西亚使用简体字") > 1); // zh-Hans
25 assertTrue(StringUtil.countWords("ファイナルファンタジー") > 1); // Katakana
26 assertTrue(StringUtil.countWords("いってらっしゃい") > 1); // Hiragana
27 assertTrue(StringUtil.countWords("仏仮駅辺") > 1); // Kanji
28 // However, treating each Chinese/Japanese glyph as a word is also wrong.
29 assertTrue(StringUtil.countWords("一個中文句子不應該當成一個字") < 14);
30 assertTrue(StringUtil.countWords("中国和马来西亚使用简体字") < 12);
31 assertTrue(StringUtil.countWords("ファイナルファンタジー") < 11);
32 assertTrue(StringUtil.countWords("いってらっしゃい") < 8);
33 assertTrue(StringUtil.countWords("仏仮駅辺") < 4);
34 // Even if they are separated by spaces.
35 assertTrue(StringUtil.countWords("一 個 中 文 句 子 不 應 該 當 成 一 個 字") < 14);
36 assertTrue(StringUtil.countWords("中 国 和 马 来 西 亚 使 用 简 体 字") < 12);
37 assertTrue(StringUtil.countWords("フ ァ イ ナ ル フ ァ ン タ ジ ー") < 11);
38 assertTrue(StringUtil.countWords("い っ て ら っ し ゃ い") < 8);
39 assertTrue(StringUtil.countWords("仏 仮 駅 辺") < 4);
40
41 assertEquals(1, StringUtil.countWords("字"));
42 assertEquals(1, StringUtil.countWords("が"));
43
44 // Mixing ASCII words and Chinese/Japanese glyphs
45 assertEquals(2, StringUtil.countWords("word字"));
46 assertEquals(2, StringUtil.countWords("word 字"));
47
48 // Hangul uses space as word delimiter like English.
49 assertEquals(1, StringUtil.countWords("어"));
50 assertEquals(2, StringUtil.countWords("한국어 단어"));
51 assertEquals(5, StringUtil.countWords("한 국 어 단 어"));
52 assertEquals(8, StringUtil.countWords(
53 "예비군 훈련장 총기 난사범 최모씨의 군복에서 발견된 유서."));
54 }
55
56 public void testCountWordsFast() {
57 StringUtil.selectCountWordsFunc("");
cjhopman 2015/05/29 19:38:57 This should probably use the inner classes directly...
wychen 2015/05/31 09:04:03 Done.
58 assertEquals(0, StringUtil.countWords("어"));
59 StringUtil.selectCountWordsFunc("어");
60 assertEquals(1, StringUtil.countWords("어"));
61 assertEquals(0, StringUtil.countWords("字"));
62 StringUtil.selectCountWordsFunc("字");
63 assertEquals(1, StringUtil.countWords("字"));
20 } 64 }
21 65
22 public void testIsWhitespace() { 66 public void testIsWhitespace() {
23 assertTrue(StringUtil.isWhitespace(' ')); 67 assertTrue(StringUtil.isWhitespace(' '));
24 assertTrue(StringUtil.isWhitespace('\t')); 68 assertTrue(StringUtil.isWhitespace('\t'));
25 assertTrue(StringUtil.isWhitespace('\n')); 69 assertTrue(StringUtil.isWhitespace('\n'));
26 assertTrue(StringUtil.isWhitespace('\u00a0')); 70 assertTrue(StringUtil.isWhitespace('\u00a0'));
27 assertFalse(StringUtil.isWhitespace('a')); 71 assertFalse(StringUtil.isWhitespace('a'));
28 assertFalse(StringUtil.isWhitespace('$')); 72 assertFalse(StringUtil.isWhitespace('$'));
29 assertFalse(StringUtil.isWhitespace('_')); 73 assertFalse(StringUtil.isWhitespace('_'));
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 assertEquals(-1, StringUtil.toNumber("'8_")); 151 assertEquals(-1, StringUtil.toNumber("'8_"));
108 assertEquals(-1, StringUtil.toNumber("")); 152 assertEquals(-1, StringUtil.toNumber(""));
109 assertEquals(-1, StringUtil.toNumber(" ")); 153 assertEquals(-1, StringUtil.toNumber(" "));
110 assertEquals(-1, StringUtil.toNumber("\u00a0\u0460")); 154 assertEquals(-1, StringUtil.toNumber("\u00a0\u0460"));
111 assertEquals(-1, StringUtil.toNumber("abc")); 155 assertEquals(-1, StringUtil.toNumber("abc"));
112 assertEquals(-1, StringUtil.toNumber("$")); 156 assertEquals(-1, StringUtil.toNumber("$"));
113 assertEquals(-1, StringUtil.toNumber("_")); 157 assertEquals(-1, StringUtil.toNumber("_"));
114 } 158 }
115 159
116 } 160 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698