Index: javatests/org/chromium/distiller/StringUtilTest.java |
diff --git a/javatests/org/chromium/distiller/StringUtilTest.java b/javatests/org/chromium/distiller/StringUtilTest.java |
index 620fbbcc61e8d4c379aa17c7a1c95c819e6e4dff..4867946f7cf2efb0169957c1fdf9c4ae1cccaec7 100644 |
--- a/javatests/org/chromium/distiller/StringUtilTest.java |
+++ b/javatests/org/chromium/distiller/StringUtilTest.java |
@@ -5,18 +5,107 @@ |
package org.chromium.distiller; |
import com.google.gwt.regexp.shared.RegExp; |
+import java.util.ArrayList; |
+import java.util.List; |
public class StringUtilTest extends JsTestCase { |
+ public void testFastWordCounter() { |
+ List<StringUtil.WordCounter> counters = new ArrayList<StringUtil.WordCounter>(); |
+ counters.add(new StringUtil.FastWordCounter()); |
+ counters.add(new StringUtil.LetterWordCounter()); |
+ counters.add(new StringUtil.FullWordCounter()); |
+ // All three counters must report the same count for every input below. |
+ for (StringUtil.WordCounter counter : counters) { |
+ assertEquals(0, counter.count("")); |
+ assertEquals(0, counter.count(" -@# ';]")); |
+ assertEquals(1, counter.count("word")); |
+ assertEquals(1, counter.count("b'fore")); |
+ assertEquals(1, counter.count(" _word.under_score_ ")); |
+ assertEquals(2, counter.count(" \ttwo\nwords")); |
+ assertEquals(2, counter.count(" \ttwo @^@^&(@#$([][;;\nwords")); |
+ // Norwegian: non-ASCII Latin letters must not split a word. |
+ assertEquals(5, counter.count("dør når på svært dårlig")); |
+ assertEquals(5, counter.count("svært få dør av blåbærsyltetøy")); |
+ // Greek |
+ assertEquals(11, counter.count( |
+ "Παρέμβαση των ΗΠΑ για τα τεχνητά νησιά που κατασκευάζει η Κίνα")); |
+ // Arabic: the decimal number counts as one word, giving six in total. |
+ assertEquals(6, counter.count("زلزال بقوة 8.5 درجات يضرب اليابان")); |
+ // Tibetan: the phrase has no spaces and is expected to count as one word. |
+ assertEquals(1, counter.count("༧གོང་ས་མཆོག་གི་ནང་གི་ངོ་སྤྲོད་ཀྱི་གསུང་ཆོས་ལེགས་གྲུབ།")); |
+ // Thai: three space-separated runs are expected to count as three words. |
+ assertEquals(3, counter.count("โซลาร์ อิมพัลส์ทู เหินฟ้าข้ามมหาสมุทร")); |
+ } |
+ } |
+ |
+ public void testLetterWordCounter() { |
+ List<StringUtil.WordCounter> counters = new ArrayList<StringUtil.WordCounter>(); |
+ counters.add(new StringUtil.LetterWordCounter()); |
+ counters.add(new StringUtil.FullWordCounter()); |
+ // Only LetterWordCounter and FullWordCounter are checked against Hangul input. |
+ for (StringUtil.WordCounter counter : counters) { |
+ // Hangul uses space as word delimiter like English. |
+ assertEquals(1, counter.count("어")); |
+ assertEquals(2, counter.count("한국어 단어")); |
+ assertEquals(5, counter.count("한 국 어 단 어")); |
+ assertEquals(8, counter.count( |
+ "예비군 훈련장 총기 난사범 최모씨의 군복에서 발견된 유서.")); |
+ } |
+ } |
+ |
+ public void testFullWordCounter() { |
+ StringUtil.WordCounter full = new StringUtil.FullWordCounter(); |
+ // Lower bound: an unspaced run of Chinese or Japanese glyphs must come out |
+ // as more than one word. |
+ assertTrue(full.count("一個中文句子不應該當成一個字") > 1); // zh-Hant |
+ assertTrue(full.count("中国和马来西亚使用简体字") > 1); // zh-Hans |
+ assertTrue(full.count("ファイナルファンタジー") > 1); // Katakana |
+ assertTrue(full.count("いってらっしゃい") > 1); // Hiragana |
+ assertTrue(full.count("仏仮駅辺") > 1); // Kanji |
+ // Upper bound: the count must also stay below the number of glyphs. |
+ assertTrue(full.count("一個中文句子不應該當成一個字") < 14); |
+ assertTrue(full.count("中国和马来西亚使用简体字") < 12); |
+ assertTrue(full.count("ファイナルファンタジー") < 11); |
+ assertTrue(full.count("いってらっしゃい") < 8); |
+ assertTrue(full.count("仏仮駅辺") < 4); |
+ // The same upper bound holds when a space is put between the glyphs. |
+ assertTrue(full.count("一 個 中 文 句 子 不 應 該 當 成 一 個 字") < 14); |
+ assertTrue(full.count("中 国 和 马 来 西 亚 使 用 简 体 字") < 12); |
+ assertTrue(full.count("フ ァ イ ナ ル フ ァ ン タ ジ ー") < 11); |
+ assertTrue(full.count("い っ て ら っ し ゃ い") < 8); |
+ assertTrue(full.count("仏 仮 駅 辺") < 4); |
+ |
+ assertEquals(1, full.count("字")); |
+ assertEquals(1, full.count("が")); |
+ |
+ // An ASCII word next to a glyph gives two words, with or without a space. |
+ assertEquals(2, full.count("word字")); |
+ assertEquals(2, full.count("word 字")); |
+ } |
+ |
+ public void testSelectWordCounter() { |
+ // Each sample text is handed to selectWordCounter, and the returned |
+ // counter must be an instance of the class asserted right after it. |
+ StringUtil.WordCounter forAscii = StringUtil.selectWordCounter("abc"); |
+ assertTrue(forAscii instanceof StringUtil.FastWordCounter); |
+ |
+ StringUtil.WordCounter forHangul = StringUtil.selectWordCounter("어"); |
+ assertTrue(forHangul instanceof StringUtil.LetterWordCounter); |
+ |
+ StringUtil.WordCounter forIdeograph = StringUtil.selectWordCounter("字"); |
+ assertTrue(forIdeograph instanceof StringUtil.FullWordCounter); |
+ } |
+ |
public void testCountWords() { |
- assertEquals(0, StringUtil.countWords("")); |
- assertEquals(0, StringUtil.countWords(" -@# ';]")); |
- assertEquals(1, StringUtil.countWords("word")); |
- assertEquals(1, StringUtil.countWords("b'fore")); |
- assertEquals(1, StringUtil.countWords(" _word.under_score_ ")); |
- assertEquals(2, StringUtil.countWords(" \ttwo\nwords")); |
- assertEquals(2, StringUtil.countWords(" \ttwo @^@^&(@#$([][;;\nwords")); |
- assertEquals(5, StringUtil.countWords("dør når på svært dårlig")); |
- assertEquals(5, StringUtil.countWords("svært få dør av blåbærsyltetøy")); |
+ StringUtil.setWordCounter(""); |
+ assertEquals(2, StringUtil.countWords("two words")); |
+ assertEquals(0, StringUtil.countWords("어")); // Hangul is not counted until the next setWordCounter call. |
+ StringUtil.setWordCounter("어"); |
+ assertEquals(1, StringUtil.countWords("어")); |
+ assertEquals(0, StringUtil.countWords("字")); // Nor is an ideograph, until the next setWordCounter call. |
+ StringUtil.setWordCounter("字"); |
+ assertEquals(1, StringUtil.countWords("字")); |
+ // Keep setWordCounter("字") last: it makes sure the internal WordCounter is restored to FullWordCounter in the end. |
} |
public void testIsWhitespace() { |