Index: javatests/org/chromium/distiller/StringUtilTest.java |
diff --git a/javatests/org/chromium/distiller/StringUtilTest.java b/javatests/org/chromium/distiller/StringUtilTest.java |
index 620fbbcc61e8d4c379aa17c7a1c95c819e6e4dff..84b8227c726ce5d3047d85f067f2b37d34f0bf06 100644 |
--- a/javatests/org/chromium/distiller/StringUtilTest.java |
+++ b/javatests/org/chromium/distiller/StringUtilTest.java |
@@ -17,6 +17,31 @@ public class StringUtilTest extends JsTestCase { |
assertEquals(2, StringUtil.countWords(" \ttwo @^@^&(@#$([][;;\nwords")); |
assertEquals(5, StringUtil.countWords("dør når på svært dårlig")); |
assertEquals(5, StringUtil.countWords("svært få dør av blåbærsyltetøy")); |
+ |
+ // A Chinese sentence or a series of Japanese glyphs should not be treated |
+ // as a single word. |
+ assertTrue(StringUtil.countWords("一個中文句子不應該當成一個字") > 1); |
+ assertTrue(StringUtil.countWords("ファイナルファンタジー") > 1); |
+ // However, treating each Chinese/Japanese glyph as a word is also wrong. |
+ assertTrue(StringUtil.countWords("一個中文句子不應該當成一個字") < 14); |
+ assertTrue(StringUtil.countWords("ファイナルファンタジー") < 11); |
kuan
2015/05/15 01:45:07
this is a katakana sentence, how about also adding
wychen
2015/05/18 18:49:20
All added.
kuan
2015/05/21 17:46:15
thanks!
|
+ // The same upper bounds apply even if the glyphs are separated by spaces. |
+ assertTrue(StringUtil.countWords("一 個 中 文 句 子 不 應 該 當 成 一 個 字") < 14); |
+ assertTrue(StringUtil.countWords("フ ァ イ ナ ル フ ァ ン タ ジ ー") < 11); |
+ |
+ assertEquals(1, StringUtil.countWords("字")); |
+ assertEquals(1, StringUtil.countWords("が")); |
+ |
+ // Mixing ASCII words and Chinese/Japanese glyphs |
+ assertEquals(2, StringUtil.countWords("word字")); |
+ assertEquals(2, StringUtil.countWords("word 字")); |
+ |
+ // Hangul uses spaces as word delimiters, like English. |
+ assertEquals(1, StringUtil.countWords("어")); |
+ assertEquals(2, StringUtil.countWords("한국어 단어")); |
+ assertEquals(5, StringUtil.countWords("한 국 어 단 어")); |
+ assertEquals(8, StringUtil.countWords( |
+ "예비군 훈련장 총기 난사범 최모씨의 군복에서 발견된 유서.")); |
} |
public void testIsWhitespace() { |