Index: java/org/chromium/distiller/StringUtil.java |
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java |
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..9c0d081434f04a5d239bc651471f0710557bd55c 100644 |
--- a/java/org/chromium/distiller/StringUtil.java |
+++ b/java/org/chromium/distiller/StringUtil.java |
@@ -47,8 +47,14 @@ public class StringUtil { |
}-*/; |
public static native int countWords(String s) /*-{ |
cjhopman
2015/05/15 20:16:55
Does this new approach make sense everywhere that
cjhopman
2015/05/15 20:16:55
Maybe we should change the name of this to reflect
wychen
2015/05/18 18:49:20
Well, the goal of this function is still to count
wychen
2015/05/18 18:49:20
One problem I see is the title finding part. It co
|
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); |
- return m ? m.length : 0; |
+ // Count each whitespace-delimited token holding a \w, U+00C0-U+1FFF, or Hangul Syllable char. |
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+ var c = (m ? m.length : 0); |
+ // Each char in U+3040-U+A4CF (Hiragana, Katakana, CJK Unified Ideographs, and the blocks |
+ // between them) adds 0.55 words, rounded up as a whole; Hangul Syllables lie outside it. |
+ m = s.match(/([\u3040-\uA4CF])/g); |
+ c += Math.ceil((m ? m.length : 0) * 0.55); |
+ return c; |
}-*/; |
public static native String regexEscape(String s) /*-{ |