Index: java/org/chromium/distiller/StringUtil.java |
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java |
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..8fa0b0c7e96111d10a6e8c77aec31564e80a2917 100644 |
--- a/java/org/chromium/distiller/StringUtil.java |
+++ b/java/org/chromium/distiller/StringUtil.java |
@@ -42,13 +42,50 @@ public class StringUtil { |
return RegExp.compile(regex, "gi").replace(input, replace); |
} |
- public static native boolean containsWordCharacter(String s) /*-{ |
- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s); |
+ /** |
+ * Selects the word-counting function that countWords() delegates to and stores it |
+ * in $wnd._countWords. The choice depends on which character ranges occur in the |
+ * given text: U+3040-U+A4CF (Hiragana, Katakana, CJK ideographs), U+AC00-U+D7AF |
+ * (Hangul Syllables), or neither of them. |
+ * |
+ * For some languages, counting the number of words relies on non-trivial word |
+ * segmentation algorithms, or even huge look-up tables. This function needs to |
+ * be reasonably fast, so the word count for some languages would only be an |
+ * approximation. |
+ * Read https://crbug.com/484750 for more info. |
+ * |
+ * @param text Text that is tested against the character ranges described above. |
+ */ |
+ public static native void selectCountWordsFunc(String text) /*-{ |
+ // First choice: text with Hiragana, Katakana, or CJK ideographs gets the combined counter. |
+ if (/[\u3040-\uA4CF]/.test(text)) { |
+ $wnd._countWords = function(s) { |
cjhopman
2015/05/21 19:32:02
Don't make changes to $wnd
wychen
2015/05/21 23:07:03
Done.
|
+ // The following range includes broader alphabetical letters and Hangul Syllables. |
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+ var c = (m ? m.length : 0); |
+ // The following range includes Hiragana, Katakana, and CJK Unified Ideographs. |
+ // Hangul Syllables are not included. |
+ m = s.match(/([\u3040-\uA4CF])/g); |
+ // Each matched character adds 0.55 to the count; the sum is rounded up. |
+ c += Math.ceil((m ? m.length : 0) * 0.55); |
+ return c; |
+ }; |
+ return; |
+ } |
+ // Second choice: text with Hangul Syllables. The escape must be written \uAC00; a bare |
+ // \A is just the letter "A" and would turn this class into a range starting at "0". |
+ if (/[\uAC00-\uD7AF]/.test(text)) { |
+ $wnd._countWords = function(s) { |
+ // The following range includes broader alphabetical letters and Hangul Syllables. |
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+ return (m ? m.length : 0); |
+ }; |
+ return; |
+ } |
+ // Fallback: neither range was found, so only the broader alphabetical range is checked. |
+ $wnd._countWords = function(s) { |
+ // The following range includes broader alphabetical letters. |
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g); |
+ return (m ? m.length : 0); |
+ }; |
}-*/; |
+ // This field is initialized by calling initCountWords(), so a counting function gets |
+ // installed as a side effect of the initializer; the stored value itself is always 0. |
+ private static final int __dummy_setCountWords = initCountWords(); |
+ // Installs the default counting function by passing a single CJK ideograph to |
+ // selectCountWordsFunc(), which matches its first range test. Always returns 0. |
+ private static int initCountWords() { |
+ selectCountWordsFunc("字"); |
+ return 0; |
+ } |
+ |
+ // Returns the word count of s as computed by the function currently stored in |
+ // $wnd._countWords (installed by selectCountWordsFunc). |
public static native int countWords(String s) /*-{ |
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); |
- return m ? m.length : 0; |
+ return $wnd._countWords(s); |
}-*/; |
public static native String regexEscape(String s) /*-{ |