Index: java/org/chromium/distiller/StringUtil.java |
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java |
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b9eafefae1ed0c94f942723898846e2826bd4c9b 100644 |
--- a/java/org/chromium/distiller/StringUtil.java |
+++ b/java/org/chromium/distiller/StringUtil.java |
@@ -4,6 +4,7 @@ |
package org.chromium.distiller; |
+import com.google.gwt.core.client.JavaScriptObject; |
import com.google.gwt.regexp.shared.RegExp; |
public class StringUtil { |
@@ -42,14 +43,69 @@ public class StringUtil { |
return RegExp.compile(regex, "gi").replace(input, replace); |
} |
- public static native boolean containsWordCharacter(String s) /*-{ |
- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s); |
- }-*/; |
+ /** |
+  * Strategy for estimating how many words a piece of text contains. |
+  * |
+  * For some languages, counting the number of words relies on non-trivial word |
+  * segmentation algorithms, or even huge look-up tables. Implementations need to |
+  * be reasonably fast, so the word count for some languages is only an |
+  * approximation. |
+  * Read https://crbug.com/484750 for more info. |
+  */ |
+ public interface WordCounter { |
+     /** |
+      * @param s the text to examine |
+      * @return the (possibly approximate) number of words in {@code s} |
+      */ |
+     int count(String s); |
+ } |
- public static native int countWords(String s) /*-{ |
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); |
- return m ? m.length : 0; |
- }-*/; |
+ /** |
+  * Counts whitespace-delimited words and then adds an estimate for characters in |
+  * the U+3040..U+A4CF range, at 0.55 words per character (rounded up). |
+  */ |
+ public static class FullWordCounter implements WordCounter { |
+     @Override |
+     public native int count(String s) /*-{ |
+         // Tokens between whitespace that hold at least one word character, broader |
+         // alphabetical letter, or Hangul Syllable. |
+         var tokens = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+         var total = 0; |
+         if (tokens) { |
+             total = tokens.length; |
+         } |
+         // Individual characters in the range that includes Hiragana, Katakana, and |
+         // CJK Unified Ideographs. Hangul Syllables are not included. |
+         var glyphs = s.match(/([\u3040-\uA4CF])/g); |
+         var glyphCount = 0; |
+         if (glyphs) { |
+             glyphCount = glyphs.length; |
+         } |
+         // Such scripts are not space-delimited, so estimate words per character. |
+         return total + Math.ceil(glyphCount * 0.55); |
+     }-*/; |
+ } |
+ |
+ /** |
+  * Counts whitespace-delimited tokens that contain at least one word character, |
+  * broader alphabetical letter, or Hangul Syllable. |
+  */ |
+ public static class LetterWordCounter implements WordCounter { |
+     @Override |
+     public native int count(String s) /*-{ |
+         // The character class covers broader alphabetical letters and Hangul Syllables. |
+         var words = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+         // String.match yields null when nothing matches. |
+         if (!words) { |
+             return 0; |
+         } |
+         return words.length; |
+     }-*/; |
+ } |
+ |
+ /** |
+  * Counts whitespace-delimited tokens that contain at least one word character |
+  * or broader alphabetical letter (U+00C0..U+1FFF). |
+  */ |
+ public static class FastWordCounter implements WordCounter { |
+     @Override |
+     public native int count(String s) /*-{ |
+         // The character class covers broader alphabetical letters. |
+         // Fall back to an empty array when String.match finds nothing (null). |
+         var words = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g) || []; |
+         return words.length; |
+     }-*/; |
+ } |
+ |
+ /** |
+  * Picks a word counter based on {@code text} and installs it as the counter |
+  * used by subsequent {@link #countWords} calls. |
+  * |
+  * @param text sample text used to choose the counter |
+  */ |
+ public static void setWordCounter(String text) { |
+     final WordCounter chosen = selectWordCounter(text); |
+     sWordCounter = chosen; |
+ } |
+ |
+ /** |
+  * Chooses a word counter according to the characters present in {@code text}. |
+  * |
+  * @param text the text to inspect |
+  * @return a {@link FullWordCounter} if {@code text} has a character in |
+  *         U+3040..U+A4CF; otherwise a {@link LetterWordCounter} if it has a |
+  *         character in U+AC00..U+D7AF; otherwise a {@link FastWordCounter} |
+  */ |
+ public static WordCounter selectWordCounter(String text) { |
+     // Checked first: this range takes precedence over the Hangul Syllables range. |
+     if (RegExp.compile("[\\u3040-\\uA4CF]", "g").test(text)) { |
+         return new FullWordCounter(); |
+     } |
+     if (RegExp.compile("[\\uAC00-\\uD7AF]", "g").test(text)) { |
+         return new LetterWordCounter(); |
+     } |
+     return new FastWordCounter(); |
+ } |
+ |
+ // Counter that countWords() delegates to; setWordCounter() replaces it. |
+ // Use the safest version of WordCounter as the default. |
+ static WordCounter sWordCounter = new FullWordCounter(); |
+ |
+ /** |
+  * Counts the words in {@code s} using the currently installed word counter. |
+  * |
+  * @param s the text whose words are counted |
+  * @return the count reported by the current counter; for some languages this |
+  *         is only an approximation |
+  */ |
+ public static int countWords(String s) { |
+     return sWordCounter.count(s); |
+ } |
public static native String regexEscape(String s) /*-{ |
return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&"); |