Index: java/org/chromium/distiller/StringUtil.java |
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java |
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b1e99c3ea403eda1b352975f8caa9e8761e3e2f6 100644 |
--- a/java/org/chromium/distiller/StringUtil.java |
+++ b/java/org/chromium/distiller/StringUtil.java |
@@ -4,6 +4,7 @@ |
package org.chromium.distiller; |
+import com.google.gwt.core.client.JavaScriptObject; |
import com.google.gwt.regexp.shared.RegExp; |
public class StringUtil { |
@@ -42,14 +43,65 @@ public class StringUtil { |
return RegExp.compile(regex, "gi").replace(input, replace); |
} |
- public static native boolean containsWordCharacter(String s) /*-{ |
- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s); |
- }-*/; |
+ /** |
+ * For some languages, counting the number of words relies on non-trivial word |
+ * segmentation algorithms, or even huge look-up tables. This function needs to |
+ * be reasonably fast, so the word count for some languages would only be an |
+ * approximation. |
+ * Read https://crbug.com/484750 for more info. |
+ */ |
+ private static interface CountWords { |
cjhopman
2015/05/29 19:38:57
nit: probably should be s/CountWords/WordCounter
wychen
2015/05/31 09:04:03
Done.
|
+ /** Returns the number of words found in s. */ |
+ public int countWords(String s); |
+ } |
- public static native int countWords(String s) /*-{ |
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); |
- return m ? m.length : 0; |
- }-*/; |
+ /** |
+ * Word counter that handles both whitespace-delimited words and scripts |
+ * written without spaces (Hiragana, Katakana, CJK Unified Ideographs). |
+ */ |
+ private static class FullWordCounting implements CountWords { |
+ public native int countWords(String s) /*-{ |
+ // Each run of non-whitespace holding at least one letter-like character |
+ // (\w, U+00C0-U+1FFF, or a Hangul Syllable) counts as one word. |
+ var tokens = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+ // Characters in U+3040-U+A4CF (Hiragana, Katakana, CJK Unified Ideographs; |
+ // Hangul Syllables excluded) are tallied one by one. |
+ var glyphs = s.match(/([\u3040-\uA4CF])/g); |
+ // match() gives null when nothing matched, so guard before reading length. |
+ var tokenCount = tokens ? tokens.length : 0; |
+ var glyphCount = glyphs ? glyphs.length : 0; |
+ // Every such character is weighted as 0.55 of a word; the total is rounded up. |
+ return tokenCount + Math.ceil(glyphCount * 0.55); |
+ }-*/; |
+ } |
+ |
+ /** |
+ * Word counter for whitespace-delimited text that also treats Hangul |
+ * Syllables as word characters. |
+ */ |
+ private static class LetterWordCounting implements CountWords { |
+ public native int countWords(String s) /*-{ |
+ // A word is a run of non-whitespace containing at least one character from |
+ // \w, U+00C0-U+1FFF, or the Hangul Syllables range U+AC00-U+D7AF. |
+ var tokens = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
+ // match() gives null, not an empty array, when nothing matched. |
+ if (!tokens) { |
+ return 0; |
+ } |
+ return tokens.length; |
+ }-*/; |
+ } |
+ |
+ /** |
+ * Word counter for whitespace-delimited text; only \w and U+00C0-U+1FFF are |
+ * treated as word characters. |
+ */ |
+ private static class FastWordCounting implements CountWords { |
+ public native int countWords(String s) /*-{ |
+ // A word is a run of non-whitespace containing at least one character from |
+ // \w or the broader alphabetical range U+00C0-U+1FFF. |
+ var tokens = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g); |
+ // match() gives null, not an empty array, when nothing matched. |
+ if (!tokens) { |
+ return 0; |
+ } |
+ return tokens.length; |
+ }-*/; |
+ } |
+ |
+ /** |
+ * Picks the word-counting strategy used by subsequent countWords() calls, |
+ * based on which character ranges occur in the given text. |
+ * |
+ * @param text Text to inspect. Any character in U+3040-U+A4CF selects |
+ *     FullWordCounting; otherwise any character in U+AC00-U+D7AF selects |
+ *     LetterWordCounting; otherwise FastWordCounting is used. |
+ */ |
+ public static void selectCountWordsFunc(String text) { |
+ // No "g" flag: these patterns are only used with test(), and a global |
+ // RegExp would make test() depend on, and update, its lastIndex. |
+ final RegExp rFull = RegExp.compile("[\\u3040-\\uA4CF]"); |
+ final RegExp rLetter = RegExp.compile("[\\uAC00-\\uD7AF]"); |
+ |
+ if (rFull.test(text)) { |
+ _countWords = new FullWordCounting(); |
+ } else if (rLetter.test(text)) { |
+ _countWords = new LetterWordCounting(); |
+ } else { |
+ _countWords = new FastWordCounting(); |
+ } |
+ } |
+ |
+ // Strategy that countWords() delegates to; selectCountWordsFunc() replaces it. |
+ // Use the safest version of countWords as the default. |
+ static CountWords _countWords = new FullWordCounting(); |
cjhopman
2015/05/29 19:38:57
s/_countWords/sCountWords
in fact, I'd probably c
wychen
2015/05/31 09:04:03
Done.
|
+ |
+ /** |
+ * Counts the words in the given string using the strategy currently held in |
+ * _countWords. |
+ * |
+ * @param s The text to count words in. |
+ * @return The word count reported by the current strategy; for some |
+ *     languages this is only an approximation. |
+ */ |
+ public static int countWords(String s) { |
+ return _countWords.countWords(s); |
+ } |
public static native String regexEscape(String s) /*-{ |
return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&"); |