Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(296)

Unified Diff: java/org/chromium/distiller/StringUtil.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: use java interface Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/StringUtil.java
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b1e99c3ea403eda1b352975f8caa9e8761e3e2f6 100644
--- a/java/org/chromium/distiller/StringUtil.java
+++ b/java/org/chromium/distiller/StringUtil.java
@@ -4,6 +4,7 @@
package org.chromium.distiller;
+import com.google.gwt.core.client.JavaScriptObject;
import com.google.gwt.regexp.shared.RegExp;
public class StringUtil {
@@ -42,14 +43,65 @@ public class StringUtil {
return RegExp.compile(regex, "gi").replace(input, replace);
}
- public static native boolean containsWordCharacter(String s) /*-{
- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s);
- }-*/;
+ /**
+ * For some languages, counting the number of words relies on non-trivial word
+ * segmentation algorithms, or even huge look-up tables. This function needs to
+ * be reasonably fast, so the word count for some languages would only be an
+ * approximation.
+ * Read https://crbug.com/484750 for more info.
+ */
+ private static interface CountWords {
cjhopman 2015/05/29 19:38:57 nit: probably should be s/CountWords/WordCounter
wychen 2015/05/31 09:04:03 Done.
+ public int countWords(String s);
+ }
- public static native int countWords(String s) /*-{
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g);
- return m ? m.length : 0;
- }-*/;
+ /**
+  * Word counter for text that may contain Hiragana, Katakana, or CJK ideographs.
+  *
+  * The result is the number of whitespace-delimited tokens containing at least one
+  * letter-like character (including Hangul Syllables), plus an estimate derived from
+  * the number of characters in the U+3040..U+A4CF range.
+  */
+ private static class FullWordCounting implements CountWords {
+     /**
+      * @param s The text to count words in.
+      * @return The approximate word count of {@code s}.
+      */
+     @Override
+     public native int countWords(String s) /*-{
+         // The following range includes broader alphabetical letters and Hangul Syllables.
+         var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+         // String.match() returns null (not an empty array) when nothing matches.
+         var c = (m ? m.length : 0);
+         // The following range includes Hiragana, Katakana, and CJK Unified Ideographs.
+         // Hangul Syllables are not included.
+         m = s.match(/([\u3040-\uA4CF])/g);
+         // Each matched character contributes 0.55 of a word, rounded up in total.
+         // NOTE(review): 0.55 looks like an empirical words-per-character ratio;
+         // confirm against https://crbug.com/484750.
+         c += Math.ceil((m ? m.length : 0) * 0.55);
+         return c;
+     }-*/;
+ }
+
+ /**
+  * Word counter for text without Hiragana, Katakana, or CJK ideographs, but which may
+  * contain Hangul Syllables.
+  *
+  * Counts whitespace-delimited tokens that contain at least one character matching
+  * {@code \w}, U+00C0..U+1FFF, or U+AC00..U+D7AF.
+  */
+ private static class LetterWordCounting implements CountWords {
+     /**
+      * @param s The text to count words in.
+      * @return The number of matching tokens in {@code s}, or 0 if there are none.
+      */
+     @Override
+     public native int countWords(String s) /*-{
+         // The following range includes broader alphabetical letters and Hangul Syllables.
+         var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+         // String.match() returns null (not an empty array) when nothing matches.
+         return (m ? m.length : 0);
+     }-*/;
+ }
+
+ /**
+  * Word counter for text that contains neither Hangul Syllables nor characters in the
+  * U+3040..U+A4CF range.
+  *
+  * Counts whitespace-delimited tokens that contain at least one character matching
+  * {@code \w} or U+00C0..U+1FFF. Tokens made up solely of characters outside those
+  * ranges are not counted.
+  */
+ private static class FastWordCounting implements CountWords {
+     /**
+      * @param s The text to count words in.
+      * @return The number of matching tokens in {@code s}, or 0 if there are none.
+      */
+     @Override
+     public native int countWords(String s) /*-{
+         // The following range includes broader alphabetical letters.
+         var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g);
+         // String.match() returns null (not an empty array) when nothing matches.
+         return (m ? m.length : 0);
+     }-*/;
+ }
+
+ /**
+  * Chooses the word-counting implementation that later calls to
+  * {@link #countWords(String)} will use, based on which character ranges occur in
+  * {@code text}.
+  *
+  * @param text Sample text used to decide which counter to install.
+  */
+ public static void selectCountWordsFunc(String text) {
+     // U+3040..U+A4CF: Hiragana, Katakana, and CJK Unified Ideographs.
+     final RegExp cjkPattern = RegExp.compile("[\\u3040-\\uA4CF]", "g");
+     // U+AC00..U+D7AF: Hangul Syllables.
+     final RegExp hangulPattern = RegExp.compile("[\\uAC00-\\uD7AF]", "g");
+
+     final CountWords chosen;
+     if (cjkPattern.test(text)) {
+         chosen = new FullWordCounting();
+     } else if (hangulPattern.test(text)) {
+         chosen = new LetterWordCounting();
+     } else {
+         chosen = new FastWordCounting();
+     }
+     _countWords = chosen;
+ }
+
+ // Use the safest version of countWords as the default.
+ static CountWords _countWords = new FullWordCounting();
cjhopman 2015/05/29 19:38:57 s/_countWords/sCountWords in fact, I'd probably c
wychen 2015/05/31 09:04:03 Done.
+
+ /**
+  * Counts the words in {@code s} using the currently installed counter.
+  *
+  * The counter defaults to {@code FullWordCounting} and can be replaced by calling
+  * {@link #selectCountWordsFunc(String)}. For some languages the result is only an
+  * approximation.
+  *
+  * @param s The text to count words in.
+  * @return The (possibly approximate) number of words in {@code s}.
+  */
+ public static int countWords(String s) {
+     return _countWords.countWords(s);
+ }
public static native String regexEscape(String s) /*-{
return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");

Powered by Google App Engine
This is Rietveld 408576698