Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(292)

Unified Diff: java/org/chromium/distiller/StringUtil.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: rewrite tests Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/StringUtil.java
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b9eafefae1ed0c94f942723898846e2826bd4c9b 100644
--- a/java/org/chromium/distiller/StringUtil.java
+++ b/java/org/chromium/distiller/StringUtil.java
@@ -4,6 +4,7 @@
package org.chromium.distiller;
+import com.google.gwt.core.client.JavaScriptObject;
import com.google.gwt.regexp.shared.RegExp;
public class StringUtil {
@@ -42,14 +43,69 @@ public class StringUtil {
return RegExp.compile(regex, "gi").replace(input, replace);
}
- public static native boolean containsWordCharacter(String s) /*-{
- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s);
- }-*/;
+ /**
+ * Counts the words in a string. For some languages, counting words accurately
+ * relies on non-trivial word segmentation algorithms, or even huge look-up
+ * tables. Implementations need to be reasonably fast, so the word count for
+ * such languages is only an approximation.
+ * See https://crbug.com/484750 for more info.
+ */
+ public static interface WordCounter {
+ public int count(String s);
+ }
- public static native int countWords(String s) /*-{
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g);
- return m ? m.length : 0;
- }-*/;
+ public static class FullWordCounter implements WordCounter {
+ public native int count(String s) /*-{
+ // Count non-whitespace runs containing a \w, U+00C0-U+1FFF, or Hangul Syllable (U+AC00-U+D7AF) char.
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+ var c = (m ? m.length : 0);
+ // Then add 0.55 words per character in U+3040-U+A4CF (covers Hiragana, Katakana, and CJK Unified
+ // Ideographs), with the product rounded up. Hangul Syllables lie outside this range.
+ m = s.match(/([\u3040-\uA4CF])/g);
+ c += Math.ceil((m ? m.length : 0) * 0.55);
+ return c;
+ }-*/;
+ }
+
+ public static class LetterWordCounter implements WordCounter {
+ public native int count(String s) /*-{
+ // Count non-whitespace runs containing a \w, U+00C0-U+1FFF, or Hangul Syllable (U+AC00-U+D7AF) char.
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+ return (m ? m.length : 0);
+ }-*/;
+ }
+
+ public static class FastWordCounter implements WordCounter {
+ public native int count(String s) /*-{
+ // Count non-whitespace runs containing a \w or U+00C0-U+1FFF char; no Hangul or CJK range is included.
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g);
+ return (m ? m.length : 0);
+ }-*/;
+ }
+
+ public static void setWordCounter(String text) { // Installs the counter that selectWordCounter() returns for text.
+ sWordCounter = selectWordCounter(text);
+ }
+
+ public static WordCounter selectWordCounter(String text) { // Picks a counter based on which character ranges occur in text.
+ final RegExp rFull = RegExp.compile("[\\u3040-\\uA4CF]", "g"); // U+3040-U+A4CF: the range FullWordCounter counts per character.
+ final RegExp rLetter = RegExp.compile("[\\uAC00-\\uD7AF]", "g"); // U+AC00-U+D7AF: the extra range LetterWordCounter matches.
+
+ if (rFull.test(text)) {
+ return new FullWordCounter();
+ } else if (rLetter.test(text)) {
+ return new LetterWordCounter();
+ } else {
+ return new FastWordCounter();
+ }
+ }
+
+ // Counter used by countWords(). Defaults to FullWordCounter, the safest version, until setWordCounter() replaces it.
+ static WordCounter sWordCounter = new FullWordCounter();
+
+ public static int countWords(String s) { // Delegates to the currently installed sWordCounter.
+ return sWordCounter.count(s);
+ };
public static native String regexEscape(String s) /*-{
return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
« no previous file with comments | « java/org/chromium/distiller/DomDistiller.java ('k') | javatests/org/chromium/distiller/DocumentTitleGetterTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698