java/org/chromium/distiller/StringUtil.java - Issue 1131853006: Fix word count issue for Chinese and Japanese

Unified Diff: java/org/chromium/distiller/StringUtil.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: rewrite tests Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « java/org/chromium/distiller/DomDistiller.java ('k') | javatests/org/chromium/distiller/DocumentTitleGetterTest.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: java/org/chromium/distiller/StringUtil.java

diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java

index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b9eafefae1ed0c94f942723898846e2826bd4c9b 100644

--- a/java/org/chromium/distiller/StringUtil.java

+++ b/java/org/chromium/distiller/StringUtil.java

@@ -4,6 +4,7 @@

package org.chromium.distiller;

+import com.google.gwt.core.client.JavaScriptObject;

import com.google.gwt.regexp.shared.RegExp;

public class StringUtil {

@@ -42,14 +43,69 @@ public class StringUtil {

return RegExp.compile(regex, "gi").replace(input, replace);

}

- public static native boolean containsWordCharacter(String s) /*-{

- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s);

- }-*/;

+    /**
+     * Strategy for estimating how many words a string contains.
+     *
+     * Exact word counts for some languages would require non-trivial word
+     * segmentation algorithms or even huge look-up tables. Counting has to stay
+     * reasonably fast, so for those languages an implementation may return only
+     * an approximation.
+     * See https://crbug.com/484750 for background.
+     */
+    public interface WordCounter {
+        /**
+         * @param s the text to examine.
+         * @return the (possibly approximate) number of words in {@code s}.
+         */
+        int count(String s);
+    }

- public static native int countWords(String s) /*-{

- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g);

- return m ? m.length : 0;

- }-*/;

+    /**
+     * Counter that combines two estimates: whitespace-delimited tokens that
+     * contain at least one letter-like character, plus a weighted count of the
+     * individual characters in the range U+3040..U+A4CF.
+     */
+    public static class FullWordCounter implements WordCounter {
+        public native int count(String s) /*-{
+            // Whitespace-delimited tokens holding at least one character from a range
+            // that includes broader alphabetical letters and Hangul Syllables.
+            var tokens = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+            // Single characters from a range that includes Hiragana, Katakana, and
+            // CJK Unified Ideographs. Hangul Syllables are not included.
+            var glyphs = s.match(/([\u3040-\uA4CF])/g);
+
+            var tokenCount = tokens ? tokens.length : 0;
+            var glyphCount = glyphs ? glyphs.length : 0;
+
+            // Each matched character contributes 0.55 of a word; the weighted
+            // total is rounded up before being added to the token count.
+            return tokenCount + Math.ceil(glyphCount * 0.55);
+        }-*/;
+    }

+    /**
+     * Counter that reports the number of whitespace-delimited tokens containing
+     * at least one letter-like character, Hangul Syllables included.
+     */
+    public static class LetterWordCounter implements WordCounter {
+        public native int count(String s) /*-{
+            // The character class includes broader alphabetical letters and Hangul Syllables.
+            var hits = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+            if (!hits) {
+                // match() yields null when nothing matched.
+                return 0;
+            }
+            return hits.length;
+        }-*/;
+    }

+    /**
+     * Counter that reports the number of whitespace-delimited tokens containing
+     * at least one character from the broader alphabetical range only.
+     */
+    public static class FastWordCounter implements WordCounter {
+        public native int count(String s) /*-{
+            // The character class includes broader alphabetical letters.
+            var found = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g);
+            // match() yields null when nothing matched.
+            return found === null ? 0 : found.length;
+        }-*/;
+    }

+    /**
+     * Installs the counter chosen by {@link #selectWordCounter(String)} for
+     * {@code text} as the one used by {@link #countWords(String)}.
+     *
+     * @param text sample text whose characters decide which counter is used.
+     */
+    public static void setWordCounter(String text) {
+        final WordCounter chosen = selectWordCounter(text);
+        sWordCounter = chosen;
+    }

+    /**
+     * Chooses a {@link WordCounter} based on the characters present in {@code text}.
+     *
+     * <ul>
+     * <li>Any character in U+3040..U+A4CF selects {@link FullWordCounter}.</li>
+     * <li>Otherwise, any character in U+AC00..U+D7AF selects {@link LetterWordCounter}.</li>
+     * <li>Otherwise {@link FastWordCounter} is used.</li>
+     * </ul>
+     *
+     * @param text the text to inspect.
+     * @return a new counter instance matching the scripts found in {@code text}.
+     */
+    public static WordCounter selectWordCounter(String text) {
+        final RegExp rFull = RegExp.compile("[\\u3040-\\uA4CF]", "g");
+        final RegExp rLetter = RegExp.compile("[\\uAC00-\\uD7AF]", "g");
+
+        // The checks run in order; the second pattern is only tested when the
+        // first one did not match.
+        if (rFull.test(text)) {
+            return new FullWordCounter();
+        }
+        if (rLetter.test(text)) {
+            return new LetterWordCounter();
+        }
+        return new FastWordCounter();
+    }

+    // Counter that countWords() delegates to. setWordCounter() replaces it.
+    // Use the safest version of WordCounter as the default.
+    static WordCounter sWordCounter = new FullWordCounter();

+    /**
+     * Counts the words in {@code s} by delegating to the currently installed
+     * {@link WordCounter}.
+     *
+     * @param s the text to count words in.
+     * @return the word count reported by the installed counter.
+     */
+    public static int countWords(String s) {
+        return sWordCounter.count(s);
+    }

public static native String regexEscape(String s) /*-{

return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");

« no previous file with comments | « java/org/chromium/distiller/DomDistiller.java ('k') | javatests/org/chromium/distiller/DocumentTitleGetterTest.java » ('j') | no next file with comments »