| Index: java/org/chromium/distiller/StringUtil.java
|
| diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java
|
| index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b9eafefae1ed0c94f942723898846e2826bd4c9b 100644
|
| --- a/java/org/chromium/distiller/StringUtil.java
|
| +++ b/java/org/chromium/distiller/StringUtil.java
|
| @@ -4,6 +4,7 @@
|
|
|
| package org.chromium.distiller;
|
|
|
| +import com.google.gwt.core.client.JavaScriptObject;
|
| import com.google.gwt.regexp.shared.RegExp;
|
|
|
| public class StringUtil {
|
| @@ -42,14 +43,69 @@ public class StringUtil {
|
| return RegExp.compile(regex, "gi").replace(input, replace);
|
| }
|
|
|
| - public static native boolean containsWordCharacter(String s) /*-{
|
| - return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s);
|
| - }-*/;
|
| +    /**
|
| +     * Strategy for counting the words in a string. For some languages an exact count
|
| +     * relies on non-trivial word segmentation algorithms, or even huge look-up tables.
|
| +     * Counting needs to be reasonably fast, so for such languages the returned word
|
| +     * count is only an approximation.
|
| +     * Read https://crbug.com/484750 for more info.
|
| +     */
|
| +    public interface WordCounter {
|
| +        int count(String s);
|
| +    }
|
|
|
| - public static native int countWords(String s) /*-{
|
| - var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g);
|
| - return m ? m.length : 0;
|
| - }-*/;
|
| +    public static class FullWordCounter implements WordCounter {
|
| +        public native int count(String s) /*-{
|
| +            // Whitespace-delimited tokens containing a word character, a letter in
|
| +            // U+00C0-U+1FFF or a Hangul Syllable (U+AC00-U+D7AF) count as one word each.
|
| +            var tokens = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
|
| +            // Hiragana, Katakana and CJK Unified Ideographs lie in U+3040-U+A4CF (Hangul
|
| +            // Syllables do not); their character count is scaled by 0.55 and rounded up.
|
| +            var glyphs = s.match(/([\u3040-\uA4CF])/g);
|
| +            var glyphWords = Math.ceil((glyphs ? glyphs.length : 0) * 0.55);
|
| +            return (tokens ? tokens.length : 0) + glyphWords;
|
| +        }-*/;
|
| +    }
|
| +
|
| +    public static class LetterWordCounter implements WordCounter {
|
| +        public native int count(String s) /*-{
|
| +            // A whitespace-delimited token is one word when it holds a word character,
|
| +            // a letter in U+00C0-U+1FFF or a Hangul Syllable (U+AC00-U+D7AF).
|
| +            return (s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g) || []).length;
|
| +        }-*/;
|
| +    }
|
| +
|
| +    public static class FastWordCounter implements WordCounter {
|
| +        public native int count(String s) /*-{
|
| +            // Only word characters and letters in U+00C0-U+1FFF mark a token as a word.
|
| +            var tokens = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g) || [];
|
| +            return tokens.length;
|
| +        }-*/;
|
| +    }
|
| +
|
| +    public static void setWordCounter(String text) {
|
| +        StringUtil.sWordCounter = StringUtil.selectWordCounter(text);  // Used by countWords().
|
| +    }
|
| +
|
| +    public static WordCounter selectWordCounter(String text) {
|
| +        // Any character in U+3040-U+A4CF (the range FullWordCounter weighs) needs that counter.
|
| +        final RegExp cjkPattern = RegExp.compile("[\\u3040-\\uA4CF]", "g");
|
| +        if (cjkPattern.test(text)) {
|
| +            return new FullWordCounter();
|
| +        }
|
| +        // Otherwise text with a character in U+AC00-U+D7AF gets the letter-based counter.
|
| +        if (RegExp.compile("[\\uAC00-\\uD7AF]", "g").test(text)) {
|
| +            return new LetterWordCounter();
|
| +        }
|
| +        return new FastWordCounter();
|
| +    }
|
| +
|
| + // Default to FullWordCounter, the safest choice: its patterns cover every range the other counters use.
|
| + static WordCounter sWordCounter = new FullWordCounter();
|
| +
|
| +    public static int countWords(String s) {
|
| +        return sWordCounter.count(s);  // Delegates to the currently installed WordCounter.
|
| +    }
|
|
|
| public static native String regexEscape(String s) /*-{
|
| return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
|
|
|