Chromium Code Reviews| Index: java/org/chromium/distiller/StringUtil.java |
| diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java |
| index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..b1e99c3ea403eda1b352975f8caa9e8761e3e2f6 100644 |
| --- a/java/org/chromium/distiller/StringUtil.java |
| +++ b/java/org/chromium/distiller/StringUtil.java |
| @@ -4,6 +4,7 @@ |
| package org.chromium.distiller; |
| +import com.google.gwt.core.client.JavaScriptObject; |
| import com.google.gwt.regexp.shared.RegExp; |
| public class StringUtil { |
| @@ -42,14 +43,65 @@ public class StringUtil { |
| return RegExp.compile(regex, "gi").replace(input, replace); |
| } |
| - public static native boolean containsWordCharacter(String s) /*-{ |
| - return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s); |
| - }-*/; |
| + /** |
| + * For some languages, counting words accurately requires non-trivial word |
| + * segmentation algorithms, or even huge look-up tables. Word counting needs to |
| + * be reasonably fast, so the count for those languages is only an |
| + * approximation. |
| + * See https://crbug.com/484750 for more info. |
| + */ |
| + private static interface CountWords { |
|
cjhopman
2015/05/29 19:38:57
nit: probably should be s/CountWords/WordCounter
wychen
2015/05/31 09:04:03
Done.
|
| + public int countWords(String s); |
| + } |
| - public static native int countWords(String s) /*-{ |
| - var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); |
| - return m ? m.length : 0; |
| - }-*/; |
| + private static class FullWordCounting implements CountWords { |
| + public native int countWords(String s) /*-{ |
| + // The following range includes broader alphabetical letters and Hangul Syllables. |
| + var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
| + var c = (m ? m.length : 0); |
| + // The following range includes Hiragana, Katakana, and CJK Unified Ideographs. |
| + // Hangul Syllables are not included. |
| + m = s.match(/([\u3040-\uA4CF])/g); |
| + c += Math.ceil((m ? m.length : 0) * 0.55); |
| + return c; |
| + }-*/; |
| + } |
| + |
| + private static class LetterWordCounting implements CountWords { |
| + public native int countWords(String s) /*-{ |
| + // The following range includes broader alphabetical letters and Hangul Syllables. |
| + var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
| + return (m ? m.length : 0); |
| + }-*/; |
| + } |
| + |
| + private static class FastWordCounting implements CountWords { |
| + public native int countWords(String s) /*-{ |
| + // The following range includes broader alphabetical letters. |
| + var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g); |
| + return (m ? m.length : 0); |
| + }-*/; |
| + } |
| + |
| + public static void selectCountWordsFunc(String text) { |
| + final RegExp rFull = RegExp.compile("[\\u3040-\\uA4CF]", "g"); |
| + final RegExp rLetter = RegExp.compile("[\\uAC00-\\uD7AF]", "g"); |
| + |
| + if (rFull.test(text)) { |
| + _countWords = new FullWordCounting(); |
| + } else if (rLetter.test(text)) { |
| + _countWords = new LetterWordCounting(); |
| + } else { |
| + _countWords = new FastWordCounting(); |
| + } |
| + } |
| + |
| + // Default to FullWordCounting, the most thorough counter, until selectCountWordsFunc() is called. |
| + static CountWords _countWords = new FullWordCounting(); |
|
cjhopman
2015/05/29 19:38:57
s/_countWords/sCountWords
in fact, I'd probably c
wychen
2015/05/31 09:04:03
Done.
|
| + |
| + public static int countWords(String s) { |
| + return _countWords.countWords(s); |
| + }; |
| public static native String regexEscape(String s) /*-{ |
| return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&"); |