Chromium Code Reviews| Index: java/org/chromium/distiller/StringUtil.java |
| diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java |
| index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..ed6dee7da8cdd682c582e99e7a87890471847177 100644 |
| --- a/java/org/chromium/distiller/StringUtil.java |
| +++ b/java/org/chromium/distiller/StringUtil.java |
| @@ -42,13 +42,50 @@ public class StringUtil { |
| return RegExp.compile(regex, "gi").replace(input, replace); |
| } |
| - public static native boolean containsWordCharacter(String s) /*-{ |
| - return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s); |
| + /** |
| + * For some languages, counting the number of words relies on non-trivial word |
| + * segmentation algorithms, or even huge look-up tables. This function needs to |
| + * be reasonably fast, so the word count for some languages would only be an |
| + * approximation. |
| + * Read https://crbug.com/484750 for more info. |
| + */ |
| + public static native void selectCountWordsFunc(String text) /*-{ |
| + if (/[\u3040-\uA4CF]/.test(text)) { |
| + $_countWords = function(s) { |
|
cjhopman
2015/05/22 00:11:58
This is still going to modify the global window ob
cjhopman
2015/05/22 00:13:29
Oh, I see what you did. Yeah, we shouldn't add glo
wychen
2015/05/23 00:32:27
Found a workaround. PTAL.
|
| + // The following range includes broader alphabetical letters and Hangul Syllables. |
| + var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
| + var c = (m ? m.length : 0); |
| + // The following range includes Hiragana, Katakana, and CJK Unified Ideographs. |
| + // Hangul Syllables are not included. |
| + m = s.match(/([\u3040-\uA4CF])/g); |
| + c += Math.ceil((m ? m.length : 0) * 0.55); |
| + return c; |
| + }; |
| + return; |
| + } |
| + if (/[\uAC00-\uD7AF]/.test(text)) { |
| + $_countWords = function(s) { |
| + // The following range includes broader alphabetical letters and Hangul Syllables. |
| + var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g); |
| + return (m ? m.length : 0); |
| + }; |
| + return; |
| + } |
| + $_countWords = function(s) { |
| + // The following range includes broader alphabetical letters. |
| + var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g); |
| + return (m ? m.length : 0); |
| + }; |
| }-*/; |
| + private static final int __dummy_setCountWords = initCountWords(); |
| + private static int initCountWords() { |
| + selectCountWordsFunc("字"); |
| + return 0; |
| + } |
| + |
| public static native int countWords(String s) /*-{ |
| - var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); |
| - return m ? m.length : 0; |
| + return $_countWords(s); |
| }-*/; |
| public static native String regexEscape(String s) /*-{ |