Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(487)

Unified Diff: java/org/chromium/distiller/StringUtil.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: speed up Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/StringUtil.java
diff --git a/java/org/chromium/distiller/StringUtil.java b/java/org/chromium/distiller/StringUtil.java
index 00f387644ac3e50ab9ff091d8e21ee4dbc22a23c..8fa0b0c7e96111d10a6e8c77aec31564e80a2917 100644
--- a/java/org/chromium/distiller/StringUtil.java
+++ b/java/org/chromium/distiller/StringUtil.java
@@ -42,13 +42,50 @@ public class StringUtil {
return RegExp.compile(regex, "gi").replace(input, replace);
}
- public static native boolean containsWordCharacter(String s) /*-{
- return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s);
+ /**
+ * For some languages, counting the number of words relies on non-trivial word
+ * segmentation algorithms, or even huge look-up tables. This function needs to
+ * be reasonably fast, so the word count for some languages would only be an
+ * approximation.
+ * Read https://crbug.com/484750 for more info.
+ *
+ * Inspects the sample text once and stores the matching counting function in
+ * $wnd._countWords, which countWords() then calls.
+ *
+ * @param text Sample text used to decide which counting function to install.
+ */
+ public static native void selectCountWordsFunc(String text) /*-{
+ // Hiragana, Katakana, or CJK ideographs present: count delimited words and
+ // add a per-character estimate for the ideographic characters.
+ if (/[\u3040-\uA4CF]/.test(text)) {
+ $wnd._countWords = function(s) {
cjhopman 2015/05/21 19:32:02 Don't make changes to $wnd
wychen 2015/05/21 23:07:03 Done.
+ // The following range includes broader alphabetical letters and Hangul Syllables.
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+ var c = (m ? m.length : 0);
+ // The following range includes Hiragana, Katakana, and CJK Unified Ideographs.
+ // Hangul Syllables are not included.
+ m = s.match(/([\u3040-\uA4CF])/g);
+ // Each such character is counted as 0.55 words; the sum is rounded up.
+ c += Math.ceil((m ? m.length : 0) * 0.55);
+ return c;
+ };
+ return;
+ }
+ // Hangul Syllables present (U+AC00 to U+D7AF). The lower bound must be written
+ // as a full unicode escape; a bare backslash-A is an identity escape, which made
+ // the class match the literals A, C, 0 and everything from 0 up to U+D7AF, so
+ // this branch was taken for almost any text and the default below was skipped.
+ if (/[\uAC00-\uD7AF]/.test(text)) {
+ $wnd._countWords = function(s) {
+ // The following range includes broader alphabetical letters and Hangul Syllables.
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
+ return (m ? m.length : 0);
+ };
+ return;
+ }
+ // Default: neither of the scripts above was found in the sample text.
+ $wnd._countWords = function(s) {
+ // The following range includes broader alphabetical letters.
+ var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g);
+ return (m ? m.length : 0);
+ };
}-*/;
+ // The initializer of this field calls initCountWords(), which installs a
+ // counting function in $wnd._countWords. The stored value is always 0;
+ // NOTE(review): the field appears to exist only to trigger that call - confirm.
+ private static final int __dummy_setCountWords = initCountWords();
+ // Selects the counting function using the sample "字", which lies inside the
+ // Hiragana/Katakana/CJK range that selectCountWordsFunc tests first.
+ private static int initCountWords() {
+ selectCountWordsFunc("字");
+ return 0;
+ }
+
+ /**
+ * Counts the words in s by delegating to the function currently stored in
+ * $wnd._countWords (installed by selectCountWordsFunc).
+ *
+ * @param s The text whose words are counted.
+ * @return The value returned by the installed counting function.
+ */
public static native int countWords(String s) /*-{
- var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g);
- return m ? m.length : 0;
+ return $wnd._countWords(s);
}-*/;
public static native String regexEscape(String s) /*-{

Powered by Google App Engine
This is Rietveld 408576698