Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(92)

Side by Side Diff: java/org/chromium/distiller/StringUtil.java

Issue 1131853006: Fix word count issue for Chinese and Japanese (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: use java interface Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import com.google.gwt.core.client.JavaScriptObject;
7 import com.google.gwt.regexp.shared.RegExp; 8 import com.google.gwt.regexp.shared.RegExp;
8 9
9 public class StringUtil { 10 public class StringUtil {
10 // For the whitespace-related functions below, Java's and Javascript's versions of '\s' and '\S' 11 // For the whitespace-related functions below, Java's and Javascript's versions of '\s' and '\S'
11 // are different. E.g. java doesn't recognize &nbsp; in a text node as whitespace but 12 // are different. E.g. java doesn't recognize &nbsp; in a text node as whitespace but
12 // javascript does. The former causes GWT tests to fail; the latter is what we want. 13 // javascript does. The former causes GWT tests to fail; the latter is what we want.
13 // Don't use the "g" global search flag, or subsequent searches, even with different Character 14 // Don't use the "g" global search flag, or subsequent searches, even with different Character
14 // or String, become unpredictable. 15 // or String, become unpredictable.
15 16
16 public static native boolean isWhitespace(Character c) /*-{ 17 public static native boolean isWhitespace(Character c) /*-{
(...skipping 18 matching lines...) Expand all
35 } 36 }
36 37
37 public static boolean match(String input, String regex) { 38 public static boolean match(String input, String regex) {
38 return RegExp.compile(regex, "i").test(input); 39 return RegExp.compile(regex, "i").test(input);
39 } 40 }
40 41
41 public static String findAndReplace(String input, String regex, String replace) { 42 public static String findAndReplace(String input, String regex, String replace) {
42 return RegExp.compile(regex, "gi").replace(input, replace); 43 return RegExp.compile(regex, "gi").replace(input, replace);
43 } 44 }
44 45
45 public static native boolean containsWordCharacter(String s) /*-{ 46 /**
46 return /[\w\u00C0-\u1FFF\u2C00-\uD7FF]/.test(s); 47 * For some languages, counting the number of words relies on non-trivial word
47 }-*/; 48 * segmentation algorithms, or even huge look-up tables. This function needs to
49 * be reasonably fast, so the word count for some languages would only be an
50 * approximation.
51 * Read https://crbug.com/484750 for more info.
52 */
53 private static interface CountWords {
cjhopman 2015/05/29 19:38:57 nit: probably should be s/CountWords/WordCounter
wychen 2015/05/31 09:04:03 Done.
54 public int countWords(String s);
55 }
48 56
49 public static native int countWords(String s) /*-{ 57 private static class FullWordCounting implements CountWords {
50 var m = s.match(/(\S*[\w\u00C0-\u1FFF\u2C00-\uD7FF]\S*)/g); 58 public native int countWords(String s) /*-{
51 return m ? m.length : 0; 59 // The following range includes broader alphabetical letters and Hangul Syllables.
52 }-*/; 60 var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
61 var c = (m ? m.length : 0);
62 // The following range includes Hiragana, Katakana, and CJK Unified Ideographs.
63 // Hangul Syllables are not included.
64 m = s.match(/([\u3040-\uA4CF])/g);
65 c += Math.ceil((m ? m.length : 0) * 0.55);
66 return c;
67 }-*/;
68 }
69
70 private static class LetterWordCounting implements CountWords {
71 public native int countWords(String s) /*-{
72 // The following range includes broader alphabetical letters and Hangul Syllables.
73 var m = s.match(/(\S*[\w\u00C0-\u1FFF\uAC00-\uD7AF]\S*)/g);
74 return (m ? m.length : 0);
75 }-*/;
76 }
77
78 private static class FastWordCounting implements CountWords {
79 public native int countWords(String s) /*-{
80 // The following range includes broader alphabetical letters.
81 var m = s.match(/(\S*[\w\u00C0-\u1FFF]\S*)/g);
82 return (m ? m.length : 0);
83 }-*/;
84 }
85
86 public static void selectCountWordsFunc(String text) {
87 final RegExp rFull = RegExp.compile("[\\u3040-\\uA4CF]", "g");
88 final RegExp rLetter = RegExp.compile("[\\uAC00-\\uD7AF]", "g");
89
90 if (rFull.test(text)) {
91 _countWords = new FullWordCounting();
92 } else if (rLetter.test(text)) {
93 _countWords = new LetterWordCounting();
94 } else {
95 _countWords = new FastWordCounting();
96 }
97 }
98
99 // Use the safest version of countWords as the default.
100 static CountWords _countWords = new FullWordCounting();
cjhopman 2015/05/29 19:38:57 s/_countWords/sCountWords in fact, I'd probably c
wychen 2015/05/31 09:04:03 Done.
101
102 public static int countWords(String s) {
103 return _countWords.countWords(s);
104 };
53 105
54 public static native String regexEscape(String s) /*-{ 106 public static native String regexEscape(String s) /*-{
55 return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&"); 107 return s.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
56 }-*/; 108 }-*/;
57 109
58 /* 110 /*
59 * Returns true if character is a digit. 111 * Returns true if character is a digit.
60 */ 112 */
61 public static native boolean isDigit(Character c) /*-{ 113 public static native boolean isDigit(Character c) /*-{
62 return /\d/.test(c); 114 return /\d/.test(c);
(...skipping 16 matching lines...) Expand all
79 /** 131 /**
80 * Returns the plain number if given string can be converted to one >= 0. 132 * Returns the plain number if given string can be converted to one >= 0.
81 * Returns -1 if string is empty or not all digits. 133 * Returns -1 if string is empty or not all digits.
82 */ 134 */
83 public static int toNumber(String s) { 135 public static int toNumber(String s) {
84 if (s.isEmpty() || !StringUtil.isStringAllDigits(s)) return -1; 136 if (s.isEmpty() || !StringUtil.isStringAllDigits(s)) return -1;
85 return JavaScript.parseInt(s, 10); 137 return JavaScript.parseInt(s, 10);
86 } 138 }
87 139
88 } 140 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698