Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1348)

Side by Side Diff: java/org/chromium/distiller/PathComponentPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import com.google.gwt.regexp.shared.MatchResult;
8 import com.google.gwt.regexp.shared.RegExp;
9
10 /**
11 * This class detects the page parameter in the path of a potential pagination URL. If detected,
12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates
13 * and returns a new object. This object can then be accessed via the PageParameterDetector.PagePattern
14 * interface to:
15 * - validate the generated URL page pattern against the document URL
16 * - determine if a URL is a paging URL based on the page pattern.
17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is
18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDetector.java).
19 */
20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte rn {
21 private final ParsedUrl mUrl; // Parsed form of the page pattern URL (the URL with the placeholder).
22 private final int mPageNumber; // Page number parsed from the digits that the placeholder replaced.
23 private final int mPlaceholderStart; // Index in mUrlStr where the placeholder starts.
24 private final String mUrlStr; // The page pattern URL as a string.
25 // Index in mUrlStr of the last '/' at or before the placeholder, i.e. the start of the path component containing the placeholder.
26 private int mPlaceholderSegmentStart;
27 // Index, within the pattern's path components, of the first component containing the placeholder.
28 private int mParamIndex = -1;
29 private final String mPrefix; // The part of the page pattern before the path component that contains the placeholder.
30 private String mSuffix = ""; // The part of the page pattern after the placeholder; empty if nothing follows it.
31
32 /**
33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER.
34 */
35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart , int digitStart,
36 int digitEnd) {
37 try {
38 return new PathComponentPagePattern(url, pathStart, digitStart, digi tEnd);
39 } catch (IllegalArgumentException e) {
40 return null;
41 }
42 }
43
44 @Override
45 public String toString() {
46 return mUrlStr;
47 }
48
49 @Override
50 public int getPageNumber() {
51 return mPageNumber;
52 }
53
54 /**
55 * Returns true if pattern and URL are sufficiently similar and the pattern' s components are not
56 * calendar digits.
57 *
58 * @param docUrl the current document URL
59 */
60 @Override
61 public boolean isValidFor(ParsedUrl docUrl) {
62 final int urlPathComponentsLen = docUrl.getPathComponents().length;
63 final int patternPathComponentsLen = mUrl.getPathComponents().length;
64
65 // Both the pattern and doc URL must have the similar path.
66 if (urlPathComponentsLen > patternPathComponentsLen) return false;
67
68 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must
69 // be at least half of the entire component in doc URL, e.g doc URL is
70 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".
71 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {
72 final String urlComponent = docUrl.getPathComponents()[0];
73 final String patternComponent = mUrl.getPathComponents()[0];
74 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);
75 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat ternComponent,
76 commonPrefixLen);
77 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt h();
78 }
79
80 if (!hasSamePathComponentsAs(docUrl)) return false;
81
82 if (isCalendarPage()) return false;
83
84 return true;
85 }
86
87 /**
88 * Returns true if a URL matches this page pattern based on a pipeline of ru les:
89 * - suffix (part of pattern after page param placeholder) must be same, and
90 * - different set of rules depending on if page param is at start of path c omponent or not.
91 *
92 * @param url the URL to evalutate
93 */
94 @Override
95 public boolean isPagingUrl(String url) {
96 // Both url and pattern must have the same suffix, if available.
97 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false;
98
99 return atStartOfPathComponent() ? isPagingUrlForStartOfPathComponent(url ) :
100 isPagingUrlForNotStartOfPathComponent(url);
101 }
102
/**
 * Builds the page pattern by replacing the digits at [digitStart, digitEnd) of the URL string with
 * PageParameterDetector.PAGE_PARAM_PLACEHOLDER, and records the positions needed for matching.
 *
 * @param url the parsed URL containing the candidate page number
 * @param pathStart index in the URL string where the path starts
 * @param digitStart index of the first digit of the candidate page number
 * @param digitEnd index just past the last digit of the candidate page number
 * @throws IllegalArgumentException if the numeric path component is rejected by
 *         isLastNumericPathComponentBad, the digits do not convert to a non-negative number, or
 *         the resulting pattern cannot be parsed as a URL
 */
private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStart, int digitEnd)
        throws IllegalArgumentException {
    final String original = url.toString();
    if (isLastNumericPathComponentBad(original, pathStart, digitStart, digitEnd)) {
        throw new IllegalArgumentException("Bad last numeric path component");
    }

    final String digits = original.substring(digitStart, digitEnd);
    final int pageNumber = StringUtil.toNumber(digits);
    if (pageNumber < 0) {
        throw new IllegalArgumentException(
                "Value in path component is an invalid number: " + digits);
    }

    // Swap the digits for the placeholder to obtain the pattern.
    final String pattern = original.substring(0, digitStart)
            + PageParameterDetector.PAGE_PARAM_PLACEHOLDER
            + original.substring(digitEnd);
    final ParsedUrl parsedPattern = ParsedUrl.create(pattern);
    if (parsedPattern == null) {
        throw new IllegalArgumentException("Invalid URL: " + pattern);
    }

    mUrl = parsedPattern;
    mUrlStr = pattern;
    mPageNumber = pageNumber;
    mPlaceholderStart = digitStart;
    mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart);
    determineParamIndex();
    mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);

    // Whatever follows the placeholder, if anything, is the suffix.
    final int suffixStart =
            mPlaceholderStart + PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
    if (suffixStart != mUrlStr.length()) {
        mSuffix = mUrlStr.substring(suffixStart);
    }
}
133
134 private boolean atStartOfPathComponent() {
135 return mUrlStr.charAt(mPlaceholderStart - 1) == '/';
136 }
137
138 private void determineParamIndex() {
139 final String[] pathComponents = mUrl.getPathComponents();
140 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) {
141 if (pathComponents[mParamIndex].contains(
142 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) {
143 break;
144 }
145 }
146 }
147
148 /**
149 * Returns true if, except for the path component containing the page param, the other path
150 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g.:
151 * - doc URL is /thread/12, pattern is /thread/12/page/[*!]
152 * returns true because "thread" and "12" in doc URL match those in patter n
153 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo
154 * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param
155 path component comes after.
156 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo
157 * returns true because "foo" in doc URL would match "foo" in pattern whos e page param path
158 * component is skipped when matching.
159 */
160 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) {
161 final String[] urlComponents = docUrl.getPathComponents();
162 final String[] patternComponents = mUrl.getPathComponents();
163 boolean passedParamComponent = false;
164 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) {
165 if (i == mParamIndex && !passedParamComponent) {
166 passedParamComponent = true;
167 // Repeat current path component if doc URL has less components (as per comments
168 // just above, doc URL may have less components).
169 if (urlComponents.length < patternComponents.length) i--;
170 continue;
171 }
172 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false;
173 }
174
175 return true;
176 }
177
178 /**
179 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a
180 * false-positive.
181 */
182 private boolean isCalendarPage() {
183 if (mParamIndex < 2) return false;
184
185 // Only if param is the entire path component. This handles some cases erroneously
186 // considered false-positives e.g. first page is
187 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html,
188 // and second page is
189 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html,
190 // would be considered false-positives otherwise because of "2014" and " 07".
191 final String[] patternComponents = mUrl.getPathComponents();
192 if (patternComponents[mParamIndex].length() !=
193 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) {
194 return false;
195 }
196
197 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]);
198 if (month > 0 && month <= 12) {
199 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]);
200 if (year > 1970 && year < 3000) return true;
201 }
202
203 return false;
204 }
205
206 private static int getLongestCommonPrefixLength(String str1, String str2) {
207 if (str1.isEmpty() || str2.isEmpty()) return 0;
208
209 int limit = Math.min(str1.length(), str2.length());
210 int i = 0;
211 for (; i < limit; i++) {
212 if (str1.charAt(i) != str2.charAt(i)) break;
213 }
214 return i;
215 }
216
217 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {
218 int commonSuffixLen = 0;
219 for (int i = str1.length() - 1, j = str2.length() - 1;
220 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {
221 if (str1.charAt(i) != str2.charAt(i)) break;
222 }
223 return commonSuffixLen;
224 }
225
226 /**
227 * Returns true if url is a paging URL based on the page pattern where the p age param is at the
228 * start of a path component.
229 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:
230 * - www.foo.com/a/2/abc.html
231 * - www.foo.com/a/abc.html
232 * - www.foo.com/abc.html.
233 */
234 private boolean isPagingUrlForStartOfPathComponent(String url) {
235 final int urlLen = url.length();
236 final int suffixLen = mSuffix.length();
237 final int suffixStart = url.length() - suffixLen;
238
239 int prevComponentPos = mUrl.getPath().lastIndexOf('/',
240 // We're only looking in the path, so the reverse search should start at the index
241 // excluding the url's origin.
242 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length());
243 if (prevComponentPos != -1) {
244 // Now, add back the url's origin to the index of previous path comp onent.
245 prevComponentPos += mUrl.getOrigin().length();
246 if (prevComponentPos + suffixLen == urlLen) {
247 // The url doesn't have page number param and previous path comp onent, like
248 // www.foo.com/abc.html.
249 return url.regionMatches(0, mUrlStr, 0, prevComponentPos);
250 }
251 }
252
253 // If both url and pattern have the same prefix, url must have nothing e lse.
254 if (url.startsWith(mPrefix)) {
255 int acceptLen = mPlaceholderSegmentStart + suffixLen;
256 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html.
257 if (acceptLen == urlLen) return true;
258 if (acceptLen > urlLen) return false;
259
260 // While we are here, the url must have page number param, so the ur l must have a '/'
261 // at the pattern's path component start position.
262 if (url.charAt(mPlaceholderSegmentStart) != '/') return false;
263
264 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde rSegmentStart + 1,
265 suffixStart));
266 }
267
268 return false;
269 }
270
271 /**
272 * Returns true if url is a paging URL based on the page pattern where the p age param is not at
273 * the start of a path component.
274 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:
275 * - www.foo.com/a/abc-2.html
276 * - www.foo.com/a/abc.html.
277 */
278 private boolean isPagingUrlForNotStartOfPathComponent(String url) {
279 final int urlLen = url.length();
280 final int suffixStart = urlLen - mSuffix.length();
281
282 // The page param path component of both url and pattern must have the s ame prefix.
283 if (!url.startsWith(mPrefix)) return false;
284
285 // Find the first different character in page param path component just before
286 // placeholder or suffix, then check if it's acceptable.
287 int firstDiffPos = mPlaceholderSegmentStart;
288 int maxPos = Math.min(mPlaceholderStart, suffixStart);
289 for (; firstDiffPos < maxPos; firstDiffPos++) {
290 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break;
291 }
292 if (firstDiffPos == suffixStart) { // First different character is the suffix.
293 if (firstDiffPos + 1 == mPlaceholderStart &&
294 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) {
295 return true;
296 }
297 } else if (firstDiffPos == mPlaceholderStart) { // First different char acter is page param.
298 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) {
299 return true;
300 }
301 }
302
303 return false;
304 }
305
306 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).
307 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.
308
309 /**
310 * Returns true if:
311 * - the digitStart to digitEnd of urlStr is the last path component, and
312 * - the entire path component is numeric, and
313 * - the previous path component is a bad page param name.
314 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad
315 * page param.
316 */
317 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,
318 int digitEnd) {
319 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
320 pathStart < digitStart - 1) { // Not the first path component.
321 String postMatch = urlStr.substring(digitEnd).toLowerCase();
322 // Checks that this is the last path component, and trailing charact ers, if available,
323 // are (s)htm(l) extensions.
324 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");
325 if (sExtRegExp.test(postMatch)) {
326 // Entire component is numeric, get previous path component.
327 if (sLastPathComponentRegExp == null) {
328 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;
329 }
330 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(
331 urlStr.substring(pathStart + 1, digitStart));
332 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&
333 PageParameterDetector.isPageParamNameBad(prevPathCompone nt.getGroup(1))) {
334 return true;
335 }
336 } // last numeric path component
337 }
338
339 return false;
340 }
341
342 /**
343 * Returns true if the given character is one of '-', '_', ';', ','.
 * This is a native method whose body, between the delimiters below, is JavaScript that tests c
 * against the regular expression /[-_;,]/.
344 */
345 private static native boolean isPageParamSeparator(Character c) /*-{
346 return /[-_;,]/.test(c);
347 }-*/;
348
349 }
OLDNEW
« no previous file with comments | « java/org/chromium/distiller/ParsedUrl.java ('k') | java/org/chromium/distiller/QueryParamPagePattern.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698