Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(338)

Side by Side Diff: java/org/chromium/distiller/PagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: nit Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import com.google.gwt.regexp.shared.RegExp;
8
9 /**
10 * This class handles the URL page pattern generated from a potential pagination URL, i.e. a URL
11 * containing the PageParameterDetector.PAGE_PARAM_PLACEHOLDER (see comments at top of
12 * PageParameterDetector.java).
13 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is
14 * "http://www.foo.com/a/b-[*!].html".
15 * This class determines the type of pattern (inside query or path), validates p attern against
16 * document URL, and determines if it's a paging URL based on the document URL.
17 */
18 public class PagePattern {
cjhopman 2015/04/07 00:45:49 After commenting a bit below, I wonder if it would
kuan 2015/04/10 22:41:27 Done. i did two - QueryParamPagePattern and PathCo
19 private final ParsedUrl mUrl;
20 private final int mPageParamPos;
cjhopman 2015/04/07 00:45:49 Some of this code below was hard for me to follow.
kuan 2015/04/10 22:41:27 Done.
21 private final String mUrlStr;
22 private final int mQueryPos;
23 private int mComponentPos; // Start position of either page param query or path component.
24 private int mParamIndex = -1; // Page param path component position in list of path components.
25
26 /**
27 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER.
28 */
29 static PagePattern create(String url) {
cjhopman 2015/04/07 00:45:48 How about having a create method for page pattern
kuan 2015/04/10 22:41:27 Done.
30 try {
31 return new PagePattern(url);
32 } catch (Exception e) {
33 return null;
34 }
35 }
36
37 /**
38 * Validates this page pattern according to the current document URL through a pipeline of
39 * rules:
40 * - for query page parameter, pattern and URL must have same path component s.
41 * - for path page parameter,
42 * - pattern and URL must have same number of path components.
cjhopman 2015/04/07 00:45:49 Personally, I'd have these function-level comments
kuan 2015/04/10 22:41:28 Done.
43 * - if only 1 path component, both must have long-enough common prefix an d suffix.
44 * - else all pattern's components, except for page parameter, must be sam e as url's.
45 * - lastly, pattern's components cannot be calendar digits.
46 *
47 * Returns true if page pattern is valid.
48 *
49 * @param docUrl the current document URL
50 * @param pagePattern the page pattern to validate
51 */
52 boolean isValidFor(ParsedUrl docUrl) {
cjhopman 2015/04/07 00:45:48 Would it make sense to check validity when the pag
kuan 2015/04/10 22:41:27 i don't believe so. multiple pagination URLs coul
53 // If page parameter is a query, page pattern and doc URL must have the same path.
54 if (isQueryPattern()) {
55 return docUrl.getTrimmedPath().equalsIgnoreCase(mUrl.getTrimmedPath( ));
56 }
57
58 final int urlPathComponentsLen = docUrl.getPathComponents().length;
59 final int patternPathComponentsLen = mUrl.getPathComponents().length;
60
61 // If the page param is inside of path components, both the pattern and doc URL must have
62 // the similar path.
63 if (urlPathComponentsLen > patternPathComponentsLen) return false;
64
65 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must
66 // be at least half of the entire component in doc URL, e.g doc URL is
67 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".
68 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {
69 final String urlComponent = docUrl.getPathComponents()[0];
70 final String patternComponent = mUrl.getPathComponents()[0];
71 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);
72 return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) +
cjhopman 2015/04/07 00:45:48 nit: could you put the common suffix in a local va
kuan 2015/04/10 22:41:28 Done.
73 commonPrefixLen) * 2 >= urlComponent.length();
74 }
75
76 if (!hasSamePathComponentsAs(docUrl)) return false;
77
78 if (isCalendarPage()) return false;
cjhopman 2015/04/07 00:45:48 Why do we check this here? it doesn't really depen
kuan 2015/04/10 22:41:28 it's true it doesn't depend on doc url. however,
79
80 return true;
81 }
82
83 private static RegExp sSlashOrHtmExtRegExp = null; // Match either '/' or " .htm(l)".
84
85 /**
86 * Returns true if a URL matches this page pattern based on a pipeline of ru les:
87 * - suffix (part of pattern after page param placeholder) must be same, and
88 * - for query page parameter,
89 * - scheme, host, and path must be same, and
90 * - query components, except that for page number, must be same in order and value, and
91 * - query value must be a plain number.
92 * - for path page parameter that is part of a path component,
93 * - if the first different character in path component is suffix, it must be a page parameter
94 * separator, followed by the page parameter in the pattern
95 * - else if it's page parameter, it and possible following digits must be a plain number.
96 * - for path page parameter that is the entire path component,
97 * - if URL has no page number param and previous path component, everythi ng else matches, or
98 * - if prefix is the same, URL doesn't have anyhing else
99 * - else url must have '/' at the same position as pattern's page paramet er path component,
100 * followed by a plain number.
101 *
102 * @param url the URL to evalutate
103 * @param pagePattern the URL page pattern to match with
104 */
105 boolean isPagingUrl(String url) {
106 final int urlLen = url.length();
107 final int patternLen = mUrlStr.length();
108 boolean isDynamicParam = isDynamic();
109
110 // Both url and patterm must have the same suffix, if available.
cjhopman 2015/04/07 00:45:48 nit: s/patterm/pattern
kuan 2015/04/10 22:41:28 Done.
111 int suffixLen = patternLen - mPageParamPos -
112 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
113 if (suffixLen != 0) {
114 int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'.
115 if (!url.regionMatches(urlLen - compareLen, mUrlStr, patternLen - co mpareLen,
116 compareLen)) {
117 return false;
118 }
119 }
120
121 final int suffixPos = urlLen - suffixLen;
122
cjhopman 2015/04/07 00:45:48 Could we split these three cases into their own fu
kuan 2015/04/10 22:41:27 Done.
123 if (isDynamicParam) {
124 // If page parameter is dynamic, the url matches the pattern only wh en:
125 // 1. has same prefix (scheme, host, path)
126 // 2. has same query components with same value (except page numbe r query) in the same
127 // order.
128 // Examples:
129 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&query C=v3
130 // Returns true for:
131 // - http://foo.com/a/b/?queryA=v1&queryC=v3
132 // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3
133 // Otherwise, returns false.
134 //
135 // If page pattern is http://foo.com/a/b?page=[*!]&query=a
136 // Returns true for:
137 // - http://foo.com/a/b?query=a
138 // - http://foo.com/a/b?page=2&query=a
139 // Otherwise, returns false.
140 //
141 // If page pattern is http://foo.com/a/b?page=[*!]
142 // Returns true for:
143 // - http://foo.com/a/b/
144 // - http://foo.com/a/b.html
145 // - http://foo.com/a/b.htm
146 // - http://foo.com/a/b?page=2
147 // Otherwise, returns false.
148
149 // Both url and pattern must have the same prefix.
150 if (suffixPos < mComponentPos || !url.regionMatches(0, mUrlStr, 0, m ComponentPos)) {
cjhopman 2015/04/07 00:45:48 how about: url.endsWith(mRequiredSuffix) and cal
kuan 2015/04/10 22:41:27 Done. i assume u mean for lines 110-119 (at the b
151 return false;
152 }
153
154 // If the url doesn't have page number query, it is fine.
155 if (mComponentPos == suffixPos) return true;
156
157 // If the only difference in the page param query component of url a nd pattern is "/",
158 // ".html" or ".html", it is fine.
cjhopman 2015/04/07 00:45:48 s/.html/.htm for one of the two
kuan 2015/04/10 22:41:28 Done.
159 String diffPart = url.substring(mComponentPos, suffixPos).toLowerCas e();
cjhopman 2015/04/07 00:45:48 I'm not really sure what diffpart is here. What is
kuan 2015/04/10 22:41:28 yes, it's for the last set of examples.
160 if (sSlashOrHtmExtRegExp == null) {
161 sSlashOrHtmExtRegExp = RegExp.compile("^\\/|(.html?)$", "i");
162 }
163 if (sSlashOrHtmExtRegExp.test(diffPart)) return true;
164
165 // Both url and pattern must have the same query name.
166 if (!url.regionMatches(mComponentPos, mUrlStr, mComponentPos,
167 mPageParamPos - mComponentPos)) {
168 return false;
169 }
170
171 return isPlainNumber(url.substring(mPageParamPos, suffixPos));
172 } // isDynamicParam
173
174 // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:
175 // - www.foo.com/a/abc-2.html
176 // - www.foo.com/a/abc.html.
177 // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:
178 // - www.foo.com/a/2/abc.html
cjhopman 2015/04/07 00:45:49 The second part of this comment confused me becaus
kuan 2015/04/10 22:41:28 yes, i've now split up the comments since the func
179 // - www.foo.com/a/abc.html
180 // - www.foo.com/abc.html.
181
182 // Handle case where page param is part of the path component (as oppose d to being the
183 // entire path component).
184 if (isPartialPathComponent()) {
185 // The page param path component of both url and pattern must have t he same prefix.
186 if (urlLen < mComponentPos + suffixLen ||
cjhopman 2015/04/07 00:45:49 url.startsWith(mRequiredPrefix)? I guess if you d
kuan 2015/04/10 22:41:27 Done. i'm not sure wha tu mean by "make it clear"
187 !url.regionMatches(0, mUrlStr, 0, mComponentPos)) {
188 return false;
189 }
190
191 // Find the first different character in page param path component j ust before
192 // placeholder or suffix, then check if it's acceptable.
193 int firstDiffPos = mComponentPos;
194 int maxPos = Math.min(mPageParamPos, suffixPos);
195 for (; firstDiffPos < maxPos; firstDiffPos++) {
196 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) br eak;
197 }
198 if (firstDiffPos == suffixPos) { // First different character is th e suffix.
199 if (firstDiffPos + 1 == mPageParamPos &&
cjhopman 2015/04/07 00:45:49 Hm, if I read this correctly, the following return
kuan 2015/04/10 22:41:28 yes it returns false, i verified in original code.
200 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) {
201 return true;
202 }
203 } else if (firstDiffPos == mPageParamPos) { // First different char acter is page param.
204 if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) retur n true;
205 }
206
207 return false;
208 } // page param is part of the (not entire) path component.
209
210 // Handle case where page param is the entire path component.
cjhopman 2015/04/07 00:45:49 This comment looks like it applies to just the nex
kuan 2015/04/10 22:41:28 Done. comment is no longer needed since i've extr
211 int prevComponentPos = mUrlStr.lastIndexOf('/', mComponentPos - 1);
cjhopman 2015/04/07 00:45:48 Should this only be looking at the path? If not, w
kuan 2015/04/10 22:41:28 Done. good catch.
212 if (prevComponentPos != -1) {
213 // The url doesn't have page number param and previous path componen t, like
cjhopman 2015/04/07 00:45:49 Stated like this, it seems the comment applies to
kuan 2015/04/10 22:41:28 Done.
214 // www.foo.com/abc.html.
215 if (prevComponentPos + suffixLen == urlLen) {
216 return url.regionMatches(0, mUrlStr, 0, prevComponentPos);
217 }
218 }
219
220 // If both url and pattern have the same prefix, url must have nothing e lse.
221 if (url.regionMatches(0, mUrlStr, 0, mComponentPos)) {
222 int acceptLen = mComponentPos + suffixLen;
223 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html.
224 if (acceptLen == urlLen) return true;
225 if (acceptLen > urlLen) return false;
226
227 // While we are here, the url must have page number param, so the ur l must have a '/'
228 // at the pattern's path component start position.
229 if (url.charAt(mComponentPos) != '/') return false;
230
231 return isPlainNumber(url.substring(mComponentPos + 1, suffixPos));
232 }
233
234 return false;
235 }
236
237 private PagePattern(String urlStr) throws Exception {
238 mUrl = ParsedUrl.create(urlStr);
239 if (mUrl == null) throw new Exception("Invalid URL: " + urlStr);
240 mPageParamPos = urlStr.indexOf(PageParameterDetector.PAGE_PARAM_PLACEHOL DER);
241 if (mPageParamPos == -1) {
242 throw new Exception("Required page param placeholder is missing in " + urlStr);
243 }
244 mUrlStr = urlStr;
245 mQueryPos = mUrlStr.lastIndexOf('?', mPageParamPos - 1);
246 if (isQueryPattern()) { // Page param is one of the query components.
247 mComponentPos = mUrlStr.lastIndexOf('&', mPageParamPos - 1);
248 // Page param is the first query.
249 if (mComponentPos == -1) mComponentPos = mQueryPos;
250 } else { // Page param is one of the path components.
251 mComponentPos = mUrlStr.lastIndexOf('/', mPageParamPos);
252 }
253 }
254
255 private boolean isQueryPattern() {
256 return mQueryPos != -1;
257 }
258
259 private boolean isPathPattern() {
260 return !isQueryPattern() && mComponentPos != -1;
261 }
262
263 private boolean isDynamic() {
264 return isQueryPattern() && mComponentPos > 0 && mUrlStr.charAt(mPagePara mPos - 1) == '=';
265 }
266
267 private boolean isPartialPathComponent() {
268 return isPathPattern() && mUrlStr.charAt(mPageParamPos - 1) != '/';
269 }
270
271 private void determineParamIndex() {
cjhopman 2015/04/07 00:45:48 We should just call this in the constructor so tha
kuan 2015/04/10 22:41:27 Done.
272 if (mParamIndex != -1 || !isPathPattern()) return;
273 final String[] pathComponents = mUrl.getPathComponents();
274 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) {
275 if (pathComponents[mParamIndex].contains(
276 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) {
277 break;
278 }
279 }
280 }
281
282 /**
283 * Returns true if, except for the path component containing the page param, the other path
284 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g.
285 * doc URL is /thread/12 and pattern is /thread/12/page/[*!].
286 */
287 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) {
288 determineParamIndex();
289 final String[] urlComponents = docUrl.getPathComponents();
290 final String[] patternComponents = mUrl.getPathComponents();
291 boolean passedParamComponent = false;
292 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) {
293 if (i == mParamIndex && !passedParamComponent) {
294 passedParamComponent = true;
295 // Repeat current path component if doc URL has less components (as per comments
296 // just above, doc URL may have less components).
297 if (urlComponents.length < patternComponents.length) i--;
298 continue;
299 }
300 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false;
301 }
302
303 return true;
304 }
305
306 /**
307 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a
308 * false-positive.
309 */
310 private boolean isCalendarPage() {
311 determineParamIndex();
312 if (mParamIndex < 2) return false;
313
314 // Only if param is the entire path component. This handles some cases erroneously
315 // considered false-positives e.g. first page is
316 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html,
317 // and second page is
318 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html,
319 // would be considered false-positives otherwise because of "2014" and " 07".
320 final String[] patternComponents = mUrl.getPathComponents();
321 if (patternComponents[mParamIndex].length() !=
322 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) {
323 return false;
324 }
325
326 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]);
327 if (month > 0 && month <= 12) {
328 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]);
329 if (year > 1970 && year < 3000) return true;
330 }
331
332 return false;
333 }
334
335 private static int getLongestCommonPrefixLength(String str1, String str2) {
336 if (str1.isEmpty() || str2.isEmpty()) return 0;
337
338 int limit = Math.min(str1.length(), str2.length());
339 int i = 0;
340 for (; i < limit; i++) {
341 if (str1.charAt(i) != str2.charAt(i)) break;
342 }
343 return i;
344 }
345
346 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {
347 int commonSuffixLen = 0;
348 for (int i = str1.length() - 1, j = str2.length() - 1;
349 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {
350 if (str1.charAt(i) != str2.charAt(i)) break;
351 }
352 return commonSuffixLen;
353 }
354
355 /**
356 * Returns true if given string can be converted to a number >= 0.
357 */
358 private static boolean isPlainNumber(String str) {
359 return StringUtil.toNumber(str) >= 0;
360 }
361
362 /**
363 * Returns true if given character is one of '-', '_', ';', ','.
364 */
365 public static native boolean isPageParamSeparator(Character c) /*-{
366 return /[-_;,]/.test(c);
367 }-*/;
368
369 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698