Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(318)

Side by Side Diff: java/org/chromium/distiller/PathComponentPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: copyright 2016 -> 2015 Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import com.google.gwt.regexp.shared.MatchResult;
8 import com.google.gwt.regexp.shared.RegExp;
9
10 /**
11 * This class detects the page parameter in the path of a potential pagination URL. If detected,
12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates
13 * and returns a new object. This object can then be accessed via the PageParameterDetector.PagePattern
14 * interface to:
15 * - validate the generated URL page pattern against the document URL
16 * - determine if a URL is a paging URL based on the page pattern.
17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is
18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDetector.java).
19 */
20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte rn {
21 private final ParsedUrl mUrl;
22 private final int mPageNumber;
23 private final int mPlaceholderStart;
24 private final String mUrlStr;
25 // Start position of path component containing placeholder.
26 private int mPlaceholderSegmentStart;
27 // Page param path component in list of path components.
28 private int mParamIndex = -1;
29 private final String mPrefix; // The part of the page pattern before the pla ceholder.
30 private String mSuffix = ""; // The part of the page pattern after the plac eholder.
31
32 /**
33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER.
34 */
35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart , int digitStart,
36 int digitEnd) {
37 try {
38 return new PathComponentPagePattern(url, pathStart, digitStart, digi tEnd);
39 } catch (Exception e) {
cjhopman 2015/04/16 21:58:32 We shouldn't be throwing/catching the base Excepti
kuan 2015/04/20 23:11:13 Done. ditto for QueryParamPagePattern.
40 return null;
41 }
42 }
43
44 @Override
45 public String toString() {
46 return mUrlStr;
47 }
48
49 @Override
50 public int getPageNumber() {
51 return mPageNumber;
52 }
53
54 /**
55 * Returns true if pattern and URL are sufficiently similar and the pattern' s components are not
56 * calendar digits.
57 *
58 * @param docUrl the current document URL
59 */
60 @Override
61 public boolean isValidFor(ParsedUrl docUrl) {
62 final int urlPathComponentsLen = docUrl.getPathComponents().length;
63 final int patternPathComponentsLen = mUrl.getPathComponents().length;
64
65 // If the page param is inside of path components, both the pattern and doc URL must have
cjhopman 2015/04/16 21:58:32 We know that the page param is inside of path comp
kuan 2015/04/20 23:11:13 Done.
66 // the similar path.
67 if (urlPathComponentsLen > patternPathComponentsLen) return false;
68
69 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must
70 // be at least half of the entire component in doc URL, e.g doc URL is
71 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".
72 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {
73 final String urlComponent = docUrl.getPathComponents()[0];
74 final String patternComponent = mUrl.getPathComponents()[0];
75 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);
76 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat ternComponent,
77 commonPrefixLen);
78 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt h();
79 }
80
81 if (!hasSamePathComponentsAs(docUrl)) return false;
82
83 if (isCalendarPage()) return false;
84
85 return true;
86 }
87
88 /**
89 * Returns true if a URL matches this page pattern based on a pipeline of ru les:
90 * - suffix (part of pattern after page param placeholder) must be same, and
91 * - for path page parameter that is part of a path component,
92 * - if the first different character in path component is suffix, it must be a page parameter
93 * separator, followed by the page parameter in the pattern
94 * - else if it's page parameter, it and possible following digits must be a plain number.
95 * - for path page parameter that is the entire path component,
96 * - if URL has no page number param and previous path component, everythi ng else matches, or
97 * - if prefix is the same, URL doesn't have anyhing else
98 * - else url must have '/' at the same position as pattern's page paramet er path component,
99 * followed by a plain number.
100 *
101 * @param url the URL to evalutate
102 */
103 @Override
104 public boolean isPagingUrl(String url) {
105 // Both url and pattern must have the same suffix, if available.
106 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false;
107
108 return isPartialPathComponent() ? isPartialPathComponentPagingUrl(url) :
109 isEntirePathComponentPagingUrl(url);
110 }
111
/**
 * Builds the pattern by replacing the digits of url in [digitStart, digitEnd) with
 * PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then records the positions needed for matching.
 *
 * @param url the URL that contains the candidate page number
 * @param pathStart index in the URL string where the path starts
 * @param digitStart index of the first digit of the candidate page number
 * @param digitEnd index just past the last digit of the candidate page number
 * @throws Exception if the digits form a bad last numeric path component, do not parse to a
 *         non-negative number, or the resulting pattern is not a valid URL
 */
private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStart, int digitEnd)
        throws Exception {
    final String original = url.toString();
    if (isLastNumericPathComponentBad(original, pathStart, digitStart, digitEnd)) {
        throw new Exception("Bad last numeric path component");
    }

    final String digits = original.substring(digitStart, digitEnd);
    final int pageNumber = StringUtil.toNumber(digits);
    if (pageNumber < 0) {
        throw new Exception("Value in path component is an invalid number: " + digits);
    }

    final String pattern = original.substring(0, digitStart)
            + PageParameterDetector.PAGE_PARAM_PLACEHOLDER + original.substring(digitEnd);
    final ParsedUrl parsedPattern = ParsedUrl.create(pattern);
    if (parsedPattern == null) throw new Exception("Invalid URL: " + pattern);

    mUrl = parsedPattern;
    mUrlStr = pattern;
    mPageNumber = pageNumber;
    mPlaceholderStart = digitStart;
    // The '/' at or before the placeholder opens the path component that contains it.
    mPlaceholderSegmentStart = pattern.lastIndexOf('/', digitStart);
    determineParamIndex();
    mPrefix = pattern.substring(0, mPlaceholderSegmentStart);

    // Whatever follows the placeholder is the suffix; mSuffix keeps its empty default if the
    // placeholder ends the pattern.
    final int suffixStart = digitStart + PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
    if (suffixStart != pattern.length()) mSuffix = pattern.substring(suffixStart);
}
141
142 private boolean isPartialPathComponent() {
143 return mUrlStr.charAt(mPlaceholderStart - 1) != '/';
cjhopman 2015/04/16 21:58:32 this is strange, shouldn't foo.com/article/2page/
kuan 2015/04/20 23:11:13 good point, so i did more testing w/ the original
144 }
145
146 private void determineParamIndex() {
147 final String[] pathComponents = mUrl.getPathComponents();
148 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) {
149 if (pathComponents[mParamIndex].contains(
150 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) {
151 break;
152 }
153 }
154 }
155
156 /**
157 * Returns true if, except for the path component containing the page param, the other path
158 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g.:
159 * - doc URL is /thread/12, pattern is /thread/12/page/[*!]
160 * returns true because "thread" and "12" in doc URL match those in patter n
161 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo
162 * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param
163 path component comes after.
164 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo
165 * returns true because "foo" in doc URL would match "foo" in pattern whos e page param path
166 * component is skipped when matching.
167 */
168 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) {
169 determineParamIndex();
cjhopman 2015/04/16 21:58:32 don't need to call this, it's done in the construc
kuan 2015/04/20 23:11:13 Done. i forgot :(
170 final String[] urlComponents = docUrl.getPathComponents();
171 final String[] patternComponents = mUrl.getPathComponents();
172 boolean passedParamComponent = false;
173 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) {
174 if (i == mParamIndex && !passedParamComponent) {
175 passedParamComponent = true;
176 // Repeat current path component if doc URL has less components (as per comments
177 // just above, doc URL may have less components).
178 if (urlComponents.length < patternComponents.length) i--;
179 continue;
180 }
181 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false;
182 }
183
184 return true;
185 }
186
187 /**
188 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a
189 * false-positive.
190 */
191 private boolean isCalendarPage() {
192 determineParamIndex();
193 if (mParamIndex < 2) return false;
194
195 // Only if param is the entire path component. This handles some cases erroneously
196 // considered false-positives e.g. first page is
197 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html,
198 // and second page is
199 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html,
200 // would be considered false-positives otherwise because of "2014" and " 07".
201 final String[] patternComponents = mUrl.getPathComponents();
202 if (patternComponents[mParamIndex].length() !=
203 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) {
204 return false;
205 }
206
207 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]);
208 if (month > 0 && month <= 12) {
209 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]);
210 if (year > 1970 && year < 3000) return true;
211 }
212
213 return false;
214 }
215
216 private static int getLongestCommonPrefixLength(String str1, String str2) {
217 if (str1.isEmpty() || str2.isEmpty()) return 0;
218
219 int limit = Math.min(str1.length(), str2.length());
220 int i = 0;
221 for (; i < limit; i++) {
222 if (str1.charAt(i) != str2.charAt(i)) break;
223 }
224 return i;
225 }
226
227 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {
228 int commonSuffixLen = 0;
229 for (int i = str1.length() - 1, j = str2.length() - 1;
230 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {
231 if (str1.charAt(i) != str2.charAt(i)) break;
232 }
233 return commonSuffixLen;
234 }
235
236 /**
237 * Returns true if url is a paging URL based on the page pattern where the p age param is part
238 * of a path component.
239 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:
240 * - www.foo.com/a/abc-2.html
241 * - www.foo.com/a/abc.html.
242 */
243 private boolean isPartialPathComponentPagingUrl(String url) {
244 final int urlLen = url.length();
245 final int suffixStart = urlLen - mSuffix.length();
246
247 // The page param path component of both url and pattern must have the s ame prefix.
248 if (!url.startsWith(mPrefix)) return false;
249
250 // Find the first different character in page param path component just before
251 // placeholder or suffix, then check if it's acceptable.
252 int firstDiffPos = mPlaceholderSegmentStart;
253 int maxPos = Math.min(mPlaceholderStart, suffixStart);
254 for (; firstDiffPos < maxPos; firstDiffPos++) {
255 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break;
256 }
257 if (firstDiffPos == suffixStart) { // First different character is the suffix.
258 if (firstDiffPos + 1 == mPlaceholderStart &&
259 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) {
260 return true;
261 }
262 } else if (firstDiffPos == mPlaceholderStart) { // First different char acter is page param.
263 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) {
264 return true;
265 }
266 }
267
268 return false;
269 }
270
271 /**
272 * Returns true if url is a paging URL based on the page pattern where the p age param is the
273 * entire path component.
274 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:
275 * - www.foo.com/a/2/abc.html
276 * - www.foo.com/a/abc.html
277 * - www.foo.com/abc.html.
278 */
279 private boolean isEntirePathComponentPagingUrl(String url) {
280 final int urlLen = url.length();
281 final int suffixLen = mSuffix.length();
282 final int suffixStart = url.length() - suffixLen;
283
284 int prevComponentPos = mUrl.getPath().lastIndexOf('/',
285 // We're only looking in the path, so the reverse search should start at the index
286 // excluding the url's origin.
287 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length());
288 if (prevComponentPos != -1) {
289 // Now, add back the url's origin to the index of previous path comp onent.
290 prevComponentPos += mUrl.getOrigin().length();
291 if (prevComponentPos + suffixLen == urlLen) {
292 // The url doesn't have page number param and previous path comp onent, like
293 // www.foo.com/abc.html.
294 return url.regionMatches(0, mUrlStr, 0, prevComponentPos);
295 }
296 }
297
298 // If both url and pattern have the same prefix, url must have nothing e lse.
299 if (url.startsWith(mPrefix)) {
300 int acceptLen = mPlaceholderSegmentStart + suffixLen;
301 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html.
302 if (acceptLen == urlLen) return true;
303 if (acceptLen > urlLen) return false;
304
305 // While we are here, the url must have page number param, so the ur l must have a '/'
306 // at the pattern's path component start position.
307 if (url.charAt(mPlaceholderSegmentStart) != '/') return false;
308
309 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde rSegmentStart + 1,
310 suffixStart));
311 }
312
313 return false;
314 }
315
316 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).
317 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.
318
319 /**
320 * Returns true if:
321 * - the digitStart to digitEnd of urlStr is the last path component, and
322 * - the entire path component is numeric, and
323 * - the previous path component is a bad page param name.
324 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad
325 * page param.
326 */
327 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,
328 int digitEnd) {
329 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
330 pathStart < digitStart - 1) { // Not the first path component.
331 String postMatch = urlStr.substring(digitEnd).toLowerCase();
332 // Checks that this is the last path component, and trailing charact ers, if available,
333 // are (s)htm(l) extensions.
334 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");
335 if (sExtRegExp.test(postMatch)) {
336 // Entire component is numeric, get previous path component.
337 if (sLastPathComponentRegExp == null) {
338 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;
339 }
340 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(
341 urlStr.substring(pathStart + 1, digitStart));
342 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&
343 PageParameterDetector.isPageParamNameBad(prevPathCompone nt.getGroup(1))) {
344 return true;
345 }
346 } // last numeric path component
347 }
348
349 return false;
350 }
351
352 /**
353 * Returns true if given character is one of '-', '_', ';', ','.
354 */
355 private static native boolean isPageParamSeparator(Character c) /*-{
356 return /[-_;,]/.test(c);
357 }-*/;
358
359 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698