Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(202)

Side by Side Diff: java/org/chromium/distiller/QueryParamPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import com.google.gwt.regexp.shared.RegExp;
8
9 /**
10 * This class detects the page parameter in the query of a potential pagination URL. If detected,
11 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEH OLDER, then creates
12 * and returns a new object. This object can then be called via PageParameterDe tector.PagePattern
13 * interface to:
14 * - validate the generated URL page pattern against the document URL
15 * - determine if a URL is a paging URL based on the page pattern.
16 * Example: if the original url is "http://www.foo.com/a/b/?page=2&query=a", the page pattern is
17 * "http://www.foo.com/a/b?page=[*!]&query=a". (See comments at top of PageParam eterDetector.java).
18 */
19 public class QueryParamPagePattern implements PageParameterDetector.PagePattern {
20 private final ParsedUrl mUrl;
21 private final int mPageNumber;
22 private final int mPlaceholderStart;
23 private final String mUrlStr;
24 private final int mQueryStart;
25 // Start position of query param containing placeholder.
26 private int mPlaceholderSegmentStart;
27 private final String mPrefix; // The part of the page pattern before the pl aceholder.
28 private String mSuffix = ""; // The part of the page pattern after the plac eholder.
29 // This is not mSuffix.length(), see their initializations in constructor.
30 private final int mSuffixLen;
31
32 /**
33 * Returns a new QueryParamPagePattern if url is valid and page param is in the query.
34 */
35 static PageParameterDetector.PagePattern create(ParsedUrl url, String queryN ame,
36 String queryValue) {
37 try {
38 return new QueryParamPagePattern(url, queryName, queryValue);
39 } catch (IllegalArgumentException e) {
40 return null;
41 }
42 }
43
44 @Override
45 public String toString() {
46 return mUrlStr;
47 }
48
49 @Override
50 public int getPageNumber() {
51 return mPageNumber;
52 }
53
54 /**
55 * Returns true if page pattern and URL have the same path components.
56 *
57 * @param docUrl the current document URL
58 */
59 @Override
60 public boolean isValidFor(ParsedUrl docUrl) {
61 return docUrl.getTrimmedPath().equalsIgnoreCase(mUrl.getTrimmedPath());
62 }
63
64 private static RegExp sSlashOrHtmExtRegExp = null; // Match either '/' or " .htm(l)".
65
66 /**
67 * Returns true if a URL matches this page pattern based on a pipeline of ru les:
68 * - suffix (part of pattern after page param placeholder) must be same, and
69 * - scheme, host, and path must be same, and
70 * - query params, except that for page number, must be same in order and va lue, and
71 * - query value must be a plain number.
72 *
73 * @param url the URL to evalutate
74 */
75 @Override
76 public boolean isPagingUrl(String url) {
77 // Both url and pattern must have the same suffix, if available.
78 if (mSuffixLen != 0 && !url.endsWith(mSuffix)) return false;
79
80 final int suffixStart = url.length() - mSuffixLen;
81
82 // The url matches the pattern only when:
83 // 1. has same prefix (scheme, host, path)
84 // 2. has same query params with same value (except page number query) in the same
85 // order.
86 // Examples:
87 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3
88 // Returns true for:
89 // - http://foo.com/a/b/?queryA=v1&queryC=v3
90 // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3
91 // Otherwise, returns false.
92 //
93 // If page pattern is http://foo.com/a/b?page=[*!]&query=a
94 // Returns true for:
95 // - http://foo.com/a/b?query=a
96 // - http://foo.com/a/b?page=2&query=a
97 // Otherwise, returns false.
98 //
99 // If page pattern is http://foo.com/a/b?page=[*!]
100 // Returns true for:
101 // - http://foo.com/a/b/
102 // - http://foo.com/a/b.html
103 // - http://foo.com/a/b.htm
104 // - http://foo.com/a/b?page=2
105 // Otherwise, returns false.
106
107 // Both url and pattern must have the same prefix.
108 if (!url.startsWith(mPrefix)) return false;
109
110 // If the url doesn't have page number query, it is fine.
111 if (mPlaceholderSegmentStart == suffixStart) return true;
112
113 // If the only difference in the page param between url and pattern is " /", ".htm" or
114 // ".html", it is fine.
115 String diffPart = url.substring(mPlaceholderSegmentStart, suffixStart).t oLowerCase();
116 if (sSlashOrHtmExtRegExp == null) {
117 sSlashOrHtmExtRegExp = RegExp.compile("^\\/|(.html?)$", "i");
118 }
119 if (sSlashOrHtmExtRegExp.test(diffPart)) return true;
120
121 // Both url and pattern must have the same query name.
122 if (!url.regionMatches(mPlaceholderSegmentStart, mUrlStr, mPlaceholderSe gmentStart,
123 mPlaceholderStart - mPlaceholderSegmentStart)) {
124 return false;
125 }
126
127 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholderSta rt, suffixStart));
128 }
129
130 private QueryParamPagePattern(ParsedUrl url, String queryName, String queryV alue)
131 throws IllegalArgumentException {
132 if (queryName.isEmpty()) throw new IllegalArgumentException("Empty query name");
133 if (queryValue.isEmpty()) throw new IllegalArgumentException("Empty quer y value");
134 if (!StringUtil.isStringAllDigits(queryValue)) {
135 throw new IllegalArgumentException("Query value has non-digits: " + queryValue);
136 }
137 if (PageParameterDetector.isPageParamNameBad(queryName)) {
138 throw new IllegalArgumentException("Query name is bad page param nam e: " + queryName);
139 }
140
141 int value = StringUtil.toNumber(queryValue);
142 if (value < 0) {
143 throw new IllegalArgumentException("Query value is an invalid number : " + queryValue);
144 }
145
146 String pattern = url.replaceQueryValue(queryName, queryValue,
147 PageParameterDetector.PAGE_PARAM_PLACEHOLDER);
148 mUrl = ParsedUrl.create(pattern);
149 if (mUrl == null) throw new IllegalArgumentException("Invalid URL: " + p attern);
150 mUrlStr = pattern;
151 mPageNumber = value;
152 mPlaceholderStart = pattern.indexOf(PageParameterDetector.PAGE_PARAM_PLA CEHOLDER);
153 mQueryStart = mUrlStr.lastIndexOf('?', mPlaceholderStart - 1);
154 mPlaceholderSegmentStart = mUrlStr.lastIndexOf('&', mPlaceholderStart - 1);
155 if (mPlaceholderSegmentStart == -1) { // Page param is the first query.
156 mPlaceholderSegmentStart = mQueryStart;
157 }
158 mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);
159 // Determine suffix, if available.
160 final int urlLen = mUrlStr.length();
161 mSuffixLen = urlLen - mPlaceholderStart - PageParameterDetector.PAGE_PAR AM_PLACEHOLDER_LEN;
162 if (mSuffixLen != 0) {
163 mSuffix = mUrlStr.substring(urlLen - mSuffixLen + 1); // +1 to exclu de '&' or '?'.
164 }
165 }
166
167 }
OLDNEW
« no previous file with comments | « java/org/chromium/distiller/PathComponentPagePattern.java ('k') | javatests/org/chromium/distiller/PageParamInfoTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698