Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(400)

Unified Diff: java/org/chromium/distiller/QueryParamPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/QueryParamPagePattern.java
diff --git a/java/org/chromium/distiller/QueryParamPagePattern.java b/java/org/chromium/distiller/QueryParamPagePattern.java
new file mode 100644
index 0000000000000000000000000000000000000000..6c09c38b5fe807ea5f16f726a5c1102c7dd3385a
--- /dev/null
+++ b/java/org/chromium/distiller/QueryParamPagePattern.java
@@ -0,0 +1,167 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller;
+
+import com.google.gwt.regexp.shared.RegExp;
+
+/**
+ * This class detects the page parameter in the query of a potential pagination URL. If detected,
+ * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates
+ * and returns a new object. This object can then be called via PageParameterDetector.PagePattern
+ * interface to:
+ * - validate the generated URL page pattern against the document URL
+ * - determine if a URL is a paging URL based on the page pattern.
+ * Example: if the original url is "http://www.foo.com/a/b/?page=2&query=a", the page pattern is
+ * "http://www.foo.com/a/b?page=[*!]&query=a". (See comments at top of PageParameterDetector.java).
+ */
+public class QueryParamPagePattern implements PageParameterDetector.PagePattern {
+    private final ParsedUrl mUrl;
+    private final int mPageNumber;
+    private final int mPlaceholderStart;
+    private final String mUrlStr;
+    private final int mQueryStart;
+    // Position of the '?' or '&' that starts the query param containing the placeholder.
+    private final int mPlaceholderSegmentStart;
+    private final String mPrefix;  // The part of the page pattern before that '?' or '&'.
+    private final String mSuffix;  // The part after the placeholder, minus leading separator.
+    // Counts that separator too, so this is mSuffix.length() + 1 unless there is no suffix.
+    private final int mSuffixLen;
+
+    /**
+     * Returns a new QueryParamPagePattern, or null if the constructor rejects the arguments.
+     */
+    static PageParameterDetector.PagePattern create(ParsedUrl url, String queryName,
+            String queryValue) {
+        try {
+            return new QueryParamPagePattern(url, queryName, queryValue);
+        } catch (IllegalArgumentException e) {
+            return null;
+        }
+    }
+
+    @Override
+    public String toString() {
+        return mUrlStr;
+    }
+
+    @Override
+    public int getPageNumber() {
+        return mPageNumber;
+    }
+
+    /**
+     * Returns true if page pattern and docUrl have the same trimmed path, ignoring case.
+     *
+     * @param docUrl the current document URL
+     */
+    @Override
+    public boolean isValidFor(ParsedUrl docUrl) {
+        return docUrl.getTrimmedPath().equalsIgnoreCase(mUrl.getTrimmedPath());
+    }
+
+    private static RegExp sSlashOrHtmExtRegExp = null;  // Matches exactly "/" or ".htm(l)".
+
+    /**
+     * Returns true if a URL matches this page pattern based on a pipeline of rules:
+     * - suffix (part of pattern after page param placeholder) must be same, and
+     * - prefix (part of pattern before the page param) must be same, and
+     * - page param is absent, or is replaced by just "/", ".htm" or ".html", or has the same
+     *   query name as the pattern and a plain number as value.
+     *
+     * @param url the URL to evaluate
+     */
+    @Override
+    public boolean isPagingUrl(String url) {
+        // Both url and pattern must have the same suffix, if available.
+        if (mSuffixLen != 0 && !url.endsWith(mSuffix)) return false;
+
+        // mSuffixLen includes the separator in front of mSuffix, so this is the position in url
+        // where the page param segment has to end.
+        final int suffixStart = url.length() - mSuffixLen;
+
+        // Examples of matching urls (provided isPlainNumber() accepts the page number):
+        // - for page pattern http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3
+        //     http://foo.com/a/b?queryA=v1&queryC=v3
+        //     http://foo.com/a/b?queryA=v1&queryB=4&queryC=v3
+        // - for page pattern http://foo.com/a/b?page=[*!]&query=a
+        //     http://foo.com/a/b?query=a
+        //     http://foo.com/a/b?page=2&query=a
+        // - for page pattern http://foo.com/a/b?page=[*!]
+        //     http://foo.com/a/b/
+        //     http://foo.com/a/b.html
+        //     http://foo.com/a/b.htm
+        //     http://foo.com/a/b?page=2
+
+        // Both url and pattern must have the same prefix.
+        if (!url.startsWith(mPrefix)) return false;
+
+        // Prefix and suffix overlap in url: nothing can sit between them, and substring() below
+        // would be handed a start index greater than its end index.
+        if (suffixStart < mPlaceholderSegmentStart) return false;
+
+        // If the url doesn't have page number query, it is fine.
+        if (mPlaceholderSegmentStart == suffixStart) return true;
+
+        // If the only difference in the page param between url and pattern is "/", ".htm" or
+        // ".html", it is fine.  The alternatives are grouped and the dot is escaped so that both
+        // anchors apply to each of them and diffPart has to match as a whole.
+        String diffPart = url.substring(mPlaceholderSegmentStart, suffixStart).toLowerCase();
+        if (sSlashOrHtmExtRegExp == null) {
+            sSlashOrHtmExtRegExp = RegExp.compile("^(\\/|\\.html?)$", "i");
+        }
+        if (sSlashOrHtmExtRegExp.test(diffPart)) return true;
+
+        // Both url and pattern must have the same query name.
+        if (!url.regionMatches(mPlaceholderSegmentStart, mUrlStr, mPlaceholderSegmentStart,
+                mPlaceholderStart - mPlaceholderSegmentStart)) {
+            return false;
+        }
+
+        // The suffix may overlap the query name in url, leaving no room for a page number.
+        if (suffixStart < mPlaceholderStart) return false;
+
+        return PageParameterDetector.isPlainNumber(url.substring(mPlaceholderStart, suffixStart));
+    }
+
+    // Replaces queryValue of queryName in url with the placeholder to form the page pattern.
+    // Throws IllegalArgumentException if that does not yield a usable query page pattern.
+    private QueryParamPagePattern(ParsedUrl url, String queryName, String queryValue)
+            throws IllegalArgumentException {
+        if (queryName.isEmpty()) throw new IllegalArgumentException("Empty query name");
+        if (queryValue.isEmpty()) throw new IllegalArgumentException("Empty query value");
+        if (!StringUtil.isStringAllDigits(queryValue)) {
+            throw new IllegalArgumentException("Query value has non-digits: " + queryValue);
+        }
+        if (PageParameterDetector.isPageParamNameBad(queryName)) {
+            throw new IllegalArgumentException("Query name is bad page param name: " + queryName);
+        }
+
+        int value = StringUtil.toNumber(queryValue);
+        if (value < 0) {
+            throw new IllegalArgumentException("Query value is an invalid number: " + queryValue);
+        }
+
+        String pattern = url.replaceQueryValue(queryName, queryValue,
+                PageParameterDetector.PAGE_PARAM_PLACEHOLDER);
+        mUrl = ParsedUrl.create(pattern);
+        if (mUrl == null) throw new IllegalArgumentException("Invalid URL: " + pattern);
+        mUrlStr = pattern;
+        mPageNumber = value;
+        mPlaceholderStart = pattern.indexOf(PageParameterDetector.PAGE_PARAM_PLACEHOLDER);
+        mQueryStart = mUrlStr.lastIndexOf('?', mPlaceholderStart - 1);
+        // A pattern without the placeholder in its query would make substring() below throw.
+        if (mPlaceholderStart < 0 || mQueryStart < 0) {
+            throw new IllegalArgumentException("No page param in query: " + pattern);
+        }
+        // An '&' found before the '?' is part of the path; page param is then the first query.
+        final int ampersand = mUrlStr.lastIndexOf('&', mPlaceholderStart - 1);
+        mPlaceholderSegmentStart = Math.max(mQueryStart, ampersand);
+        mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);
+        // Determine suffix, if available; +1 to exclude the separator after the placeholder.
+        final int urlLen = mUrlStr.length();
+        mSuffixLen = urlLen - mPlaceholderStart - PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
+        mSuffix = (mSuffixLen != 0) ? mUrlStr.substring(urlLen - mSuffixLen + 1) : "";
+    }
+}
« no previous file with comments | « java/org/chromium/distiller/PathComponentPagePattern.java ('k') | javatests/org/chromium/distiller/PageParamInfoTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698