| Index: java/org/chromium/distiller/QueryParamPagePattern.java
|
| diff --git a/java/org/chromium/distiller/QueryParamPagePattern.java b/java/org/chromium/distiller/QueryParamPagePattern.java
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..6c09c38b5fe807ea5f16f726a5c1102c7dd3385a
|
| --- /dev/null
|
| +++ b/java/org/chromium/distiller/QueryParamPagePattern.java
|
| @@ -0,0 +1,167 @@
|
| +// Copyright 2015 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +package org.chromium.distiller;
|
| +
|
| +import com.google.gwt.regexp.shared.RegExp;
|
| +
|
| +/**
|
| + * This class detects the page parameter in the query of a potential pagination URL. If detected,
|
| + * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates
|
| + * and returns a new object. This object can then be called via PageParameterDetector.PagePattern
|
| + * interface to:
|
| + * - validate the generated URL page pattern against the document URL
|
| + * - determine if a URL is a paging URL based on the page pattern.
|
| + * Example: if the original url is "http://www.foo.com/a/b/?page=2&query=a", the page pattern is
|
| + * "http://www.foo.com/a/b?page=[*!]&query=a". (See comments at top of PageParameterDetector.java).
|
| + */
|
| +public class QueryParamPagePattern implements PageParameterDetector.PagePattern {
|
| +    private final ParsedUrl mUrl;
|
| +    private final int mPageNumber;
|
| +    private final int mPlaceholderStart;
|
| +    private final String mUrlStr;
|
| +    private final int mQueryStart;
|
| +    // Index in mUrlStr of the '&' (or '?') that starts the query param holding the placeholder.
|
| +    private int mPlaceholderSegmentStart;
|
| +    private final String mPrefix; // Pattern text before the query param holding the placeholder.
|
| +    private String mSuffix = ""; // Pattern after the placeholder, minus the leading separator.
|
| +    // One more than mSuffix.length() when non-zero: also counts the char after the placeholder.
|
| +    private final int mSuffixLen;
|
| +
|
| +    /**
|
| +     * Factory for a query-based page pattern: builds the pattern from url by replacing the value
|
| +     * of queryName with the page param placeholder.
|
| +     *
|
| +     * @return the new pattern, or null if the constructor rejects the arguments.
|
| +     */
|
| +    static PageParameterDetector.PagePattern create(ParsedUrl url, String queryName,
|
| +            String queryValue) {
|
| +        PageParameterDetector.PagePattern result;
|
| +        try {
|
| +            result = new QueryParamPagePattern(url, queryName, queryValue);
|
| +        } catch (IllegalArgumentException ignored) {
|
| +            // The constructor reports an unusable url/query pair by throwing; callers get null.
|
| +            result = null;
|
| +        }
|
| +        return result;
|
| +    }
|
| +
|
| +    @Override
|
| +    public String toString() {
|
| +        return mUrlStr; // The pattern URL, i.e. with the page value replaced by the placeholder.
|
| +    }
|
| +
|
| +    @Override
|
| +    public int getPageNumber() {
|
| +        return mPageNumber; // Numeric form of the query value that the placeholder replaced.
|
| +    }
|
| +
|
| +    /**
|
| +     * Checks this page pattern against the document URL by comparing their trimmed paths,
|
| +     * ignoring case.
|
| +     *
|
| +     * @param docUrl the current document URL
|
| +     * @return true if both trimmed paths are equal, ignoring case.
|
| +     */
|
| +    @Override
|
| +    public boolean isValidFor(ParsedUrl docUrl) {
|
| +        final String docPath = docUrl.getTrimmedPath();
|
| +        final String patternPath = mUrl.getTrimmedPath();
|
| +        return docPath.equalsIgnoreCase(patternPath);
|
| +    }
|
| +
|
| + private static RegExp sSlashOrHtmExtRegExp = null; // Match either '/' or ".htm(l)".
|
| +
|
| +    /**
|
| +     * Returns true if a URL matches this page pattern based on a pipeline of rules:
|
| +     * - suffix (part of pattern after page param placeholder) must be same, and
|
| +     * - scheme, host, and path must be same, and
|
| +     * - query params, except that for page number, must be same in order and value, and
|
| +     * - query value must be a plain number.
|
| +     *
|
| +     * @param url the URL to evaluate
|
| +     * @return true if url is accepted as a paging URL for this pattern.
|
| +     */
|
| +    @Override
|
| +    public boolean isPagingUrl(String url) {
|
| +        // Both url and pattern must have the same suffix, if available.
|
| +        if (mSuffixLen != 0 && !url.endsWith(mSuffix)) return false;
|
| +
|
| +        // mSuffixLen also counts the separator in front of the suffix, so this is the index of
|
| +        // that separator in url.
|
| +        final int suffixStart = url.length() - mSuffixLen;
|
| +
|
| +        // The url matches the pattern only when:
|
| +        // 1. has same prefix (scheme, host, path)
|
| +        // 2. has same query params with same value (except page number query) in the same
|
| +        //    order.
|
| +        // Examples:
|
| +        // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3
|
| +        // Returns true for, among others:
|
| +        // - http://foo.com/a/b?queryA=v1&queryC=v3
|
| +        // - http://foo.com/a/b?queryA=v1&queryB=4&queryC=v3
|
| +        //
|
| +        // If page pattern is http://foo.com/a/b?page=[*!]&query=a
|
| +        // Returns true for, among others:
|
| +        // - http://foo.com/a/b?query=a
|
| +        // - http://foo.com/a/b?page=2&query=a
|
| +        //
|
| +        // If page pattern is http://foo.com/a/b?page=[*!]
|
| +        // Returns true for, among others:
|
| +        // - http://foo.com/a/b/
|
| +        // - http://foo.com/a/b.html
|
| +        // - http://foo.com/a/b.htm
|
| +        // - http://foo.com/a/b?page=2
|
| +
|
| +        // Both url and pattern must have the same prefix.
|
| +        if (!url.startsWith(mPrefix)) return false;
|
| +
|
| +        // If the url doesn't have page number query, it is fine.
|
| +        if (mPlaceholderSegmentStart == suffixStart) return true;
|
| +
|
| +        // In a url that is too short the matched prefix and suffix overlap; nothing is left in
|
| +        // between to compare, and substring() below would be given end < start.
|
| +        if (suffixStart < mPlaceholderSegmentStart) return false;
|
| +
|
| +        // If the part of url between prefix and suffix starts with "/" or ends with ".htm" or
|
| +        // ".html", it is fine.
|
| +        String diffPart = url.substring(mPlaceholderSegmentStart, suffixStart).toLowerCase();
|
| +        if (sSlashOrHtmExtRegExp == null) {
|
| +            // '.' is escaped so that only a literal dot introduces the extension.
|
| +            sSlashOrHtmExtRegExp = RegExp.compile("^\\/|(\\.html?)$", "i");
|
| +        }
|
| +        if (sSlashOrHtmExtRegExp.test(diffPart)) return true;
|
| +
|
| +        // Both url and pattern must have the same query name.
|
| +        if (!url.regionMatches(mPlaceholderSegmentStart, mUrlStr, mPlaceholderSegmentStart,
|
| +                mPlaceholderStart - mPlaceholderSegmentStart)) {
|
| +            return false;
|
| +        }
|
| +
|
| +        // The suffix begins before the page value would, so there is no value to check.
|
| +        if (suffixStart < mPlaceholderStart) return false;
|
| +
|
| +        return PageParameterDetector.isPlainNumber(url.substring(mPlaceholderStart, suffixStart));
|
| +    }
|
| +
|
| +    /**
|
| +     * Builds the page pattern by replacing the value of queryName in url with
|
| +     * PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then records where the placeholder, its query
|
| +     * param, the prefix and the suffix sit in the resulting pattern string.
|
| +     *
|
| +     * @param url the URL holding the potential page param
|
| +     * @param queryName name of the query param; must be non-empty and not a bad page param name
|
| +     * @param queryValue value of the query param; must be a non-empty string of digits
|
| +     * @throws IllegalArgumentException if an argument is rejected, the pattern is not a valid
|
| +     *         URL, or the placeholder cannot be located inside a query of the pattern.
|
| +     */
|
| +    private QueryParamPagePattern(ParsedUrl url, String queryName, String queryValue)
|
| +            throws IllegalArgumentException {
|
| +        if (queryName.isEmpty()) throw new IllegalArgumentException("Empty query name");
|
| +        if (queryValue.isEmpty()) throw new IllegalArgumentException("Empty query value");
|
| +        if (!StringUtil.isStringAllDigits(queryValue)) {
|
| +            throw new IllegalArgumentException("Query value has non-digits: " + queryValue);
|
| +        }
|
| +        if (PageParameterDetector.isPageParamNameBad(queryName)) {
|
| +            throw new IllegalArgumentException("Query name is bad page param name: " + queryName);
|
| +        }
|
| +
|
| +        int value = StringUtil.toNumber(queryValue);
|
| +        if (value < 0) {
|
| +            throw new IllegalArgumentException("Query value is an invalid number: " + queryValue);
|
| +        }
|
| +
|
| +        String pattern = url.replaceQueryValue(queryName, queryValue,
|
| +                PageParameterDetector.PAGE_PARAM_PLACEHOLDER);
|
| +        mUrl = ParsedUrl.create(pattern);
|
| +        if (mUrl == null) throw new IllegalArgumentException("Invalid URL: " + pattern);
|
| +        mUrlStr = pattern;
|
| +        mPageNumber = value;
|
| +        mPlaceholderStart = pattern.indexOf(PageParameterDetector.PAGE_PARAM_PLACEHOLDER);
|
| +        if (mPlaceholderStart == -1) {
|
| +            // Nothing was replaced; without this check the substring() calls below would fail
|
| +            // with an exception that create() does not handle.
|
| +            throw new IllegalArgumentException("No page param placeholder in: " + pattern);
|
| +        }
|
| +        mQueryStart = mUrlStr.lastIndexOf('?', mPlaceholderStart - 1);
|
| +        // The page param's segment starts at the nearest '&' before the placeholder, or at '?'
|
| +        // when the page param is the first query. Taking the max ignores an '&' located before
|
| +        // the '?', e.g. one in the path.
|
| +        mPlaceholderSegmentStart =
|
| +                Math.max(mQueryStart, mUrlStr.lastIndexOf('&', mPlaceholderStart - 1));
|
| +        if (mPlaceholderSegmentStart == -1) {
|
| +            throw new IllegalArgumentException("Page param is not in a query: " + pattern);
|
| +        }
|
| +        mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);
|
| +        // Determine suffix, if available.
|
| +        final int urlLen = mUrlStr.length();
|
| +        mSuffixLen = urlLen - mPlaceholderStart - PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
|
| +        if (mSuffixLen != 0) {
|
| +            mSuffix = mUrlStr.substring(urlLen - mSuffixLen + 1); // +1 to exclude '&' or '?'.
|
| +        }
|
| +    }
|
| +
|
| +}
|
|
|