Chromium Code Reviews| Index: java/org/chromium/distiller/PagePattern.java |
| diff --git a/java/org/chromium/distiller/PagePattern.java b/java/org/chromium/distiller/PagePattern.java |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..44df63fd5c786aa622b3509a1f55940ad5e857b9 |
| --- /dev/null |
| +++ b/java/org/chromium/distiller/PagePattern.java |
| @@ -0,0 +1,369 @@ |
| +// Copyright 2016 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +package org.chromium.distiller; |
| + |
| +import com.google.gwt.regexp.shared.RegExp; |
| + |
| +/** |
| + * This class handles the URL page pattern generated from a potential pagination URL, i.e. a URL |
| + * containing the PageParameterDetector.PAGE_PARAM_PLACEHOLDER (see comments at top of |
| + * PageParameterDetector.java). |
| + * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is |
| + * "http://www.foo.com/a/b-[*!].html". |
| + * This class determines the type of pattern (inside query or path), validates pattern against |
| + * document URL, and determines if it's a paging URL based on the document URL. |
| + */ |
| +public class PagePattern { |
|
cjhopman
2015/04/07 00:45:49
After commenting a bit below, I wonder if it would
kuan
2015/04/10 22:41:27
Done. i did two - QueryParamPagePattern and PathCo
|
| + private final ParsedUrl mUrl; |
| + private final int mPageParamPos; |
|
cjhopman
2015/04/07 00:45:49
Some of this code below was hard for me to follow.
kuan
2015/04/10 22:41:27
Done.
|
| + private final String mUrlStr; |
| + private final int mQueryPos; |
| + private int mComponentPos; // Start position of either page param query or path component. |
| + private int mParamIndex = -1; // Page param path component position in list of path components. |
| + |
| + /** |
| + * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHOLDER. |
| + */ |
| + static PagePattern create(String url) { |
|
cjhopman
2015/04/07 00:45:48
How about having a create method for page pattern
kuan
2015/04/10 22:41:27
Done.
|
| + try { |
| + return new PagePattern(url); |
| + } catch (Exception e) { |
| + return null; |
| + } |
| + } |
| + |
| + /** |
| + * Validates this page pattern according to the current document URL through a pipeline of |
| + * rules: |
| + * - for query page parameter, pattern and URL must have same path components. |
| + * - for path page parameter, |
| + * - pattern and URL must have same number of path components. |
|
cjhopman
2015/04/07 00:45:49
Personally, I'd have these function-level comments
kuan
2015/04/10 22:41:28
Done.
|
| + * - if only 1 path component, both must have long-enough common prefix and suffix. |
| + * - else all pattern's components, except for page parameter, must be same as url's. |
| + * - lastly, pattern's components cannot be calendar digits. |
| + * |
| + * Returns true if page pattern is valid. |
| + * |
| + * @param docUrl the current document URL |
| + * @param pagePattern the page pattern to validate |
| + */ |
| + boolean isValidFor(ParsedUrl docUrl) { |
|
cjhopman
2015/04/07 00:45:48
Would it make sense to check validity when the pag
kuan
2015/04/10 22:41:27
i don't believe so. multiple pagination URLs coul
|
| + // If page parameter is a query, page pattern and doc URL must have the same path. |
| + if (isQueryPattern()) { |
| + return docUrl.getTrimmedPath().equalsIgnoreCase(mUrl.getTrimmedPath()); |
| + } |
| + |
| + final int urlPathComponentsLen = docUrl.getPathComponents().length; |
| + final int patternPathComponentsLen = mUrl.getPathComponents().length; |
| + |
| + // If the page param is inside of path components, both the pattern and doc URL must have |
| + // the similar path. |
| + if (urlPathComponentsLen > patternPathComponentsLen) return false; |
| + |
| + // If both doc URL and page pattern have only 1 component, their common prefix+suffix must |
| + // be at least half of the entire component in doc URL, e.g doc URL is |
| + // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads-132-[*!]". |
| + if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { |
| + final String urlComponent = docUrl.getPathComponents()[0]; |
| + final String patternComponent = mUrl.getPathComponents()[0]; |
| + int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, patternComponent); |
| + return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) + |
|
cjhopman
2015/04/07 00:45:48
nit: could you put the common suffix in a local va
kuan
2015/04/10 22:41:28
Done.
|
| + commonPrefixLen) * 2 >= urlComponent.length(); |
| + } |
| + |
| + if (!hasSamePathComponentsAs(docUrl)) return false; |
| + |
| + if (isCalendarPage()) return false; |
|
cjhopman
2015/04/07 00:45:48
Why do we check this here? it doesn't really depen
kuan
2015/04/10 22:41:28
it's true it doesn't depend on doc url. however,
|
| + |
| + return true; |
| + } |
| + |
| + private static RegExp sSlashOrHtmExtRegExp = null; // Match either '/' or ".htm(l)". |
| + |
| + /** |
| + * Returns true if a URL matches this page pattern based on a pipeline of rules: |
| + * - suffix (part of pattern after page param placeholder) must be same, and |
| + * - for query page parameter, |
| + * - scheme, host, and path must be same, and |
| + * - query components, except that for page number, must be same in order and value, and |
| + * - query value must be a plain number. |
| + * - for path page parameter that is part of a path component, |
| + * - if the first different character in path component is suffix, it must be a page parameter |
| + * separator, followed by the page parameter in the pattern |
| + * - else if it's page parameter, it and possible following digits must be a plain number. |
| + * - for path page parameter that is the entire path component, |
| + * - if URL has no page number param and previous path component, everything else matches, or |
| + * - if prefix is the same, URL doesn't have anyhing else |
| + * - else url must have '/' at the same position as pattern's page parameter path component, |
| + * followed by a plain number. |
| + * |
| + * @param url the URL to evalutate |
| + * @param pagePattern the URL page pattern to match with |
| + */ |
| + boolean isPagingUrl(String url) { |
| + final int urlLen = url.length(); |
| + final int patternLen = mUrlStr.length(); |
| + boolean isDynamicParam = isDynamic(); |
| + |
| + // Both url and patterm must have the same suffix, if available. |
|
cjhopman
2015/04/07 00:45:48
nit: s/patterm/pattern
kuan
2015/04/10 22:41:28
Done.
|
| + int suffixLen = patternLen - mPageParamPos - |
| + PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN; |
| + if (suffixLen != 0) { |
| + int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'. |
| + if (!url.regionMatches(urlLen - compareLen, mUrlStr, patternLen - compareLen, |
| + compareLen)) { |
| + return false; |
| + } |
| + } |
| + |
| + final int suffixPos = urlLen - suffixLen; |
| + |
|
cjhopman
2015/04/07 00:45:48
Could we split these three cases into their own fu
kuan
2015/04/10 22:41:27
Done.
|
| + if (isDynamicParam) { |
| + // If page parameter is dynamic, the url matches the pattern only when: |
| + // 1. has same prefix (scheme, host, path) |
| + // 2. has same query components with same value (except page number query) in the same |
| + // order. |
| + // Examples: |
| + // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3 |
| + // Returns true for: |
| + // - http://foo.com/a/b/?queryA=v1&queryC=v3 |
| + // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3 |
| + // Otherwise, returns false. |
| + // |
| + // If page pattern is http://foo.com/a/b?page=[*!]&query=a |
| + // Returns true for: |
| + // - http://foo.com/a/b?query=a |
| + // - http://foo.com/a/b?page=2&query=a |
| + // Otherwise, returns false. |
| + // |
| + // If page pattern is http://foo.com/a/b?page=[*!] |
| + // Returns true for: |
| + // - http://foo.com/a/b/ |
| + // - http://foo.com/a/b.html |
| + // - http://foo.com/a/b.htm |
| + // - http://foo.com/a/b?page=2 |
| + // Otherwise, returns false. |
| + |
| + // Both url and pattern must have the same prefix. |
| + if (suffixPos < mComponentPos || !url.regionMatches(0, mUrlStr, 0, mComponentPos)) { |
|
cjhopman
2015/04/07 00:45:48
how about:
url.endsWith(mRequiredSuffix)
and cal
kuan
2015/04/10 22:41:27
Done. i assume u mean for lines 110-119 (at the b
|
| + return false; |
| + } |
| + |
| + // If the url doesn't have page number query, it is fine. |
| + if (mComponentPos == suffixPos) return true; |
| + |
| + // If the only difference in the page param query component of url and pattern is "/", |
| + // ".html" or ".html", it is fine. |
|
cjhopman
2015/04/07 00:45:48
s/.html/.htm for one of the two
kuan
2015/04/10 22:41:28
Done.
|
| + String diffPart = url.substring(mComponentPos, suffixPos).toLowerCase(); |
|
cjhopman
2015/04/07 00:45:48
I'm not really sure what diffpart is here. What is
kuan
2015/04/10 22:41:28
yes, it's for the last set of examples.
|
| + if (sSlashOrHtmExtRegExp == null) { |
| + sSlashOrHtmExtRegExp = RegExp.compile("^\\/|(.html?)$", "i"); |
| + } |
| + if (sSlashOrHtmExtRegExp.test(diffPart)) return true; |
| + |
| + // Both url and pattern must have the same query name. |
| + if (!url.regionMatches(mComponentPos, mUrlStr, mComponentPos, |
| + mPageParamPos - mComponentPos)) { |
| + return false; |
| + } |
| + |
| + return isPlainNumber(url.substring(mPageParamPos, suffixPos)); |
| + } // isDynamicParam |
| + |
| + // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: |
| + // - www.foo.com/a/abc-2.html |
| + // - www.foo.com/a/abc.html. |
| + // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: |
| + // - www.foo.com/a/2/abc.html |
|
cjhopman
2015/04/07 00:45:49
The second part of this comment confused me becaus
kuan
2015/04/10 22:41:28
yes, i've now split up the comments since the func
|
| + // - www.foo.com/a/abc.html |
| + // - www.foo.com/abc.html. |
| + |
| + // Handle case where page param is part of the path component (as opposed to being the |
| + // entire path component). |
| + if (isPartialPathComponent()) { |
| + // The page param path component of both url and pattern must have the same prefix. |
| + if (urlLen < mComponentPos + suffixLen || |
|
cjhopman
2015/04/07 00:45:49
url.startsWith(mRequiredPrefix)?
I guess if you d
kuan
2015/04/10 22:41:27
Done. i'm not sure wha tu mean by "make it clear"
|
| + !url.regionMatches(0, mUrlStr, 0, mComponentPos)) { |
| + return false; |
| + } |
| + |
| + // Find the first different character in page param path component just before |
| + // placeholder or suffix, then check if it's acceptable. |
| + int firstDiffPos = mComponentPos; |
| + int maxPos = Math.min(mPageParamPos, suffixPos); |
| + for (; firstDiffPos < maxPos; firstDiffPos++) { |
| + if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break; |
| + } |
| + if (firstDiffPos == suffixPos) { // First different character is the suffix. |
| + if (firstDiffPos + 1 == mPageParamPos && |
|
cjhopman
2015/04/07 00:45:49
Hm, if I read this correctly, the following return
kuan
2015/04/10 22:41:28
yes it returns false, i verified in original code.
|
| + isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) { |
| + return true; |
| + } |
| + } else if (firstDiffPos == mPageParamPos) { // First different character is page param. |
| + if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) return true; |
| + } |
| + |
| + return false; |
| + } // page param is part of the (not entire) path component. |
| + |
| + // Handle case where page param is the entire path component. |
|
cjhopman
2015/04/07 00:45:49
This comment looks like it applies to just the nex
kuan
2015/04/10 22:41:28
Done. comment is no longer needed since i've extr
|
| + int prevComponentPos = mUrlStr.lastIndexOf('/', mComponentPos - 1); |
|
cjhopman
2015/04/07 00:45:48
Should this only be looking at the path? If not, w
kuan
2015/04/10 22:41:28
Done. good catch.
|
| + if (prevComponentPos != -1) { |
| + // The url doesn't have page number param and previous path component, like |
|
cjhopman
2015/04/07 00:45:49
Stated like this, it seems the comment applies to
kuan
2015/04/10 22:41:28
Done.
|
| + // www.foo.com/abc.html. |
| + if (prevComponentPos + suffixLen == urlLen) { |
| + return url.regionMatches(0, mUrlStr, 0, prevComponentPos); |
| + } |
| + } |
| + |
| + // If both url and pattern have the same prefix, url must have nothing else. |
| + if (url.regionMatches(0, mUrlStr, 0, mComponentPos)) { |
| + int acceptLen = mComponentPos + suffixLen; |
| + // The url doesn't have page number parameter, like www.foo.com/a/abc.html. |
| + if (acceptLen == urlLen) return true; |
| + if (acceptLen > urlLen) return false; |
| + |
| + // While we are here, the url must have page number param, so the url must have a '/' |
| + // at the pattern's path component start position. |
| + if (url.charAt(mComponentPos) != '/') return false; |
| + |
| + return isPlainNumber(url.substring(mComponentPos + 1, suffixPos)); |
| + } |
| + |
| + return false; |
| + } |
| + |
| + private PagePattern(String urlStr) throws Exception { |
| + mUrl = ParsedUrl.create(urlStr); |
| + if (mUrl == null) throw new Exception("Invalid URL: " + urlStr); |
| + mPageParamPos = urlStr.indexOf(PageParameterDetector.PAGE_PARAM_PLACEHOLDER); |
| + if (mPageParamPos == -1) { |
| + throw new Exception("Required page param placeholder is missing in " + urlStr); |
| + } |
| + mUrlStr = urlStr; |
| + mQueryPos = mUrlStr.lastIndexOf('?', mPageParamPos - 1); |
| + if (isQueryPattern()) { // Page param is one of the query components. |
| + mComponentPos = mUrlStr.lastIndexOf('&', mPageParamPos - 1); |
| + // Page param is the first query. |
| + if (mComponentPos == -1) mComponentPos = mQueryPos; |
| + } else { // Page param is one of the path components. |
| + mComponentPos = mUrlStr.lastIndexOf('/', mPageParamPos); |
| + } |
| + } |
| + |
| + private boolean isQueryPattern() { |
| + return mQueryPos != -1; |
| + } |
| + |
| + private boolean isPathPattern() { |
| + return !isQueryPattern() && mComponentPos != -1; |
| + } |
| + |
| + private boolean isDynamic() { |
| + return isQueryPattern() && mComponentPos > 0 && mUrlStr.charAt(mPageParamPos - 1) == '='; |
| + } |
| + |
| + private boolean isPartialPathComponent() { |
| + return isPathPattern() && mUrlStr.charAt(mPageParamPos - 1) != '/'; |
| + } |
| + |
| + private void determineParamIndex() { |
|
cjhopman
2015/04/07 00:45:48
We should just call this in the constructor so tha
kuan
2015/04/10 22:41:27
Done.
|
| + if (mParamIndex != -1 || !isPathPattern()) return; |
| + final String[] pathComponents = mUrl.getPathComponents(); |
| + for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++) { |
| + if (pathComponents[mParamIndex].contains( |
| + PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) { |
| + break; |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Returns true if, except for the path component containing the page param, the other path |
| + * components of doc URL are the same as pattern's. But pattern may have more components, e.g. |
| + * doc URL is /thread/12 and pattern is /thread/12/page/[*!]. |
| + */ |
| + private boolean hasSamePathComponentsAs(ParsedUrl docUrl) { |
| + determineParamIndex(); |
| + final String[] urlComponents = docUrl.getPathComponents(); |
| + final String[] patternComponents = mUrl.getPathComponents(); |
| + boolean passedParamComponent = false; |
| + for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents.length; i++, j++) { |
| + if (i == mParamIndex && !passedParamComponent) { |
| + passedParamComponent = true; |
| + // Repeat current path component if doc URL has less components (as per comments |
| + // just above, doc URL may have less components). |
| + if (urlComponents.length < patternComponents.length) i--; |
| + continue; |
| + } |
| + if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false; |
| + } |
| + |
| + return true; |
| + } |
| + |
| + /** |
| + * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a |
| + * false-positive. |
| + */ |
| + private boolean isCalendarPage() { |
| + determineParamIndex(); |
| + if (mParamIndex < 2) return false; |
| + |
| + // Only if param is the entire path component. This handles some cases erroneously |
| + // considered false-positives e.g. first page is |
| + // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467.html, |
| + // and second page is |
| + // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467_Page2.html, |
| + // would be considered false-positives otherwise because of "2014" and "07". |
| + final String[] patternComponents = mUrl.getPathComponents(); |
| + if (patternComponents[mParamIndex].length() != |
| + PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) { |
| + return false; |
| + } |
| + |
| + int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]); |
| + if (month > 0 && month <= 12) { |
| + int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]); |
| + if (year > 1970 && year < 3000) return true; |
| + } |
| + |
| + return false; |
| + } |
| + |
| + private static int getLongestCommonPrefixLength(String str1, String str2) { |
| + if (str1.isEmpty() || str2.isEmpty()) return 0; |
| + |
| + int limit = Math.min(str1.length(), str2.length()); |
| + int i = 0; |
| + for (; i < limit; i++) { |
| + if (str1.charAt(i) != str2.charAt(i)) break; |
| + } |
| + return i; |
| + } |
| + |
| + private static int getLongestCommonSuffixLength(String str1, String str2, int startIndex) { |
| + int commonSuffixLen = 0; |
| + for (int i = str1.length() - 1, j = str2.length() - 1; |
| + i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { |
| + if (str1.charAt(i) != str2.charAt(i)) break; |
| + } |
| + return commonSuffixLen; |
| + } |
| + |
| + /** |
| + * Returns true if given string can be converted to a number >= 0. |
| + */ |
| + private static boolean isPlainNumber(String str) { |
| + return StringUtil.toNumber(str) >= 0; |
| + } |
| + |
| + /** |
| + * Returns true if given character is one of '-', '_', ';', ','. |
| + */ |
| + public static native boolean isPageParamSeparator(Character c) /*-{ |
| + return /[-_;,]/.test(c); |
| + }-*/; |
| + |
| +} |