Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(272)

Unified Diff: java/org/chromium/distiller/PathComponentPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: copyright 2016 -> 2015 Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/PathComponentPagePattern.java
diff --git a/java/org/chromium/distiller/PathComponentPagePattern.java b/java/org/chromium/distiller/PathComponentPagePattern.java
new file mode 100644
index 0000000000000000000000000000000000000000..3d16f22f3d8cc875663fd375a2260bc6a60feb7d
--- /dev/null
+++ b/java/org/chromium/distiller/PathComponentPagePattern.java
@@ -0,0 +1,359 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller;
+
+import com.google.gwt.regexp.shared.MatchResult;
+import com.google.gwt.regexp.shared.RegExp;
+
+/**
+ * This class detects the page parameter in the path of a potential pagination URL. If detected,
+ * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates
+ * and returns a new object. This object can then be accessed via PageParameterDetector.PagePattern
+ * interface to:
+ * - validate the generated URL page pattern against the document URL
+ * - determine if a URL is a paging URL based on the page pattern.
+ * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is
+ * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDetector.java).
+ */
+public class PathComponentPagePattern implements PageParameterDetector.PagePattern {
+ private final ParsedUrl mUrl;
+ private final int mPageNumber;
+ private final int mPlaceholderStart;
+ private final String mUrlStr;
+ // Start position of path component containing placeholder.
+ private int mPlaceholderSegmentStart;
+ // Page param path component in list of path components.
+ private int mParamIndex = -1;
+ private final String mPrefix; // The part of the page pattern before the placeholder.
+ private String mSuffix = ""; // The part of the page pattern after the placeholder.
+
+ /**
+ * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHOLDER.
+ */
+ static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart, int digitStart,
+ int digitEnd) {
+ try {
+ return new PathComponentPagePattern(url, pathStart, digitStart, digitEnd);
+ } catch (Exception e) {
cjhopman 2015/04/16 21:58:32 We shouldn't be throwing/catching the base Excepti
kuan 2015/04/20 23:11:13 Done. ditto for QueryParamPagePattern.
+ return null;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return mUrlStr;
+ }
+
+ @Override
+ public int getPageNumber() {
+ return mPageNumber;
+ }
+
+ /**
+ * Returns true if pattern and URL are sufficiently similar and the pattern's components are not
+ * calendar digits.
+ *
+ * @param docUrl the current document URL
+ */
+ @Override
+ public boolean isValidFor(ParsedUrl docUrl) {
+ final int urlPathComponentsLen = docUrl.getPathComponents().length;
+ final int patternPathComponentsLen = mUrl.getPathComponents().length;
+
+ // If the page param is inside of path components, both the pattern and doc URL must have
cjhopman 2015/04/16 21:58:32 We know that the page param is inside of path comp
kuan 2015/04/20 23:11:13 Done.
+ // the similar path.
+ if (urlPathComponentsLen > patternPathComponentsLen) return false;
+
+ // If both doc URL and page pattern have only 1 component, their common prefix+suffix must
+ // be at least half of the entire component in doc URL, e.g doc URL is
+ // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads-132-[*!]".
+ if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {
+ final String urlComponent = docUrl.getPathComponents()[0];
+ final String patternComponent = mUrl.getPathComponents()[0];
+ int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, patternComponent);
+ int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, patternComponent,
+ commonPrefixLen);
+ return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.length();
+ }
+
+ if (!hasSamePathComponentsAs(docUrl)) return false;
+
+ if (isCalendarPage()) return false;
+
+ return true;
+ }
+
+ /**
+ * Returns true if a URL matches this page pattern based on a pipeline of rules:
+ * - suffix (part of pattern after page param placeholder) must be same, and
+ * - for path page parameter that is part of a path component,
+ * - if the first different character in path component is suffix, it must be a page parameter
+ * separator, followed by the page parameter in the pattern
+ * - else if it's page parameter, it and possible following digits must be a plain number.
+ * - for path page parameter that is the entire path component,
+ * - if URL has no page number param and previous path component, everything else matches, or
+ * - if prefix is the same, URL doesn't have anyhing else
+ * - else url must have '/' at the same position as pattern's page parameter path component,
+ * followed by a plain number.
+ *
+ * @param url the URL to evalutate
+ */
+ @Override
+ public boolean isPagingUrl(String url) {
+ // Both url and pattern must have the same suffix, if available.
+ if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false;
+
+ return isPartialPathComponent() ? isPartialPathComponentPagingUrl(url) :
+ isEntirePathComponentPagingUrl(url);
+ }
+
+ private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStart, int digitEnd)
+ throws Exception {
+ final String urlStr = url.toString();
+ if (isLastNumericPathComponentBad(urlStr, pathStart, digitStart, digitEnd)) {
+ throw new Exception("Bad last numeric path component");
+ }
+
+ String valueStr = urlStr.substring(digitStart, digitEnd);
+ int value = StringUtil.toNumber(valueStr);
+ if (value < 0) {
+ throw new Exception("Value in path component is an invalid number: " + valueStr);
+ }
+
+ String pattern = urlStr.substring(0, digitStart) +
+ PageParameterDetector.PAGE_PARAM_PLACEHOLDER + urlStr.substring(digitEnd);
+ mUrl = ParsedUrl.create(pattern);
+ if (mUrl == null) throw new Exception("Invalid URL: " + pattern);
+ mUrlStr = pattern;
+ mPageNumber = value;
+ mPlaceholderStart = digitStart;
+ mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart);
+ determineParamIndex();
+ mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);
+ // Determine suffix, if available.
+ final int urlLen = mUrlStr.length();
+ int suffixLen = urlLen - mPlaceholderStart -
+ PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
+ if (suffixLen != 0) mSuffix = mUrlStr.substring(urlLen - suffixLen);
+ }
+
+ private boolean isPartialPathComponent() {
+ return mUrlStr.charAt(mPlaceholderStart - 1) != '/';
cjhopman 2015/04/16 21:58:32 this is strange, shouldn't foo.com/article/2page/
kuan 2015/04/20 23:11:13 good point, so i did more testing w/ the original
+ }
+
+ private void determineParamIndex() {
+ final String[] pathComponents = mUrl.getPathComponents();
+ for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++) {
+ if (pathComponents[mParamIndex].contains(
+ PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) {
+ break;
+ }
+ }
+ }
+
+ /**
+ * Returns true if, except for the path component containing the page param, the other path
+ * components of doc URL are the same as pattern's. But pattern may have more components, e.g.:
+ * - doc URL is /thread/12, pattern is /thread/12/page/[*!]
+ * returns true because "thread" and "12" in doc URL match those in pattern
+ * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo
+ * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param
+ path component comes after.
+ * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo
+ * returns true because "foo" in doc URL would match "foo" in pattern whose page param path
+ * component is skipped when matching.
+ */
+ private boolean hasSamePathComponentsAs(ParsedUrl docUrl) {
+ determineParamIndex();
cjhopman 2015/04/16 21:58:32 don't need to call this, it's done in the construc
kuan 2015/04/20 23:11:13 Done. i forgot :(
+ final String[] urlComponents = docUrl.getPathComponents();
+ final String[] patternComponents = mUrl.getPathComponents();
+ boolean passedParamComponent = false;
+ for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents.length; i++, j++) {
+ if (i == mParamIndex && !passedParamComponent) {
+ passedParamComponent = true;
+ // Repeat current path component if doc URL has less components (as per comments
+ // just above, doc URL may have less components).
+ if (urlComponents.length < patternComponents.length) i--;
+ continue;
+ }
+ if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a
+ * false-positive.
+ */
+ private boolean isCalendarPage() {
+ determineParamIndex();
+ if (mParamIndex < 2) return false;
+
+ // Only if param is the entire path component. This handles some cases erroneously
+ // considered false-positives e.g. first page is
+ // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467.html,
+ // and second page is
+ // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467_Page2.html,
+ // would be considered false-positives otherwise because of "2014" and "07".
+ final String[] patternComponents = mUrl.getPathComponents();
+ if (patternComponents[mParamIndex].length() !=
+ PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) {
+ return false;
+ }
+
+ int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]);
+ if (month > 0 && month <= 12) {
+ int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]);
+ if (year > 1970 && year < 3000) return true;
+ }
+
+ return false;
+ }
+
+ private static int getLongestCommonPrefixLength(String str1, String str2) {
+ if (str1.isEmpty() || str2.isEmpty()) return 0;
+
+ int limit = Math.min(str1.length(), str2.length());
+ int i = 0;
+ for (; i < limit; i++) {
+ if (str1.charAt(i) != str2.charAt(i)) break;
+ }
+ return i;
+ }
+
+ private static int getLongestCommonSuffixLength(String str1, String str2, int startIndex) {
+ int commonSuffixLen = 0;
+ for (int i = str1.length() - 1, j = str2.length() - 1;
+ i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {
+ if (str1.charAt(i) != str2.charAt(i)) break;
+ }
+ return commonSuffixLen;
+ }
+
+ /**
+ * Returns true if url is a paging URL based on the page pattern where the page param is part
+ * of a path component.
+ * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:
+ * - www.foo.com/a/abc-2.html
+ * - www.foo.com/a/abc.html.
+ */
+ private boolean isPartialPathComponentPagingUrl(String url) {
+ final int urlLen = url.length();
+ final int suffixStart = urlLen - mSuffix.length();
+
+ // The page param path component of both url and pattern must have the same prefix.
+ if (!url.startsWith(mPrefix)) return false;
+
+ // Find the first different character in page param path component just before
+ // placeholder or suffix, then check if it's acceptable.
+ int firstDiffPos = mPlaceholderSegmentStart;
+ int maxPos = Math.min(mPlaceholderStart, suffixStart);
+ for (; firstDiffPos < maxPos; firstDiffPos++) {
+ if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break;
+ }
+ if (firstDiffPos == suffixStart) { // First different character is the suffix.
+ if (firstDiffPos + 1 == mPlaceholderStart &&
+ isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) {
+ return true;
+ }
+ } else if (firstDiffPos == mPlaceholderStart) { // First different character is page param.
+ if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Returns true if url is a paging URL based on the page pattern where the page param is the
+ * entire path component.
+ * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:
+ * - www.foo.com/a/2/abc.html
+ * - www.foo.com/a/abc.html
+ * - www.foo.com/abc.html.
+ */
+ private boolean isEntirePathComponentPagingUrl(String url) {
+ final int urlLen = url.length();
+ final int suffixLen = mSuffix.length();
+ final int suffixStart = url.length() - suffixLen;
+
+ int prevComponentPos = mUrl.getPath().lastIndexOf('/',
+ // We're only looking in the path, so the reverse search should start at the index
+ // excluding the url's origin.
+ mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length());
+ if (prevComponentPos != -1) {
+ // Now, add back the url's origin to the index of previous path component.
+ prevComponentPos += mUrl.getOrigin().length();
+ if (prevComponentPos + suffixLen == urlLen) {
+ // The url doesn't have page number param and previous path component, like
+ // www.foo.com/abc.html.
+ return url.regionMatches(0, mUrlStr, 0, prevComponentPos);
+ }
+ }
+
+ // If both url and pattern have the same prefix, url must have nothing else.
+ if (url.startsWith(mPrefix)) {
+ int acceptLen = mPlaceholderSegmentStart + suffixLen;
+ // The url doesn't have page number parameter, like www.foo.com/a/abc.html.
+ if (acceptLen == urlLen) return true;
+ if (acceptLen > urlLen) return false;
+
+ // While we are here, the url must have page number param, so the url must have a '/'
+ // at the pattern's path component start position.
+ if (url.charAt(mPlaceholderSegmentStart) != '/') return false;
+
+ return PageParameterDetector.isPlainNumber(url.substring(mPlaceholderSegmentStart + 1,
+ suffixStart));
+ }
+
+ return false;
+ }
+
+ private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).
+ private static RegExp sLastPathComponentRegExp = null; // Match last path component.
+
+ /**
+ * Returns true if:
+ * - the digitStart to digitEnd of urlStr is the last path component, and
+ * - the entire path component is numeric, and
+ * - the previous path component is a bad page param name.
+ * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad
+ * page param.
+ */
+ static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, int digitStart,
+ int digitEnd) {
+ if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
+ pathStart < digitStart - 1) { // Not the first path component.
+ String postMatch = urlStr.substring(digitEnd).toLowerCase();
+ // Checks that this is the last path component, and trailing characters, if available,
+ // are (s)htm(l) extensions.
+ if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");
+ if (sExtRegExp.test(postMatch)) {
+ // Entire component is numeric, get previous path component.
+ if (sLastPathComponentRegExp == null) {
+ sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i");
+ }
+ MatchResult prevPathComponent = sLastPathComponentRegExp.exec(
+ urlStr.substring(pathStart + 1, digitStart));
+ if (prevPathComponent != null && prevPathComponent.getGroupCount() > 1 &&
+ PageParameterDetector.isPageParamNameBad(prevPathComponent.getGroup(1))) {
+ return true;
+ }
+ } // last numeric path component
+ }
+
+ return false;
+ }
+
+ /**
+ * Returns true if given character is one of '-', '_', ';', ','.
+ */
+ private static native boolean isPageParamSeparator(Character c) /*-{
+ return /[-_;,]/.test(c);
+ }-*/;
+
+}

Powered by Google App Engine
This is Rietveld 408576698