Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(651)

Unified Diff: java/org/chromium/distiller/PageParameterDetector.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: rename test Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | javatests/org/chromium/distiller/PageParameterDetectorTest.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: java/org/chromium/distiller/PageParameterDetector.java
diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java
index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..e3fb9b69b893d9d70ad50410b0489c9e490cd283 100644
--- a/java/org/chromium/distiller/PageParameterDetector.java
+++ b/java/org/chromium/distiller/PageParameterDetector.java
@@ -52,6 +52,11 @@ import java.util.Set;
*/
public class PageParameterDetector {
private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
+ // Length of PAGE_PARAM_PLACEHOLDER, used for offset arithmetic on page patterns.
+ private static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length();
+ // Minimum number of LinkInfo's needed before a linear formula is derived from them.
+ private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;
+
+ // Bit flags combined in the return value of arePageNumsAdjacentAndConsecutive().
+ static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;
+ static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;
/**
* Stores information about the link (anchor) after the page parameter is detected:
@@ -179,6 +184,169 @@ public class PageParameterDetector {
} // extractPageParamCandidatesFromPath
/**
+ * Validates the page pattern according to the current document URL through a pipeline of rules:
+ * - for query page parameter, pattern and URL must have the same trimmed path (compared
+ * case-insensitively).
+ * - for path page parameter,
+ * - URL must not have more path components than pattern (pattern may have more).
+ * - if both have only 1 path component, their common prefix + suffix must cover at least half
+ * of the URL's component.
+ * - else all pattern's components, except for page parameter, must be same as url's.
+ * - lastly, when the page parameter is an entire path component, the two components before it
+ * cannot look like a year and a month (e.g. 2012/01/[*!]).
+ *
+ * Returns true if page pattern is valid.
+ *
+ * @param docUrl the current document URL
+ * @param pagePattern the page pattern to validate
+ * @return true if pagePattern contains the page parameter placeholder and passes the rules
+ * above; false otherwise
+ */
+ static boolean isPagePatternValid(ParsedUrl docUrl, String pagePattern) {
+ int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER);
+ if (pageParamPos == -1) return false;
+
+ ParsedUrl patternUrl = ParsedUrl.create(pagePattern);
+
+ // If page parameter is a query, page pattern and doc URL must have the same path.
+ if (pagePattern.lastIndexOf('?', pageParamPos - 1) != -1) {
+ return docUrl.getTrimmedPath().equalsIgnoreCase(patternUrl.getTrimmedPath());
+ }
+
+ final String[] urlPathComponents = docUrl.getPathComponents();
+ final String[] patternPathComponents = patternUrl.getPathComponents();
+ final int urlPathComponentsLen = urlPathComponents.length;
+ final int patternPathComponentsLen = patternPathComponents.length;
+
+ // If the page param is inside of path components, the pattern and doc URL must have
+ // similar paths; doc URL cannot have more path components than the pattern.
+ if (urlPathComponentsLen > patternPathComponentsLen) return false;
cjhopman 2015/03/27 00:16:12 why ">" and not "!="?
kuan 2015/03/31 17:17:50 because pattern can hv more path components than d
+
+ // If both doc URL and page pattern have only 1 component, their common prefix+suffix must
+ // be at least half of the entire component in doc URL, e.g doc URL is
+ // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads-132-[*!]".
+ if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {
+ final String urlComponent = urlPathComponents[0];
+ final String patternComponent = patternPathComponents[0];
+ int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, patternComponent);
+ return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) +
+ commonPrefixLen) * 2 >= urlComponent.length();
+ }
+
+ // Get index of the pattern's path component that contains the page parameter placeholder.
+ int paramIndex = 0;
+ for (; paramIndex < patternPathComponentsLen; paramIndex++) {
+ if (patternPathComponents[paramIndex].contains(PAGE_PARAM_PLACEHOLDER)) break;
+ }
+
+ // Except for the component containing the page param, the other components of doc URL must
cjhopman 2015/03/27 00:16:12 Can this be extracted to a separate function.
kuan 2015/03/31 17:17:50 Done.
+ // be part of pattern's path. But pattern may have more components, e.g. doc URL is
+ // /thread/12 and pattern is /thread/12/page/[*!].
+ boolean passedPageParamComponent = false;
+ for (int i = 0, j = 0; i < urlPathComponentsLen && j < patternPathComponentsLen; i++, j++) {
cjhopman 2015/03/27 00:16:12 I'm not really sure I follow the logic here (and a
cjhopman 2015/03/27 00:18:21 It won't reject that example actually. Still, how
kuan 2015/03/31 17:17:50 this would be invalid - pattern has extra "page" p
cjhopman 2015/04/07 00:45:48 I guess that the behavior doesn't seem to match th
kuan 2015/04/10 22:41:27 i've added ur examples, with explanations, to the
+ if (i == paramIndex && !passedPageParamComponent) {
+ passedPageParamComponent = true;
+ // Repeat current path component if doc URL has less components (as per comments
+ // just above, doc URL may have less components).
+ if (urlPathComponentsLen < patternPathComponentsLen) i--;
+ continue;
+ }
+
+ if (!urlPathComponents[i].equalsIgnoreCase(patternPathComponents[j])) return false;
+ }
+
+ // Check if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a
+ // false-positive.
+ // NOTE(review): if no path component contains the placeholder, paramIndex equals
+ // patternPathComponentsLen here and the indexing below would be out of bounds - confirm
+ // that callers guarantee the placeholder is inside the path.
+ if (paramIndex >= 2 &&
cjhopman 2015/03/27 00:16:12 Extract this to another function
kuan 2015/03/31 17:17:50 Done.
+ // Only if param is the entire path component. This handles some cases erroneously
+ // considered false-positives e.g. first page is
+ // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467.html,
cjhopman 2015/03/27 00:16:12 why do we require that it be ordered yyyy/mm/dd fo
kuan 2015/03/31 17:17:50 i would think so. how else do we detect calendar
+ // and second page is
+ // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467_Page2.html,
+ // would be considered false-positives otherwise because of "2014" and "07".
+ patternPathComponents[paramIndex].length() == PAGE_PARAM_PLACEHOLDER_LEN) {
+ int month = StringUtil.toNumber(patternPathComponents[paramIndex - 1]);
+ if (month > 0 && month <= 12) {
+ int year = StringUtil.toNumber(patternPathComponents[paramIndex - 2]);
+ if (year > 1970 && year < 3000) return false;
+ }
+ }
+
+ return true;
+ } // isPagePatternValid
+
+ /**
+  * Decides whether the given LinkInfo's are pagination links and, if so, describes them.
+  *
+  * With at least MIN_LINKS_TO_JUSTIFY_LINEAR_MAP links:
+  * - the links must be adjacent and the ascending numbers consecutive (both as reported by
+  *   arePageNumsAdjacentAndConsecutive()), and the ascending numbers must pass
+  *   isPageNumberSeq();
+  * - the result carries whatever getPageParamLinearFormula() yields, which may be null.
+  * With exactly 1 link and a non-empty firstPageUrl:
+  * - the first ascending number must be page 1;
+  * - the link must be page 2 at position 1, or page 3 at position 2 preceded by page 2;
+  * - firstPageUrl must match pagePattern according to isPagingUrl().
+  *
+  * @param pagePattern the URL pattern to use
+  * @param allLinkInfo the list of LinkInfo's to evaluate
+  * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
+  * @param firstPageUrl the URL of the PageInfo with mPageNum=1
+  * @return a populated PageParamInfo of type PAGE_NUMBER, or null if the links do not qualify
+  */
+ private static PageParamInfo getPageParamInfo(String pagePattern, List<LinkInfo> allLinkInfo,
+         List<PageParamInfo.PageInfo> ascendingNumbers, String firstPageUrl) {
+     final int numLinks = allLinkInfo.size();
+
+     if (numLinks >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {
+         final int flags = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendingNumbers);
+         final boolean adjacent = (flags & PAGE_NUM_ADJACENT_MASK) != 0;
+         if (!adjacent) return null;
+
+         final PageParamInfo.LinearFormula formula = getPageParamLinearFormula(allLinkInfo);
+
+         // PageParamInfo.Type.PAGE_NUMBER requires consecutive ascending numbers that also
+         // form a page number sequence.
+         final boolean consecutive =
+                 (flags & PAGE_NUM_CONSECUTIVE_MASK) == PAGE_NUM_CONSECUTIVE_MASK;
+         if (!consecutive || !isPageNumberSeq(ascendingNumbers)) return null;
+
+         final PageParamInfo multiLinkInfo = new PageParamInfo();
+         multiLinkInfo.mType = PageParamInfo.Type.PAGE_NUMBER;
+         multiLinkInfo.mFormula = formula;
+         for (LinkInfo current : allLinkInfo) {
+             final String pageUrl = ascendingNumbers.get(current.mPosInAscendingList).mUrl;
+             multiLinkInfo.mAllPageInfo.add(
+                     new PageParamInfo.PageInfo(current.mPageNum, pageUrl));
+         }
+         return multiLinkInfo;
+     }
+
+     // Most news articles have no more than 3 pages, and the first page probably doesn't have
+     // any page parameter. If the first page URL matches the page pattern, it is treated as
+     // the first page of this pattern.
+     if (numLinks != 1 || firstPageUrl.isEmpty()) return null;
+
+     final LinkInfo onlyLink = allLinkInfo.get(0);
+     final int linkPageNum = onlyLink.mPageNum;
+     final int linkPos = onlyLink.mPosInAscendingList;
+
+     final boolean linksToSecondPage = linkPageNum == 2 && linkPos == 1;
+     // When linkPos is 2, ascendingNumbers has >= 3 elements, so element 1 exists; it must
+     // be the previous page.
+     final boolean linksToThirdPage = linkPageNum == 3 && linkPos == 2 &&
+             ascendingNumbers.get(1).mPageNum == 2;
+
+     // 1 LinkInfo means ascendingNumbers has >= 1 element.
+     if (ascendingNumbers.get(0).mPageNum != 1) return null;
+     if (!linksToSecondPage && !linksToThirdPage) return null;
+     if (!isPagingUrl(firstPageUrl, pagePattern)) return null;
+
+     // Has valid PageParamInfo, create and populate it.
+     final int paramValue = onlyLink.mPageParamValue;
+     final int offset = paramValue - linkPageNum;
+     final boolean offsetIsZeroOrOne = offset == 0 || offset == 1;
+     final int coefficient = offsetIsZeroOrOne ? 1 : paramValue;
+     final int delta = offsetIsZeroOrOne ? offset : 0;
+
+     final PageParamInfo singleLinkInfo = new PageParamInfo();
+     singleLinkInfo.mType = PageParamInfo.Type.PAGE_NUMBER;
+     singleLinkInfo.mFormula = new PageParamInfo.LinearFormula(coefficient, delta);
+     singleLinkInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, firstPageUrl));
+     singleLinkInfo.mAllPageInfo.add(
+             new PageParamInfo.PageInfo(linkPageNum, ascendingNumbers.get(linkPos).mUrl));
+     return singleLinkInfo;
+ }
+
+ /**
* Returns true if given name is blacklisted as a known bad page param name.
*/
private static boolean isPageParamNameBad(String name) {
@@ -197,8 +365,8 @@ public class PageParameterDetector {
* E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad
* page param.
*/
- static boolean isLastNumericPathComponentBad(String urlStr, int pathStart,
- int digitStart, int digitEnd) {
+ static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, int digitStart,
+ int digitEnd) {
if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
pathStart < digitStart - 1) { // Not the first path component.
String postMatch = urlStr.substring(digitEnd).toLowerCase();
@@ -222,6 +390,374 @@ public class PageParameterDetector {
return false;
} // isLastNumericPathComponentBad
+ private static int getLongestCommonPrefixLength(String str1, String str2) {
+ if (str1.isEmpty() || str2.isEmpty()) return 0;
+
+ int limit = Math.min(str1.length(), str2.length());
+ int i = 0;
+ for (; i < limit; i++) {
+ if (str1.charAt(i) != str2.charAt(i)) break;
+ }
+ return i;
+ } // getLongestCommonPrefixLength
cjhopman 2015/03/27 00:16:12 Let's remove all these comments marking what funct
kuan 2015/03/31 17:17:50 Done.
+
+ /**
+  * Returns the number of trailing characters that str1 and str2 share, comparing backwards
+  * from the end of each string. Only characters at indices strictly greater than startIndex
+  * (in either string) are compared, so the count never reaches back to startIndex.
+  *
+  * @param str1 first string to compare
+  * @param str2 second string to compare
+  * @param startIndex index that the backwards scan must stay above in both strings
+  * @return length of the common suffix found under the above restriction
+  */
+ private static int getLongestCommonSuffixLength(String str1, String str2, int startIndex) {
+     int commonSuffixLen = 0;
+     for (int i = str1.length() - 1, j = str2.length() - 1;
+             i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {
+         // Each string is read through its own cursor: the strings may differ in length, so
+         // reading str2 at i would compare the wrong characters, or index past the end of
+         // str2 when it is the shorter one.
+         if (str1.charAt(i) != str2.charAt(j)) break;
+     }
+     return commonSuffixLen;
+ }
+
+ /**
+ * Detects if page numbers in list of LinkInfo's are adjacent, and if page numbers in list of
+ * PageParamInfo.PageInfo's are consecutive.
+ *
+ * For adjacency, the page numbers in list of LinkInfo's must either be adjacent, or separated
+ * by at most 1 plain text number which must represent the current page number in one of the
+ * PageParamInfo.PageInfo's.
+ * For consecutiveness, there must be at least one pair of consecutive number values in the list
+ * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are
+ * likely to be page size selection links (e.g. in the document "See 1-10, 11-20...").
+ *
+ * Returns an int value that is a combination of bits:
+ * - bit for PAGE_NUM_ADJACENT_MASK is set if allLinkInfo are adjacent
+ * - bit for PAGE_NUM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive.
+ *
+ * @param allLinkInfo the list of LinkInfo's to evaluate
+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
+ * @return 0 if allLinkInfo is not adjacent or has a duplicate page param value; otherwise
+ * PAGE_NUM_ADJACENT_MASK, combined with PAGE_NUM_CONSECUTIVE_MASK if consecutive
+ */
+ static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo,
+ List<PageParamInfo.PageInfo> ascendingNumbers) {
+ int result = 0;
+
+ // Check if elements in allLinkInfo are adjacent or there's only 1 gap i.e. the gap is
+ // current page number represented in plain text.
+ int firstPos = -1;
+ int lastPos = -1;
+ int gapPos = -1;
+ Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that page param value is unique.
+ for (LinkInfo linkInfo : allLinkInfo) {
+ final int currPos = linkInfo.mPosInAscendingList;
+ if (lastPos == -1) {
+ firstPos = currPos;
+ } else if (currPos != lastPos + 1) {
+ // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6
+ // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), allLinkInfo is not
+ // adjacent.
+ if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1) return result;
+ gapPos = currPos - 1;
+ }
+ // Make sure page param value, i.e. page number represented in plain text, is unique.
+ if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;
+ lastPos = currPos;
+ } // for all LinkInfo's
+
+ result |= PAGE_NUM_ADJACENT_MASK;
+
+ // Now, determine if page numbers in ascendingNumbers are consecutive.
+
+ // First, handle the gap.
+ if (gapPos != -1) {
+ if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return result;
+ // The "gap" should represent current page number in plain text.
+ // Check if its adjacent page numbers are consecutive.
+ // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.
+ // This can eliminate links affecting the number of items on a page.
+ final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;
+ if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&
+ ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {
+ return result | PAGE_NUM_CONSECUTIVE_MASK;
+ }
+ return result;
+ }
+
+ // There is no gap. Check if at least one of the following cases is satisfied:
+ // NOTE(review): cases #1 and #3 read ascendingNumbers at index 1 and size - 2, which
+ // presumes the list has at least 2 elements - confirm callers guarantee this.
+ // Case #1: "[1] [2] ..." or "1 [2] ... ".
+ if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&
+ ascendingNumbers.get(1).mPageNum == 2) {
+ return result | PAGE_NUM_CONSECUTIVE_MASK;
+ }
+ // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.
+ if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&
+ ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(0).mUrl.isEmpty()) {
+ return result | PAGE_NUM_CONSECUTIVE_MASK;
+ }
+ // Case #3: "... [n-1] [n]" or "... [n - 1] n".
+ final int numbersSize = ascendingNumbers.size();
+ if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) &&
+ ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==
+ ascendingNumbers.get(numbersSize - 1).mPageNum) {
+ return result | PAGE_NUM_CONSECUTIVE_MASK;
+ }
+ // Case #4: "... [i-1] [i] [i+1] ...".
+ for (int i = firstPos + 1; i < lastPos; i++) {
+ if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get(i + 1).mPageNum) {
+ return result | PAGE_NUM_CONSECUTIVE_MASK;
+ }
+ }
+
+ // Otherwise, there's no pair of consecutive values.
+ return result;
+ } // arePageNumsAdjacentAndConsecutive
+
+ /**
+ *
+ * Determines if the list of LinkInfo's form a linear formula:
+ * pageParamValue = coefficient * pageNum + delta (delta == -coefficient or delta == 0).
cjhopman 2015/03/27 00:16:11 Do we really need this complicated linear formula?
kuan 2015/03/31 17:17:50 it's true we don't really care about the actual va
cjhopman 2015/04/07 00:45:48 i just want you to be sure if it's necessary or un
kuan 2015/04/10 22:41:27 i'm wary of removing it now, including the non-1 c
kuan 2015/04/13 17:21:38 to clarify the example above, the pagination URLs
+ *
+ * The coefficient and delta are calculated from the page parameter values and page numbers of 2
+ * LinkInfo's, and then validated against the remaining LinkInfo's.
+ * The order of page numbers doesn't matter.
+ *
+ * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page
+ * parameter formula could be determined. Otherwise, returns null.
+ *
+ * @param allLinkInfo the list of LinkInfo's to evaluate
+ */
+ private static PageParamInfo.LinearFormula getPageParamLinearFormula(
+         List<LinkInfo> allLinkInfo) {
+     if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;
+
+     // The first 2 links define the candidate line; the remaining links validate it.
+     final LinkInfo firstLink = allLinkInfo.get(0);
+     final LinkInfo secondLink = allLinkInfo.get(1);
+
+     // With only 2 links to go by, a formula is derived only if neither page number
+     // exceeds 4.
+     if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.mPageNum) > 4) {
+         return null;
+     }
+
+     int deltaX = secondLink.mPageNum - firstLink.mPageNum;
+     if (deltaX == 0) return null;
+
+     int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;
+     // The coefficient is an int, so the slope must divide exactly. Otherwise the division
+     // below would truncate and produce a formula that the second link itself does not
+     // satisfy (it is not re-checked by the validation loop, which starts at the third link).
+     if (deltaY % deltaX != 0) return null;
+     int coefficient = deltaY / deltaX;
+     if (coefficient == 0) return null;
+
+     // Only lines with delta == 0 or delta == -coefficient are accepted.
+     int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum;
+     if (delta != 0 && delta != -coefficient) return null;
+
+     // Check if the remaining elements are on the same linear map.
+     for (int i = 2; i < allLinkInfo.size(); i++) {
+         final LinkInfo link = allLinkInfo.get(i);
+         if (link.mPageParamValue != coefficient * link.mPageNum + delta) return null;
+     }
+
+     return new PageParamInfo.LinearFormula(coefficient, delta);
+ }
+
+ /**
+ * Returns true if page numbers in list of PageParamInfo.PageInfo's form a sequence, based on
+ * a pipeline of rules:
+ * - first PageInfo must have a URL unless its page number is 1
+ * - there's at most one plain number without URL in list
+ * - if only two pages, they must be siblings, i.e. their page numbers must be consecutive
cjhopman 2015/03/27 00:16:12 what's a sibling?
kuan 2015/03/31 17:17:50 Done.
+ * - neighboring page numbers must be consecutive; otherwise, the 2 non-consecutive numbers must
+ * be at the head or tail of the list and both must have URLs.
+ *
+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
+ * @return true if all rules above hold; false otherwise, including when the list has fewer than
+ * 2 elements
+ */
+ private static boolean isPageNumberSeq(List<PageParamInfo.PageInfo> ascendingNumbers) {
cjhopman 2015/03/27 00:16:12 Try to avoid abbreviations in function names: s/Se
kuan 2015/03/31 17:17:50 Done.
+ if (ascendingNumbers.size() <= 1) return false;
+
+ // The first one must have a URL unless it is the first page.
+ final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0);
+ if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;
+
+ // There's at most one plain number without URL in ascending numbers group.
+ boolean hasPlainNum = false;
+ for (PageParamInfo.PageInfo page : ascendingNumbers) {
+ if (page.mUrl.isEmpty()) {
+ if (hasPlainNum) return false;
+ hasPlainNum = true;
+ }
+ }
+
+ // If there are only two pages, they must be siblings (consecutive page numbers).
+ if (ascendingNumbers.size() == 2) {
+ return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;
+ }
+
+ // Check if page numbers in ascendingNumbers are adjacent and consecutive.
+ for (int i = 1; i < ascendingNumbers.size(); i++) {
+ // If two adjacent numbers are not consecutive, we accept them only when both hold:
+ // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2], [3]...[i], [n].
+ // 2) both of them have URLs.
+ final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i);
+ final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1);
+ if (currPage.mPageNum - prevPage.mPageNum != 1) {
+ if (i != 1 && i != ascendingNumbers.size() - 1) return false;
+ if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return false;
+ }
+ }
+
+ return true;
+ } // isPageNumberSeq
+
+ // Compiled on first use in isPagingUrl(); meant to match either '/' or ".htm(l)".
+ private static RegExp sSlashExtRegExp = null;
cjhopman 2015/03/27 00:16:12 This name needs to be more descriptive.
kuan 2015/03/31 17:17:50 Done.
+
+ /**
+ * Returns true if a URL matches the generated page pattern based on a pipeline of rules:
+ * - suffix (part of pattern after page param placeholder) must be same, and
+ * - for query page parameter,
+ * - scheme, host, and path must be same, and
+ * - query components, except that for page number, must be same in order and value, and
+ * - query value must be a plain number.
+ * - for path page parameter that is part of a path component,
+ * - if the first different character in path component is suffix, it must be a page parameter
+ * separator, followed by the page parameter in the pattern
+ * - else if it's page parameter, it and possible following digits must be a plain number.
+ * - for path page parameter that is the entire path component,
+ * - if URL has no page number param and previous path component, everything else matches, or
+ * - if prefix is the same, URL doesn't have anything else
+ * - else url must have '/' at the same position as pattern's page parameter path component,
+ * followed by a plain number.
+ *
+ * @param url the URL to evaluate
+ * @param pagePattern the URL page pattern to match with
+ * @return true if url matches pagePattern per the rules above; false otherwise, including when
+ * pagePattern has no page parameter placeholder
+ */
+ static boolean isPagingUrl(String url, String pagePattern) {
+ int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER);
+ if (pageParamPos == -1) return false;
+
+ int queryComponentStartPos = pagePattern.lastIndexOf('&', pageParamPos - 1);
+ if (queryComponentStartPos == -1) { // Page number is the first query.
+ queryComponentStartPos = pagePattern.lastIndexOf('?', pageParamPos - 1);
+ }
+
+ final int urlLen = url.length();
+ final int patternLen = pagePattern.length();
+ boolean isDynamicParam = queryComponentStartPos > 0 &&
+ pagePattern.charAt(pageParamPos - 1) == '=';
+
+ // Both url and pattern must have the same suffix, if available.
+ int suffixLen = patternLen - pageParamPos - PAGE_PARAM_PLACEHOLDER_LEN;
+ if (suffixLen != 0) {
+ int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'.
+ if (!url.regionMatches(urlLen - compareLen, pagePattern, patternLen - compareLen,
+ compareLen)) {
+ return false;
+ }
+ }
+
+ final int suffixPos = urlLen - suffixLen;
+
+ if (isDynamicParam) {
+ // If page parameter is dynamic, the url matches the pattern only when:
+ // 1. has same prefix (scheme, host, path)
+ // 2. has same query components with same value (except page number query) in the same
+ // order.
+ // Examples:
+ // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3
+ // Returns true for:
+ // - http://foo.com/a/b/?queryA=v1&queryC=v3
+ // - http://foo.com/a/b/?queryA=v1&queryB=4&queryC=v3
+ // Otherwise, returns false.
+ //
+ // If page pattern is http://foo.com/a/b?page=[*!]&query=a
+ // Returns true for:
+ // - http://foo.com/a/b?query=a
+ // - http://foo.com/a/b?page=2&query=a
+ // Otherwise, returns false.
+ //
+ // If page pattern is http://foo.com/a/b?page=[*!]
+ // Returns true for:
+ // - http://foo.com/a/b/
+ // - http://foo.com/a/b.html
+ // - http://foo.com/a/b.htm
+ // - http://foo.com/a/b?page=2
+ // Otherwise, returns false.
+
+ // Both url and pattern must have the same prefix.
+ if (suffixPos < queryComponentStartPos ||
+ !url.regionMatches(0, pagePattern, 0, queryComponentStartPos)) {
+ return false;
+ }
+
+ // If the url doesn't have page number query, it is fine.
+ if (queryComponentStartPos == suffixPos) return true;
+
+ // If the only difference in the page param query component of url and pattern is "/",
+ // ".htm" or ".html", it is fine.
+ // NOTE(review): as written, the regex is an alternation of "starts with '/'" and "ends
+ // with any character followed by htm or html", which is looser than described above -
+ // confirm this is intended.
+ String diffPart = url.substring(queryComponentStartPos, suffixPos).toLowerCase();
+ if (sSlashExtRegExp == null) sSlashExtRegExp = RegExp.compile("^\\/|(.html?)$", "i");
+ if (sSlashExtRegExp.test(diffPart)) return true;
+
+ // Both url and pattern must have the same query name.
+ if (!url.regionMatches(queryComponentStartPos, pagePattern, queryComponentStartPos,
+ pageParamPos - queryComponentStartPos)) {
+ return false;
+ }
+
+ return isPlainNumber(url.substring(pageParamPos, suffixPos));
+ } // isDynamicParam
+
+ // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:
+ // - www.foo.com/a/abc-2.html
+ // - www.foo.com/a/abc.html.
+ // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:
+ // - www.foo.com/a/2/abc.html
+ // - www.foo.com/a/abc.html
+ // - www.foo.com/abc.html.
+ int pageParamPathComponentPos = pagePattern.lastIndexOf('/', pageParamPos);
+ if (pageParamPathComponentPos == -1) return false;
+
+ // Handle case where page param is part of the path component (as opposed to being the
+ // entire path component).
+ if (pagePattern.charAt(pageParamPos - 1) != '/') {
+ // The page param path component of both url and pattern must have the same prefix.
+ if (urlLen < pageParamPathComponentPos + suffixLen ||
+ !url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) {
+ return false;
+ }
+
+ // Find the first different character in page param path component just before
+ // placeholder or suffix, then check if it's acceptable.
+ int firstDiffPos = pageParamPathComponentPos;
+ int maxPos = Math.min(pageParamPos, suffixPos);
+ for (; firstDiffPos < maxPos; firstDiffPos++) {
+ if (url.charAt(firstDiffPos) != pagePattern.charAt(firstDiffPos)) break;
+ }
+ if (firstDiffPos == suffixPos) { // First different character is the suffix.
+ if (firstDiffPos + 1 == pageParamPos &&
+ isPageParamSeparator(pagePattern.charAt(firstDiffPos))) {
+ return true;
+ }
+ } else if (firstDiffPos == pageParamPos) { // First different character is page param.
+ if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) return true;
+ }
+
+ return false;
+ } // page param is part of the (not entire) path component.
+
+ // Handle case where page param is the entire path component.
+ int prevPageParamPathComponentPos = pagePattern.lastIndexOf('/',
+ pageParamPathComponentPos - 1);
+ if (prevPageParamPathComponentPos != -1) {
+ // The url doesn't have page number param and previous path component, like
+ // www.foo.com/abc.html.
+ if (prevPageParamPathComponentPos + suffixLen == urlLen) {
+ return url.regionMatches(0, pagePattern, 0, prevPageParamPathComponentPos);
+ }
+ }
+
+ // If both url and pattern have the same prefix, url must have nothing else.
+ if (url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) {
+ int acceptLen = pageParamPathComponentPos + suffixLen;
+ // The url doesn't have page number parameter, like www.foo.com/a/abc.html.
+ if (acceptLen == urlLen) return true;
+ if (acceptLen > urlLen) return false;
+
+ // While we are here, the url must have page number param, so the url must have a '/'
+ // at the pattern's path component start position.
+ if (url.charAt(pageParamPathComponentPos) != '/') return false;
+
+ return isPlainNumber(url.substring(pageParamPathComponentPos + 1, suffixPos));
+ }
+
+ return false;
+ } // isPagingUrl
+
/**
* If sBadPageParamNames is null, initialize it with all the known bad page param names, in
* alphabetical order.
@@ -261,4 +797,18 @@ public class PageParameterDetector {
sBadPageParamNames.add("wiki");
} // initBadPageParamNames
+ /**
+ * Returns true if given string can be converted to a number >= 0.
+ */
+ private static boolean isPlainNumber(String str) {
+ return StringUtil.toNumber(str) >= 0;
+ } // isPlainNumber
+
+ /**
+ * Returns true if given character is one of '-', '_', ';', ','.
+ *
+ * Declared native with a JavaScript body: the check is the regular expression test
+ * /[-_;,]/.test(c) below.
+ *
+ * @param c the character to test
+ * @return true if c matches the character class [-_;,]
+ */
+ public static native boolean isPageParamSeparator(Character c) /*-{
+ return /[-_;,]/.test(c);
+ }-*/;
+
}
« no previous file with comments | « no previous file | javatests/org/chromium/distiller/PageParameterDetectorTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698