java/org/chromium/distiller/PageParameterDetector.java - Issue 1029593003: implement validations of pagination URLs

Unified Diff: java/org/chromium/distiller/PageParameterDetector.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « java/org/chromium/distiller/PageParamInfo.java ('k') | java/org/chromium/distiller/ParsedUrl.java » ('j') | java/org/chromium/distiller/PathComponentPagePattern.java » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: java/org/chromium/distiller/PageParameterDetector.java

diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java

index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..4149b8fce8d88098cfeeadc93a8e6154a5a75f33 100644

--- a/java/org/chromium/distiller/PageParameterDetector.java

+++ b/java/org/chromium/distiller/PageParameterDetector.java

@@ -1,4 +1,4 @@

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

@@ -29,8 +29,8 @@ import java.util.Set;

* "digital" means the text contains only digits.

* A page pattern is a paging URL whose page parameter value is replaced with a place holder

* (PAGE_PARAM_PLACEHOLDER).

- * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pattern is

- * "http: *www.foo.com/a/b-[*!].html".

+ * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is

+ * "http://www.foo.com/a/b-[*!].html".

* This class extracts the page parameter from a document's outlinks.

* The basic idea:

@@ -39,19 +39,58 @@ import java.util.Set;

* (either a query value or a path component) in URL. If one part of a URL is always a linear

* map from its digital anchor text, we guess the part is the page parameter of the URL.

- * As an example, consider a document http: *a/b?c=1&p=10, which contains the following digital

+ * As an example, consider a document http://a/b?c=1&p=10, which contains the following digital

* outlinks:

- * <a href=http: *a/b?c=1&p=20>3</a>

- * <a href=http: *a/b?c=1&p=30>4</a>

- * <a href=http: *a/b?c=1&p=40>5</a>

- * <a href=http: *a/b?c=1&p=all>single page</a>

+ * <a href=http://a/b?c=1&p=20>3</a>

+ * <a href=http://a/b?c=1&p=30>4</a>

+ * <a href=http://a/b?c=1&p=40>5</a>

+ * <a href=http://a/b?c=1&p=all>single page</a>

* This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so

- * guesses it is the page parameter. The associated page pattern is http: *a/b?c=1&p=[*!].

+ * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!].

* Then, this class extracts the single page based on page parameter info. The single page url is

- * http: *a/b?c=1&p=all.

+ * http://a/b?c=1&p=all.

public class PageParameterDetector {

- private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";

+ private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;

+ static final String PAGE_PARAM_PLACEHOLDER = "[*!]";

+ static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length();

+ static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;

+ static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;

+ /**

+ * The interface that page pattern handlers must implement to detect page parameter from

+ * potential pagination URLs.

+ */

+ interface PagePattern {

+ /**

+ * Returns the string of the URL page pattern.

+ */

+ String toString();

+ /**

+ * Returns the page number extracted from the URL during creation of object that implements

+ * this interface.

+ */

+ int getPageNumber();

+ /**

+ * Validates this page pattern according to the current document URL through a pipeline of

+ * rules.

+ *

+ * Returns true if page pattern is valid.

+ *

+ * @param docUrl the current document URL

+ */

+ boolean isValidFor(ParsedUrl docUrl);

+ /**

+ * Returns true if a URL matches this page pattern based on a pipeline of rules.

+ *

+ * @param url the URL to evalutate

+ */

+ boolean isPagingUrl(String url);

+ }

/**

* Stores information about the link (anchor) after the page parameter is detected:

@@ -70,29 +109,38 @@ public class PageParameterDetector {

mPageParamValue = pageParamValue;

mPosInAscendingList = posInAscendingList;

}

- } // LinkInfo

+ }

/**

* Stores a map of URL pattern to its associated list of LinkInfo's.

private static class PageCandidatesMap {

- private final Map<String, List<LinkInfo>> map = new HashMap<String, List<LinkInfo>>();

+ private static class Info {

+ private final PagePattern mPattern;

+ private final List<LinkInfo> mLinks;

+ Info(PagePattern pattern, LinkInfo link) {

+ mPattern = pattern;

+ mLinks = new ArrayList<LinkInfo>();

+ mLinks.add(link);

+ }

+ private final Map<String, Info> map = new HashMap<String, Info>();

/**

* Adds urlPattern with its LinkInfo into the map. If the urlPattern already exists, adds

* the link to the list of LinkInfo's. Otherwise, creates a new map entry.

- private void add(String urlPattern, LinkInfo link) {

- if (map.containsKey(urlPattern)) {

- map.get(urlPattern).add(link);

+ private void add(PagePattern pattern, LinkInfo link) {

+ final String patternStr = pattern.toString();

+ if (map.containsKey(patternStr)) {

+ map.get(patternStr).mLinks.add(link);

} else {

- List<LinkInfo> links = new ArrayList<LinkInfo>();

- links.add(link);

- map.put(urlPattern, links);

+ map.put(patternStr, new Info(pattern, link));

}

- } // PageCandidatesMap

+ }

// All the known bad page param names.

private static Set<String> sBadPageParamNames = null;

@@ -118,19 +166,13 @@ public class PageParameterDetector {

if (queryParams.length == 0) return; // No query.

for (String[] nameValue : queryParams) {

- final String queryName = nameValue[0];

- final String queryValue = nameValue[1];

- if (!queryName.isEmpty() && !queryValue.isEmpty() &&

- StringUtil.isStringAllDigits(queryValue) && !isPageParamNameBad(queryName)) {

- int value = StringUtil.toNumber(queryValue);

- if (value >= 0) {

- pageCandidates.add(

- url.replaceQueryValue(queryName, queryValue, PAGE_PARAM_PLACEHOLDER),

- new LinkInfo(pageNum, value, posInAscendingNumbers));

- }

+ PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0], nameValue[1]);

+ if (pattern != null) {

+ pageCandidates.add(pattern,

+ new LinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers));

}

- } // extractPageParamCandidatesFromQuery

+ }

private static RegExp sDigitsRegExp = null; // Match at least 1 digit.

@@ -166,61 +208,291 @@ public class PageParameterDetector {

final int matchEnd = sDigitsRegExp.getLastIndex();

final int matchStart = matchEnd - match.getGroup(1).length();

+ PagePattern pattern = PathComponentPagePattern.create(url, pathStart, matchStart,

+ matchEnd);

+ if (pattern != null) {

+ pageCandidates.add(pattern,

+ new LinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers));

+ }

+ } // while there're matches

+ }

- if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, matchEnd)) continue;

+ /**

+ * Evaluates if the given list of LinkInfo's is a list of paging URLs:

+ * - page numbers in list of LinkInfo's must be adjacent

+ * - page numbers in list of ascending numbers must either

+ * - be consecutive and form a page number sequence, or

+ * - must construct a linear map with a linear formula: page_parameter = a * page_number + b

+ * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must

+ * match page pattern, and the only outlink must be 2nd or 3rd page.

+ *

+ * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null.

+ *

+ * @param allLinkInfo the list of LinkInfo's to evaluate

+ * @param pagePattern the URL pattern to use

+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

+ * @param firstPageUrl the URL of the PageInfo with mPageNum=1

+ */

+ private static PageParamInfo getPageParamInfo(PagePattern pagePattern,

cjhopman 2015/04/16 21:58:32 I feel like we have to many SomethingInfo types he

cjhopman 2015/04/16 21:58:32 Would it make sense for this function to be in the

kuan 2015/04/20 23:11:13 i'm bad at names, and ran out of them :(

kuan 2015/04/20 23:11:13 Done. to avoid cross-access between PageParameter

+ List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNumbers,

+ String firstPageUrl) {

+ if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {

+ int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendingNumbers);

+ if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null;

+ PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormula(allLinkInfo);

- int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEnd));

- if (value >= 0) {

- pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_PLACEHOLDER +

- urlStr.substring(matchEnd),

- new LinkInfo(pageNum, value, posInAscendingNumbers));

+ // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecutive and of a page

+ // number sequence.

+ if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MASK) return null;

+ if (!isPageNumberSequence(ascendingNumbers)) return null;

+ PageParamInfo pageParamInfo = new PageParamInfo();

cjhopman 2015/04/16 21:58:32 Could we move some of the logic of creating the pa

kuan 2015/04/20 23:11:13 Done.

+ pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;

+ pageParamInfo.mFormula = linearFormula;

+ for (LinkInfo link : allLinkInfo) {

+ pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.mPageNum,

+ ascendingNumbers.get(link.mPosInAscendingList).mUrl));

}

- } // while there're matches

- } // extractPageParamCandidatesFromPath

+ return pageParamInfo;

+ }

+ // Most of news article have no more than 3 pages and the first page probably doesn't have

+ // any page parameter. If the first page url matches the the page pattern, we treat it as

+ // the first page of this pattern.

+ if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) {

+ final LinkInfo onlyLink = allLinkInfo.get(0);

+ boolean secondPageIsOutlink = onlyLink.mPageNum == 2 &&

+ onlyLink.mPosInAscendingList == 1;

+ boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 &&

+ onlyLink.mPosInAscendingList == 2 &&

+ // onlyLink's pos is 2 (evaluated right before), so ascendingNumbers has >= 3

+ // elements; check if previous element is previous page.

+ ascendingNumbers.get(1).mPageNum == 2;

+ // 1 LinkInfo means ascendingNumbers has >= 1 element.

+ if (ascendingNumbers.get(0).mPageNum == 1 &&

+ (secondPageIsOutlink || thirdPageIsOutlink) &&

+ pagePattern.isPagingUrl(firstPageUrl)) {

+ // Has valid PageParamInfo, create and populate it.

+ PageParamInfo pageParamInfo = new PageParamInfo();

+ pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;

+ int coefficient;

+ int delta = onlyLink.mPageParamValue - onlyLink.mPageNum;

+ if (delta == 0 || delta == 1) {

+ coefficient = 1;

+ } else {

+ coefficient = onlyLink.mPageParamValue;

+ delta = 0;

+ }

+ pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coefficient, delta);

+ pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, firstPageUrl));

+ pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLink.mPageNum,

+ ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl));

+ return pageParamInfo;

+ }

+ return null;

+ }

/**

* Returns true if given name is backlisted as a known bad page param name.

- private static boolean isPageParamNameBad(String name) {

+ static boolean isPageParamNameBad(String name) {

initBadPageParamNames();

return sBadPageParamNames.contains(name.toLowerCase());

- } // isPageParamNameBad

+ }

+ /**

+ * Detects if page numbers in list of LinkInfo's are adjacent, and if page numbers in list of

+ * PageParamInfo.PageInfo's are consecutive.

+ *

+ * For adjacency, the page numbers in list of LinkInfo's must either be adjacent, or separated

+ * by at most 1 plain text number which must represent the current page number in one of the

+ * PageParamInfo.PageInfo's.

+ * For consecutiveness, there must be at least one pair of consecutive number values in the list

+ * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are

+ * likely to be page size selection links (e.g. in the document "See 1-10, 11-20...").

+ *

+ * Returns a int value that is a combination of bits:

+ * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent

+ * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive.

+ *

+ * @param allLinkInfo the list of LinkInfo's to evaluate

+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

+ */

+ static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo,

+ List<PageParamInfo.PageInfo> ascendingNumbers) {

+ int result = 0;

+ // Check if elements in allLinkInfo are adjacent or there's only 1 gap i.e. the gap is

+ // current page number respresented in plain text.

+ int firstPos = -1;

+ int lastPos = -1;

+ int gapPos = -1;

+ Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that page number is unique.

+ for (LinkInfo linkInfo : allLinkInfo) {

+ final int currPos = linkInfo.mPosInAscendingList;

+ if (lastPos == -1) {

+ firstPos = currPos;

+ } else if (currPos != lastPos + 1) {

+ // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6

+ // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), allLinkInfo is not

+ // adjacent.

+ if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1) return result;

+ gapPos = currPos - 1;

+ }

+ // Make sure page param value, i.e. page number represented in plain text, is unique.

+ if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;

+ lastPos = currPos;

+ } // for all LinkInfo's

+ result |= PAGE_NUM_ADJACENT_MASK;

- private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).

- private static RegExp sLastPathComponentRegExp = null; // Match last path component.

+ // Now, determine if page numbers in ascendingNumbers are consecutive.

+ // First, handle the gap.

+ if (gapPos != -1) {

+ if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return result;

+ // The "gap" should represent current page number in plain text.

+ // Check if its adjacent page numbers are consecutive.

+ // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.

+ // This can eliminate links affecting the number of items on a page.

+ final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;

+ if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&

+ ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {

+ return result | PAGE_NUM_CONSECUTIVE_MASK;

+ }

+ return result;

+ }

+ // There is no gap. Check if at least one of the following cases is satisfied:

+ // Case #1: "[1] [2] ..." or "1 [2] ... ".

+ if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&

+ ascendingNumbers.get(1).mPageNum == 2) {

+ return result | PAGE_NUM_CONSECUTIVE_MASK;

+ }

+ // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.

+ if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&

+ ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(0).mUrl.isEmpty()) {

+ return result | PAGE_NUM_CONSECUTIVE_MASK;

+ }

+ // Case #3: "... [n-1] [n]" or "... [n - 1] n".

+ final int numbersSize = ascendingNumbers.size();

+ if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) &&

+ ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==

+ ascendingNumbers.get(numbersSize - 1).mPageNum) {

+ return result | PAGE_NUM_CONSECUTIVE_MASK;

+ }

+ // Case #4: "... [i-1] [i] [i+1] ...".

+ for (int i = firstPos + 1; i < lastPos; i++) {

+ if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get(i + 1).mPageNum) {

+ return result | PAGE_NUM_CONSECUTIVE_MASK;

+ }

+ // Otherwise, there's no pair of consecutive values.

+ return result;

+ }

/**

- * Returns true if:

- * - the digitStart to digitEnd of urlStr is the last path component, and

- * - the entire path component is numeric, and

- * - the previous path component is a bad page param name.

- * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad

- * page param.

+ *

+ * Determines if the list of LinkInfo's form a linear formula:

+ * pageParamValue = coefficient * pageNum + delta (delta == -coefficient or delta == 0).

+ *

+ * The coefficient and delta are calculated from the page parameter values and page numbers of 2

+ * LinkInfo's, and then validated against the remaining LinkInfo's.

+ * The order of page numbers doesn't matter.

+ *

+ * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page

+ * parameter forumla could be determined. Otherwise, returns null.

+ *

+ * @param allLinkInfo the list of LinkInfo's to evaluate

- static boolean isLastNumericPathComponentBad(String urlStr, int pathStart,

- int digitStart, int digitEnd) {

- if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.

- pathStart < digitStart - 1) { // Not the first path component.

- String postMatch = urlStr.substring(digitEnd).toLowerCase();

- // Checks that this is the last path component, and trailing characters, if available,

- // are (s)htm(l) extensions.

- if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");

- if (sExtRegExp.test(postMatch)) {

- // Entire component is numeric, get previous path component.

- if (sLastPathComponentRegExp == null) {

- sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i");

- }

- MatchResult prevPathComponent = sLastPathComponentRegExp.exec(

- urlStr.substring(pathStart + 1, digitStart));

- if (prevPathComponent != null && prevPathComponent.getGroupCount() > 1 &&

- isPageParamNameBad(prevPathComponent.getGroup(1))) {

- return true;

- }

- } // last numeric path component

+ // TODO(kuan): As this gets rolled out, reassesss the necessity of non-1 coefficient support.

+ private static PageParamInfo.LinearFormula getPageParamLinearFormula(

+ List<LinkInfo> allLinkInfo) {

+ if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;

+ final LinkInfo firstLink = allLinkInfo.get(0);

+ final LinkInfo secondLink = allLinkInfo.get(1);

+ if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.mPageNum) > 4) {

+ return null;

}

- return false;

- } // isLastNumericPathComponentBad

+ int deltaX = secondLink.mPageNum - firstLink.mPageNum;

+ if (deltaX == 0) return null;

+ int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;

+ int coefficient = deltaY / deltaX;

+ if (coefficient == 0) return null;

+ int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum;

+ if (delta != 0 && delta != -coefficient) return null;

+ // Check if the remaining elements are on the same linear map.

+ for (int i = 2; i < allLinkInfo.size(); i++) {

+ final LinkInfo link = allLinkInfo.get(i);

+ if (link.mPageParamValue != coefficient * link.mPageNum + delta) return null;

+ }

+ return new PageParamInfo.LinearFormula(coefficient, delta);

+ }

+ /**

+ * Returns true if page numbers in list of PageParamInfo.PageInfo's form a sequence, based on

+ * a pipeline of rules:

+ * - first PageInfo must have a URL unless it is the first page

+ * - there's only one plain number without URL in list

+ * - if only two pages, they must be siblings - 2nd page number must follow 1st

+ * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecutive numbers must be

+ * head/tail or have URLs.

+ *

+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

+ */

+ private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> ascendingNumbers) {

+ if (ascendingNumbers.size() <= 1) return false;

+ // The first one must have a URL unless it is the first page.

+ final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0);

+ if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;

+ // There's only one plain number without URL in ascending numbers group.

+ boolean hasPlainNum = false;

+ for (PageParamInfo.PageInfo page : ascendingNumbers) {

+ if (page.mUrl.isEmpty()) {

+ if (hasPlainNum) return false;

+ hasPlainNum = true;

+ }

+ // If there are only two pages, they must be siblings.

+ if (ascendingNumbers.size() == 2) {

+ return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;

+ }

+ // Check if page numbers in ascendingNumbers are adjacent and consecutive.

+ for (int i = 1; i < ascendingNumbers.size(); i++) {

+ // If two adjacent numbers are not consecutive, we accept them only when:

+ // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2], [3]...[i], [n].

+ // 2) both of them have URLs.

+ final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i);

+ final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1);

+ if (currPage.mPageNum - prevPage.mPageNum != 1) {

+ if (i != 1 && i != ascendingNumbers.size() - 1) return false;

+ if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return false;

+ }

+ return true;

+ }

+ /**

+ * Returns true if given string can be converted to a number >= 0.

+ */

+ static boolean isPlainNumber(String str) {

+ return StringUtil.toNumber(str) >= 0;

+ }

/**

* If sBadPageParamNames is null, initialize it with all the known bad page param names, in

@@ -259,6 +531,6 @@ public class PageParameterDetector {

sBadPageParamNames.add("videos");

sBadPageParamNames.add("w");

sBadPageParamNames.add("wiki");

- } // initBadPageParamNames

+ }

}