| Index: java/org/chromium/distiller/PageParameterDetector.java
|
| diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java
|
| index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..5f0d03e3928c1bf249d46d4b69d9ceb12fa31169 100644
|
| --- a/java/org/chromium/distiller/PageParameterDetector.java
|
| +++ b/java/org/chromium/distiller/PageParameterDetector.java
|
| @@ -1,4 +1,4 @@
|
| -// Copyright 2016 The Chromium Authors. All rights reserved.
|
| +// Copyright 2015 The Chromium Authors. All rights reserved.
|
| // Use of this source code is governed by a BSD-style license that can be
|
| // found in the LICENSE file.
|
|
|
| @@ -20,17 +20,15 @@ import java.util.Set;
|
| * Background:
|
| * The long article/news/forum thread/blog document may be partitioned into several partial pages
|
| * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The
|
| - * anchor text of those outlinks is numeric. Meanwhile, there may be a page which contains the
|
| - * whole content, called "single page".
|
| + * anchor text of those outlinks is numeric.
|
| *
|
| * Definitions:
|
| - * A single page document is a document that contains the whole content.
|
| * A paging document is one of the partial pages.
|
| * "digital" means the text contains only digits.
|
| * A page pattern is a paging URL whose page parameter value is replaced with a place holder
|
| * (PAGE_PARAM_PLACEHOLDER).
|
| - * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pattern is
|
| - * "http: *www.foo.com/a/b-[*!].html".
|
| + * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is
|
| + * "http://www.foo.com/a/b-[*!].html".
|
| *
|
| * This class extracts the page parameter from a document's outlinks.
|
| * The basic idea:
|
| @@ -39,60 +37,82 @@ import java.util.Set;
|
| * (either a query value or a path component) in URL. If one part of a URL is always a linear
|
| * map from its digital anchor text, we guess the part is the page parameter of the URL.
|
| *
|
| - * As an example, consider a document http: *a/b?c=1&p=10, which contains the following digital
|
| + * As an example, consider a document http://a/b?c=1&p=10, which contains the following digital
|
| * outlinks:
|
| - * <a href=http: *a/b?c=1&p=20>3</a>
|
| - * <a href=http: *a/b?c=1&p=30>4</a>
|
| - * <a href=http: *a/b?c=1&p=40>5</a>
|
| - * <a href=http: *a/b?c=1&p=all>single page</a>
|
| + * <a href=http://a/b?c=1&p=20>3</a>
|
| + * <a href=http://a/b?c=1&p=30>4</a>
|
| + * <a href=http://a/b?c=1&p=40>5</a>
|
| * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so
|
| - * guesses it is the page parameter. The associated page pattern is http: *a/b?c=1&p=[*!].
|
| - * Then, this class extracts the single page based on page parameter info. The single page url is
|
| - * http: *a/b?c=1&p=all.
|
| + * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!].
|
| */
|
| public class PageParameterDetector {
|
| - private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
|
| + static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
|
| + static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length();
|
|
|
| /**
|
| - * Stores information about the link (anchor) after the page parameter is detected:
|
| - * - the page number (as represented by the original plain text) for the link
|
| - * - the original page parameter numeric component in the URL (this component would be replaced
|
| - * by PAGE_PARAM_PLACEHOLDER in the URL pattern)
|
| - * - the position of this link in the list of ascending numbers.
|
| + * The interface that page pattern handlers must implement to detect page parameter from
|
| + * potential pagination URLs.
|
| */
|
| - static class LinkInfo {
|
| - private int mPageNum;
|
| - private int mPageParamValue;
|
| - private int mPosInAscendingList;
|
| + interface PagePattern {
|
| + /**
|
| + * Returns the string of the URL page pattern.
|
| + */
|
| + String toString();
|
| +
|
| + /**
|
| + * Returns the page number extracted from the URL during creation of the object that
|
| + * implements this interface.
|
| + */
|
| + int getPageNumber();
|
| +
|
| + /**
|
| + * Validates this page pattern according to the current document URL through a pipeline of
|
| + * rules.
|
| + *
|
| + * Returns true if page pattern is valid.
|
| + *
|
| + * @param docUrl the current document URL
|
| + */
|
| + boolean isValidFor(ParsedUrl docUrl);
|
|
|
| - LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) {
|
| - mPageNum = pageNum;
|
| - mPageParamValue = pageParamValue;
|
| - mPosInAscendingList = posInAscendingList;
|
| - }
|
| - } // LinkInfo
|
| + /**
|
| + * Returns true if a URL matches this page pattern based on a pipeline of rules.
|
| + *
|
| + * @param url the URL to evaluate
|
| + */
|
| + boolean isPagingUrl(String url);
|
| + }
|
|
|
| /**
|
| - * Stores a map of URL pattern to its associated list of LinkInfo's.
|
| + * Stores a map of URL pattern to its associated list of PageLinkInfo's.
|
| */
|
| private static class PageCandidatesMap {
|
| - private final Map<String, List<LinkInfo>> map = new HashMap<String, List<LinkInfo>>();
|
| + private static class Info {
|
| + private final PagePattern mPattern;
|
| + private final List<PageLinkInfo> mLinks;
|
| +
|
| + Info(PagePattern pattern, PageLinkInfo link) {
|
| + mPattern = pattern;
|
| + mLinks = new ArrayList<PageLinkInfo>();
|
| + mLinks.add(link);
|
| + }
|
| + }
|
| +
|
| + private final Map<String, Info> map = new HashMap<String, Info>();
|
|
|
| /**
|
| - * Adds urlPattern with its LinkInfo into the map. If the urlPattern already exists, adds
|
| - * the link to the list of LinkInfo's. Otherwise, creates a new map entry.
|
| + * Adds urlPattern with its PageLinkInfo into the map. If the urlPattern already exists,
|
| + * adds the link to the list of PageLinkInfo's. Otherwise, creates a new map entry.
|
| */
|
| - private void add(String urlPattern, LinkInfo link) {
|
| - if (map.containsKey(urlPattern)) {
|
| - map.get(urlPattern).add(link);
|
| + private void add(PagePattern pattern, PageLinkInfo link) {
|
| + final String patternStr = pattern.toString();
|
| + if (map.containsKey(patternStr)) {
|
| + map.get(patternStr).mLinks.add(link);
|
| } else {
|
| - List<LinkInfo> links = new ArrayList<LinkInfo>();
|
| - links.add(link);
|
| - map.put(urlPattern, links);
|
| + map.put(patternStr, new Info(pattern, link));
|
| }
|
| }
|
| -
|
| - } // PageCandidatesMap
|
| + }
|
|
|
| // All the known bad page param names.
|
| private static Set<String> sBadPageParamNames = null;
|
| @@ -105,12 +125,12 @@ public class PageParameterDetector {
|
| * - the name of a query name-value component is not one of sBadPageParamNames, and
|
| * - the value of the query component is a plain number (>= 0).
|
| * E.g. a URL query with 3 plain number query values will generate 3 URL page patterns with 3
|
| - * LinkInfo's, and hence 3 page parameter candidates.
|
| + * PageLinkInfo's, and hence 3 page parameter candidates.
|
| *
|
| * @param url ParsedUrl of the URL to process
|
| * @param pageNum the page number as represented in original plain text
|
| * @param posInAscendingNumbers position of this page number in the list of ascending numbers
|
| - * @param pageCandidates the map of URL pattern to its associated list of LinkInfo's
|
| + * @param pageCandidates the map of URL pattern to its associated list of PageLinkInfo's
|
| */
|
| private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int pageNum,
|
| int posInAscendingNumbers, PageCandidatesMap pageCandidates) {
|
| @@ -118,19 +138,13 @@ public class PageParameterDetector {
|
| if (queryParams.length == 0) return; // No query.
|
|
|
| for (String[] nameValue : queryParams) {
|
| - final String queryName = nameValue[0];
|
| - final String queryValue = nameValue[1];
|
| - if (!queryName.isEmpty() && !queryValue.isEmpty() &&
|
| - StringUtil.isStringAllDigits(queryValue) && !isPageParamNameBad(queryName)) {
|
| - int value = StringUtil.toNumber(queryValue);
|
| - if (value >= 0) {
|
| - pageCandidates.add(
|
| - url.replaceQueryValue(queryName, queryValue, PAGE_PARAM_PLACEHOLDER),
|
| - new LinkInfo(pageNum, value, posInAscendingNumbers));
|
| - }
|
| + PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0], nameValue[1]);
|
| + if (pattern != null) {
|
| + pageCandidates.add(pattern,
|
| + new PageLinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers));
|
| }
|
| }
|
| - } // extractPageParamCandidatesFromQuery
|
| + }
|
|
|
| private static RegExp sDigitsRegExp = null; // Match at least 1 digit.
|
|
|
| @@ -141,12 +155,12 @@ public class PageParameterDetector {
|
| * A page parameter candidate is one where a path component contains consecutive digits which
|
| * can be converted to a plain number (>= 0).
|
| * E.g. a URL path with 3 path components that contain plain numbers will generate 3 URL page
|
| - * patterns with 3 LinkInfo's, and hence 3 page parameter candidates.
|
| + * patterns with 3 PageLinkInfo's, and hence 3 page parameter candidates.
|
| *
|
| * @param url ParsedUrl of the URL to process
|
| * @param pageNum the page number as represented in original plain text
|
| * @param posInAscendingNumbers position of this page number in the list of ascending numbers
|
| - * @param pageCandidates the map of URL pattern to its associated list of LinkInfo's
|
| + * @param pageCandidates the map of URL pattern to its associated list of PageLinkInfo's
|
| */
|
|
|
| private static void extractPageParamCandidatesFromPath(ParsedUrl url, int pageNum,
|
| @@ -166,61 +180,29 @@ public class PageParameterDetector {
|
|
|
| final int matchEnd = sDigitsRegExp.getLastIndex();
|
| final int matchStart = matchEnd - match.getGroup(1).length();
|
| -
|
| - if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, matchEnd)) continue;
|
| -
|
| - int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEnd));
|
| - if (value >= 0) {
|
| - pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_PLACEHOLDER +
|
| - urlStr.substring(matchEnd),
|
| - new LinkInfo(pageNum, value, posInAscendingNumbers));
|
| + PagePattern pattern = PathComponentPagePattern.create(url, pathStart, matchStart,
|
| + matchEnd);
|
| + if (pattern != null) {
|
| + pageCandidates.add(pattern,
|
| + new PageLinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers));
|
| }
|
| } // while there're matches
|
| - } // extractPageParamCandidatesFromPath
|
| + }
|
|
|
| /**
|
| * Returns true if given name is backlisted as a known bad page param name.
|
| */
|
| - private static boolean isPageParamNameBad(String name) {
|
| + static boolean isPageParamNameBad(String name) {
|
| initBadPageParamNames();
|
| return sBadPageParamNames.contains(name.toLowerCase());
|
| - } // isPageParamNameBad
|
| -
|
| - private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).
|
| - private static RegExp sLastPathComponentRegExp = null; // Match last path component.
|
| + }
|
|
|
| /**
|
| - * Returns true if:
|
| - * - the digitStart to digitEnd of urlStr is the last path component, and
|
| - * - the entire path component is numeric, and
|
| - * - the previous path component is a bad page param name.
|
| - * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad
|
| - * page param.
|
| + * Returns true if given string can be converted to a number >= 0.
|
| */
|
| - static boolean isLastNumericPathComponentBad(String urlStr, int pathStart,
|
| - int digitStart, int digitEnd) {
|
| - if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
|
| - pathStart < digitStart - 1) { // Not the first path component.
|
| - String postMatch = urlStr.substring(digitEnd).toLowerCase();
|
| - // Checks that this is the last path component, and trailing characters, if available,
|
| - // are (s)htm(l) extensions.
|
| - if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");
|
| - if (sExtRegExp.test(postMatch)) {
|
| - // Entire component is numeric, get previous path component.
|
| - if (sLastPathComponentRegExp == null) {
|
| - sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i");
|
| - }
|
| - MatchResult prevPathComponent = sLastPathComponentRegExp.exec(
|
| - urlStr.substring(pathStart + 1, digitStart));
|
| - if (prevPathComponent != null && prevPathComponent.getGroupCount() > 1 &&
|
| - isPageParamNameBad(prevPathComponent.getGroup(1))) {
|
| - return true;
|
| - }
|
| - } // last numeric path component
|
| - }
|
| -
|
| - return false;
|
| - } // isLastNumericPathComponentBad
|
| + static boolean isPlainNumber(String str) {
|
| + return StringUtil.toNumber(str) >= 0;
|
| + }
|
|
|
| /**
|
| * If sBadPageParamNames is null, initialize it with all the known bad page param names, in
|
| @@ -259,6 +241,6 @@ public class PageParameterDetector {
|
| sBadPageParamNames.add("videos");
|
| sBadPageParamNames.add("w");
|
| sBadPageParamNames.add("wiki");
|
| - } // initBadPageParamNames
|
| + }
|
|
|
| }
|
|
|