Index: java/org/chromium/distiller/PageParameterDetector.java |
diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java |
index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..5f0d03e3928c1bf249d46d4b69d9ceb12fa31169 100644 |
--- a/java/org/chromium/distiller/PageParameterDetector.java |
+++ b/java/org/chromium/distiller/PageParameterDetector.java |
@@ -1,4 +1,4 @@ |
-// Copyright 2016 The Chromium Authors. All rights reserved. |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
// Use of this source code is governed by a BSD-style license that can be |
// found in the LICENSE file. |
@@ -20,17 +20,15 @@ import java.util.Set; |
* Background: |
* The long article/news/forum thread/blog document may be partitioned into several partial pages |
* by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The |
- * anchor text of those outlinks is numeric. Meanwhile, there may be a page which contains the |
- * whole content, called "single page". |
+ * anchor text of those outlinks is numeric. |
* |
* Definitions: |
- * A single page document is a document that contains the whole content. |
* A paging document is one of the partial pages. |
* "digital" means the text contains only digits. |
* A page pattern is a paging URL whose page parameter value is replaced with a place holder |
* (PAGE_PARAM_PLACEHOLDER). |
- * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pattern is |
- * "http: *www.foo.com/a/b-[*!].html". |
+ * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is |
+ * "http://www.foo.com/a/b-[*!].html". |
* |
* This class extracts the page parameter from a document's outlinks. |
* The basic idea: |
@@ -39,60 +37,82 @@ import java.util.Set; |
* (either a query value or a path component) in URL. If one part of a URL is always a linear |
* map from its digital anchor text, we guess the part is the page parameter of the URL. |
* |
- * As an example, consider a document http: *a/b?c=1&p=10, which contains the following digital |
+ * As an example, consider a document http://a/b?c=1&p=10, which contains the following digital |
* outlinks: |
- * <a href=http: *a/b?c=1&p=20>3</a> |
- * <a href=http: *a/b?c=1&p=30>4</a> |
- * <a href=http: *a/b?c=1&p=40>5</a> |
- * <a href=http: *a/b?c=1&p=all>single page</a> |
+ * <a href=http://a/b?c=1&p=20>3</a> |
+ * <a href=http://a/b?c=1&p=30>4</a> |
+ * <a href=http://a/b?c=1&p=40>5</a> |
* This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so |
- * guesses it is the page parameter. The associated page pattern is http: *a/b?c=1&p=[*!]. |
- * Then, this class extracts the single page based on page parameter info. The single page url is |
- * http: *a/b?c=1&p=all. |
+ * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!]. |
*/ |
public class PageParameterDetector { |
- private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
+ static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
+ static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length(); |
/** |
- * Stores information about the link (anchor) after the page parameter is detected: |
- * - the page number (as represented by the original plain text) for the link |
- * - the original page parameter numeric component in the URL (this component would be replaced |
- * by PAGE_PARAM_PLACEHOLDER in the URL pattern) |
- * - the position of this link in the list of ascending numbers. |
+ * The interface that page pattern handlers must implement to detect page parameter from |
+ * potential pagination URLs. |
*/ |
- static class LinkInfo { |
- private int mPageNum; |
- private int mPageParamValue; |
- private int mPosInAscendingList; |
+ interface PagePattern { |
+ /** |
+ * Returns the string of the URL page pattern. |
+ */ |
+ String toString(); |
+ |
+ /** |
+ * Returns the page number extracted from the URL during creation of object that implements |
+ * this interface. |
+ */ |
+ int getPageNumber(); |
+ |
+ /** |
+ * Validates this page pattern according to the current document URL through a pipeline of |
+ * rules. |
+ * |
+ * Returns true if page pattern is valid. |
+ * |
+ * @param docUrl the current document URL |
+ */ |
+ boolean isValidFor(ParsedUrl docUrl); |
- LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { |
- mPageNum = pageNum; |
- mPageParamValue = pageParamValue; |
- mPosInAscendingList = posInAscendingList; |
- } |
- } // LinkInfo |
+ /** |
+ * Returns true if a URL matches this page pattern based on a pipeline of rules. |
+ * |
+ * @param url the URL to evaluate |
+ */ |
+ boolean isPagingUrl(String url); |
+ } |
/** |
- * Stores a map of URL pattern to its associated list of LinkInfo's. |
+ * Stores a map of URL pattern to its associated list of PageLinkInfo's. |
*/ |
private static class PageCandidatesMap { |
- private final Map<String, List<LinkInfo>> map = new HashMap<String, List<LinkInfo>>(); |
+ private static class Info { |
+ private final PagePattern mPattern; |
+ private final List<PageLinkInfo> mLinks; |
+ |
+ Info(PagePattern pattern, PageLinkInfo link) { |
+ mPattern = pattern; |
+ mLinks = new ArrayList<PageLinkInfo>(); |
+ mLinks.add(link); |
+ } |
+ } |
+ |
+ private final Map<String, Info> map = new HashMap<String, Info>(); |
/** |
- * Adds urlPattern with its LinkInfo into the map. If the urlPattern already exists, adds |
- * the link to the list of LinkInfo's. Otherwise, creates a new map entry. |
+ * Adds urlPattern with its PageLinkInfo into the map. If the urlPattern already exists, |
+ * adds the link to the list of PageLinkInfo's. Otherwise, creates a new map entry. |
*/ |
- private void add(String urlPattern, LinkInfo link) { |
- if (map.containsKey(urlPattern)) { |
- map.get(urlPattern).add(link); |
+ private void add(PagePattern pattern, PageLinkInfo link) { |
+ final String patternStr = pattern.toString(); |
+ if (map.containsKey(patternStr)) { |
+ map.get(patternStr).mLinks.add(link); |
} else { |
- List<LinkInfo> links = new ArrayList<LinkInfo>(); |
- links.add(link); |
- map.put(urlPattern, links); |
+ map.put(patternStr, new Info(pattern, link)); |
} |
} |
- |
- } // PageCandidatesMap |
+ } |
// All the known bad page param names. |
private static Set<String> sBadPageParamNames = null; |
@@ -105,12 +125,12 @@ public class PageParameterDetector { |
* - the name of a query name-value component is not one of sBadPageParamNames, and |
* - the value of the query component is a plain number (>= 0). |
* E.g. a URL query with 3 plain number query values will generate 3 URL page patterns with 3 |
- * LinkInfo's, and hence 3 page parameter candidates. |
+ * PageLinkInfo's, and hence 3 page parameter candidates. |
* |
* @param url ParsedUrl of the URL to process |
* @param pageNum the page number as represented in original plain text |
* @param posInAscendingNumbers position of this page number in the list of ascending numbers |
- * @param pageCandidates the map of URL pattern to its associated list of LinkInfo's |
+ * @param pageCandidates the map of URL pattern to its associated list of PageLinkInfo's |
*/ |
private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int pageNum, |
int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
@@ -118,19 +138,13 @@ public class PageParameterDetector { |
if (queryParams.length == 0) return; // No query. |
for (String[] nameValue : queryParams) { |
- final String queryName = nameValue[0]; |
- final String queryValue = nameValue[1]; |
- if (!queryName.isEmpty() && !queryValue.isEmpty() && |
- StringUtil.isStringAllDigits(queryValue) && !isPageParamNameBad(queryName)) { |
- int value = StringUtil.toNumber(queryValue); |
- if (value >= 0) { |
- pageCandidates.add( |
- url.replaceQueryValue(queryName, queryValue, PAGE_PARAM_PLACEHOLDER), |
- new LinkInfo(pageNum, value, posInAscendingNumbers)); |
- } |
+ PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0], nameValue[1]); |
+ if (pattern != null) { |
+ pageCandidates.add(pattern, |
+ new PageLinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers)); |
} |
} |
- } // extractPageParamCandidatesFromQuery |
+ } |
private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
@@ -141,12 +155,12 @@ public class PageParameterDetector { |
* A page parameter candidate is one where a path component contains consecutive digits which |
* can be converted to a plain number (>= 0). |
* E.g. a URL path with 3 path components that contain plain numbers will generate 3 URL page |
- * patterns with 3 LinkInfo's, and hence 3 page parameter candidates. |
+ * patterns with 3 PageLinkInfo's, and hence 3 page parameter candidates. |
* |
* @param url ParsedUrl of the URL to process |
* @param pageNum the page number as represented in original plain text |
* @param posInAscendingNumbers position of this page number in the list of ascending numbers |
- * @param pageCandidates the map of URL pattern to its associated list of LinkInfo's |
+ * @param pageCandidates the map of URL pattern to its associated list of PageLinkInfo's |
*/ |
private static void extractPageParamCandidatesFromPath(ParsedUrl url, int pageNum, |
@@ -166,61 +180,29 @@ public class PageParameterDetector { |
final int matchEnd = sDigitsRegExp.getLastIndex(); |
final int matchStart = matchEnd - match.getGroup(1).length(); |
- |
- if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, matchEnd)) continue; |
- |
- int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEnd)); |
- if (value >= 0) { |
- pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_PLACEHOLDER + |
- urlStr.substring(matchEnd), |
- new LinkInfo(pageNum, value, posInAscendingNumbers)); |
+ PagePattern pattern = PathComponentPagePattern.create(url, pathStart, matchStart, |
+ matchEnd); |
+ if (pattern != null) { |
+ pageCandidates.add(pattern, |
+ new PageLinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers)); |
} |
} // while there're matches |
- } // extractPageParamCandidatesFromPath |
+ } |
/** |
* Returns true if given name is backlisted as a known bad page param name. |
*/ |
- private static boolean isPageParamNameBad(String name) { |
+ static boolean isPageParamNameBad(String name) { |
initBadPageParamNames(); |
return sBadPageParamNames.contains(name.toLowerCase()); |
- } // isPageParamNameBad |
- |
- private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). |
- private static RegExp sLastPathComponentRegExp = null; // Match last path component. |
+ } |
/** |
- * Returns true if: |
- * - the digitStart to digitEnd of urlStr is the last path component, and |
- * - the entire path component is numeric, and |
- * - the previous path component is a bad page param name. |
- * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad |
- * page param. |
+ * Returns true if given string can be converted to a number >= 0. |
*/ |
- static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, |
- int digitStart, int digitEnd) { |
- if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. |
- pathStart < digitStart - 1) { // Not the first path component. |
- String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
- // Checks that this is the last path component, and trailing characters, if available, |
- // are (s)htm(l) extensions. |
- if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); |
- if (sExtRegExp.test(postMatch)) { |
- // Entire component is numeric, get previous path component. |
- if (sLastPathComponentRegExp == null) { |
- sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i"); |
- } |
- MatchResult prevPathComponent = sLastPathComponentRegExp.exec( |
- urlStr.substring(pathStart + 1, digitStart)); |
- if (prevPathComponent != null && prevPathComponent.getGroupCount() > 1 && |
- isPageParamNameBad(prevPathComponent.getGroup(1))) { |
- return true; |
- } |
- } // last numeric path component |
- } |
- |
- return false; |
- } // isLastNumericPathComponentBad |
+ static boolean isPlainNumber(String str) { |
+ return StringUtil.toNumber(str) >= 0; |
+ } |
/** |
* If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
@@ -259,6 +241,6 @@ public class PageParameterDetector { |
sBadPageParamNames.add("videos"); |
sBadPageParamNames.add("w"); |
sBadPageParamNames.add("wiki"); |
- } // initBadPageParamNames |
+ } |
} |