Index: java/org/chromium/distiller/PageParameterDetector.java |
diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java |
index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..4149b8fce8d88098cfeeadc93a8e6154a5a75f33 100644 |
--- a/java/org/chromium/distiller/PageParameterDetector.java |
+++ b/java/org/chromium/distiller/PageParameterDetector.java |
@@ -1,4 +1,4 @@ |
-// Copyright 2016 The Chromium Authors. All rights reserved. |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
// Use of this source code is governed by a BSD-style license that can be |
// found in the LICENSE file. |
@@ -29,8 +29,8 @@ import java.util.Set; |
* "digital" means the text contains only digits. |
* A page pattern is a paging URL whose page parameter value is replaced with a place holder |
* (PAGE_PARAM_PLACEHOLDER). |
- * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pattern is |
- * "http: *www.foo.com/a/b-[*!].html". |
+ * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is |
+ * "http://www.foo.com/a/b-[*!].html". |
* |
* This class extracts the page parameter from a document's outlinks. |
* The basic idea: |
@@ -39,19 +39,58 @@ import java.util.Set; |
* (either a query value or a path component) in URL. If one part of a URL is always a linear |
* map from its digital anchor text, we guess the part is the page parameter of the URL. |
* |
- * As an example, consider a document http: *a/b?c=1&p=10, which contains the following digital |
+ * As an example, consider a document http://a/b?c=1&p=10, which contains the following digital |
* outlinks: |
- * <a href=http: *a/b?c=1&p=20>3</a> |
- * <a href=http: *a/b?c=1&p=30>4</a> |
- * <a href=http: *a/b?c=1&p=40>5</a> |
- * <a href=http: *a/b?c=1&p=all>single page</a> |
+ * <a href=http://a/b?c=1&p=20>3</a> |
+ * <a href=http://a/b?c=1&p=30>4</a> |
+ * <a href=http://a/b?c=1&p=40>5</a> |
+ * <a href=http://a/b?c=1&p=all>single page</a> |
* This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so |
- * guesses it is the page parameter. The associated page pattern is http: *a/b?c=1&p=[*!]. |
+ * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!]. |
* Then, this class extracts the single page based on page parameter info. The single page url is |
- * http: *a/b?c=1&p=all. |
+ * http://a/b?c=1&p=all. |
*/ |
public class PageParameterDetector { |
- private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
+ private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; |
+ |
+ static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
+ static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length(); |
+ static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; |
+ static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; |
+ |
+ /** |
+ * The interface that page pattern handlers must implement to detect page parameter from |
+ * potential pagination URLs. |
+ */ |
+ interface PagePattern { |
+ /** |
+ * Returns the string of the URL page pattern. |
+ */ |
+ String toString(); |
+ |
+ /** |
+ * Returns the page number extracted from the URL during creation of object that implements |
+ * this interface. |
+ */ |
+ int getPageNumber(); |
+ |
+ /** |
+ * Validates this page pattern according to the current document URL through a pipeline of |
+ * rules. |
+ * |
+ * Returns true if page pattern is valid. |
+ * |
+ * @param docUrl the current document URL |
+ */ |
+ boolean isValidFor(ParsedUrl docUrl); |
+ |
+ /** |
+ * Returns true if a URL matches this page pattern based on a pipeline of rules. |
+ * |
+ * @param url the URL to evalutate |
+ */ |
+ boolean isPagingUrl(String url); |
+ } |
/** |
* Stores information about the link (anchor) after the page parameter is detected: |
@@ -70,29 +109,38 @@ public class PageParameterDetector { |
mPageParamValue = pageParamValue; |
mPosInAscendingList = posInAscendingList; |
} |
- } // LinkInfo |
+ } |
/** |
* Stores a map of URL pattern to its associated list of LinkInfo's. |
*/ |
private static class PageCandidatesMap { |
- private final Map<String, List<LinkInfo>> map = new HashMap<String, List<LinkInfo>>(); |
+ private static class Info { |
+ private final PagePattern mPattern; |
+ private final List<LinkInfo> mLinks; |
+ |
+ Info(PagePattern pattern, LinkInfo link) { |
+ mPattern = pattern; |
+ mLinks = new ArrayList<LinkInfo>(); |
+ mLinks.add(link); |
+ } |
+ } |
+ |
+ private final Map<String, Info> map = new HashMap<String, Info>(); |
/** |
* Adds urlPattern with its LinkInfo into the map. If the urlPattern already exists, adds |
* the link to the list of LinkInfo's. Otherwise, creates a new map entry. |
*/ |
- private void add(String urlPattern, LinkInfo link) { |
- if (map.containsKey(urlPattern)) { |
- map.get(urlPattern).add(link); |
+ private void add(PagePattern pattern, LinkInfo link) { |
+ final String patternStr = pattern.toString(); |
+ if (map.containsKey(patternStr)) { |
+ map.get(patternStr).mLinks.add(link); |
} else { |
- List<LinkInfo> links = new ArrayList<LinkInfo>(); |
- links.add(link); |
- map.put(urlPattern, links); |
+ map.put(patternStr, new Info(pattern, link)); |
} |
} |
- |
- } // PageCandidatesMap |
+ } |
// All the known bad page param names. |
private static Set<String> sBadPageParamNames = null; |
@@ -118,19 +166,13 @@ public class PageParameterDetector { |
if (queryParams.length == 0) return; // No query. |
for (String[] nameValue : queryParams) { |
- final String queryName = nameValue[0]; |
- final String queryValue = nameValue[1]; |
- if (!queryName.isEmpty() && !queryValue.isEmpty() && |
- StringUtil.isStringAllDigits(queryValue) && !isPageParamNameBad(queryName)) { |
- int value = StringUtil.toNumber(queryValue); |
- if (value >= 0) { |
- pageCandidates.add( |
- url.replaceQueryValue(queryName, queryValue, PAGE_PARAM_PLACEHOLDER), |
- new LinkInfo(pageNum, value, posInAscendingNumbers)); |
- } |
+ PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0], nameValue[1]); |
+ if (pattern != null) { |
+ pageCandidates.add(pattern, |
+ new LinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers)); |
} |
} |
- } // extractPageParamCandidatesFromQuery |
+ } |
private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
@@ -166,61 +208,291 @@ public class PageParameterDetector { |
final int matchEnd = sDigitsRegExp.getLastIndex(); |
final int matchStart = matchEnd - match.getGroup(1).length(); |
+ PagePattern pattern = PathComponentPagePattern.create(url, pathStart, matchStart, |
+ matchEnd); |
+ if (pattern != null) { |
+ pageCandidates.add(pattern, |
+ new LinkInfo(pageNum, pattern.getPageNumber(), posInAscendingNumbers)); |
+ } |
+ } // while there're matches |
+ } |
- if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, matchEnd)) continue; |
+ /** |
+ * Evaluates if the given list of LinkInfo's is a list of paging URLs: |
+ * - page numbers in list of LinkInfo's must be adjacent |
+ * - page numbers in list of ascending numbers must either |
+ * - be consecutive and form a page number sequence, or |
+ * - must construct a linear map with a linear formula: page_parameter = a * page_number + b |
+ * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must |
+ * match page pattern, and the only outlink must be 2nd or 3rd page. |
+ * |
+ * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. |
+ * |
+ * @param allLinkInfo the list of LinkInfo's to evaluate |
+ * @param pagePattern the URL pattern to use |
+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
+ * @param firstPageUrl the URL of the PageInfo with mPageNum=1 |
+ */ |
+ private static PageParamInfo getPageParamInfo(PagePattern pagePattern, |
cjhopman
2015/04/16 21:58:32
I feel like we have to many SomethingInfo types he
cjhopman
2015/04/16 21:58:32
Would it make sense for this function to be in the
kuan
2015/04/20 23:11:13
i'm bad at names, and ran out of them :(
kuan
2015/04/20 23:11:13
Done. to avoid cross-access between PageParameter
|
+ List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNumbers, |
+ String firstPageUrl) { |
+ if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { |
+ int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendingNumbers); |
+ if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; |
+ |
+ PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormula(allLinkInfo); |
- int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEnd)); |
- if (value >= 0) { |
- pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_PLACEHOLDER + |
- urlStr.substring(matchEnd), |
- new LinkInfo(pageNum, value, posInAscendingNumbers)); |
+ // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecutive and of a page |
+ // number sequence. |
+ if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MASK) return null; |
+ if (!isPageNumberSequence(ascendingNumbers)) return null; |
+ PageParamInfo pageParamInfo = new PageParamInfo(); |
cjhopman
2015/04/16 21:58:32
Could we move some of the logic of creating the pa
kuan
2015/04/20 23:11:13
Done.
|
+ pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; |
+ pageParamInfo.mFormula = linearFormula; |
+ for (LinkInfo link : allLinkInfo) { |
+ pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.mPageNum, |
+ ascendingNumbers.get(link.mPosInAscendingList).mUrl)); |
} |
- } // while there're matches |
- } // extractPageParamCandidatesFromPath |
+ return pageParamInfo; |
+ } |
+ |
+ // Most of news article have no more than 3 pages and the first page probably doesn't have |
+ // any page parameter. If the first page url matches the the page pattern, we treat it as |
+ // the first page of this pattern. |
+ if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { |
+ final LinkInfo onlyLink = allLinkInfo.get(0); |
+ boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && |
+ onlyLink.mPosInAscendingList == 1; |
+ boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && |
+ onlyLink.mPosInAscendingList == 2 && |
+ // onlyLink's pos is 2 (evaluated right before), so ascendingNumbers has >= 3 |
+ // elements; check if previous element is previous page. |
+ ascendingNumbers.get(1).mPageNum == 2; |
+ // 1 LinkInfo means ascendingNumbers has >= 1 element. |
+ if (ascendingNumbers.get(0).mPageNum == 1 && |
+ (secondPageIsOutlink || thirdPageIsOutlink) && |
+ pagePattern.isPagingUrl(firstPageUrl)) { |
+ // Has valid PageParamInfo, create and populate it. |
+ PageParamInfo pageParamInfo = new PageParamInfo(); |
+ pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; |
+ int coefficient; |
+ int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; |
+ if (delta == 0 || delta == 1) { |
+ coefficient = 1; |
+ } else { |
+ coefficient = onlyLink.mPageParamValue; |
+ delta = 0; |
+ } |
+ pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coefficient, delta); |
+ pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, firstPageUrl)); |
+ pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLink.mPageNum, |
+ ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl)); |
+ return pageParamInfo; |
+ } |
+ } |
+ |
+ return null; |
+ } |
/** |
* Returns true if given name is backlisted as a known bad page param name. |
*/ |
- private static boolean isPageParamNameBad(String name) { |
+ static boolean isPageParamNameBad(String name) { |
initBadPageParamNames(); |
return sBadPageParamNames.contains(name.toLowerCase()); |
- } // isPageParamNameBad |
+ } |
+ |
+ /** |
+ * Detects if page numbers in list of LinkInfo's are adjacent, and if page numbers in list of |
+ * PageParamInfo.PageInfo's are consecutive. |
+ * |
+ * For adjacency, the page numbers in list of LinkInfo's must either be adjacent, or separated |
+ * by at most 1 plain text number which must represent the current page number in one of the |
+ * PageParamInfo.PageInfo's. |
+ * For consecutiveness, there must be at least one pair of consecutive number values in the list |
+ * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are |
+ * likely to be page size selection links (e.g. in the document "See 1-10, 11-20..."). |
+ * |
+ * Returns a int value that is a combination of bits: |
+ * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent |
+ * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive. |
+ * |
+ * @param allLinkInfo the list of LinkInfo's to evaluate |
+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
+ */ |
+ static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, |
+ List<PageParamInfo.PageInfo> ascendingNumbers) { |
+ int result = 0; |
+ |
+ // Check if elements in allLinkInfo are adjacent or there's only 1 gap i.e. the gap is |
+ // current page number respresented in plain text. |
+ int firstPos = -1; |
+ int lastPos = -1; |
+ int gapPos = -1; |
+ Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that page number is unique. |
+ for (LinkInfo linkInfo : allLinkInfo) { |
+ final int currPos = linkInfo.mPosInAscendingList; |
+ if (lastPos == -1) { |
+ firstPos = currPos; |
+ } else if (currPos != lastPos + 1) { |
+ // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 |
+ // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), allLinkInfo is not |
+ // adjacent. |
+ if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1) return result; |
+ gapPos = currPos - 1; |
+ } |
+ // Make sure page param value, i.e. page number represented in plain text, is unique. |
+ if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; |
+ lastPos = currPos; |
+ } // for all LinkInfo's |
+ |
+ result |= PAGE_NUM_ADJACENT_MASK; |
- private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). |
- private static RegExp sLastPathComponentRegExp = null; // Match last path component. |
+ // Now, determine if page numbers in ascendingNumbers are consecutive. |
+ |
+ // First, handle the gap. |
+ if (gapPos != -1) { |
+ if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return result; |
+ // The "gap" should represent current page number in plain text. |
+ // Check if its adjacent page numbers are consecutive. |
+ // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. |
+ // This can eliminate links affecting the number of items on a page. |
+ final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; |
+ if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && |
+ ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { |
+ return result | PAGE_NUM_CONSECUTIVE_MASK; |
+ } |
+ return result; |
+ } |
+ |
+ // There is no gap. Check if at least one of the following cases is satisfied: |
+ // Case #1: "[1] [2] ..." or "1 [2] ... ". |
+ if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && |
+ ascendingNumbers.get(1).mPageNum == 2) { |
+ return result | PAGE_NUM_CONSECUTIVE_MASK; |
+ } |
+ // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. |
+ if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && |
+ ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(0).mUrl.isEmpty()) { |
+ return result | PAGE_NUM_CONSECUTIVE_MASK; |
+ } |
+ // Case #3: "... [n-1] [n]" or "... [n - 1] n". |
+ final int numbersSize = ascendingNumbers.size(); |
+ if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && |
+ ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == |
+ ascendingNumbers.get(numbersSize - 1).mPageNum) { |
+ return result | PAGE_NUM_CONSECUTIVE_MASK; |
+ } |
+ // Case #4: "... [i-1] [i] [i+1] ...". |
+ for (int i = firstPos + 1; i < lastPos; i++) { |
+ if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get(i + 1).mPageNum) { |
+ return result | PAGE_NUM_CONSECUTIVE_MASK; |
+ } |
+ } |
+ |
+ // Otherwise, there's no pair of consecutive values. |
+ return result; |
+ } |
/** |
- * Returns true if: |
- * - the digitStart to digitEnd of urlStr is the last path component, and |
- * - the entire path component is numeric, and |
- * - the previous path component is a bad page param name. |
- * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad |
- * page param. |
+ * |
+ * Determines if the list of LinkInfo's form a linear formula: |
+ * pageParamValue = coefficient * pageNum + delta (delta == -coefficient or delta == 0). |
+ * |
+ * The coefficient and delta are calculated from the page parameter values and page numbers of 2 |
+ * LinkInfo's, and then validated against the remaining LinkInfo's. |
+ * The order of page numbers doesn't matter. |
+ * |
+ * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page |
+ * parameter forumla could be determined. Otherwise, returns null. |
+ * |
+ * @param allLinkInfo the list of LinkInfo's to evaluate |
*/ |
- static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, |
- int digitStart, int digitEnd) { |
- if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. |
- pathStart < digitStart - 1) { // Not the first path component. |
- String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
- // Checks that this is the last path component, and trailing characters, if available, |
- // are (s)htm(l) extensions. |
- if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); |
- if (sExtRegExp.test(postMatch)) { |
- // Entire component is numeric, get previous path component. |
- if (sLastPathComponentRegExp == null) { |
- sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i"); |
- } |
- MatchResult prevPathComponent = sLastPathComponentRegExp.exec( |
- urlStr.substring(pathStart + 1, digitStart)); |
- if (prevPathComponent != null && prevPathComponent.getGroupCount() > 1 && |
- isPageParamNameBad(prevPathComponent.getGroup(1))) { |
- return true; |
- } |
- } // last numeric path component |
+ // TODO(kuan): As this gets rolled out, reassesss the necessity of non-1 coefficient support. |
+ private static PageParamInfo.LinearFormula getPageParamLinearFormula( |
+ List<LinkInfo> allLinkInfo) { |
+ if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; |
+ |
+ final LinkInfo firstLink = allLinkInfo.get(0); |
+ final LinkInfo secondLink = allLinkInfo.get(1); |
+ |
+ if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.mPageNum) > 4) { |
+ return null; |
} |
- return false; |
- } // isLastNumericPathComponentBad |
+ int deltaX = secondLink.mPageNum - firstLink.mPageNum; |
+ if (deltaX == 0) return null; |
+ |
+ int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; |
+ int coefficient = deltaY / deltaX; |
+ if (coefficient == 0) return null; |
+ |
+ int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum; |
+ if (delta != 0 && delta != -coefficient) return null; |
+ |
+ // Check if the remaining elements are on the same linear map. |
+ for (int i = 2; i < allLinkInfo.size(); i++) { |
+ final LinkInfo link = allLinkInfo.get(i); |
+ if (link.mPageParamValue != coefficient * link.mPageNum + delta) return null; |
+ } |
+ |
+ return new PageParamInfo.LinearFormula(coefficient, delta); |
+ } |
+ |
+ /** |
+ * Returns true if page numbers in list of PageParamInfo.PageInfo's form a sequence, based on |
+ * a pipeline of rules: |
+ * - first PageInfo must have a URL unless it is the first page |
+ * - there's only one plain number without URL in list |
+ * - if only two pages, they must be siblings - 2nd page number must follow 1st |
+ * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecutive numbers must be |
+ * head/tail or have URLs. |
+ * |
+ * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
+ */ |
+ private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> ascendingNumbers) { |
+ if (ascendingNumbers.size() <= 1) return false; |
+ |
+ // The first one must have a URL unless it is the first page. |
+ final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); |
+ if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; |
+ |
+ // There's only one plain number without URL in ascending numbers group. |
+ boolean hasPlainNum = false; |
+ for (PageParamInfo.PageInfo page : ascendingNumbers) { |
+ if (page.mUrl.isEmpty()) { |
+ if (hasPlainNum) return false; |
+ hasPlainNum = true; |
+ } |
+ } |
+ |
+ // If there are only two pages, they must be siblings. |
+ if (ascendingNumbers.size() == 2) { |
+ return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; |
+ } |
+ |
+ // Check if page numbers in ascendingNumbers are adjacent and consecutive. |
+ for (int i = 1; i < ascendingNumbers.size(); i++) { |
+ // If two adjacent numbers are not consecutive, we accept them only when: |
+ // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2], [3]...[i], [n]. |
+ // 2) both of them have URLs. |
+ final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); |
+ final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); |
+ if (currPage.mPageNum - prevPage.mPageNum != 1) { |
+ if (i != 1 && i != ascendingNumbers.size() - 1) return false; |
+ if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return false; |
+ } |
+ } |
+ |
+ return true; |
+ } |
+ |
+ /** |
+ * Returns true if given string can be converted to a number >= 0. |
+ */ |
+ static boolean isPlainNumber(String str) { |
+ return StringUtil.toNumber(str) >= 0; |
+ } |
/** |
* If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
@@ -259,6 +531,6 @@ public class PageParameterDetector { |
sBadPageParamNames.add("videos"); |
sBadPageParamNames.add("w"); |
sBadPageParamNames.add("wiki"); |
- } // initBadPageParamNames |
+ } |
} |