| Index: java/org/chromium/distiller/PageParameterDetector.java
|
| diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java
|
| index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..64c942eb5fe09f417ba12af8e62845dc0303bc97 100644
|
| --- a/java/org/chromium/distiller/PageParameterDetector.java
|
| +++ b/java/org/chromium/distiller/PageParameterDetector.java
|
| @@ -29,8 +29,8 @@ import java.util.Set;
|
| * "digital" means the text contains only digits.
|
| * A page pattern is a paging URL whose page parameter value is replaced with a place holder
|
| * (PAGE_PARAM_PLACEHOLDER).
|
| - * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pattern is
|
| - * "http: *www.foo.com/a/b-[*!].html".
|
| + * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is
|
| + * "http://www.foo.com/a/b-[*!].html".
|
| *
|
| * This class extracts the page parameter from a document's outlinks.
|
| * The basic idea:
|
| @@ -51,7 +51,12 @@ import java.util.Set;
|
| * http: *a/b?c=1&p=all.
|
| */
|
| public class PageParameterDetector {
|
| - private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
|
| + private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;
|
| +
|
| + static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
|
| + static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length();
|
| + static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;
|
| + static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;
|
|
|
| /**
|
| * Stores information about the link (anchor) after the page parameter is detected:
|
| @@ -70,29 +75,41 @@ public class PageParameterDetector {
|
| mPageParamValue = pageParamValue;
|
| mPosInAscendingList = posInAscendingList;
|
| }
|
| - } // LinkInfo
|
| + }
|
|
|
| /**
|
| * Stores a map of URL pattern to its associated list of LinkInfo's.
|
| */
|
| private static class PageCandidatesMap {
|
| - private final Map<String, List<LinkInfo>> map = new HashMap<String, List<LinkInfo>>();
|
| + private static class Info {
|
| + private final PagePattern mPattern;
|
| + private final List<LinkInfo> mLinks;
|
| +
|
| + Info(PagePattern pattern, LinkInfo link) {
|
| + mPattern = pattern;
|
| + mLinks = new ArrayList<LinkInfo>();
|
| + mLinks.add(link);
|
| + }
|
| + }
|
| +
|
| + private final Map<String, Info> map = new HashMap<String, Info>();
|
|
|
| /**
|
| * Adds urlPattern with its LinkInfo into the map. If the urlPattern already exists, adds
|
| * the link to the list of LinkInfo's. Otherwise, creates a new map entry.
|
| + * Returns true if addition is successful.
|
| */
|
| - private void add(String urlPattern, LinkInfo link) {
|
| + private boolean add(String urlPattern, LinkInfo link) {
|
| if (map.containsKey(urlPattern)) {
|
| - map.get(urlPattern).add(link);
|
| - } else {
|
| - List<LinkInfo> links = new ArrayList<LinkInfo>();
|
| - links.add(link);
|
| - map.put(urlPattern, links);
|
| + map.get(urlPattern).mLinks.add(link);
|
| + return true;
|
| }
|
| + PagePattern pat = PagePattern.create(urlPattern);
|
| + if (pat == null) return false;
|
| + map.put(urlPattern, new Info(pat, link));
|
| + return true;
|
| }
|
| -
|
| - } // PageCandidatesMap
|
| + }
|
|
|
| // All the known bad page param names.
|
| private static Set<String> sBadPageParamNames = null;
|
| @@ -130,7 +147,7 @@ public class PageParameterDetector {
|
| }
|
| }
|
| }
|
| - } // extractPageParamCandidatesFromQuery
|
| + }
|
|
|
| private static RegExp sDigitsRegExp = null; // Match at least 1 digit.
|
|
|
| @@ -176,7 +193,84 @@ public class PageParameterDetector {
|
| new LinkInfo(pageNum, value, posInAscendingNumbers));
|
| }
|
| } // while there're matches
|
| - } // extractPageParamCandidatesFromPath
|
| + }
|
| +
|
| + /**
|
| + * Evaluates if the given list of LinkInfo's is a list of paging URLs:
|
| + * - page numbers in list of LinkInfo's must be adjacent
|
| + * - page numbers in list of ascending numbers must either
|
| + * - be consecutive and form a page number sequence, or
|
| + * - must construct a linear map with a linear formula: page_parameter = a * page_number + b
|
| + * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must
|
| + * match page pattern, and the only outlink must be 2nd or 3rd page.
|
| + *
|
| + * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null.
|
| + *
|
| + * @param allLinkInfo the list of LinkInfo's to evaluate
|
| + * @param pagePattern the URL pattern to use
|
| + * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
|
| + * @param firstPageUrl the URL of the PageInfo with mPageNum=1
|
| + */
|
| + private static PageParamInfo getPageParamInfo(PagePattern pagePattern,
|
| + List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNumbers,
|
| + String firstPageUrl) {
|
| + if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {
|
| + int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendingNumbers);
|
| + if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null;
|
| +
|
| + PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormula(allLinkInfo);
|
| +
|
| + // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecutive and of a page
|
| + // number sequence.
|
| + if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MASK) return null;
|
| + if (!isPageNumberSequence(ascendingNumbers)) return null;
|
| + PageParamInfo pageParamInfo = new PageParamInfo();
|
| + pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;
|
| + pageParamInfo.mFormula = linearFormula;
|
| + for (LinkInfo link : allLinkInfo) {
|
| + pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.mPageNum,
|
| + ascendingNumbers.get(link.mPosInAscendingList).mUrl));
|
| + }
|
| + return pageParamInfo;
|
| + }
|
| +
|
| + // Most of news article have no more than 3 pages and the first page probably doesn't have
|
| + // any page parameter. If the first page url matches the the page pattern, we treat it as
|
| + // the first page of this pattern.
|
| + if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) {
|
| + final LinkInfo onlyLink = allLinkInfo.get(0);
|
| + boolean secondPageIsOutlink = onlyLink.mPageNum == 2 &&
|
| + onlyLink.mPosInAscendingList == 1;
|
| + boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 &&
|
| + onlyLink.mPosInAscendingList == 2 &&
|
| + // onlyLink's pos is 2 (evaluated right before), so ascendingNumbers has >= 3
|
| + // elements; check if previous element is previous page.
|
| + ascendingNumbers.get(1).mPageNum == 2;
|
| + // 1 LinkInfo means ascendingNumbers has >= 1 element.
|
| + if (ascendingNumbers.get(0).mPageNum == 1 &&
|
| + (secondPageIsOutlink || thirdPageIsOutlink) &&
|
| + pagePattern.isPagingUrl(firstPageUrl)) {
|
| + // Has valid PageParamInfo, create and populate it.
|
| + PageParamInfo pageParamInfo = new PageParamInfo();
|
| + pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;
|
| + int coefficient;
|
| + int delta = onlyLink.mPageParamValue - onlyLink.mPageNum;
|
| + if (delta == 0 || delta == 1) {
|
| + coefficient = 1;
|
| + } else {
|
| + coefficient = onlyLink.mPageParamValue;
|
| + delta = 0;
|
| + }
|
| + pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coefficient, delta);
|
| + pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, firstPageUrl));
|
| + pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLink.mPageNum,
|
| + ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl));
|
| + return pageParamInfo;
|
| + }
|
| + }
|
| +
|
| + return null;
|
| + }
|
|
|
| /**
|
| * Returns true if given name is backlisted as a known bad page param name.
|
| @@ -184,7 +278,7 @@ public class PageParameterDetector {
|
| private static boolean isPageParamNameBad(String name) {
|
| initBadPageParamNames();
|
| return sBadPageParamNames.contains(name.toLowerCase());
|
| - } // isPageParamNameBad
|
| + }
|
|
|
| private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).
|
| private static RegExp sLastPathComponentRegExp = null; // Match last path component.
|
| @@ -197,8 +291,8 @@ public class PageParameterDetector {
|
| * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad
|
| * page param.
|
| */
|
| - static boolean isLastNumericPathComponentBad(String urlStr, int pathStart,
|
| - int digitStart, int digitEnd) {
|
| + static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, int digitStart,
|
| + int digitEnd) {
|
| if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
|
| pathStart < digitStart - 1) { // Not the first path component.
|
| String postMatch = urlStr.substring(digitEnd).toLowerCase();
|
| @@ -220,7 +314,191 @@ public class PageParameterDetector {
|
| }
|
|
|
| return false;
|
| - } // isLastNumericPathComponentBad
|
| + }
|
| +
|
| + /**
|
| + * Detects if page numbers in list of LinkInfo's are adjacent, and if page numbers in list of
|
| + * PageParamInfo.PageInfo's are consecutive.
|
| + *
|
| + * For adjacency, the page numbers in list of LinkInfo's must either be adjacent, or separated
|
| + * by at most 1 plain text number which must represent the current page number in one of the
|
| + * PageParamInfo.PageInfo's.
|
| + * For consecutiveness, there must be at least one pair of consecutive number values in the list
|
| + * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are
|
| + * likely to be page size selection links (e.g. in the document "See 1-10, 11-20...").
|
| + *
|
| + * Returns a int value that is a combination of bits:
|
| + * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent
|
| + * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive.
|
| + *
|
| + * @param allLinkInfo the list of LinkInfo's to evaluate
|
| + * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
|
| + */
|
| + static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo,
|
| + List<PageParamInfo.PageInfo> ascendingNumbers) {
|
| + int result = 0;
|
| +
|
| + // Check if elements in allLinkInfo are adjacent or there's only 1 gap i.e. the gap is
|
| + // current page number respresented in plain text.
|
| + int firstPos = -1;
|
| + int lastPos = -1;
|
| + int gapPos = -1;
|
| + Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that page number is unique.
|
| + for (LinkInfo linkInfo : allLinkInfo) {
|
| + final int currPos = linkInfo.mPosInAscendingList;
|
| + if (lastPos == -1) {
|
| + firstPos = currPos;
|
| + } else if (currPos != lastPos + 1) {
|
| + // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6
|
| + // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), allLinkInfo is not
|
| + // adjacent.
|
| + if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1) return result;
|
| + gapPos = currPos - 1;
|
| + }
|
| + // Make sure page param value, i.e. page number represented in plain text, is unique.
|
| + if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;
|
| + lastPos = currPos;
|
| + } // for all LinkInfo's
|
| +
|
| + result |= PAGE_NUM_ADJACENT_MASK;
|
| +
|
| + // Now, determine if page numbers in ascendingNumbers are consecutive.
|
| +
|
| + // First, handle the gap.
|
| + if (gapPos != -1) {
|
| + if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return result;
|
| + // The "gap" should represent current page number in plain text.
|
| + // Check if its adjacent page numbers are consecutive.
|
| + // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.
|
| + // This can eliminate links affecting the number of items on a page.
|
| + final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;
|
| + if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&
|
| + ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {
|
| + return result | PAGE_NUM_CONSECUTIVE_MASK;
|
| + }
|
| + return result;
|
| + }
|
| +
|
| + // There is no gap. Check if at least one of the following cases is satisfied:
|
| + // Case #1: "[1] [2] ..." or "1 [2] ... ".
|
| + if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&
|
| + ascendingNumbers.get(1).mPageNum == 2) {
|
| + return result | PAGE_NUM_CONSECUTIVE_MASK;
|
| + }
|
| + // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.
|
| + if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&
|
| + ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(0).mUrl.isEmpty()) {
|
| + return result | PAGE_NUM_CONSECUTIVE_MASK;
|
| + }
|
| + // Case #3: "... [n-1] [n]" or "... [n - 1] n".
|
| + final int numbersSize = ascendingNumbers.size();
|
| + if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) &&
|
| + ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==
|
| + ascendingNumbers.get(numbersSize - 1).mPageNum) {
|
| + return result | PAGE_NUM_CONSECUTIVE_MASK;
|
| + }
|
| + // Case #4: "... [i-1] [i] [i+1] ...".
|
| + for (int i = firstPos + 1; i < lastPos; i++) {
|
| + if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get(i + 1).mPageNum) {
|
| + return result | PAGE_NUM_CONSECUTIVE_MASK;
|
| + }
|
| + }
|
| +
|
| + // Otherwise, there's no pair of consecutive values.
|
| + return result;
|
| + }
|
| +
|
| + /**
|
| + *
|
| + * Determines if the list of LinkInfo's form a linear formula:
|
| + * pageParamValue = coefficient * pageNum + delta (delta == -coefficient or delta == 0).
|
| + *
|
| + * The coefficient and delta are calculated from the page parameter values and page numbers of 2
|
| + * LinkInfo's, and then validated against the remaining LinkInfo's.
|
| + * The order of page numbers doesn't matter.
|
| + *
|
| + * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page
|
| + * parameter forumla could be determined. Otherwise, returns null.
|
| + *
|
| + * @param allLinkInfo the list of LinkInfo's to evaluate
|
| + */
|
| + private static PageParamInfo.LinearFormula getPageParamLinearFormula(
|
| + List<LinkInfo> allLinkInfo) {
|
| + if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;
|
| +
|
| + final LinkInfo firstLink = allLinkInfo.get(0);
|
| + final LinkInfo secondLink = allLinkInfo.get(1);
|
| +
|
| + if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.mPageNum) > 4) {
|
| + return null;
|
| + }
|
| +
|
| + int deltaX = secondLink.mPageNum - firstLink.mPageNum;
|
| + if (deltaX == 0) return null;
|
| +
|
| + int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;
|
| + int coefficient = deltaY / deltaX;
|
| + if (coefficient == 0) return null;
|
| +
|
| + int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum;
|
| + if (delta != 0 && delta != -coefficient) return null;
|
| +
|
| + // Check if the remaining elements are on the same linear map.
|
| + for (int i = 2; i < allLinkInfo.size(); i++) {
|
| + final LinkInfo link = allLinkInfo.get(i);
|
| + if (link.mPageParamValue != coefficient * link.mPageNum + delta) return null;
|
| + }
|
| +
|
| + return new PageParamInfo.LinearFormula(coefficient, delta);
|
| + }
|
| +
|
| + /**
|
| + * Returns true if page numbers in list of PageParamInfo.PageInfo's form a sequence, based on
|
| + * a pipeline of rules:
|
| + * - first PageInfo must have a URL unless it is the first page
|
| + * - there's only one plain number without URL in list
|
| + * - if only two pages, they must be siblings - 2nd page number must follow 1st
|
| + * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecutive numbers must be
|
| + * head/tail or have URLs.
|
| + *
|
| + * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
|
| + */
|
| + private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> ascendingNumbers) {
|
| + if (ascendingNumbers.size() <= 1) return false;
|
| +
|
| + // The first one must have a URL unless it is the first page.
|
| + final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0);
|
| + if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;
|
| +
|
| + // There's only one plain number without URL in ascending numbers group.
|
| + boolean hasPlainNum = false;
|
| + for (PageParamInfo.PageInfo page : ascendingNumbers) {
|
| + if (page.mUrl.isEmpty()) {
|
| + if (hasPlainNum) return false;
|
| + hasPlainNum = true;
|
| + }
|
| + }
|
| +
|
| + // If there are only two pages, they must be siblings.
|
| + if (ascendingNumbers.size() == 2) {
|
| + return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;
|
| + }
|
| +
|
| + // Check if page numbers in ascendingNumbers are adjacent and consecutive.
|
| + for (int i = 1; i < ascendingNumbers.size(); i++) {
|
| + // If two adjacent numbers are not consecutive, we accept them only when:
|
| + // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2], [3]...[i], [n].
|
| + // 2) both of them have URLs.
|
| + final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i);
|
| + final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1);
|
| + if (currPage.mPageNum - prevPage.mPageNum != 1) {
|
| + if (i != 1 && i != ascendingNumbers.size() - 1) return false;
|
| + if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return false;
|
| + }
|
| + }
|
| +
|
| + return true;
|
| + }
|
|
|
| /**
|
| * If sBadPageParamNames is null, initialize it with all the known bad page param names, in
|
| @@ -259,6 +537,6 @@ public class PageParameterDetector {
|
| sBadPageParamNames.add("videos");
|
| sBadPageParamNames.add("w");
|
| sBadPageParamNames.add("wiki");
|
| - } // initBadPageParamNames
|
| + }
|
|
|
| }
|
|
|