| Index: java/org/chromium/distiller/PageParameterParser.java
|
| diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..76882c6f2eb9b75efdb83fc305b88ba6a349f57e
|
| --- /dev/null
|
| +++ b/java/org/chromium/distiller/PageParameterParser.java
|
| @@ -0,0 +1,357 @@
|
| +// Copyright 2015 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +package org.chromium.distiller;
|
| +
|
| +import org.chromium.distiller.proto.DomDistillerProtos;
|
| +import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
|
| +
|
| +import com.google.gwt.dom.client.AnchorElement;
|
| +import com.google.gwt.dom.client.Document;
|
| +import com.google.gwt.dom.client.Element;
|
| +import com.google.gwt.dom.client.Node;
|
| +import com.google.gwt.dom.client.NodeList;
|
| +import com.google.gwt.dom.client.Style;
|
| +import com.google.gwt.regexp.shared.MatchResult;
|
| +import com.google.gwt.regexp.shared.RegExp;
|
| +
|
| +/**
|
| + * Background:
|
| + * The long article/news/forum thread/blog document may be partitioned into several partial pages
|
| + * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The
|
| + * anchor text of those outlinks is numeric.
|
| + *
|
| + * This class parses the document to collect groups of adjacent plain text numbers and outlinks with
|
| + * digital anchor text. These are then passed to PageParameterParser which would spit out the
|
| + * pagination URLs if available.
|
| + */
|
| +public class PageParameterParser {
|
| + // If the numeric value of a link's anchor text is greater than this number, we don't think it
|
| + // represents the page number of the link.
|
| + private static final int MAX_NUM_FOR_PAGE_PARAM = 100;
|
| +
|
| + /**
|
| + * Stores PageParamInfo.PageInfo and the anchor's text, specifically returned by
|
| + * getPageInfoAndText().
|
| + */
|
| + private static class PageInfoAndText {
|
| + private final PageParamInfo.PageInfo mPageInfo;
|
| + private final String mText;
|
| +
|
| + PageInfoAndText(int number, String url, String text) {
|
| + mPageInfo = new PageParamInfo.PageInfo(number, url);
|
| + mText = text;
|
| + }
|
| + }
|
| +
|
| + /**
|
| + * Entry point for PageParameterParser.
|
| + * Parses the document to collect outlinks with numeric anchor text and numeric text around
|
| + * them. These are then passed to PageParameterParser to detect pagination URLs.
|
| + *
|
| + * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or
|
| + * determined to be best, its mType is PageParamInfo.Type.UNSET.
|
| + *
|
| + * @param originalUrl the original URL of the document to be parsed.
|
| + * @param timingInfo for tracking performance.
|
| + */
|
| + public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
|
| + PageParameterParser parser = new PageParameterParser(timingInfo);
|
| + return parser.parseDocument(Document.get().getDocumentElement(), originalUrl);
|
| + }
|
| +
|
| + private final TimingInfo mTimingInfo;
|
| + private String mDocUrl = "";
|
| + private ParsedUrl mParsedUrl = null;
|
| + private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();
|
| + private int mNumForwardLinksProcessed = 0;
|
| +
|
| + private static RegExp sHrefCleaner = RegExp.compile("\\/$");
|
| + private static RegExp sInvalidParentWrapper = null;
|
| +
|
| + private PageParameterParser(TimingInfo timingInfo) {
|
| + mTimingInfo = timingInfo;
|
| + }
|
| +
|
| + /**
|
| + * Acutually implements PageParameterParser.parse(), see above description for parse().
|
| + */
|
| + private PageParamInfo parseDocument(Element root, String originalUrl) {
|
| + double startTime = DomUtil.getTime();
|
| +
|
| + mDocUrl = sHrefCleaner.replace(originalUrl, "");
|
| + mParsedUrl = ParsedUrl.create(mDocUrl);
|
| + if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.
|
| +
|
| + AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(
|
| + PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));
|
| +
|
| + NodeList<Element> allLinks = root.getElementsByTagName("A");
|
| + int idx = 0;
|
| + while (idx < allLinks.getLength()) {
|
| + final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));
|
| + PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
|
| + if (pageInfoAndText == null) {
|
| + idx++;
|
| + continue;
|
| + }
|
| +
|
| + // This link is a good candidate for pagination.
|
| +
|
| + // Close current group of adjacent numbers, add a new group if necessary.
|
| + mAdjacentNumbersGroups.addGroup();
|
| +
|
| + // Before we append the link to the new group of adjacent numbers, check if it's
|
| + // preceded by a text node with numeric text; if so, add it before the link.
|
| + findAndAddClosestValidLeafNodes(link, false, true, null);
|
| +
|
| + // Add the link to the current group of adjacent numbers.
|
| + mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
|
| +
|
| + // Add all following text nodes and links with numeric text.
|
| + mNumForwardLinksProcessed = 0;
|
| + findAndAddClosestValidLeafNodes(link, false, false, baseAnchor);
|
| +
|
| + // Skip the current link and links already processed in the forward
|
| + // findandAddClosestValidLeafNodes().
|
| + idx += 1 + mNumForwardLinksProcessed;
|
| + } // while there're links.
|
| +
|
| + mAdjacentNumbersGroups.cleanup();
|
| +
|
| + LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");
|
| +
|
| + startTime = DomUtil.getTime();
|
| + PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
|
| + LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
|
| + return info;
|
| + }
|
| +
|
| + /**
|
| + * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups.
|
| + * Otherwise, returns null if link is to be ignored.
|
| + * "javascript:" links with numeric text are considered valid links to be added.
|
| + *
|
| + * @param link to process.
|
| + * @param baseAnchor created for the current document.
|
| + */
|
| + private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {
|
| + // Ignore invisible links.
|
| + if (!DomUtil.isVisible(link)) return null;
|
| +
|
| + // Use javascript innerText (instead of javascript textContent) to only get visible text.
|
| + String linkText = StringUtil.jsTrim(DomUtil.getInnerText(link));
|
| + int number = linkTextToNumber(linkText);
|
| + if (!isPlainPageNumber(number)) return null;
|
| +
|
| + String linkHref = resolveLinkHref(link, baseAnchor);
|
| + final boolean isEmptyHref = linkHref.isEmpty();
|
| + boolean isJavascriptLink = false;
|
| + ParsedUrl url = null;
|
| + if (!isEmptyHref) {
|
| + isJavascriptLink = isJavascriptHref(linkHref);
|
| + url = ParsedUrl.create(linkHref);
|
| + if (url == null ||
|
| + (!isJavascriptLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) {
|
| + return null;
|
| + }
|
| + url.setHash("");
|
| + }
|
| +
|
| + if (isEmptyHref || isJavascriptLink || isDisabledLink(link)) {
|
| + return new PageInfoAndText(number, "", linkText);
|
| + }
|
| +
|
| + return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);
|
| + }
|
| +
|
| + /**
|
| + * Finds and adds the leaf node(s) closest to the given start node.
|
| + * This recurses and keeps finding and, if necessary, adding the numeric text of valid nodes,
|
| + * collecting the PageParamInfo.PageInfo's for the current adjacency group.
|
| + * For backward search, i.e. nodes before start node, search terminates (i.e. recursion stops)
|
| + * once a text node or anchor is encountered. If the text node contains numeric text, it's
|
| + * added to the current adjacency group. Otherwise, a new group is created to break the
|
| + * adjacency.
|
| + * For forward search, i.e. nodes after start node, search continues (i.e. recursion continues)
|
| + * until a text node or anchor with non-numeric text is encountered. In the process, text nodes
|
| + * and anchors with numeric text are added to the current adjaency group. When a non-numeric
|
| + * text node or anchor is encountered, a new group is started to break the adjacency, and search
|
| + * ends.
|
| + *
|
| + * @return true to continue search, false to stop.
|
| + *
|
| + * @param start node to work on.
|
| + * @param checkStart true to check start node. Otherwise, the previous or next sibling of the
|
| + * start node is checked.
|
| + * @param backward true to search backward (i.e. nodes before start node), false to search
|
| + * forward (i.e. nodes after start node).
|
| + * @param baseAnchor created for the current document, only needed for forward search.
|
| + */
|
| + private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkStart,
|
| + boolean backward, AnchorElement baseAnchor) {
|
| + Node node = checkStart ? start :
|
| + (backward ? start.getPreviousSibling() : start.getNextSibling());
|
| + if (node == null) { // No sibling, try parent.
|
| + node = start.getParentNode();
|
| + if (sInvalidParentWrapper == null) {
|
| + sInvalidParentWrapper = RegExp.compile("(BODY)|(HTML)");
|
| + }
|
| + if (sInvalidParentWrapper.test(node.getNodeName())) return false;
|
| + return findAndAddClosestValidLeafNodes(node, false, backward, baseAnchor);
|
| + }
|
| +
|
| + checkStart = false;
|
| + switch (node.getNodeType()) {
|
| + case Node.TEXT_NODE:
|
| + String text = node.getNodeValue();
|
| + // Text must contain words.
|
| + if (text.isEmpty() || StringUtil.countWords(text) == 0) break;
|
| + boolean added = addNonLinkTextIfValid(node.getNodeValue());
|
| + // For backward search, we're done regardless if text was added.
|
| + // For forward search, we're done only if text was invalid, otherwise continue.
|
| + if (backward || !added) return false;
|
| + break;
|
| +
|
| + case Node.ELEMENT_NODE:
|
| + Element e = Element.as(node);
|
| + if (e.hasTagName("A")) {
|
| + // For backward search, we're done because we've already processed the anchor.
|
| + if (backward) return false;
|
| + // For forward search, we're done only if link was invalid, otherwise continue.
|
| + mNumForwardLinksProcessed++;
|
| + if (!addLinkIfValid(AnchorElement.as(e), baseAnchor)) return false;
|
| + break;
|
| + }
|
| + // Intentionally fall through.
|
| +
|
| + default:
|
| + // Check children nodes.
|
| + if (!node.hasChildNodes()) break;
|
| + checkStart = true; // We want to check the child node.
|
| + if (backward) {
|
| + // Start the backward search with the rightmost child i.e. last and closest to
|
| + // given node.
|
| + node = node.getLastChild();
|
| + } else {
|
| + // Start the forward search with the leftmost child i.e. first and closest to
|
| + // given node.
|
| + node = node.getFirstChild();
|
| + }
|
| + break;
|
| + }
|
| +
|
| + return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseAnchor);
|
| + }
|
| +
|
| + private static RegExp sTermsRegExp = null; // Match terms i.e. words.
|
| + private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.
|
| +
|
| + /**
|
| + * Handle the text for a non-link node. Each numeric term in the text that is a valid plain
|
| + * page number adds a PageParamInfo.PageInfo into the current adjacent group. All other terms
|
| + * break the adjacency in the current group, adding a new group instead.
|
| + *
|
| + * @Return true if text was added to current group of adjacent numbers. Otherwise, false with
|
| + * a new group created to break the current adjacency.
|
| + */
|
| + private boolean addNonLinkTextIfValid(String text) {
|
| + if (!StringUtil.containsDigit(text)) {
|
| + // The text does not contain valid number(s); if necessary, current group of adjacent
|
| + // numbers should be closed, adding a new group if possible.
|
| + mAdjacentNumbersGroups.addGroup();
|
| + return false;
|
| + }
|
| +
|
| + if (sTermsRegExp == null) {
|
| + sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");
|
| + } else {
|
| + sTermsRegExp.setLastIndex(0);
|
| + }
|
| + if (sSurroundingDigitsRegExp == null) {
|
| + sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
|
| + }
|
| +
|
| + // Extract terms from the text, differentiating between those that contain only digits and
|
| + // those that contain non-digits.
|
| + boolean added = false;
|
| + while (true) {
|
| + MatchResult match = sTermsRegExp.exec(text);
|
| + if (match == null) break;
|
| + if (match.getGroupCount() <= 1) continue;
|
| +
|
| + String term = match.getGroup(1);
|
| + MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
|
| + int number = -1;
|
| + if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
|
| + number = StringUtil.toNumber(termWithDigits.getGroup(1));
|
| + }
|
| + if (isPlainPageNumber(number)) {
|
| + // This text is a valid candidate of plain text page number, add it to last group of
|
| + // adjacent numbers.
|
| + mAdjacentNumbersGroups.addNumber(number, "");
|
| + added = true;
|
| + } else {
|
| + // The text is not a valid number, so current group of adjacent numbers should be
|
| + // closed, adding a new group if possible.
|
| + mAdjacentNumbersGroups.addGroup();
|
| + }
|
| + } // while there're matches
|
| +
|
| + return added;
|
| + }
|
| +
|
| + /**
|
| + * Adds PageParamInfo.PageInfo to the current adjacent group for a link if its text is numeric.
|
| + * Otherwise, add a new group to break the adjacency.
|
| + *
|
| + * @Return true if link was added, false otherwise.
|
| + */
|
| + private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor) {
|
| + PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
|
| + if (pageInfoAndText != null) {
|
| + mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
|
| + return true;
|
| + }
|
| + mAdjacentNumbersGroups.addGroup();
|
| + return false;
|
| + }
|
| +
|
| + /**
|
| + * @return true if link is disabled i.e. not clickable because it has a text cursor.
|
| + */
|
| + private static boolean isDisabledLink(AnchorElement link) {
|
| + Style style = DomUtil.getComputedStyle(link);
|
| + return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT;
|
| + }
|
| +
|
| + /**
|
| + * @return true if href starts with "javascript:".
|
| + */
|
| + private static boolean isJavascriptHref(String href) {
|
| + return href.startsWith("javascript:");
|
| + }
|
| +
|
| + private static String resolveLinkHref(AnchorElement link, AnchorElement baseAnchor) {
|
| + // Anchors without "href" attribute are not considered potential pagination links.
|
| + String linkHref = link.getAttribute("href");
|
| + if (linkHref.isEmpty()) return "";
|
| + baseAnchor.setAttribute("href", linkHref);
|
| + return baseAnchor.getHref();
|
| + }
|
| +
|
| + private static int linkTextToNumber(String linkText) {
|
| + linkText = linkText.replaceAll("[()\\[\\]{}]", "");
|
| + linkText = linkText.trim(); // Remove leading and trailing whitespaces.
|
| + return StringUtil.toNumber(linkText);
|
| + }
|
| +
|
| + /**
|
| + * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.
|
| + */
|
| + private static boolean isPlainPageNumber(int number) {
|
| + return number >= 0 && number <= MAX_NUM_FOR_PAGE_PARAM;
|
| + }
|
| +
|
| +}
|
|
|