Index: java/org/chromium/distiller/PageParameterParser.java |
diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java |
new file mode 100644 |
index 0000000000000000000000000000000000000000..76882c6f2eb9b75efdb83fc305b88ba6a349f57e |
--- /dev/null |
+++ b/java/org/chromium/distiller/PageParameterParser.java |
@@ -0,0 +1,357 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+package org.chromium.distiller; |
+ |
+import org.chromium.distiller.proto.DomDistillerProtos; |
+import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; |
+ |
+import com.google.gwt.dom.client.AnchorElement; |
+import com.google.gwt.dom.client.Document; |
+import com.google.gwt.dom.client.Element; |
+import com.google.gwt.dom.client.Node; |
+import com.google.gwt.dom.client.NodeList; |
+import com.google.gwt.dom.client.Style; |
+import com.google.gwt.regexp.shared.MatchResult; |
+import com.google.gwt.regexp.shared.RegExp; |
+ |
+/** |
+ * Background: |
+ * A long article/news/forum thread/blog document may be partitioned into several partial pages |
+ * by the webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The |
+ * anchor text of those outlinks is numeric. |
+ * |
+ * This class parses the document to collect groups of adjacent plain text numbers and outlinks with |
+ * numeric anchor text. These are then passed to PageParameterDetector, which detects the |
+ * pagination URLs if available. |
+ */ |
+public class PageParameterParser { |
+ // Largest value (inclusive) accepted as a page number by isPlainPageNumber(); greater numbers |
+ // in a link's anchor text or in plain text are not treated as page numbers. |
+ private static final int MAX_NUM_FOR_PAGE_PARAM = 100; |
+ |
+ /** |
+  * Pairs the PageParamInfo.PageInfo built for an anchor with that anchor's text; this is what |
+  * getPageInfoAndText() returns. |
+  */ |
+ private static class PageInfoAndText { |
+     private final String mText; |
+     private final PageParamInfo.PageInfo mPageInfo; |
+ |
+     PageInfoAndText(int pageNumber, String pageUrl, String anchorText) { |
+         this.mText = anchorText; |
+         this.mPageInfo = new PageParamInfo.PageInfo(pageNumber, pageUrl); |
+     } |
+ } |
+ |
+ /** |
+  * Entry point for PageParameterParser. |
+  * Parses the current document to collect outlinks with numeric anchor text and the numeric |
+  * text around them, then hands the collected groups to PageParameterDetector. |
+  * |
+  * @param originalUrl the original URL of the document to be parsed. |
+  * @param timingInfo for tracking performance. |
+  * |
+  * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or |
+  *         determined to be best, its mType is PageParamInfo.Type.UNSET. |
+  */ |
+ public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { |
+     final Element documentRoot = Document.get().getDocumentElement(); |
+     return new PageParameterParser(timingInfo).parseDocument(documentRoot, originalUrl); |
+ } |
+ |
+ private final TimingInfo mTimingInfo;  // Receives the timing entries logged while parsing. |
+ private String mDocUrl = "";  // Document URL with a trailing '/' removed. |
+ private ParsedUrl mParsedUrl;  // Parsed form of mDocUrl; set in parseDocument(). |
+ private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups(); |
+ private int mNumForwardLinksProcessed;  // Anchors visited by the latest forward search. |
+ |
+ private static RegExp sHrefCleaner = RegExp.compile("\\/$");  // Matches one trailing '/'. |
+ private static RegExp sInvalidParentWrapper;  // Compiled lazily on first use. |
+ |
+ private PageParameterParser(TimingInfo timingInfo) { |
+     this.mTimingInfo = timingInfo; |
+ } |
+ |
+ /** |
+ * Implements parse(): collects groups of adjacent page numbers, then runs the detector. |
+ */ |
+ private PageParamInfo parseDocument(Element root, String originalUrl) { |
+ double startTime = DomUtil.getTime(); |
+ |
+ mDocUrl = sHrefCleaner.replace(originalUrl, ""); |
+ mParsedUrl = ParsedUrl.create(mDocUrl); |
+ if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. |
+ |
+ AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( |
+ PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); |
+ |
+ NodeList<Element> allLinks = root.getElementsByTagName("A"); |
+ int idx = 0; |
+ while (idx < allLinks.getLength()) { |
+ final AnchorElement link = AnchorElement.as(allLinks.getItem(idx)); |
+ PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); |
+ if (pageInfoAndText == null) { |
+ idx++; |
+ continue; |
+ } |
+ |
+ // This link is a good candidate for pagination. |
+ |
+ // Close current group of adjacent numbers, add a new group if necessary. |
+ mAdjacentNumbersGroups.addGroup(); |
+ |
+ // Before we append the link to the new group of adjacent numbers, check if it's |
+ // preceded by a text node with numeric text; if so, add it before the link. |
+ findAndAddClosestValidLeafNodes(link, false, true, null); |
+ |
+ // Add the link to the current group of adjacent numbers. |
+ mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); |
+ |
+ // Add all following text nodes and links with numeric text. |
+ mNumForwardLinksProcessed = 0; |
+ findAndAddClosestValidLeafNodes(link, false, false, baseAnchor); |
+ |
+ // Skip the current link and the anchors already visited by the forward |
+ // findAndAddClosestValidLeafNodes(), including an anchor that ended the search. |
+ idx += 1 + mNumForwardLinksProcessed; |
+ } // while there're links. |
+ |
+ mAdjacentNumbersGroups.cleanup(); |
+ |
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); |
+ |
+ startTime = DomUtil.getTime(); |
+ PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl); |
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); |
+ return info; |
+ } |
+ |
+ /** |
+  * Builds the PageInfoAndText for a link that is to be added to mAdjacentNumbersGroups. |
+  * Links with an empty href, "javascript:" links and disabled links are kept with an empty URL. |
+  * |
+  * @param link to process. |
+  * @param baseAnchor created for the current document. |
+  * @return the populated PageInfoAndText, or null if link is to be ignored. |
+  */ |
+ private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) { |
+     // Invisible links are never pagination candidates. |
+     if (!DomUtil.isVisible(link)) return null; |
+ |
+     // innerText (unlike textContent) only yields the visible text. |
+     final String anchorText = StringUtil.jsTrim(DomUtil.getInnerText(link)); |
+     final int pageNum = linkTextToNumber(anchorText); |
+     if (!isPlainPageNumber(pageNum)) return null; |
+ |
+     final String href = resolveLinkHref(link, baseAnchor); |
+     if (href.isEmpty()) { |
+         // Empty resolved href: keep the number, but without a URL. |
+         return new PageInfoAndText(pageNum, "", anchorText); |
+     } |
+ |
+     final boolean isJavascript = isJavascriptHref(href); |
+     final ParsedUrl parsedHref = ParsedUrl.create(href); |
+     if (parsedHref == null) return null; |
+     // Links other than "javascript:" ones must point at the document's own host. |
+     if (!isJavascript && !parsedHref.getHost().equalsIgnoreCase(mParsedUrl.getHost())) { |
+         return null; |
+     } |
+     parsedHref.setHash(""); |
+     if (isJavascript || isDisabledLink(link)) { |
+         return new PageInfoAndText(pageNum, "", anchorText); |
+     } |
+     final String cleanedUrl = sHrefCleaner.replace(parsedHref.toString(), ""); |
+     return new PageInfoAndText(pageNum, cleanedUrl, anchorText); |
+ } |
+ |
+ /** |
+  * Finds and adds the leaf node(s) closest to the given start node. |
+  * This recurses and keeps finding and, if necessary, adding the numeric text of valid nodes, |
+  * collecting the PageParamInfo.PageInfo's for the current adjacency group. |
+  * For backward search, i.e. nodes before start node, search terminates (i.e. recursion stops) |
+  * once a text node or anchor is encountered. If the text node contains numeric text, it's |
+  * added to the current adjacency group. Otherwise, a new group is created to break the |
+  * adjacency. |
+  * For forward search, i.e. nodes after start node, search continues (i.e. recursion continues) |
+  * until a text node or anchor with non-numeric text is encountered. In the process, text nodes |
+  * and anchors with numeric text are added to the current adjacency group. When a non-numeric |
+  * text node or anchor is encountered, a new group is started to break the adjacency, and search |
+  * ends. The search also ends at an ancestor named like BODY or HTML, or when none is left. |
+  * |
+  * @return false when the search ends; no path returns true, and callers here ignore the value. |
+  * |
+  * @param start node to work on. |
+  * @param checkStart true to check start node. Otherwise, the previous or next sibling of the |
+  *                   start node is checked. |
+  * @param backward true to search backward (i.e. nodes before start node), false to search |
+  *                 forward (i.e. nodes after start node). |
+  * @param baseAnchor created for the current document, only needed for forward search. |
+  */ |
+ private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkStart, |
+         boolean backward, AnchorElement baseAnchor) { |
+     Node node = checkStart ? start : |
+             (backward ? start.getPreviousSibling() : start.getNextSibling()); |
+     if (node == null) {  // No sibling, try parent. |
+         node = start.getParentNode(); |
+         // Ran out of ancestors without meeting BODY or HTML: nothing left to search. |
+         if (node == null) return false; |
+         if (sInvalidParentWrapper == null) { |
+             // NOTE(review): unanchored, so names merely containing BODY (e.g. TBODY) match. |
+             sInvalidParentWrapper = RegExp.compile("(BODY)|(HTML)"); |
+         } |
+         if (sInvalidParentWrapper.test(node.getNodeName())) return false; |
+         return findAndAddClosestValidLeafNodes(node, false, backward, baseAnchor); |
+     } |
+ |
+     checkStart = false; |
+     switch (node.getNodeType()) { |
+         case Node.TEXT_NODE: |
+             String text = node.getNodeValue(); |
+             // Text must contain words. |
+             if (text.isEmpty() || StringUtil.countWords(text) == 0) break; |
+             boolean added = addNonLinkTextIfValid(text); |
+             // Backward search always stops here; forward search stops only on invalid text. |
+             if (backward || !added) return false; |
+             break; |
+ |
+         case Node.ELEMENT_NODE: |
+             Element e = Element.as(node); |
+             if (e.hasTagName("A")) { |
+                 // For backward search, we're done because we've already processed the anchor. |
+                 if (backward) return false; |
+                 // For forward search, we're done only if link was invalid, otherwise continue. |
+                 mNumForwardLinksProcessed++; |
+                 if (!addLinkIfValid(AnchorElement.as(e), baseAnchor)) return false; |
+                 break; |
+             } |
+             // Intentionally fall through. |
+ |
+         default: |
+             // Check children nodes. |
+             if (!node.hasChildNodes()) break; |
+             checkStart = true;  // We want to check the child node. |
+             if (backward) { |
+                 // Start with the rightmost (last) child, i.e. the one closest to given node. |
+                 node = node.getLastChild(); |
+             } else { |
+                 // Start with the leftmost (first) child, i.e. the one closest to given node. |
+                 node = node.getFirstChild(); |
+             } |
+             break; |
+     } |
+ |
+     return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseAnchor); |
+ } |
+ |
+ private static RegExp sTermsRegExp = null; // Match terms i.e. words. |
+ private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits. |
+ |
+ /** |
+ * Handle the text for a non-link node. Each numeric term in the text that is a valid plain |
+ * page number adds a PageParamInfo.PageInfo into the current adjacent group. All other terms |
+ * break the adjacency in the current group, adding a new group instead. |
+ * |
+ * @Return true if text was added to current group of adjacent numbers. Otherwise, false with |
+ * a new group created to break the current adjacency. |
+ */ |
+ private boolean addNonLinkTextIfValid(String text) { |
+ if (!StringUtil.containsDigit(text)) { |
+ // The text does not contain valid number(s); if necessary, current group of adjacent |
+ // numbers should be closed, adding a new group if possible. |
+ mAdjacentNumbersGroups.addGroup(); |
+ return false; |
+ } |
+ |
+ if (sTermsRegExp == null) { |
+ sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi"); |
+ } else { |
+ sTermsRegExp.setLastIndex(0); |
+ } |
+ if (sSurroundingDigitsRegExp == null) { |
+ sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); |
+ } |
+ |
+ // Extract terms from the text, differentiating between those that contain only digits and |
+ // those that contain non-digits. |
+ boolean added = false; |
+ while (true) { |
+ MatchResult match = sTermsRegExp.exec(text); |
+ if (match == null) break; |
+ if (match.getGroupCount() <= 1) continue; |
+ |
+ String term = match.getGroup(1); |
+ MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); |
+ int number = -1; |
+ if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { |
+ number = StringUtil.toNumber(termWithDigits.getGroup(1)); |
+ } |
+ if (isPlainPageNumber(number)) { |
+ // This text is a valid candidate of plain text page number, add it to last group of |
+ // adjacent numbers. |
+ mAdjacentNumbersGroups.addNumber(number, ""); |
+ added = true; |
+ } else { |
+ // The text is not a valid number, so current group of adjacent numbers should be |
+ // closed, adding a new group if possible. |
+ mAdjacentNumbersGroups.addGroup(); |
+ } |
+ } // while there're matches |
+ |
+ return added; |
+ } |
+ |
+ /** |
+ * Adds PageParamInfo.PageInfo to the current adjacent group for a link if its text is numeric. |
+ * Otherwise, add a new group to break the adjacency. |
+ * |
+ * @Return true if link was added, false otherwise. |
+ */ |
+ private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor) { |
+ PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); |
+ if (pageInfoAndText != null) { |
+ mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); |
+ return true; |
+ } |
+ mAdjacentNumbersGroups.addGroup(); |
+ return false; |
+ } |
+ |
+ /** |
+  * @return true if link is disabled i.e. not clickable because its computed cursor is "text". |
+  */ |
+ private static boolean isDisabledLink(AnchorElement link) { |
+     Style style = DomUtil.getComputedStyle(link); |
+     return "text".equalsIgnoreCase(style.getCursor());  // valueOf() throws on "not-allowed". |
+ } |
+ |
+ /** |
+ * @return true if href starts with "javascript:". |
+ */ |
+ private static boolean isJavascriptHref(String href) { |
+ return href.startsWith("javascript:"); |
+ } |
+ |
+ private static String resolveLinkHref(AnchorElement link, AnchorElement baseAnchor) { |
+ // Anchors without "href" attribute are not considered potential pagination links. |
+ String linkHref = link.getAttribute("href"); |
+ if (linkHref.isEmpty()) return ""; |
+ baseAnchor.setAttribute("href", linkHref); |
+ return baseAnchor.getHref(); |
+ } |
+ |
+ private static int linkTextToNumber(String linkText) { |
+ linkText = linkText.replaceAll("[()\\[\\]{}]", ""); |
+ linkText = linkText.trim(); // Remove leading and trailing whitespaces. |
+ return StringUtil.toNumber(linkText); |
+ } |
+ |
+ /** |
+  * @return true if number is >= 0 && <= MAX_NUM_FOR_PAGE_PARAM. |
+  */ |
+ private static boolean isPlainPageNumber(int number) { |
+     return !(number < 0 || number > MAX_NUM_FOR_PAGE_PARAM); |
+ } |
+ |
+} |