Index: java/org/chromium/distiller/PageParameterParser.java |
diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java |
new file mode 100644 |
index 0000000000000000000000000000000000000000..70c1e8dea62a0d0ea0d60fcd97a1f64667095fa7 |
--- /dev/null |
+++ b/java/org/chromium/distiller/PageParameterParser.java |
@@ -0,0 +1,284 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+package org.chromium.distiller; |
+ |
+import org.chromium.distiller.proto.DomDistillerProtos; |
+import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; |
+ |
+import com.google.gwt.dom.client.AnchorElement; |
+import com.google.gwt.dom.client.Document; |
+import com.google.gwt.dom.client.Element; |
+import com.google.gwt.dom.client.Node; |
+import com.google.gwt.dom.client.NodeList; |
+import com.google.gwt.dom.client.Style; |
+import com.google.gwt.regexp.shared.MatchResult; |
+import com.google.gwt.regexp.shared.RegExp; |
+ |
+import java.util.HashSet; |
+import java.util.Set; |
+ |
+/** |
+ * Background: |
+ * The long article/news/forum thread/blog document may be partitioned into several partial pages |
+ * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The |
+ * anchor text of those outlinks is numeric. |
+ * |
+ * This class parses the document to collect groups of adjacent plain text numbers and outlinks with |
+ * digital anchor text. These are then passed to PageParameterParser which would spit out the |
+ * pagination URLs if available. |
+ */ |
+public class PageParameterParser { |
+ // If the numeric value of a link's anchor text is greater than this number, we don't think it |
+ // represents the page number of the link. |
+ private static final int MAX_NUM_FOR_PAGE_PARAM = 100; |
+ |
+ /** |
+ * Type of sibling to check. |
+ */ |
+ private static enum Sibling { |
+ PREV, // Node.getPreviousSibling(). |
+ NEXT, // Node.getNextSibling(). |
+ } |
+ |
+ /** |
+ * Entry point for PageParameterParser. |
+ * Parses the document to collect outlinks with digital anchor text and numeric text around |
+ * them. These are then passed to PageParameterParser to detect pagination URLs. |
+ * |
+ * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or |
+ * determined to be best, its mType is PageParamInfo.Type.UNSET. |
+ * |
+ * @param originalUrl the original URL of the document to be parsed. |
+ * @param timingInfo for tracking performance. |
+ */ |
+ public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { |
+ PageParameterParser parser = new PageParameterParser(timingInfo); |
+ return parser.parseDocument(Document.get().getDocumentElement(), originalUrl); |
+ } |
+ |
+ private final TimingInfo mTimingInfo; |
+ private String mDocUrl = ""; |
+ private ParsedUrl mParsedUrl = null; |
+ private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups(); |
+ // Siblings that have been processed before and after each link. |
+ private final Set<Node> mProcessedSiblings = new HashSet<Node>(); |
+ |
+ private static RegExp sHrefCleaner = null; |
+ |
+ private PageParameterParser(TimingInfo timingInfo) { |
+ mTimingInfo = timingInfo; |
+ } |
+ |
+ /** |
+ * Acutually implements PageParameterParser.parse(), see above description for parse(). |
+ */ |
+ private PageParamInfo parseDocument(Element root, String originalUrl) { |
+ double startTime = DomUtil.getTime(); |
+ |
+ mDocUrl = originalUrl; |
+ mParsedUrl = ParsedUrl.create(mDocUrl); |
+ if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. |
+ |
+ mAdjacentNumbersGroups.addGroup(); |
+ AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( |
+ PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); |
+ |
+ NodeList<Element> allLinks = root.getElementsByTagName("A"); |
cjhopman
2015/06/26 00:35:25
What do you think of this alternative approach:
a
kuan
2015/07/13 22:57:03
Done.
|
+ for (int i = 0; i < allLinks.getLength(); i++) { |
+ final AnchorElement link = AnchorElement.as(allLinks.getItem(i)); |
+ |
+ String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor); |
+ ParsedUrl url = ParsedUrl.create(linkHref); |
+ if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHost())) continue; |
+ url.setHash(""); |
+ |
+ // Use javascript innerText (instead of javascript textContent) to only get visible |
+ // text. |
+ String linkText = DomUtil.getInnerText(link); |
+ int number = linkTextToNumber(linkText); |
+ if (isPlainPageNumber(number)) { |
+ // Before we append the link to the current group of adjacent numbers, check if it's |
+ // preceded by a sibling with text, which would determine if the current adjacency |
+ // continues or ends. |
+ Node parentWrapper = null; |
+ if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling. |
+ // The link could be a child of a parent that is simply a wrapper, i.e. with no |
+ // extra text, in which case, we should be checking the siblings of the topmost |
+ // parent wrapper. |
+ parentWrapper = findParentWrapper(link, linkText.length()); |
+ if (parentWrapper != null) checkForSiblingWithText(parentWrapper, Sibling.PREV); |
+ } |
+ |
+ // This link is a good candidate for pagination, add it to the current group of |
+ // adjacent numbers. |
+ if (isDisabledLink(link)) { |
+ linkHref = ""; |
+ } else { |
+ if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$"); |
+ linkHref = sHrefCleaner.replace(url.toString(), ""); |
+ } |
+ mAdjacentNumbersGroups.addNumber(number, linkHref); |
+ |
+ // Check if the link is followed by a sibling with text, which would determine if |
+ // the current adjacency continues or ends. |
+ checkForSiblingWithText(parentWrapper == null ? link : parentWrapper, Sibling.NEXT); |
+ } else { |
+ // This link is not a valid pagination link, so current group of adjacent numbers |
+ // should be closed, adding a new group if possible. |
+ mAdjacentNumbersGroups.addGroup(); |
+ } |
+ } // for all links |
+ |
+ mAdjacentNumbersGroups.cleanup(); |
+ |
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); |
+ |
+ startTime = DomUtil.getTime(); |
+ PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl); |
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); |
+ return info; |
+ } |
+ |
+ private static RegExp sTermsRegExp = null; // Match terms i.e. words. |
+ private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits. |
+ |
+ /** |
+ * Checks for previous or next sibling with word text. If the text contains digit(s) as terms |
+ * that form a valid page number, the sibling is added to the current group of adjacent |
+ * numbers. Otherwise, the current group of adjacent numbers is closed to end the current |
+ * adjacency, and a new group is started. |
+ * |
+ * @return true if given start node has at least 1 sibling, false otherwise. |
+ |
+ * @param start node to start checking with. |
+ * @param sibling previous or next sibling. |
+ */ |
+ private boolean checkForSiblingWithText(Node start, Sibling sibling) { |
+ Node node = start; |
+ Node prevNode = null; |
+ String text = ""; |
+ // Find the first sibling that has inner text with words. |
+ do { |
+ prevNode = node; |
+ node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.getNextSibling(); |
+ if (node == null && prevNode == start) return false; |
+ if (node == null || node.getNodeType() == Node.DOCUMENT_NODE || |
+ !mProcessedSiblings.add(node)) { // Node has been processed. |
+ return true; |
+ } |
+ |
+ if (node.getNodeType() == Node.TEXT_NODE) { |
+ text = node.getNodeValue(); |
+ } else { |
+ Element e = Element.as(node); |
+ |
+ // Ignore non-void anchors, since we're iterating through them. |
+ if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true; |
+ |
+ // Ignore non-void link children, since we're iterating through them. |
+ NodeList<Element> linkChildren = e.getElementsByTagName("A"); |
+ for (int i = 0; i < linkChildren.getLength(); i++) { |
+ if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true; |
+ } |
+ |
+ text = DomUtil.getInnerText(e); |
+ } |
+ } while (text.isEmpty() || StringUtil.countWords(text) == 0); |
+ |
+ if (!StringUtil.containsDigit(text)) { |
+ // The sibling does not contain valid number(s), so current group of adjacent numbers |
+ // should be closed, adding a new group if possible. |
+ mAdjacentNumbersGroups.addGroup(); |
+ return true; |
+ } |
+ |
+ if (sTermsRegExp == null) { |
+ sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi"); |
+ } else { |
+ sTermsRegExp.setLastIndex(0); |
+ } |
+ if (sSurroundingDigitsRegExp == null) { |
+ sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); |
+ } |
+ |
+ // Extract terms from the text, differentiating between those that contain only digits and |
+ // those that contain non-digits. |
+ while (true) { |
+ MatchResult match = sTermsRegExp.exec(text); |
+ if (match == null) break; |
+ if (match.getGroupCount() <= 1) continue; |
+ |
+ String term = match.getGroup(1); |
+ MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); |
+ int number = -1; |
+ if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { |
+ number = StringUtil.toNumber(termWithDigits.getGroup(1)); |
+ } |
+ if (isPlainPageNumber(number)) { |
+ // This sibling is a valid candidate of plain text page number, add it to last |
+ // group of adjacent numbers. |
+ mAdjacentNumbersGroups.addNumber(number, ""); |
+ } else { |
+ // The sibling is not a valid number, so current group of adjacent numbers |
+ // should be closed, adding a new group if possible. |
+ mAdjacentNumbersGroups.addGroup(); |
+ } |
+ } // while there're matches |
+ |
+ return true; |
+ } |
+ |
+ /** |
+ * @return the topmost parent of the given node that simply wraps the node, i.e. with no more |
+ * inner text than that of given node. |
+ */ |
+ private static Node findParentWrapper(Node node, int nodeTextLen) { |
+ Node parent = node; |
+ Node prevParent = null; |
+ // While keeping track of each parent, once we find the first one that has more text than |
+ // given node, the previous parent would be what we want. |
+ do { |
+ prevParent = parent; |
+ parent = parent.getParentNode(); |
+ } while (parent != null && DomUtil.getInnerText(parent).length() == nodeTextLen); |
+ |
+ return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_NODE ? |
+ null : prevParent; |
+ } |
+ |
+ /** |
+ * @return true if link is disabled i.e. not clickable because it has a text cursor. |
+ */ |
+ private static boolean isDisabledLink(AnchorElement link) { |
+ Style style = DomUtil.getComputedStyle(link); |
+ return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT; |
+ } |
+ |
+ /** |
+ * @return true if link should be ignored when checking for sibling. |
+ * All non-void links will be processed, and hence could be ignored when checking for sibling. |
+ * Void links are rejected at the beginning because their hosts are different from |
+ * originalUrl's, so they should be considered when checking for sibling. |
+ */ |
+ private static boolean shouldIgnoreLinkSibling(Element link) { |
+ return !AnchorElement.as(link).getHref().equals("javascript:void(0)"); |
+ } |
+ |
+ private static int linkTextToNumber(String linkText) { |
+ linkText = linkText.replaceAll("[()\\[\\]{}]", ""); |
+ linkText = linkText.trim(); // Remove leading and trailing whitespaces. |
+ // Remove duplicate internal whitespaces. |
+ linkText = linkText.replaceAll("\\s\\{2,\\}", " "); |
+ return StringUtil.toNumber(linkText); |
+ } |
+ |
+ /** |
+ * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM. |
+ */ |
+ private static boolean isPlainPageNumber(int number) { |
+ return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM; |
+ } |
+ |
+} |