Chromium Code Reviews| Index: java/org/chromium/distiller/PageParameterParser.java | 
| diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java | 
| new file mode 100644 | 
| index 0000000000000000000000000000000000000000..70c1e8dea62a0d0ea0d60fcd97a1f64667095fa7 | 
| --- /dev/null | 
| +++ b/java/org/chromium/distiller/PageParameterParser.java | 
| @@ -0,0 +1,284 @@ | 
| +// Copyright 2015 The Chromium Authors. All rights reserved. | 
| +// Use of this source code is governed by a BSD-style license that can be | 
| +// found in the LICENSE file. | 
| + | 
| +package org.chromium.distiller; | 
| + | 
| +import org.chromium.distiller.proto.DomDistillerProtos; | 
| +import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; | 
| + | 
| +import com.google.gwt.dom.client.AnchorElement; | 
| +import com.google.gwt.dom.client.Document; | 
| +import com.google.gwt.dom.client.Element; | 
| +import com.google.gwt.dom.client.Node; | 
| +import com.google.gwt.dom.client.NodeList; | 
| +import com.google.gwt.dom.client.Style; | 
| +import com.google.gwt.regexp.shared.MatchResult; | 
| +import com.google.gwt.regexp.shared.RegExp; | 
| + | 
| +import java.util.HashSet; | 
| +import java.util.Set; | 
| + | 
| +/** | 
| + * Background: | 
| + * The long article/news/forum thread/blog document may be partitioned into several partial pages | 
| + * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The | 
| + * anchor text of those outlinks is numeric. | 
| + * | 
| + * This class parses the document to collect groups of adjacent plain text numbers and outlinks with | 
| + * digital anchor text. These are then passed to PageParameterParser which would spit out the | 
| + * pagination URLs if available. | 
| + */ | 
| +public class PageParameterParser { | 
| + // If the numeric value of a link's anchor text is greater than this number, we don't think it | 
| + // represents the page number of the link. | 
| + private static final int MAX_NUM_FOR_PAGE_PARAM = 100; | 
| + | 
| + /** | 
| + * Type of sibling to check. | 
| + */ | 
| + private static enum Sibling { | 
| + PREV, // Node.getPreviousSibling(). | 
| + NEXT, // Node.getNextSibling(). | 
| + } | 
| + | 
| + /** | 
| + * Entry point for PageParameterParser. | 
| + * Parses the document to collect outlinks with digital anchor text and numeric text around | 
| + * them. These are then passed to PageParameterParser to detect pagination URLs. | 
| + * | 
| + * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or | 
| + * determined to be best, its mType is PageParamInfo.Type.UNSET. | 
| + * | 
| + * @param originalUrl the original URL of the document to be parsed. | 
| + * @param timingInfo for tracking performance. | 
| + */ | 
| + public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { | 
| + PageParameterParser parser = new PageParameterParser(timingInfo); | 
| + return parser.parseDocument(Document.get().getDocumentElement(), originalUrl); | 
| + } | 
| + | 
| + private final TimingInfo mTimingInfo; | 
| + private String mDocUrl = ""; | 
| + private ParsedUrl mParsedUrl = null; | 
| + private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups(); | 
| + // Siblings that have been processed before and after each link. | 
| + private final Set<Node> mProcessedSiblings = new HashSet<Node>(); | 
| + | 
| + private static RegExp sHrefCleaner = null; | 
| + | 
| + private PageParameterParser(TimingInfo timingInfo) { | 
| + mTimingInfo = timingInfo; | 
| + } | 
| + | 
| + /** | 
| + * Acutually implements PageParameterParser.parse(), see above description for parse(). | 
| + */ | 
| + private PageParamInfo parseDocument(Element root, String originalUrl) { | 
| + double startTime = DomUtil.getTime(); | 
| + | 
| + mDocUrl = originalUrl; | 
| + mParsedUrl = ParsedUrl.create(mDocUrl); | 
| + if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. | 
| + | 
| + mAdjacentNumbersGroups.addGroup(); | 
| + AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( | 
| + PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); | 
| + | 
| + NodeList<Element> allLinks = root.getElementsByTagName("A"); | 
| 
 
cjhopman
2015/06/26 00:35:25
What do you think of this alternative approach:
a
 
kuan
2015/07/13 22:57:03
Done.
 
 | 
| + for (int i = 0; i < allLinks.getLength(); i++) { | 
| + final AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | 
| + | 
| + String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor); | 
| + ParsedUrl url = ParsedUrl.create(linkHref); | 
| + if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHost())) continue; | 
| + url.setHash(""); | 
| + | 
| + // Use javascript innerText (instead of javascript textContent) to only get visible | 
| + // text. | 
| + String linkText = DomUtil.getInnerText(link); | 
| + int number = linkTextToNumber(linkText); | 
| + if (isPlainPageNumber(number)) { | 
| + // Before we append the link to the current group of adjacent numbers, check if it's | 
| + // preceded by a sibling with text, which would determine if the current adjacency | 
| + // continues or ends. | 
| + Node parentWrapper = null; | 
| + if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling. | 
| + // The link could be a child of a parent that is simply a wrapper, i.e. with no | 
| + // extra text, in which case, we should be checking the siblings of the topmost | 
| + // parent wrapper. | 
| + parentWrapper = findParentWrapper(link, linkText.length()); | 
| + if (parentWrapper != null) checkForSiblingWithText(parentWrapper, Sibling.PREV); | 
| + } | 
| + | 
| + // This link is a good candidate for pagination, add it to the current group of | 
| + // adjacent numbers. | 
| + if (isDisabledLink(link)) { | 
| + linkHref = ""; | 
| + } else { | 
| + if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$"); | 
| + linkHref = sHrefCleaner.replace(url.toString(), ""); | 
| + } | 
| + mAdjacentNumbersGroups.addNumber(number, linkHref); | 
| + | 
| + // Check if the link is followed by a sibling with text, which would determine if | 
| + // the current adjacency continues or ends. | 
| + checkForSiblingWithText(parentWrapper == null ? link : parentWrapper, Sibling.NEXT); | 
| + } else { | 
| + // This link is not a valid pagination link, so current group of adjacent numbers | 
| + // should be closed, adding a new group if possible. | 
| + mAdjacentNumbersGroups.addGroup(); | 
| + } | 
| + } // for all links | 
| + | 
| + mAdjacentNumbersGroups.cleanup(); | 
| + | 
| + LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); | 
| + | 
| + startTime = DomUtil.getTime(); | 
| + PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl); | 
| + LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); | 
| + return info; | 
| + } | 
| + | 
| + private static RegExp sTermsRegExp = null; // Match terms i.e. words. | 
| + private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits. | 
| + | 
| + /** | 
| + * Checks for previous or next sibling with word text. If the text contains digit(s) as terms | 
| + * that form a valid page number, the sibling is added to the current group of adjacent | 
| + * numbers. Otherwise, the current group of adjacent numbers is closed to end the current | 
| + * adjacency, and a new group is started. | 
| + * | 
| + * @return true if given start node has at least 1 sibling, false otherwise. | 
| + | 
| + * @param start node to start checking with. | 
| + * @param sibling previous or next sibling. | 
| + */ | 
| + private boolean checkForSiblingWithText(Node start, Sibling sibling) { | 
| + Node node = start; | 
| + Node prevNode = null; | 
| + String text = ""; | 
| + // Find the first sibling that has inner text with words. | 
| + do { | 
| + prevNode = node; | 
| + node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.getNextSibling(); | 
| + if (node == null && prevNode == start) return false; | 
| + if (node == null || node.getNodeType() == Node.DOCUMENT_NODE || | 
| + !mProcessedSiblings.add(node)) { // Node has been processed. | 
| + return true; | 
| + } | 
| + | 
| + if (node.getNodeType() == Node.TEXT_NODE) { | 
| + text = node.getNodeValue(); | 
| + } else { | 
| + Element e = Element.as(node); | 
| + | 
| + // Ignore non-void anchors, since we're iterating through them. | 
| + if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true; | 
| + | 
| + // Ignore non-void link children, since we're iterating through them. | 
| + NodeList<Element> linkChildren = e.getElementsByTagName("A"); | 
| + for (int i = 0; i < linkChildren.getLength(); i++) { | 
| + if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true; | 
| + } | 
| + | 
| + text = DomUtil.getInnerText(e); | 
| + } | 
| + } while (text.isEmpty() || StringUtil.countWords(text) == 0); | 
| + | 
| + if (!StringUtil.containsDigit(text)) { | 
| + // The sibling does not contain valid number(s), so current group of adjacent numbers | 
| + // should be closed, adding a new group if possible. | 
| + mAdjacentNumbersGroups.addGroup(); | 
| + return true; | 
| + } | 
| + | 
| + if (sTermsRegExp == null) { | 
| + sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi"); | 
| + } else { | 
| + sTermsRegExp.setLastIndex(0); | 
| + } | 
| + if (sSurroundingDigitsRegExp == null) { | 
| + sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); | 
| + } | 
| + | 
| + // Extract terms from the text, differentiating between those that contain only digits and | 
| + // those that contain non-digits. | 
| + while (true) { | 
| + MatchResult match = sTermsRegExp.exec(text); | 
| + if (match == null) break; | 
| + if (match.getGroupCount() <= 1) continue; | 
| + | 
| + String term = match.getGroup(1); | 
| + MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); | 
| + int number = -1; | 
| + if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { | 
| + number = StringUtil.toNumber(termWithDigits.getGroup(1)); | 
| + } | 
| + if (isPlainPageNumber(number)) { | 
| + // This sibling is a valid candidate of plain text page number, add it to last | 
| + // group of adjacent numbers. | 
| + mAdjacentNumbersGroups.addNumber(number, ""); | 
| + } else { | 
| + // The sibling is not a valid number, so current group of adjacent numbers | 
| + // should be closed, adding a new group if possible. | 
| + mAdjacentNumbersGroups.addGroup(); | 
| + } | 
| + } // while there're matches | 
| + | 
| + return true; | 
| + } | 
| + | 
| + /** | 
| + * @return the topmost parent of the given node that simply wraps the node, i.e. with no more | 
| + * inner text than that of given node. | 
| + */ | 
| + private static Node findParentWrapper(Node node, int nodeTextLen) { | 
| + Node parent = node; | 
| + Node prevParent = null; | 
| + // While keeping track of each parent, once we find the first one that has more text than | 
| + // given node, the previous parent would be what we want. | 
| + do { | 
| + prevParent = parent; | 
| + parent = parent.getParentNode(); | 
| + } while (parent != null && DomUtil.getInnerText(parent).length() == nodeTextLen); | 
| + | 
| + return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_NODE ? | 
| + null : prevParent; | 
| + } | 
| + | 
| + /** | 
| + * @return true if link is disabled i.e. not clickable because it has a text cursor. | 
| + */ | 
| + private static boolean isDisabledLink(AnchorElement link) { | 
| + Style style = DomUtil.getComputedStyle(link); | 
| + return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT; | 
| + } | 
| + | 
| + /** | 
| + * @return true if link should be ignored when checking for sibling. | 
| + * All non-void links will be processed, and hence could be ignored when checking for sibling. | 
| + * Void links are rejected at the beginning because their hosts are different from | 
| + * originalUrl's, so they should be considered when checking for sibling. | 
| + */ | 
| + private static boolean shouldIgnoreLinkSibling(Element link) { | 
| + return !AnchorElement.as(link).getHref().equals("javascript:void(0)"); | 
| + } | 
| + | 
| + private static int linkTextToNumber(String linkText) { | 
| + linkText = linkText.replaceAll("[()\\[\\]{}]", ""); | 
| + linkText = linkText.trim(); // Remove leading and trailing whitespaces. | 
| + // Remove duplicate internal whitespaces. | 
| + linkText = linkText.replaceAll("\\s\\{2,\\}", " "); | 
| + return StringUtil.toNumber(linkText); | 
| + } | 
| + | 
| + /** | 
| + * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM. | 
| + */ | 
| + private static boolean isPlainPageNumber(int number) { | 
| + return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM; | 
| + } | 
| + | 
| +} |