Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1876)

Unified Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: java/org/chromium/distiller/PageParameterParser.java
diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java
new file mode 100644
index 0000000000000000000000000000000000000000..70c1e8dea62a0d0ea0d60fcd97a1f64667095fa7
--- /dev/null
+++ b/java/org/chromium/distiller/PageParameterParser.java
@@ -0,0 +1,284 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller;
+
+import org.chromium.distiller.proto.DomDistillerProtos;
+import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
+
+import com.google.gwt.dom.client.AnchorElement;
+import com.google.gwt.dom.client.Document;
+import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.Node;
+import com.google.gwt.dom.client.NodeList;
+import com.google.gwt.dom.client.Style;
+import com.google.gwt.regexp.shared.MatchResult;
+import com.google.gwt.regexp.shared.RegExp;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Background:
+ * The long article/news/forum thread/blog document may be partitioned into several partial pages
+ * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The
+ * anchor text of those outlinks is numeric.
+ *
+ * This class parses the document to collect groups of adjacent plain text numbers and outlinks with
+ * digital anchor text. These are then passed to PageParameterParser which would spit out the
+ * pagination URLs if available.
+ */
+public class PageParameterParser {
+ // If the numeric value of a link's anchor text is greater than this number, we don't think it
+ // represents the page number of the link.
+ private static final int MAX_NUM_FOR_PAGE_PARAM = 100;
+
+ /**
+ * Type of sibling to check.
+ */
+ private static enum Sibling {
+ PREV, // Node.getPreviousSibling().
+ NEXT, // Node.getNextSibling().
+ }
+
+ /**
+ * Entry point for PageParameterParser.
+ * Parses the document to collect outlinks with digital anchor text and numeric text around
+ * them. These are then passed to PageParameterParser to detect pagination URLs.
+ *
+ * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or
+ * determined to be best, its mType is PageParamInfo.Type.UNSET.
+ *
+ * @param originalUrl the original URL of the document to be parsed.
+ * @param timingInfo for tracking performance.
+ */
+ public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
+ PageParameterParser parser = new PageParameterParser(timingInfo);
+ return parser.parseDocument(Document.get().getDocumentElement(), originalUrl);
+ }
+
+ private final TimingInfo mTimingInfo;
+ private String mDocUrl = "";
+ private ParsedUrl mParsedUrl = null;
+ private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();
+ // Siblings that have been processed before and after each link.
+ private final Set<Node> mProcessedSiblings = new HashSet<Node>();
+
+ private static RegExp sHrefCleaner = null;
+
+ private PageParameterParser(TimingInfo timingInfo) {
+ mTimingInfo = timingInfo;
+ }
+
+ /**
+ * Acutually implements PageParameterParser.parse(), see above description for parse().
+ */
+ private PageParamInfo parseDocument(Element root, String originalUrl) {
+ double startTime = DomUtil.getTime();
+
+ mDocUrl = originalUrl;
+ mParsedUrl = ParsedUrl.create(mDocUrl);
+ if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.
+
+ mAdjacentNumbersGroups.addGroup();
+ AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(
+ PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));
+
+ NodeList<Element> allLinks = root.getElementsByTagName("A");
cjhopman 2015/06/26 00:35:25 What do you think of this alternative approach: a
kuan 2015/07/13 22:57:03 Done.
+ for (int i = 0; i < allLinks.getLength(); i++) {
+ final AnchorElement link = AnchorElement.as(allLinks.getItem(i));
+
+ String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor);
+ ParsedUrl url = ParsedUrl.create(linkHref);
+ if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHost())) continue;
+ url.setHash("");
+
+ // Use javascript innerText (instead of javascript textContent) to only get visible
+ // text.
+ String linkText = DomUtil.getInnerText(link);
+ int number = linkTextToNumber(linkText);
+ if (isPlainPageNumber(number)) {
+ // Before we append the link to the current group of adjacent numbers, check if it's
+ // preceded by a sibling with text, which would determine if the current adjacency
+ // continues or ends.
+ Node parentWrapper = null;
+ if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling.
+ // The link could be a child of a parent that is simply a wrapper, i.e. with no
+ // extra text, in which case, we should be checking the siblings of the topmost
+ // parent wrapper.
+ parentWrapper = findParentWrapper(link, linkText.length());
+ if (parentWrapper != null) checkForSiblingWithText(parentWrapper, Sibling.PREV);
+ }
+
+ // This link is a good candidate for pagination, add it to the current group of
+ // adjacent numbers.
+ if (isDisabledLink(link)) {
+ linkHref = "";
+ } else {
+ if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$");
+ linkHref = sHrefCleaner.replace(url.toString(), "");
+ }
+ mAdjacentNumbersGroups.addNumber(number, linkHref);
+
+ // Check if the link is followed by a sibling with text, which would determine if
+ // the current adjacency continues or ends.
+ checkForSiblingWithText(parentWrapper == null ? link : parentWrapper, Sibling.NEXT);
+ } else {
+ // This link is not a valid pagination link, so current group of adjacent numbers
+ // should be closed, adding a new group if possible.
+ mAdjacentNumbersGroups.addGroup();
+ }
+ } // for all links
+
+ mAdjacentNumbersGroups.cleanup();
+
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");
+
+ startTime = DomUtil.getTime();
+ PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
+ return info;
+ }
+
+ private static RegExp sTermsRegExp = null; // Match terms i.e. words.
+ private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.
+
+ /**
+ * Checks for previous or next sibling with word text. If the text contains digit(s) as terms
+ * that form a valid page number, the sibling is added to the current group of adjacent
+ * numbers. Otherwise, the current group of adjacent numbers is closed to end the current
+ * adjacency, and a new group is started.
+ *
+ * @return true if given start node has at least 1 sibling, false otherwise.
+
+ * @param start node to start checking with.
+ * @param sibling previous or next sibling.
+ */
+ private boolean checkForSiblingWithText(Node start, Sibling sibling) {
+ Node node = start;
+ Node prevNode = null;
+ String text = "";
+ // Find the first sibling that has inner text with words.
+ do {
+ prevNode = node;
+ node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.getNextSibling();
+ if (node == null && prevNode == start) return false;
+ if (node == null || node.getNodeType() == Node.DOCUMENT_NODE ||
+ !mProcessedSiblings.add(node)) { // Node has been processed.
+ return true;
+ }
+
+ if (node.getNodeType() == Node.TEXT_NODE) {
+ text = node.getNodeValue();
+ } else {
+ Element e = Element.as(node);
+
+ // Ignore non-void anchors, since we're iterating through them.
+ if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true;
+
+ // Ignore non-void link children, since we're iterating through them.
+ NodeList<Element> linkChildren = e.getElementsByTagName("A");
+ for (int i = 0; i < linkChildren.getLength(); i++) {
+ if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true;
+ }
+
+ text = DomUtil.getInnerText(e);
+ }
+ } while (text.isEmpty() || StringUtil.countWords(text) == 0);
+
+ if (!StringUtil.containsDigit(text)) {
+ // The sibling does not contain valid number(s), so current group of adjacent numbers
+ // should be closed, adding a new group if possible.
+ mAdjacentNumbersGroups.addGroup();
+ return true;
+ }
+
+ if (sTermsRegExp == null) {
+ sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");
+ } else {
+ sTermsRegExp.setLastIndex(0);
+ }
+ if (sSurroundingDigitsRegExp == null) {
+ sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
+ }
+
+ // Extract terms from the text, differentiating between those that contain only digits and
+ // those that contain non-digits.
+ while (true) {
+ MatchResult match = sTermsRegExp.exec(text);
+ if (match == null) break;
+ if (match.getGroupCount() <= 1) continue;
+
+ String term = match.getGroup(1);
+ MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
+ int number = -1;
+ if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
+ number = StringUtil.toNumber(termWithDigits.getGroup(1));
+ }
+ if (isPlainPageNumber(number)) {
+ // This sibling is a valid candidate of plain text page number, add it to last
+ // group of adjacent numbers.
+ mAdjacentNumbersGroups.addNumber(number, "");
+ } else {
+ // The sibling is not a valid number, so current group of adjacent numbers
+ // should be closed, adding a new group if possible.
+ mAdjacentNumbersGroups.addGroup();
+ }
+ } // while there're matches
+
+ return true;
+ }
+
+ /**
+ * @return the topmost parent of the given node that simply wraps the node, i.e. with no more
+ * inner text than that of given node.
+ */
+ private static Node findParentWrapper(Node node, int nodeTextLen) {
+ Node parent = node;
+ Node prevParent = null;
+ // While keeping track of each parent, once we find the first one that has more text than
+ // given node, the previous parent would be what we want.
+ do {
+ prevParent = parent;
+ parent = parent.getParentNode();
+ } while (parent != null && DomUtil.getInnerText(parent).length() == nodeTextLen);
+
+ return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_NODE ?
+ null : prevParent;
+ }
+
+ /**
+ * @return true if link is disabled i.e. not clickable because it has a text cursor.
+ */
+ private static boolean isDisabledLink(AnchorElement link) {
+ Style style = DomUtil.getComputedStyle(link);
+ return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT;
+ }
+
+ /**
+ * @return true if link should be ignored when checking for sibling.
+ * All non-void links will be processed, and hence could be ignored when checking for sibling.
+ * Void links are rejected at the beginning because their hosts are different from
+ * originalUrl's, so they should be considered when checking for sibling.
+ */
+ private static boolean shouldIgnoreLinkSibling(Element link) {
+ return !AnchorElement.as(link).getHref().equals("javascript:void(0)");
+ }
+
+ private static int linkTextToNumber(String linkText) {
+ linkText = linkText.replaceAll("[()\\[\\]{}]", "");
+ linkText = linkText.trim(); // Remove leading and trailing whitespaces.
+ // Remove duplicate internal whitespaces.
+ linkText = linkText.replaceAll("\\s\\{2,\\}", " ");
+ return StringUtil.toNumber(linkText);
+ }
+
+ /**
+ * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.
+ */
+ private static boolean isPlainPageNumber(int number) {
+ return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM;
+ }
+
+}
« no previous file with comments | « no previous file | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698