Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(421)

Unified Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/PageParameterParser.java
diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java
new file mode 100644
index 0000000000000000000000000000000000000000..43cfdbac3ce0355e8f248ad26dd0edfb5bc09645
--- /dev/null
+++ b/java/org/chromium/distiller/PageParameterParser.java
@@ -0,0 +1,362 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller;
+
+import org.chromium.distiller.proto.DomDistillerProtos;
+import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
+
+import com.google.gwt.dom.client.AnchorElement;
+import com.google.gwt.dom.client.Document;
+import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.Node;
+import com.google.gwt.dom.client.NodeList;
+import com.google.gwt.dom.client.Style;
+import com.google.gwt.regexp.shared.MatchResult;
+import com.google.gwt.regexp.shared.RegExp;
+
+/**
+ * Background:
+ * The long article/news/forum thread/blog document may be partitioned into several partial pages
+ * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The
+ * anchor text of those outlinks is numeric.
+ *
+ * This class parses the document to collect groups of adjacent plain text numbers and outlinks with
+ * numeric anchor text. These are then passed to PageParameterDetector which would spit out the
+ * pagination URLs if available.
+ */
+public class PageParameterParser {
+ // If the numeric value of a link's anchor text is greater than this number, we don't think it
+ // represents the page number of the link.
+ private static final int MAX_NUM_FOR_PAGE_PARAM = 100;
+
+ /**
+  * Pairs a PageParamInfo.PageInfo with the anchor text it was built from.  This is the value
+  * handed back by getPageInfoAndText().
+  */
+ private static class PageInfoAndText {
+     // Page number and URL of the link.
+     private final PageParamInfo.PageInfo mPageInfo;
+     // Anchor text of the link.
+     private final String mText;
+
+     PageInfoAndText(int number, String url, String text) {
+         this.mText = text;
+         this.mPageInfo = new PageParamInfo.PageInfo(number, url);
+     }
+ }
+
+ /**
+  * Entry point for PageParameterParser.
+  * Scans the current document for outlinks whose anchor text is numeric, together with the
+  * numeric text around them, and passes what was collected to PageParameterDetector to detect
+  * pagination URLs.
+  *
+  * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or
+  * determined to be best, its mType is PageParamInfo.Type.UNSET.
+  *
+  * @param originalUrl the original URL of the document to be parsed.
+  * @param timingInfo for tracking performance.
+  */
+ public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
+     final Element docRoot = Document.get().getDocumentElement();
+     return new PageParameterParser(timingInfo).parseDocument(docRoot, originalUrl);
+ }
+
+ // Receives the timing entries recorded by parseDocument().
+ private final TimingInfo mTimingInfo;
+ // Original URL of the document; assigned at the start of parseDocument().
+ private String mDocUrl = "";
+ // Parsed form of mDocUrl; its host is compared against the host of each candidate link.
+ private ParsedUrl mParsedUrl = null;
+ // Groups of adjacent numbers collected while parsing; passed to PageParameterDetector.detect().
+ private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();
+
+ // Compiled on first use in getPageInfoAndText(); matches an optional trailing '/' followed by
+ // an optional "#..." suffix at the end of a URL string.
+ private static RegExp sHrefCleaner = null;
+
+ // Private: within this file, instances are created by the static parse() entry point.
+ private PageParameterParser(TimingInfo timingInfo) {
+ mTimingInfo = timingInfo;
+ }
+
+ /**
+ * Actually implements PageParameterParser.parse(); see the description of parse() above.
+ *
+ * Walks every anchor under root, collects candidate page-number links together with the
+ * numeric text around them into mAdjacentNumbersGroups, then hands the groups to
+ * PageParameterDetector.detect().
+ *
+ * @return the PageParamInfo produced by PageParameterDetector.detect(), or a default-constructed
+ * PageParamInfo if originalUrl cannot be parsed.
+ *
+ * @param root element whose descendant anchors are examined.
+ * @param originalUrl the original URL of the document to be parsed.
+ */
+ private PageParamInfo parseDocument(Element root, String originalUrl) {
+ double startTime = DomUtil.getTime();
+
+ mDocUrl = originalUrl;
+ mParsedUrl = ParsedUrl.create(mDocUrl);
+ if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.
+
+ // Passed to every getPageInfoAndText()/addFollowingSiblings() call below; presumably the
+ // anchor against which relative hrefs are resolved -- TODO confirm in PagingLinksFinder.
+ AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(
+ PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));
+
+ NodeList<Element> allLinks = root.getElementsByTagName("A");
+ int idx = 0;
+ while (idx < allLinks.getLength()) {
+ final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));
+ PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
+ // A null result means the link is to be ignored; move on to the next one.
+ if (pageInfoAndText == null) {
+ idx++;
+ continue;
+ }
+
+ // This link is a good candidate for pagination.
+
+ // Close current group of adjacent numbers, add a new group if necessary.
+ mAdjacentNumbersGroups.addGroup();
+
+ // Before we append the link to the new group of adjacent numbers, check if it's
+ // preceded by a sibling with text; if so, add it before the link.
+ Node parentWrapper = null;
cjhopman 2015/07/29 01:07:53 What's this parent wrapper? I don't recall that be
kuan 2015/07/30 16:47:00 i had it in the previous change, and attempted to
cjhopman 2015/08/04 21:58:41 But why the parent wrapper thing? Why not just wal
kuan 2015/08/04 22:38:37 what do u mean by "backwards/forwards in the tree"
kuan 2015/08/11 19:09:38 Done.
+ if (!checkForPrevSiblingWithText(link)) { // Link has no sibling.
+ // The link could be a child of a parent that is simply a wrapper, i.e. with no
+ // extra text, in which case, we should be checking the siblings of the topmost
+ // parent wrapper.
+ parentWrapper = findParentWrapper(link, pageInfoAndText.mText.length());
+ if (parentWrapper != null) checkForPrevSiblingWithText(parentWrapper);
+ }
+
+ // Add the link to the current group of adjacent numbers.
+ mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
+
+ // Add all following siblings with numeric text, with or without links.
+ int numLinksAdded = 0;
+ if (parentWrapper == null)
+ numLinksAdded = addFollowingSiblings(link, false, baseAnchor);
+ else
+ numLinksAdded = addFollowingSiblings(parentWrapper, true, baseAnchor);
+
+ // Skip the current link and links already processed in addFollowingSiblings().
+ // NOTE(review): this assumes the links counted by addFollowingSiblings() are exactly the
+ // next numLinksAdded entries of allLinks; verify for following siblings that merely
+ // contain links when no parent wrapper is used.
+ idx += 1 + numLinksAdded;
+ } // while there're links.
+
+ mAdjacentNumbersGroups.cleanup();
+
+ // Parsing and detection are timed as two separate entries.
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");
+
+ startTime = DomUtil.getTime();
+ PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
+ return info;
+ }
+
+
+ /**
+ * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups.
+ * Otherwise, returns null if link is to be ignored.
+ * "javascript:void" links with numeric text are considered valid links to be added.
+ *
+ * @param link to process.
+ * @param baseAnchor created for the current document.
+ */
+ private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {
+ // Ignore invisible links.
+ int width = link.getOffsetWidth();
+ int height = link.getOffsetHeight();
+ if (width == 0 || height == 0 || !DomUtil.isVisible(link)) return null;
cjhopman 2015/07/29 01:07:53 It seems odd that invisible links are handled here
kuan 2015/07/30 16:47:00 invisible links need to be ignored. i do this her
+
+ String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor);
+ boolean isVoidLink = isVoidHref(linkHref);
+ ParsedUrl url = ParsedUrl.create(linkHref);
+ // Reject hrefs that cannot be parsed and, unless the link is a void link, links whose host
+ // differs from the document's host.
+ if (url == null || (!isVoidLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) {
+ return null;
+ }
+
+ // Clear the fragment before the URL is converted to a string below.
+ url.setHash("");
+
+ // Use javascript innerText (instead of javascript textContent) to only get visible text.
+ String linkText = DomUtil.getInnerText(link);
+ int number = linkTextToNumber(linkText);
+ if (!isPlainPageNumber(number)) return null;
+
+ // Void and disabled links keep their page number but are recorded with an empty URL.
+ if (isVoidLink || isDisabledLink(link)) return new PageInfoAndText(number, "", linkText);
+
+ // Compiled on first use; removes an optional trailing '/' and any "#..." suffix from the
+ // end of the URL string.
+ if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$");
+ return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);
+ }
+
+ /**
+ * Checks for previous sibling with word text. If the text contains digit(s) as terms that
+ * form a valid page number, the sibling is added to the current group of adjacent numbers.
+ * Otherwise, the current group of adjacent numbers is closed to end the current adjacency, and
+ * a new group is started.
+ *
+ * @return true if given start node has at least 1 previous sibling, false otherwise.
+ *
+ * @param start node to start checking with.
+ */
+ private boolean checkForPrevSiblingWithText(Node start) {
cjhopman 2015/07/29 01:07:53 I'm having difficulty understanding both the way t
kuan 2015/07/30 16:47:00 i initially had the check for previous and next nu
+ Node node = start;
+ Node prevNode = null;
+ String text = "";
+ // Find the first previous sibling that has inner text with words.
+ do {
+ prevNode = node;
+ node = node.getPreviousSibling();
+ // First step and nothing before the start node: it has no previous sibling at all.
+ if (node == null && prevNode == start) return false;
+ // Ran out of previous siblings (or reached a document node) without finding word text.
+ if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return true;
+
+ if (node.getNodeType() == Node.TEXT_NODE) {
+ text = node.getNodeValue();
+ } else {
+ Element e = Element.as(node);
+ // Previous link siblings or children have already been processed.
+ if (e.hasTagName("A") || e.getElementsByTagName("A").getLength() > 0) return true;
+ text = DomUtil.getInnerText(e);
+ }
+ // Keep walking backwards while the sibling's text is empty or contains no words.
+ } while (text.isEmpty() || StringUtil.countWords(text) == 0);
+
+ addNumberText(text);
+ return true;
+ }
+
+ /**
+  * Adds all following siblings (links and non-links) with numeric text. If the text contains
+  * digit(s) as terms that form a valid page number, the sibling is added to the current group of
+  * adjacent numbers. Otherwise, the current group of adjacent numbers is closed to end the
+  * current adjacency, and a new group is started.
+  *
+  * @return number of links processed, i.e. passed to addValidLink(), whether or not they were
+  * actually added.
+  *
+  * @param start node to start checking with.
+  * @param isParentWrapper true if given start node is a parent wrapper of a link.
+  * @param baseAnchor created for the current document.
+  */
+ private int addFollowingSiblings(Node start, boolean isParentWrapper,
+         AnchorElement baseAnchor) {
+     int numLinksProcessed = 0;
+     // Visit every following sibling; stop at the end of the sibling list or at a document node.
+     for (Node node = start.getNextSibling();
+             node != null && node.getNodeType() != Node.DOCUMENT_NODE;
+             node = node.getNextSibling()) {
+         String text;
+         if (node.getNodeType() == Node.TEXT_NODE) {
+             text = node.getNodeValue();
+         } else {
+             Element e = Element.as(node);
+             boolean handled = false;
+             if (e.hasTagName("A")) {
+                 addValidLink(AnchorElement.as(e), baseAnchor);
+                 numLinksProcessed++;
+                 handled = true;
+             } else if (isParentWrapper) {
+                 // Siblings of a parent wrapper may wrap links themselves; process those links.
+                 NodeList<Element> linkChildren = e.getElementsByTagName("A");
+                 final int numChildren = linkChildren.getLength();
+                 for (int i = 0; i < numChildren; i++) {
+                     addValidLink(AnchorElement.as(linkChildren.getItem(i)), baseAnchor);
+                     numLinksProcessed++;
+                 }
+                 handled = numChildren > 0;
+             }
+
+             // Elements whose links were processed above contribute no plain text.
+             text = handled ? "" : DomUtil.getInnerText(e);
+         }
+
+         if (!text.isEmpty() && StringUtil.countWords(text) > 0) addNumberText(text);
+     }
+     return numLinksProcessed;
+ }
+
+ private static RegExp sTermsRegExp = null; // Match terms i.e. words.
+ private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.
+
+ /**
+ * Add PageParamInfo.PageInfo for a non-link with numeric text.
cjhopman 2015/07/29 01:07:53 It looks like the text doesn't have to be strictly
kuan 2015/07/30 16:47:00 Done. renamed fn too.
+ */
+ private void addNumberText(String text) {
+ if (!StringUtil.containsDigit(text)) {
+ // The sibling does not contain valid number(s); if necessary, current group of adjacent
+ // numbers should be closed, adding a new group if possible.
+ mAdjacentNumbersGroups.addGroup();
+ return;
+ }
+
+ if (sTermsRegExp == null) {
+ sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");
+ } else {
+ sTermsRegExp.setLastIndex(0);
+ }
+ if (sSurroundingDigitsRegExp == null) {
+ sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
+ }
+
+ // Extract terms from the text, differentiating between those that contain only digits and
+ // those that contain non-digits.
+ while (true) {
+ MatchResult match = sTermsRegExp.exec(text);
+ if (match == null) break;
+ if (match.getGroupCount() <= 1) continue;
+
+ String term = match.getGroup(1);
+ MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
+ int number = -1;
+ if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
+ number = StringUtil.toNumber(termWithDigits.getGroup(1));
+ }
+ if (isPlainPageNumber(number)) {
+ // This sibling is a valid candidate of plain text page number, add it to last
+ // group of adjacent numbers.
+ mAdjacentNumbersGroups.addNumber(number, "");
+ } else {
+ // The sibling is not a valid number, so current group of adjacent numbers
+ // should be closed, adding a new group if possible.
+ mAdjacentNumbersGroups.addGroup();
+ }
+ } // while there're matches
+ }
+
+ /**
+ * Adds the PageParamInfo.PageInfo of a link to the current group of adjacent numbers, but only
+ * if getPageInfoAndText() accepts the link; otherwise the link is silently ignored.
+ *
+ * @param link to process.
+ * @param baseAnchor created for the current document.
+ */
+ private void addValidLink(AnchorElement link, AnchorElement baseAnchor) {
cjhopman 2015/07/29 01:07:53 probably rename this to addLinkIfValid() since it
kuan 2015/07/30 16:47:00 Done. this fn is created simply to prevent duplic
+ PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
+ // A null result means the link is to be ignored.
+ if (pageInfoAndText != null) mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
+ }
+
+ /**
+  * Looks for the topmost ancestor of the given node that merely wraps it, i.e. whose inner text
+  * has the same length as that of the given node.
+  *
+  * @return that ancestor, or null if the node has no such wrapper or the topmost wrapper found
+  * is a document node.
+  *
+  * @param node the node whose wrapper is wanted.
+  * @param nodeTextLen length of the node's inner text.
+  */
+ private static Node findParentWrapper(Node node, int nodeTextLen) {
+     // Climb while the ancestor's text is no longer than the node's own; the last ancestor
+     // passing that test is the topmost wrapper.
+     Node topmostWrapper = node;
+     Node ancestor = node.getParentNode();
+     while (ancestor != null && DomUtil.getInnerText(ancestor).length() == nodeTextLen) {
+         topmostWrapper = ancestor;
+         ancestor = ancestor.getParentNode();
+     }
+
+     if (topmostWrapper == node) return null;  // Never moved up: there is no wrapper.
+     if (topmostWrapper.getNodeType() == Node.DOCUMENT_NODE) return null;
+     return topmostWrapper;
+ }
+
+ /**
+  * @return true if link is disabled i.e. not clickable because it has a text cursor.
+  *
+  * @param link whose computed cursor style is inspected.
+  */
+ private static boolean isDisabledLink(AnchorElement link) {
+     Style style = DomUtil.getComputedStyle(link);
+     // Compare against the CSS name rather than going through Style.Cursor.valueOf(): the
+     // computed cursor can be any CSS value (e.g. "not-allowed", "e-resize" or ""), and
+     // valueOf() throws IllegalArgumentException for every string that is not the exact name
+     // of an enum constant.
+     String cursor = style.getCursor();
+     return cursor != null && cursor.equalsIgnoreCase(Style.Cursor.TEXT.getCssName());
+ }
+
+ /**
+ * @return true if href is "javascript:void(0)".
+ */
+ private static boolean isVoidHref(String href) {
+ return href.equals("javascript:void(0)");
+ }
+
+ /**
+  * Converts the anchor text of a link to a number.
+  * Parentheses, brackets and braces are removed, surrounding whitespace is trimmed and runs of
+  * internal whitespace are collapsed to a single space before the text is handed to
+  * StringUtil.toNumber().
+  *
+  * @return whatever StringUtil.toNumber() returns for the normalized text.
+  *
+  * @param linkText anchor text of the link.
+  */
+ private static int linkTextToNumber(String linkText) {
+     linkText = linkText.replaceAll("[()\\[\\]{}]", "");
+     linkText = linkText.trim();  // Remove leading and trailing whitespaces.
+     // Remove duplicate internal whitespaces.  The quantifier braces must not be escaped:
+     // "\\s\\{2,\\}" only matches a whitespace followed by the literal text "{2,}", so it
+     // never collapsed anything.
+     linkText = linkText.replaceAll("\\s{2,}", " ");
+     return StringUtil.toNumber(linkText);
+ }
+
+ /**
+  * @return true if number is non-negative and below MAX_NUM_FOR_PAGE_PARAM.
+  *
+  * @param number candidate page number.
+  */
+ private static boolean isPlainPageNumber(int number) {
+     if (number < 0) return false;
+     return number < MAX_NUM_FOR_PAGE_PARAM;
+ }
+
+}
« no previous file with comments | « java/org/chromium/distiller/MonotonicPageInfosGroups.java ('k') | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698