java/org/chromium/distiller/PageParameterParser.java - Issue 1178633002: implement parser for new pagination algorithm

Unified Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: addr chris's comments Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: java/org/chromium/distiller/PageParameterParser.java

diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java

new file mode 100644

index 0000000000000000000000000000000000000000..43cfdbac3ce0355e8f248ad26dd0edfb5bc09645

--- /dev/null

+++ b/java/org/chromium/distiller/PageParameterParser.java

@@ -0,0 +1,362 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package org.chromium.distiller;

+import org.chromium.distiller.proto.DomDistillerProtos;

+import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;

+import com.google.gwt.dom.client.AnchorElement;

+import com.google.gwt.dom.client.Document;

+import com.google.gwt.dom.client.Element;

+import com.google.gwt.dom.client.Node;

+import com.google.gwt.dom.client.NodeList;

+import com.google.gwt.dom.client.Style;

+import com.google.gwt.regexp.shared.MatchResult;

+import com.google.gwt.regexp.shared.RegExp;

+/**

+ * Background:

+ * The long article/news/forum thread/blog document may be partitioned into several partial pages

+ * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The

+ * anchor text of those outlinks is numeric.

+ *

+ * This class parses the document to collect groups of adjacent plain text numbers and outlinks with

+ * numeric anchor text. These are then passed to PageParameterDetector which would spit out the

+ * pagination URLs if available.

+ */

+public class PageParameterParser {

+ // If the numeric value of a link's anchor text is greater than this number, we don't think it

+ // represents the page number of the link.

+ private static final int MAX_NUM_FOR_PAGE_PARAM = 100;

+    /**
+     * Pairs the PageParamInfo.PageInfo built for an anchor with that anchor's text; this is the
+     * value returned by getPageInfoAndText() for a link it accepts.
+     */
+    private static class PageInfoAndText {
+        private final PageParamInfo.PageInfo mPageInfo;
+        private final String mText;
+
+        /**
+         * @param number page number parsed from the anchor's text.
+         * @param url URL stored in the PageInfo; callers pass "" when the link has no usable URL.
+         * @param text the anchor's text.
+         */
+        PageInfoAndText(int number, String url, String text) {
+            mPageInfo = new PageParamInfo.PageInfo(number, url);
+            mText = text;
+        }
+    }

+    /**
+     * Entry point for PageParameterParser.
+     * Parses the document to collect outlinks with numeric anchor text and the numeric text around
+     * them. These are then passed to PageParameterDetector to detect pagination URLs.
+     *
+     * @param originalUrl the original URL of the document to be parsed.
+     * @param timingInfo for tracking performance.
+     *
+     * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or
+     * determined to be best, its mType is PageParamInfo.Type.UNSET.
+     */
+    public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
+        final Element documentRoot = Document.get().getDocumentElement();
+        return new PageParameterParser(timingInfo).parseDocument(documentRoot, originalUrl);
+    }

+ private final TimingInfo mTimingInfo;

+ private String mDocUrl = "";

+ private ParsedUrl mParsedUrl = null;

+ private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();

+ private static RegExp sHrefCleaner = null;

+    /**
+     * @param timingInfo handed to LogUtil.addTimingInfo() by parseDocument() when it records how
+     * long parsing and detection took.
+     */
+    private PageParameterParser(TimingInfo timingInfo) {
+        this.mTimingInfo = timingInfo;
+    }

+ /**

+ * Actually implements PageParameterParser.parse(), see above description for parse().

+ *

+ * Iterates over every "A" element under root. For each link accepted by getPageInfoAndText(), a

+ * new group of adjacent numbers is started, text preceding the link is examined, the link itself

+ * is added, and its following siblings are consumed. The collected groups are then handed to

+ * PageParameterDetector.detect(). Parsing and detection are timed separately via

+ * LogUtil.addTimingInfo().

+ *

+ * @param root element whose descendant links are scanned.

+ * @param originalUrl URL of the document being parsed.

+ * @return the result of PageParameterDetector.detect(), or a default-constructed PageParamInfo

+ * if ParsedUrl.create() returns null for originalUrl.

+ */

+ private PageParamInfo parseDocument(Element root, String originalUrl) {

+ double startTime = DomUtil.getTime();

+ mDocUrl = originalUrl;

+ mParsedUrl = ParsedUrl.create(mDocUrl);

+ if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.

+ // baseAnchor is later passed to PagingLinksFinder.resolveLinkHref() to resolve each link's href.

+ AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(

+ PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));

+ NodeList<Element> allLinks = root.getElementsByTagName("A");

+ int idx = 0;

+ while (idx < allLinks.getLength()) {

+ final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));

+ PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);

+ if (pageInfoAndText == null) {

+ idx++;

+ continue;

+ }

+ // This link is a good candidate for pagination.

+ // Close current group of adjacent numbers, add a new group if necessary.

+ mAdjacentNumbersGroups.addGroup();

+ // Before we append the link to the new group of adjacent numbers, check if it's

+ // preceded by a sibling with text; if so, add it before the link.

+ Node parentWrapper = null;

cjhopman 2015/07/29 01:07:53 What's this parent wrapper? I don't recall that be

kuan 2015/07/30 16:47:00 i had it in the previous change, and attempted to

cjhopman 2015/08/04 21:58:41 But why the parent wrapper thing? Why not just wal

kuan 2015/08/04 22:38:37 what do u mean by "backwards/forwards in the tree"

kuan 2015/08/11 19:09:38 Done.

+ if (!checkForPrevSiblingWithText(link)) { // Link has no sibling.

+ // The link could be a child of a parent that is simply a wrapper, i.e. with no

+ // extra text, in which case, we should be checking the siblings of the topmost

+ // parent wrapper.

+ parentWrapper = findParentWrapper(link, pageInfoAndText.mText.length());

+ if (parentWrapper != null) checkForPrevSiblingWithText(parentWrapper);

+ }

+ // Add the link to the current group of adjacent numbers.

+ mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);

+ // Add all following siblings with numeric text, with or without links.

+ // When a parent wrapper was found, its siblings are walked instead of the link's own.

+ int numLinksAdded = 0;

+ if (parentWrapper == null)

+ numLinksAdded = addFollowingSiblings(link, false, baseAnchor);

+ else

+ numLinksAdded = addFollowingSiblings(parentWrapper, true, baseAnchor);

+ // Skip the current link and links already processed in addFollowingSiblings().

+ // NOTE(review): this assumes the links counted by addFollowingSiblings() are exactly the

+ // entries that follow the current link in allLinks -- confirm for nested markup.

+ idx += 1 + numLinksAdded;

+ } // while there're links.

+ mAdjacentNumbersGroups.cleanup();

+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");

+ // Restart the clock so that detection is timed separately from parsing.

+ startTime = DomUtil.getTime();

+ PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);

+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");

+ return info;

+ }

+ /**

+ * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups.

+ * Otherwise, returns null if link is to be ignored.

+ * "javascript:void" links with numeric text are considered valid links to be added.

+ *

+ * @param link to process.

+ * @param baseAnchor created for the current document.

+ */

+ private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {

+ // Ignore invisible links.

+ int width = link.getOffsetWidth();

+ int height = link.getOffsetHeight();

+ if (width == 0 || height == 0 || !DomUtil.isVisible(link)) return null;

cjhopman 2015/07/29 01:07:53 It seems odd that invisible links are handled here

kuan 2015/07/30 16:47:00 invisible links need to be ignored. i do this her

+ String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor);

+ boolean isVoidLink = isVoidHref(linkHref);

+ ParsedUrl url = ParsedUrl.create(linkHref);

+ if (url == null || (!isVoidLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) {

+ return null;

+ }

+ url.setHash("");

+ // Use javascript innerText (instead of javascript textContent) to only get visible text.

+ String linkText = DomUtil.getInnerText(link);

+ int number = linkTextToNumber(linkText);

+ if (!isPlainPageNumber(number)) return null;

+ if (isVoidLink || isDisabledLink(link)) return new PageInfoAndText(number, "", linkText);

+ if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$");

+ return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);

+ }

+ /**

+ * Checks for previous sibling with word text. If the text contains digit(s) as terms that

+ * form a valid page number, the sibling is added to the current group of adjacent numbers.

+ * Otherwise, the current group of adjacent numbers is closed to end the current adjacency, and

+ * a new group is started.

+ *

+ * The walk stops without adding anything when it runs out of previous siblings, or when it

+ * reaches a sibling that is, or contains, an "A" element.

+ *

+ * @return false only if the given start node has no previous sibling at all; true in every

+ * other case, whether or not any text was added.

+ * @param start node to start checking with.

+ */

+ private boolean checkForPrevSiblingWithText(Node start) {

cjhopman 2015/07/29 01:07:53 I'm having difficulty understanding both the way t

kuan 2015/07/30 16:47:00 i initially had the check for previous and next nu

+ Node node = start;

+ Node prevNode = null;

+ String text = "";

+ // Find the first previous sibling that has inner text with words.

+ do {

+ prevNode = node;

+ node = node.getPreviousSibling();

+ // prevNode == start means this is the first step, i.e. start itself has no previous sibling.

+ if (node == null && prevNode == start) return false;

+ // Ran out of siblings after skipping at least one that had no words.

+ if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return true;

+ if (node.getNodeType() == Node.TEXT_NODE) {

+ text = node.getNodeValue();

+ } else {

+ // NOTE(review): every non-text node is treated as an Element here (comment nodes

+ // included) -- confirm that Element.as() is safe for those.

+ Element e = Element.as(node);

+ // Previous link siblings or children have already been processed.

+ if (e.hasTagName("A") || e.getElementsByTagName("A").getLength() > 0) return true;

+ text = DomUtil.getInnerText(e);

+ }

+ } while (text.isEmpty() || StringUtil.countWords(text) == 0);

+ // text now holds the nearest preceding sibling text that contains at least one word.

+ addNumberText(text);

+ return true;

+ }

+    /**
+     * Walks all following siblings (links and non-links) of the given node. Sibling links are
+     * passed to addValidLink(); the text of any other sibling that contains words is passed to
+     * addNumberText(), which either adds its numbers to the current group of adjacent numbers or
+     * closes the group to end the current adjacency.
+     *
+     * @param start node to start checking with; only its following siblings are visited.
+     * @param isParentWrapper true if given start node is a parent wrapper of a link, in which
+     * case links nested inside non-link siblings are processed as well.
+     * @param baseAnchor created for the current document.
+     *
+     * @return number of links passed to addValidLink(); this includes links that addValidLink()
+     * decided to ignore.
+     */
+    private int addFollowingSiblings(Node start, boolean isParentWrapper,
+            AnchorElement baseAnchor) {
+        int numLinksProcessed = 0;
+        // Visit every following sibling until there are none left.
+        for (Node node = start.getNextSibling();
+                node != null && node.getNodeType() != Node.DOCUMENT_NODE;
+                node = node.getNextSibling()) {
+            String text;
+            if (node.getNodeType() == Node.TEXT_NODE) {
+                text = node.getNodeValue();
+            } else {
+                Element e = Element.as(node);
+                boolean handled = false;
+                if (e.hasTagName("A")) {
+                    addValidLink(AnchorElement.as(e), baseAnchor);
+                    numLinksProcessed++;
+                    handled = true;
+                } else if (isParentWrapper) {
+                    // The sibling of a wrapper may itself wrap links; process those links.
+                    NodeList<Element> linkChildren = e.getElementsByTagName("A");
+                    final int numChildren = linkChildren.getLength();
+                    for (int i = 0; i < numChildren; i++) {
+                        addValidLink(AnchorElement.as(linkChildren.getItem(i)), baseAnchor);
+                        numLinksProcessed++;
+                    }
+                    handled = numChildren > 0;
+                }
+                // Text of an element whose links were just processed must not be added again.
+                text = handled ? "" : DomUtil.getInnerText(e);
+            }
+            if (!text.isEmpty() && StringUtil.countWords(text) > 0) addNumberText(text);
+        }
+        return numLinksProcessed;
+    }

+ private static RegExp sTermsRegExp = null; // Match terms i.e. words.

+ private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.

+ /**

+ * Processes the text of a non-link sibling, term by term.

+ * If the text contains no digit at all, the current group of adjacent numbers is closed via

+ * addGroup(). Otherwise every term of the text is examined: a term consisting of digits only

+ * (optionally surrounded by non-word characters or underscores) whose value is a plain page

+ * number is added to the current group with an empty URL; any other term closes the group.

cjhopman 2015/07/29 01:07:53 It looks like the text doesn't have to be strictly

kuan 2015/07/30 16:47:00 Done. renamed fn too.

+ */

+ private void addNumberText(String text) {

+ if (!StringUtil.containsDigit(text)) {

+ // The sibling does not contain valid number(s); if necessary, current group of adjacent

+ // numbers should be closed, adding a new group if possible.

+ mAdjacentNumbersGroups.addGroup();

+ return;

+ }

+ // sTermsRegExp is compiled once with the "g" flag and shared; its lastIndex is reset on reuse

+ // so that matching starts from the beginning of the new text.

+ if (sTermsRegExp == null) {

+ sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");

+ } else {

+ sTermsRegExp.setLastIndex(0);

+ }

+ if (sSurroundingDigitsRegExp == null) {

+ sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");

+ }

+ // Extract terms from the text, differentiating between those that contain only digits and

+ // those that contain non-digits.

+ while (true) {

+ // Each exec() call returns the next term of the text, or null when there are no more.

+ MatchResult match = sTermsRegExp.exec(text);

+ if (match == null) break;

+ if (match.getGroupCount() <= 1) continue;

+ String term = match.getGroup(1);

+ MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);

+ // -1 marks a term that is not purely digits; isPlainPageNumber() rejects it below.

+ int number = -1;

+ if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {

+ number = StringUtil.toNumber(termWithDigits.getGroup(1));

+ }

+ if (isPlainPageNumber(number)) {

+ // This sibling is a valid candidate of plain text page number, add it to last

+ // group of adjacent numbers.

+ mAdjacentNumbersGroups.addNumber(number, "");

+ } else {

+ // The sibling is not a valid number, so current group of adjacent numbers

+ // should be closed, adding a new group if possible.

+ mAdjacentNumbersGroups.addGroup();

+ }

+ } // while there're matches

+ }

+ /**

+ * Add PageParamInfo.PageInfo for a link if its text is numeric.

+ */

+ private void addValidLink(AnchorElement link, AnchorElement baseAnchor) {

cjhopman 2015/07/29 01:07:53 probably rename this to addLinkIfValid() since it

kuan 2015/07/30 16:47:00 Done. this fn is created simply to prevent duplic

+ PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);

+ if (pageInfoAndText != null) mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);

+ }

+    /**
+     * Climbs the ancestors of the given node for as long as an ancestor's inner text has exactly
+     * nodeTextLen characters, i.e. for as long as the ancestor merely wraps the node.
+     *
+     * @param node the node whose wrapper is wanted.
+     * @param nodeTextLen length of the node's own text.
+     *
+     * @return the topmost such ancestor, or null if there is none or if it is a document node.
+     */
+    private static Node findParentWrapper(Node node, int nodeTextLen) {
+        Node wrapper = node;
+        Node ancestor = node.getParentNode();
+        // Stop at the first ancestor whose text length differs; the one before it is the wrapper.
+        while (ancestor != null && DomUtil.getInnerText(ancestor).length() == nodeTextLen) {
+            wrapper = ancestor;
+            ancestor = ancestor.getParentNode();
+        }
+        if (wrapper == node) return null;  // No ancestor qualified as a wrapper.
+        if (wrapper.getNodeType() == Node.DOCUMENT_NODE) return null;
+        return wrapper;
+    }

+ /**

+ * @return true if link is disabled i.e. not clickable because it has a text cursor.

+ */

+ private static boolean isDisabledLink(AnchorElement link) {

+ Style style = DomUtil.getComputedStyle(link);

+ return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT;

+ }

+    /**
+     * @param href the resolved href of a link; a null href is treated as not void.
+     *
+     * @return true if href is exactly "javascript:void(0)".
+     */
+    private static boolean isVoidHref(String href) {
+        // Constant-first comparison so that a null href yields false instead of throwing.
+        return "javascript:void(0)".equals(href);
+    }

+    /**
+     * Normalizes a link's anchor text and converts it to a number.
+     * Brackets ("()", "[]", "{}") are removed, leading and trailing whitespace is trimmed, and
+     * runs of internal whitespace are collapsed to a single space before the text is handed to
+     * StringUtil.toNumber().
+     *
+     * @param linkText the anchor text of a link.
+     *
+     * @return the result of StringUtil.toNumber() for the normalized text.
+     */
+    private static int linkTextToNumber(String linkText) {
+        linkText = linkText.replaceAll("[()\\[\\]{}]", "");
+        linkText = linkText.trim();  // Remove leading and trailing whitespaces.
+        // Remove duplicate internal whitespaces. The braces must stay unescaped: "\\{2,\\}" would
+        // match the literal characters "{2,}" instead of acting as a quantifier.
+        linkText = linkText.replaceAll("\\s{2,}", " ");
+        return StringUtil.toNumber(linkText);
+    }

+    /**
+     * @param number candidate page number.
+     *
+     * @return true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.
+     */
+    private static boolean isPlainPageNumber(int number) {
+        if (number < 0) return false;
+        return number < MAX_NUM_FOR_PAGE_PARAM;
+    }

« no previous file with comments | « java/org/chromium/distiller/MonotonicPageInfosGroups.java ('k') | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »