java/org/chromium/distiller/PageParameterParser.java - Issue 1178633002: implement parser for new pagination algorithm

Unified Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: java/org/chromium/distiller/PageParameterParser.java

diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java

new file mode 100644

index 0000000000000000000000000000000000000000..70c1e8dea62a0d0ea0d60fcd97a1f64667095fa7

--- /dev/null

+++ b/java/org/chromium/distiller/PageParameterParser.java

@@ -0,0 +1,284 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package org.chromium.distiller;

+import org.chromium.distiller.proto.DomDistillerProtos;

+import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;

+import com.google.gwt.dom.client.AnchorElement;

+import com.google.gwt.dom.client.Document;

+import com.google.gwt.dom.client.Element;

+import com.google.gwt.dom.client.Node;

+import com.google.gwt.dom.client.NodeList;

+import com.google.gwt.dom.client.Style;

+import com.google.gwt.regexp.shared.MatchResult;

+import com.google.gwt.regexp.shared.RegExp;

+import java.util.HashSet;

+import java.util.Set;

+/**

+ * Background:

+ * The long article/news/forum thread/blog document may be partitioned into several partial pages

+ * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The

+ * anchor text of those outlinks is numeric.

+ *

+ * This class parses the document to collect groups of adjacent plain text numbers and outlinks with

+ * digital anchor text. These are then passed to PageParameterParser which would spit out the

+ * pagination URLs if available.

+ */

+public class PageParameterParser {

+ // If the numeric value of a link's anchor text is greater than this number, we don't think it

+ // represents the page number of the link.

+ private static final int MAX_NUM_FOR_PAGE_PARAM = 100;

+ /**

+ * Type of sibling to check.

+ */

+ private static enum Sibling {

+ PREV, // Node.getPreviousSibling().

+ NEXT, // Node.getNextSibling().

+ }

+ /**

+ * Entry point for PageParameterParser.

+ * Parses the document to collect outlinks with digital anchor text and numeric text around

+ * them. These are then passed to PageParameterParser to detect pagination URLs.

+ *

+ * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or

+ * determined to be best, its mType is PageParamInfo.Type.UNSET.

+ *

+ * @param originalUrl the original URL of the document to be parsed.

+ * @param timingInfo for tracking performance.

+ */

+ public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {

+ PageParameterParser parser = new PageParameterParser(timingInfo);

+ return parser.parseDocument(Document.get().getDocumentElement(), originalUrl);

+ }

+ private final TimingInfo mTimingInfo;

+ private String mDocUrl = "";

+ private ParsedUrl mParsedUrl = null;

+ private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();

+ // Siblings that have been processed before and after each link.

+ private final Set<Node> mProcessedSiblings = new HashSet<Node>();

+ private static RegExp sHrefCleaner = null;

+ private PageParameterParser(TimingInfo timingInfo) {

+ mTimingInfo = timingInfo;

+ }

+ /**

+ * Acutually implements PageParameterParser.parse(), see above description for parse().

+ */

+ private PageParamInfo parseDocument(Element root, String originalUrl) {

+ double startTime = DomUtil.getTime();

+ mDocUrl = originalUrl;

+ mParsedUrl = ParsedUrl.create(mDocUrl);

+ if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.

+ mAdjacentNumbersGroups.addGroup();

+ AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(

+ PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));

+ NodeList<Element> allLinks = root.getElementsByTagName("A");

cjhopman 2015/06/26 00:35:25 What do you think of this alternative approach: a

kuan 2015/07/13 22:57:03 Done.

+ for (int i = 0; i < allLinks.getLength(); i++) {

+ final AnchorElement link = AnchorElement.as(allLinks.getItem(i));

+ String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor);

+ ParsedUrl url = ParsedUrl.create(linkHref);

+ if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHost())) continue;

+ url.setHash("");

+ // Use javascript innerText (instead of javascript textContent) to only get visible

+ // text.

+ String linkText = DomUtil.getInnerText(link);

+ int number = linkTextToNumber(linkText);

+ if (isPlainPageNumber(number)) {

+ // Before we append the link to the current group of adjacent numbers, check if it's

+ // preceded by a sibling with text, which would determine if the current adjacency

+ // continues or ends.

+ Node parentWrapper = null;

+ if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling.

+ // The link could be a child of a parent that is simply a wrapper, i.e. with no

+ // extra text, in which case, we should be checking the siblings of the topmost

+ // parent wrapper.

+ parentWrapper = findParentWrapper(link, linkText.length());

+ if (parentWrapper != null) checkForSiblingWithText(parentWrapper, Sibling.PREV);

+ }

+ // This link is a good candidate for pagination, add it to the current group of

+ // adjacent numbers.

+ if (isDisabledLink(link)) {

+ linkHref = "";

+ } else {

+ if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$");

+ linkHref = sHrefCleaner.replace(url.toString(), "");

+ }

+ mAdjacentNumbersGroups.addNumber(number, linkHref);

+ // Check if the link is followed by a sibling with text, which would determine if

+ // the current adjacency continues or ends.

+ checkForSiblingWithText(parentWrapper == null ? link : parentWrapper, Sibling.NEXT);

+ } else {

+ // This link is not a valid pagination link, so current group of adjacent numbers

+ // should be closed, adding a new group if possible.

+ mAdjacentNumbersGroups.addGroup();

+ }

+ } // for all links

+ mAdjacentNumbersGroups.cleanup();

+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");

+ startTime = DomUtil.getTime();

+ PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);

+ LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");

+ return info;

+ }

+ private static RegExp sTermsRegExp = null; // Match terms i.e. words.

+ private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.

+ /**

+ * Checks for previous or next sibling with word text. If the text contains digit(s) as terms

+ * that form a valid page number, the sibling is added to the current group of adjacent

+ * numbers. Otherwise, the current group of adjacent numbers is closed to end the current

+ * adjacency, and a new group is started.

+ *

+ * @return true if given start node has at least 1 sibling, false otherwise.

+ * @param start node to start checking with.

+ * @param sibling previous or next sibling.

+ */

+ private boolean checkForSiblingWithText(Node start, Sibling sibling) {

+ Node node = start;

+ Node prevNode = null;

+ String text = "";

+ // Find the first sibling that has inner text with words.

+ do {

+ prevNode = node;

+ node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.getNextSibling();

+ if (node == null && prevNode == start) return false;

+ if (node == null || node.getNodeType() == Node.DOCUMENT_NODE ||

+ !mProcessedSiblings.add(node)) { // Node has been processed.

+ return true;

+ }

+ if (node.getNodeType() == Node.TEXT_NODE) {

+ text = node.getNodeValue();

+ } else {

+ Element e = Element.as(node);

+ // Ignore non-void anchors, since we're iterating through them.

+ if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true;

+ // Ignore non-void link children, since we're iterating through them.

+ NodeList<Element> linkChildren = e.getElementsByTagName("A");

+ for (int i = 0; i < linkChildren.getLength(); i++) {

+ if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true;

+ }

+ text = DomUtil.getInnerText(e);

+ }

+ } while (text.isEmpty() || StringUtil.countWords(text) == 0);

+ if (!StringUtil.containsDigit(text)) {

+ // The sibling does not contain valid number(s), so current group of adjacent numbers

+ // should be closed, adding a new group if possible.

+ mAdjacentNumbersGroups.addGroup();

+ return true;

+ }

+ if (sTermsRegExp == null) {

+ sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");

+ } else {

+ sTermsRegExp.setLastIndex(0);

+ }

+ if (sSurroundingDigitsRegExp == null) {

+ sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");

+ }

+ // Extract terms from the text, differentiating between those that contain only digits and

+ // those that contain non-digits.

+ while (true) {

+ MatchResult match = sTermsRegExp.exec(text);

+ if (match == null) break;

+ if (match.getGroupCount() <= 1) continue;

+ String term = match.getGroup(1);

+ MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);

+ int number = -1;

+ if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {

+ number = StringUtil.toNumber(termWithDigits.getGroup(1));

+ }

+ if (isPlainPageNumber(number)) {

+ // This sibling is a valid candidate of plain text page number, add it to last

+ // group of adjacent numbers.

+ mAdjacentNumbersGroups.addNumber(number, "");

+ } else {

+ // The sibling is not a valid number, so current group of adjacent numbers

+ // should be closed, adding a new group if possible.

+ mAdjacentNumbersGroups.addGroup();

+ }

+ } // while there're matches

+ return true;

+ }

+ /**

+ * @return the topmost parent of the given node that simply wraps the node, i.e. with no more

+ * inner text than that of given node.

+ */

+ private static Node findParentWrapper(Node node, int nodeTextLen) {

+ Node parent = node;

+ Node prevParent = null;

+ // While keeping track of each parent, once we find the first one that has more text than

+ // given node, the previous parent would be what we want.

+ do {

+ prevParent = parent;

+ parent = parent.getParentNode();

+ } while (parent != null && DomUtil.getInnerText(parent).length() == nodeTextLen);

+ return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_NODE ?

+ null : prevParent;

+ }

+ /**

+ * @return true if link is disabled i.e. not clickable because it has a text cursor.

+ */

+ private static boolean isDisabledLink(AnchorElement link) {

+ Style style = DomUtil.getComputedStyle(link);

+ return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cursor.TEXT;

+ }

+ /**

+ * @return true if link should be ignored when checking for sibling.

+ * All non-void links will be processed, and hence could be ignored when checking for sibling.

+ * Void links are rejected at the beginning because their hosts are different from

+ * originalUrl's, so they should be considered when checking for sibling.

+ */

+ private static boolean shouldIgnoreLinkSibling(Element link) {

+ return !AnchorElement.as(link).getHref().equals("javascript:void(0)");

+ }

+ private static int linkTextToNumber(String linkText) {

+ linkText = linkText.replaceAll("[()\\[\\]{}]", "");

+ linkText = linkText.trim(); // Remove leading and trailing whitespaces.

+ // Remove duplicate internal whitespaces.

+ linkText = linkText.replaceAll("\\s\\{2,\\}", " ");

+ return StringUtil.toNumber(linkText);

+ }

+ /**

+ * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.

+ */

+ private static boolean isPlainPageNumber(int number) {

+ return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM;

+ }

« no previous file with comments | « no previous file | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »