Chromium Code Reviews| Index: java/org/chromium/distiller/PageParameterParser.java |
| diff --git a/java/org/chromium/distiller/PageParameterParser.java b/java/org/chromium/distiller/PageParameterParser.java |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..43cfdbac3ce0355e8f248ad26dd0edfb5bc09645 |
| --- /dev/null |
| +++ b/java/org/chromium/distiller/PageParameterParser.java |
| @@ -0,0 +1,362 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +package org.chromium.distiller; |
| + |
| +import org.chromium.distiller.proto.DomDistillerProtos; |
| +import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; |
| + |
| +import com.google.gwt.dom.client.AnchorElement; |
| +import com.google.gwt.dom.client.Document; |
| +import com.google.gwt.dom.client.Element; |
| +import com.google.gwt.dom.client.Node; |
| +import com.google.gwt.dom.client.NodeList; |
| +import com.google.gwt.dom.client.Style; |
| +import com.google.gwt.regexp.shared.MatchResult; |
| +import com.google.gwt.regexp.shared.RegExp; |
| + |
| +/** |
| + * Background: |
| + * The long article/news/forum thread/blog document may be partitioned into several partial pages |
| + * by webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The |
| + * anchor text of those outlinks is numeric. |
| + * |
| + * This class parses the document to collect groups of adjacent plain text numbers and outlinks with |
| + * numeric anchor text. These are then passed to PageParameterDetector which would spit out the |
| + * pagination URLs if available. |
| + */ |
| +public class PageParameterParser { |
| + // If the numeric value of a link's anchor text is greater than this number, we don't think it |
| + // represents the page number of the link. |
| + private static final int MAX_NUM_FOR_PAGE_PARAM = 100; |
| + |
| + /** |
| + * Stores PageParamInfo.PageInfo and the anchor's text, specifically returned by |
| + * getPageInfoAndText(). |
| + */ |
| + private static class PageInfoAndText { |
| + private final PageParamInfo.PageInfo mPageInfo; |
| + private final String mText; |
| + |
| + PageInfoAndText(int number, String url, String text) { |
| + mPageInfo = new PageParamInfo.PageInfo(number, url); |
| + mText = text; |
| + } |
| + } |
| + |
| + /** |
| + * Entry point for PageParameterParser. |
| + * Parses the document to collect outlinks with numeric anchor text and numeric text around |
| + * them. These are then passed to PageParameterDetector to detect pagination URLs. |
| + * |
| + * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or |
| + * determined to be best, its mType is PageParamInfo.Type.UNSET. |
| + * |
| + * @param originalUrl the original URL of the document to be parsed. |
| + * @param timingInfo for tracking performance. |
| + */ |
| + public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { |
| + PageParameterParser parser = new PageParameterParser(timingInfo); |
| + return parser.parseDocument(Document.get().getDocumentElement(), originalUrl); |
| + } |
| + |
| + private final TimingInfo mTimingInfo; |
| + private String mDocUrl = ""; |
| + private ParsedUrl mParsedUrl = null; |
| + private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups(); |
| + |
| + private static RegExp sHrefCleaner = null; |
| + |
| + private PageParameterParser(TimingInfo timingInfo) { |
| + mTimingInfo = timingInfo; |
| + } |
| + |
| + /** |
| + * Actually implements PageParameterParser.parse(), see above description for parse(). |
| + */ |
| + private PageParamInfo parseDocument(Element root, String originalUrl) { |
| + double startTime = DomUtil.getTime(); |
| + |
| + mDocUrl = originalUrl; |
| + mParsedUrl = ParsedUrl.create(mDocUrl); |
| + if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. |
| + |
| + AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( |
| + PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); |
| + |
| + NodeList<Element> allLinks = root.getElementsByTagName("A"); |
| + int idx = 0; |
| + while (idx < allLinks.getLength()) { |
| + final AnchorElement link = AnchorElement.as(allLinks.getItem(idx)); |
| + PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); |
| + if (pageInfoAndText == null) { |
| + idx++; |
| + continue; |
| + } |
| + |
| + // This link is a good candidate for pagination. |
| + |
| + // Close current group of adjacent numbers, add a new group if necessary. |
| + mAdjacentNumbersGroups.addGroup(); |
| + |
| + // Before we append the link to the new group of adjacent numbers, check if it's |
| + // preceded by a sibling with text; if so, add it before the link. |
| + Node parentWrapper = null; |
|
cjhopman
2015/07/29 01:07:53
What's this parent wrapper? I don't recall that be
kuan
2015/07/30 16:47:00
i had it in the previous change, and attempted to
cjhopman
2015/08/04 21:58:41
But why the parent wrapper thing? Why not just wal
kuan
2015/08/04 22:38:37
what do u mean by "backwards/forwards in the tree"
kuan
2015/08/11 19:09:38
Done.
|
| + if (!checkForPrevSiblingWithText(link)) { // Link has no sibling. |
| + // The link could be a child of a parent that is simply a wrapper, i.e. with no |
| + // extra text, in which case, we should be checking the siblings of the topmost |
| + // parent wrapper. |
| + parentWrapper = findParentWrapper(link, pageInfoAndText.mText.length()); |
| + if (parentWrapper != null) checkForPrevSiblingWithText(parentWrapper); |
| + } |
| + |
| + // Add the link to the current group of adjacent numbers. |
| + mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); |
| + |
| + // Add all following siblings with numeric text, with or without links. |
| + int numLinksAdded = 0; |
| + if (parentWrapper == null) |
| + numLinksAdded = addFollowingSiblings(link, false, baseAnchor); |
| + else |
| + numLinksAdded = addFollowingSiblings(parentWrapper, true, baseAnchor); |
| + |
| + // Skip the current link and links already processed in addFollowingSiblings(). |
| + idx += 1 + numLinksAdded; |
| + } // while there're links. |
| + |
| + mAdjacentNumbersGroups.cleanup(); |
| + |
| + LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); |
| + |
| + startTime = DomUtil.getTime(); |
| + PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl); |
| + LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); |
| + return info; |
| + } |
| + |
| + |
| + /** |
| + * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups. |
| + * Otherwise, returns null if link is to be ignored. |
| + * "javascript:void" links with numeric text are considered valid links to be added. |
| + * |
| + * @param link to process. |
| + * @param baseAnchor created for the current document. |
| + */ |
| + private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) { |
| + // Ignore invisible links. |
| + int width = link.getOffsetWidth(); |
| + int height = link.getOffsetHeight(); |
| + if (width == 0 || height == 0 || !DomUtil.isVisible(link)) return null; |
|
cjhopman
2015/07/29 01:07:53
It seems odd that invisible links are handled here
kuan
2015/07/30 16:47:00
invisible links need to be ignored. i do this her
|
| + |
| + String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor); |
| + boolean isVoidLink = isVoidHref(linkHref); |
| + ParsedUrl url = ParsedUrl.create(linkHref); |
| + if (url == null || (!isVoidLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) { |
| + return null; |
| + } |
| + |
| + url.setHash(""); |
| + |
| + // Use javascript innerText (instead of javascript textContent) to only get visible text. |
| + String linkText = DomUtil.getInnerText(link); |
| + int number = linkTextToNumber(linkText); |
| + if (!isPlainPageNumber(number)) return null; |
| + |
| + if (isVoidLink || isDisabledLink(link)) return new PageInfoAndText(number, "", linkText); |
| + |
| + if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$"); |
| + return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText); |
| + } |
| + |
| + /** |
| + * Checks for previous sibling with word text. If the text contains digit(s) as terms that |
| + * form a valid page number, the sibling is added to the current group of adjacent numbers. |
| + * Otherwise, the current group of adjacent numbers is closed to end the current adjacency, and |
| + * a new group is started. |
| + * |
| + * @return true if given start node has at least 1 sibling, false otherwise. |
| + |
| + * @param start node to start checking with. |
| + */ |
| + private boolean checkForPrevSiblingWithText(Node start) { |
|
cjhopman
2015/07/29 01:07:53
I'm having difficulty understanding both the way t
kuan
2015/07/30 16:47:00
i initially had the check for previous and next nu
|
| + Node node = start; |
| + Node prevNode = null; |
| + String text = ""; |
| + // Find the first previous sibling that has inner text with words. |
| + do { |
| + prevNode = node; |
| + node = node.getPreviousSibling(); |
| + if (node == null && prevNode == start) return false; |
| + if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return true; |
| + |
| + if (node.getNodeType() == Node.TEXT_NODE) { |
| + text = node.getNodeValue(); |
| + } else { |
| + Element e = Element.as(node); |
| + // Previous link siblings or children have already been processed. |
| + if (e.hasTagName("A") || e.getElementsByTagName("A").getLength() > 0) return true; |
| + text = DomUtil.getInnerText(e); |
| + } |
| + } while (text.isEmpty() || StringUtil.countWords(text) == 0); |
| + |
| + addNumberText(text); |
| + return true; |
| + } |
| + |
| + /** |
| + * Adds all following siblings (links and non-links) with numeric text. If the text contains |
| + * digit(s) as terms that form a valid page number, the sibling is added to the current group of |
| + * adjacent numbers. Otherwise, the current group of adjacent numbers is closed to end the |
| + * current adjacency, and a new group is started. |
| + * |
| + * @return number of links added. |
| + |
| + * @param start node to start checking with. |
| + * @param isParentWrapper true if given start node is a parent wrapper of a link. |
| + * @param baseAnchor created for the current document. |
| + */ |
| + private int addFollowingSiblings(Node start, boolean isParentWrapper, |
| + AnchorElement baseAnchor) { |
| + Node node = start; |
| + Node prevNode = null; |
| + String text = ""; |
| + int numLinksProcessed = 0; |
| + // Find all following siblings, add them if their text is purely numeric. |
| + while (true) { |
| + prevNode = node; |
| + node = node.getNextSibling(); |
| + if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return numLinksProcessed; |
| + |
| + boolean handled = false; |
| + if (node.getNodeType() == Node.TEXT_NODE) { |
| + text = node.getNodeValue(); |
| + } else { |
| + Element e = Element.as(node); |
| + if (e.hasTagName("A")) { |
| + addValidLink(AnchorElement.as(e), baseAnchor); |
| + numLinksProcessed++; |
| + handled = true; |
| + } else if (isParentWrapper) { |
| + NodeList<Element> linkChildren = e.getElementsByTagName("A"); |
| + final int numChildren = linkChildren.getLength(); |
| + for (int i = 0; i < numChildren; i++) { |
| + addValidLink(AnchorElement.as(linkChildren.getItem(i)), baseAnchor); |
| + numLinksProcessed++; |
| + } |
| + if (numChildren > 0) handled = true; |
| + } |
| + |
| + text = handled ? "" : DomUtil.getInnerText(e); |
| + } |
| + |
| + if (!text.isEmpty() && StringUtil.countWords(text) > 0) addNumberText(text); |
| + } |
| + } |
| + |
| + private static RegExp sTermsRegExp = null; // Match terms i.e. words. |
| + private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits. |
| + |
| + /** |
| + * Add PageParamInfo.PageInfo for a non-link with numeric text. |
|
cjhopman
2015/07/29 01:07:53
It looks like the text doesn't have to be strictly
kuan
2015/07/30 16:47:00
Done. renamed fn too.
|
| + */ |
| + private void addNumberText(String text) { |
| + if (!StringUtil.containsDigit(text)) { |
| + // The sibling does not contain valid number(s); if necessary, current group of adjacent |
| + // numbers should be closed, adding a new group if possible. |
| + mAdjacentNumbersGroups.addGroup(); |
| + return; |
| + } |
| + |
| + if (sTermsRegExp == null) { |
| + sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi"); |
| + } else { |
| + sTermsRegExp.setLastIndex(0); |
| + } |
| + if (sSurroundingDigitsRegExp == null) { |
| + sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); |
| + } |
| + |
| + // Extract terms from the text, differentiating between those that contain only digits and |
| + // those that contain non-digits. |
| + while (true) { |
| + MatchResult match = sTermsRegExp.exec(text); |
| + if (match == null) break; |
| + if (match.getGroupCount() <= 1) continue; |
| + |
| + String term = match.getGroup(1); |
| + MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); |
| + int number = -1; |
| + if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { |
| + number = StringUtil.toNumber(termWithDigits.getGroup(1)); |
| + } |
| + if (isPlainPageNumber(number)) { |
| + // This sibling is a valid candidate of plain text page number, add it to last |
| + // group of adjacent numbers. |
| + mAdjacentNumbersGroups.addNumber(number, ""); |
| + } else { |
| + // The sibling is not a valid number, so current group of adjacent numbers |
| + // should be closed, adding a new group if possible. |
| + mAdjacentNumbersGroups.addGroup(); |
| + } |
| + } // while there're matches |
| + } |
| + |
| + /** |
| + * Add PageParamInfo.PageInfo for a link if its text is numeric. |
| + */ |
| + private void addValidLink(AnchorElement link, AnchorElement baseAnchor) { |
|
cjhopman
2015/07/29 01:07:53
probably rename this to addLinkIfValid() since it
kuan
2015/07/30 16:47:00
Done. this fn is created simply to prevent duplic
|
| + PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); |
| + if (pageInfoAndText != null) mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); |
| + } |
| + |
| + /** |
| + * Climbs the ancestors of the given node looking for the outermost one that merely wraps it. |
| + * An ancestor counts as a wrapper when the length of its inner text equals nodeTextLen; only |
| + * the lengths are compared, not the text itself. |
| + * |
| + * @return the topmost parent of the given node that simply wraps the node, i.e. with no more |
| + * inner text than that of given node. Returns null if the node has no such wrapper, or if the |
| + * climb ended on the document node. |
| + * |
| + * @param node the node whose wrapping ancestors are examined. |
| + * @param nodeTextLen length of the node's own text, used as the comparison baseline. |
| + */ |
| + private static Node findParentWrapper(Node node, int nodeTextLen) { |
| + Node parent = node; |
| + Node prevParent = null; |
| + // While keeping track of each parent, once we find the first one that has more text than |
| + // given node, the previous parent would be what we want. |
| + // The loop also stops when there is no parent left to climb to. |
| + do { |
| + prevParent = parent; |
| + parent = parent.getParentNode(); |
| + } while (parent != null && DomUtil.getInnerText(parent).length() == nodeTextLen); |
| + |
| + // prevParent == node means not even the immediate parent qualified as a wrapper. |
| + return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_NODE ? |
| + null : prevParent; |
| + } |
| + |
| + /** |
| + * @return true if link is disabled i.e. not clickable because its computed cursor style is |
| + * the text cursor. |
| + * |
| + * The cursor keyword is compared as a string rather than through Style.Cursor.valueOf(), |
| + * which throws IllegalArgumentException for any value that is not an enum constant name |
| + * (e.g. hyphenated keywords such as "not-allowed", or an empty value). |
| + * |
| + * @param link the anchor whose computed style is inspected. |
| + */ |
| + private static boolean isDisabledLink(AnchorElement link) { |
| + Style style = DomUtil.getComputedStyle(link); |
| + // Literal-side equalsIgnoreCase also tolerates a null cursor value. |
| + return Style.Cursor.TEXT.getCssName().equalsIgnoreCase(style.getCursor()); |
| + } |
| + |
| + /** |
| + * @return true if href is "javascript:void(0)". |
| + */ |
| + private static boolean isVoidHref(String href) { |
| + return href.equals("javascript:void(0)"); |
| + } |
| + |
| + /** |
| + * Converts a link's anchor text to a number. Brackets are stripped, surrounding whitespace is |
| + * trimmed and runs of internal whitespace are collapsed before the text is handed to |
| + * StringUtil.toNumber(). |
| + * |
| + * @return the result of StringUtil.toNumber() for the cleaned-up text. |
| + * |
| + * @param linkText the visible text of the link. |
| + */ |
| + private static int linkTextToNumber(String linkText) { |
| + // Strip (), [] and {} that may surround the page number. |
| + linkText = linkText.replaceAll("[()\\[\\]{}]", ""); |
| + linkText = linkText.trim(); // Remove leading and trailing whitespaces. |
| + // Remove duplicate internal whitespaces. The quantifier braces must stay unescaped: |
| + // escaping them makes the pattern look for a literal "{2,}", which can never match here. |
| + linkText = linkText.replaceAll("\\s{2,}", " "); |
| + return StringUtil.toNumber(linkText); |
| + } |
| + |
| + /** |
| + * @return true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM. |
| + */ |
| + private static boolean isPlainPageNumber(int number) { |
| + return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM; |
| + } |
| + |
| +} |