java/org/chromium/distiller/PageParameterParser.java - Issue 1178633002: implement parser for new pagination algorithm

Side by Side Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 package org.chromium.distiller;

	6

	7 import org.chromium.distiller.proto.DomDistillerProtos;

	8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;

	9

	10 import com.google.gwt.dom.client.AnchorElement;

	11 import com.google.gwt.dom.client.Document;

	12 import com.google.gwt.dom.client.Element;

	13 import com.google.gwt.dom.client.Node;

	14 import com.google.gwt.dom.client.NodeList;

	15 import com.google.gwt.dom.client.Style;

	16 import com.google.gwt.regexp.shared.MatchResult;

	17 import com.google.gwt.regexp.shared.RegExp;

	18

	19 import java.util.HashSet;

	20 import java.util.Set;

	21

	22 /**

	23 * Background:

	24 * The long article/news/forum thread/blog document may be partitioned into se veral partial pages

	25 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The

	26 * anchor text of those outlinks is numeric.

	27 *

	28 * This class parses the document to collect groups of adjacent plain text numbe rs and outlinks with

	29 * digital anchor text. These are then passed to PageParameterParser which woul d spit out the

	30 * pagination URLs if available.

	31 */

	32 public class PageParameterParser {

	33 // If the numeric value of a link's anchor text is greater than this number, we don't think it

	34 // represents the page number of the link.

	35 private static final int MAX_NUM_FOR_PAGE_PARAM = 100;

	36

	37 /**

	38 * Type of sibling to check.

	39 */

	40 private static enum Sibling {

	41 PREV, // Node.getPreviousSibling().

	42 NEXT, // Node.getNextSibling().

	43 }

	44

	45 /**

	46 * Entry point for PageParameterParser.

	47 * Parses the document to collect outlinks with digital anchor text and nume ric text around

	48 * them. These are then passed to PageParameterParser to detect pagination URLs.

	49 *

	50 * @return PageParamInfo (see PageParamInfo.java), always. If no page param eter is detected or

	51 * determined to be best, its mType is PageParamInfo.Type.UNSET.

	52 *

	53 * @param originalUrl the original URL of the document to be parsed.

	54 * @param timingInfo for tracking performance.

	55 */

	56 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {

	57 PageParameterParser parser = new PageParameterParser(timingInfo);

	58 return parser.parseDocument(Document.get().getDocumentElement(), origina lUrl);

	59 }

	60

	61 private final TimingInfo mTimingInfo;

	62 private String mDocUrl = "";

	63 private ParsedUrl mParsedUrl = null;

	64 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new Monotoni cPageInfosGroups();

	65 // Siblings that have been processed before and after each link.

	66 private final Set<Node> mProcessedSiblings = new HashSet<Node>();

	67

	68 private static RegExp sHrefCleaner = null;

	69

	70 private PageParameterParser(TimingInfo timingInfo) {

	71 mTimingInfo = timingInfo;

	72 }

	73

	74 /**

	75 * Acutually implements PageParameterParser.parse(), see above description f or parse().

	76 */

	77 private PageParamInfo parseDocument(Element root, String originalUrl) {

	78 double startTime = DomUtil.getTime();

	79

	80 mDocUrl = originalUrl;

	81 mParsedUrl = ParsedUrl.create(mDocUrl);

	82 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.

	83

	84 mAdjacentNumbersGroups.addGroup();

	85 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(

	86 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));

	87

	88 NodeList<Element> allLinks = root.getElementsByTagName("A");
	cjhopman 2015/06/26 00:35:25 What do you think of this alternative approach: a What do you think of this alternative approach: allLinks = root.gEBTN("A"); idx = 0; while idx < allLinks.size: while idx < allLinks.size && !isNumber(allLinks[idx]): idx++; <find+add previous number siblings> <add allLinks[idx]> idx += 1 + <find+add next number siblings and return number of links added> That is, find a link that is a number, and then find all its siblings and skip ahead in the links list. kuan 2015/07/13 22:57:03 Done. Show quoted text On 2015/06/26 00:35:25, cjhopman wrote: > What do you think of this alternative approach: > > allLinks = root.gEBTN("A"); > idx = 0; > while idx < allLinks.size: > while idx < allLinks.size && !isNumber(allLinks[idx]): idx++; > <find+add previous number siblings> > <add allLinks[idx]> > idx += 1 + <find+add next number siblings and return number of links added> > > That is, find a link that is a number, and then find all its siblings and skip > ahead in the links list. Done.
	89 for (int i = 0; i < allLinks.getLength(); i++) {

	90 final AnchorElement link = AnchorElement.as(allLinks.getItem(i));

	91

	92 String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor );

	93 ParsedUrl url = ParsedUrl.create(linkHref);

	94 if (url == null \|\| !url.getHost().equalsIgnoreCase(mParsedUrl.getHos t())) continue;

	95 url.setHash("");

	96

	97 // Use javascript innerText (instead of javascript textContent) to o nly get visible

	98 // text.

	99 String linkText = DomUtil.getInnerText(link);

	100 int number = linkTextToNumber(linkText);

	101 if (isPlainPageNumber(number)) {

	102 // Before we append the link to the current group of adjacent nu mbers, check if it's

	103 // preceded by a sibling with text, which would determine if the current adjacency

	104 // continues or ends.

	105 Node parentWrapper = null;

	106 if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling.

	107 // The link could be a child of a parent that is simply a wr apper, i.e. with no

	108 // extra text, in which case, we should be checking the sibl ings of the topmost

	109 // parent wrapper.

	110 parentWrapper = findParentWrapper(link, linkText.length());

	111 if (parentWrapper != null) checkForSiblingWithText(parentWra pper, Sibling.PREV);

	112 }

	113

	114 // This link is a good candidate for pagination, add it to the c urrent group of

	115 // adjacent numbers.

	116 if (isDisabledLink(link)) {

	117 linkHref = "";

	118 } else {

	119 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?( #.*)?$");

	120 linkHref = sHrefCleaner.replace(url.toString(), "");

	121 }

	122 mAdjacentNumbersGroups.addNumber(number, linkHref);

	123

	124 // Check if the link is followed by a sibling with text, which w ould determine if

	125 // the current adjacency continues or ends.

	126 checkForSiblingWithText(parentWrapper == null ? link : parentWra pper, Sibling.NEXT);

	127 } else {

	128 // This link is not a valid pagination link, so current group of adjacent numbers

	129 // should be closed, adding a new group if possible.

	130 mAdjacentNumbersGroups.addGroup();

	131 }

	132 } // for all links

	133

	134 mAdjacentNumbersGroups.cleanup();

	135

	136 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");

	137

	138 startTime = DomUtil.getTime();

	139 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl);

	140 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");

	141 return info;

	142 }

	143

	144 private static RegExp sTermsRegExp = null; // Match terms i.e. words.

	145 private static RegExp sSurroundingDigitsRegExp = null; // Match term with o nly digits.

	146

	147 /**

	148 * Checks for previous or next sibling with word text. If the text contains digit(s) as terms

	149 * that form a valid page number, the sibling is added to the current group of adjacent

	150 * numbers. Otherwise, the current group of adjacent numbers is closed to e nd the current

	151 * adjacency, and a new group is started.

	152 *

	153 * @return true if given start node has at least 1 sibling, false otherwise.

	154

	155 * @param start node to start checking with.

	156 * @param sibling previous or next sibling.

	157 */

	158 private boolean checkForSiblingWithText(Node start, Sibling sibling) {

	159 Node node = start;

	160 Node prevNode = null;

	161 String text = "";

	162 // Find the first sibling that has inner text with words.

	163 do {

	164 prevNode = node;

	165 node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.ge tNextSibling();

	166 if (node == null && prevNode == start) return false;

	167 if (node == null \|\| node.getNodeType() == Node.DOCUMENT_NODE \|\|

	168 !mProcessedSiblings.add(node)) { // Node has been processed .

	169 return true;

	170 }

	171

	172 if (node.getNodeType() == Node.TEXT_NODE) {

	173 text = node.getNodeValue();

	174 } else {

	175 Element e = Element.as(node);

	176

	177 // Ignore non-void anchors, since we're iterating through them.

	178 if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true ;

	179

	180 // Ignore non-void link children, since we're iterating through them.

	181 NodeList<Element> linkChildren = e.getElementsByTagName("A");

	182 for (int i = 0; i < linkChildren.getLength(); i++) {

	183 if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true;

	184 }

	185

	186 text = DomUtil.getInnerText(e);

	187 }

	188 } while (text.isEmpty() \|\| StringUtil.countWords(text) == 0);

	189

	190 if (!StringUtil.containsDigit(text)) {

	191 // The sibling does not contain valid number(s), so current group of adjacent numbers

	192 // should be closed, adding a new group if possible.

	193 mAdjacentNumbersGroups.addGroup();

	194 return true;

	195 }

	196

	197 if (sTermsRegExp == null) {

	198 sTermsRegExp = RegExp.compile("(\\S[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\ \S)", "gi");

	199 } else {

	200 sTermsRegExp.setLastIndex(0);

	201 }

	202 if (sSurroundingDigitsRegExp == null) {

	203 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_](\\d+)[\\W_]$", "i");

	204 }

	205

	206 // Extract terms from the text, differentiating between those that conta in only digits and

	207 // those that contain non-digits.

	208 while (true) {

	209 MatchResult match = sTermsRegExp.exec(text);

	210 if (match == null) break;

	211 if (match.getGroupCount() <= 1) continue;

	212

	213 String term = match.getGroup(1);

	214 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);

	215 int number = -1;

	216 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {

	217 number = StringUtil.toNumber(termWithDigits.getGroup(1));

	218 }

	219 if (isPlainPageNumber(number)) {

	220 // This sibling is a valid candidate of plain text page number, add it to last

	221 // group of adjacent numbers.

	222 mAdjacentNumbersGroups.addNumber(number, "");

	223 } else {

	224 // The sibling is not a valid number, so current group of adjace nt numbers

	225 // should be closed, adding a new group if possible.

	226 mAdjacentNumbersGroups.addGroup();

	227 }

	228 } // while there're matches

	229

	230 return true;

	231 }

	232

	233 /**

	234 * @return the topmost parent of the given node that simply wraps the node, i.e. with no more

	235 * inner text than that of given node.

	236 */

	237 private static Node findParentWrapper(Node node, int nodeTextLen) {

	238 Node parent = node;

	239 Node prevParent = null;

	240 // While keeping track of each parent, once we find the first one that h as more text than

	241 // given node, the previous parent would be what we want.

	242 do {

	243 prevParent = parent;

	244 parent = parent.getParentNode();

	245 } while (parent != null && DomUtil.getInnerText(parent).length() == node TextLen);

	246

	247 return prevParent == node \|\| prevParent.getNodeType() == Node.DOCUMENT_N ODE ?

	248 null : prevParent;

	249 }

	250

	251 /**

	252 * @return true if link is disabled i.e. not clickable because it has a text cursor.

	253 */

	254 private static boolean isDisabledLink(AnchorElement link) {

	255 Style style = DomUtil.getComputedStyle(link);

	256 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu rsor.TEXT;

	257 }

	258

	259 /**

	260 * @return true if link should be ignored when checking for sibling.

	261 * All non-void links will be processed, and hence could be ignored when che cking for sibling.

	262 * Void links are rejected at the beginning because their hosts are differen t from

	263 * originalUrl's, so they should be considered when checking for sibling.

	264 */

	265 private static boolean shouldIgnoreLinkSibling(Element link) {

	266 return !AnchorElement.as(link).getHref().equals("javascript:void(0)");

	267 }

	268

	269 private static int linkTextToNumber(String linkText) {

	270 linkText = linkText.replaceAll("[()\\[\\]{}]", "");

	271 linkText = linkText.trim(); // Remove leading and trailing whitespaces.

	272 // Remove duplicate internal whitespaces.

	273 linkText = linkText.replaceAll("\\s\\{2,\\}", " ");

	274 return StringUtil.toNumber(linkText);

	275 }

	276

	277 /**

	278 * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.

	279 */

	280 private static boolean isPlainPageNumber(int number) {

	281 return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM;

	282 }

	283

	284 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »