Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 package org.chromium.distiller; | |
| 6 | |
| 7 import org.chromium.distiller.proto.DomDistillerProtos; | |
| 8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; | |
| 9 | |
| 10 import com.google.gwt.dom.client.AnchorElement; | |
| 11 import com.google.gwt.dom.client.Document; | |
| 12 import com.google.gwt.dom.client.Element; | |
| 13 import com.google.gwt.dom.client.Node; | |
| 14 import com.google.gwt.dom.client.NodeList; | |
| 15 import com.google.gwt.dom.client.Style; | |
| 16 import com.google.gwt.regexp.shared.MatchResult; | |
| 17 import com.google.gwt.regexp.shared.RegExp; | |
| 18 | |
| 19 import java.util.HashSet; | |
| 20 import java.util.Set; | |
| 21 | |
| 22 /** | |
| 23 * Background: | |
| 24 * The long article/news/forum thread/blog document may be partitioned into se veral partial pages | |
| 25 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The | |
| 26 * anchor text of those outlinks is numeric. | |
| 27 * | |
| 28 * This class parses the document to collect groups of adjacent plain text numbe rs and outlinks with | |
| 29 * digital anchor text. These are then passed to PageParameterParser which woul d spit out the | |
| 30 * pagination URLs if available. | |
| 31 */ | |
| 32 public class PageParameterParser { | |
| 33 // If the numeric value of a link's anchor text is greater than this number, we don't think it | |
| 34 // represents the page number of the link. | |
| 35 private static final int MAX_NUM_FOR_PAGE_PARAM = 100; | |
| 36 | |
| 37 /** | |
| 38 * Type of sibling to check. | |
| 39 */ | |
| 40 private static enum Sibling { | |
| 41 PREV, // Node.getPreviousSibling(). | |
| 42 NEXT, // Node.getNextSibling(). | |
| 43 } | |
| 44 | |
| 45 /** | |
| 46 * Entry point for PageParameterParser. | |
| 47 * Parses the document to collect outlinks with digital anchor text and nume ric text around | |
| 48 * them. These are then passed to PageParameterParser to detect pagination URLs. | |
| 49 * | |
| 50 * @return PageParamInfo (see PageParamInfo.java), always. If no page param eter is detected or | |
| 51 * determined to be best, its mType is PageParamInfo.Type.UNSET. | |
| 52 * | |
| 53 * @param originalUrl the original URL of the document to be parsed. | |
| 54 * @param timingInfo for tracking performance. | |
| 55 */ | |
| 56 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { | |
| 57 PageParameterParser parser = new PageParameterParser(timingInfo); | |
| 58 return parser.parseDocument(Document.get().getDocumentElement(), origina lUrl); | |
| 59 } | |
| 60 | |
| 61 private final TimingInfo mTimingInfo; | |
| 62 private String mDocUrl = ""; | |
| 63 private ParsedUrl mParsedUrl = null; | |
| 64 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new Monotoni cPageInfosGroups(); | |
| 65 // Siblings that have been processed before and after each link. | |
| 66 private final Set<Node> mProcessedSiblings = new HashSet<Node>(); | |
| 67 | |
| 68 private static RegExp sHrefCleaner = null; | |
| 69 | |
| 70 private PageParameterParser(TimingInfo timingInfo) { | |
| 71 mTimingInfo = timingInfo; | |
| 72 } | |
| 73 | |
| 74 /** | |
| 75 * Acutually implements PageParameterParser.parse(), see above description f or parse(). | |
| 76 */ | |
| 77 private PageParamInfo parseDocument(Element root, String originalUrl) { | |
| 78 double startTime = DomUtil.getTime(); | |
| 79 | |
| 80 mDocUrl = originalUrl; | |
| 81 mParsedUrl = ParsedUrl.create(mDocUrl); | |
| 82 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. | |
| 83 | |
| 84 mAdjacentNumbersGroups.addGroup(); | |
| 85 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( | |
| 86 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); | |
| 87 | |
| 88 NodeList<Element> allLinks = root.getElementsByTagName("A"); | |
|
cjhopman
2015/06/26 00:35:25
What do you think of this alternative approach:
a
kuan
2015/07/13 22:57:03
Done.
| |
| 89 for (int i = 0; i < allLinks.getLength(); i++) { | |
| 90 final AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | |
| 91 | |
| 92 String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor ); | |
| 93 ParsedUrl url = ParsedUrl.create(linkHref); | |
| 94 if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHos t())) continue; | |
| 95 url.setHash(""); | |
| 96 | |
| 97 // Use javascript innerText (instead of javascript textContent) to o nly get visible | |
| 98 // text. | |
| 99 String linkText = DomUtil.getInnerText(link); | |
| 100 int number = linkTextToNumber(linkText); | |
| 101 if (isPlainPageNumber(number)) { | |
| 102 // Before we append the link to the current group of adjacent nu mbers, check if it's | |
| 103 // preceded by a sibling with text, which would determine if the current adjacency | |
| 104 // continues or ends. | |
| 105 Node parentWrapper = null; | |
| 106 if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling. | |
| 107 // The link could be a child of a parent that is simply a wr apper, i.e. with no | |
| 108 // extra text, in which case, we should be checking the sibl ings of the topmost | |
| 109 // parent wrapper. | |
| 110 parentWrapper = findParentWrapper(link, linkText.length()); | |
| 111 if (parentWrapper != null) checkForSiblingWithText(parentWra pper, Sibling.PREV); | |
| 112 } | |
| 113 | |
| 114 // This link is a good candidate for pagination, add it to the c urrent group of | |
| 115 // adjacent numbers. | |
| 116 if (isDisabledLink(link)) { | |
| 117 linkHref = ""; | |
| 118 } else { | |
| 119 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?( #.*)?$"); | |
| 120 linkHref = sHrefCleaner.replace(url.toString(), ""); | |
| 121 } | |
| 122 mAdjacentNumbersGroups.addNumber(number, linkHref); | |
| 123 | |
| 124 // Check if the link is followed by a sibling with text, which w ould determine if | |
| 125 // the current adjacency continues or ends. | |
| 126 checkForSiblingWithText(parentWrapper == null ? link : parentWra pper, Sibling.NEXT); | |
| 127 } else { | |
| 128 // This link is not a valid pagination link, so current group of adjacent numbers | |
| 129 // should be closed, adding a new group if possible. | |
| 130 mAdjacentNumbersGroups.addGroup(); | |
| 131 } | |
| 132 } // for all links | |
| 133 | |
| 134 mAdjacentNumbersGroups.cleanup(); | |
| 135 | |
| 136 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); | |
| 137 | |
| 138 startTime = DomUtil.getTime(); | |
| 139 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl); | |
| 140 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); | |
| 141 return info; | |
| 142 } | |
| 143 | |
| 144 private static RegExp sTermsRegExp = null; // Match terms i.e. words. | |
| 145 private static RegExp sSurroundingDigitsRegExp = null; // Match term with o nly digits. | |
| 146 | |
| 147 /** | |
| 148 * Checks for previous or next sibling with word text. If the text contains digit(s) as terms | |
| 149 * that form a valid page number, the sibling is added to the current group of adjacent | |
| 150 * numbers. Otherwise, the current group of adjacent numbers is closed to e nd the current | |
| 151 * adjacency, and a new group is started. | |
| 152 * | |
| 153 * @return true if given start node has at least 1 sibling, false otherwise. | |
| 154 | |
| 155 * @param start node to start checking with. | |
| 156 * @param sibling previous or next sibling. | |
| 157 */ | |
| 158 private boolean checkForSiblingWithText(Node start, Sibling sibling) { | |
| 159 Node node = start; | |
| 160 Node prevNode = null; | |
| 161 String text = ""; | |
| 162 // Find the first sibling that has inner text with words. | |
| 163 do { | |
| 164 prevNode = node; | |
| 165 node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.ge tNextSibling(); | |
| 166 if (node == null && prevNode == start) return false; | |
| 167 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE || | |
| 168 !mProcessedSiblings.add(node)) { // Node has been processed . | |
| 169 return true; | |
| 170 } | |
| 171 | |
| 172 if (node.getNodeType() == Node.TEXT_NODE) { | |
| 173 text = node.getNodeValue(); | |
| 174 } else { | |
| 175 Element e = Element.as(node); | |
| 176 | |
| 177 // Ignore non-void anchors, since we're iterating through them. | |
| 178 if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true ; | |
| 179 | |
| 180 // Ignore non-void link children, since we're iterating through them. | |
| 181 NodeList<Element> linkChildren = e.getElementsByTagName("A"); | |
| 182 for (int i = 0; i < linkChildren.getLength(); i++) { | |
| 183 if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true; | |
| 184 } | |
| 185 | |
| 186 text = DomUtil.getInnerText(e); | |
| 187 } | |
| 188 } while (text.isEmpty() || StringUtil.countWords(text) == 0); | |
| 189 | |
| 190 if (!StringUtil.containsDigit(text)) { | |
| 191 // The sibling does not contain valid number(s), so current group of adjacent numbers | |
| 192 // should be closed, adding a new group if possible. | |
| 193 mAdjacentNumbersGroups.addGroup(); | |
| 194 return true; | |
| 195 } | |
| 196 | |
| 197 if (sTermsRegExp == null) { | |
| 198 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\ \S*)", "gi"); | |
| 199 } else { | |
| 200 sTermsRegExp.setLastIndex(0); | |
| 201 } | |
| 202 if (sSurroundingDigitsRegExp == null) { | |
| 203 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); | |
| 204 } | |
| 205 | |
| 206 // Extract terms from the text, differentiating between those that conta in only digits and | |
| 207 // those that contain non-digits. | |
| 208 while (true) { | |
| 209 MatchResult match = sTermsRegExp.exec(text); | |
| 210 if (match == null) break; | |
| 211 if (match.getGroupCount() <= 1) continue; | |
| 212 | |
| 213 String term = match.getGroup(1); | |
| 214 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); | |
| 215 int number = -1; | |
| 216 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { | |
| 217 number = StringUtil.toNumber(termWithDigits.getGroup(1)); | |
| 218 } | |
| 219 if (isPlainPageNumber(number)) { | |
| 220 // This sibling is a valid candidate of plain text page number, add it to last | |
| 221 // group of adjacent numbers. | |
| 222 mAdjacentNumbersGroups.addNumber(number, ""); | |
| 223 } else { | |
| 224 // The sibling is not a valid number, so current group of adjace nt numbers | |
| 225 // should be closed, adding a new group if possible. | |
| 226 mAdjacentNumbersGroups.addGroup(); | |
| 227 } | |
| 228 } // while there're matches | |
| 229 | |
| 230 return true; | |
| 231 } | |
| 232 | |
| 233 /** | |
| 234 * @return the topmost parent of the given node that simply wraps the node, i.e. with no more | |
| 235 * inner text than that of given node. | |
| 236 */ | |
| 237 private static Node findParentWrapper(Node node, int nodeTextLen) { | |
| 238 Node parent = node; | |
| 239 Node prevParent = null; | |
| 240 // While keeping track of each parent, once we find the first one that h as more text than | |
| 241 // given node, the previous parent would be what we want. | |
| 242 do { | |
| 243 prevParent = parent; | |
| 244 parent = parent.getParentNode(); | |
| 245 } while (parent != null && DomUtil.getInnerText(parent).length() == node TextLen); | |
| 246 | |
| 247 return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_N ODE ? | |
| 248 null : prevParent; | |
| 249 } | |
| 250 | |
| 251 /** | |
| 252 * @return true if link is disabled i.e. not clickable because it has a text cursor. | |
| 253 */ | |
| 254 private static boolean isDisabledLink(AnchorElement link) { | |
| 255 Style style = DomUtil.getComputedStyle(link); | |
| 256 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu rsor.TEXT; | |
| 257 } | |
| 258 | |
| 259 /** | |
| 260 * @return true if link should be ignored when checking for sibling. | |
| 261 * All non-void links will be processed, and hence could be ignored when che cking for sibling. | |
| 262 * Void links are rejected at the beginning because their hosts are differen t from | |
| 263 * originalUrl's, so they should be considered when checking for sibling. | |
| 264 */ | |
| 265 private static boolean shouldIgnoreLinkSibling(Element link) { | |
| 266 return !AnchorElement.as(link).getHref().equals("javascript:void(0)"); | |
| 267 } | |
| 268 | |
| 269 private static int linkTextToNumber(String linkText) { | |
| 270 linkText = linkText.replaceAll("[()\\[\\]{}]", ""); | |
| 271 linkText = linkText.trim(); // Remove leading and trailing whitespaces. | |
| 272 // Remove duplicate internal whitespaces. | |
| 273 linkText = linkText.replaceAll("\\s\\{2,\\}", " "); | |
| 274 return StringUtil.toNumber(linkText); | |
| 275 } | |
| 276 | |
| 277 /** | |
| 278 * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM. | |
| 279 */ | |
| 280 private static boolean isPlainPageNumber(int number) { | |
| 281 return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM; | |
| 282 } | |
| 283 | |
| 284 } | |
| OLD | NEW |