Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 package org.chromium.distiller; | |
| 6 | |
| 7 import org.chromium.distiller.proto.DomDistillerProtos; | |
| 8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; | |
| 9 | |
| 10 import com.google.gwt.dom.client.AnchorElement; | |
| 11 import com.google.gwt.dom.client.Document; | |
| 12 import com.google.gwt.dom.client.Element; | |
| 13 import com.google.gwt.dom.client.Node; | |
| 14 import com.google.gwt.dom.client.NodeList; | |
| 15 import com.google.gwt.dom.client.Style; | |
| 16 import com.google.gwt.regexp.shared.MatchResult; | |
| 17 import com.google.gwt.regexp.shared.RegExp; | |
| 18 | |
| 19 /** | |
| 20 * Background: | |
| 21 * A long article/news/forum thread/blog document may be partitioned into several partial pages | |
| 22 * by the webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The | |
| 23 * anchor text of those outlinks is numeric. | |
| 24 * | |
| 25 * This class parses the document to collect groups of adjacent plain text numbers and outlinks with | |
| 26 * numeric anchor text. These are then passed to PageParameterDetector which would spit out the | |
| 27 * pagination URLs if available. | |
| 28 */ | |
| 29 public class PageParameterParser { | |
| 30 // If the numeric value of a link's anchor text is greater than this number, we don't think it | |
| 31 // represents the page number of the link. | |
| 32 private static final int MAX_NUM_FOR_PAGE_PARAM = 100; | |
| 33 | |
| 34 /** | |
| 35 * Stores PageParamInfo.PageInfo and the anchor's text, specifically returned by | |
| 36 * getPageInfoAndText(). | |
| 37 */ | |
| 38 private static class PageInfoAndText { | |
| 39 private final PageParamInfo.PageInfo mPageInfo; | |
| 40 private final String mText; | |
| 41 | |
| 42 PageInfoAndText(int number, String url, String text) { | |
| 43 mPageInfo = new PageParamInfo.PageInfo(number, url); | |
| 44 mText = text; | |
| 45 } | |
| 46 } | |
| 47 | |
| 48 /** | |
| 49 * Entry point for PageParameterParser. | |
| 50 * Parses the document to collect outlinks with numeric anchor text and numeric text around | |
|
wychen
2015/09/21 23:08:03
Does digital mean the same thing as numeric?
kuan
2015/10/02 15:59:17
Done.
| |
| 51 * them. These are then passed to PageParameterDetector to detect pagination URLs. | |
| 52 * | |
| 53 * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or | |
| 54 * determined to be best, its mType is PageParamInfo.Type.UNSET. | |
| 55 * | |
| 56 * @param originalUrl the original URL of the document to be parsed. | |
| 57 * @param timingInfo for tracking performance. | |
| 58 */ | |
| 59 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { | |
| 60 PageParameterParser parser = new PageParameterParser(timingInfo); | |
| 61 return parser.parseDocument(Document.get().getDocumentElement(), originalUrl); | |
| 62 } | |
| 63 | |
| 64 private final TimingInfo mTimingInfo; | |
| 65 private String mDocUrl = ""; | |
| 66 private ParsedUrl mParsedUrl = null; | |
| 67 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups(); | |
| 68 private int mNumForwardLinksProcessed = 0; | |
| 69 | |
| 70 private static RegExp sHrefCleaner = null; | |
| 71 private static RegExp sInvalidParentWrapper = null; | |
| 72 | |
| 73 private PageParameterParser(TimingInfo timingInfo) { | |
| 74 mTimingInfo = timingInfo; | |
| 75 } | |
| 76 | |
| 77 /** | |
| 78 * Actually implements PageParameterParser.parse(); see the description of parse() above. | |
| 79 */ | |
| 80 private PageParamInfo parseDocument(Element root, String originalUrl) { | |
| 81 double startTime = DomUtil.getTime(); | |
| 82 | |
| 83 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("\\/$"); | |
|
wychen
2015/09/21 23:08:03
Is this faster than eager initialization? If these
kuan
2015/10/02 15:59:17
sHrefCleaner is always used, so i've changed to in
| |
| 84 if (sInvalidParentWrapper == null) sInvalidParentWrapper = RegExp.compile("(BODY)|(HTML)"); | |
| 85 | |
| 86 mDocUrl = sHrefCleaner.replace(originalUrl, ""); | |
| 87 mParsedUrl = ParsedUrl.create(mDocUrl); | |
| 88 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. | |
| 89 | |
| 90 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( | |
| 91 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); | |
| 92 | |
| 93 NodeList<Element> allLinks = root.getElementsByTagName("A"); | |
| 94 int idx = 0; | |
| 95 while (idx < allLinks.getLength()) { | |
| 96 final AnchorElement link = AnchorElement.as(allLinks.getItem(idx)); | |
| 97 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); | |
| 98 if (pageInfoAndText == null) { | |
| 99 idx++; | |
| 100 continue; | |
| 101 } | |
| 102 | |
| 103 // This link is a good candidate for pagination. | |
| 104 | |
| 105 // Close current group of adjacent numbers, add a new group if necessary. | |
| 106 mAdjacentNumbersGroups.addGroup(); | |
| 107 | |
| 108 // Before we append the link to the new group of adjacent numbers, check if it's | |
| 109 // preceded by a text node with numeric text; if so, add it before the link. | |
| 110 findAndAddClosestValidLeafNodes(link, false, true, null); | |
| 111 | |
| 112 // Add the link to the current group of adjacent numbers. | |
| 113 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); | |
| 114 | |
| 115 // Add all following text nodes and links with numeric text. | |
| 116 mNumForwardLinksProcessed = 0; | |
| 117 findAndAddClosestValidLeafNodes(link, false, false, baseAnchor); | |
| 118 | |
| 119 // Skip the current link and links already processed in the forward | |
| 120 // findAndAddClosestValidLeafNodes(). | |
| 121 idx += 1 + mNumForwardLinksProcessed; | |
| 122 } // while there're links. | |
| 123 | |
| 124 mAdjacentNumbersGroups.cleanup(); | |
| 125 | |
| 126 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); | |
| 127 | |
| 128 startTime = DomUtil.getTime(); | |
| 129 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl); | |
| 130 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); | |
| 131 return info; | |
| 132 } | |
| 133 | |
| 134 /** | |
| 135 * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups. | |
| 136 * Otherwise, returns null if link is to be ignored. | |
| 137 * "javascript:" links with numeric text are considered valid links to be added. | |
|
wychen
2015/09/21 23:08:03
nit: not necessarily void.
kuan
2015/10/02 15:59:17
Done.
| |
| 138 * | |
| 139 * @param link to process. | |
| 140 * @param baseAnchor created for the current document. | |
| 141 */ | |
| 142 private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) { | |
| 143 // Ignore invisible links. | |
| 144 if (!DomUtil.isVisible(link)) return null; | |
| 145 | |
| 146 String linkHref = resolveLinkHref(link, baseAnchor); | |
| 147 final boolean isEmptyHref = linkHref.isEmpty(); | |
| 148 boolean isJavascriptLink = false; | |
| 149 ParsedUrl url = null; | |
| 150 if (!isEmptyHref) { | |
| 151 isJavascriptLink = isJavascriptHref(linkHref); | |
| 152 url = ParsedUrl.create(linkHref); | |
| 153 if (url == null || | |
| 154 (!isJavascriptLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) { | |
| 155 return null; | |
| 156 } | |
| 157 url.setHash(""); | |
| 158 } | |
| 159 | |
| 160 // Use javascript innerText (instead of javascript textContent) to only get visible text. | |
| 161 String linkText = StringUtil.jsTrim(DomUtil.getInnerText(link)); | |
| 162 int number = linkTextToNumber(linkText); | |
| 163 if (!isPlainPageNumber(number)) return null; | |
|
wychen
2015/09/21 23:08:03
Since most links aren't numbers, can we use this a
kuan
2015/10/02 15:59:17
i can't move it to beginning of while loop - it ne
| |
| 164 | |
| 165 if (isEmptyHref || isJavascriptLink || isDisabledLink(link)) { | |
| 166 return new PageInfoAndText(number, "", linkText); | |
| 167 } | |
| 168 | |
| 169 return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText); | |
| 170 } | |
| 171 | |
| 172 /** | |
| 173 * Finds and adds the leaf node(s) closest to the given start node. | |
| 174 * This recurses and keeps finding and, if necessary, adding the numeric text of valid nodes, | |
| 175 * collecting the PageParamInfo.PageInfo's for the current adjacency group. | |
| 176 * For backward search, i.e. nodes before start node, search terminates (i.e. recursion stops) | |
| 177 * once a text node or anchor is encountered. If the text node contains numeric text, it's | |
| 178 * added to the current adjacency group. Otherwise, a new group is created to break the | |
| 179 * adjacency. | |
| 180 * For forward search, i.e. nodes after start node, search continues (i.e. recursion continues) | |
| 181 * until a text node or anchor with non-numeric text is encountered. In the process, text nodes | |
| 182 * and anchors with numeric text are added to the current adjacency group. When a non-numeric | |
| 183 * text node or anchor is encountered, a new group is started to break the adjacency, and search | |
| 184 * ends. | |
| 185 * | |
| 186 * @return true to continue search, false to stop. | |
| 187 * | |
| 188 * @param start node to work on. | |
| 189 * @param checkStart true to check start node. Otherwise, the previous or next sibling of the | |
| 190 * start node is checked. | |
| 191 * @param backward true to search backward (i.e. nodes before start node), false to search | |
| 192 * forward (i.e. nodes after start node). | |
| 193 * @param baseAnchor created for the current document, only needed for forward search. | |
| 194 */ | |
| 195 private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkStart, | |
| 196 boolean backward, AnchorElement baseAnchor) { | |
| 197 Node node = checkStart ? start : | |
| 198 (backward ? start.getPreviousSibling() : start.getNextSibling()); | |
| 199 if (node == null) { // No sibling, try parent. | |
| 200 node = start.getParentNode(); | |
| 201 if (sInvalidParentWrapper.test(node.getNodeName())) return false; | |
| 202 return findAndAddClosestValidLeafNodes(node, false, backward, baseAnchor); | |
| 203 } | |
| 204 | |
| 205 checkStart = false; | |
| 206 switch (node.getNodeType()) { | |
| 207 case Node.TEXT_NODE: | |
| 208 String text = node.getNodeValue(); | |
| 209 // Text must contain words. | |
| 210 if (text.isEmpty() || StringUtil.countWords(text) == 0) break; | |
| 211 boolean added = addNonLinkTextIfValid(node.getNodeValue()); | |
| 212 // For backward search, we're done regardless if text was added. | |
| 213 // For forward search, we're done only if text was invalid, otherwise continue. | |
| 214 if (backward || !added) return false; | |
| 215 break; | |
| 216 | |
| 217 case Node.ELEMENT_NODE: | |
| 218 Element e = Element.as(node); | |
| 219 if (e.hasTagName("A")) { | |
| 220 // For backward search, we're done because we've already processed the anchor. | |
| 221 if (backward) return false; | |
| 222 // For forward search, we're done only if link was invalid, otherwise continue. | |
| 223 mNumForwardLinksProcessed++; | |
| 224 if (!addLinkIfValid(AnchorElement.as(e), baseAnchor)) return false; | |
| 225 break; | |
| 226 } | |
| 227 // Intentionally fall through. | |
| 228 | |
| 229 default: | |
| 230 // Check children nodes. | |
| 231 if (!node.hasChildNodes()) break; | |
| 232 checkStart = true; // We want to check the child node. | |
| 233 if (backward) { | |
| 234 // Start the backward search with the rightmost child i.e. last and closest to | |
| 235 // given node. | |
| 236 node = node.getLastChild(); | |
| 237 } else { | |
| 238 // Start the forward search with the leftmost child i.e. first and closest to | |
| 239 // given node. | |
|
wychen
2015/09/21 23:08:03
nit: indentation
kuan
2015/10/02 15:59:17
Done.
| |
| 240 node = node.getFirstChild(); | |
| 241 } | |
| 242 break; | |
| 243 } | |
| 244 | |
| 245 return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseAnchor); | |
| 246 } | |
| 247 | |
| 248 private static RegExp sTermsRegExp = null; // Match terms i.e. words. | |
| 249 private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits. | |
| 250 | |
| 251 /** | |
| 252 * Handle the text for a non-link node. Each numeric term in the text that is a valid plain | |
| 253 * page number adds a PageParamInfo.PageInfo into the current adjacent group. All other terms | |
| 254 * break the adjacency in the current group, adding a new group instead. | |
| 255 * | |
| 256 * @return true if text was added to current group of adjacent numbers. Otherwise, false with | |
| 257 * a new group created to break the current adjacency. | |
| 258 */ | |
| 259 private boolean addNonLinkTextIfValid(String text) { | |
| 260 if (!StringUtil.containsDigit(text)) { | |
| 261 // The text does not contain valid number(s); if necessary, current group of adjacent | |
| 262 // numbers should be closed, adding a new group if possible. | |
| 263 mAdjacentNumbersGroups.addGroup(); | |
| 264 return false; | |
| 265 } | |
| 266 | |
| 267 if (sTermsRegExp == null) { | |
| 268 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi"); | |
| 269 } else { | |
| 270 sTermsRegExp.setLastIndex(0); | |
| 271 } | |
| 272 if (sSurroundingDigitsRegExp == null) { | |
| 273 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); | |
| 274 } | |
| 275 | |
| 276 // Extract terms from the text, differentiating between those that contain only digits and | |
| 277 // those that contain non-digits. | |
| 278 boolean added = false; | |
| 279 while (true) { | |
| 280 MatchResult match = sTermsRegExp.exec(text); | |
| 281 if (match == null) break; | |
| 282 if (match.getGroupCount() <= 1) continue; | |
| 283 | |
| 284 String term = match.getGroup(1); | |
| 285 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); | |
| 286 int number = -1; | |
| 287 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { | |
| 288 number = StringUtil.toNumber(termWithDigits.getGroup(1)); | |
| 289 } | |
| 290 if (isPlainPageNumber(number)) { | |
| 291 // This text is a valid candidate of plain text page number, add it to last group of | |
| 292 // adjacent numbers. | |
| 293 mAdjacentNumbersGroups.addNumber(number, ""); | |
| 294 added = true; | |
| 295 } else { | |
| 296 // The text is not a valid number, so current group of adjacent numbers should be | |
| 297 // closed, adding a new group if possible. | |
| 298 mAdjacentNumbersGroups.addGroup(); | |
| 299 } | |
| 300 } // while there're matches | |
| 301 | |
| 302 return added; | |
| 303 } | |
| 304 | |
| 305 /** | |
| 306 * Adds PageParamInfo.PageInfo to the current adjacent group for a link if its text is numeric. | |
| 307 * Otherwise, adds a new group to break the adjacency. | |
| 308 * | |
| 309 * @return true if link was added, false otherwise. | |
| 310 */ | |
| 311 private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor) { | |
| 312 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); | |
| 313 if (pageInfoAndText != null) { | |
| 314 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); | |
| 315 return true; | |
| 316 } | |
| 317 mAdjacentNumbersGroups.addGroup(); | |
| 318 return false; | |
| 319 } | |
| 320 | |
| 321 /** | |
| 322 * @return true if link is disabled i.e. not clickable because it has a text cursor. | |
| 323 */ | |
| 324 private static boolean isDisabledLink(AnchorElement link) { | |
| 325 // Compare the raw CSS value: Style.Cursor.valueOf() throws on cursors with no enum constant. | |
| 326 return "text".equalsIgnoreCase(DomUtil.getComputedStyle(link).getCursor()); | |

wychen
2015/09/21 23:08:03
Even if the cursor style is different, the link is
kuan
2015/10/02 15:59:17
no, the link is not clickable - it behaves like re
| |
| 327 } | |
| 328 | |
| 329 /** | |
| 330 * @return true if href starts with "javascript:". | |
| 331 */ | |
| 332 private static boolean isJavascriptHref(String href) { | |
| 333 return href.startsWith("javascript:"); | |
| 334 } | |
| 335 | |
| 336 private static String resolveLinkHref(AnchorElement link, AnchorElement baseAnchor) { | |
| 337 String linkHref = link.getAttribute("href"); | |
| 338 if (linkHref.isEmpty()) return ""; | |
|
wychen
2015/09/21 23:08:03
If href="", it means the current URL. What's the r
kuan
2015/10/02 15:59:17
anchors w/out "href" attr are not considered pagin
| |
| 339 baseAnchor.setAttribute("href", linkHref); | |
| 340 return baseAnchor.getHref(); | |
| 341 } | |
| 342 | |
| 343 private static int linkTextToNumber(String linkText) { | |
| 344 linkText = linkText.replaceAll("[()\\[\\]{}]", ""); // Strip brackets, e.g. "[2]" -> "2". | |
| 345 linkText = linkText.trim(); // Remove leading and trailing whitespaces. | |
| 346 // Collapse runs of whitespace; braces are left unescaped so that {2,} acts as a quantifier. | |
| 347 linkText = linkText.replaceAll("\\s{2,}", " "); | |

wychen
2015/09/21 23:08:03
Why is this necessary?
kuan
2015/10/02 15:59:17
the original code has this, so i follow suit. how
| |
| 348 return StringUtil.toNumber(linkText); | |
| 349 } | |
| 350 | |
| 351 /** | |
| 352 * @return true if number is >= 0 && <= MAX_NUM_FOR_PAGE_PARAM. | |
| 353 */ | |
| 354 private static boolean isPlainPageNumber(int number) { | |
| 355 return number >= 0 && number <= MAX_NUM_FOR_PAGE_PARAM; | |
| 356 } | |
| 357 | |
| 358 } | |
| OLD | NEW |