Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 package org.chromium.distiller; | |
| 6 | |
| 7 import org.chromium.distiller.proto.DomDistillerProtos; | |
| 8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; | |
| 9 | |
| 10 import com.google.gwt.dom.client.AnchorElement; | |
| 11 import com.google.gwt.dom.client.Document; | |
| 12 import com.google.gwt.dom.client.Element; | |
| 13 import com.google.gwt.dom.client.Node; | |
| 14 import com.google.gwt.dom.client.NodeList; | |
| 15 import com.google.gwt.dom.client.Style; | |
| 16 import com.google.gwt.regexp.shared.MatchResult; | |
| 17 import com.google.gwt.regexp.shared.RegExp; | |
| 18 | |
| 19 /** | |
| 20 * Background: | |
| 21 * A long article/news/forum thread/blog document may be partitioned into several partial pages | |
| 22 * by the webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The | |
| 23 * anchor text of those outlinks is numeric. | |
| 24 * | |
| 25 * This class parses the document to collect groups of adjacent plain text numbers and outlinks with | |
| 26 * numeric anchor text. These are then passed to PageParameterDetector, which detects the | |
| 27 * pagination URLs if available. | |
| 28 */ | |
| 29 public class PageParameterParser { | |
| 30 // If the numeric value of a link's anchor text is greater than this number, we don't think it | |
| 31 // represents the page number of the link. | |
| 32 private static final int MAX_NUM_FOR_PAGE_PARAM = 100; | |
| 33 | |
| 34 /** | |
| 35 * Value holder pairing a PageParamInfo.PageInfo with the text of the anchor it was built from; | |
| 36 * instances are what getPageInfoAndText() hands back to its callers. | |
| 37 */ | |
| 38 private static class PageInfoAndText { | |
| 39 private final String mText; | |
| 40 private final PageParamInfo.PageInfo mPageInfo; | |
| 41 | |
| 42 PageInfoAndText(int number, String url, String text) { | |
| 43 this.mText = text; | |
| 44 this.mPageInfo = new PageParamInfo.PageInfo(number, url); | |
| 45 } | |
| 46 } | |
| 47 | |
| 48 /** | |
| 49 * Entry point for PageParameterParser. | |
| 50 * Parses the current document to collect outlinks with numeric anchor text and numeric text around | |
| 51 * them. These are then passed to PageParameterDetector to detect pagination URLs. | |
| 52 * | |
| 53 * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or | |
| 54 * determined to be best, its mType is PageParamInfo.Type.UNSET. | |
| 55 * | |
| 56 * @param originalUrl the original URL of the document to be parsed. | |
| 57 * @param timingInfo for tracking performance. | |
| 58 */ | |
| 59 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { | |
| 60 PageParameterParser parser = new PageParameterParser(timingInfo); | |
| 61 return parser.parseDocument(Document.get().getDocumentElement(), originalUrl); | |
| 62 } | |
| 63 | |
| 64 private final TimingInfo mTimingInfo; // Receives the timing entries logged by parseDocument(). | |
| 65 private String mDocUrl = ""; // Set from originalUrl at the start of parseDocument(). | |
| 66 private ParsedUrl mParsedUrl = null; // Parsed form of mDocUrl; its host is compared with link hosts. | |
| 67 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups(); | |
| 68 | |
| 69 private static RegExp sHrefCleaner = null; // Compiled on first use in getPageInfoAndText(). | |
| 70 | |
| 71 private PageParameterParser(TimingInfo timingInfo) { | |
| 72 mTimingInfo = timingInfo; | |
| 73 } | |
| 74 | |
| 75 /** | |
| 76 * Actually implements PageParameterParser.parse(); see the description of parse() above. | |
| 77 */ | |
| 78 private PageParamInfo parseDocument(Element root, String originalUrl) { | |
| 79 double startTime = DomUtil.getTime(); | |
| 80 | |
| 81 mDocUrl = originalUrl; | |
| 82 mParsedUrl = ParsedUrl.create(mDocUrl); | |
| 83 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. | |
| 84 | |
| 85 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( | |
| 86 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); | |
| 87 | |
| 88 NodeList<Element> allLinks = root.getElementsByTagName("A"); | |
| 89 int idx = 0; | |
| 90 while (idx < allLinks.getLength()) { | |
| 91 final AnchorElement link = AnchorElement.as(allLinks.getItem(idx)); | |
| 92 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); | |
| 93 if (pageInfoAndText == null) { | |
| 94 idx++; | |
| 95 continue; | |
| 96 } | |
| 97 | |
| 98 // This link is a good candidate for pagination. | |
| 99 | |
| 100 // Close current group of adjacent numbers, add a new group if necessary. | |
| 101 mAdjacentNumbersGroups.addGroup(); | |
| 102 | |
| 103 // Before we append the link to the new group of adjacent numbers, check if it's | |
| 104 // preceded by a sibling with text; if so, add it before the link. | |
| 105 Node parentWrapper = null; | |

cjhopman
2015/07/29 01:07:53
What's this parent wrapper? I don't recall that be
kuan
2015/07/30 16:47:00
i had it in the previous change, and attempted to
cjhopman
2015/08/04 21:58:41
But why the parent wrapper thing? Why not just wal
kuan
2015/08/04 22:38:37
what do u mean by "backwards/forwards in the tree"
kuan
2015/08/11 19:09:38
Done.
| |
| 106 if (!checkForPrevSiblingWithText(link)) { // Link has no previous sibling. | |
| 107 // The link could be a child of a parent that is simply a wrapper, i.e. with no | |
| 108 // extra text, in which case, we should be checking the siblings of the topmost | |
| 109 // parent wrapper. | |
| 110 parentWrapper = findParentWrapper(link, pageInfoAndText.mText.length()); | |
| 111 if (parentWrapper != null) checkForPrevSiblingWithText(parentWrapper); | |
| 112 } | |
| 113 | |
| 114 // Add the link to the current group of adjacent numbers. | |
| 115 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); | |
| 116 | |
| 117 // Add all following siblings with numeric text, with or without links. | |
| 118 int numLinksAdded = 0; | |
| 119 if (parentWrapper == null) | |
| 120 numLinksAdded = addFollowingSiblings(link, false, baseAnchor); | |
| 121 else | |
| 122 numLinksAdded = addFollowingSiblings(parentWrapper, true, baseAnchor); | |
| 123 | |
| 124 // Skip the current link and links already processed in addFollowingSiblings(). | |
| 125 idx += 1 + numLinksAdded; | |
| 126 } // while there're links. | |
| 127 | |
| 128 mAdjacentNumbersGroups.cleanup(); | |
| 129 | |
| 130 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); | |
| 131 | |
| 132 startTime = DomUtil.getTime(); | |
| 133 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl); | |
| 134 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); | |
| 135 return info; | |
| 136 } | |
| 137 | |
| 138 | |
| 139 /** | |
| 140 * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups. | |
| 141 * Otherwise, returns null if link is to be ignored (invisible, unparsable or off-host href, or text that is not a plain page number). | |
| 142 * "javascript:void" links with numeric text are considered valid links to be added. | |
| 143 * | |
| 144 * @param link to process. | |
| 145 * @param baseAnchor created for the current document. | |
| 146 */ | |
| 147 private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) { | |
| 148 // Ignore invisible links. | |
| 149 int width = link.getOffsetWidth(); | |
| 150 int height = link.getOffsetHeight(); | |
| 151 if (width == 0 || height == 0 || !DomUtil.isVisible(link)) return null; | |

cjhopman
2015/07/29 01:07:53
It seems odd that invisible links are handled here
kuan
2015/07/30 16:47:00
invisible links need to be ignored.  i do this her
| |
| 152 | |
| 153 String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor); | |
| 154 boolean isVoidLink = isVoidHref(linkHref); | |
| 155 ParsedUrl url = ParsedUrl.create(linkHref); | |
| 156 if (url == null || (!isVoidLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) { | |
| 157 return null; | |
| 158 } | |
| 159 | |
| 160 url.setHash(""); // Drop the fragment. | |
| 161 | |
| 162 // Use javascript innerText (instead of javascript textContent) to only get visible text. | |
| 163 String linkText = DomUtil.getInnerText(link); | |
| 164 int number = linkTextToNumber(linkText); | |
| 165 if (!isPlainPageNumber(number)) return null; | |
| 166 | |
| 167 if (isVoidLink || isDisabledLink(link)) return new PageInfoAndText(number, "", linkText); | |
| 168 | |
| 169 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$"); | |
| 170 return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText); | |
| 171 } | |
| 172 | |
| 173 /** | |
| 174 * Checks for previous sibling with word text. If the text contains digit(s) as terms that | |
| 175 * form a valid page number, the sibling is added to the current group of adjacent numbers. | |
| 176 * Otherwise, the current group of adjacent numbers is closed to end the current adjacency, and | |
| 177 * a new group is started. | |
| 178 * | |
| 179 * @return false if given start node has no previous sibling at all, true otherwise. | |
| 180 * | |
| 181 * @param start node to start checking with. | |
| 182 */ | |
| 183 private boolean checkForPrevSiblingWithText(Node start) { | |

cjhopman
2015/07/29 01:07:53
I'm having difficulty understanding both the way t
kuan
2015/07/30 16:47:00
i initially had the check for previous and next nu
| |
| 184 Node node = start; | |
| 185 Node prevNode = null; | |
| 186 String text = ""; | |
| 187 // Walk backwards to the nearest previous sibling whose text contains at least one word. | |
| 188 do { | |
| 189 prevNode = node; | |
| 190 node = node.getPreviousSibling(); | |
| 191 if (node == null && prevNode == start) return false; | |
| 192 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return true; | |
| 193 | |
| 194 if (node.getNodeType() == Node.TEXT_NODE) { | |
| 195 text = node.getNodeValue(); | |
| 196 } else { | |
| 197 Element e = Element.as(node); | |
| 198 // Previous link siblings or children have already been processed. | |
| 199 if (e.hasTagName("A") || e.getElementsByTagName("A").getLength() > 0) return true; | |
| 200 text = DomUtil.getInnerText(e); | |
| 201 } | |
| 202 } while (text.isEmpty() || StringUtil.countWords(text) == 0); | |
| 203 | |
| 204 addNumberText(text); | |
| 205 return true; | |
| 206 } | |
| 207 | |
| 208 /** | |
| 209 * Adds all following siblings (links and non-links) with numeric text. If the text contains | |
| 210 * digit(s) as terms that form a valid page number, the sibling is added to the current group of | |
| 211 * adjacent numbers. Otherwise, the current group of adjacent numbers is closed to end the | |
| 212 * current adjacency, and a new group is started. | |
| 213 * | |
| 214 * @return number of links visited; a link is counted whether or not it ends up being added. | |
| 215 * | |
| 216 * @param start node to start checking with. | |
| 217 * @param isParentWrapper true if given start node is a parent wrapper of a link. | |
| 218 * @param baseAnchor created for the current document. | |
| 219 */ | |
| 220 private int addFollowingSiblings(Node start, boolean isParentWrapper, | |
| 221 AnchorElement baseAnchor) { | |
| 222 Node node = start; | |
| 223 String text = ""; | |
| 224 int numLinksProcessed = 0; | |
| 225 // Visit every following sibling: links go through addValidLink(), any other node that has | |
| 226 // word text goes through addNumberText(). | |
| 227 while (true) { | |
| 228 // Advance; running out of siblings (or reaching a document node) ends the walk. | |
| 229 node = node.getNextSibling(); | |
| 230 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return numLinksProcessed; | |
| 231 | |
| 232 boolean handled = false; | |
| 233 if (node.getNodeType() == Node.TEXT_NODE) { | |
| 234 text = node.getNodeValue(); | |
| 235 } else { | |
| 236 Element e = Element.as(node); | |
| 237 if (e.hasTagName("A")) { | |
| 238 addValidLink(AnchorElement.as(e), baseAnchor); | |
| 239 numLinksProcessed++; | |
| 240 handled = true; | |
| 241 } else if (isParentWrapper) { | |
| 242 NodeList<Element> linkChildren = e.getElementsByTagName("A"); | |
| 243 final int numChildren = linkChildren.getLength(); | |
| 244 for (int i = 0; i < numChildren; i++) { | |
| 245 addValidLink(AnchorElement.as(linkChildren.getItem(i)), baseAnchor); | |
| 246 numLinksProcessed++; | |
| 247 } | |
| 248 if (numChildren > 0) handled = true; | |
| 249 } | |
| 250 | |
| 251 text = handled ? "" : DomUtil.getInnerText(e); | |
| 252 } | |
| 253 | |
| 254 if (!text.isEmpty() && StringUtil.countWords(text) > 0) addNumberText(text); | |
| 255 } | |
| 256 } | |
| 257 | |
| 258 private static RegExp sTermsRegExp = null; // Match terms i.e. words. | |
| 259 private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits. | |
| 260 | |
| 261 /** | |
| 262 * Adds a plain-text page number for each digits-only term in text; any other term closes the current group. | |

cjhopman
2015/07/29 01:07:53
It looks like the text doesn't have to be strictly
kuan
2015/07/30 16:47:00
Done.  renamed fn too.
| |
| 263 */ | |
| 264 private void addNumberText(String text) { | |
| 265 if (!StringUtil.containsDigit(text)) { | |
| 266 // The sibling does not contain valid number(s); if necessary, current group of adjacent | |
| 267 // numbers should be closed, adding a new group if possible. | |
| 268 mAdjacentNumbersGroups.addGroup(); | |
| 269 return; | |
| 270 } | |
| 271 | |
| 272 if (sTermsRegExp == null) { | |
| 273 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi"); | |
| 274 } else { | |
| 275 sTermsRegExp.setLastIndex(0); // The global regex is reused; restart matching at index 0. | |
| 276 } | |
| 277 if (sSurroundingDigitsRegExp == null) { | |
| 278 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); | |
| 279 } | |
| 280 | |
| 281 // Extract terms from the text, differentiating between those that contain only digits and | |
| 282 // those that contain non-digits. | |
| 283 while (true) { | |
| 284 MatchResult match = sTermsRegExp.exec(text); | |
| 285 if (match == null) break; | |
| 286 if (match.getGroupCount() <= 1) continue; | |
| 287 | |
| 288 String term = match.getGroup(1); | |
| 289 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); | |
| 290 int number = -1; | |
| 291 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { | |
| 292 number = StringUtil.toNumber(termWithDigits.getGroup(1)); | |
| 293 } | |
| 294 if (isPlainPageNumber(number)) { | |
| 295 // This sibling is a valid candidate of plain text page number, add it to last | |
| 296 // group of adjacent numbers. | |
| 297 mAdjacentNumbersGroups.addNumber(number, ""); | |
| 298 } else { | |
| 299 // The sibling is not a valid number, so current group of adjacent numbers | |
| 300 // should be closed, adding a new group if possible. | |
| 301 mAdjacentNumbersGroups.addGroup(); | |
| 302 } | |
| 303 } // while there're matches | |
| 304 } | |
| 305 | |
| 306 /** | |
| 307 * Adds the link's PageParamInfo.PageInfo to the current group if getPageInfoAndText() accepts the link; otherwise does nothing. | |
| 308 */ | |
| 309 private void addValidLink(AnchorElement link, AnchorElement baseAnchor) { | |

cjhopman
2015/07/29 01:07:53
probably rename this to addLinkIfValid() since it
kuan
2015/07/30 16:47:00
Done.  this fn is created simply to prevent duplic
| |
| 310 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); | |
| 311 if (pageInfoAndText != null) mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); | |
| 312 } | |
| 313 | |
| 314 /** | |
| 315 * @return the topmost parent of the given node that simply wraps the node, i.e. with no more | |
| 316 * inner text than that of given node; null if no parent qualifies or the walk ends at a document node. | |
| 317 */ | |
| 318 private static Node findParentWrapper(Node node, int nodeTextLen) { | |
| 319 Node parent = node; | |
| 320 Node prevParent = null; | |
| 321 // While keeping track of each parent, once we find the first one whose text length differs | |
| 322 // from that of the given node, the previous parent would be what we want. | |
| 323 do { | |
| 324 prevParent = parent; | |
| 325 parent = parent.getParentNode(); | |
| 326 } while (parent != null && DomUtil.getInnerText(parent).length() == nodeTextLen); | |
| 327 | |
| 328 return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_NODE ? | |
| 329 null : prevParent; | |
| 330 } | |
| 331 | |
| 332 /** | |
| 333 * @return true if link is disabled i.e. not clickable because it has a text cursor. | |
| 334 */ | |
| 335 private static boolean isDisabledLink(AnchorElement link) { | |
| 336 // Compare the raw CSS keyword; Style.Cursor.valueOf() throws on values such as "not-allowed". | |
| 337 return "text".equalsIgnoreCase(DomUtil.getComputedStyle(link).getCursor()); | |
| 338 } | |
| 339 | |
| 340 /** | |
| 341 * @return true if href is exactly "javascript:void(0)"; false otherwise, including for null. | |
| 342 */ | |
| 343 private static boolean isVoidHref(String href) { | |
| 344 return "javascript:void(0)".equals(href); | |
| 345 } | |
| 346 | |
| 347 private static int linkTextToNumber(String linkText) { | |
| 348 // Strip (), [] and {} and surrounding whitespace so text such as "[2]" is reduced to "2". | |
| 349 linkText = linkText.replaceAll("[()\\[\\]{}]", "").trim(); | |
| 350 // Collapse runs of internal whitespace; the old pattern escaped the braces and never matched. | |
| 351 linkText = linkText.replaceAll("\\s{2,}", " "); | |
| 352 return StringUtil.toNumber(linkText); | |
| 353 } | |
| 354 | |
| 355 /** | |
| 356 * @return true if number lies in the half-open range [0, MAX_NUM_FOR_PAGE_PARAM). | |
| 357 */ | |
| 358 private static boolean isPlainPageNumber(int number) { | |
| 359 return !(number < 0 || number >= MAX_NUM_FOR_PAGE_PARAM); | |
| 360 } | |
| 361 | |
| 362 } | |
| OLD | NEW |