Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(119)

Side by Side Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import org.chromium.distiller.proto.DomDistillerProtos;
8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
9
10 import com.google.gwt.dom.client.AnchorElement;
11 import com.google.gwt.dom.client.Document;
12 import com.google.gwt.dom.client.Element;
13 import com.google.gwt.dom.client.Node;
14 import com.google.gwt.dom.client.NodeList;
15 import com.google.gwt.dom.client.Style;
16 import com.google.gwt.regexp.shared.MatchResult;
17 import com.google.gwt.regexp.shared.RegExp;
18
19 /**
20 * Background:
21 * A long article/news/forum thread/blog document may be partitioned into several partial pages
22 * by the webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The
23 * anchor text of those outlinks is numeric.
24 *
25 * This class parses the document to collect groups of adjacent plain text numbers and outlinks with
26 * digital anchor text. These are then passed to PageParameterDetector which would spit out the
27 * pagination URLs if available.
28 */
29 public class PageParameterParser {
30 // If the numeric value of a link's anchor text is greater than this number, we don't think it
31 // represents the page number of the link.
32 private static final int MAX_NUM_FOR_PAGE_PARAM = 100;
33
34 /**
35 * Stores PageParamInfo.PageInfo and the anchor's text, specifically returne d by
36 * getPageInfoAndText().
37 */
38 private static class PageInfoAndText {
39 private final PageParamInfo.PageInfo mPageInfo;
40 private final String mText;
41
42 PageInfoAndText(int number, String url, String text) {
43 mPageInfo = new PageParamInfo.PageInfo(number, url);
44 mText = text;
45 }
46 }
47
48 /**
49 * Entry point for PageParameterParser.
50 * Parses the document to collect outlinks with digital anchor text and numeric text around
51 * them. These are then passed to PageParameterDetector to detect pagination URLs.
52 *
53 * @return PageParamInfo (see PageParamInfo.java), always. If no page param eter is detected or
54 * determined to be best, its mType is PageParamInfo.Type.UNSET.
55 *
56 * @param originalUrl the original URL of the document to be parsed.
57 * @param timingInfo for tracking performance.
58 */
59 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
60 PageParameterParser parser = new PageParameterParser(timingInfo);
61 return parser.parseDocument(Document.get().getDocumentElement(), origina lUrl);
62 }
63
64 private final TimingInfo mTimingInfo;
65 private String mDocUrl = "";
66 private ParsedUrl mParsedUrl = null;
67 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new Monotoni cPageInfosGroups();
68
69 private static RegExp sHrefCleaner = null;
70
private PageParameterParser(TimingInfo timingInfo) {
    // Kept so that parseDocument() can record how long each phase took.
    this.mTimingInfo = timingInfo;
}
74
75 /**
76 * Actually implements PageParameterParser.parse(), see above description for parse().
77 */
78 private PageParamInfo parseDocument(Element root, String originalUrl) {
// Two timed phases: (1) walk every <A> element under root and collect groups of adjacent
// page numbers into mAdjacentNumbersGroups, (2) hand those groups to PageParameterDetector.
79 double startTime = DomUtil.getTime();
80
81 mDocUrl = originalUrl;
82 mParsedUrl = ParsedUrl.create(mDocUrl);
83 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.
84
// Anchor passed to getPageInfoAndText()/addFollowingSiblings() for resolving link hrefs.
85 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(
86 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));
87
88 NodeList<Element> allLinks = root.getElementsByTagName("A");
// idx is advanced by hand (not a for loop) because one iteration may consume several links:
// see the "idx += 1 + numLinksAdded" step at the bottom of the loop.
89 int idx = 0;
90 while (idx < allLinks.getLength()) {
91 final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));
92 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAncho r);
// A null result means the link was rejected; move on to the next link.
93 if (pageInfoAndText == null) {
94 idx++;
95 continue;
96 }
97
98 // This link is a good candidate for pagination.
99
100 // Close current group of adjacent numbers, add a new group if neces sary.
101 mAdjacentNumbersGroups.addGroup();
102
103 // Before we append the link to the new group of adjacent numbers, c heck if it's
104 // preceded by a sibling with text; if so, add it before the link.
105 Node parentWrapper = null;
cjhopman 2015/07/29 01:07:53 What's this parent wrapper? I don't recall that be
kuan 2015/07/30 16:47:00 i had it in the previous change, and attempted to
cjhopman 2015/08/04 21:58:41 But why the parent wrapper thing? Why not just wal
kuan 2015/08/04 22:38:37 what do u mean by "backwards/forwards in the tree"
kuan 2015/08/11 19:09:38 Done.
106 if (!checkForPrevSiblingWithText(link)) { // Link has no sibling.
107 // The link could be a child of a parent that is simply a wrappe r, i.e. with no
108 // extra text, in which case, we should be checking the siblings of the topmost
109 // parent wrapper.
110 parentWrapper = findParentWrapper(link, pageInfoAndText.mText.le ngth());
111 if (parentWrapper != null) checkForPrevSiblingWithText(parentWra pper);
112 }
113
114 // Add the link to the current group of adjacent numbers.
115 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
116
117 // Add all following siblings with numeric text, with or without lin ks.
// Following siblings are taken from the wrapper when one was found above, otherwise from
// the link itself; the boolean tells addFollowingSiblings() which case applies.
118 int numLinksAdded = 0;
119 if (parentWrapper == null)
120 numLinksAdded = addFollowingSiblings(link, false, baseAnchor);
121 else
122 numLinksAdded = addFollowingSiblings(parentWrapper, true, baseAn chor);
123
124 // Skip the current link and links already processed in addFollowing Siblings().
125 idx += 1 + numLinksAdded;
126 } // while there're links.
127
128 mAdjacentNumbersGroups.cleanup();
129
// Phase 1 (collection) is logged separately from phase 2 (detection).
130 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");
131
132 startTime = DomUtil.getTime();
133 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl);
134 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
135 return info;
136 }
137
138
139 /**
140 * @return a populated PageInfoAndText if given link is to be added to mAdja centNumbersGroups.
141 * Otherwise, returns null if link is to be ignored.
142 * "javascript:void" links with numeric text are considered valid links to b e added.
143 *
144 * @param link to process.
145 * @param baseAnchor created for the current document.
146 */
147 private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {
// Filters are applied in order: visibility, URL validity and host match, then numeric link
// text. Any failed filter yields null.
148 // Ignore invisible links.
149 int width = link.getOffsetWidth();
150 int height = link.getOffsetHeight();
151 if (width == 0 || height == 0 || !DomUtil.isVisible(link)) return null;
cjhopman 2015/07/29 01:07:53 It seems odd that invisible links are handled here
kuan 2015/07/30 16:47:00 invisible links need to be ignored. i do this her
152
153 String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor);
154 boolean isVoidLink = isVoidHref(linkHref);
155 ParsedUrl url = ParsedUrl.create(linkHref);
// Reject unparsable hrefs and links to a different host than the document; void links are
// exempt from the host comparison.
156 if (url == null || (!isVoidLink && !url.getHost().equalsIgnoreCase(mPars edUrl.getHost()))) {
157 return null;
158 }
159
// Clear the hash on the parsed URL before it is serialized with toString() below.
160 url.setHash("");
161
162 // Use javascript innerText (instead of javascript textContent) to only get visible text.
163 String linkText = DomUtil.getInnerText(link);
164 int number = linkTextToNumber(linkText);
165 if (!isPlainPageNumber(number)) return null;
166
// Void or disabled links keep their page number but are recorded with an empty URL.
167 if (isVoidLink || isDisabledLink(link)) return new PageInfoAndText(numbe r, "", linkText);
168
// Compiled lazily on first use. The pattern removes an optional trailing '/' and an optional
// '#...' suffix at the end of the URL string.
169 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?(#.*)?$");
170 return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);
171 }
172
173 /**
174 * Checks for previous sibling with word text. If the text contains digit(s ) as terms that
175 * form a valid page number, the sibling is added to the current group of ad jacent numbers.
176 * Otherwise, the current group of adjacent numbers is closed to end the cur rent adjacency, and
177 * a new group is started.
178 *
179 * @return true if given start node has at least 1 sibling, false otherwise.
180
181 * @param start node to start checking with.
182 */
183 private boolean checkForPrevSiblingWithText(Node start) {
cjhopman 2015/07/29 01:07:53 I'm having difficulty understanding both the way t
kuan 2015/07/30 16:47:00 i initially had the check for previous and next nu
// Walks backwards over start's previous siblings until one with word text is found, then
// passes that text to addNumberText(). The walk stops early (returning true) when siblings
// run out or when a sibling is, or contains, an <A> element.
184 Node node = start;
// prevNode is only used to recognize the very first step of the walk.
185 Node prevNode = null;
186 String text = "";
187 // Find the first previous sibling that has inner text with words.
188 do {
189 prevNode = node;
190 node = node.getPreviousSibling();
// First step found nothing: start has no previous sibling at all.
191 if (node == null && prevNode == start) return false;
// Siblings existed but were exhausted (or a document node was reached) without finding
// word text; nothing is added.
192 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return true;
193
194 if (node.getNodeType() == Node.TEXT_NODE) {
195 text = node.getNodeValue();
196 } else {
197 Element e = Element.as(node);
198 // Previous link siblings or children have already been processe d.
199 if (e.hasTagName("A") || e.getElementsByTagName("A").getLength() > 0) return true;
200 text = DomUtil.getInnerText(e);
201 }
// Keep walking while the sibling's text is empty or contains no words.
202 } while (text.isEmpty() || StringUtil.countWords(text) == 0);
203
204 addNumberText(text);
205 return true;
206 }
207
208 /**
209 * Adds all following siblings (links and non-links) with numeric text. If the text contains
210 * digit(s) as terms that form a valid page number, the sibling is added to the current group of
211 * adjacent numbers. Otherwise, the current group of adjacent numbers is cl osed to end the
212 * current adjacency, and a new group is started.
213 *
214 * @return number of links added.
215
216 * @param start node to start checking with.
217 * @param isParentWrapper true if given start node is a parent wrapper of a link.
218 * @param baseAnchor created for the current document.
219 */
220 private int addFollowingSiblings(Node start, boolean isParentWrapper,
221 AnchorElement baseAnchor) {
222 Node node = start;
223 Node prevNode = null;
224 String text = "";
225 int numLinksProcessed = 0;
226 // Find all following siblings, add them if their text is purely numeric .
227 while (true) {
228 prevNode = node;
229 node = node.getNextSibling();
230 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE) return numLinksProcessed;
231
232 boolean handled = false;
233 if (node.getNodeType() == Node.TEXT_NODE) {
234 text = node.getNodeValue();
235 } else {
236 Element e = Element.as(node);
237 if (e.hasTagName("A")) {
238 addValidLink(AnchorElement.as(e), baseAnchor);
239 numLinksProcessed++;
240 handled = true;
241 } else if (isParentWrapper) {
242 NodeList<Element> linkChildren = e.getElementsByTagName("A") ;
243 final int numChildren = linkChildren.getLength();
244 for (int i = 0; i < numChildren; i++) {
245 addValidLink(AnchorElement.as(linkChildren.getItem(i)), baseAnchor);
246 numLinksProcessed++;
247 }
248 if (numChildren > 0) handled = true;
249 }
250
251 text = handled ? "" : DomUtil.getInnerText(e);
252 }
253
254 if (!text.isEmpty() && StringUtil.countWords(text) > 0) addNumberTex t(text);
255 }
256 }
257
258 private static RegExp sTermsRegExp = null; // Match terms i.e. words.
259 private static RegExp sSurroundingDigitsRegExp = null; // Match term with o nly digits.
260
261 /**
262 * Add PageParamInfo.PageInfo for a non-link with numeric text.
cjhopman 2015/07/29 01:07:53 It looks like the text doesn't have to be strictly
kuan 2015/07/30 16:47:00 Done. renamed fn too.
263 */
264 private void addNumberText(String text) {
265 if (!StringUtil.containsDigit(text)) {
266 // The sibling does not contain valid number(s); if necessary, curre nt group of adjacent
267 // numbers should be closed, adding a new group if possible.
268 mAdjacentNumbersGroups.addGroup();
269 return;
270 }
271
272 if (sTermsRegExp == null) {
273 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\ \S*)", "gi");
274 } else {
275 sTermsRegExp.setLastIndex(0);
276 }
277 if (sSurroundingDigitsRegExp == null) {
278 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
279 }
280
281 // Extract terms from the text, differentiating between those that conta in only digits and
282 // those that contain non-digits.
283 while (true) {
284 MatchResult match = sTermsRegExp.exec(text);
285 if (match == null) break;
286 if (match.getGroupCount() <= 1) continue;
287
288 String term = match.getGroup(1);
289 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
290 int number = -1;
291 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
292 number = StringUtil.toNumber(termWithDigits.getGroup(1));
293 }
294 if (isPlainPageNumber(number)) {
295 // This sibling is a valid candidate of plain text page number, add it to last
296 // group of adjacent numbers.
297 mAdjacentNumbersGroups.addNumber(number, "");
298 } else {
299 // The sibling is not a valid number, so current group of adjace nt numbers
300 // should be closed, adding a new group if possible.
301 mAdjacentNumbersGroups.addGroup();
302 }
303 } // while there're matches
304 }
305
306 /**
307 * Add PageParamInfo.PageInfo for a link if its text is numeric.
308 */
309 private void addValidLink(AnchorElement link, AnchorElement baseAnchor) {
cjhopman 2015/07/29 01:07:53 probably rename this to addLinkIfValid() since it
kuan 2015/07/30 16:47:00 Done. this fn is created simply to prevent duplic
// The link is appended to the current group only when getPageInfoAndText() accepts it;
// a rejected link (null result) is silently ignored.
310 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
311 if (pageInfoAndText != null) mAdjacentNumbersGroups.addPageInfo(pageInfo AndText.mPageInfo);
312 }
313
314 /**
315 * @return the topmost parent of the given node that simply wraps the node, i.e. with no more
316 * inner text than that of given node.
317 */
318 private static Node findParentWrapper(Node node, int nodeTextLen) {
319 Node parent = node;
320 Node prevParent = null;
321 // While keeping track of each parent, once we find the first one that h as more text than
322 // given node, the previous parent would be what we want.
323 do {
324 prevParent = parent;
325 parent = parent.getParentNode();
326 } while (parent != null && DomUtil.getInnerText(parent).length() == node TextLen);
327
328 return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_N ODE ?
329 null : prevParent;
330 }
331
332 /**
333 * @return true if link is disabled i.e. not clickable because it has a text cursor.
334 */
335 private static boolean isDisabledLink(AnchorElement link) {
336 Style style = DomUtil.getComputedStyle(link);
337 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu rsor.TEXT;
338 }
339
340 /**
341 * @return true if href is "javascript:void(0)".
342 */
343 private static boolean isVoidHref(String href) {
344 return href.equals("javascript:void(0)");
345 }
346
347 private static int linkTextToNumber(String linkText) {
348 linkText = linkText.replaceAll("[()\\[\\]{}]", "");
349 linkText = linkText.trim(); // Remove leading and trailing whitespaces.
350 // Remove duplicate internal whitespaces.
351 linkText = linkText.replaceAll("\\s\\{2,\\}", " ");
352 return StringUtil.toNumber(linkText);
353 }
354
355 /**
356 * @return true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.
357 */
358 private static boolean isPlainPageNumber(int number) {
359 return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM;
360 }
361
362 }
OLDNEW
« no previous file with comments | « java/org/chromium/distiller/MonotonicPageInfosGroups.java ('k') | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698