Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(379)

Side by Side Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import org.chromium.distiller.proto.DomDistillerProtos;
8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
9
10 import com.google.gwt.dom.client.AnchorElement;
11 import com.google.gwt.dom.client.Document;
12 import com.google.gwt.dom.client.Element;
13 import com.google.gwt.dom.client.Node;
14 import com.google.gwt.dom.client.NodeList;
15 import com.google.gwt.dom.client.Style;
16 import com.google.gwt.regexp.shared.MatchResult;
17 import com.google.gwt.regexp.shared.RegExp;
18
19 import java.util.HashSet;
20 import java.util.Set;
21
22 /**
23 * Background:
24 * The long article/news/forum thread/blog document may be partitioned into se veral partial pages
25 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The
26 * anchor text of those outlinks is numeric.
27 *
28 * This class parses the document to collect groups of adjacent plain text numbe rs and outlinks with
29 * digital anchor text. These are then passed to PageParameterParser which woul d spit out the
30 * pagination URLs if available.
31 */
32 public class PageParameterParser {
33 // If the numeric value of a link's anchor text is greater than this number, we don't think it
34 // represents the page number of the link.
35 private static final int MAX_NUM_FOR_PAGE_PARAM = 100;
36
37 /**
38 * Type of sibling to check.
39 */
40 private static enum Sibling {
41 PREV, // Node.getPreviousSibling().
42 NEXT, // Node.getNextSibling().
43 }
44
45 /**
46 * Entry point for PageParameterParser.
47 * Parses the document to collect outlinks with digital anchor text and nume ric text around
48 * them. These are then passed to PageParameterParser to detect pagination URLs.
49 *
50 * @return PageParamInfo (see PageParamInfo.java), always. If no page param eter is detected or
51 * determined to be best, its mType is PageParamInfo.Type.UNSET.
52 *
53 * @param originalUrl the original URL of the document to be parsed.
54 * @param timingInfo for tracking performance.
55 */
56 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
57 PageParameterParser parser = new PageParameterParser(timingInfo);
58 return parser.parseDocument(Document.get().getDocumentElement(), origina lUrl);
59 }
60
61 private final TimingInfo mTimingInfo;
62 private String mDocUrl = "";
63 private ParsedUrl mParsedUrl = null;
64 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new Monotoni cPageInfosGroups();
65 // Siblings that have been processed before and after each link.
66 private final Set<Node> mProcessedSiblings = new HashSet<Node>();
67
68 private static RegExp sHrefCleaner = null;
69
70 private PageParameterParser(TimingInfo timingInfo) {
71 mTimingInfo = timingInfo;
72 }
73
74 /**
75 * Acutually implements PageParameterParser.parse(), see above description f or parse().
76 */
77 private PageParamInfo parseDocument(Element root, String originalUrl) {
78 double startTime = DomUtil.getTime();
79
80 mDocUrl = originalUrl;
81 mParsedUrl = ParsedUrl.create(mDocUrl);
82 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.
83
84 mAdjacentNumbersGroups.addGroup();
85 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(
86 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));
87
88 NodeList<Element> allLinks = root.getElementsByTagName("A");
cjhopman 2015/06/26 00:35:25 What do you think of this alternative approach: a
kuan 2015/07/13 22:57:03 Done.
89 for (int i = 0; i < allLinks.getLength(); i++) {
90 final AnchorElement link = AnchorElement.as(allLinks.getItem(i));
91
92 String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor );
93 ParsedUrl url = ParsedUrl.create(linkHref);
94 if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHos t())) continue;
95 url.setHash("");
96
97 // Use javascript innerText (instead of javascript textContent) to o nly get visible
98 // text.
99 String linkText = DomUtil.getInnerText(link);
100 int number = linkTextToNumber(linkText);
101 if (isPlainPageNumber(number)) {
102 // Before we append the link to the current group of adjacent nu mbers, check if it's
103 // preceded by a sibling with text, which would determine if the current adjacency
104 // continues or ends.
105 Node parentWrapper = null;
106 if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling.
107 // The link could be a child of a parent that is simply a wr apper, i.e. with no
108 // extra text, in which case, we should be checking the sibl ings of the topmost
109 // parent wrapper.
110 parentWrapper = findParentWrapper(link, linkText.length());
111 if (parentWrapper != null) checkForSiblingWithText(parentWra pper, Sibling.PREV);
112 }
113
114 // This link is a good candidate for pagination, add it to the c urrent group of
115 // adjacent numbers.
116 if (isDisabledLink(link)) {
117 linkHref = "";
118 } else {
119 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?( #.*)?$");
120 linkHref = sHrefCleaner.replace(url.toString(), "");
121 }
122 mAdjacentNumbersGroups.addNumber(number, linkHref);
123
124 // Check if the link is followed by a sibling with text, which w ould determine if
125 // the current adjacency continues or ends.
126 checkForSiblingWithText(parentWrapper == null ? link : parentWra pper, Sibling.NEXT);
127 } else {
128 // This link is not a valid pagination link, so current group of adjacent numbers
129 // should be closed, adding a new group if possible.
130 mAdjacentNumbersGroups.addGroup();
131 }
132 } // for all links
133
134 mAdjacentNumbersGroups.cleanup();
135
136 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");
137
138 startTime = DomUtil.getTime();
139 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl);
140 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
141 return info;
142 }
143
144 private static RegExp sTermsRegExp = null; // Match terms i.e. words.
145 private static RegExp sSurroundingDigitsRegExp = null; // Match term with o nly digits.
146
147 /**
148 * Checks for previous or next sibling with word text. If the text contains digit(s) as terms
149 * that form a valid page number, the sibling is added to the current group of adjacent
150 * numbers. Otherwise, the current group of adjacent numbers is closed to e nd the current
151 * adjacency, and a new group is started.
152 *
153 * @return true if given start node has at least 1 sibling, false otherwise.
154
155 * @param start node to start checking with.
156 * @param sibling previous or next sibling.
157 */
158 private boolean checkForSiblingWithText(Node start, Sibling sibling) {
159 Node node = start;
160 Node prevNode = null;
161 String text = "";
162 // Find the first sibling that has inner text with words.
163 do {
164 prevNode = node;
165 node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.ge tNextSibling();
166 if (node == null && prevNode == start) return false;
167 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE ||
168 !mProcessedSiblings.add(node)) { // Node has been processed .
169 return true;
170 }
171
172 if (node.getNodeType() == Node.TEXT_NODE) {
173 text = node.getNodeValue();
174 } else {
175 Element e = Element.as(node);
176
177 // Ignore non-void anchors, since we're iterating through them.
178 if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true ;
179
180 // Ignore non-void link children, since we're iterating through them.
181 NodeList<Element> linkChildren = e.getElementsByTagName("A");
182 for (int i = 0; i < linkChildren.getLength(); i++) {
183 if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true;
184 }
185
186 text = DomUtil.getInnerText(e);
187 }
188 } while (text.isEmpty() || StringUtil.countWords(text) == 0);
189
190 if (!StringUtil.containsDigit(text)) {
191 // The sibling does not contain valid number(s), so current group of adjacent numbers
192 // should be closed, adding a new group if possible.
193 mAdjacentNumbersGroups.addGroup();
194 return true;
195 }
196
197 if (sTermsRegExp == null) {
198 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\ \S*)", "gi");
199 } else {
200 sTermsRegExp.setLastIndex(0);
201 }
202 if (sSurroundingDigitsRegExp == null) {
203 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
204 }
205
206 // Extract terms from the text, differentiating between those that conta in only digits and
207 // those that contain non-digits.
208 while (true) {
209 MatchResult match = sTermsRegExp.exec(text);
210 if (match == null) break;
211 if (match.getGroupCount() <= 1) continue;
212
213 String term = match.getGroup(1);
214 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
215 int number = -1;
216 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
217 number = StringUtil.toNumber(termWithDigits.getGroup(1));
218 }
219 if (isPlainPageNumber(number)) {
220 // This sibling is a valid candidate of plain text page number, add it to last
221 // group of adjacent numbers.
222 mAdjacentNumbersGroups.addNumber(number, "");
223 } else {
224 // The sibling is not a valid number, so current group of adjace nt numbers
225 // should be closed, adding a new group if possible.
226 mAdjacentNumbersGroups.addGroup();
227 }
228 } // while there're matches
229
230 return true;
231 }
232
233 /**
234 * @return the topmost parent of the given node that simply wraps the node, i.e. with no more
235 * inner text than that of given node.
236 */
237 private static Node findParentWrapper(Node node, int nodeTextLen) {
238 Node parent = node;
239 Node prevParent = null;
240 // While keeping track of each parent, once we find the first one that h as more text than
241 // given node, the previous parent would be what we want.
242 do {
243 prevParent = parent;
244 parent = parent.getParentNode();
245 } while (parent != null && DomUtil.getInnerText(parent).length() == node TextLen);
246
247 return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_N ODE ?
248 null : prevParent;
249 }
250
251 /**
252 * @return true if link is disabled i.e. not clickable because it has a text cursor.
253 */
254 private static boolean isDisabledLink(AnchorElement link) {
255 Style style = DomUtil.getComputedStyle(link);
256 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu rsor.TEXT;
257 }
258
259 /**
260 * @return true if link should be ignored when checking for sibling.
261 * All non-void links will be processed, and hence could be ignored when che cking for sibling.
262 * Void links are rejected at the beginning because their hosts are differen t from
263 * originalUrl's, so they should be considered when checking for sibling.
264 */
265 private static boolean shouldIgnoreLinkSibling(Element link) {
266 return !AnchorElement.as(link).getHref().equals("javascript:void(0)");
267 }
268
269 private static int linkTextToNumber(String linkText) {
270 linkText = linkText.replaceAll("[()\\[\\]{}]", "");
271 linkText = linkText.trim(); // Remove leading and trailing whitespaces.
272 // Remove duplicate internal whitespaces.
273 linkText = linkText.replaceAll("\\s\\{2,\\}", " ");
274 return StringUtil.toNumber(linkText);
275 }
276
277 /**
278 * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.
279 */
280 private static boolean isPlainPageNumber(int number) {
281 return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM;
282 }
283
284 }
OLDNEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698