OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 package org.chromium.distiller; | |
6 | |
7 import org.chromium.distiller.proto.DomDistillerProtos; | |
8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; | |
9 | |
10 import com.google.gwt.dom.client.AnchorElement; | |
11 import com.google.gwt.dom.client.Document; | |
12 import com.google.gwt.dom.client.Element; | |
13 import com.google.gwt.dom.client.Node; | |
14 import com.google.gwt.dom.client.NodeList; | |
15 import com.google.gwt.dom.client.Style; | |
16 import com.google.gwt.regexp.shared.MatchResult; | |
17 import com.google.gwt.regexp.shared.RegExp; | |
18 | |
19 import java.util.HashSet; | |
20 import java.util.Set; | |
21 | |
22 /** | |
23 * Background: | |
24 * The long article/news/forum thread/blog document may be partitioned into se veral partial pages | |
25 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The | |
26 * anchor text of those outlinks is numeric. | |
27 * | |
28 * This class parses the document to collect groups of adjacent plain text numbe rs and outlinks with | |
29 * digital anchor text. These are then passed to PageParameterParser which woul d spit out the | |
30 * pagination URLs if available. | |
31 */ | |
32 public class PageParameterParser { | |
33 // If the numeric value of a link's anchor text is greater than this number, we don't think it | |
34 // represents the page number of the link. | |
35 private static final int MAX_NUM_FOR_PAGE_PARAM = 100; | |
36 | |
37 /** | |
38 * Type of sibling to check. | |
39 */ | |
40 private static enum Sibling { | |
41 PREV, // Node.getPreviousSibling(). | |
42 NEXT, // Node.getNextSibling(). | |
43 } | |
44 | |
45 /** | |
46 * Entry point for PageParameterParser. | |
47 * Parses the document to collect outlinks with digital anchor text and nume ric text around | |
48 * them. These are then passed to PageParameterParser to detect pagination URLs. | |
49 * | |
50 * @return PageParamInfo (see PageParamInfo.java), always. If no page param eter is detected or | |
51 * determined to be best, its mType is PageParamInfo.Type.UNSET. | |
52 * | |
53 * @param originalUrl the original URL of the document to be parsed. | |
54 * @param timingInfo for tracking performance. | |
55 */ | |
56 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) { | |
57 PageParameterParser parser = new PageParameterParser(timingInfo); | |
58 return parser.parseDocument(Document.get().getDocumentElement(), origina lUrl); | |
59 } | |
60 | |
61 private final TimingInfo mTimingInfo; | |
62 private String mDocUrl = ""; | |
63 private ParsedUrl mParsedUrl = null; | |
64 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new Monotoni cPageInfosGroups(); | |
65 // Siblings that have been processed before and after each link. | |
66 private final Set<Node> mProcessedSiblings = new HashSet<Node>(); | |
67 | |
68 private static RegExp sHrefCleaner = null; | |
69 | |
70 private PageParameterParser(TimingInfo timingInfo) { | |
71 mTimingInfo = timingInfo; | |
72 } | |
73 | |
74 /** | |
75 * Acutually implements PageParameterParser.parse(), see above description f or parse(). | |
76 */ | |
77 private PageParamInfo parseDocument(Element root, String originalUrl) { | |
78 double startTime = DomUtil.getTime(); | |
79 | |
80 mDocUrl = originalUrl; | |
81 mParsedUrl = ParsedUrl.create(mDocUrl); | |
82 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL. | |
83 | |
84 mAdjacentNumbersGroups.addGroup(); | |
85 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( | |
86 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); | |
87 | |
88 NodeList<Element> allLinks = root.getElementsByTagName("A"); | |
cjhopman
2015/06/26 00:35:25
What do you think of this alternative approach:
a
kuan
2015/07/13 22:57:03
Done.
| |
89 for (int i = 0; i < allLinks.getLength(); i++) { | |
90 final AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | |
91 | |
92 String linkHref = PagingLinksFinder.resolveLinkHref(link, baseAnchor ); | |
93 ParsedUrl url = ParsedUrl.create(linkHref); | |
94 if (url == null || !url.getHost().equalsIgnoreCase(mParsedUrl.getHos t())) continue; | |
95 url.setHash(""); | |
96 | |
97 // Use javascript innerText (instead of javascript textContent) to o nly get visible | |
98 // text. | |
99 String linkText = DomUtil.getInnerText(link); | |
100 int number = linkTextToNumber(linkText); | |
101 if (isPlainPageNumber(number)) { | |
102 // Before we append the link to the current group of adjacent nu mbers, check if it's | |
103 // preceded by a sibling with text, which would determine if the current adjacency | |
104 // continues or ends. | |
105 Node parentWrapper = null; | |
106 if (!checkForSiblingWithText(link, Sibling.PREV)) { // Link has no sibling. | |
107 // The link could be a child of a parent that is simply a wr apper, i.e. with no | |
108 // extra text, in which case, we should be checking the sibl ings of the topmost | |
109 // parent wrapper. | |
110 parentWrapper = findParentWrapper(link, linkText.length()); | |
111 if (parentWrapper != null) checkForSiblingWithText(parentWra pper, Sibling.PREV); | |
112 } | |
113 | |
114 // This link is a good candidate for pagination, add it to the c urrent group of | |
115 // adjacent numbers. | |
116 if (isDisabledLink(link)) { | |
117 linkHref = ""; | |
118 } else { | |
119 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("/?( #.*)?$"); | |
120 linkHref = sHrefCleaner.replace(url.toString(), ""); | |
121 } | |
122 mAdjacentNumbersGroups.addNumber(number, linkHref); | |
123 | |
124 // Check if the link is followed by a sibling with text, which w ould determine if | |
125 // the current adjacency continues or ends. | |
126 checkForSiblingWithText(parentWrapper == null ? link : parentWra pper, Sibling.NEXT); | |
127 } else { | |
128 // This link is not a valid pagination link, so current group of adjacent numbers | |
129 // should be closed, adding a new group if possible. | |
130 mAdjacentNumbersGroups.addGroup(); | |
131 } | |
132 } // for all links | |
133 | |
134 mAdjacentNumbersGroups.cleanup(); | |
135 | |
136 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); | |
137 | |
138 startTime = DomUtil.getTime(); | |
139 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl); | |
140 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); | |
141 return info; | |
142 } | |
143 | |
144 private static RegExp sTermsRegExp = null; // Match terms i.e. words. | |
145 private static RegExp sSurroundingDigitsRegExp = null; // Match term with o nly digits. | |
146 | |
147 /** | |
148 * Checks for previous or next sibling with word text. If the text contains digit(s) as terms | |
149 * that form a valid page number, the sibling is added to the current group of adjacent | |
150 * numbers. Otherwise, the current group of adjacent numbers is closed to e nd the current | |
151 * adjacency, and a new group is started. | |
152 * | |
153 * @return true if given start node has at least 1 sibling, false otherwise. | |
154 | |
155 * @param start node to start checking with. | |
156 * @param sibling previous or next sibling. | |
157 */ | |
158 private boolean checkForSiblingWithText(Node start, Sibling sibling) { | |
159 Node node = start; | |
160 Node prevNode = null; | |
161 String text = ""; | |
162 // Find the first sibling that has inner text with words. | |
163 do { | |
164 prevNode = node; | |
165 node = sibling == Sibling.PREV ? node.getPreviousSibling() : node.ge tNextSibling(); | |
166 if (node == null && prevNode == start) return false; | |
167 if (node == null || node.getNodeType() == Node.DOCUMENT_NODE || | |
168 !mProcessedSiblings.add(node)) { // Node has been processed . | |
169 return true; | |
170 } | |
171 | |
172 if (node.getNodeType() == Node.TEXT_NODE) { | |
173 text = node.getNodeValue(); | |
174 } else { | |
175 Element e = Element.as(node); | |
176 | |
177 // Ignore non-void anchors, since we're iterating through them. | |
178 if (e.hasTagName("A") && shouldIgnoreLinkSibling(e)) return true ; | |
179 | |
180 // Ignore non-void link children, since we're iterating through them. | |
181 NodeList<Element> linkChildren = e.getElementsByTagName("A"); | |
182 for (int i = 0; i < linkChildren.getLength(); i++) { | |
183 if (shouldIgnoreLinkSibling(linkChildren.getItem(i))) return true; | |
184 } | |
185 | |
186 text = DomUtil.getInnerText(e); | |
187 } | |
188 } while (text.isEmpty() || StringUtil.countWords(text) == 0); | |
189 | |
190 if (!StringUtil.containsDigit(text)) { | |
191 // The sibling does not contain valid number(s), so current group of adjacent numbers | |
192 // should be closed, adding a new group if possible. | |
193 mAdjacentNumbersGroups.addGroup(); | |
194 return true; | |
195 } | |
196 | |
197 if (sTermsRegExp == null) { | |
198 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\ \S*)", "gi"); | |
199 } else { | |
200 sTermsRegExp.setLastIndex(0); | |
201 } | |
202 if (sSurroundingDigitsRegExp == null) { | |
203 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i"); | |
204 } | |
205 | |
206 // Extract terms from the text, differentiating between those that conta in only digits and | |
207 // those that contain non-digits. | |
208 while (true) { | |
209 MatchResult match = sTermsRegExp.exec(text); | |
210 if (match == null) break; | |
211 if (match.getGroupCount() <= 1) continue; | |
212 | |
213 String term = match.getGroup(1); | |
214 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); | |
215 int number = -1; | |
216 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { | |
217 number = StringUtil.toNumber(termWithDigits.getGroup(1)); | |
218 } | |
219 if (isPlainPageNumber(number)) { | |
220 // This sibling is a valid candidate of plain text page number, add it to last | |
221 // group of adjacent numbers. | |
222 mAdjacentNumbersGroups.addNumber(number, ""); | |
223 } else { | |
224 // The sibling is not a valid number, so current group of adjace nt numbers | |
225 // should be closed, adding a new group if possible. | |
226 mAdjacentNumbersGroups.addGroup(); | |
227 } | |
228 } // while there're matches | |
229 | |
230 return true; | |
231 } | |
232 | |
233 /** | |
234 * @return the topmost parent of the given node that simply wraps the node, i.e. with no more | |
235 * inner text than that of given node. | |
236 */ | |
237 private static Node findParentWrapper(Node node, int nodeTextLen) { | |
238 Node parent = node; | |
239 Node prevParent = null; | |
240 // While keeping track of each parent, once we find the first one that h as more text than | |
241 // given node, the previous parent would be what we want. | |
242 do { | |
243 prevParent = parent; | |
244 parent = parent.getParentNode(); | |
245 } while (parent != null && DomUtil.getInnerText(parent).length() == node TextLen); | |
246 | |
247 return prevParent == node || prevParent.getNodeType() == Node.DOCUMENT_N ODE ? | |
248 null : prevParent; | |
249 } | |
250 | |
251 /** | |
252 * @return true if link is disabled i.e. not clickable because it has a text cursor. | |
253 */ | |
254 private static boolean isDisabledLink(AnchorElement link) { | |
255 Style style = DomUtil.getComputedStyle(link); | |
256 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu rsor.TEXT; | |
257 } | |
258 | |
259 /** | |
260 * @return true if link should be ignored when checking for sibling. | |
261 * All non-void links will be processed, and hence could be ignored when che cking for sibling. | |
262 * Void links are rejected at the beginning because their hosts are differen t from | |
263 * originalUrl's, so they should be considered when checking for sibling. | |
264 */ | |
265 private static boolean shouldIgnoreLinkSibling(Element link) { | |
266 return !AnchorElement.as(link).getHref().equals("javascript:void(0)"); | |
267 } | |
268 | |
269 private static int linkTextToNumber(String linkText) { | |
270 linkText = linkText.replaceAll("[()\\[\\]{}]", ""); | |
271 linkText = linkText.trim(); // Remove leading and trailing whitespaces. | |
272 // Remove duplicate internal whitespaces. | |
273 linkText = linkText.replaceAll("\\s\\{2,\\}", " "); | |
274 return StringUtil.toNumber(linkText); | |
275 } | |
276 | |
277 /** | |
278 * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM. | |
279 */ | |
280 private static boolean isPlainPageNumber(int number) { | |
281 return number >= 0 && number < MAX_NUM_FOR_PAGE_PARAM; | |
282 } | |
283 | |
284 } | |
OLD | NEW |