OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 /* | 5 /* |
6 * Parts of this file are adapted from Readability. | 6 * Parts of this file are adapted from Readability. |
7 * | 7 * |
8 * Readability is Copyright (c) 2010 Src90 Inc | 8 * Readability is Copyright (c) 2010 Src90 Inc |
9 * and licenced under the Apache License, Version 2.0. | 9 * and licenced under the Apache License, Version 2.0. |
10 */ | 10 */ |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
43 private static final String NEXT_LINK_REGEX = "(next|weiter|continue|>([^\\|
]|$)|»([^\\|]|$))"; | 43 private static final String NEXT_LINK_REGEX = "(next|weiter|continue|>([^\\|
]|$)|»([^\\|]|$))"; |
44 private static final String PREV_LINK_REGEX = "(prev|early|old|new|<|«)"; | 44 private static final String PREV_LINK_REGEX = "(prev|early|old|new|<|«)"; |
45 private static final String POSITIVE_REGEX = "article|body|content|entry|hen
try|main|page|pagination|post|text|blog|story"; | 45 private static final String POSITIVE_REGEX = "article|body|content|entry|hen
try|main|page|pagination|post|text|blog|story"; |
46 private static final String NEGATIVE_REGEX = "combx|comment|com-|contact|foo
t|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sid
ebar|sponsor|shopping|tags|tool|widget"; | 46 private static final String NEGATIVE_REGEX = "combx|comment|com-|contact|foo
t|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sid
ebar|sponsor|shopping|tags|tool|widget"; |
47 private static final String EXTRANEOUS_REGEX = | 47 private static final String EXTRANEOUS_REGEX = |
48 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig
n|single"; | 48 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig
n|single"; |
49 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-1
2-2". | 49 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-1
2-2". |
50 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123". | 50 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123". |
51 private static final String PAGE_NUMBER_REGEX = "((_|-)?p[a-z]*|(_|-))[0-9]{
1,2}$"; | 51 private static final String PAGE_NUMBER_REGEX = "((_|-)?p[a-z]*|(_|-))[0-9]{
1,2}$"; |
52 | 52 |
53 public static DomDistillerProtos.PaginationInfo getPaginationInfo() { | 53 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String ori
ginal_domain) { |
54 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn
fo.create(); | 54 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn
fo.create(); |
55 String next = findNext(Document.get().getDocumentElement()); | 55 String next = findNext(Document.get().getDocumentElement(), original_dom
ain); |
56 if (next != null) { | 56 if (next != null) { |
57 info.setNextPage(next); | 57 info.setNextPage(next); |
58 } | 58 } |
59 return info; | 59 return info; |
60 } | 60 } |
61 | 61 |
62 public static String findNext(Element root) { | |
63 /** | 62 /** |
| 63 * @param original_domain The original domain of the page being processed; when non-empty
(e.g. the page is loaded from a file:// URL), it is used in place of the window location's host. |
64 * @return The next page link for the document. | 64 * @return The next page link for the document. |
65 */ | 65 */ |
66 return findPagingLink(root, PageLink.NEXT); | 66 public static String findNext(Element root, String original_domain) { |
| 67 return findPagingLink(root, original_domain, PageLink.NEXT); |
67 } | 68 } |
68 | 69 |
69 public static String findPrevious(Element root) { | |
70 /** | 70 /** |
| 71 * @param original_domain The original domain of the page being processed; when non-empty
(e.g. the page is loaded from a file:// URL), it is used in place of the window location's host. |
71 * @return The previous page link for the document. | 72 * @return The previous page link for the document. |
72 */ | 73 */ |
73 return findPagingLink(root, PageLink.PREV); | 74 public static String findPrevious(Element root, String original_domain) { |
| 75 return findPagingLink(root, original_domain, PageLink.PREV); |
74 } | 76 } |
75 | 77 |
76 private static String findPagingLink(Element root, PageLink pageLink) { | 78 private static String findPagingLink(Element root, String original_domain, P
ageLink pageLink) { |
77 // findPagingLink() is static, so clear mLinkDebugInfo before processing
the links. | 79 // findPagingLink() is static, so clear mLinkDebugInfo before processing
the links. |
78 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { | 80 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { |
79 mLinkDebugInfo.clear(); | 81 mLinkDebugInfo.clear(); |
80 } | 82 } |
81 | 83 |
82 String baseUrl = findBaseUrl(); | 84 String baseUrl = findBaseUrl(original_domain); |
83 // Remove trailing '/' from window location href, because it'll be used
to compare with | 85 // Remove trailing '/' from window location href, because it'll be used
to compare with |
84 // other href's whose trailing '/' are also removed. | 86 // other href's whose trailing '/' are also removed. |
85 String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHr
ef(), "\\/$", ""); | 87 String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHr
ef(), "\\/$", ""); |
86 NodeList<Element> allLinks = root.getElementsByTagName("A"); | 88 NodeList<Element> allLinks = root.getElementsByTagName("A"); |
87 Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLin
kObj>(); | 89 Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLin
kObj>(); |
88 | 90 |
89 // Loop through all links, looking for hints that they may be next- or p
revious- page links. | 91 // Loop through all links, looking for hints that they may be next- or p
revious- page links. |
90 // Things like having "page" in their textContent, className or id, or b
eing a child of a | 92 // Things like having "page" in their textContent, className or id, or b
eing a child of a |
91 // node with a page-y className or id. | 93 // node with a page-y className or id. |
92 // Also possible: levenshtein distance? longest common subsequence? | 94 // Also possible: levenshtein distance? longest common subsequence? |
(...skipping 28 matching lines...) Expand all Loading... |
121 linkHref.equalsIgnoreCase(wndLocationHref) || | 123 linkHref.equalsIgnoreCase(wndLocationHref) || |
122 (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(base
Url))) { | 124 (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(base
Url))) { |
123 appendDbgStrForLink(link, | 125 appendDbgStrForLink(link, |
124 "ignored: empty or same as current or base url" + baseUr
l); | 126 "ignored: empty or same as current or base url" + baseUr
l); |
125 continue; | 127 continue; |
126 } | 128 } |
127 | 129 |
128 // If it's on a different domain, skip it. | 130 // If it's on a different domain, skip it. |
129 String[] urlSlashes = StringUtil.split(linkHref, "\\/+"); | 131 String[] urlSlashes = StringUtil.split(linkHref, "\\/+"); |
130 if (urlSlashes.length < 3 || // Expect at least the protocol, domai
n, and path. | 132 if (urlSlashes.length < 3 || // Expect at least the protocol, domai
n, and path. |
131 !Window.Location.getHost().equalsIgnoreCase(urlSlashes[1]))
{ | 133 !getLocationHost(original_domain).equalsIgnoreCase(urlSlashe
s[1])) { |
132 appendDbgStrForLink(link, "ignored: different domain"); | 134 appendDbgStrForLink(link, "ignored: different domain"); |
133 continue; | 135 continue; |
134 } | 136 } |
135 | 137 |
136 // Use javascript innerText (instead of javascript textContent) to o
nly get visible | 138 // Use javascript innerText (instead of javascript textContent) to o
nly get visible |
137 // text. | 139 // text. |
138 String linkText = DomUtil.getInnerText(link); | 140 String linkText = DomUtil.getInnerText(link); |
139 | 141 |
140 // If the linkText looks like it's not the next or previous page, sk
ip it. | 142 // If the linkText looks like it's not the next or previous page, sk
ip it. |
141 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length(
) > 25) { | 143 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length(
) > 25) { |
(...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
305 topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pag
ingHref); | 307 topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pag
ingHref); |
306 } | 308 } |
307 | 309 |
308 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { | 310 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { |
309 logDbgInfoToConsole(pageLink, pagingHref, allLinks); | 311 logDbgInfoToConsole(pageLink, pagingHref, allLinks); |
310 } | 312 } |
311 | 313 |
312 return pagingHref; | 314 return pagingHref; |
313 } | 315 } |
314 | 316 |
315 private static String findBaseUrl() { | 317 private static String getLocationHost(String original_domain) { |
| 318 return original_domain.isEmpty() ? Window.Location.getHost() : original_
domain; |
| 319 } |
| 320 |
| 321 private static String findBaseUrl(String original_domain) { |
316 // This extracts relevant parts from the window location's path based on
various heuristics | 322 // This extracts relevant parts from the window location's path based on
various heuristics |
317 // to determine the path of the base URL of the document. This path is
then appended to the | 323 // to determine the path of the base URL of the document. This path is
then appended to the |
318 // window location protocol and host to form the base URL of the documen
t. This base URL is | 324 // window location protocol and host to form the base URL of the documen
t. This base URL is |
319 // then used as reference for comparison against an anchor's href to
determine if the | 325 // then used as reference for comparison against an anchor's href to
determine if the |
320 // anchor is a next or previous paging link. | 326 // anchor is a next or previous paging link. |
321 | 327 |
322 // First, from the window's location's path, extract the segments delimi
ted by '/'. Then, | 328 // First, from the window's location's path, extract the segments delimi
ted by '/'. Then, |
323 // because the later segments probably contain less relevant information
for the base URL, | 329 // because the later segments probably contain less relevant information
for the base URL, |
324 // reverse the segments for easier processing. | 330 // reverse the segments for easier processing. |
325 // Note: '?' is a special character in RegEx, so enclose it within [] to
specify the actual | 331 // Note: '?' is a special character in RegEx, so enclose it within [] to
specify the actual |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
365 // If the first or second segment is shorter than 3 characters, and
the first | 371 // If the first or second segment is shorter than 3 characters, and
the first |
366 // segment was purely alphas, ignore it. | 372 // segment was purely alphas, ignore it. |
367 if (i < 2 && segment.length() < 3 && !StringUtil.match(urlSlashes[0]
, "[a-z]")) { | 373 if (i < 2 && segment.length() < 3 && !StringUtil.match(urlSlashes[0]
, "[a-z]")) { |
368 continue; | 374 continue; |
369 } | 375 } |
370 | 376 |
371 // If we got here, append the segment to cleanedSegments. | 377 // If we got here, append the segment to cleanedSegments. |
372 cleanedSegments.add(segment); | 378 cleanedSegments.add(segment); |
373 } // for all urlSlashes | 379 } // for all urlSlashes |
374 | 380 |
375 return Window.Location.getProtocol() + "//" + Window.Location.getHost()
+ "/" + | 381 return Window.Location.getProtocol() + "//" + getLocationHost(original_d
omain) + "/" + |
376 reverseJoin(cleanedSegments, "/"); | 382 reverseJoin(cleanedSegments, "/"); |
377 } | 383 } |
378 | 384 |
379 private static String reverseJoin(List<String> array, String delim) { | 385 private static String reverseJoin(List<String> array, String delim) { |
380 // As per http://stackoverflow.com/questions/5748044/gwt-string-concaten
ation-operator-vs-stringbuffer, | 386 // As per http://stackoverflow.com/questions/5748044/gwt-string-concaten
ation-operator-vs-stringbuffer, |
381 // + operator is faster for javascript than StringBuffer/StringBuilder. | 387 // + operator is faster for javascript than StringBuffer/StringBuilder. |
382 String joined = ""; | 388 String joined = ""; |
383 for (int i = array.size() - 1; i >= 0; i--) { | 389 for (int i = array.size() - 1; i >= 0; i--) { |
384 joined += array.get(i); | 390 joined += array.get(i); |
385 if (i > 0) joined += delim; | 391 if (i > 0) joined += delim; |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
449 } | 455 } |
450 | 456 |
451 private enum PageLink { | 457 private enum PageLink { |
452 NEXT, | 458 NEXT, |
453 PREV, | 459 PREV, |
454 } | 460 } |
455 | 461 |
456 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme
nt, String>(); | 462 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme
nt, String>(); |
457 | 463 |
458 } | 464 } |
OLD | NEW |