Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(184)

Side by Side Diff: src/com/dom_distiller/client/PagingLinksFinder.java

Issue 661883003: add option for original domain (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: rm empty line Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 /* 5 /*
6 * Parts of this file are adapted from Readability. 6 * Parts of this file are adapted from Readability.
7 * 7 *
8 * Readability is Copyright (c) 2010 Src90 Inc 8 * Readability is Copyright (c) 2010 Src90 Inc
9 * and licenced under the Apache License, Version 2.0. 9 * and licenced under the Apache License, Version 2.0.
10 */ 10 */
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
43 private static final String NEXT_LINK_REGEX = "(next|weiter|continue|>([^\\| ]|$)|»([^\\|]|$))"; 43 private static final String NEXT_LINK_REGEX = "(next|weiter|continue|>([^\\| ]|$)|»([^\\|]|$))";
44 private static final String PREV_LINK_REGEX = "(prev|early|old|new|<|«)"; 44 private static final String PREV_LINK_REGEX = "(prev|early|old|new|<|«)";
45 private static final String POSITIVE_REGEX = "article|body|content|entry|hen try|main|page|pagination|post|text|blog|story"; 45 private static final String POSITIVE_REGEX = "article|body|content|entry|hen try|main|page|pagination|post|text|blog|story";
46 private static final String NEGATIVE_REGEX = "combx|comment|com-|contact|foo t|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sid ebar|sponsor|shopping|tags|tool|widget"; 46 private static final String NEGATIVE_REGEX = "combx|comment|com-|contact|foo t|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sid ebar|sponsor|shopping|tags|tool|widget";
47 private static final String EXTRANEOUS_REGEX = 47 private static final String EXTRANEOUS_REGEX =
48 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig n|single"; 48 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig n|single";
49 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-12-2". 49 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-12-2".
50 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123". 50 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123".
51 private static final String PAGE_NUMBER_REGEX = "((_|-)?p[a-z]*|(_|-))[0-9]{ 1,2}$"; 51 private static final String PAGE_NUMBER_REGEX = "((_|-)?p[a-z]*|(_|-))[0-9]{ 1,2}$";
52 52
53 public static DomDistillerProtos.PaginationInfo getPaginationInfo() { 53 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String ori ginal_domain) {
54 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn fo.create(); 54 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn fo.create();
55 String next = findNext(Document.get().getDocumentElement()); 55 String next = findNext(Document.get().getDocumentElement(), original_dom ain);
56 if (next != null) { 56 if (next != null) {
57 info.setNextPage(next); 57 info.setNextPage(next);
58 } 58 }
59 return info; 59 return info;
60 } 60 }
61 61
62 public static String findNext(Element root) {
63 /** 62 /**
63 * @param original_domain The original domain of the page being processed if it's a file://.
64 * @return The next page link for the document. 64 * @return The next page link for the document.
65 */ 65 */
66 return findPagingLink(root, PageLink.NEXT); 66 public static String findNext(Element root, String original_domain) {
67 return findPagingLink(root, original_domain, PageLink.NEXT);
67 } 68 }
68 69
69 public static String findPrevious(Element root) {
70 /** 70 /**
71 * @param original_domain The original domain of the page being processed if it's a file://.
71 * @return The previous page link for the document. 72 * @return The previous page link for the document.
72 */ 73 */
73 return findPagingLink(root, PageLink.PREV); 74 public static String findPrevious(Element root, String original_domain) {
75 return findPagingLink(root, original_domain, PageLink.PREV);
74 } 76 }
75 77
76 private static String findPagingLink(Element root, PageLink pageLink) { 78 private static String findPagingLink(Element root, String original_domain, P ageLink pageLink) {
77 // findPagingLink() is static, so clear mLinkDebugInfo before processing the links. 79 // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.
78 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { 80 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
79 mLinkDebugInfo.clear(); 81 mLinkDebugInfo.clear();
80 } 82 }
81 83
82 String baseUrl = findBaseUrl(); 84 String baseUrl = findBaseUrl(original_domain);
83 // Remove trailing '/' from window location href, because it'll be used to compare with 85 // Remove trailing '/' from window location href, because it'll be used to compare with
84 // other href's whose trailing '/' are also removed. 86 // other href's whose trailing '/' are also removed.
85 String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHr ef(), "\\/$", ""); 87 String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHr ef(), "\\/$", "");
86 NodeList<Element> allLinks = root.getElementsByTagName("A"); 88 NodeList<Element> allLinks = root.getElementsByTagName("A");
87 Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLin kObj>(); 89 Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLin kObj>();
88 90
89 // Loop through all links, looking for hints that they may be next- or previous- page links. 91 // Loop through all links, looking for hints that they may be next- or previous- page links.
90 // Things like having "page" in their textContent, className or id, or being a child of a 92 // Things like having "page" in their textContent, className or id, or being a child of a
91 // node with a page-y className or id. 93 // node with a page-y className or id.
92 // Also possible: levenshtein distance? longest common subsequence? 94 // Also possible: levenshtein distance? longest common subsequence?
(...skipping 28 matching lines...) Expand all
121 linkHref.equalsIgnoreCase(wndLocationHref) || 123 linkHref.equalsIgnoreCase(wndLocationHref) ||
122 (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(base Url))) { 124 (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(base Url))) {
123 appendDbgStrForLink(link, 125 appendDbgStrForLink(link,
124 "ignored: empty or same as current or base url" + baseUr l); 126 "ignored: empty or same as current or base url" + baseUr l);
125 continue; 127 continue;
126 } 128 }
127 129
128 // If it's on a different domain, skip it. 130 // If it's on a different domain, skip it.
129 String[] urlSlashes = StringUtil.split(linkHref, "\\/+"); 131 String[] urlSlashes = StringUtil.split(linkHref, "\\/+");
130 if (urlSlashes.length < 3 || // Expect at least the protocol, domai n, and path. 132 if (urlSlashes.length < 3 || // Expect at least the protocol, domai n, and path.
131 !Window.Location.getHost().equalsIgnoreCase(urlSlashes[1])) { 133 !getLocationHost(original_domain).equalsIgnoreCase(urlSlashe s[1])) {
132 appendDbgStrForLink(link, "ignored: different domain"); 134 appendDbgStrForLink(link, "ignored: different domain");
133 continue; 135 continue;
134 } 136 }
135 137
136 // Use javascript innerText (instead of javascript textContent) to only get visible 138 // Use javascript innerText (instead of javascript textContent) to only get visible
137 // text. 139 // text.
138 String linkText = DomUtil.getInnerText(link); 140 String linkText = DomUtil.getInnerText(link);
139 141
140 // If the linkText looks like it's not the next or previous page, skip it. 142 // If the linkText looks like it's not the next or previous page, skip it.
141 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length( ) > 25) { 143 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length( ) > 25) {
(...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after
305 topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pag ingHref); 307 topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pag ingHref);
306 } 308 }
307 309
308 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) { 310 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {
309 logDbgInfoToConsole(pageLink, pagingHref, allLinks); 311 logDbgInfoToConsole(pageLink, pagingHref, allLinks);
310 } 312 }
311 313
312 return pagingHref; 314 return pagingHref;
313 } 315 }
314 316
315 private static String findBaseUrl() { 317 private static String getLocationHost(String original_domain) {
318 return original_domain.isEmpty() ? Window.Location.getHost() : original_ domain;
319 }
320
321 private static String findBaseUrl(String original_domain) {
316 // This extracts relevant parts from the window location's path based on various heuristics 322 // This extracts relevant parts from the window location's path based on various heuristics
317 // to determine the path of the base URL of the document. This path is then appended to the 323 // to determine the path of the base URL of the document. This path is then appended to the
318 // window location protocol and host to form the base URL of the document. This base URL is 324 // window location protocol and host to form the base URL of the document. This base URL is
319 // then used as reference for comparison against an anchor's href to determine if the 325 // then used as reference for comparison against an anchor's href to determine if the
320 // anchor is a next or previous paging link. 326 // anchor is a next or previous paging link.
321 327
322 // First, from the window's location's path, extract the segments delimited by '/'. Then, 328 // First, from the window's location's path, extract the segments delimited by '/'. Then,
323 // because the later segments probably contain less relevant information for the base URL, 329 // because the later segments probably contain less relevant information for the base URL,
324 // reverse the segments for easier processing. 330 // reverse the segments for easier processing.
325 // Note: '?' is a special character in RegEx, so enclose it within [] to specify the actual 331 // Note: '?' is a special character in RegEx, so enclose it within [] to specify the actual
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
365 // If the first or second segment is shorter than 3 characters, and the first 371 // If the first or second segment is shorter than 3 characters, and the first
366 // segment was purely alphas, ignore it. 372 // segment was purely alphas, ignore it.
367 if (i < 2 && segment.length() < 3 && !StringUtil.match(urlSlashes[0] , "[a-z]")) { 373 if (i < 2 && segment.length() < 3 && !StringUtil.match(urlSlashes[0] , "[a-z]")) {
368 continue; 374 continue;
369 } 375 }
370 376
371 // If we got here, append the segment to cleanedSegments. 377 // If we got here, append the segment to cleanedSegments.
372 cleanedSegments.add(segment); 378 cleanedSegments.add(segment);
373 } // for all urlSlashes 379 } // for all urlSlashes
374 380
375 return Window.Location.getProtocol() + "//" + Window.Location.getHost() + "/" + 381 return Window.Location.getProtocol() + "//" + getLocationHost(original_d omain) + "/" +
376 reverseJoin(cleanedSegments, "/"); 382 reverseJoin(cleanedSegments, "/");
377 } 383 }
378 384
379 private static String reverseJoin(List<String> array, String delim) { 385 private static String reverseJoin(List<String> array, String delim) {
380 // As per http://stackoverflow.com/questions/5748044/gwt-string-concatenation-operator-vs-stringbuffer, 386 // As per http://stackoverflow.com/questions/5748044/gwt-string-concatenation-operator-vs-stringbuffer,
381 // + operator is faster for javascript than StringBuffer/StringBuilder. 387 // + operator is faster for javascript than StringBuffer/StringBuilder.
382 String joined = ""; 388 String joined = "";
383 for (int i = array.size() - 1; i >= 0; i--) { 389 for (int i = array.size() - 1; i >= 0; i--) {
384 joined += array.get(i); 390 joined += array.get(i);
385 if (i > 0) joined += delim; 391 if (i > 0) joined += delim;
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
449 } 455 }
450 456
451 private enum PageLink { 457 private enum PageLink {
452 NEXT, 458 NEXT,
453 PREV, 459 PREV,
454 } 460 }
455 461
456 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme nt, String>(); 462 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme nt, String>();
457 463
458 } 464 }
OLDNEW
« no previous file with comments | « src/com/dom_distiller/client/DomDistiller.java ('k') | test/com/dom_distiller/client/PagingLinksFinderTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698