src/com/dom_distiller/client/PagingLinksFinder.java - Issue 661883003: add option for original domain

Side by Side Diff: src/com/dom_distiller/client/PagingLinksFinder.java

Issue 661883003: add option for original domain (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: rm empty line Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 /*	5 /*

6 * Parts of this file are adapted from Readability.	6 * Parts of this file are adapted from Readability.

7 *	7 *

8 * Readability is Copyright (c) 2010 Src90 Inc	8 * Readability is Copyright (c) 2010 Src90 Inc

9 * and licenced under the Apache License, Version 2.0.	9 * and licenced under the Apache License, Version 2.0.

10 */	10 */

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
43 private static final String NEXT_LINK_REGEX = "(next\|weiter\|continue\|>([^\\\|]\|$)\|»([^\\\|]\|$))";	43 private static final String NEXT_LINK_REGEX = "(next\|weiter\|continue\|>([^\\\|]\|$)\|»([^\\\|]\|$))";

44 private static final String PREV_LINK_REGEX = "(prev\|early\|old\|new\|<\|«)";	44 private static final String PREV_LINK_REGEX = "(prev\|early\|old\|new\|<\|«)";

45 private static final String POSITIVE_REGEX = "article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story";	45 private static final String POSITIVE_REGEX = "article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story";

46 private static final String NEGATIVE_REGEX = "combx\|comment\|com-\|contact\|foot\|footer\|footnote\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|widget";	46 private static final String NEGATIVE_REGEX = "combx\|comment\|com-\|contact\|foot\|footer\|footnote\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|widget";

47 private static final String EXTRANEOUS_REGEX =	47 private static final String EXTRANEOUS_REGEX =

48 "print\|archive\|comment\|discuss\|e[\\-]?mail\|share\|reply\|all\|login\|sign\|single";	48 "print\|archive\|comment\|discuss\|e[\\-]?mail\|share\|reply\|all\|login\|sign\|single";

49 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-12-2".	49 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-12-2".

50 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123".	50 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123".

51 private static final String PAGE_NUMBER_REGEX = "((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$";	51 private static final String PAGE_NUMBER_REGEX = "((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$";

52	52

53 public static DomDistillerProtos.PaginationInfo getPaginationInfo() {	53 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String original_domain) {

54 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationInfo.create();	54 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationInfo.create();

55 String next = findNext(Document.get().getDocumentElement());	55 String next = findNext(Document.get().getDocumentElement(), original_domain);

56 if (next != null) {	56 if (next != null) {

57 info.setNextPage(next);	57 info.setNextPage(next);

58 }	58 }

59 return info;	59 return info;

60 }	60 }

61	61

62 public static String findNext(Element root) {

63 /**	62 /**

	63 * @param original_domain The original domain of the page being processed if it's a file://.

64 * @return The next page link for the document.	64 * @return The next page link for the document.

65 */	65 */

66 return findPagingLink(root, PageLink.NEXT);	66 public static String findNext(Element root, String original_domain) {

	67 return findPagingLink(root, original_domain, PageLink.NEXT);

67 }	68 }

68	69

69 public static String findPrevious(Element root) {

70 /**	70 /**

	71 * @param original_domain The original domain of the page being processed if it's a file://.

71 * @return The previous page link for the document.	72 * @return The previous page link for the document.

72 */	73 */

73 return findPagingLink(root, PageLink.PREV);	74 public static String findPrevious(Element root, String original_domain) {

	75 return findPagingLink(root, original_domain, PageLink.PREV);

74 }	76 }

75	77

76 private static String findPagingLink(Element root, PageLink pageLink) {	78 private static String findPagingLink(Element root, String original_domain, PageLink pageLink) {

77 // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.	79 // findPagingLink() is static, so clear mLinkDebugInfo before processing the links.

78 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {	80 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {

79 mLinkDebugInfo.clear();	81 mLinkDebugInfo.clear();

80 }	82 }

81	83

82 String baseUrl = findBaseUrl();	84 String baseUrl = findBaseUrl(original_domain);

83 // Remove trailing '/' from window location href, because it'll be used to compare with	85 // Remove trailing '/' from window location href, because it'll be used to compare with

84 // other href's whose trailing '/' are also removed.	86 // other href's whose trailing '/' are also removed.

85 String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHref(), "\\/$", "");	87 String wndLocationHref = StringUtil.findAndReplace(Window.Location.getHref(), "\\/$", "");

86 NodeList<Element> allLinks = root.getElementsByTagName("A");	88 NodeList<Element> allLinks = root.getElementsByTagName("A");

87 Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLinkObj>();	89 Map<String, PagingLinkObj> possiblePages = new HashMap<String, PagingLinkObj>();

88	90

89 // Loop through all links, looking for hints that they may be next- or previous- page links.	91 // Loop through all links, looking for hints that they may be next- or previous- page links.

90 // Things like having "page" in their textContent, className or id, or being a child of a	92 // Things like having "page" in their textContent, className or id, or being a child of a

91 // node with a page-y className or id.	93 // node with a page-y className or id.

92 // Also possible: levenshtein distance? longest common subsequence?	94 // Also possible: levenshtein distance? longest common subsequence?

(...skipping 28 matching lines...) Expand all Loading...
121 linkHref.equalsIgnoreCase(wndLocationHref) \|\|	123 linkHref.equalsIgnoreCase(wndLocationHref) \|\|

122 (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(baseUrl))) {	124 (pageLink == PageLink.NEXT && linkHref.equalsIgnoreCase(baseUrl))) {

123 appendDbgStrForLink(link,	125 appendDbgStrForLink(link,

124 "ignored: empty or same as current or base url" + baseUrl);	126 "ignored: empty or same as current or base url" + baseUrl);

125 continue;	127 continue;

126 }	128 }

127	129

128 // If it's on a different domain, skip it.	130 // If it's on a different domain, skip it.

129 String[] urlSlashes = StringUtil.split(linkHref, "\\/+");	131 String[] urlSlashes = StringUtil.split(linkHref, "\\/+");

130 if (urlSlashes.length < 3 \|\| // Expect at least the protocol, domain, and path.	132 if (urlSlashes.length < 3 \|\| // Expect at least the protocol, domain, and path.

131 !Window.Location.getHost().equalsIgnoreCase(urlSlashes[1])) {	133 !getLocationHost(original_domain).equalsIgnoreCase(urlSlashes[1])) {

132 appendDbgStrForLink(link, "ignored: different domain");	134 appendDbgStrForLink(link, "ignored: different domain");

133 continue;	135 continue;

134 }	136 }

135	137

136 // Use javascript innerText (instead of javascript textContent) to only get visible	138 // Use javascript innerText (instead of javascript textContent) to only get visible

137 // text.	139 // text.

138 String linkText = DomUtil.getInnerText(link);	140 String linkText = DomUtil.getInnerText(link);

139	141

140 // If the linkText looks like it's not the next or previous page, skip it.	142 // If the linkText looks like it's not the next or previous page, skip it.

141 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) \|\| linkText.length() > 25) {	143 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) \|\| linkText.length() > 25) {

(...skipping 163 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
305 topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref);	307 topPage.mScore + ", txt=[" + topPage.mLinkText + "], " + pagingHref);

306 }	308 }

307	309

308 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {	310 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_PAGING_INFO)) {

309 logDbgInfoToConsole(pageLink, pagingHref, allLinks);	311 logDbgInfoToConsole(pageLink, pagingHref, allLinks);

310 }	312 }

311	313

312 return pagingHref;	314 return pagingHref;

313 }	315 }

314	316

315 private static String findBaseUrl() {	317 private static String getLocationHost(String original_domain) {

	318 return original_domain.isEmpty() ? Window.Location.getHost() : original_domain;

	319 }

	320

	321 private static String findBaseUrl(String original_domain) {

316 // This extracts relevant parts from the window location's path based on various heuristics	322 // This extracts relevant parts from the window location's path based on various heuristics

317 // to determine the path of the base URL of the document. This path is then appended to the	323 // to determine the path of the base URL of the document. This path is then appended to the

318 // window location protocol and host to form the base URL of the document. This base URL is	324 // window location protocol and host to form the base URL of the document. This base URL is

319 // then used as reference for comparison against an anchor's href to determine if the	325 // then used as reference for comparison against an anchor's href to determine if the

320 // anchor is a next or previous paging link.	326 // anchor is a next or previous paging link.

321	327

322 // First, from the window's location's path, extract the segments delimited by '/'. Then,	328 // First, from the window's location's path, extract the segments delimited by '/'. Then,

323 // because the later segments probably contain less relevant information for the base URL,	329 // because the later segments probably contain less relevant information for the base URL,

324 // reverse the segments for easier processing.	330 // reverse the segments for easier processing.

325 // Note: '?' is a special character in RegEx, so enclose it within [] to specify the actual	331 // Note: '?' is a special character in RegEx, so enclose it within [] to specify the actual

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
365 // If the first or second segment is shorter than 3 characters, and the first	371 // If the first or second segment is shorter than 3 characters, and the first

366 // segment was purely alphas, ignore it.	372 // segment was purely alphas, ignore it.

367 if (i < 2 && segment.length() < 3 && !StringUtil.match(urlSlashes[0], "[a-z]")) {	373 if (i < 2 && segment.length() < 3 && !StringUtil.match(urlSlashes[0], "[a-z]")) {

368 continue;	374 continue;

369 }	375 }

370	376

371 // If we got here, append the segment to cleanedSegments.	377 // If we got here, append the segment to cleanedSegments.

372 cleanedSegments.add(segment);	378 cleanedSegments.add(segment);

373 } // for all urlSlashes	379 } // for all urlSlashes

374	380

375 return Window.Location.getProtocol() + "//" + Window.Location.getHost() + "/" +	381 return Window.Location.getProtocol() + "//" + getLocationHost(original_domain) + "/" +

376 reverseJoin(cleanedSegments, "/");	382 reverseJoin(cleanedSegments, "/");

377 }	383 }

378	384

379 private static String reverseJoin(List<String> array, String delim) {	385 private static String reverseJoin(List<String> array, String delim) {

380 // As per http://stackoverflow.com/questions/5748044/gwt-string-concatenation-operator-vs-stringbuffer,	386 // As per http://stackoverflow.com/questions/5748044/gwt-string-concatenation-operator-vs-stringbuffer,

381 // + operator is faster for javascript than StringBuffer/StringBuilder.	387 // + operator is faster for javascript than StringBuffer/StringBuilder.

382 String joined = "";	388 String joined = "";

383 for (int i = array.size() - 1; i >= 0; i--) {	389 for (int i = array.size() - 1; i >= 0; i--) {

384 joined += array.get(i);	390 joined += array.get(i);

385 if (i > 0) joined += delim;	391 if (i > 0) joined += delim;

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
449 }	455 }

450	456

451 private enum PageLink {	457 private enum PageLink {

452 NEXT,	458 NEXT,

453 PREV,	459 PREV,

454 }	460 }

455	461

456 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Element, String>();	462 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Element, String>();

457	463

458 }	464 }

OLD	NEW

« no previous file with comments | « src/com/dom_distiller/client/DomDistiller.java ('k') | test/com/dom_distiller/client/PagingLinksFinderTest.java » ('j') | no next file with comments »