src/com/dom_distiller/client/PagingLinksFinder.java - Issue 449923002: gwt getInnerText -> javascript innerText or textContent

Side by Side Diff: src/com/dom_distiller/client/PagingLinksFinder.java

Issue 449923002: gwt getInnerText -> javascript innerText or textContent (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 /*	5 /*

6 * Parts of this file are adapted from Readability.	6 * Parts of this file are adapted from Readability.

7 *	7 *

8 * Readability is Copyright (c) 2010 Src90 Inc	8 * Readability is Copyright (c) 2010 Src90 Inc

9 * and licenced under the Apache License, Version 2.0.	9 * and licenced under the Apache License, Version 2.0.

10 */	10 */

(...skipping 115 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
126 }	126 }

127	127

128 // If it's on a different domain, skip it.	128 // If it's on a different domain, skip it.

129 String[] urlSlashes = StringUtil.split(linkHref, "\\/+");	129 String[] urlSlashes = StringUtil.split(linkHref, "\\/+");

130 if (urlSlashes.length < 3 || // Expect at least the protocol, domain, and path.	130 if (urlSlashes.length < 3 || // Expect at least the protocol, domain, and path.

131 !Window.Location.getHost().equalsIgnoreCase(urlSlashes[1])) {	131 !Window.Location.getHost().equalsIgnoreCase(urlSlashes[1])) {

132 appendDbgStrForLink(link, "ignored: different domain");	132 appendDbgStrForLink(link, "ignored: different domain");

133 continue;	133 continue;

134 }	134 }

135	135

136 String linkText = link.getInnerText();	136 // Use javascript innerText (instead of javascript textContent) to only get visible

	137 // text.

	138 String linkText = DomUtil.getInnerText(link);

137	139

138 // If the linkText looks like it's not the next or previous page, skip it.	140 // If the linkText looks like it's not the next or previous page, skip it.

139 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length() > 25) {	141 if (StringUtil.match(linkText, EXTRANEOUS_REGEX) || linkText.length() > 25) {

140 appendDbgStrForLink(link, "ignored: one of extra");	142 appendDbgStrForLink(link, "ignored: one of extra");

141 continue;	143 continue;

142 }	144 }

143	145

144 // For next page link, if the initial part of the URL is identical to the base URL, but	146 // For next page link, if the initial part of the URL is identical to the base URL, but

145 // the rest of it doesn't contain any digits, it's certainly not a next page link.	147 // the rest of it doesn't contain any digits, it's certainly not a next page link.

146 // However, this doesn't apply to previous page link, because most sites will just have	148 // However, this doesn't apply to previous page link, because most sites will just have

(...skipping 262 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
409 // TODO(kuan): investigate how to get logging when running "ant test.prod" - currently,	411 // TODO(kuan): investigate how to get logging when running "ant test.prod" - currently,

410 // nothing appears. In the meantime, throwing an exception with a log message at suspicious	412 // nothing appears. In the meantime, throwing an exception with a log message at suspicious

411 // codepoints can produce a call stack and help debugging, albeit tediously.	413 // codepoints can produce a call stack and help debugging, albeit tediously.

412 LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " +	414 LogUtil.logToConsole("numLinks=" + allLinks.getLength() + ", found " +

413 (pageLink == PageLink.NEXT ? "next: " : "prev: ") +	415 (pageLink == PageLink.NEXT ? "next: " : "prev: ") +

414 (pagingHref != null ? pagingHref : "null"));	416 (pagingHref != null ? pagingHref : "null"));

415	417

416 for (int i = 0; i < allLinks.getLength(); i++) {	418 for (int i = 0; i < allLinks.getLength(); i++) {

417 AnchorElement link = AnchorElement.as(allLinks.getItem(i));	419 AnchorElement link = AnchorElement.as(allLinks.getItem(i));

418	420

419 String text = link.getInnerText();	421 // Use javascript innerText (instead of javascript textContent) to get only visible

	422 // text.

	423 String text = DomUtil.getInnerText(link);

420 // Trim unnecessary whitespaces from text.	424 // Trim unnecessary whitespaces from text.

421 String[] words = StringUtil.split(text, "\\s+");	425 String[] words = StringUtil.split(text, "\\s+");

422 text = "";	426 text = "";

423 for (int w = 0; w < words.length; w++) {	427 for (int w = 0; w < words.length; w++) {

424 text += words[w];	428 text += words[w];

425 if (w < words.length - 1) text += " ";	429 if (w < words.length - 1) text += " ";

426 }	430 }

427	431

428 LogUtil.logToConsole(i + ")" + link.getHref() + ", txt=[" + text + "], dbg=[" +	432 LogUtil.logToConsole(i + ")" + link.getHref() + ", txt=[" + text + "], dbg=[" +

429 mLinkDebugInfo.get(link) + "]");	433 mLinkDebugInfo.get(link) + "]");

(...skipping 15 matching lines...) Expand all Loading...
445 }	449 }

446	450

447 private enum PageLink {	451 private enum PageLink {

448 NEXT,	452 NEXT,

449 PREV,	453 PREV,

450 }	454 }

451	455

452 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Element, String>();	456 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Element, String>();

453	457

454 }	458 }

OLD	NEW

« no previous file with comments | « src/com/dom_distiller/client/IEReadingViewParser.java ('k') | src/com/dom_distiller/client/SchemaOrgParser.java » ('j') | no next file with comments »