src/com/dom_distiller/client/DocumentTitleGetter.java - Issue 449923002: gwt getInnerText -> javascript innerText or textContent

Side by Side Diff: src/com/dom_distiller/client/DocumentTitleGetter.java

Issue 449923002: gwt getInnerText -> javascript innerText or textContent (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « src/com/dom_distiller/client/ContentExtractor.java ('k') | src/com/dom_distiller/client/DomUtil.java » ('j') | no next file with comments »

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 /*	5 /*

6 * Parts of this file are adapted from Readability.	6 * Parts of this file are adapted from Readability.

7 *	7 *

8 * Readability is Copyright (c) 2010 Src90 Inc	8 * Readability is Copyright (c) 2010 Src90 Inc

9 * and licenced under the Apache License, Version 2.0.	9 * and licenced under the Apache License, Version 2.0.

10 */	10 */

(...skipping 16 matching lines...) Expand all Loading...
27 * title in its TextBlock's, and marks a TextBlock with DefaultLabels.TITLE label if its text is	27 * title in its TextBlock's, and marks a TextBlock with DefaultLabels.TITLE label if its text is

28 * identical to one of the substrings. ExpandTitleToContentFilter then uses these marked	28 * identical to one of the substrings. ExpandTitleToContentFilter then uses these marked

29 * TextBlock's to further mark more TextBlocks as content. Lastly, BoilerplateBlockFilter makes	29 * TextBlock's to further mark more TextBlocks as content. Lastly, BoilerplateBlockFilter makes

30 * sure to block filtering of these TITLE-marked TextBlock's.	30 * sure to block filtering of these TITLE-marked TextBlock's.

31 */	31 */

32 public class DocumentTitleGetter {	32 public class DocumentTitleGetter {

33 /**	33 /**

34 * @return The title of the distilled document.	34 * @return The title of the distilled document.

35 */	35 */

36 public static String getDocumentTitle(Object objTitle, Element root) {	36 public static String getDocumentTitle(Object objTitle, Element root) {

37 String currTitle = "", origTitle = "";	37 String currTitle = "", origTitle = "";

38	38

39 if (objTitle.getClass() == currTitle.getClass()) { // If objTitle is of String type.	39 if (objTitle.getClass() == currTitle.getClass()) { // If objTitle is of String type.

40 currTitle = origTitle = objTitle.toString();	40 currTitle = origTitle = objTitle.toString();

41 } else if (root != null) { // Otherwise, use text of first TITLE element.	41 } else if (root != null) { // Otherwise, use text of first TITLE element.

42 NodeList<Element> titles = root.getElementsByTagName("TITLE");	42 NodeList<Element> titles = root.getElementsByTagName("TITLE");

43 if (titles.getLength() > 0) {	43 if (titles.getLength() > 0) {

44 currTitle = origTitle = titles.getItem(0).getInnerText();	44 // Use javacript textContent instead of javascript innerText; the latter only returns

	45 // visible text, but <title> tags are invisible.

	46 currTitle = origTitle = DomUtil.javascriptTextContent(titles.getItem(0));

45 }	47 }

46 }	48 }

47 if (currTitle == "") return "";	49 if (currTitle == "") return "";

48	50

49 if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/or '-'.	51 if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/or '-'.

50 // Get part before last '|' or '-'.	52 // Get part before last '|' or '-'.

51 currTitle = StringUtil.findAndReplace(origTitle, "(.*)[\\|\\-] .*", "$1");	53 currTitle = StringUtil.findAndReplace(origTitle, "(.*)[\\|\\-] .*", "$1");

52 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words.	54 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words.

53 // Get part after first '|' or '-'.	55 // Get part after first '|' or '-'.

54 currTitle = StringUtil.findAndReplace(origTitle, "[^\\|\\-]*[\\|\\-](.*)", "$1");	56 currTitle = StringUtil.findAndReplace(origTitle, "[^\\|\\-]*[\\|\\-](.*)", "$1");

55 }	57 }

56 } else if (currTitle.indexOf(": ") != -1) { // Title has ':'.	58 } else if (currTitle.indexOf(": ") != -1) { // Title has ':'.

57 // Get part after last ':'.	59 // Get part after last ':'.

58 currTitle = StringUtil.findAndReplace(origTitle, ".*:(.*)", "$1");	60 currTitle = StringUtil.findAndReplace(origTitle, ".*:(.*)", "$1");

59 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words.	61 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words.

60 // Get part after first ':'.	62 // Get part after first ':'.

61 currTitle = StringUtil.findAndReplace(origTitle, "[^:]*[:](.*)", "$1");	63 currTitle = StringUtil.findAndReplace(origTitle, "[^:]*[:](.*)", "$1");

62 }	64 }

63 } else if (root != null && (currTitle.length() > 150 || currTitle.length() < 15)) {	65 } else if (root != null && (currTitle.length() > 150 || currTitle.length() < 15)) {

64 // Get plain text from the only H1 element.	66 // Get plain text from the only H1 element.

65 // TODO(kuan): this is what readability does, but this block may make more sense as an	67 // TODO(kuan): this is what readability does, but this block may make more sense as an

66 // if rather than else-if, e.g. currently this else-if block is used when original title	68 // if rather than else-if, e.g. currently this else-if block is used when original title

67 // is "foo" but not when it is "foo |" or "foo:".	69 // is "foo" but not when it is "foo |" or "foo:".

68 currTitle = findTheOnlyH1(root);	70 currTitle = findFirstH1(root);

69 if (currTitle == null) currTitle = origTitle;	71 if (currTitle.isEmpty()) currTitle = origTitle;

70 }	72 }

71	73

72 currTitle = StringUtil.trim(currTitle);	74 currTitle = StringUtil.trim(currTitle);

73	75

74 if (StringUtil.splitLength(currTitle, "\\s+") <= 4) currTitle = origTitle;	76 if (StringUtil.splitLength(currTitle, "\\s+") <= 4) currTitle = origTitle;

75	77

76 return currTitle;	78 return currTitle;

77 }	79 }

78	80

79	81

80 private static String findTheOnlyH1(Element root) {	82 private static String findFirstH1(Element root) {

81 NodeList<Element> hOnes = root.getElementsByTagName("H1");	83 NodeList<Element> hOnes = root.getElementsByTagName("H1");

82 return hOnes.getLength() == 1 ? hOnes.getItem(0).getInnerText() : null;	84 // Use javacript innerText instead of javascript textContent; the former only returns

	85 // visible text, and we assume visible H1's are more inclined to being p otential titles.

	86 String h1 = "";

	87 for (int i = 0; i < hOnes.getLength() && h1.isEmpty(); i++) {

	88 h1 = DomUtil.getInnerText(hOnes.getItem(i));

	89 }

	90 return h1;

83 }	91 }

84 }	92 }

OLD	NEW