Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(356)

Side by Side Diff: src/com/dom_distiller/client/DocumentTitleGetter.java

Issue 449923002: gwt getInnerText -> javascript innerText or textContent (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 /* 5 /*
6 * Parts of this file are adapted from Readability. 6 * Parts of this file are adapted from Readability.
7 * 7 *
8 * Readability is Copyright (c) 2010 Src90 Inc 8 * Readability is Copyright (c) 2010 Src90 Inc
9 * and licenced under the Apache License, Version 2.0. 9 * and licenced under the Apache License, Version 2.0.
10 */ 10 */
(...skipping 16 matching lines...) Expand all
27 * title in its TextBlock's, and marks a TextBlock with DefaultLabels.TITLE label if its text is 27 * title in its TextBlock's, and marks a TextBlock with DefaultLabels.TITLE label if its text is
28 * identical to one of the substrings. ExpandTitleToContentFilter then uses these marked 28 * identical to one of the substrings. ExpandTitleToContentFilter then uses these marked
29 * TextBlock's to further mark more TextBlocks as content. Lastly, BoilerplateBlockFilter makes 29 * TextBlock's to further mark more TextBlocks as content. Lastly, BoilerplateBlockFilter makes
30 * sure to block filtering of these TITLE-marked TextBlock's. 30 * sure to block filtering of these TITLE-marked TextBlock's.
31 */ 31 */
32 public class DocumentTitleGetter { 32 public class DocumentTitleGetter {
33 /** 33 /**
34 * @return The title of the distilled document. 34 * @return The title of the distilled document.
35 */ 35 */
36 public static String getDocumentTitle(Object objTitle, Element root) { 36 public static String getDocumentTitle(Object objTitle, Element root) {
37 String currTitle = "", origTitle = ""; 37 String currTitle = "", origTitle = "";
38 38
39 if (objTitle.getClass() == currTitle.getClass()) { // If objTitle is of String type. 39 if (objTitle.getClass() == currTitle.getClass()) { // If objTitle is of String type.
40 currTitle = origTitle = objTitle.toString(); 40 currTitle = origTitle = objTitle.toString();
41 } else if (root != null) { // Otherwise, use text of first TITLE element. 41 } else if (root != null) { // Otherwise, use text of first TITLE element.
42 NodeList<Element> titles = root.getElementsByTagName("TITLE"); 42 NodeList<Element> titles = root.getElementsByTagName("TITLE");
43 if (titles.getLength() > 0) { 43 if (titles.getLength() > 0) {
44 currTitle = origTitle = titles.getItem(0).getInnerText(); 44 // Use javascript textContent instead of javascript innerText; the latter only returns
45 // visible text, but <title> tags are invisible.
46 currTitle = origTitle = DomUtil.javascriptTextContent(titles.getItem(0));
45 } 47 }
46 } 48 }
47 if (currTitle == "") return ""; 49 if (currTitle == "") return "";
48 50
49 if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/or '-'. 51 if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/or '-'.
50 // Get part before last '|' or '-'. 52 // Get part before last '|' or '-'.
51 currTitle = StringUtil.findAndReplace(origTitle, "(.*)[\\|\\-] .*", "$1"); 53 currTitle = StringUtil.findAndReplace(origTitle, "(.*)[\\|\\-] .*", "$1");
52 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words. 54 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words.
53 // Get part after first '|' or '-'. 55 // Get part after first '|' or '-'.
54 currTitle = StringUtil.findAndReplace(origTitle, "[^\\|\\-]*[\\|\\-](.*)", "$1"); 56 currTitle = StringUtil.findAndReplace(origTitle, "[^\\|\\-]*[\\|\\-](.*)", "$1");
55 } 57 }
56 } else if (currTitle.indexOf(": ") != -1) { // Title has ':'. 58 } else if (currTitle.indexOf(": ") != -1) { // Title has ':'.
57 // Get part after last ':'. 59 // Get part after last ':'.
58 currTitle = StringUtil.findAndReplace(origTitle, ".*:(.*)", "$1"); 60 currTitle = StringUtil.findAndReplace(origTitle, ".*:(.*)", "$1");
59 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words. 61 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has < 3 words.
60 // Get part after first ':'. 62 // Get part after first ':'.
61 currTitle = StringUtil.findAndReplace(origTitle, "[^:]*[:](.*)", "$1"); 63 currTitle = StringUtil.findAndReplace(origTitle, "[^:]*[:](.*)", "$1");
62 } 64 }
63 } else if (root != null && (currTitle.length() > 150 || currTitle.length() < 15)) { 65 } else if (root != null && (currTitle.length() > 150 || currTitle.length() < 15)) {
64 // Get plain text from the only H1 element. 66 // Get plain text from the only H1 element.
65 // TODO(kuan): this is what readability does, but this block may make more sense as an 67 // TODO(kuan): this is what readability does, but this block may make more sense as an
66 // if rather than else-if, e.g. currently this else-if block is used when original title 68 // if rather than else-if, e.g. currently this else-if block is used when original title
67 // is "foo" but not when it is "foo |" or "foo:". 69 // is "foo" but not when it is "foo |" or "foo:".
68 currTitle = findTheOnlyH1(root); 70 currTitle = findFirstH1(root);
69 if (currTitle == null) currTitle = origTitle; 71 if (currTitle.isEmpty()) currTitle = origTitle;
70 } 72 }
71 73
72 currTitle = StringUtil.trim(currTitle); 74 currTitle = StringUtil.trim(currTitle);
73 75
74 if (StringUtil.splitLength(currTitle, "\\s+") <= 4) currTitle = origTitle; 76 if (StringUtil.splitLength(currTitle, "\\s+") <= 4) currTitle = origTitle;
75 77
76 return currTitle; 78 return currTitle;
77 } 79 }
78 80
79 81
80 private static String findTheOnlyH1(Element root) { 82 private static String findFirstH1(Element root) {
81 NodeList<Element> hOnes = root.getElementsByTagName("H1"); 83 NodeList<Element> hOnes = root.getElementsByTagName("H1");
82 return hOnes.getLength() == 1 ? hOnes.getItem(0).getInnerText() : null; 84 // Use javascript innerText instead of javascript textContent; the former only returns
85 // visible text, and we assume visible H1's are more inclined to being potential titles.
86 String h1 = "";
87 for (int i = 0; i < hOnes.getLength() && h1.isEmpty(); i++) {
88 h1 = DomUtil.getInnerText(hOnes.getItem(i));
89 }
90 return h1;
83 } 91 }
84 } 92 }
OLDNEW
« no previous file with comments | « src/com/dom_distiller/client/ContentExtractor.java ('k') | src/com/dom_distiller/client/DomUtil.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698