| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 /* | 5 /* |
| 6 * Parts of this file are adapted from Readability. | 6 * Parts of this file are adapted from Readability. |
| 7 * | 7 * |
| 8 * Readability is Copyright (c) 2010 Src90 Inc | 8 * Readability is Copyright (c) 2010 Src90 Inc |
| 9 * and licenced under the Apache License, Version 2.0. | 9 * and licenced under the Apache License, Version 2.0. |
| 10 */ | 10 */ |
| (...skipping 16 matching lines...) Expand all Loading... |
| 27 * title in its TextBlock's, and marks a TextBlock with DefaultLabels.TITLE labe
l if its text is | 27 * title in its TextBlock's, and marks a TextBlock with DefaultLabels.TITLE labe
l if its text is |
| 28 * identical to one of the substrings. ExpandTitleToContentFilter then uses the
se marked | 28 * identical to one of the substrings. ExpandTitleToContentFilter then uses the
se marked |
| 29 * TextBlock's to further mark more TextBlocks as content. Lastly, BoilerplateB
lockFilter makes | 29 * TextBlock's to further mark more TextBlocks as content. Lastly, BoilerplateB
lockFilter makes |
| 30 * sure to block filtering of these TITLE-marked TextBlock's. | 30 * sure to block filtering of these TITLE-marked TextBlock's. |
| 31 */ | 31 */ |
| 32 public class DocumentTitleGetter { | 32 public class DocumentTitleGetter { |
| 33 /** | 33 /** |
| 34 * @return The title of the distilled document. | 34 * @return The title of the distilled document. |
| 35 */ | 35 */ |
| 36 public static String getDocumentTitle(Object objTitle, Element root) { | 36 public static String getDocumentTitle(Object objTitle, Element root) { |
| 37 String currTitle = "", origTitle = ""; | 37 String currTitle = "", origTitle = ""; |
| 38 | 38 |
| 39 if (objTitle.getClass() == currTitle.getClass()) { // If objTitle is of
String type. | 39 if (objTitle.getClass() == currTitle.getClass()) { // If objTitle is of
String type. |
| 40 currTitle = origTitle = objTitle.toString(); | 40 currTitle = origTitle = objTitle.toString(); |
| 41 } else if (root != null) { // Otherwise, use text of first TITLE elemen
t. | 41 } else if (root != null) { // Otherwise, use text of first TITLE elemen
t. |
| 42 NodeList<Element> titles = root.getElementsByTagName("TITLE"); | 42 NodeList<Element> titles = root.getElementsByTagName("TITLE"); |
| 43 if (titles.getLength() > 0) { | 43 if (titles.getLength() > 0) { |
| 44 currTitle = origTitle = titles.getItem(0).getInnerText(); | 44 // Use javascript textContent instead of javascript innerText; the
latter only returns |
| 45 // visible text, but <title> tags are invisible. |
| 46 currTitle = origTitle = DomUtil.javascriptTextContent(titles.getIt
em(0)); |
| 45 } | 47 } |
| 46 } | 48 } |
| 47 if (currTitle == "") return ""; | 49 if (currTitle == "") return ""; |
| 48 | 50 |
| 49 if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/o
r '-'. | 51 if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/o
r '-'. |
| 50 // Get part before last '|' or '-'. | 52 // Get part before last '|' or '-'. |
| 51 currTitle = StringUtil.findAndReplace(origTitle, "(.*)[\\|\\-] .*",
"$1"); | 53 currTitle = StringUtil.findAndReplace(origTitle, "(.*)[\\|\\-] .*",
"$1"); |
| 52 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has <
3 words. | 54 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has <
3 words. |
| 53 // Get part after first '|' or '-'. | 55 // Get part after first '|' or '-'. |
| 54 currTitle = StringUtil.findAndReplace(origTitle, "[^\\|\\-]*[\\|
\\-](.*)", "$1"); | 56 currTitle = StringUtil.findAndReplace(origTitle, "[^\\|\\-]*[\\|
\\-](.*)", "$1"); |
| 55 } | 57 } |
| 56 } else if (currTitle.indexOf(": ") != -1) { // Title has ':'. | 58 } else if (currTitle.indexOf(": ") != -1) { // Title has ':'. |
| 57 // Get part after last ':'. | 59 // Get part after last ':'. |
| 58 currTitle = StringUtil.findAndReplace(origTitle, ".*:(.*)", "$1"); | 60 currTitle = StringUtil.findAndReplace(origTitle, ".*:(.*)", "$1"); |
| 59 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has <
3 words. | 61 if (StringUtil.splitLength(currTitle, "\\s+") < 3) { // Part has <
3 words. |
| 60 // Get part after first ':'. | 62 // Get part after first ':'. |
| 61 currTitle = StringUtil.findAndReplace(origTitle, "[^:]*[:](.*)", "
$1"); | 63 currTitle = StringUtil.findAndReplace(origTitle, "[^:]*[:](.*)", "
$1"); |
| 62 } | 64 } |
| 63 } else if (root != null && (currTitle.length() > 150 || currTitle.length
() < 15)) { | 65 } else if (root != null && (currTitle.length() > 150 || currTitle.length
() < 15)) { |
| 64 // Get plain text from the only H1 element. | 66 // Get plain text from the only H1 element. |
| 65 // TODO(kuan): this is what readability does, but this block may mak
e more sense as an | 67 // TODO(kuan): this is what readability does, but this block may mak
e more sense as an |
| 66 // if rather than else-if, e.g. currently this else-if block is used
when original title | 68 // if rather than else-if, e.g. currently this else-if block is used
when original title |
| 67 // is "foo" but not when it is "foo |" or "foo:". | 69 // is "foo" but not when it is "foo |" or "foo:". |
| 68 currTitle = findTheOnlyH1(root); | 70 currTitle = findFirstH1(root); |
| 69 if (currTitle == null) currTitle = origTitle; | 71 if (currTitle.isEmpty()) currTitle = origTitle; |
| 70 } | 72 } |
| 71 | 73 |
| 72 currTitle = StringUtil.trim(currTitle); | 74 currTitle = StringUtil.trim(currTitle); |
| 73 | 75 |
| 74 if (StringUtil.splitLength(currTitle, "\\s+") <= 4) currTitle = origTitl
e; | 76 if (StringUtil.splitLength(currTitle, "\\s+") <= 4) currTitle = origTitl
e; |
| 75 | 77 |
| 76 return currTitle; | 78 return currTitle; |
| 77 } | 79 } |
| 78 | 80 |
| 79 | 81 |
| 80 private static String findTheOnlyH1(Element root) { | 82 private static String findFirstH1(Element root) { |
| 81 NodeList<Element> hOnes = root.getElementsByTagName("H1"); | 83 NodeList<Element> hOnes = root.getElementsByTagName("H1"); |
| 82 return hOnes.getLength() == 1 ? hOnes.getItem(0).getInnerText() : null; | 84 // Use javascript innerText instead of javascript textContent; the former
only returns |
| 85 // visible text, and we assume visible H1's are more inclined to being p
otential titles. |
| 86 String h1 = ""; |
| 87 for (int i = 0; i < hOnes.getLength() && h1.isEmpty(); i++) { |
| 88 h1 = DomUtil.getInnerText(hOnes.getItem(i)); |
| 89 } |
| 90 return h1; |
| 83 } | 91 } |
| 84 } | 92 } |
| OLD | NEW |