| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package com.dom_distiller.client; | 5 package com.dom_distiller.client; |
| 6 | 6 |
| 7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
| 8 import java.util.EnumMap; | 8 import java.util.EnumMap; |
| 9 import java.util.List; | 9 import java.util.List; |
| 10 import java.util.Map; | 10 import java.util.Map; |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 126 } | 126 } |
| 127 } | 127 } |
| 128 } | 128 } |
| 129 | 129 |
| 130 private void findDate() { | 130 private void findDate() { |
| 131 mDate = ""; | 131 mDate = ""; |
| 132 | 132 |
| 133 // Get date from any element that includes the "dateline" class. | 133 // Get date from any element that includes the "dateline" class. |
| 134 Element elem = DomUtil.getFirstElementWithClassName(mRoot, "dateline"); | 134 Element elem = DomUtil.getFirstElementWithClassName(mRoot, "dateline"); |
| 135 if (elem != null) { | 135 if (elem != null) { |
| 136 mDate = elem.getInnerText(); | 136 // Use javascript textContent (instead of javascript innerText) to i
nclude invisible |
| 137 // text. |
| 138 mDate = DomUtil.javascriptTextContent(elem); |
| 137 } else { // Otherwise, get date from meta tag with "displaydate" as nam
e. | 139 } else { // Otherwise, get date from meta tag with "displaydate" as nam
e. |
| 138 for (int i = 0; i < mAllMeta.getLength(); i++) { | 140 for (int i = 0; i < mAllMeta.getLength(); i++) { |
| 139 MetaElement meta = MetaElement.as(mAllMeta.getItem(i)); | 141 MetaElement meta = MetaElement.as(mAllMeta.getItem(i)); |
| 140 if (meta.getName().equalsIgnoreCase("displaydate")) { | 142 if (meta.getName().equalsIgnoreCase("displaydate")) { |
| 141 mDate = meta.getContent(); | 143 mDate = meta.getContent(); |
| 142 break; | 144 break; |
| 143 } | 145 } |
| 144 } | 146 } |
| 145 } | 147 } |
| 146 } | 148 } |
| 147 | 149 |
| 148 private void findAuthor() { | 150 private void findAuthor() { |
| 149 mAuthor = ""; | 151 mAuthor = ""; |
| 150 | 152 |
| 151 // Get author from the first element that includes the "byline-name" cla
ss. | 153 // Get author from the first element that includes the "byline-name" cla
ss. |
| 152 // Note that we ignore the order of this element for now. | 154 // Note that we ignore the order of this element for now. |
| 153 Element elem = DomUtil.getFirstElementWithClassName(mRoot, "byline-name"
); | 155 Element elem = DomUtil.getFirstElementWithClassName(mRoot, "byline-name"
); |
| 154 if (elem != null) mAuthor = elem.getInnerText(); | 156 // Use javascript textContent (instead of javascript innerText) to inclu
de invisible text. |
| 157 if (elem != null) mAuthor = DomUtil.javascriptTextContent(elem); |
| 155 } | 158 } |
| 156 | 159 |
| 157 private void findPublisher() { | 160 private void findPublisher() { |
| 158 mPublisher = ""; | 161 mPublisher = ""; |
| 159 | 162 |
| 160 // Look for "publisher" or "source_organization" attribute in any html t
ag. | 163 // Look for "publisher" or "source_organization" attribute in any html t
ag. |
| 161 NodeList<Element> allElems = mRoot.getElementsByTagName("*"); | 164 NodeList<Element> allElems = mRoot.getElementsByTagName("*"); |
| 162 for (int i = 0; i < allElems.getLength() && mPublisher.isEmpty(); i++) { | 165 for (int i = 0; i < allElems.getLength() && mPublisher.isEmpty(); i++) { |
| 163 Element e = allElems.getItem(i); | 166 Element e = allElems.getItem(i); |
| 164 mPublisher = e.getAttribute("publisher"); | 167 mPublisher = e.getAttribute("publisher"); |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 221 double aspectRatio = (double) width / (double) image.getHeight(); | 224 double aspectRatio = (double) width / (double) image.getHeight(); |
| 222 return aspectRatio >= 1.3 && aspectRatio <= 3.0; | 225 return aspectRatio >= 1.3 && aspectRatio <= 3.0; |
| 223 } | 226 } |
| 224 | 227 |
| 225 private static String getCaption(ImageElement image) { | 228 private static String getCaption(ImageElement image) { |
| 226 // If |image| is a child of <figure>, then get the <figcaption> elements
. | 229 // If |image| is a child of <figure>, then get the <figcaption> elements
. |
| 227 Element parent = image.getParentElement(); | 230 Element parent = image.getParentElement(); |
| 228 if (!parent.hasTagName("FIGURE")) return ""; | 231 if (!parent.hasTagName("FIGURE")) return ""; |
| 229 NodeList<Element> captions = parent.getElementsByTagName("FIGCAPTION"); | 232 NodeList<Element> captions = parent.getElementsByTagName("FIGCAPTION"); |
| 230 int numCaptions = captions.getLength(); | 233 int numCaptions = captions.getLength(); |
| 231 if (numCaptions > 0 && numCaptions <= 2) | 234 String caption = ""; |
| 232 return captions.getItem(0).getInnerText(); // Just use the first on
e. | 235 if (numCaptions > 0 && numCaptions <= 2) { |
| 233 return ""; | 236 // Use javascript innerText (instead of javascript textContent) to g
et only visible |
| 237 // captions. |
| 238 for (int i = 0; i < numCaptions && caption.isEmpty(); i++) { |
| 239 caption = DomUtil.getInnerText(captions.getItem(i)); |
| 240 } |
| 241 } |
| 242 return caption; |
| 234 } | 243 } |
| 235 | 244 |
| 236 private static boolean isTextInBody(Element root, String text) { | 245 private static boolean isTextInBody(Element root, String text) { |
| 237 String lowerText = text.toLowerCase(); | 246 String lowerText = text.toLowerCase(); |
| 238 NodeList<Element> bodies = root.getElementsByTagName("BODY"); | 247 NodeList<Element> bodies = root.getElementsByTagName("BODY"); |
| 239 for (int i = 0; i < bodies.getLength(); i++) { | 248 for (int i = 0; i < bodies.getLength(); i++) { |
| 240 if (bodies.getItem(i).getInnerText().toLowerCase().contains(lowerTex
t)) return true; | 249 // Use javascript textContent (instead of javascript innerText) to i
nclude invisible |
| 250 // text. |
| 251 if (DomUtil.javascriptTextContent( |
| 252 bodies.getItem(i)).toLowerCase().contains(lowerText)) { |
| 253 return true; |
| 254 } |
| 241 } | 255 } |
| 242 return false; | 256 return false; |
| 243 } | 257 } |
| 244 } | 258 } |
| OLD | NEW |