| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
| 8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
| 9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
| 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
| (...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 160 public List<String> getImageUrls() { | 160 public List<String> getImageUrls() { |
| 161 return imageUrls; | 161 return imageUrls; |
| 162 } | 162 } |
| 163 | 163 |
| 164 /** | 164 /** |
| 165 * Get the element of the main article, if any. | 165 * Get the element of the main article, if any. |
| 166 * @return An element of article (not necessarily the html5 article element). | 166 * @return An element of article (not necessarily the html5 article element). |
| 167 */ | 167 */ |
| 168 private Element getArticleElement(Element root) { | 168 private Element getArticleElement(Element root) { |
| 169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); | 169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
| 170 List<Element> visibleElements = getVisibleElements(allArticles); |
| 170 // Having multiple article elements usually indicates a bad case for this shortcut. | 171 // Having multiple article elements usually indicates a bad case for this shortcut. |
| 171 // TODO(wychen): some sites exclude things like title and author in article element. | 172 // TODO(wychen): some sites exclude things like title and author in article element. |
| 172 if (allArticles.getLength() == 1) { | 173 if (visibleElements.size() == 1) { |
| 173 return allArticles.getItem(0); | 174 return visibleElements.get(0); |
| 174 } | 175 } |
| 175 // Note that the CSS property matching is case sensitive, and "Article" is the correct | 176 // Note that the CSS property matching is case sensitive, and "Article" is the correct |
| 176 // capitalization. | 177 // capitalization. |
| 177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; | 178 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; |
| 178 allArticles = DomUtil.querySelectorAll(root, query); | 179 allArticles = DomUtil.querySelectorAll(root, query); |
| 180 visibleElements = getVisibleElements(allArticles); |
| 179 // It is commonly seen that the article is wrapped separately or in multiple layers. | 181 // It is commonly seen that the article is wrapped separately or in multiple layers. |
| 180 if (allArticles.getLength() > 0) { | 182 if (visibleElements.size() > 0) { |
| 181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); | 183 return Element.as(DomUtil.getNearestCommonAncestor(visibleElements)); |
| 182 } | 184 } |
| 183 return null; | 185 return null; |
| 184 } | 186 } |
| 185 | 187 |
| 186 /** | 188 /** |
| 189 * Get a list of visible elements. |
| 190 * @return A list of visible elements. |
| 191 */ |
| 192 private List<Element> getVisibleElements(NodeList<Element> nodeList) { |
| 193 List<Element> visibleElements = new ArrayList<>(); |
| 194 for (int i = 0; i < nodeList.getLength(); i ++) { |
| 195 Element element = nodeList.getItem(i); |
| 196 if (DomUtil.isVisible(element)) { |
| 197 visibleElements.add(element); |
| 198 } |
| 199 } |
| 200 return visibleElements; |
| 201 } |
| 202 |
| 203 /** |
| 187 * Converts the original HTML page into a WebDocument for analysis. | 204 * Converts the original HTML page into a WebDocument for analysis. |
| 188 */ | 205 */ |
| 189 private WebDocumentInfo createWebDocumentInfoFromPage() { | 206 private WebDocumentInfo createWebDocumentInfoFromPage() { |
| 190 WebDocumentInfo info = new WebDocumentInfo(); | 207 WebDocumentInfo info = new WebDocumentInfo(); |
| 191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); | 208 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
| 192 DomConverter converter = new DomConverter(documentBuilder); | 209 DomConverter converter = new DomConverter(documentBuilder); |
| 193 Element walkerRoot = getArticleElement(documentElement); | 210 Element walkerRoot = getArticleElement(documentElement); |
| 194 if (walkerRoot == null) { | 211 if (walkerRoot == null) { |
| 195 walkerRoot = documentElement; | 212 walkerRoot = documentElement; |
| 196 } | 213 } |
| (...skipping 11 matching lines...) Expand all Loading... |
| 208 * | 225 * |
| 209 * @param document the WebDocument representation of the page extracted from the DOM. | 226 * @param document the WebDocument representation of the page extracted from the DOM. |
| 210 */ | 227 */ |
| 211 private void processDocument(WebDocument document) { | 228 private void processDocument(WebDocument document) { |
| 212 TextDocument textDocument = document.createTextDocumentView(); | 229 TextDocument textDocument = document.createTextDocumentView(); |
| 213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 230 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
| 214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); | 231 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); |
| 215 textDocument.applyToModel(); | 232 textDocument.applyToModel(); |
| 216 } | 233 } |
| 217 } | 234 } |
| OLD | NEW |