OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
160 public List<String> getImageUrls() { | 160 public List<String> getImageUrls() { |
161 return imageUrls; | 161 return imageUrls; |
162 } | 162 } |
163 | 163 |
164 /** | 164 /** |
165 * Get the element of the main article, if any. | 165 * Get the element of the main article, if any. |
166 * @return An element of article (not necessarily the html5 article element)
. | 166 * @return An element of article (not necessarily the html5 article element)
. |
167 */ | 167 */ |
168 private Element getArticleElement(Element root) { | 168 private Element getArticleElement(Element root) { |
169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); | 169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
| 170 List<Element> visibleElements = getVisibleElements(allArticles); |
170 // Having multiple article elements usually indicates a bad case for thi
s shortcut. | 171 // Having multiple article elements usually indicates a bad case for thi
s shortcut. |
171 // TODO(wychen): some sites exclude things like title and author in arti
cle element. | 172 // TODO(wychen): some sites exclude things like title and author in arti
cle element. |
172 if (allArticles.getLength() == 1) { | 173 if (visibleElements.size() == 1) { |
173 return allArticles.getItem(0); | 174 return visibleElements.get(0); |
174 } | 175 } |
175 // Note that the CSS property matching is case sensitive, and "Article"
is the correct | 176 // Note that the CSS property matching is case sensitive, and "Article"
is the correct |
176 // capitalization. | 177 // capitalization. |
177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*
=\"Post\"]"; | 178 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*
=\"Post\"]"; |
178 allArticles = DomUtil.querySelectorAll(root, query); | 179 allArticles = DomUtil.querySelectorAll(root, query); |
| 180 visibleElements = getVisibleElements(allArticles); |
179 // It is commonly seen that the article is wrapped separately or in mult
iple layers. | 181 // It is commonly seen that the article is wrapped separately or in mult
iple layers. |
180 if (allArticles.getLength() > 0) { | 182 if (visibleElements.size() > 0) { |
181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); | 183 return Element.as(DomUtil.getNearestCommonAncestor(visibleElements))
; |
182 } | 184 } |
183 return null; | 185 return null; |
184 } | 186 } |
185 | 187 |
186 /** | 188 /** |
| 189 * Get a list of visible elements. |
| 190 * @return A list of visible elements. |
| 191 */ |
| 192 private List<Element> getVisibleElements(NodeList<Element> nodeList) { |
| 193 List<Element> visibleElements = new ArrayList<>(); |
| 194 for (int i = 0; i < nodeList.getLength(); i++) { |
| 195 Element element = nodeList.getItem(i); |
| 196 if (DomUtil.isVisible(element)) { |
| 197 visibleElements.add(element); |
| 198 } |
| 199 } |
| 200 return visibleElements; |
| 201 } |
| 202 |
| 203 /** |
187 * Converts the original HTML page into a WebDocument for analysis. | 204 * Converts the original HTML page into a WebDocument for analysis. |
188 */ | 205 */ |
189 private WebDocumentInfo createWebDocumentInfoFromPage() { | 206 private WebDocumentInfo createWebDocumentInfoFromPage() { |
190 WebDocumentInfo info = new WebDocumentInfo(); | 207 WebDocumentInfo info = new WebDocumentInfo(); |
191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); | 208 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
192 DomConverter converter = new DomConverter(documentBuilder); | 209 DomConverter converter = new DomConverter(documentBuilder); |
193 Element walkerRoot = getArticleElement(documentElement); | 210 Element walkerRoot = getArticleElement(documentElement); |
194 if (walkerRoot == null) { | 211 if (walkerRoot == null) { |
195 walkerRoot = documentElement; | 212 walkerRoot = documentElement; |
196 } | 213 } |
(...skipping 11 matching lines...) Expand all Loading... |
208 * | 225 * |
209 * @param document the WebDocument representation of the page extracted from
the DOM. | 226 * @param document the WebDocument representation of the page extracted from
the DOM. |
210 */ | 227 */ |
211 private void processDocument(WebDocument document) { | 228 private void processDocument(WebDocument document) { |
212 TextDocument textDocument = document.createTextDocumentView(); | 229 TextDocument textDocument = document.createTextDocumentView(); |
213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 230 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); | 231 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); |
215 textDocument.applyToModel(); | 232 textDocument.applyToModel(); |
216 } | 233 } |
217 } | 234 } |
OLD | NEW |