OLD | NEW |
---|---|
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
160 public List<String> getImageUrls() { | 160 public List<String> getImageUrls() { |
161 return imageUrls; | 161 return imageUrls; |
162 } | 162 } |
163 | 163 |
164 /** | 164 /** |
165 * Get the element of the main article, if any. | 165 * Get the element of the main article, if any. |
166 * @return An element of article (not necessarily the html5 article element). | 166 * @return An element of article (not necessarily the html5 article element). |
167 */ | 167 */ |
168 private Element getArticleElement(Element root) { | 168 private Element getArticleElement(Element root) { |
169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); | 169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
170 List<Element> visibleElements = getVisibleElements(allArticles); | |
170 // Having multiple article elements usually indicates a bad case for this shortcut. | 171 // Having multiple article elements usually indicates a bad case for this shortcut. |
171 // TODO(wychen): some sites exclude things like title and author in article element. | 172 // TODO(wychen): some sites exclude things like title and author in article element. |
172 if (allArticles.getLength() == 1) { | 173 if (visibleElements.size() == 1) { |
173 return allArticles.getItem(0); | 174 return visibleElements.get(0); |
174 } | 175 } |
175 // Note that the CSS property matching is case sensitive, and "Article" is the correct | 176 // Note that the CSS property matching is case sensitive, and "Article" is the correct |
176 // capitalization. | 177 // capitalization. |
177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype* =\"Post\"]"; | 178 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype* =\"Post\"]"; |
178 allArticles = DomUtil.querySelectorAll(root, query); | 179 allArticles = DomUtil.querySelectorAll(root, query); |
180 visibleElements = getVisibleElements(allArticles); | |
179 // It is commonly seen that the article is wrapped separately or in multiple layers. | 181 // It is commonly seen that the article is wrapped separately or in multiple layers. |
180 if (allArticles.getLength() > 0) { | 182 if (visibleElements.size() > 0) { |
181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); | 183 return Element.as(DomUtil.getNearestCommonAncestor(visibleElements)) ; |
182 } | 184 } |
183 return null; | 185 return null; |
184 } | 186 } |
185 | 187 |
186 /** | 188 /** |
189 * Get a list of visible elements. | |
190 * @return A list of visible elements. | |
191 */ | |
192 private List<Element> getVisibleElements(NodeList<Element> nodeList) { | |
193 List<Element> visibleElements = new ArrayList<>(); | |
194 for (int i = 0; i < nodeList.getLength(); i ++) { | |
mdjones
2015/10/21 18:15:02
nit: write `i++` with no space before `++` (the loop header currently has `i ++`).
| |
195 Element element = nodeList.getItem(i); | |
196 if (DomUtil.isVisible(element)) { | |
197 visibleElements.add(element); | |
198 } | |
199 } | |
200 return visibleElements; | |
201 } | |
202 | |
203 /** | |
187 * Converts the original HTML page into a WebDocument for analysis. | 204 * Converts the original HTML page into a WebDocument for analysis. |
188 */ | 205 */ |
189 private WebDocumentInfo createWebDocumentInfoFromPage() { | 206 private WebDocumentInfo createWebDocumentInfoFromPage() { |
190 WebDocumentInfo info = new WebDocumentInfo(); | 207 WebDocumentInfo info = new WebDocumentInfo(); |
191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); | 208 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
192 DomConverter converter = new DomConverter(documentBuilder); | 209 DomConverter converter = new DomConverter(documentBuilder); |
193 Element walkerRoot = getArticleElement(documentElement); | 210 Element walkerRoot = getArticleElement(documentElement); |
194 if (walkerRoot == null) { | 211 if (walkerRoot == null) { |
195 walkerRoot = documentElement; | 212 walkerRoot = documentElement; |
196 } | 213 } |
(...skipping 11 matching lines...) Expand all Loading... | |
208 * | 225 * |
209 * @param document the WebDocument representation of the page extracted from the DOM. | 226 * @param document the WebDocument representation of the page extracted from the DOM. |
210 */ | 227 */ |
211 private void processDocument(WebDocument document) { | 228 private void processDocument(WebDocument document) { |
212 TextDocument textDocument = document.createTextDocumentView(); | 229 TextDocument textDocument = document.createTextDocumentView(); |
213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 230 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument)); | 231 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument)); |
215 textDocument.applyToModel(); | 232 textDocument.applyToModel(); |
216 } | 233 } |
217 } | 234 } |
OLD | NEW |