Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(92)

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1411603004: Discard hidden articles when using fast path (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: Fixed inconsistent indentation Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import org.chromium.distiller.document.TextDocument; 7 import org.chromium.distiller.document.TextDocument;
8 import org.chromium.distiller.document.TextDocumentStatistics; 8 import org.chromium.distiller.document.TextDocumentStatistics;
9 import org.chromium.distiller.extractors.ArticleExtractor; 9 import org.chromium.distiller.extractors.ArticleExtractor;
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after
160 public List<String> getImageUrls() { 160 public List<String> getImageUrls() {
161 return imageUrls; 161 return imageUrls;
162 } 162 }
163 163
164 /** 164 /**
165 * Get the element of the main article, if any. 165 * Get the element of the main article, if any.
166 * @return An element of article (not necessarily the html5 article element). 166 * @return An element of article (not necessarily the html5 article element).
167 */ 167 */
168 private Element getArticleElement(Element root) { 168 private Element getArticleElement(Element root) {
169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); 169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
170 List<Element> visibleElements = getVisibleElements(allArticles);
170 // Having multiple article elements usually indicates a bad case for this shortcut. 171 // Having multiple article elements usually indicates a bad case for this shortcut.
171 // TODO(wychen): some sites exclude things like title and author in article element. 172 // TODO(wychen): some sites exclude things like title and author in article element.
172 if (allArticles.getLength() == 1) { 173 if (visibleElements.size() == 1) {
173 return allArticles.getItem(0); 174 return visibleElements.get(0);
174 } 175 }
175 // Note that the CSS property matching is case sensitive, and "Article" is the correct 176 // Note that the CSS property matching is case sensitive, and "Article" is the correct
176 // capitalization. 177 // capitalization.
177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; 178 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
178 allArticles = DomUtil.querySelectorAll(root, query); 179 allArticles = DomUtil.querySelectorAll(root, query);
180 visibleElements = getVisibleElements(allArticles);
179 // It is commonly seen that the article is wrapped separately or in multiple layers. 181 // It is commonly seen that the article is wrapped separately or in multiple layers.
180 if (allArticles.getLength() > 0) { 182 if (visibleElements.size() > 0) {
181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); 183 return Element.as(DomUtil.getNearestCommonAncestor(visibleElements));
182 } 184 }
183 return null; 185 return null;
184 } 186 }
185 187
186 /** 188 /**
189 * Get a list of visible elements.
190 * @return A list of visible elements.
191 */
192 private List<Element> getVisibleElements(NodeList<Element> nodeList) {
193 List<Element> visibleElements = new ArrayList<>();
194 for (int i = 0; i < nodeList.getLength(); i ++) {
mdjones 2015/10/21 18:15:02 nit: i++
195 Element element = nodeList.getItem(i);
196 if (DomUtil.isVisible(element)) {
197 visibleElements.add(element);
198 }
199 }
200 return visibleElements;
201 }
202
203 /**
187 * Converts the original HTML page into a WebDocument for analysis. 204 * Converts the original HTML page into a WebDocument for analysis.
188 */ 205 */
189 private WebDocumentInfo createWebDocumentInfoFromPage() { 206 private WebDocumentInfo createWebDocumentInfoFromPage() {
190 WebDocumentInfo info = new WebDocumentInfo(); 207 WebDocumentInfo info = new WebDocumentInfo();
191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); 208 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
192 DomConverter converter = new DomConverter(documentBuilder); 209 DomConverter converter = new DomConverter(documentBuilder);
193 Element walkerRoot = getArticleElement(documentElement); 210 Element walkerRoot = getArticleElement(documentElement);
194 if (walkerRoot == null) { 211 if (walkerRoot == null) {
195 walkerRoot = documentElement; 212 walkerRoot = documentElement;
196 } 213 }
(...skipping 11 matching lines...) Expand all
208 * 225 *
209 * @param document the WebDocument representation of the page extracted from the DOM. 226 * @param document the WebDocument representation of the page extracted from the DOM.
210 */ 227 */
211 private void processDocument(WebDocument document) { 228 private void processDocument(WebDocument document) {
212 TextDocument textDocument = document.createTextDocumentView(); 229 TextDocument textDocument = document.createTextDocumentView();
213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); 230 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); 231 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
215 textDocument.applyToModel(); 232 textDocument.applyToModel();
216 } 233 }
217 } 234 }
OLDNEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')

Powered by Google App Engine
This is Rietveld 408576698