Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(823)

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1411603004: Discard hidden articles when using fast path (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: nit fixed 4 Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import org.chromium.distiller.document.TextDocument; 7 import org.chromium.distiller.document.TextDocument;
8 import org.chromium.distiller.document.TextDocumentStatistics; 8 import org.chromium.distiller.document.TextDocumentStatistics;
9 import org.chromium.distiller.extractors.ArticleExtractor; 9 import org.chromium.distiller.extractors.ArticleExtractor;
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
(...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after
155 155
156 /** 156 /**
157 * Get a list of the content image URLs in the provided document. 157 * Get a list of the content image URLs in the provided document.
158 * @return A list of image URLs. 158 * @return A list of image URLs.
159 */ 159 */
160 public List<String> getImageUrls() { 160 public List<String> getImageUrls() {
161 return imageUrls; 161 return imageUrls;
162 } 162 }
163 163
164 /** 164 /**
165 * Get the element of the main article, if any.
166 * @return An element of article (not necessarily the html5 article element).
167 */
168 private Element getArticleElement(Element root) {
169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
170 // Having multiple article elements usually indicates a bad case for this shortcut.
171 // TODO(wychen): some sites exclude things like title and author in article element.
172 if (allArticles.getLength() == 1) {
173 return allArticles.getItem(0);
174 }
175 // Note that the CSS property matching is case sensitive, and "Article" is the correct
176 // capitalization.
177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
178 allArticles = DomUtil.querySelectorAll(root, query);
179 // It is commonly seen that the article is wrapped separately or in multiple layers.
180 if (allArticles.getLength() > 0) {
181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
182 }
183 return null;
184 }
185
186 /**
187 * Converts the original HTML page into a WebDocument for analysis. 165 * Converts the original HTML page into a WebDocument for analysis.
188 */ 166 */
189 private WebDocumentInfo createWebDocumentInfoFromPage() { 167 private WebDocumentInfo createWebDocumentInfoFromPage() {
190 WebDocumentInfo info = new WebDocumentInfo(); 168 WebDocumentInfo info = new WebDocumentInfo();
191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); 169 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
192 DomConverter converter = new DomConverter(documentBuilder); 170 DomConverter converter = new DomConverter(documentBuilder);
193 Element walkerRoot = getArticleElement(documentElement); 171 Element walkerRoot = DomUtil.getArticleElement(documentElement);
194 if (walkerRoot == null) { 172 if (walkerRoot == null) {
195 walkerRoot = documentElement; 173 walkerRoot = documentElement;
196 } 174 }
197 new DomWalker(converter).walk(walkerRoot); 175 new DomWalker(converter).walk(walkerRoot);
198 info.document = documentBuilder.toWebDocument(); 176 info.document = documentBuilder.toWebDocument();
199 ensureTitleInitialized(); 177 ensureTitleInitialized();
200 info.hiddenElements = converter.getHiddenElements(); 178 info.hiddenElements = converter.getHiddenElements();
201 179
202 return info; 180 return info;
203 } 181 }
204 182
205 /** 183 /**
206 * Implements the actual analysis of the page content, identifying the core elements of the 184 * Implements the actual analysis of the page content, identifying the core elements of the
207 * page. 185 * page.
208 * 186 *
209 * @param document the WebDocument representation of the page extracted from the DOM. 187 * @param document the WebDocument representation of the page extracted from the DOM.
210 */ 188 */
211 private void processDocument(WebDocument document) { 189 private void processDocument(WebDocument document) {
212 TextDocument textDocument = document.createTextDocumentView(); 190 TextDocument textDocument = document.createTextDocumentView();
213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); 191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); 192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
215 textDocument.applyToModel(); 193 textDocument.applyToModel();
216 } 194 }
217 } 195 }
OLDNEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698