java/org/chromium/distiller/ContentExtractor.java - Issue 1411603004: Discard hidden articles when using fast path

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1411603004: Discard hidden articles when using fast path (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: nit fixed Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import org.chromium.distiller.document.TextDocument;	7 import org.chromium.distiller.document.TextDocument;

8 import org.chromium.distiller.document.TextDocumentStatistics;	8 import org.chromium.distiller.document.TextDocumentStatistics;

9 import org.chromium.distiller.extractors.ArticleExtractor;	9 import org.chromium.distiller.extractors.ArticleExtractor;

10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;	10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;

(...skipping 144 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
155	155

156 /**	156 /**

157 * Get a list of the content image URLs in the provided document.	157 * Get a list of the content image URLs in the provided document.

158 * @return A list of image URLs.	158 * @return A list of image URLs.

159 */	159 */

160 public List<String> getImageUrls() {	160 public List<String> getImageUrls() {

161 return imageUrls;	161 return imageUrls;

162 }	162 }

163	163

164 /**	164 /**

165 * Get the element of the main article, if any.

166 * @return An element of article (not necessarily the html5 article element).

167 */

168 private Element getArticleElement(Element root) {

169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");

170 // Having multiple article elements usually indicates a bad case for this shortcut.

171 // TODO(wychen): some sites exclude things like title and author in article element.

172 if (allArticles.getLength() == 1) {

173 return allArticles.getItem(0);

174 }

175 // Note that the CSS property matching is case sensitive, and "Article" is the correct

176 // capitalization.

177 String query = "[itemscope][itemtype=\"Article\"],[itemscope][itemtype=\"Post\"]";

178 allArticles = DomUtil.querySelectorAll(root, query);

179 // It is commonly seen that the article is wrapped separately or in multiple layers.

180 if (allArticles.getLength() > 0) {

181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles));

182 }

183 return null;

184 }

185

186 /**

187 * Converts the original HTML page into a WebDocument for analysis.	165 * Converts the original HTML page into a WebDocument for analysis.

188 */	166 */

189 private WebDocumentInfo createWebDocumentInfoFromPage() {	167 private WebDocumentInfo createWebDocumentInfoFromPage() {

190 WebDocumentInfo info = new WebDocumentInfo();	168 WebDocumentInfo info = new WebDocumentInfo();

191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();	169 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();

192 DomConverter converter = new DomConverter(documentBuilder);	170 DomConverter converter = new DomConverter(documentBuilder);

193 Element walkerRoot = getArticleElement(documentElement);	171 Element walkerRoot = DomUtil.getArticleElement(documentElement);

194 if (walkerRoot == null) {	172 if (walkerRoot == null) {

195 walkerRoot = documentElement;	173 walkerRoot = documentElement;

196 }	174 }

197 new DomWalker(converter).walk(walkerRoot);	175 new DomWalker(converter).walk(walkerRoot);

198 info.document = documentBuilder.toWebDocument();	176 info.document = documentBuilder.toWebDocument();

199 ensureTitleInitialized();	177 ensureTitleInitialized();

200 info.hiddenElements = converter.getHiddenElements();	178 info.hiddenElements = converter.getHiddenElements();

201	179

202 return info;	180 return info;

203 }	181 }

204	182

205 /**	183 /**

206 * Implements the actual analysis of the page content, identifying the core elements of the	184 * Implements the actual analysis of the page content, identifying the core elements of the

207 * page.	185 * page.

208 *	186 *

209 * @param document the WebDocument representation of the page extracted from the DOM.	187 * @param document the WebDocument representation of the page extracted from the DOM.

210 */	188 */

211 private void processDocument(WebDocument document) {	189 private void processDocument(WebDocument document) {

212 TextDocument textDocument = document.createTextDocumentView();	190 TextDocument textDocument = document.createTextDocumentView();

213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);	191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);

214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));	192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));

215 textDocument.applyToModel();	193 textDocument.applyToModel();

216 }	194 }

217 }	195 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')