java/org/chromium/distiller/ContentExtractor.java - Issue 1411603004: Discard hidden articles when using fast path

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1411603004: Discard hidden articles when using fast path (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: Comments addressed Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import org.chromium.distiller.document.TextDocument;	7 import org.chromium.distiller.document.TextDocument;

8 import org.chromium.distiller.document.TextDocumentStatistics;	8 import org.chromium.distiller.document.TextDocumentStatistics;

9 import org.chromium.distiller.extractors.ArticleExtractor;	9 import org.chromium.distiller.extractors.ArticleExtractor;

10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;	10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;

(...skipping 149 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
160 public List<String> getImageUrls() {	160 public List<String> getImageUrls() {

161 return imageUrls;	161 return imageUrls;

162 }	162 }

163	163

164 /**	164 /**

165 * Get the element of the main article, if any.	165 * Get the element of the main article, if any.

166 * @return An element of article (not necessarily the html5 article element).	166 * @return An element of article (not necessarily the html5 article element).

167 */	167 */

168 private Element getArticleElement(Element root) {	168 private Element getArticleElement(Element root) {

169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");	169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");

	170 List<Element> visibleElements = getVisibleElements(allArticles);

170 // Having multiple article elements usually indicates a bad case for this shortcut.	171 // Having multiple article elements usually indicates a bad case for this shortcut.

171 // TODO(wychen): some sites exclude things like title and author in article element.	172 // TODO(wychen): some sites exclude things like title and author in article element.

172 if (allArticles.getLength() == 1) {	173 if (visibleElements.size() == 1) {

173 return allArticles.getItem(0);	174 return visibleElements.get(0);

174 }	175 }

175 // Note that the CSS property matching is case sensitive, and "Article" is the correct	176 // Note that the CSS property matching is case sensitive, and "Article" is the correct

176 // capitalization.	177 // capitalization.

177 String query = "[itemscope][itemtype=\"Article\"],[itemscope][itemtype=\"Post\"]";	178 String query = "[itemscope][itemtype=\"Article\"],[itemscope][itemtype=\"Post\"]";

178 allArticles = DomUtil.querySelectorAll(root, query);	179 allArticles = DomUtil.querySelectorAll(root, query);

	180 visibleElements = getVisibleElements(allArticles);

179 // It is commonly seen that the article is wrapped separately or in multiple layers.	181 // It is commonly seen that the article is wrapped separately or in multiple layers.

180 if (allArticles.getLength() > 0) {	182 if (visibleElements.size() > 0) {

181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles));	183 return Element.as(DomUtil.getNearestCommonAncestor(visibleElements));

182 }	184 }

183 return null;	185 return null;

184 }	186 }

185	187

186 /**	188 /**

	189 * Get a list of visible elements.

	190 * @return A list of visible elements.

	191 */

	192 private List<Element> getVisibleElements(NodeList<Element> nodeList) {

	193 List<Element> visibleElements = new ArrayList<>();

	194 for (int i = 0; i < nodeList.getLength(); i++) {

	195 Element element = nodeList.getItem(i);

	196 if (DomUtil.isVisible(element)) {

	197 visibleElements.add(element);

	198 }

	199 }

	200 return visibleElements;

	201 }

	202

	203 /**

187 * Converts the original HTML page into a WebDocument for analysis.	204 * Converts the original HTML page into a WebDocument for analysis.

188 */	205 */

189 private WebDocumentInfo createWebDocumentInfoFromPage() {	206 private WebDocumentInfo createWebDocumentInfoFromPage() {

190 WebDocumentInfo info = new WebDocumentInfo();	207 WebDocumentInfo info = new WebDocumentInfo();

191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();	208 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();

192 DomConverter converter = new DomConverter(documentBuilder);	209 DomConverter converter = new DomConverter(documentBuilder);

193 Element walkerRoot = getArticleElement(documentElement);	210 Element walkerRoot = getArticleElement(documentElement);

194 if (walkerRoot == null) {	211 if (walkerRoot == null) {

195 walkerRoot = documentElement;	212 walkerRoot = documentElement;

196 }	213 }

(...skipping 11 matching lines...) Expand all Loading...
208 *	225 *

209 * @param document the WebDocument representation of the page extracted from the DOM.	226 * @param document the WebDocument representation of the page extracted from the DOM.

210 */	227 */

211 private void processDocument(WebDocument document) {	228 private void processDocument(WebDocument document) {

212 TextDocument textDocument = document.createTextDocumentView();	229 TextDocument textDocument = document.createTextDocumentView();

213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);	230 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);

214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));	231 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));

215 textDocument.applyToModel();	232 textDocument.applyToModel();

216 }	233 }

217 }	234 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | javatests/org/chromium/distiller/ContentExtractorTest.java » ('J')