java/org/chromium/distiller/ContentExtractor.java - Issue 2267403008: Fix partially hidden article

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 2267403008: Fix partially hidden article (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: speed optimization Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import org.chromium.distiller.document.TextDocument;	7 import org.chromium.distiller.document.TextDocument;

8 import org.chromium.distiller.document.TextDocumentStatistics;	8 import org.chromium.distiller.document.TextDocumentStatistics;

9 import org.chromium.distiller.extractors.ArticleExtractor;	9 import org.chromium.distiller.extractors.ArticleExtractor;

10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;	10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;

(...skipping 26 matching lines...) Expand all Loading...
37 private List<String> imageUrls;	37 private List<String> imageUrls;

38 private String textDirection;	38 private String textDirection;

39	39

40 private class WebDocumentInfo {	40 private class WebDocumentInfo {

41 WebDocument document;	41 WebDocument document;

42 Set<Node> hiddenElements;	42 Set<Node> hiddenElements;

43 }	43 }

44	44

45 public ContentExtractor(Element root) {	45 public ContentExtractor(Element root) {

46 documentElement = root;	46 documentElement = root;

47 candidateTitles = new LinkedList<String>();	47 candidateTitles = new LinkedList<>();

48 mTimingInfo = TimingInfo.create();	48 mTimingInfo = TimingInfo.create();

49 mStatisticsInfo = StatisticsInfo.create();	49 mStatisticsInfo = StatisticsInfo.create();

50	50

51 double startTime = DomUtil.getTime();	51 double startTime = DomUtil.getTime();

52 parser = new MarkupParser(root, mTimingInfo);	52 parser = new MarkupParser(root, mTimingInfo);

53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);	53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);

54 textDirection = "";	54 textDirection = "";

55 }	55 }

56	56

57 // Grabs a list of candidate titles in descending priority order:	57 // Grabs a list of candidate titles in descending priority order:

(...skipping 99 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
157 public List<String> getImageUrls() {	157 public List<String> getImageUrls() {

158 return imageUrls;	158 return imageUrls;

159 }	159 }

160	160

161 /**	161 /**

162 * Converts the original HTML page into a WebDocument for analysis.	162 * Converts the original HTML page into a WebDocument for analysis.

163 */	163 */

164 private WebDocumentInfo createWebDocumentInfoFromPage() {	164 private WebDocumentInfo createWebDocumentInfoFromPage() {

165 WebDocumentInfo info = new WebDocumentInfo();	165 WebDocumentInfo info = new WebDocumentInfo();

166 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();	166 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();

	167

	168 NodeList<Element> mobileViewport = DomUtil.querySelectorAll(documentElement,

	169 "meta[name=\"viewport\"][content*=\"width=device-width\"]");

167 DomConverter converter = new DomConverter(documentBuilder);	170 DomConverter converter = new DomConverter(documentBuilder);

	171 converter.setIsMobileFriendly(mobileViewport.getLength() > 0);

	172

168 Element walkerRoot = DomUtil.getArticleElement(documentElement);	173 Element walkerRoot = DomUtil.getArticleElement(documentElement);

169 if (walkerRoot == null) {	174 converter.setHasArticleElement(walkerRoot != null);

	175

	176 if (walkerRoot != null) {

	177 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_BOILER_PIPE_PHASES)) {

	178 LogUtil.logToConsole("Extracted article element: " + walkerRoot);

	179 }

	180 } else {

170 walkerRoot = documentElement;	181 walkerRoot = documentElement;

171 }	182 }

172 new DomWalker(converter).walk(walkerRoot);	183 new DomWalker(converter).walk(walkerRoot);

173 info.document = documentBuilder.toWebDocument();	184 info.document = documentBuilder.toWebDocument();

174 ensureTitleInitialized();	185 ensureTitleInitialized();

175 info.hiddenElements = converter.getHiddenElements();	186 info.hiddenElements = converter.getHiddenElements();

176	187

177 return info;	188 return info;

178 }	189 }

179	190

180 /**	191 /**

181 * Implements the actual analysis of the page content, identifying the core elements of the	192 * Implements the actual analysis of the page content, identifying the core elements of the

182 * page.	193 * page.

183 *	194 *

184 * @param document the WebDocument representation of the page extracted from the DOM.	195 * @param document the WebDocument representation of the page extracted from the DOM.

185 */	196 */

186 private void processDocument(WebDocument document) {	197 private void processDocument(WebDocument document) {

187 TextDocument textDocument = document.createTextDocumentView();	198 TextDocument textDocument = document.createTextDocumentView();

188 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);	199 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);

189 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));	200 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));

190 textDocument.applyToModel();	201 textDocument.applyToModel();

191 }	202 }

192 }	203 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/DomConverter.java » ('j') | no next file with comments »