java/org/chromium/distiller/ContentExtractor.java - Issue 2203563002: Extract image URLs in srcset as well

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 2203563002: Extract image URLs in srcset as well (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: format Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import org.chromium.distiller.document.TextDocument;	7 import org.chromium.distiller.document.TextDocument;

8 import org.chromium.distiller.document.TextDocumentStatistics;	8 import org.chromium.distiller.document.TextDocumentStatistics;

9 import org.chromium.distiller.extractors.ArticleExtractor;	9 import org.chromium.distiller.extractors.ArticleExtractor;

10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;	10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;

(...skipping 16 matching lines...) Expand all Loading...
27 import java.util.LinkedList;	27 import java.util.LinkedList;

28 import java.util.List;	28 import java.util.List;

29 import java.util.Set;	29 import java.util.Set;

30	30

31 public class ContentExtractor {	31 public class ContentExtractor {

32 private final Element documentElement;	32 private final Element documentElement;

33 private final List<String> candidateTitles;	33 private final List<String> candidateTitles;

34 private final TimingInfo mTimingInfo;	34 private final TimingInfo mTimingInfo;

35 private final StatisticsInfo mStatisticsInfo;	35 private final StatisticsInfo mStatisticsInfo;

36 private final MarkupParser parser;	36 private final MarkupParser parser;

37 private final List<String> imageUrls;	37 private List<String> imageUrls;

38 private String textDirection;	38 private String textDirection;

39	39

40 private class WebDocumentInfo {	40 private class WebDocumentInfo {

41 WebDocument document;	41 WebDocument document;

42 Set<Node> hiddenElements;	42 Set<Node> hiddenElements;

43 }	43 }

44	44

45 public ContentExtractor(Element root) {	45 public ContentExtractor(Element root) {

46 documentElement = root;	46 documentElement = root;

47 candidateTitles = new LinkedList<String>();	47 candidateTitles = new LinkedList<String>();

48 mTimingInfo = TimingInfo.create();	48 mTimingInfo = TimingInfo.create();

49 mStatisticsInfo = StatisticsInfo.create();	49 mStatisticsInfo = StatisticsInfo.create();

50 imageUrls = new ArrayList<String>();

51	50

52 double startTime = DomUtil.getTime();	51 double startTime = DomUtil.getTime();

53 parser = new MarkupParser(root, mTimingInfo);	52 parser = new MarkupParser(root, mTimingInfo);

54 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);	53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);

55 textDirection = "";	54 textDirection = "";

56 }	55 }

57	56

58 // Grabs a list of candidate titles in descending priority order:	57 // Grabs a list of candidate titles in descending priority order:

59 // 1) meta-information	58 // 1) meta-information

60 // 2) The document's title element, modified based on some readability heuristics	59 // 2) The document's title element, modified based on some readability heuristics

(...skipping 28 matching lines...) Expand all Loading...
89 double now = DomUtil.getTime();	88 double now = DomUtil.getTime();

90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();	89 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();

91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);	90 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

92	91

93 now = DomUtil.getTime();	92 now = DomUtil.getTime();

94 processDocument(documentInfo.document);	93 processDocument(documentInfo.document);

95 RelevantElements.process(documentInfo.document);	94 RelevantElements.process(documentInfo.document);

96 LeadImageFinder.process(documentInfo.document);	95 LeadImageFinder.process(documentInfo.document);

97 NestedElementRetainer.process(documentInfo.document);	96 NestedElementRetainer.process(documentInfo.document);

98	97

99 List<WebImage> images = documentInfo.document.getContentImages();

100 for (WebImage wi : images) {

101 imageUrls.add(wi.getSrc());

102 }

103 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);	98 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);

104	99

105 now = DomUtil.getTime();	100 now = DomUtil.getTime();

106 String html = documentInfo.document.generateOutput(textOnly);	101 String html = documentInfo.document.generateOutput(textOnly);

107 mTimingInfo.setFormattingTime(DomUtil.getTime() - now);	102 mTimingInfo.setFormattingTime(DomUtil.getTime() - now);

108	103

	104 imageUrls = documentInfo.document.getImageUrls();

	105

109 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {	106 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {

110 for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {	107 for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {

111 TimingEntry entry = mTimingInfo.getOtherTimes(i);	108 TimingEntry entry = mTimingInfo.getOtherTimes(i);

112 LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime());	109 LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime());

113 }	110 }

114	111

115 LogUtil.logToConsole(	112 LogUtil.logToConsole(

116 "Timing: MarkupParsingTime = " +	113 "Timing: MarkupParsingTime = " +

117 mTimingInfo.getMarkupParsingTime() +	114 mTimingInfo.getMarkupParsingTime() +

118 "\nTiming: DocumentConstructionTime = " +	115 "\nTiming: DocumentConstructionTime = " +

(...skipping 67 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
186 *	183 *

187 * @param document the WebDocument representation of the page extracted from the DOM.	184 * @param document the WebDocument representation of the page extracted from the DOM.

188 */	185 */

189 private void processDocument(WebDocument document) {	186 private void processDocument(WebDocument document) {

190 TextDocument textDocument = document.createTextDocumentView();	187 TextDocument textDocument = document.createTextDocumentView();

191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);	188 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);

192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));	189 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));

193 textDocument.applyToModel();	190 textDocument.applyToModel();

194 }	191 }

195 }	192 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/DocumentTitleGetter.java » ('j') | no next file with comments »