Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(268)

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 2203563002: Extract image URLs in srcset as well (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: format Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/DocumentTitleGetter.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import org.chromium.distiller.document.TextDocument; 7 import org.chromium.distiller.document.TextDocument;
8 import org.chromium.distiller.document.TextDocumentStatistics; 8 import org.chromium.distiller.document.TextDocumentStatistics;
9 import org.chromium.distiller.extractors.ArticleExtractor; 9 import org.chromium.distiller.extractors.ArticleExtractor;
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
(...skipping 16 matching lines...) Expand all
27 import java.util.LinkedList; 27 import java.util.LinkedList;
28 import java.util.List; 28 import java.util.List;
29 import java.util.Set; 29 import java.util.Set;
30 30
31 public class ContentExtractor { 31 public class ContentExtractor {
32 private final Element documentElement; 32 private final Element documentElement;
33 private final List<String> candidateTitles; 33 private final List<String> candidateTitles;
34 private final TimingInfo mTimingInfo; 34 private final TimingInfo mTimingInfo;
35 private final StatisticsInfo mStatisticsInfo; 35 private final StatisticsInfo mStatisticsInfo;
36 private final MarkupParser parser; 36 private final MarkupParser parser;
37 private final List<String> imageUrls; 37 private List<String> imageUrls;
38 private String textDirection; 38 private String textDirection;
39 39
40 private class WebDocumentInfo { 40 private class WebDocumentInfo {
41 WebDocument document; 41 WebDocument document;
42 Set<Node> hiddenElements; 42 Set<Node> hiddenElements;
43 } 43 }
44 44
45 public ContentExtractor(Element root) { 45 public ContentExtractor(Element root) {
46 documentElement = root; 46 documentElement = root;
47 candidateTitles = new LinkedList<String>(); 47 candidateTitles = new LinkedList<String>();
48 mTimingInfo = TimingInfo.create(); 48 mTimingInfo = TimingInfo.create();
49 mStatisticsInfo = StatisticsInfo.create(); 49 mStatisticsInfo = StatisticsInfo.create();
50 imageUrls = new ArrayList<String>();
51 50
52 double startTime = DomUtil.getTime(); 51 double startTime = DomUtil.getTime();
53 parser = new MarkupParser(root, mTimingInfo); 52 parser = new MarkupParser(root, mTimingInfo);
54 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); 53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);
55 textDirection = ""; 54 textDirection = "";
56 } 55 }
57 56
58 // Grabs a list of candidate titles in descending priority order: 57 // Grabs a list of candidate titles in descending priority order:
59 // 1) meta-information 58 // 1) meta-information
60 // 2) The document's title element, modified based on some readability heuristics 59 // 2) The document's title element, modified based on some readability heuristics
(...skipping 28 matching lines...) Expand all
89 double now = DomUtil.getTime(); 88 double now = DomUtil.getTime();
90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); 89 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();
91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); 90 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);
92 91
93 now = DomUtil.getTime(); 92 now = DomUtil.getTime();
94 processDocument(documentInfo.document); 93 processDocument(documentInfo.document);
95 RelevantElements.process(documentInfo.document); 94 RelevantElements.process(documentInfo.document);
96 LeadImageFinder.process(documentInfo.document); 95 LeadImageFinder.process(documentInfo.document);
97 NestedElementRetainer.process(documentInfo.document); 96 NestedElementRetainer.process(documentInfo.document);
98 97
99 List<WebImage> images = documentInfo.document.getContentImages();
100 for (WebImage wi : images) {
101 imageUrls.add(wi.getSrc());
102 }
103 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); 98 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);
104 99
105 now = DomUtil.getTime(); 100 now = DomUtil.getTime();
106 String html = documentInfo.document.generateOutput(textOnly); 101 String html = documentInfo.document.generateOutput(textOnly);
107 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); 102 mTimingInfo.setFormattingTime(DomUtil.getTime() - now);
108 103
104 imageUrls = documentInfo.document.getImageUrls();
105
109 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) { 106 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {
110 for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) { 107 for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {
111 TimingEntry entry = mTimingInfo.getOtherTimes(i); 108 TimingEntry entry = mTimingInfo.getOtherTimes(i);
112 LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime()); 109 LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime());
113 } 110 }
114 111
115 LogUtil.logToConsole( 112 LogUtil.logToConsole(
116 "Timing: MarkupParsingTime = " + 113 "Timing: MarkupParsingTime = " +
117 mTimingInfo.getMarkupParsingTime() + 114 mTimingInfo.getMarkupParsingTime() +
118 "\nTiming: DocumentConstructionTime = " + 115 "\nTiming: DocumentConstructionTime = " +
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
186 * 183 *
187 * @param document the WebDocument representation of the page extracted from the DOM. 184 * @param document the WebDocument representation of the page extracted from the DOM.
188 */ 185 */
189 private void processDocument(WebDocument document) { 186 private void processDocument(WebDocument document) {
190 TextDocument textDocument = document.createTextDocumentView(); 187 TextDocument textDocument = document.createTextDocumentView();
191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); 188 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); 189 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
193 textDocument.applyToModel(); 190 textDocument.applyToModel();
194 } 191 }
195 } 192 }
OLDNEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/DocumentTitleGetter.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698