Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 499623002: Instrument DomDistiller with timing information. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import com.dom_distiller.proto.DomDistillerProtos;
8 import com.dom_distiller.proto.DomDistillerProtos.TimingInfo;
7 import com.google.gwt.dom.client.AnchorElement; 9 import com.google.gwt.dom.client.AnchorElement;
8 import com.google.gwt.dom.client.Document; 10 import com.google.gwt.dom.client.Document;
9 import com.google.gwt.dom.client.Element; 11 import com.google.gwt.dom.client.Element;
10 import com.google.gwt.dom.client.Node; 12 import com.google.gwt.dom.client.Node;
11 import com.google.gwt.dom.client.NodeList; 13 import com.google.gwt.dom.client.NodeList;
12 import com.google.gwt.dom.client.VideoElement; 14 import com.google.gwt.dom.client.VideoElement;
13 15
14 import de.l3s.boilerpipe.BoilerpipeProcessingException; 16 import de.l3s.boilerpipe.BoilerpipeProcessingException;
15 import de.l3s.boilerpipe.document.TextBlock; 17 import de.l3s.boilerpipe.document.TextBlock;
16 import de.l3s.boilerpipe.document.TextDocument; 18 import de.l3s.boilerpipe.document.TextDocument;
17 import de.l3s.boilerpipe.extractors.CommonExtractors; 19 import de.l3s.boilerpipe.extractors.CommonExtractors;
18 import de.l3s.boilerpipe.labels.DefaultLabels; 20 import de.l3s.boilerpipe.labels.DefaultLabels;
19 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; 21 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
20 22
21 import java.util.ArrayList; 23 import java.util.ArrayList;
22 import java.util.LinkedList; 24 import java.util.LinkedList;
23 import java.util.List; 25 import java.util.List;
24 import java.util.logging.Logger;
25 26
26 public class ContentExtractor { 27 public class ContentExtractor {
27 static Logger logger = Logger.getLogger("DomDistiller"); 28 private final Element documentElement;
28 29
29 private final Element documentElement; 30 private final List<String> candidateTitles;
31
32 private final TimingInfo mTimingInfo;
30 33
31 private final MarkupParser parser; 34 private final MarkupParser parser;
32 35
33 private final List<String> candidateTitles;
34
35 public ContentExtractor(Element root) { 36 public ContentExtractor(Element root) {
36 this.documentElement = root; 37 this.documentElement = root;
38 this.candidateTitles = new LinkedList<String>();
39 this.mTimingInfo = DomDistillerProtos.TimingInfo.create();
40
41 double startTime = DomUtil.getTime();
37 this.parser = new MarkupParser(root); 42 this.parser = new MarkupParser(root);
38 this.candidateTitles = new LinkedList<String>(); 43 this.mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);
39 } 44 }
40 45
41 // Grabs a list of candidate titles in descending priority order: 46 // Grabs a list of candidate titles in descending priority order:
42 // 1) meta-information 47 // 1) meta-information
43 // 2) The document's title element, modified based on some readability heuristics 48 // 2) The document's title element, modified based on some readability heuristics
44 // 3) The document's title element, if it's a string 49 // 3) The document's title element, if it's a string
45 private void ensureTitleInitialized() { 50 private void ensureTitleInitialized() {
46 if (candidateTitles.size() > 0) return; 51 if (candidateTitles.size() > 0) return;
47 52
48 String title = parser.getTitle(); 53 String title = parser.getTitle();
(...skipping 13 matching lines...) Expand all
62 ensureTitleInitialized(); 67 ensureTitleInitialized();
63 assert candidateTitles.size() > 0; 68 assert candidateTitles.size() > 0;
64 return candidateTitles.get(0); 69 return candidateTitles.get(0);
65 } 70 }
66 71
67 public String extractContent() { 72 public String extractContent() {
68 return extractContent(false); 73 return extractContent(false);
69 } 74 }
70 75
71 public String extractContent(boolean textOnly) { 76 public String extractContent(boolean textOnly) {
77 double now = DomUtil.getTime();
78 TextDocument document = createTextBlocksFromPage();
79 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);
80
81 now = DomUtil.getTime();
82 List<Node> contentNodes = processTextBlocks(document);
83 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);
84
85 if (contentNodes.isEmpty()) {
86 return "";
87 }
88
89 now = DomUtil.getTime();
90 String html = formatExtractedNodes(textOnly, contentNodes);
91 mTimingInfo.setFormattingTime(DomUtil.getTime() - now);
92 return html;
93 }
94
95
96 /**
97 * Returns timing information about the most recent extraction run.
98 * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics.
99 */
100 public TimingInfo getTimingInfo() {
101 return mTimingInfo;
102 }
103
104 /**
105 * Converts the original HTML page into a series of TextBlock for analysis.
106 * @return a document with the list of extracted TextBlocks and additional information
107 * that can be useful for identifying the core elements of the page.
108 */
109 private TextDocument createTextBlocksFromPage() {
72 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl er(); 110 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl er();
73
74 htmlParser.startDocument(); 111 htmlParser.startDocument();
75 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser); 112 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser);
76 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS axVisitor); 113 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS axVisitor);
77 new DomWalker(filteringDomVisitor).walk(documentElement); 114 new DomWalker(filteringDomVisitor).walk(documentElement);
78 htmlParser.endDocument(); 115 htmlParser.endDocument();
79
80 TextDocument document = htmlParser.toTextDocument(); 116 TextDocument document = htmlParser.toTextDocument();
81 ensureTitleInitialized(); 117 ensureTitleInitialized();
118 document.setCandidateTitles(candidateTitles);
119 document.setHiddenElements(filteringDomVisitor.getHiddenElements());
120 document.setDataTables(filteringDomVisitor.getDataTables());
121 return document;
122 }
82 123
83 document.setCanddiateTitles(candidateTitles); 124 /**
125 * Implements the actual analysis of the page content, identifying the core elements of the
126 * page.
127 * @param document the TextBlock representation of the page extracted from the DOM.
128 * @return a list of DOM nodes from the original document that contain the core elements of the
129 * page.
130 */
131 private List<Node> processTextBlocks(TextDocument document) {
84 try { 132 try {
85 CommonExtractors.ARTICLE_EXTRACTOR.process(document); 133 CommonExtractors.ARTICLE_EXTRACTOR.process(document);
86 } catch (BoilerpipeProcessingException e) { 134 } catch (BoilerpipeProcessingException e) {
87 logger.warning("Processing failed."); 135 LogUtil.logToConsole("DomDistiller Processing failed: " + e);
88 return ""; 136 return new LinkedList<Node>();
89 } 137 }
90 138
91
92 List<Node> contentNodes = getContentNodesForTextDocument(document); 139 List<Node> contentNodes = getContentNodesForTextDocument(document);
93 140
94 List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAd dElements( 141 List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAd dElements(
95 contentNodes, filteringDomVisitor.getHiddenElements(), 142 contentNodes, document.getHiddenElements(),
96 filteringDomVisitor.getDataTables(), Document.get().getDocumentE lement()); 143 document.getDataTables(), Document.get().getDocumentElement());
144 return contentAndRelevantElements;
145 }
97 146
98 if (contentAndRelevantElements.isEmpty()) { 147 /**
99 return ""; 148 * Creates a new minimal HTML document containing copies of the DOM nodes id entified as the
100 } 149 * core elements of the page. Some additional re-formatting hints may be inc luded in the new
101 150 * document.
102 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements) .cloneSubtree(); 151 *
103 152 * @param textOnly indicates whether to simply return the aggregated text content instead of
153 * HTML
154 * @param contentNodes the DOM nodes containing text to be included in the final document.
155 * @return An HTML or text document which includes the aggregated content of the provided HTML
156 * nodes.
157 */
158 private String formatExtractedNodes(boolean textOnly, List<Node> contentNode s) {
159 Node clonedSubtree = NodeListExpander.expand(contentNodes).cloneSubtree( );
104 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { 160 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {
105 return ""; 161 return "";
106 } 162 }
107 163
108 // The base URL in the distilled page viewer is different from that in 164 // The base URL in the distilled page viewer is different from that in
109 // the live page. This breaks all relative links (in anchors, 165 // the live page. This breaks all relative links (in anchors,
110 // images, etc.), so make them absolute in the distilled content. 166 // images, etc.), so make them absolute in the distilled content.
111 makeAllLinksAbsolute(clonedSubtree); 167 makeAllLinksAbsolute(clonedSubtree);
112
113 if (textOnly) { 168 if (textOnly) {
114 return getTextFromTree(clonedSubtree); 169 return getTextFromTree(clonedSubtree);
115 } 170 }
116 171
117 // TODO(cjhopman): this discards the top element and just returns its children. This might 172 // TODO(cjhopman): this discards the top element and just returns its children. This might
118 // break in some cases. 173 // break in some cases.
119 return Element.as(clonedSubtree).getInnerHTML(); 174 return Element.as(clonedSubtree).getInnerHTML();
120 } 175 }
121 176
122 /** 177 /**
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
194 return; 249 return;
195 } 250 }
196 var elementsWithSrc = root.querySelectorAll('img,source,track,video'); 251 var elementsWithSrc = root.querySelectorAll('img,source,track,video');
197 for (var key in elementsWithSrc) { 252 for (var key in elementsWithSrc) {
198 if (elementsWithSrc[key].src) { 253 if (elementsWithSrc[key].src) {
199 elementsWithSrc[key].src = elementsWithSrc[key].src; 254 elementsWithSrc[key].src = elementsWithSrc[key].src;
200 } 255 }
201 } 256 }
202 }-*/; 257 }-*/;
203 } 258 }
OLDNEW
« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698