src/com/dom_distiller/client/ContentExtractor.java - Issue 499623002: Instrument DomDistiller with timing information.

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 499623002: Instrument DomDistiller with timing information. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package com.dom_distiller.client;	5 package com.dom_distiller.client;

6	6

	7 import com.dom_distiller.proto.DomDistillerProtos;

	8 import com.dom_distiller.proto.DomDistillerProtos.TimingInfo;

7 import com.google.gwt.dom.client.AnchorElement;	9 import com.google.gwt.dom.client.AnchorElement;

8 import com.google.gwt.dom.client.Document;	10 import com.google.gwt.dom.client.Document;

9 import com.google.gwt.dom.client.Element;	11 import com.google.gwt.dom.client.Element;

10 import com.google.gwt.dom.client.Node;	12 import com.google.gwt.dom.client.Node;

11 import com.google.gwt.dom.client.NodeList;	13 import com.google.gwt.dom.client.NodeList;

12 import com.google.gwt.dom.client.VideoElement;	14 import com.google.gwt.dom.client.VideoElement;

13	15

14 import de.l3s.boilerpipe.BoilerpipeProcessingException;	16 import de.l3s.boilerpipe.BoilerpipeProcessingException;

15 import de.l3s.boilerpipe.document.TextBlock;	17 import de.l3s.boilerpipe.document.TextBlock;

16 import de.l3s.boilerpipe.document.TextDocument;	18 import de.l3s.boilerpipe.document.TextDocument;

17 import de.l3s.boilerpipe.extractors.CommonExtractors;	19 import de.l3s.boilerpipe.extractors.CommonExtractors;

18 import de.l3s.boilerpipe.labels.DefaultLabels;	20 import de.l3s.boilerpipe.labels.DefaultLabels;

19 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;	21 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;

20	22

21 import java.util.ArrayList;	23 import java.util.ArrayList;

22 import java.util.LinkedList;	24 import java.util.LinkedList;

23 import java.util.List;	25 import java.util.List;

24 import java.util.logging.Logger;

25	26

26 public class ContentExtractor {	27 public class ContentExtractor {

27 static Logger logger = Logger.getLogger("DomDistiller");	28 private final Element documentElement;

28	29

29 private final Element documentElement;	30 private final List<String> candidateTitles;

	31

	32 private final TimingInfo mTimingInfo;

30	33

31 private final MarkupParser parser;	34 private final MarkupParser parser;

32	35

33 private final List<String> candidateTitles;

34

35 public ContentExtractor(Element root) {	36 public ContentExtractor(Element root) {

36 this.documentElement = root;	37 this.documentElement = root;

	38 this.candidateTitles = new LinkedList<String>();

	39 this.mTimingInfo = DomDistillerProtos.TimingInfo.create();

	40

	41 double startTime = DomUtil.getTime();

37 this.parser = new MarkupParser(root);	42 this.parser = new MarkupParser(root);

38 this.candidateTitles = new LinkedList<String>();	43 this.mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);

39 }	44 }

40	45

41 // Grabs a list of candidate titles in descending priority order:	46 // Grabs a list of candidate titles in descending priority order:

42 // 1) meta-information	47 // 1) meta-information

43 // 2) The document's title element, modified based on some readability heuristics	48 // 2) The document's title element, modified based on some readability heuristics

44 // 3) The document's title element, if it's a string	49 // 3) The document's title element, if it's a string

45 private void ensureTitleInitialized() {	50 private void ensureTitleInitialized() {

46 if (candidateTitles.size() > 0) return;	51 if (candidateTitles.size() > 0) return;

47	52

48 String title = parser.getTitle();	53 String title = parser.getTitle();

(...skipping 13 matching lines...) Expand all Loading...
62 ensureTitleInitialized();	67 ensureTitleInitialized();

63 assert candidateTitles.size() > 0;	68 assert candidateTitles.size() > 0;

64 return candidateTitles.get(0);	69 return candidateTitles.get(0);

65 }	70 }

66	71

67 public String extractContent() {	72 public String extractContent() {

68 return extractContent(false);	73 return extractContent(false);

69 }	74 }

70	75

71 public String extractContent(boolean textOnly) {	76 public String extractContent(boolean textOnly) {

	77 double now = DomUtil.getTime();

	78 TextDocument document = createTextBlocksFromPage();

	79 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

	80

	81 now = DomUtil.getTime();

	82 List<Node> contentNodes = processTextBlocks(document);

	83 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);

	84

	85 if (contentNodes.isEmpty()) {

	86 return "";

	87 }

	88

	89 now = DomUtil.getTime();

	90 String html = formatExtractedNodes(textOnly, contentNodes);

	91 mTimingInfo.setFormattingTime(DomUtil.getTime() - now);

	92 return html;

	93 }

	94

	95

	96 /**

	97 * Returns timing information about the most recent extraction run.

	98 * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics.

	99 */

	100 public TimingInfo getTimingInfo() {

	101 return mTimingInfo;

	102 }

	103

	104 /**

	105 * Converts the original HTML page into a series of TextBlock for analysis.

	106 * @return a document with the list of extracted TextBlocks and additional information

	107 * that can be useful for identifying the core elements of the page.

	108 */

	109 private TextDocument createTextBlocksFromPage() {

72 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl er();	110 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl er();

73

74 htmlParser.startDocument();	111 htmlParser.startDocument();

75 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser);	112 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser);

76 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS axVisitor);	113 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS axVisitor);

77 new DomWalker(filteringDomVisitor).walk(documentElement);	114 new DomWalker(filteringDomVisitor).walk(documentElement);

78 htmlParser.endDocument();	115 htmlParser.endDocument();

79

80 TextDocument document = htmlParser.toTextDocument();	116 TextDocument document = htmlParser.toTextDocument();

81 ensureTitleInitialized();	117 ensureTitleInitialized();

	118 document.setCandidateTitles(candidateTitles);

	119 document.setHiddenElements(filteringDomVisitor.getHiddenElements());

	120 document.setDataTables(filteringDomVisitor.getDataTables());

	121 return document;

	122 }

82	123

83 document.setCanddiateTitles(candidateTitles);	124 /**

	125 * Implements the actual analysis of the page content, identifying the core elements of the

	126 * page.

	127 * @param document the TextBlock representation of the page extracted from the DOM.

	128 * @return a list of DOM nodes from the original document that contain the core elements of the

	129 * page.

	130 */

	131 private List<Node> processTextBlocks(TextDocument document) {

84 try {	132 try {

85 CommonExtractors.ARTICLE_EXTRACTOR.process(document);	133 CommonExtractors.ARTICLE_EXTRACTOR.process(document);

86 } catch (BoilerpipeProcessingException e) {	134 } catch (BoilerpipeProcessingException e) {

87 logger.warning("Processing failed.");	135 LogUtil.logToConsole("DomDistiller Processing failed: " + e);

88 return "";	136 return new LinkedList<Node>();

89 }	137 }

90	138

91

92 List<Node> contentNodes = getContentNodesForTextDocument(document);	139 List<Node> contentNodes = getContentNodesForTextDocument(document);

93	140

94 List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAd dElements(	141 List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAd dElements(

95 contentNodes, filteringDomVisitor.getHiddenElements(),	142 contentNodes, document.getHiddenElements(),

96 filteringDomVisitor.getDataTables(), Document.get().getDocumentE lement());	143 document.getDataTables(), Document.get().getDocumentElement());

	144 return contentAndRelevantElements;

	145 }

97	146

98 if (contentAndRelevantElements.isEmpty()) {	147 /**

99 return "";	148 * Creates a new minimal HTML document containing copies of the DOM nodes id entified as the

100 }	149 * core elements of the page. Some additional re-formatting hints may be included in the new

101	150 * document.

102 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements) .cloneSubtree();	151 *

103	152 * @param textOnly indicates whether to simply return the aggregated text content instead of

	153 * HTML

	154 * @param contentNodes the DOM nodes containing text to be included in the final document.

	155 * @return A HTML or text document which includes the aggregated content of the provided HTML

	156 * nodes.

	157 */

	158 private String formatExtractedNodes(boolean textOnly, List<Node> contentNode s) {

	159 Node clonedSubtree = NodeListExpander.expand(contentNodes).cloneSubtree( );

104 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {	160 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {

105 return "";	161 return "";

106 }	162 }

107	163

108 // The base URL in the distilled page viewer is different from that in	164 // The base URL in the distilled page viewer is different from that in

109 // the live page. This breaks all relative links (in anchors,	165 // the live page. This breaks all relative links (in anchors,

110 // images, etc.), so make them absolute in the distilled content.	166 // images, etc.), so make them absolute in the distilled content.

111 makeAllLinksAbsolute(clonedSubtree);	167 makeAllLinksAbsolute(clonedSubtree);

112

113 if (textOnly) {	168 if (textOnly) {

114 return getTextFromTree(clonedSubtree);	169 return getTextFromTree(clonedSubtree);

115 }	170 }

116	171

117 // TODO(cjhopman): this discards the top element and just returns its children. This might	172 // TODO(cjhopman): this discards the top element and just returns its children. This might

118 // break in some cases.	173 // break in some cases.

119 return Element.as(clonedSubtree).getInnerHTML();	174 return Element.as(clonedSubtree).getInnerHTML();

120 }	175 }

121	176

122 /**	177 /**

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
194 return;	249 return;

195 }	250 }

196 var elementsWithSrc = root.querySelectorAll('img,source,track,video');	251 var elementsWithSrc = root.querySelectorAll('img,source,track,video');

197 for (var key in elementsWithSrc) {	252 for (var key in elementsWithSrc) {

198 if (elementsWithSrc[key].src) {	253 if (elementsWithSrc[key].src) {

199 elementsWithSrc[key].src = elementsWithSrc[key].src;	254 elementsWithSrc[key].src = elementsWithSrc[key].src;

200 }	255 }

201 }	256 }

202 }-*/;	257 }-*/;

203 }	258 }

OLD	NEW

« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »