src/com/dom_distiller/client/ContentExtractor.java - Issue 499623002: Instrument DomDistiller with timing information.

Unified Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 499623002: Instrument DomDistiller with timing information. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/com/dom_distiller/client/ContentExtractor.java

diff --git a/src/com/dom_distiller/client/ContentExtractor.java b/src/com/dom_distiller/client/ContentExtractor.java

index 498d36ffb57b4ed59fe8a4078ec1b5ce59a0c774..b3151766d59f0f495b30617cd9d2f7858267b290 100644

--- a/src/com/dom_distiller/client/ContentExtractor.java

+++ b/src/com/dom_distiller/client/ContentExtractor.java

@@ -4,6 +4,8 @@

package com.dom_distiller.client;

+import com.dom_distiller.proto.DomDistillerProtos;

+import com.dom_distiller.proto.DomDistillerProtos.TimingInfo;

import com.google.gwt.dom.client.AnchorElement;

import com.google.gwt.dom.client.Document;

import com.google.gwt.dom.client.Element;

@@ -21,21 +23,24 @@ import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;

import java.util.ArrayList;

import java.util.LinkedList;

import java.util.List;

-import java.util.logging.Logger;

public class ContentExtractor {

- static Logger logger = Logger.getLogger("DomDistiller");

private final Element documentElement;

- private final MarkupParser parser;

private final List<String> candidateTitles;

+ private final TimingInfo mTimingInfo;

+ private final MarkupParser parser;

public ContentExtractor(Element root) {

this.documentElement = root;

- this.parser = new MarkupParser(root);

this.candidateTitles = new LinkedList<String>();

+ this.mTimingInfo = DomDistillerProtos.TimingInfo.create();

+ double startTime = DomUtil.getTime();

+ this.parser = new MarkupParser(root);

+ this.mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);

}

// Grabs a list of candidate titles in descending priority order:

@@ -69,38 +74,89 @@ public class ContentExtractor {

}

public String extractContent(boolean textOnly) {

- BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();

+ double now = DomUtil.getTime();

+ TextDocument document = createTextBlocksFromPage();

+ mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

+ now = DomUtil.getTime();

+ List<Node> contentNodes = processTextBlocks(document);

+ mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);

+ if (contentNodes.isEmpty()) {

+ return "";

+ }

+ now = DomUtil.getTime();

+ String html = formatExtractedNodes(textOnly, contentNodes);

+ mTimingInfo.setFormattingTime(DomUtil.getTime() - now);

+ return html;

+ }

+ /**

+ * Returns timing information about the most recent extraction run.

+ * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics.

+ */

+ public TimingInfo getTimingInfo() {

+ return mTimingInfo;

+ }

+ /**

+ * Converts the original HTML page into a series of TextBlock for analysis.

+ * @return a document with the list of extracted TextBlocks and additional information

+ * that can be useful for identifying the core elements of the page.

+ */

+ private TextDocument createTextBlocksFromPage() {

+ BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();

htmlParser.startDocument();

DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser);

FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);

new DomWalker(filteringDomVisitor).walk(documentElement);

htmlParser.endDocument();

TextDocument document = htmlParser.toTextDocument();

ensureTitleInitialized();

+ document.setCandidateTitles(candidateTitles);

+ document.setHiddenElements(filteringDomVisitor.getHiddenElements());

+ document.setDataTables(filteringDomVisitor.getDataTables());

+ return document;

+ }

- document.setCanddiateTitles(candidateTitles);

+ /**

+ * Implements the actual analysis of the page content, identifying the core elements of the

+ * page.

+ * @param document the TextBlock representation of the page extracted from the DOM.

+ * @return a list of DOM nodes from the original document that contain the core elements of the

+ * page.

+ */

+ private List<Node> processTextBlocks(TextDocument document) {

try {

CommonExtractors.ARTICLE_EXTRACTOR.process(document);

} catch (BoilerpipeProcessingException e) {

- logger.warning("Processing failed.");

- return "";

+ LogUtil.logToConsole("DomDistiller Processing failed: " + e);

+ return new LinkedList<Node>();

}

List<Node> contentNodes = getContentNodesForTextDocument(document);

List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAddElements(

- contentNodes, filteringDomVisitor.getHiddenElements(),

- filteringDomVisitor.getDataTables(), Document.get().getDocumentElement());

- if (contentAndRelevantElements.isEmpty()) {

- return "";

- }

- Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements).cloneSubtree();

+ contentNodes, document.getHiddenElements(),

+ document.getDataTables(), Document.get().getDocumentElement());

+ return contentAndRelevantElements;

+ }

+ /**

+ * Creates a new minimal HTML document containing copies of the DOM nodes identified as the

+ * core elements of the page. Some additional re-formatting hints may be included in the new

+ * document.

+ *

+ * @param textOnly indicates whether to simply return the aggregated text content instead of

+ * HTML

+ * @param contentNodes the DOM nodes containing text to be included in the final docuemnt.

+ * @return A HTML or text document which includes the aggregated content of the provided HTML

+ * nodes.

+ */

+ private String formatExtractedNodes(boolean textOnly, List<Node> contentNodes) {

+ Node clonedSubtree = NodeListExpander.expand(contentNodes).cloneSubtree();

if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {

return "";

}

@@ -109,7 +165,6 @@ public class ContentExtractor {

// the live page. This breaks all relative links (in anchors,

// images, etc.), so make them absolute in the distilled content.

makeAllLinksAbsolute(clonedSubtree);

if (textOnly) {

return getTextFromTree(clonedSubtree);

}

« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »