Index: src/com/dom_distiller/client/ContentExtractor.java |
diff --git a/src/com/dom_distiller/client/ContentExtractor.java b/src/com/dom_distiller/client/ContentExtractor.java |
index 498d36ffb57b4ed59fe8a4078ec1b5ce59a0c774..b3151766d59f0f495b30617cd9d2f7858267b290 100644 |
--- a/src/com/dom_distiller/client/ContentExtractor.java |
+++ b/src/com/dom_distiller/client/ContentExtractor.java |
@@ -4,6 +4,8 @@ |
package com.dom_distiller.client; |
+import com.dom_distiller.proto.DomDistillerProtos; |
+import com.dom_distiller.proto.DomDistillerProtos.TimingInfo; |
import com.google.gwt.dom.client.AnchorElement; |
import com.google.gwt.dom.client.Document; |
import com.google.gwt.dom.client.Element; |
@@ -21,21 +23,24 @@ import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; |
import java.util.ArrayList; |
import java.util.LinkedList; |
import java.util.List; |
-import java.util.logging.Logger; |
public class ContentExtractor { |
- static Logger logger = Logger.getLogger("DomDistiller"); |
- |
private final Element documentElement; |
- private final MarkupParser parser; |
- |
private final List<String> candidateTitles; |
+ private final TimingInfo mTimingInfo; |
+ |
+ private final MarkupParser parser; |
+ |
public ContentExtractor(Element root) { |
this.documentElement = root; |
- this.parser = new MarkupParser(root); |
this.candidateTitles = new LinkedList<String>(); |
+ this.mTimingInfo = DomDistillerProtos.TimingInfo.create(); |
+ |
+ double startTime = DomUtil.getTime(); |
+ this.parser = new MarkupParser(root); |
+ this.mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); |
} |
// Grabs a list of candidate titles in descending priority order: |
@@ -69,38 +74,89 @@ public class ContentExtractor { |
} |
public String extractContent(boolean textOnly) { |
- BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); |
+ double now = DomUtil.getTime(); |
+ TextDocument document = createTextBlocksFromPage(); |
+ mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); |
+ |
+ now = DomUtil.getTime(); |
+ List<Node> contentNodes = processTextBlocks(document); |
+ mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); |
+ |
+ if (contentNodes.isEmpty()) { |
+ return ""; |
+ } |
+ |
+ now = DomUtil.getTime(); |
+ String html = formatExtractedNodes(textOnly, contentNodes); |
+ mTimingInfo.setFormattingTime(DomUtil.getTime() - now); |
+ return html; |
+ } |
+ |
+ |
+ /** |
+ * Returns timing information about the most recent extraction run. |
+ * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics. |
+ */ |
+ public TimingInfo getTimingInfo() { |
+ return mTimingInfo; |
+ } |
+ /** |
+ * Converts the original HTML page into a series of TextBlocks for analysis. |
+ * @return a document with the list of extracted TextBlocks and additional information |
+ * that can be useful for identifying the core elements of the page. |
+ */ |
+ private TextDocument createTextBlocksFromPage() { |
+ BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); |
htmlParser.startDocument(); |
DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser); |
FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor); |
new DomWalker(filteringDomVisitor).walk(documentElement); |
htmlParser.endDocument(); |
- |
TextDocument document = htmlParser.toTextDocument(); |
ensureTitleInitialized(); |
+ document.setCandidateTitles(candidateTitles); |
+ document.setHiddenElements(filteringDomVisitor.getHiddenElements()); |
+ document.setDataTables(filteringDomVisitor.getDataTables()); |
+ return document; |
+ } |
- document.setCanddiateTitles(candidateTitles); |
+ /** |
+ * Implements the actual analysis of the page content, identifying the core elements of the |
+ * page. |
+ * @param document the TextBlock representation of the page extracted from the DOM. |
+ * @return a list of DOM nodes from the original document that contain the core elements of the |
+ * page. |
+ */ |
+ private List<Node> processTextBlocks(TextDocument document) { |
try { |
CommonExtractors.ARTICLE_EXTRACTOR.process(document); |
} catch (BoilerpipeProcessingException e) { |
- logger.warning("Processing failed."); |
- return ""; |
+ LogUtil.logToConsole("DomDistiller Processing failed: " + e); |
+ return new LinkedList<Node>(); |
} |
- |
List<Node> contentNodes = getContentNodesForTextDocument(document); |
List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAddElements( |
- contentNodes, filteringDomVisitor.getHiddenElements(), |
- filteringDomVisitor.getDataTables(), Document.get().getDocumentElement()); |
- |
- if (contentAndRelevantElements.isEmpty()) { |
- return ""; |
- } |
- |
- Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements).cloneSubtree(); |
+ contentNodes, document.getHiddenElements(), |
+ document.getDataTables(), Document.get().getDocumentElement()); |
+ return contentAndRelevantElements; |
+ } |
+ /** |
+ * Creates a new minimal HTML document containing copies of the DOM nodes identified as the |
+ * core elements of the page. Some additional re-formatting hints may be included in the new |
+ * document. |
+ * |
+ * @param textOnly indicates whether to simply return the aggregated text content instead of |
+ * HTML |
+ * @param contentNodes the DOM nodes containing text to be included in the final document. |
+ * @return An HTML or text document which includes the aggregated content of the provided HTML |
+ * nodes. |
+ */ |
+ private String formatExtractedNodes(boolean textOnly, List<Node> contentNodes) { |
+ Node clonedSubtree = NodeListExpander.expand(contentNodes).cloneSubtree(); |
if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { |
return ""; |
} |
@@ -109,7 +165,6 @@ public class ContentExtractor { |
// the live page. This breaks all relative links (in anchors, |
// images, etc.), so make them absolute in the distilled content. |
makeAllLinksAbsolute(clonedSubtree); |
- |
if (textOnly) { |
return getTextFromTree(clonedSubtree); |
} |