Index: src/com/dom_distiller/client/ContentExtractor.java |
diff --git a/src/com/dom_distiller/client/ContentExtractor.java b/src/com/dom_distiller/client/ContentExtractor.java |
index 2491fb616b1010050894261f6eaf5bbc39fb53bc..50cfef361ea98e891243907e6e5f7b2782d4eead 100644 |
--- a/src/com/dom_distiller/client/ContentExtractor.java |
+++ b/src/com/dom_distiller/client/ContentExtractor.java |
@@ -35,6 +35,10 @@ public class ContentExtractor implements Exportable { |
static Logger logger = Logger.getLogger("DomDistiller"); |
public static String extractContent() { |
+ return extractContent(false); |
+ } |
+ |
+ public static String extractContent(boolean text_only) { |
BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); |
List<Node> textNodes = null; |
@@ -56,19 +60,11 @@ public class ContentExtractor implements Exportable { |
return ""; |
} |
- List<Integer> contentTextIndexes = new ArrayList<Integer>(); |
- for (TextBlock tb : document.getTextBlocks()) { |
- if (!tb.hasLabel(DefaultLabels.TITLE)) { |
- contentTextIndexes.addAll(tb.getContainedTextElements()); |
- } |
+ if (text_only) { |
+ return document.getText(true, false); |
} |
- Collections.sort(contentTextIndexes); |
- // Boilerpipe's text node indexes start at 1. |
- List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size()); |
- for (Integer i : contentTextIndexes) { |
- contentNodes.add(textNodes.get(i - 1)); |
- } |
+ List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes); |
List<Node> contentAndImages = RelevantImageFinder.findAndAddImages( |
contentNodes, Document.get().getDocumentElement()); |
@@ -100,6 +96,24 @@ public class ContentExtractor implements Exportable { |
return domToSaxVisitor.getTextNodes(); |
} |
+ private static List<Node> getContentNodesForTextDocument( |
+ TextDocument document, List<Node> textNodes) { |
+ List<Integer> contentTextIndexes = new ArrayList<Integer>(); |
+ for (TextBlock tb : document.getTextBlocks()) { |
+ if (!tb.hasLabel(DefaultLabels.TITLE)) { |
+ contentTextIndexes.addAll(tb.getContainedTextElements()); |
+ } |
+ } |
+ Collections.sort(contentTextIndexes); |
+ |
+ // Boilerpipe's text node indexes are 1-based, whereas textNodes is 0-based. |
+ List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size()); |
+ for (Integer i : contentTextIndexes) { |
+ contentNodes.add(textNodes.get(i - 1)); |
+ } |
+ return contentNodes; |
+ } |
+ |
private static void makeAllLinksAbsolute(Node rootNode) { |
Element root = Element.as(rootNode); |