| Index: src/com/dom_distiller/client/ContentExtractor.java
|
| diff --git a/src/com/dom_distiller/client/ContentExtractor.java b/src/com/dom_distiller/client/ContentExtractor.java
|
| index 2491fb616b1010050894261f6eaf5bbc39fb53bc..50cfef361ea98e891243907e6e5f7b2782d4eead 100644
|
| --- a/src/com/dom_distiller/client/ContentExtractor.java
|
| +++ b/src/com/dom_distiller/client/ContentExtractor.java
|
| @@ -35,6 +35,10 @@ public class ContentExtractor implements Exportable {
|
| static Logger logger = Logger.getLogger("DomDistiller");
|
|
|
| public static String extractContent() {
|
| + return extractContent(false);  // Delegates to the overload with text_only = false.
|
| + }
|
| +
|
| + public static String extractContent(boolean text_only) {
|
| BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();
|
| List<Node> textNodes = null;
|
|
|
| @@ -56,19 +60,11 @@ public class ContentExtractor implements Exportable {
|
| return "";
|
| }
|
|
|
| - List<Integer> contentTextIndexes = new ArrayList<Integer>();
|
| - for (TextBlock tb : document.getTextBlocks()) {
|
| - if (!tb.hasLabel(DefaultLabels.TITLE)) {
|
| - contentTextIndexes.addAll(tb.getContainedTextElements());
|
| - }
|
| + if (text_only) {
|
| + return document.getText(true, false);
|
| }
|
| - Collections.sort(contentTextIndexes);
|
|
|
| - // Boilerpipe's text node indexes start at 1.
|
| - List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
|
| - for (Integer i : contentTextIndexes) {
|
| - contentNodes.add(textNodes.get(i - 1));
|
| - }
|
| + List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);
|
|
|
| List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(
|
| contentNodes, Document.get().getDocumentElement());
|
| @@ -100,6 +96,24 @@ public class ContentExtractor implements Exportable {
|
| return domToSaxVisitor.getTextNodes();
|
| }
|
|
|
| + private static List<Node> getContentNodesForTextDocument(
|
| + TextDocument document, List<Node> textNodes) {  // Returns the textNodes entries referenced by non-title blocks.
|
| + List<Integer> contentTextIndexes = new ArrayList<Integer>();
|
| + for (TextBlock tb : document.getTextBlocks()) {
|
| + if (!tb.hasLabel(DefaultLabels.TITLE)) {  // Blocks labeled TITLE contribute no indexes.
|
| + contentTextIndexes.addAll(tb.getContainedTextElements());
|
| + }
|
| + }
|
| + Collections.sort(contentTextIndexes);  // Ascending index order across all collected blocks.
|
| +
|
| + // Boilerpipe's text node indexes start at 1, while textNodes is 0-based, hence the i - 1 below.
|
| + List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
|
| + for (Integer i : contentTextIndexes) {
|
| + contentNodes.add(textNodes.get(i - 1));
|
| + }
|
| + return contentNodes;
|
| + }
|
| +
|
| private static void makeAllLinksAbsolute(Node rootNode) {
|
| Element root = Element.as(rootNode);
|
|
|
|
|