Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(50)

Unified Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 286453002: Add extract_text_only option (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Rebase (created 6 years, 7 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/com/dom_distiller/client/ContentExtractor.java
diff --git a/src/com/dom_distiller/client/ContentExtractor.java b/src/com/dom_distiller/client/ContentExtractor.java
index 2491fb616b1010050894261f6eaf5bbc39fb53bc..50cfef361ea98e891243907e6e5f7b2782d4eead 100644
--- a/src/com/dom_distiller/client/ContentExtractor.java
+++ b/src/com/dom_distiller/client/ContentExtractor.java
@@ -35,6 +35,10 @@ public class ContentExtractor implements Exportable {
static Logger logger = Logger.getLogger("DomDistiller");
public static String extractContent() {
+ return extractContent(false);
+ }
+
+ public static String extractContent(boolean text_only) {
BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();
List<Node> textNodes = null;
@@ -56,19 +60,11 @@ public class ContentExtractor implements Exportable {
return "";
}
- List<Integer> contentTextIndexes = new ArrayList<Integer>();
- for (TextBlock tb : document.getTextBlocks()) {
- if (!tb.hasLabel(DefaultLabels.TITLE)) {
- contentTextIndexes.addAll(tb.getContainedTextElements());
- }
+ if (text_only) {
+ return document.getText(true, false);
}
- Collections.sort(contentTextIndexes);
- // Boilerpipe's text node indexes start at 1.
- List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
- for (Integer i : contentTextIndexes) {
- contentNodes.add(textNodes.get(i - 1));
- }
+ List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);
List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(
contentNodes, Document.get().getDocumentElement());
@@ -100,6 +96,24 @@ public class ContentExtractor implements Exportable {
return domToSaxVisitor.getTextNodes();
}
+ private static List<Node> getContentNodesForTextDocument(
+ TextDocument document, List<Node> textNodes) {
+ List<Integer> contentTextIndexes = new ArrayList<Integer>();
+ for (TextBlock tb : document.getTextBlocks()) {
+ if (!tb.hasLabel(DefaultLabels.TITLE)) {
+ contentTextIndexes.addAll(tb.getContainedTextElements());
+ }
+ }
+ Collections.sort(contentTextIndexes);
+
+ // Boilerpipe's text node indexes start at 1.
+ List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
+ for (Integer i : contentTextIndexes) {
+ contentNodes.add(textNodes.get(i - 1));
+ }
+ return contentNodes;
+ }
+
private static void makeAllLinksAbsolute(Node rootNode) {
Element root = Element.as(rootNode);
« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698