Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(177)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 286453002: Add extract_text_only option (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Rebase Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import java.util.ArrayList; 7 import java.util.ArrayList;
8 import java.util.Collections; 8 import java.util.Collections;
9 import java.util.List; 9 import java.util.List;
10 import java.util.logging.Logger; 10 import java.util.logging.Logger;
(...skipping 17 matching lines...) Expand all
28 28
29 import org.xml.sax.AttributesImpl; 29 import org.xml.sax.AttributesImpl;
30 import org.xml.sax.ContentHandler; 30 import org.xml.sax.ContentHandler;
31 import org.xml.sax.SAXException; 31 import org.xml.sax.SAXException;
32 32
33 @Export() 33 @Export()
34 public class ContentExtractor implements Exportable { 34 public class ContentExtractor implements Exportable {
35 static Logger logger = Logger.getLogger("DomDistiller"); 35 static Logger logger = Logger.getLogger("DomDistiller");
36 36
37 public static String extractContent() { 37 public static String extractContent() {
38 return extractContent(false);
39 }
40
41 public static String extractContent(boolean text_only) {
38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); 42 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();
39 List<Node> textNodes = null; 43 List<Node> textNodes = null;
40 44
41 try { 45 try {
42 htmlParser.startDocument(); 46 htmlParser.startDocument();
43 Element documentElement = Document.get().getDocumentElement(); 47 Element documentElement = Document.get().getDocumentElement();
44 textNodes = parse(documentElement, htmlParser); 48 textNodes = parse(documentElement, htmlParser);
45 htmlParser.endDocument(); 49 htmlParser.endDocument();
46 } catch (SAXException e) { 50 } catch (SAXException e) {
47 logger.warning("Parsing failed."); 51 logger.warning("Parsing failed.");
48 return ""; 52 return "";
49 } 53 }
50 54
51 TextDocument document = htmlParser.toTextDocument(); 55 TextDocument document = htmlParser.toTextDocument();
52 try { 56 try {
53 CommonExtractors.ARTICLE_EXTRACTOR.process(document); 57 CommonExtractors.ARTICLE_EXTRACTOR.process(document);
54 } catch (BoilerpipeProcessingException e) { 58 } catch (BoilerpipeProcessingException e) {
55 logger.warning("Processing failed."); 59 logger.warning("Processing failed.");
56 return ""; 60 return "";
57 } 61 }
58 62
59 List<Integer> contentTextIndexes = new ArrayList<Integer>(); 63 if (text_only) {
60 for (TextBlock tb : document.getTextBlocks()) { 64 return document.getText(true, false);
61 if (!tb.hasLabel(DefaultLabels.TITLE)) {
62 contentTextIndexes.addAll(tb.getContainedTextElements());
63 }
64 } 65 }
65 Collections.sort(contentTextIndexes);
66 66
67 // Boilerpipe's text node indexes start at 1. 67 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);
68 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
69 for (Integer i : contentTextIndexes) {
70 contentNodes.add(textNodes.get(i - 1));
71 }
72 68
73 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages( 69 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(
74 contentNodes, Document.get().getDocumentElement()); 70 contentNodes, Document.get().getDocumentElement());
75 71
76 if (contentAndImages.isEmpty()) { 72 if (contentAndImages.isEmpty()) {
77 return ""; 73 return "";
78 } 74 }
79 75
80 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree(); 76 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();
81 77
(...skipping 11 matching lines...) Expand all
93 return Element.as(clonedSubtree).getInnerHTML(); 89 return Element.as(clonedSubtree).getInnerHTML();
94 } 90 }
95 91
96 private static List<Node> parse(Element e, ContentHandler handler) { 92 private static List<Node> parse(Element e, ContentHandler handler) {
97 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); 93 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);
98 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor); 94 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);
99 new DomWalker(filteringDomVisitor).walk(e); 95 new DomWalker(filteringDomVisitor).walk(e);
100 return domToSaxVisitor.getTextNodes(); 96 return domToSaxVisitor.getTextNodes();
101 } 97 }
102 98
99 private static List<Node> getContentNodesForTextDocument(
100 TextDocument document, List<Node> textNodes) {
101 List<Integer> contentTextIndexes = new ArrayList<Integer>();
102 for (TextBlock tb : document.getTextBlocks()) {
103 if (!tb.hasLabel(DefaultLabels.TITLE)) {
104 contentTextIndexes.addAll(tb.getContainedTextElements());
105 }
106 }
107 Collections.sort(contentTextIndexes);
108
109 // Boilerpipe's text node indexes start at 1.
110 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
111 for (Integer i : contentTextIndexes) {
112 contentNodes.add(textNodes.get(i - 1));
113 }
114 return contentNodes;
115 }
116
103 private static void makeAllLinksAbsolute(Node rootNode) { 117 private static void makeAllLinksAbsolute(Node rootNode) {
104 Element root = Element.as(rootNode); 118 Element root = Element.as(rootNode);
105 119
106 // AnchorElement.getHref() and ImageElement.getSrc() both return the 120 // AnchorElement.getHref() and ImageElement.getSrc() both return the
107 // absolute URI, so simply set them as the respective attributes. 121 // absolute URI, so simply set them as the respective attributes.
108 122
109 NodeList<Element> allLinks = root.getElementsByTagName("A"); 123 NodeList<Element> allLinks = root.getElementsByTagName("A");
110 for (int i = 0; i < allLinks.getLength(); i++) { 124 for (int i = 0; i < allLinks.getLength(); i++) {
111 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); 125 AnchorElement link = AnchorElement.as(allLinks.getItem(i));
112 link.setHref(link.getHref()); 126 link.setHref(link.getHref());
113 } 127 }
114 128
115 NodeList<Element> allImages = root.getElementsByTagName("IMG"); 129 NodeList<Element> allImages = root.getElementsByTagName("IMG");
116 for (int i = 0; i < allImages.getLength(); i++) { 130 for (int i = 0; i < allImages.getLength(); i++) {
117 ImageElement image = ImageElement.as(allImages.getItem(i)); 131 ImageElement image = ImageElement.as(allImages.getItem(i));
118 image.setSrc(image.getSrc()); 132 image.setSrc(image.getSrc());
119 } 133 }
120 } 134 }
121 } 135 }
OLD NEW
« no previous file with comments | « proto/dom_distiller.proto ('k') | src/com/dom_distiller/client/DomDistiller.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698