OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package com.dom_distiller.client; | 5 package com.dom_distiller.client; |
6 | 6 |
7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
8 import java.util.Collections; | 8 import java.util.Collections; |
9 import java.util.List; | 9 import java.util.List; |
10 import java.util.logging.Logger; | 10 import java.util.logging.Logger; |
(...skipping 17 matching lines...) Expand all Loading... |
28 | 28 |
29 import org.xml.sax.AttributesImpl; | 29 import org.xml.sax.AttributesImpl; |
30 import org.xml.sax.ContentHandler; | 30 import org.xml.sax.ContentHandler; |
31 import org.xml.sax.SAXException; | 31 import org.xml.sax.SAXException; |
32 | 32 |
33 @Export() | 33 @Export() |
34 public class ContentExtractor implements Exportable { | 34 public class ContentExtractor implements Exportable { |
35 static Logger logger = Logger.getLogger("DomDistiller"); | 35 static Logger logger = Logger.getLogger("DomDistiller"); |
36 | 36 |
37 public static String extractContent() { | 37 public static String extractContent() { |
| 38 return extractContent(false); |
| 39 } |
| 40 |
| 41 public static String extractContent(boolean text_only) { |
38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl
er(); | 42 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl
er(); |
39 List<Node> textNodes = null; | 43 List<Node> textNodes = null; |
40 | 44 |
41 try { | 45 try { |
42 htmlParser.startDocument(); | 46 htmlParser.startDocument(); |
43 Element documentElement = Document.get().getDocumentElement(); | 47 Element documentElement = Document.get().getDocumentElement(); |
44 textNodes = parse(documentElement, htmlParser); | 48 textNodes = parse(documentElement, htmlParser); |
45 htmlParser.endDocument(); | 49 htmlParser.endDocument(); |
46 } catch (SAXException e) { | 50 } catch (SAXException e) { |
47 logger.warning("Parsing failed."); | 51 logger.warning("Parsing failed."); |
48 return ""; | 52 return ""; |
49 } | 53 } |
50 | 54 |
51 TextDocument document = htmlParser.toTextDocument(); | 55 TextDocument document = htmlParser.toTextDocument(); |
52 try { | 56 try { |
53 CommonExtractors.ARTICLE_EXTRACTOR.process(document); | 57 CommonExtractors.ARTICLE_EXTRACTOR.process(document); |
54 } catch (BoilerpipeProcessingException e) { | 58 } catch (BoilerpipeProcessingException e) { |
55 logger.warning("Processing failed."); | 59 logger.warning("Processing failed."); |
56 return ""; | 60 return ""; |
57 } | 61 } |
58 | 62 |
59 List<Integer> contentTextIndexes = new ArrayList<Integer>(); | 63 if (text_only) { |
60 for (TextBlock tb : document.getTextBlocks()) { | 64 return document.getText(true, false); |
61 if (!tb.hasLabel(DefaultLabels.TITLE)) { | |
62 contentTextIndexes.addAll(tb.getContainedTextElements()); | |
63 } | |
64 } | 65 } |
65 Collections.sort(contentTextIndexes); | |
66 | 66 |
67 // Boilerpipe's text node indexes start at 1. | 67 List<Node> contentNodes = getContentNodesForTextDocument(document, textN
odes); |
68 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size())
; | |
69 for (Integer i : contentTextIndexes) { | |
70 contentNodes.add(textNodes.get(i - 1)); | |
71 } | |
72 | 68 |
73 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages( | 69 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages( |
74 contentNodes, Document.get().getDocumentElement()); | 70 contentNodes, Document.get().getDocumentElement()); |
75 | 71 |
76 if (contentAndImages.isEmpty()) { | 72 if (contentAndImages.isEmpty()) { |
77 return ""; | 73 return ""; |
78 } | 74 } |
79 | 75 |
80 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubt
ree(); | 76 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubt
ree(); |
81 | 77 |
(...skipping 11 matching lines...) Expand all Loading... |
93 return Element.as(clonedSubtree).getInnerHTML(); | 89 return Element.as(clonedSubtree).getInnerHTML(); |
94 } | 90 } |
95 | 91 |
96 private static List<Node> parse(Element e, ContentHandler handler) { | 92 private static List<Node> parse(Element e, ContentHandler handler) { |
97 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); | 93 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); |
98 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS
axVisitor); | 94 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS
axVisitor); |
99 new DomWalker(filteringDomVisitor).walk(e); | 95 new DomWalker(filteringDomVisitor).walk(e); |
100 return domToSaxVisitor.getTextNodes(); | 96 return domToSaxVisitor.getTextNodes(); |
101 } | 97 } |
102 | 98 |
| 99 private static List<Node> getContentNodesForTextDocument( |
| 100 TextDocument document, List<Node> textNodes) { |
| 101 List<Integer> contentTextIndexes = new ArrayList<Integer>(); |
| 102 for (TextBlock tb : document.getTextBlocks()) { |
| 103 if (!tb.hasLabel(DefaultLabels.TITLE)) { |
| 104 contentTextIndexes.addAll(tb.getContainedTextElements()); |
| 105 } |
| 106 } |
| 107 Collections.sort(contentTextIndexes); |
| 108 |
| 109 // Boilerpipe's text node indexes start at 1. |
| 110 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size())
; |
| 111 for (Integer i : contentTextIndexes) { |
| 112 contentNodes.add(textNodes.get(i - 1)); |
| 113 } |
| 114 return contentNodes; |
| 115 } |
| 116 |
103 private static void makeAllLinksAbsolute(Node rootNode) { | 117 private static void makeAllLinksAbsolute(Node rootNode) { |
104 Element root = Element.as(rootNode); | 118 Element root = Element.as(rootNode); |
105 | 119 |
106 // AnchorElement.getHref() and ImageElement.getSrc() both return the | 120 // AnchorElement.getHref() and ImageElement.getSrc() both return the |
107 // absolute URI, so simply set them as the respective attributes. | 121 // absolute URI, so simply set them as the respective attributes. |
108 | 122 |
109 NodeList<Element> allLinks = root.getElementsByTagName("A"); | 123 NodeList<Element> allLinks = root.getElementsByTagName("A"); |
110 for (int i = 0; i < allLinks.getLength(); i++) { | 124 for (int i = 0; i < allLinks.getLength(); i++) { |
111 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | 125 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); |
112 link.setHref(link.getHref()); | 126 link.setHref(link.getHref()); |
113 } | 127 } |
114 | 128 |
115 NodeList<Element> allImages = root.getElementsByTagName("IMG"); | 129 NodeList<Element> allImages = root.getElementsByTagName("IMG"); |
116 for (int i = 0; i < allImages.getLength(); i++) { | 130 for (int i = 0; i < allImages.getLength(); i++) { |
117 ImageElement image = ImageElement.as(allImages.getItem(i)); | 131 ImageElement image = ImageElement.as(allImages.getItem(i)); |
118 image.setSrc(image.getSrc()); | 132 image.setSrc(image.getSrc()); |
119 } | 133 } |
120 } | 134 } |
121 } | 135 } |
OLD | NEW |