OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package com.dom_distiller.client; | 5 package com.dom_distiller.client; |
6 | 6 |
7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
8 import java.util.Collections; | 8 import java.util.Collections; |
9 import java.util.List; | 9 import java.util.List; |
10 import java.util.logging.Logger; | 10 import java.util.logging.Logger; |
11 | 11 |
12 import com.google.gwt.dom.client.AnchorElement; | 12 import com.google.gwt.dom.client.AnchorElement; |
13 import com.google.gwt.dom.client.Document; | 13 import com.google.gwt.dom.client.Document; |
14 import com.google.gwt.dom.client.Element; | 14 import com.google.gwt.dom.client.Element; |
15 import com.google.gwt.dom.client.ImageElement; | 15 import com.google.gwt.dom.client.ImageElement; |
16 import com.google.gwt.dom.client.Node; | 16 import com.google.gwt.dom.client.Node; |
17 import com.google.gwt.dom.client.NodeList; | 17 import com.google.gwt.dom.client.NodeList; |
18 | 18 |
19 import de.l3s.boilerpipe.BoilerpipeProcessingException; | 19 import de.l3s.boilerpipe.BoilerpipeProcessingException; |
20 import de.l3s.boilerpipe.document.TextBlock; | 20 import de.l3s.boilerpipe.document.TextBlock; |
21 import de.l3s.boilerpipe.document.TextDocument; | 21 import de.l3s.boilerpipe.document.TextDocument; |
22 import de.l3s.boilerpipe.extractors.CommonExtractors; | 22 import de.l3s.boilerpipe.extractors.CommonExtractors; |
23 import de.l3s.boilerpipe.labels.DefaultLabels; | 23 import de.l3s.boilerpipe.labels.DefaultLabels; |
24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; | 24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; |
25 | 25 |
26 import org.timepedia.exporter.client.Export; | 26 import org.timepedia.exporter.client.Export; |
27 import org.timepedia.exporter.client.Exportable; | 27 import org.timepedia.exporter.client.Exportable; |
28 | 28 |
29 import org.xml.sax.AttributesImpl; | 29 import org.xml.sax.AttributesImpl; |
| 30 import org.xml.sax.ContentHandler; |
30 import org.xml.sax.SAXException; | 31 import org.xml.sax.SAXException; |
31 | 32 |
32 @Export() | 33 @Export() |
33 public class ContentExtractor implements Exportable { | 34 public class ContentExtractor implements Exportable { |
34 static Logger logger = Logger.getLogger("DomDistiller"); | 35 static Logger logger = Logger.getLogger("DomDistiller"); |
35 | 36 |
36 public static String extractContent() { | 37 public static String extractContent() { |
37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl
er(); | 38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl
er(); |
38 List<Node> textNodes = null; | 39 List<Node> textNodes = null; |
39 | 40 |
40 try { | 41 try { |
41 htmlParser.startDocument(); | 42 htmlParser.startDocument(); |
42 Element documentElement = Document.get().getDocumentElement(); | 43 Element documentElement = Document.get().getDocumentElement(); |
43 textNodes = DomToSaxParser.parse(documentElement, htmlParser); | 44 textNodes = parse(documentElement, htmlParser); |
44 htmlParser.endDocument(); | 45 htmlParser.endDocument(); |
45 } catch (SAXException e) { | 46 } catch (SAXException e) { |
46 logger.warning("Parsing failed."); | 47 logger.warning("Parsing failed."); |
47 return ""; | 48 return ""; |
48 } | 49 } |
49 | 50 |
50 TextDocument document = htmlParser.toTextDocument(); | 51 TextDocument document = htmlParser.toTextDocument(); |
51 try { | 52 try { |
52 CommonExtractors.ARTICLE_EXTRACTOR.process(document); | 53 CommonExtractors.ARTICLE_EXTRACTOR.process(document); |
53 } catch (BoilerpipeProcessingException e) { | 54 } catch (BoilerpipeProcessingException e) { |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
85 // The base URL in the distilled page viewer is different from that in | 86 // The base URL in the distilled page viewer is different from that in |
86 // the live page. This breaks all relative links (in anchors and | 87 // the live page. This breaks all relative links (in anchors and |
87 // images), so make them absolute in the distilled content. | 88 // images), so make them absolute in the distilled content. |
88 makeAllLinksAbsolute(clonedSubtree); | 89 makeAllLinksAbsolute(clonedSubtree); |
89 | 90 |
90 // TODO(cjhopman): this discards the top element and just returns its ch
ildren. This might | 91 // TODO(cjhopman): this discards the top element and just returns its ch
ildren. This might |
91 // break in some cases. | 92 // break in some cases. |
92 return Element.as(clonedSubtree).getInnerHTML(); | 93 return Element.as(clonedSubtree).getInnerHTML(); |
93 } | 94 } |
94 | 95 |
| 96 private static List<Node> parse(Element e, ContentHandler handler) { |
| 97 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); |
| 98 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS
axVisitor); |
| 99 new DomWalker(filteringDomVisitor).walk(e); |
| 100 return domToSaxVisitor.getTextNodes(); |
| 101 } |
| 102 |
95 private static void makeAllLinksAbsolute(Node rootNode) { | 103 private static void makeAllLinksAbsolute(Node rootNode) { |
96 Element root = Element.as(rootNode); | 104 Element root = Element.as(rootNode); |
97 | 105 |
98 // AnchorElement.getHref() and ImageElement.getSrc() both return the | 106 // AnchorElement.getHref() and ImageElement.getSrc() both return the |
99 // absolute URI, so simply set them as the respective attributes. | 107 // absolute URI, so simply set them as the respective attributes. |
100 | 108 |
101 NodeList<Element> allLinks = root.getElementsByTagName("A"); | 109 NodeList<Element> allLinks = root.getElementsByTagName("A"); |
102 for (int i = 0; i < allLinks.getLength(); i++) { | 110 for (int i = 0; i < allLinks.getLength(); i++) { |
103 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | 111 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); |
104 link.setHref(link.getHref()); | 112 link.setHref(link.getHref()); |
105 } | 113 } |
106 | 114 |
107 NodeList<Element> allImages = root.getElementsByTagName("IMG"); | 115 NodeList<Element> allImages = root.getElementsByTagName("IMG"); |
108 for (int i = 0; i < allImages.getLength(); i++) { | 116 for (int i = 0; i < allImages.getLength(); i++) { |
109 ImageElement image = ImageElement.as(allImages.getItem(i)); | 117 ImageElement image = ImageElement.as(allImages.getItem(i)); |
110 image.setSrc(image.getSrc()); | 118 image.setSrc(image.getSrc()); |
111 } | 119 } |
112 } | 120 } |
113 } | 121 } |
OLD | NEW |