OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package com.dom_distiller.client; | 5 package com.dom_distiller.client; |
6 | 6 |
7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
8 import java.util.Collections; | 8 import java.util.Collections; |
9 import java.util.List; | 9 import java.util.List; |
10 import java.util.logging.Logger; | 10 import java.util.logging.Logger; |
11 | 11 |
12 import com.google.gwt.dom.client.AnchorElement; | 12 import com.google.gwt.dom.client.AnchorElement; |
13 import com.google.gwt.dom.client.Document; | 13 import com.google.gwt.dom.client.Document; |
14 import com.google.gwt.dom.client.Element; | 14 import com.google.gwt.dom.client.Element; |
15 import com.google.gwt.dom.client.ImageElement; | 15 import com.google.gwt.dom.client.ImageElement; |
16 import com.google.gwt.dom.client.Node; | 16 import com.google.gwt.dom.client.Node; |
17 import com.google.gwt.dom.client.NodeList; | 17 import com.google.gwt.dom.client.NodeList; |
18 | 18 |
19 import de.l3s.boilerpipe.BoilerpipeProcessingException; | 19 import de.l3s.boilerpipe.BoilerpipeProcessingException; |
20 import de.l3s.boilerpipe.document.TextBlock; | 20 import de.l3s.boilerpipe.document.TextBlock; |
21 import de.l3s.boilerpipe.document.TextDocument; | 21 import de.l3s.boilerpipe.document.TextDocument; |
22 import de.l3s.boilerpipe.extractors.CommonExtractors; | 22 import de.l3s.boilerpipe.extractors.CommonExtractors; |
23 import de.l3s.boilerpipe.labels.DefaultLabels; | 23 import de.l3s.boilerpipe.labels.DefaultLabels; |
24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; | 24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; |
25 | 25 |
26 import org.timepedia.exporter.client.Export; | 26 import org.timepedia.exporter.client.Export; |
27 import org.timepedia.exporter.client.Exportable; | 27 import org.timepedia.exporter.client.Exportable; |
28 | 28 |
29 import org.xml.sax.AttributesImpl; | 29 import org.xml.sax.AttributesImpl; |
| 30 import org.xml.sax.ContentHandler; |
30 import org.xml.sax.SAXException; | 31 import org.xml.sax.SAXException; |
31 | 32 |
32 @Export() | 33 @Export() |
33 public class ContentExtractor implements Exportable { | 34 public class ContentExtractor implements Exportable { |
34 static Logger logger = Logger.getLogger("DomDistiller"); | 35 static Logger logger = Logger.getLogger("DomDistiller"); |
35 | 36 |
36 public static String extractContent() { | 37 public static String extractContent() { |
37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); | 38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); |
38 List<Node> textNodes = null; | 39 List<Node> textNodes = null; |
39 | 40 |
40 try { | 41 try { |
41 htmlParser.startDocument(); | 42 htmlParser.startDocument(); |
42 Element documentElement = Document.get().getDocumentElement(); | 43 Element documentElement = Document.get().getDocumentElement(); |
43 textNodes = DomToSaxParser.parse(documentElement, htmlParser); | 44 textNodes = parse(documentElement, htmlParser); |
44 htmlParser.endDocument(); | 45 htmlParser.endDocument(); |
45 } catch (SAXException e) { | 46 } catch (SAXException e) { |
46 logger.warning("Parsing failed."); | 47 logger.warning("Parsing failed."); |
47 return ""; | 48 return ""; |
48 } | 49 } |
49 | 50 |
50 TextDocument document = htmlParser.toTextDocument(); | 51 TextDocument document = htmlParser.toTextDocument(); |
51 try { | 52 try { |
52 CommonExtractors.ARTICLE_EXTRACTOR.process(document); | 53 CommonExtractors.ARTICLE_EXTRACTOR.process(document); |
53 } catch (BoilerpipeProcessingException e) { | 54 } catch (BoilerpipeProcessingException e) { |
(...skipping 27 matching lines...) Expand all Loading... |
81 // The base URL in the distilled page viewer is different from that in | 82 // The base URL in the distilled page viewer is different from that in |
82 // the live page. This breaks all relative links (in anchors and | 83 // the live page. This breaks all relative links (in anchors and |
83 // images), so make them absolute in the distilled content. | 84 // images), so make them absolute in the distilled content. |
84 makeAllLinksAbsolute(clonedSubtree); | 85 makeAllLinksAbsolute(clonedSubtree); |
85 | 86 |
86 // TODO(cjhopman): this discards the top element and just returns its children. This might | 87 // TODO(cjhopman): this discards the top element and just returns its children. This might |
87 // break in some cases. | 88 // break in some cases. |
88 return Element.as(clonedSubtree).getInnerHTML(); | 89 return Element.as(clonedSubtree).getInnerHTML(); |
89 } | 90 } |
90 | 91 |
| 92 private static List<Node> parse(Element e, ContentHandler handler) { |
| 93 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); |
| 94 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor); |
| 95 new DomWalker(filteringDomVisitor).walk(e); |
| 96 return domToSaxVisitor.getTextNodes(); |
| 97 } |
| 98 |
91 private static void makeAllLinksAbsolute(Node rootNode) { | 99 private static void makeAllLinksAbsolute(Node rootNode) { |
92 Element root = Element.as(rootNode); | 100 Element root = Element.as(rootNode); |
93 | 101 |
94 // AnchorElement.getHref() and ImageElement.getSrc() both return the | 102 // AnchorElement.getHref() and ImageElement.getSrc() both return the |
95 // absolute URI, so simply set them as the respective attributes. | 103 // absolute URI, so simply set them as the respective attributes. |
96 | 104 |
97 NodeList<Element> allLinks = root.getElementsByTagName("A"); | 105 NodeList<Element> allLinks = root.getElementsByTagName("A"); |
98 for (int i = 0; i < allLinks.getLength(); i++) { | 106 for (int i = 0; i < allLinks.getLength(); i++) { |
99 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | 107 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); |
100 link.setHref(link.getHref()); | 108 link.setHref(link.getHref()); |
101 } | 109 } |
102 | 110 |
103 NodeList<Element> allImages = root.getElementsByTagName("IMG"); | 111 NodeList<Element> allImages = root.getElementsByTagName("IMG"); |
104 for (int i = 0; i < allImages.getLength(); i++) { | 112 for (int i = 0; i < allImages.getLength(); i++) { |
105 ImageElement image = ImageElement.as(allImages.getItem(i)); | 113 ImageElement image = ImageElement.as(allImages.getItem(i)); |
106 image.setSrc(image.getSrc()); | 114 image.setSrc(image.getSrc()); |
107 } | 115 } |
108 } | 116 } |
109 } | 117 } |
OLD | NEW |