| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package com.dom_distiller.client; | 5 package com.dom_distiller.client; |
| 6 | 6 |
| 7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
| 8 import java.util.Collections; | 8 import java.util.Collections; |
| 9 import java.util.List; | 9 import java.util.List; |
| 10 import java.util.logging.Logger; | 10 import java.util.logging.Logger; |
| 11 | 11 |
| 12 import com.google.gwt.dom.client.AnchorElement; | 12 import com.google.gwt.dom.client.AnchorElement; |
| 13 import com.google.gwt.dom.client.Document; | 13 import com.google.gwt.dom.client.Document; |
| 14 import com.google.gwt.dom.client.Element; | 14 import com.google.gwt.dom.client.Element; |
| 15 import com.google.gwt.dom.client.ImageElement; | 15 import com.google.gwt.dom.client.ImageElement; |
| 16 import com.google.gwt.dom.client.Node; | 16 import com.google.gwt.dom.client.Node; |
| 17 import com.google.gwt.dom.client.NodeList; | 17 import com.google.gwt.dom.client.NodeList; |
| 18 | 18 |
| 19 import de.l3s.boilerpipe.BoilerpipeProcessingException; | 19 import de.l3s.boilerpipe.BoilerpipeProcessingException; |
| 20 import de.l3s.boilerpipe.document.TextBlock; | 20 import de.l3s.boilerpipe.document.TextBlock; |
| 21 import de.l3s.boilerpipe.document.TextDocument; | 21 import de.l3s.boilerpipe.document.TextDocument; |
| 22 import de.l3s.boilerpipe.extractors.CommonExtractors; | 22 import de.l3s.boilerpipe.extractors.CommonExtractors; |
| 23 import de.l3s.boilerpipe.labels.DefaultLabels; | 23 import de.l3s.boilerpipe.labels.DefaultLabels; |
| 24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; | 24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; |
| 25 | 25 |
| 26 import org.timepedia.exporter.client.Export; | 26 import org.timepedia.exporter.client.Export; |
| 27 import org.timepedia.exporter.client.Exportable; | 27 import org.timepedia.exporter.client.Exportable; |
| 28 | 28 |
| 29 import org.xml.sax.AttributesImpl; | 29 import org.xml.sax.AttributesImpl; |
| 30 import org.xml.sax.ContentHandler; |
| 30 import org.xml.sax.SAXException; | 31 import org.xml.sax.SAXException; |
| 31 | 32 |
| 32 @Export() | 33 @Export() |
| 33 public class ContentExtractor implements Exportable { | 34 public class ContentExtractor implements Exportable { |
| 34 static Logger logger = Logger.getLogger("DomDistiller"); | 35 static Logger logger = Logger.getLogger("DomDistiller"); |
| 35 | 36 |
| 36 public static String extractContent() { | 37 public static String extractContent() { |
| 37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); | 38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); |
| 38 List<Node> textNodes = null; | 39 List<Node> textNodes = null; |
| 39 | 40 |
| 40 try { | 41 try { |
| 41 htmlParser.startDocument(); | 42 htmlParser.startDocument(); |
| 42 Element documentElement = Document.get().getDocumentElement(); | 43 Element documentElement = Document.get().getDocumentElement(); |
| 43 textNodes = DomToSaxParser.parse(documentElement, htmlParser); | 44 textNodes = parse(documentElement, htmlParser); |
| 44 htmlParser.endDocument(); | 45 htmlParser.endDocument(); |
| 45 } catch (SAXException e) { | 46 } catch (SAXException e) { |
| 46 logger.warning("Parsing failed."); | 47 logger.warning("Parsing failed."); |
| 47 return ""; | 48 return ""; |
| 48 } | 49 } |
| 49 | 50 |
| 50 TextDocument document = htmlParser.toTextDocument(); | 51 TextDocument document = htmlParser.toTextDocument(); |
| 51 try { | 52 try { |
| 52 CommonExtractors.ARTICLE_EXTRACTOR.process(document); | 53 CommonExtractors.ARTICLE_EXTRACTOR.process(document); |
| 53 } catch (BoilerpipeProcessingException e) { | 54 } catch (BoilerpipeProcessingException e) { |
| (...skipping 31 matching lines...) |
| 85 // The base URL in the distilled page viewer is different from that in | 86 // The base URL in the distilled page viewer is different from that in |
| 86 // the live page. This breaks all relative links (in anchors and | 87 // the live page. This breaks all relative links (in anchors and |
| 87 // images), so make them absolute in the distilled content. | 88 // images), so make them absolute in the distilled content. |
| 88 makeAllLinksAbsolute(clonedSubtree); | 89 makeAllLinksAbsolute(clonedSubtree); |
| 89 | 90 |
| 90 // TODO(cjhopman): this discards the top element and just returns its children. This might | 91 // TODO(cjhopman): this discards the top element and just returns its children. This might |
| 91 // break in some cases. | 92 // break in some cases. |
| 92 return Element.as(clonedSubtree).getInnerHTML(); | 93 return Element.as(clonedSubtree).getInnerHTML(); |
| 93 } | 94 } |
| 94 | 95 |
| 96 private static List<Node> parse(Element e, ContentHandler handler) { |
| 97 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); |
| 98 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor); |
| 99 new DomWalker(filteringDomVisitor).walk(e); |
| 100 return domToSaxVisitor.getTextNodes(); |
| 101 } |
| 102 |
| 95 private static void makeAllLinksAbsolute(Node rootNode) { | 103 private static void makeAllLinksAbsolute(Node rootNode) { |
| 96 Element root = Element.as(rootNode); | 104 Element root = Element.as(rootNode); |
| 97 | 105 |
| 98 // AnchorElement.getHref() and ImageElement.getSrc() both return the | 106 // AnchorElement.getHref() and ImageElement.getSrc() both return the |
| 99 // absolute URI, so simply set them as the respective attributes. | 107 // absolute URI, so simply set them as the respective attributes. |
| 100 | 108 |
| 101 NodeList<Element> allLinks = root.getElementsByTagName("A"); | 109 NodeList<Element> allLinks = root.getElementsByTagName("A"); |
| 102 for (int i = 0; i < allLinks.getLength(); i++) { | 110 for (int i = 0; i < allLinks.getLength(); i++) { |
| 103 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); | 111 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); |
| 104 link.setHref(link.getHref()); | 112 link.setHref(link.getHref()); |
| 105 } | 113 } |
| 106 | 114 |
| 107 NodeList<Element> allImages = root.getElementsByTagName("IMG"); | 115 NodeList<Element> allImages = root.getElementsByTagName("IMG"); |
| 108 for (int i = 0; i < allImages.getLength(); i++) { | 116 for (int i = 0; i < allImages.getLength(); i++) { |
| 109 ImageElement image = ImageElement.as(allImages.getItem(i)); | 117 ImageElement image = ImageElement.as(allImages.getItem(i)); |
| 110 image.setSrc(image.getSrc()); | 118 image.setSrc(image.getSrc()); |
| 111 } | 119 } |
| 112 } | 120 } |
| 113 } | 121 } |
| OLD | NEW |