Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(130)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 275493007: filter out invisible elements (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: addressed comments Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import java.util.ArrayList; 7 import java.util.ArrayList;
8 import java.util.Collections; 8 import java.util.Collections;
9 import java.util.List; 9 import java.util.List;
10 import java.util.logging.Logger; 10 import java.util.logging.Logger;
11 11
12 import com.google.gwt.dom.client.AnchorElement; 12 import com.google.gwt.dom.client.AnchorElement;
13 import com.google.gwt.dom.client.Document; 13 import com.google.gwt.dom.client.Document;
14 import com.google.gwt.dom.client.Element; 14 import com.google.gwt.dom.client.Element;
15 import com.google.gwt.dom.client.ImageElement; 15 import com.google.gwt.dom.client.ImageElement;
16 import com.google.gwt.dom.client.Node; 16 import com.google.gwt.dom.client.Node;
17 import com.google.gwt.dom.client.NodeList; 17 import com.google.gwt.dom.client.NodeList;
18 18
19 import de.l3s.boilerpipe.BoilerpipeProcessingException; 19 import de.l3s.boilerpipe.BoilerpipeProcessingException;
20 import de.l3s.boilerpipe.document.TextBlock; 20 import de.l3s.boilerpipe.document.TextBlock;
21 import de.l3s.boilerpipe.document.TextDocument; 21 import de.l3s.boilerpipe.document.TextDocument;
22 import de.l3s.boilerpipe.extractors.CommonExtractors; 22 import de.l3s.boilerpipe.extractors.CommonExtractors;
23 import de.l3s.boilerpipe.labels.DefaultLabels; 23 import de.l3s.boilerpipe.labels.DefaultLabels;
24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; 24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
25 25
26 import org.timepedia.exporter.client.Export; 26 import org.timepedia.exporter.client.Export;
27 import org.timepedia.exporter.client.Exportable; 27 import org.timepedia.exporter.client.Exportable;
28 28
29 import org.xml.sax.AttributesImpl; 29 import org.xml.sax.AttributesImpl;
30 import org.xml.sax.ContentHandler;
30 import org.xml.sax.SAXException; 31 import org.xml.sax.SAXException;
31 32
32 @Export() 33 @Export()
33 public class ContentExtractor implements Exportable { 34 public class ContentExtractor implements Exportable {
34 static Logger logger = Logger.getLogger("DomDistiller"); 35 static Logger logger = Logger.getLogger("DomDistiller");
35 36
36 public static String extractContent() { 37 public static String extractContent() {
37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); 38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();
38 List<Node> textNodes = null; 39 List<Node> textNodes = null;
39 40
40 try { 41 try {
41 htmlParser.startDocument(); 42 htmlParser.startDocument();
42 Element documentElement = Document.get().getDocumentElement(); 43 Element documentElement = Document.get().getDocumentElement();
43 textNodes = DomToSaxParser.parse(documentElement, htmlParser); 44 textNodes = parse(documentElement, htmlParser);
44 htmlParser.endDocument(); 45 htmlParser.endDocument();
45 } catch (SAXException e) { 46 } catch (SAXException e) {
46 logger.warning("Parsing failed."); 47 logger.warning("Parsing failed.");
47 return ""; 48 return "";
48 } 49 }
49 50
50 TextDocument document = htmlParser.toTextDocument(); 51 TextDocument document = htmlParser.toTextDocument();
51 try { 52 try {
52 CommonExtractors.ARTICLE_EXTRACTOR.process(document); 53 CommonExtractors.ARTICLE_EXTRACTOR.process(document);
53 } catch (BoilerpipeProcessingException e) { 54 } catch (BoilerpipeProcessingException e) {
(...skipping 27 matching lines...) Expand all
81 // The base URL in the distilled page viewer is different from that in 82 // The base URL in the distilled page viewer is different from that in
82 // the live page. This breaks all relative links (in anchors and 83 // the live page. This breaks all relative links (in anchors and
83 // images), so make them absolute in the distilled content. 84 // images), so make them absolute in the distilled content.
84 makeAllLinksAbsolute(clonedSubtree); 85 makeAllLinksAbsolute(clonedSubtree);
85 86
86 // TODO(cjhopman): this discards the top element and just returns its children. This might 87 // TODO(cjhopman): this discards the top element and just returns its children. This might
87 // break in some cases. 88 // break in some cases.
88 return Element.as(clonedSubtree).getInnerHTML(); 89 return Element.as(clonedSubtree).getInnerHTML();
89 } 90 }
90 91
92 private static List<Node> parse(Element e, ContentHandler handler) {
93 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);
94 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);
95 new DomWalker(filteringDomVisitor).walk(e);
96 return domToSaxVisitor.getTextNodes();
97 }
98
91 private static void makeAllLinksAbsolute(Node rootNode) { 99 private static void makeAllLinksAbsolute(Node rootNode) {
92 Element root = Element.as(rootNode); 100 Element root = Element.as(rootNode);
93 101
94 // AnchorElement.getHref() and ImageElement.getSrc() both return the 102 // AnchorElement.getHref() and ImageElement.getSrc() both return the
95 // absolute URI, so simply set them as the respective attributes. 103 // absolute URI, so simply set them as the respective attributes.
96 104
97 NodeList<Element> allLinks = root.getElementsByTagName("A"); 105 NodeList<Element> allLinks = root.getElementsByTagName("A");
98 for (int i = 0; i < allLinks.getLength(); i++) { 106 for (int i = 0; i < allLinks.getLength(); i++) {
99 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); 107 AnchorElement link = AnchorElement.as(allLinks.getItem(i));
100 link.setHref(link.getHref()); 108 link.setHref(link.getHref());
101 } 109 }
102 110
103 NodeList<Element> allImages = root.getElementsByTagName("IMG"); 111 NodeList<Element> allImages = root.getElementsByTagName("IMG");
104 for (int i = 0; i < allImages.getLength(); i++) { 112 for (int i = 0; i < allImages.getLength(); i++) {
105 ImageElement image = ImageElement.as(allImages.getItem(i)); 113 ImageElement image = ImageElement.as(allImages.getItem(i));
106 image.setSrc(image.getSrc()); 114 image.setSrc(image.getSrc());
107 } 115 }
108 } 116 }
109 } 117 }
OLDNEW
« no previous file with comments | « no previous file | src/com/dom_distiller/client/DomToSaxParser.java » ('j') | src/com/dom_distiller/client/DomUtil.java » ('J')

Powered by Google App Engine
This is Rietveld 408576698