Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(391)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 275493007: filter out invisible elements (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: addressed comments Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/com/dom_distiller/client/DomToSaxParser.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import java.util.ArrayList; 7 import java.util.ArrayList;
8 import java.util.Collections; 8 import java.util.Collections;
9 import java.util.List; 9 import java.util.List;
10 import java.util.logging.Logger; 10 import java.util.logging.Logger;
11 11
12 import com.google.gwt.dom.client.AnchorElement; 12 import com.google.gwt.dom.client.AnchorElement;
13 import com.google.gwt.dom.client.Document; 13 import com.google.gwt.dom.client.Document;
14 import com.google.gwt.dom.client.Element; 14 import com.google.gwt.dom.client.Element;
15 import com.google.gwt.dom.client.ImageElement; 15 import com.google.gwt.dom.client.ImageElement;
16 import com.google.gwt.dom.client.Node; 16 import com.google.gwt.dom.client.Node;
17 import com.google.gwt.dom.client.NodeList; 17 import com.google.gwt.dom.client.NodeList;
18 18
19 import de.l3s.boilerpipe.BoilerpipeProcessingException; 19 import de.l3s.boilerpipe.BoilerpipeProcessingException;
20 import de.l3s.boilerpipe.document.TextBlock; 20 import de.l3s.boilerpipe.document.TextBlock;
21 import de.l3s.boilerpipe.document.TextDocument; 21 import de.l3s.boilerpipe.document.TextDocument;
22 import de.l3s.boilerpipe.extractors.CommonExtractors; 22 import de.l3s.boilerpipe.extractors.CommonExtractors;
23 import de.l3s.boilerpipe.labels.DefaultLabels; 23 import de.l3s.boilerpipe.labels.DefaultLabels;
24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; 24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
25 25
26 import org.timepedia.exporter.client.Export; 26 import org.timepedia.exporter.client.Export;
27 import org.timepedia.exporter.client.Exportable; 27 import org.timepedia.exporter.client.Exportable;
28 28
29 import org.xml.sax.AttributesImpl; 29 import org.xml.sax.AttributesImpl;
30 import org.xml.sax.ContentHandler;
30 import org.xml.sax.SAXException; 31 import org.xml.sax.SAXException;
31 32
32 @Export() 33 @Export()
33 public class ContentExtractor implements Exportable { 34 public class ContentExtractor implements Exportable {
34 static Logger logger = Logger.getLogger("DomDistiller"); 35 static Logger logger = Logger.getLogger("DomDistiller");
35 36
36 public static String extractContent() { 37 public static String extractContent() {
37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler(); 38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();
38 List<Node> textNodes = null; 39 List<Node> textNodes = null;
39 40
40 try { 41 try {
41 htmlParser.startDocument(); 42 htmlParser.startDocument();
42 Element documentElement = Document.get().getDocumentElement(); 43 Element documentElement = Document.get().getDocumentElement();
43 textNodes = DomToSaxParser.parse(documentElement, htmlParser); 44 textNodes = parse(documentElement, htmlParser);
44 htmlParser.endDocument(); 45 htmlParser.endDocument();
45 } catch (SAXException e) { 46 } catch (SAXException e) {
46 logger.warning("Parsing failed."); 47 logger.warning("Parsing failed.");
47 return ""; 48 return "";
48 } 49 }
49 50
50 TextDocument document = htmlParser.toTextDocument(); 51 TextDocument document = htmlParser.toTextDocument();
51 try { 52 try {
52 CommonExtractors.ARTICLE_EXTRACTOR.process(document); 53 CommonExtractors.ARTICLE_EXTRACTOR.process(document);
53 } catch (BoilerpipeProcessingException e) { 54 } catch (BoilerpipeProcessingException e) {
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
85 // The base URL in the distilled page viewer is different from that in 86 // The base URL in the distilled page viewer is different from that in
86 // the live page. This breaks all relative links (in anchors and 87 // the live page. This breaks all relative links (in anchors and
87 // images), so make them absolute in the distilled content. 88 // images), so make them absolute in the distilled content.
88 makeAllLinksAbsolute(clonedSubtree); 89 makeAllLinksAbsolute(clonedSubtree);
89 90
90 // TODO(cjhopman): this discards the top element and just returns its children. This might 91 // TODO(cjhopman): this discards the top element and just returns its children. This might
91 // break in some cases. 92 // break in some cases.
92 return Element.as(clonedSubtree).getInnerHTML(); 93 return Element.as(clonedSubtree).getInnerHTML();
93 } 94 }
94 95
96 private static List<Node> parse(Element e, ContentHandler handler) {
97 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);
98 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);
99 new DomWalker(filteringDomVisitor).walk(e);
100 return domToSaxVisitor.getTextNodes();
101 }
102
95 private static void makeAllLinksAbsolute(Node rootNode) { 103 private static void makeAllLinksAbsolute(Node rootNode) {
96 Element root = Element.as(rootNode); 104 Element root = Element.as(rootNode);
97 105
98 // AnchorElement.getHref() and ImageElement.getSrc() both return the 106 // AnchorElement.getHref() and ImageElement.getSrc() both return the
99 // absolute URI, so simply set them as the respective attributes. 107 // absolute URI, so simply set them as the respective attributes.
100 108
101 NodeList<Element> allLinks = root.getElementsByTagName("A"); 109 NodeList<Element> allLinks = root.getElementsByTagName("A");
102 for (int i = 0; i < allLinks.getLength(); i++) { 110 for (int i = 0; i < allLinks.getLength(); i++) {
103 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); 111 AnchorElement link = AnchorElement.as(allLinks.getItem(i));
104 link.setHref(link.getHref()); 112 link.setHref(link.getHref());
105 } 113 }
106 114
107 NodeList<Element> allImages = root.getElementsByTagName("IMG"); 115 NodeList<Element> allImages = root.getElementsByTagName("IMG");
108 for (int i = 0; i < allImages.getLength(); i++) { 116 for (int i = 0; i < allImages.getLength(); i++) {
109 ImageElement image = ImageElement.as(allImages.getItem(i)); 117 ImageElement image = ImageElement.as(allImages.getItem(i));
110 image.setSrc(image.getSrc()); 118 image.setSrc(image.getSrc());
111 } 119 }
112 } 120 }
113 } 121 }
OLDNEW
« no previous file with comments | « no previous file | src/com/dom_distiller/client/DomToSaxParser.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698