src/com/dom_distiller/client/ContentExtractor.java - Issue 275493007: filter out invisible elements

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 275493007: filter out invisible elements (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: addressed comments Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package com.dom_distiller.client;	5 package com.dom_distiller.client;

6	6

7 import java.util.ArrayList;	7 import java.util.ArrayList;

8 import java.util.Collections;	8 import java.util.Collections;

9 import java.util.List;	9 import java.util.List;

10 import java.util.logging.Logger;	10 import java.util.logging.Logger;

11	11

12 import com.google.gwt.dom.client.AnchorElement;	12 import com.google.gwt.dom.client.AnchorElement;

13 import com.google.gwt.dom.client.Document;	13 import com.google.gwt.dom.client.Document;

14 import com.google.gwt.dom.client.Element;	14 import com.google.gwt.dom.client.Element;

15 import com.google.gwt.dom.client.ImageElement;	15 import com.google.gwt.dom.client.ImageElement;

16 import com.google.gwt.dom.client.Node;	16 import com.google.gwt.dom.client.Node;

17 import com.google.gwt.dom.client.NodeList;	17 import com.google.gwt.dom.client.NodeList;

18	18

19 import de.l3s.boilerpipe.BoilerpipeProcessingException;	19 import de.l3s.boilerpipe.BoilerpipeProcessingException;

20 import de.l3s.boilerpipe.document.TextBlock;	20 import de.l3s.boilerpipe.document.TextBlock;

21 import de.l3s.boilerpipe.document.TextDocument;	21 import de.l3s.boilerpipe.document.TextDocument;

22 import de.l3s.boilerpipe.extractors.CommonExtractors;	22 import de.l3s.boilerpipe.extractors.CommonExtractors;

23 import de.l3s.boilerpipe.labels.DefaultLabels;	23 import de.l3s.boilerpipe.labels.DefaultLabels;

24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;	24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;

25	25

26 import org.timepedia.exporter.client.Export;	26 import org.timepedia.exporter.client.Export;

27 import org.timepedia.exporter.client.Exportable;	27 import org.timepedia.exporter.client.Exportable;

28	28

29 import org.xml.sax.AttributesImpl;	29 import org.xml.sax.AttributesImpl;

	30 import org.xml.sax.ContentHandler;

30 import org.xml.sax.SAXException;	31 import org.xml.sax.SAXException;

31	32

32 @Export()	33 @Export()

33 public class ContentExtractor implements Exportable {	34 public class ContentExtractor implements Exportable {

34 static Logger logger = Logger.getLogger("DomDistiller");	35 static Logger logger = Logger.getLogger("DomDistiller");

35	36

36 public static String extractContent() {	37 public static String extractContent() {

37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();	38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();

38 List<Node> textNodes = null;	39 List<Node> textNodes = null;

39	40

40 try {	41 try {

41 htmlParser.startDocument();	42 htmlParser.startDocument();

42 Element documentElement = Document.get().getDocumentElement();	43 Element documentElement = Document.get().getDocumentElement();

43 textNodes = DomToSaxParser.parse(documentElement, htmlParser);	44 textNodes = parse(documentElement, htmlParser);

44 htmlParser.endDocument();	45 htmlParser.endDocument();

45 } catch (SAXException e) {	46 } catch (SAXException e) {

46 logger.warning("Parsing failed.");	47 logger.warning("Parsing failed.");

47 return "";	48 return "";

48 }	49 }

49	50

50 TextDocument document = htmlParser.toTextDocument();	51 TextDocument document = htmlParser.toTextDocument();

51 try {	52 try {

52 CommonExtractors.ARTICLE_EXTRACTOR.process(document);	53 CommonExtractors.ARTICLE_EXTRACTOR.process(document);

53 } catch (BoilerpipeProcessingException e) {	54 } catch (BoilerpipeProcessingException e) {

(...skipping 27 matching lines...) Expand all Loading...
81 // The base URL in the distilled page viewer is different from that in	82 // The base URL in the distilled page viewer is different from that in

82 // the live page. This breaks all relative links (in anchors and	83 // the live page. This breaks all relative links (in anchors and

83 // images), so make them absolute in the distilled content.	84 // images), so make them absolute in the distilled content.

84 makeAllLinksAbsolute(clonedSubtree);	85 makeAllLinksAbsolute(clonedSubtree);

85	86

86 // TODO(cjhopman): this discards the top element and just returns its children. This might	87 // TODO(cjhopman): this discards the top element and just returns its children. This might

87 // break in some cases.	88 // break in some cases.

88 return Element.as(clonedSubtree).getInnerHTML();	89 return Element.as(clonedSubtree).getInnerHTML();

89 }	90 }

90	91

	92 private static List<Node> parse(Element e, ContentHandler handler) {

	93 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);

	94 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);

	95 new DomWalker(filteringDomVisitor).walk(e);

	96 return domToSaxVisitor.getTextNodes();

	97 }

	98

91 private static void makeAllLinksAbsolute(Node rootNode) {	99 private static void makeAllLinksAbsolute(Node rootNode) {

92 Element root = Element.as(rootNode);	100 Element root = Element.as(rootNode);

93	101

94 // AnchorElement.getHref() and ImageElement.getSrc() both return the	102 // AnchorElement.getHref() and ImageElement.getSrc() both return the

95 // absolute URI, so simply set them as the respective attributes.	103 // absolute URI, so simply set them as the respective attributes.

96	104

97 NodeList<Element> allLinks = root.getElementsByTagName("A");	105 NodeList<Element> allLinks = root.getElementsByTagName("A");

98 for (int i = 0; i < allLinks.getLength(); i++) {	106 for (int i = 0; i < allLinks.getLength(); i++) {

99 AnchorElement link = AnchorElement.as(allLinks.getItem(i));	107 AnchorElement link = AnchorElement.as(allLinks.getItem(i));

100 link.setHref(link.getHref());	108 link.setHref(link.getHref());

101 }	109 }

102	110

103 NodeList<Element> allImages = root.getElementsByTagName("IMG");	111 NodeList<Element> allImages = root.getElementsByTagName("IMG");

104 for (int i = 0; i < allImages.getLength(); i++) {	112 for (int i = 0; i < allImages.getLength(); i++) {

105 ImageElement image = ImageElement.as(allImages.getItem(i));	113 ImageElement image = ImageElement.as(allImages.getItem(i));

106 image.setSrc(image.getSrc());	114 image.setSrc(image.getSrc());

107 }	115 }

108 }	116 }

109 }	117 }

OLD	NEW

« no previous file with comments | « no previous file | src/com/dom_distiller/client/DomToSaxParser.java » ('j') | src/com/dom_distiller/client/DomUtil.java » ('J')