src/com/dom_distiller/client/ContentExtractor.java - Issue 275493007: filter out invisible elements

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 275493007: filter out invisible elements (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: addressed comments Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package com.dom_distiller.client;	5 package com.dom_distiller.client;

6	6

7 import java.util.ArrayList;	7 import java.util.ArrayList;

8 import java.util.Collections;	8 import java.util.Collections;

9 import java.util.List;	9 import java.util.List;

10 import java.util.logging.Logger;	10 import java.util.logging.Logger;

11	11

12 import com.google.gwt.dom.client.AnchorElement;	12 import com.google.gwt.dom.client.AnchorElement;

13 import com.google.gwt.dom.client.Document;	13 import com.google.gwt.dom.client.Document;

14 import com.google.gwt.dom.client.Element;	14 import com.google.gwt.dom.client.Element;

15 import com.google.gwt.dom.client.ImageElement;	15 import com.google.gwt.dom.client.ImageElement;

16 import com.google.gwt.dom.client.Node;	16 import com.google.gwt.dom.client.Node;

17 import com.google.gwt.dom.client.NodeList;	17 import com.google.gwt.dom.client.NodeList;

18	18

19 import de.l3s.boilerpipe.BoilerpipeProcessingException;	19 import de.l3s.boilerpipe.BoilerpipeProcessingException;

20 import de.l3s.boilerpipe.document.TextBlock;	20 import de.l3s.boilerpipe.document.TextBlock;

21 import de.l3s.boilerpipe.document.TextDocument;	21 import de.l3s.boilerpipe.document.TextDocument;

22 import de.l3s.boilerpipe.extractors.CommonExtractors;	22 import de.l3s.boilerpipe.extractors.CommonExtractors;

23 import de.l3s.boilerpipe.labels.DefaultLabels;	23 import de.l3s.boilerpipe.labels.DefaultLabels;

24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;	24 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;

25	25

26 import org.timepedia.exporter.client.Export;	26 import org.timepedia.exporter.client.Export;

27 import org.timepedia.exporter.client.Exportable;	27 import org.timepedia.exporter.client.Exportable;

28	28

29 import org.xml.sax.AttributesImpl;	29 import org.xml.sax.AttributesImpl;

	30 import org.xml.sax.ContentHandler;

30 import org.xml.sax.SAXException;	31 import org.xml.sax.SAXException;

31	32

32 @Export()	33 @Export()

33 public class ContentExtractor implements Exportable {	34 public class ContentExtractor implements Exportable {

34 static Logger logger = Logger.getLogger("DomDistiller");	35 static Logger logger = Logger.getLogger("DomDistiller");

35	36

36 public static String extractContent() {	37 public static String extractContent() {

37 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();	38 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandler();

38 List<Node> textNodes = null;	39 List<Node> textNodes = null;

39	40

40 try {	41 try {

41 htmlParser.startDocument();	42 htmlParser.startDocument();

42 Element documentElement = Document.get().getDocumentElement();	43 Element documentElement = Document.get().getDocumentElement();

43 textNodes = DomToSaxParser.parse(documentElement, htmlParser);	44 textNodes = parse(documentElement, htmlParser);

44 htmlParser.endDocument();	45 htmlParser.endDocument();

45 } catch (SAXException e) {	46 } catch (SAXException e) {

46 logger.warning("Parsing failed.");	47 logger.warning("Parsing failed.");

47 return "";	48 return "";

48 }	49 }

49	50

50 TextDocument document = htmlParser.toTextDocument();	51 TextDocument document = htmlParser.toTextDocument();

51 try {	52 try {

52 CommonExtractors.ARTICLE_EXTRACTOR.process(document);	53 CommonExtractors.ARTICLE_EXTRACTOR.process(document);

53 } catch (BoilerpipeProcessingException e) {	54 } catch (BoilerpipeProcessingException e) {

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
85 // The base URL in the distilled page viewer is different from that in	86 // The base URL in the distilled page viewer is different from that in

86 // the live page. This breaks all relative links (in anchors and	87 // the live page. This breaks all relative links (in anchors and

87 // images), so make them absolute in the distilled content.	88 // images), so make them absolute in the distilled content.

88 makeAllLinksAbsolute(clonedSubtree);	89 makeAllLinksAbsolute(clonedSubtree);

89	90

90 // TODO(cjhopman): this discards the top element and just returns its children. This might	91 // TODO(cjhopman): this discards the top element and just returns its children. This might

91 // break in some cases.	92 // break in some cases.

92 return Element.as(clonedSubtree).getInnerHTML();	93 return Element.as(clonedSubtree).getInnerHTML();

93 }	94 }

94	95

	96 private static List<Node> parse(Element e, ContentHandler handler) {

	97 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);

	98 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);

	99 new DomWalker(filteringDomVisitor).walk(e);

	100 return domToSaxVisitor.getTextNodes();

	101 }

	102

95 private static void makeAllLinksAbsolute(Node rootNode) {	103 private static void makeAllLinksAbsolute(Node rootNode) {

96 Element root = Element.as(rootNode);	104 Element root = Element.as(rootNode);

97	105

98 // AnchorElement.getHref() and ImageElement.getSrc() both return the	106 // AnchorElement.getHref() and ImageElement.getSrc() both return the

99 // absolute URI, so simply set them as the respective attributes.	107 // absolute URI, so simply set them as the respective attributes.

100	108

101 NodeList<Element> allLinks = root.getElementsByTagName("A");	109 NodeList<Element> allLinks = root.getElementsByTagName("A");

102 for (int i = 0; i < allLinks.getLength(); i++) {	110 for (int i = 0; i < allLinks.getLength(); i++) {

103 AnchorElement link = AnchorElement.as(allLinks.getItem(i));	111 AnchorElement link = AnchorElement.as(allLinks.getItem(i));

104 link.setHref(link.getHref());	112 link.setHref(link.getHref());

105 }	113 }

106	114

107 NodeList<Element> allImages = root.getElementsByTagName("IMG");	115 NodeList<Element> allImages = root.getElementsByTagName("IMG");

108 for (int i = 0; i < allImages.getLength(); i++) {	116 for (int i = 0; i < allImages.getLength(); i++) {

109 ImageElement image = ImageElement.as(allImages.getItem(i));	117 ImageElement image = ImageElement.as(allImages.getItem(i));

110 image.setSrc(image.getSrc());	118 image.setSrc(image.getSrc());

111 }	119 }

112 }	120 }

113 }	121 }

OLD	NEW

« no previous file with comments | « no previous file | src/com/dom_distiller/client/DomToSaxParser.java » ('j') | no next file with comments »