src/com/dom_distiller/client/ContentExtractor.java - Issue 322553005: Improve handling of <video>, <figure> and <br>

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 322553005: Improve handling of <video>, <figure> and <br> (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package com.dom_distiller.client;	5 package com.dom_distiller.client;

6	6

7 import com.google.gwt.dom.client.AnchorElement;	7 import com.google.gwt.dom.client.AnchorElement;

8 import com.google.gwt.dom.client.Document;	8 import com.google.gwt.dom.client.Document;

9 import com.google.gwt.dom.client.Element;	9 import com.google.gwt.dom.client.Element;

10 import com.google.gwt.dom.client.ImageElement;

11 import com.google.gwt.dom.client.Node;	10 import com.google.gwt.dom.client.Node;

12 import com.google.gwt.dom.client.NodeList;	11 import com.google.gwt.dom.client.NodeList;

13	12

14 import de.l3s.boilerpipe.BoilerpipeProcessingException;	13 import de.l3s.boilerpipe.BoilerpipeProcessingException;

15 import de.l3s.boilerpipe.document.TextBlock;	14 import de.l3s.boilerpipe.document.TextBlock;

16 import de.l3s.boilerpipe.document.TextDocument;	15 import de.l3s.boilerpipe.document.TextDocument;

17 import de.l3s.boilerpipe.extractors.CommonExtractors;	16 import de.l3s.boilerpipe.extractors.CommonExtractors;

18 import de.l3s.boilerpipe.labels.DefaultLabels;	17 import de.l3s.boilerpipe.labels.DefaultLabels;

19 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;	18 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;

20	19

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
66 return "";	65 return "";

67 }	66 }

68	67

69 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements).cloneSubtree();	68 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements).cloneSubtree();

70	69

71 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {	70 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {

72 return "";	71 return "";

73 }	72 }

74	73

75 // The base URL in the distilled page viewer is different from that in	74 // The base URL in the distilled page viewer is different from that in

76 // the live page. This breaks all relative links (in anchors and	75 // the live page. This breaks all relative links (in anchors,

77 // images), so make them absolute in the distilled content.	76 // images, etc.), so make them absolute in the distilled content.

78 makeAllLinksAbsolute(clonedSubtree);	77 makeAllLinksAbsolute(clonedSubtree);

79	78

80 // TODO(cjhopman): this discards the top element and just returns its children. This might	79 // TODO(cjhopman): this discards the top element and just returns its children. This might

81 // break in some cases.	80 // break in some cases.

82 return Element.as(clonedSubtree).getInnerHTML();	81 return Element.as(clonedSubtree).getInnerHTML();

83 }	82 }

84	83

85 private static List<Node> getContentNodesForTextDocument(TextDocument document) {	84 private static List<Node> getContentNodesForTextDocument(TextDocument document) {

86 List<Node> contentTextNodes = new ArrayList<Node>();	85 List<Node> contentTextNodes = new ArrayList<Node>();

87 for (TextBlock tb : document.getTextBlocks()) {	86 for (TextBlock tb : document.getTextBlocks()) {

(...skipping 11 matching lines...) Expand all Loading...
99 Element root = Element.as(rootNode);	98 Element root = Element.as(rootNode);

100	99

101 // AnchorElement.getHref() and ImageElement.getSrc() both return the	100 // AnchorElement.getHref() and ImageElement.getSrc() both return the

102 // absolute URI, so simply set them as the respective attributes.	101 // absolute URI, so simply set them as the respective attributes.

103	102

104 NodeList<Element> allLinks = root.getElementsByTagName("A");	103 NodeList<Element> allLinks = root.getElementsByTagName("A");

105 for (int i = 0; i < allLinks.getLength(); i++) {	104 for (int i = 0; i < allLinks.getLength(); i++) {

106 AnchorElement link = AnchorElement.as(allLinks.getItem(i));	105 AnchorElement link = AnchorElement.as(allLinks.getItem(i));

107 link.setHref(link.getHref());	106 link.setHref(link.getHref());

108 }	107 }

	108 makeAllSrcAttributesAbsolute(root);

	109 }

109	110

110 NodeList<Element> allImages = root.getElementsByTagName("IMG");	111 private static native void makeAllSrcAttributesAbsolute(Element root) /*-{

111 for (int i = 0; i < allImages.getLength(); i++) {	112 var elementsWithSrc = root.querySelectorAll('img,source,track,video');

112 ImageElement image = ImageElement.as(allImages.getItem(i));	113 for (var key in elementsWithSrc) {

113 image.setSrc(image.getSrc());	114 elementsWithSrc[key].src = elementsWithSrc[key].src;

114 }	115 }

115 }	116 }-*/;

116 }	117 }

OLD	NEW

« no previous file with comments | « no previous file | src/com/dom_distiller/client/FilteringDomVisitor.java » ('j') | no next file with comments »