Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(367)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 322553005: Improve handling of <video>, <figure> and <br> (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/com/dom_distiller/client/FilteringDomVisitor.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import com.google.gwt.dom.client.AnchorElement; 7 import com.google.gwt.dom.client.AnchorElement;
8 import com.google.gwt.dom.client.Document; 8 import com.google.gwt.dom.client.Document;
9 import com.google.gwt.dom.client.Element; 9 import com.google.gwt.dom.client.Element;
10 import com.google.gwt.dom.client.ImageElement;
11 import com.google.gwt.dom.client.Node; 10 import com.google.gwt.dom.client.Node;
12 import com.google.gwt.dom.client.NodeList; 11 import com.google.gwt.dom.client.NodeList;
13 12
14 import de.l3s.boilerpipe.BoilerpipeProcessingException; 13 import de.l3s.boilerpipe.BoilerpipeProcessingException;
15 import de.l3s.boilerpipe.document.TextBlock; 14 import de.l3s.boilerpipe.document.TextBlock;
16 import de.l3s.boilerpipe.document.TextDocument; 15 import de.l3s.boilerpipe.document.TextDocument;
17 import de.l3s.boilerpipe.extractors.CommonExtractors; 16 import de.l3s.boilerpipe.extractors.CommonExtractors;
18 import de.l3s.boilerpipe.labels.DefaultLabels; 17 import de.l3s.boilerpipe.labels.DefaultLabels;
19 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; 18 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
20 19
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 return ""; 65 return "";
67 } 66 }
68 67
69 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements).cloneSubtree(); 68 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements).cloneSubtree();
70 69
71 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { 70 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {
72 return ""; 71 return "";
73 } 72 }
74 73
75 // The base URL in the distilled page viewer is different from that in 74 // The base URL in the distilled page viewer is different from that in
76 // the live page. This breaks all relative links (in anchors and 75 // the live page. This breaks all relative links (in anchors,
77 // images), so make them absolute in the distilled content. 76 // images, etc.), so make them absolute in the distilled content.
78 makeAllLinksAbsolute(clonedSubtree); 77 makeAllLinksAbsolute(clonedSubtree);
79 78
80 // TODO(cjhopman): this discards the top element and just returns its children. This might 79 // TODO(cjhopman): this discards the top element and just returns its children. This might
81 // break in some cases. 80 // break in some cases.
82 return Element.as(clonedSubtree).getInnerHTML(); 81 return Element.as(clonedSubtree).getInnerHTML();
83 } 82 }
84 83
85 private static List<Node> getContentNodesForTextDocument(TextDocument document) { 84 private static List<Node> getContentNodesForTextDocument(TextDocument document) {
86 List<Node> contentTextNodes = new ArrayList<Node>(); 85 List<Node> contentTextNodes = new ArrayList<Node>();
87 for (TextBlock tb : document.getTextBlocks()) { 86 for (TextBlock tb : document.getTextBlocks()) {
(...skipping 11 matching lines...) Expand all
99 Element root = Element.as(rootNode); 98 Element root = Element.as(rootNode);
100 99
101 // AnchorElement.getHref() and ImageElement.getSrc() both return the 100 // AnchorElement.getHref() and ImageElement.getSrc() both return the
102 // absolute URI, so simply set them as the respective attributes. 101 // absolute URI, so simply set them as the respective attributes.
103 102
104 NodeList<Element> allLinks = root.getElementsByTagName("A"); 103 NodeList<Element> allLinks = root.getElementsByTagName("A");
105 for (int i = 0; i < allLinks.getLength(); i++) { 104 for (int i = 0; i < allLinks.getLength(); i++) {
106 AnchorElement link = AnchorElement.as(allLinks.getItem(i)); 105 AnchorElement link = AnchorElement.as(allLinks.getItem(i));
107 link.setHref(link.getHref()); 106 link.setHref(link.getHref());
108 } 107 }
108 makeAllSrcAttributesAbsolute(root);
109 }
109 110
110 NodeList<Element> allImages = root.getElementsByTagName("IMG"); 111 private static native void makeAllSrcAttributesAbsolute(Element root) /*-{
111 for (int i = 0; i < allImages.getLength(); i++) { 112 var elementsWithSrc = root.querySelectorAll('img,source,track,video');
112 ImageElement image = ImageElement.as(allImages.getItem(i)); 113 for (var key in elementsWithSrc) {
113 image.setSrc(image.getSrc()); 114 elementsWithSrc[key].src = elementsWithSrc[key].src;
114 } 115 }
115 } 116 }-*/;
116 } 117 }
OLDNEW
« no previous file with comments | « no previous file | src/com/dom_distiller/client/FilteringDomVisitor.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698