Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(205)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 290993004: Fix final content and title extraction. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import com.google.gwt.dom.client.AnchorElement; 7 import com.google.gwt.dom.client.AnchorElement;
8 import com.google.gwt.dom.client.Document; 8 import com.google.gwt.dom.client.Document;
9 import com.google.gwt.dom.client.Element; 9 import com.google.gwt.dom.client.Element;
10 import com.google.gwt.dom.client.ImageElement; 10 import com.google.gwt.dom.client.ImageElement;
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes); 66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);
67 67
68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages( 68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(
69 contentNodes, Document.get().getDocumentElement()); 69 contentNodes, Document.get().getDocumentElement());
70 70
71 if (contentAndImages.isEmpty()) { 71 if (contentAndImages.isEmpty()) {
72 return ""; 72 return "";
73 } 73 }
74 74
75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree(); 75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();
76
77 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { 76 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {
78 return ""; 77 return "";
79 } 78 }
80 79
81 // The base URL in the distilled page viewer is different from that in 80 // The base URL in the distilled page viewer is different from that in
82 // the live page. This breaks all relative links (in anchors and 81 // the live page. This breaks all relative links (in anchors and
83 // images), so make them absolute in the distilled content. 82 // images), so make them absolute in the distilled content.
84 makeAllLinksAbsolute(clonedSubtree); 83 makeAllLinksAbsolute(clonedSubtree);
85 84
86 // TODO(cjhopman): this discards the top element and just returns its children. This might 85 // TODO(cjhopman): this discards the top element and just returns its children. This might
87 // break in some cases. 86 // break in some cases.
88 return Element.as(clonedSubtree).getInnerHTML(); 87 return Element.as(clonedSubtree).getInnerHTML();
89 } 88 }
90 89
91 private static List<Node> parse(Element e, ContentHandler handler) { 90 private static List<Node> parse(Element e, ContentHandler handler) {
92 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); 91 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);
93 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor); 92 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);
94 new DomWalker(filteringDomVisitor).walk(e); 93 new DomWalker(filteringDomVisitor).walk(e);
95 return domToSaxVisitor.getTextNodes(); 94 return domToSaxVisitor.getTextNodes();
96 } 95 }
97 96
98 private static List<Node> getContentNodesForTextDocument( 97 private static List<Node> getContentNodesForTextDocument(
99 TextDocument document, List<Node> textNodes) { 98 TextDocument document, List<Node> textNodes) {
100 List<Integer> contentTextIndexes = new ArrayList<Integer>(); 99 List<Integer> contentTextIndexes = new ArrayList<Integer>();
101 for (TextBlock tb : document.getTextBlocks()) { 100 for (TextBlock tb : document.getTextBlocks()) {
101 if (!tb.isContent()) {
102 continue;
103 }
102 if (!tb.hasLabel(DefaultLabels.TITLE)) { 104 if (!tb.hasLabel(DefaultLabels.TITLE)) {
103 contentTextIndexes.addAll(tb.getContainedTextElements()); 105 contentTextIndexes.addAll(tb.getContainedTextElements());
104 } 106 }
105 } 107 }
106 Collections.sort(contentTextIndexes); 108 Collections.sort(contentTextIndexes);
107 109
108 // Boilerpipe's text node indexes start at 1. 110 // Boilerpipe's text node indexes start at 1.
109 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size()); 111 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
110 for (Integer i : contentTextIndexes) { 112 for (Integer i : contentTextIndexes) {
111 contentNodes.add(textNodes.get(i - 1)); 113 contentNodes.add(textNodes.get(i - 1));
(...skipping 13 matching lines...) Expand all
125 link.setHref(link.getHref()); 127 link.setHref(link.getHref());
126 } 128 }
127 129
128 NodeList<Element> allImages = root.getElementsByTagName("IMG"); 130 NodeList<Element> allImages = root.getElementsByTagName("IMG");
129 for (int i = 0; i < allImages.getLength(); i++) { 131 for (int i = 0; i < allImages.getLength(); i++) {
130 ImageElement image = ImageElement.as(allImages.getItem(i)); 132 ImageElement image = ImageElement.as(allImages.getItem(i));
131 image.setSrc(image.getSrc()); 133 image.setSrc(image.getSrc());
132 } 134 }
133 } 135 }
134 } 136 }
OLD NEW

Powered by Google App Engine
This is Rietveld 408576698