Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(127)

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 290993004: Fix final content and title extraction. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: changed title handling, added test Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package com.dom_distiller.client; 5 package com.dom_distiller.client;
6 6
7 import com.google.gwt.dom.client.AnchorElement; 7 import com.google.gwt.dom.client.AnchorElement;
8 import com.google.gwt.dom.client.Document; 8 import com.google.gwt.dom.client.Document;
9 import com.google.gwt.dom.client.Element; 9 import com.google.gwt.dom.client.Element;
10 import com.google.gwt.dom.client.ImageElement; 10 import com.google.gwt.dom.client.ImageElement;
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes); 66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);
67 67
68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages( 68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(
69 contentNodes, Document.get().getDocumentElement()); 69 contentNodes, Document.get().getDocumentElement());
70 70
71 if (contentAndImages.isEmpty()) { 71 if (contentAndImages.isEmpty()) {
72 return ""; 72 return "";
73 } 73 }
74 74
75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree(); 75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();
76 76 LogUtil.logToConsole(clonedSubtree.getNodeType() + " node:" + Node.ELEMENT_NODE);
cjhopman 2014/05/22 00:27:21 probably don't need this. If it's been useful, at
Yaron 2014/05/22 17:05:24 err right that was supposed to go. I was trying to
77 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { 77 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {
78 return ""; 78 return "";
79 } 79 }
80 80
81 // The base URL in the distilled page viewer is different from that in 81 // The base URL in the distilled page viewer is different from that in
82 // the live page. This breaks all relative links (in anchors and 82 // the live page. This breaks all relative links (in anchors and
83 // images), so make them absolute in the distilled content. 83 // images), so make them absolute in the distilled content.
84 makeAllLinksAbsolute(clonedSubtree); 84 makeAllLinksAbsolute(clonedSubtree);
85 85
86 // TODO(cjhopman): this discards the top element and just returns its children. This might 86 // TODO(cjhopman): this discards the top element and just returns its children. This might
87 // break in some cases. 87 // break in some cases.
88 return Element.as(clonedSubtree).getInnerHTML(); 88 return Element.as(clonedSubtree).getInnerHTML();
89 } 89 }
90 90
91 private static List<Node> parse(Element e, ContentHandler handler) { 91 private static List<Node> parse(Element e, ContentHandler handler) {
92 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler); 92 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);
93 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor); 93 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);
94 new DomWalker(filteringDomVisitor).walk(e); 94 new DomWalker(filteringDomVisitor).walk(e);
95 return domToSaxVisitor.getTextNodes(); 95 return domToSaxVisitor.getTextNodes();
96 } 96 }
97 97
98 private static List<Node> getContentNodesForTextDocument( 98 private static List<Node> getContentNodesForTextDocument(
99 TextDocument document, List<Node> textNodes) { 99 TextDocument document, List<Node> textNodes) {
100 List<Integer> contentTextIndexes = new ArrayList<Integer>(); 100 List<Integer> contentTextIndexes = new ArrayList<Integer>();
101 for (TextBlock tb : document.getTextBlocks()) { 101 for (TextBlock tb : document.getTextBlocks()) {
102 if (!tb.isContent()) {
103 continue;
104 }
102 if (!tb.hasLabel(DefaultLabels.TITLE)) { 105 if (!tb.hasLabel(DefaultLabels.TITLE)) {
103 contentTextIndexes.addAll(tb.getContainedTextElements()); 106 contentTextIndexes.addAll(tb.getContainedTextElements());
104 } 107 }
105 } 108 }
106 Collections.sort(contentTextIndexes); 109 Collections.sort(contentTextIndexes);
107 110
108 // Boilerpipe's text node indexes start at 1. 111 // Boilerpipe's text node indexes start at 1.
109 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size()); 112 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());
110 for (Integer i : contentTextIndexes) { 113 for (Integer i : contentTextIndexes) {
111 contentNodes.add(textNodes.get(i - 1)); 114 contentNodes.add(textNodes.get(i - 1));
(...skipping 13 matching lines...) Expand all
125 link.setHref(link.getHref()); 128 link.setHref(link.getHref());
126 } 129 }
127 130
128 NodeList<Element> allImages = root.getElementsByTagName("IMG"); 131 NodeList<Element> allImages = root.getElementsByTagName("IMG");
129 for (int i = 0; i < allImages.getLength(); i++) { 132 for (int i = 0; i < allImages.getLength(); i++) {
130 ImageElement image = ImageElement.as(allImages.getItem(i)); 133 ImageElement image = ImageElement.as(allImages.getItem(i));
131 image.setSrc(image.getSrc()); 134 image.setSrc(image.getSrc());
132 } 135 }
133 } 136 }
134 } 137 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698