src/com/dom_distiller/client/ContentExtractor.java - Issue 290993004: Fix final content and title extraction.

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 290993004: Fix final content and title extraction. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: changed title handling, added test Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « boilerpipe-core/src/main/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java ('k') | test/com/dom_distiller/client/ContentExtractorTest.java » ('j') | test/com/dom_distiller/client/ContentExtractorTest.java » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package com.dom_distiller.client;	5 package com.dom_distiller.client;

6	6

7 import com.google.gwt.dom.client.AnchorElement;	7 import com.google.gwt.dom.client.AnchorElement;

8 import com.google.gwt.dom.client.Document;	8 import com.google.gwt.dom.client.Document;

9 import com.google.gwt.dom.client.Element;	9 import com.google.gwt.dom.client.Element;

10 import com.google.gwt.dom.client.ImageElement;	10 import com.google.gwt.dom.client.ImageElement;

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);	66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);

67	67

68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(	68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(

69 contentNodes, Document.get().getDocumentElement());	69 contentNodes, Document.get().getDocumentElement());

70	70

71 if (contentAndImages.isEmpty()) {	71 if (contentAndImages.isEmpty()) {

72 return "";	72 return "";

73 }	73 }

74	74

75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();	75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();

76	76 LogUtil.logToConsole(clonedSubtree.getNodeType() + " node:" + Node.ELEMENT_NODE);
	cjhopman 2014/05/22 00:27:21 probably don't need this. If it's been useful, at least guard it with some debug flag. Yaron 2014/05/22 17:05:24 On 2014/05/22 00:27:21, cjhopman wrote: > probably don't need this. If it's been useful, at least guard it with some debug flag. Err, right, that was supposed to go. I was trying to investigate something with small pages. I believe you can fail the below test if we return just a single text node, but I can't repro it now.
77 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {	77 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {

78 return "";	78 return "";

79 }	79 }

80	80

81 // The base URL in the distilled page viewer is different from that in	81 // The base URL in the distilled page viewer is different from that in

82 // the live page. This breaks all relative links (in anchors and	82 // the live page. This breaks all relative links (in anchors and

83 // images), so make them absolute in the distilled content.	83 // images), so make them absolute in the distilled content.

84 makeAllLinksAbsolute(clonedSubtree);	84 makeAllLinksAbsolute(clonedSubtree);

85	85

86 // TODO(cjhopman): this discards the top element and just returns its children. This might	86 // TODO(cjhopman): this discards the top element and just returns its children. This might

87 // break in some cases.	87 // break in some cases.

88 return Element.as(clonedSubtree).getInnerHTML();	88 return Element.as(clonedSubtree).getInnerHTML();

89 }	89 }

90	90

91 private static List<Node> parse(Element e, ContentHandler handler) {	91 private static List<Node> parse(Element e, ContentHandler handler) {

92 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);	92 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);

93 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);	93 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);

94 new DomWalker(filteringDomVisitor).walk(e);	94 new DomWalker(filteringDomVisitor).walk(e);

95 return domToSaxVisitor.getTextNodes();	95 return domToSaxVisitor.getTextNodes();

96 }	96 }

97	97

98 private static List<Node> getContentNodesForTextDocument(	98 private static List<Node> getContentNodesForTextDocument(

99 TextDocument document, List<Node> textNodes) {	99 TextDocument document, List<Node> textNodes) {

100 List<Integer> contentTextIndexes = new ArrayList<Integer>();	100 List<Integer> contentTextIndexes = new ArrayList<Integer>();

101 for (TextBlock tb : document.getTextBlocks()) {	101 for (TextBlock tb : document.getTextBlocks()) {

	102 if (!tb.isContent()) {

	103 continue;

	104 }

102 if (!tb.hasLabel(DefaultLabels.TITLE)) {	105 if (!tb.hasLabel(DefaultLabels.TITLE)) {

103 contentTextIndexes.addAll(tb.getContainedTextElements());	106 contentTextIndexes.addAll(tb.getContainedTextElements());

104 }	107 }

105 }	108 }

106 Collections.sort(contentTextIndexes);	109 Collections.sort(contentTextIndexes);

107	110

108 // Boilerpipe's text node indexes start at 1.	111 // Boilerpipe's text node indexes start at 1.

109 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());	112 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());

110 for (Integer i : contentTextIndexes) {	113 for (Integer i : contentTextIndexes) {

111 contentNodes.add(textNodes.get(i - 1));	114 contentNodes.add(textNodes.get(i - 1));

(...skipping 13 matching lines...) Expand all Loading...
125 link.setHref(link.getHref());	128 link.setHref(link.getHref());

126 }	129 }

127	130

128 NodeList<Element> allImages = root.getElementsByTagName("IMG");	131 NodeList<Element> allImages = root.getElementsByTagName("IMG");

129 for (int i = 0; i < allImages.getLength(); i++) {	132 for (int i = 0; i < allImages.getLength(); i++) {

130 ImageElement image = ImageElement.as(allImages.getItem(i));	133 ImageElement image = ImageElement.as(allImages.getItem(i));

131 image.setSrc(image.getSrc());	134 image.setSrc(image.getSrc());

132 }	135 }

133 }	136 }

134 }	137 }

OLD	NEW