src/com/dom_distiller/client/ContentExtractor.java - Issue 290993004: Fix final content and title extraction.

Side by Side Diff: src/com/dom_distiller/client/ContentExtractor.java

Issue 290993004: Fix final content and title extraction. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « boilerpipe-core/src/main/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java ('k') | test/com/dom_distiller/client/ContentExtractorTest.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package com.dom_distiller.client;	5 package com.dom_distiller.client;

6	6

7 import com.google.gwt.dom.client.AnchorElement;	7 import com.google.gwt.dom.client.AnchorElement;

8 import com.google.gwt.dom.client.Document;	8 import com.google.gwt.dom.client.Document;

9 import com.google.gwt.dom.client.Element;	9 import com.google.gwt.dom.client.Element;

10 import com.google.gwt.dom.client.ImageElement;	10 import com.google.gwt.dom.client.ImageElement;

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);	66 List<Node> contentNodes = getContentNodesForTextDocument(document, textNodes);

67	67

68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(	68 List<Node> contentAndImages = RelevantImageFinder.findAndAddImages(

69 contentNodes, Document.get().getDocumentElement());	69 contentNodes, Document.get().getDocumentElement());

70	70

71 if (contentAndImages.isEmpty()) {	71 if (contentAndImages.isEmpty()) {

72 return "";	72 return "";

73 }	73 }

74	74

75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();	75 Node clonedSubtree = NodeListExpander.expand(contentAndImages).cloneSubtree();

76

77 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {	76 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) {

78 return "";	77 return "";

79 }	78 }

80	79

81 // The base URL in the distilled page viewer is different from that in	80 // The base URL in the distilled page viewer is different from that in

82 // the live page. This breaks all relative links (in anchors and	81 // the live page. This breaks all relative links (in anchors and

83 // images), so make them absolute in the distilled content.	82 // images), so make them absolute in the distilled content.

84 makeAllLinksAbsolute(clonedSubtree);	83 makeAllLinksAbsolute(clonedSubtree);

85	84

86 // TODO(cjhopman): this discards the top element and just returns its children. This might	85 // TODO(cjhopman): this discards the top element and just returns its children. This might

87 // break in some cases.	86 // break in some cases.

88 return Element.as(clonedSubtree).getInnerHTML();	87 return Element.as(clonedSubtree).getInnerHTML();

89 }	88 }

90	89

91 private static List<Node> parse(Element e, ContentHandler handler) {	90 private static List<Node> parse(Element e, ContentHandler handler) {

92 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);	91 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(handler);

93 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);	92 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToSaxVisitor);

94 new DomWalker(filteringDomVisitor).walk(e);	93 new DomWalker(filteringDomVisitor).walk(e);

95 return domToSaxVisitor.getTextNodes();	94 return domToSaxVisitor.getTextNodes();

96 }	95 }

97	96

98 private static List<Node> getContentNodesForTextDocument(	97 private static List<Node> getContentNodesForTextDocument(

99 TextDocument document, List<Node> textNodes) {	98 TextDocument document, List<Node> textNodes) {

100 List<Integer> contentTextIndexes = new ArrayList<Integer>();	99 List<Integer> contentTextIndexes = new ArrayList<Integer>();

101 for (TextBlock tb : document.getTextBlocks()) {	100 for (TextBlock tb : document.getTextBlocks()) {

	101 if (!tb.isContent()) {

	102 continue;

	103 }

102 if (!tb.hasLabel(DefaultLabels.TITLE)) {	104 if (!tb.hasLabel(DefaultLabels.TITLE)) {

103 contentTextIndexes.addAll(tb.getContainedTextElements());	105 contentTextIndexes.addAll(tb.getContainedTextElements());

104 }	106 }

105 }	107 }

106 Collections.sort(contentTextIndexes);	108 Collections.sort(contentTextIndexes);

107	109

108 // Boilerpipe's text node indexes start at 1.	110 // Boilerpipe's text node indexes start at 1.

109 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());	111 List<Node> contentNodes = new ArrayList<Node>(contentTextIndexes.size());

110 for (Integer i : contentTextIndexes) {	112 for (Integer i : contentTextIndexes) {

111 contentNodes.add(textNodes.get(i - 1));	113 contentNodes.add(textNodes.get(i - 1));

(...skipping 13 matching lines...) Expand all Loading...
125 link.setHref(link.getHref());	127 link.setHref(link.getHref());

126 }	128 }

127	129

128 NodeList<Element> allImages = root.getElementsByTagName("IMG");	130 NodeList<Element> allImages = root.getElementsByTagName("IMG");

129 for (int i = 0; i < allImages.getLength(); i++) {	131 for (int i = 0; i < allImages.getLength(); i++) {

130 ImageElement image = ImageElement.as(allImages.getItem(i));	132 ImageElement image = ImageElement.as(allImages.getItem(i));

131 image.setSrc(image.getSrc());	133 image.setSrc(image.getSrc());

132 }	134 }

133 }	135 }

134 }	136 }

OLD	NEW