javatests/org/chromium/distiller/ContentExtractorTest.java - Issue 1507373003: Clean up attributes of image elements

Side by Side Diff: javatests/org/chromium/distiller/ContentExtractorTest.java

Issue 1507373003: Clean up attributes of image elements (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« java/org/chromium/distiller/webdocument/WebImage.java ('K') | « java/org/chromium/distiller/webdocument/WebImage.java ('k') | javatests/org/chromium/distiller/DomUtilTest.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import com.google.gwt.dom.client.Document;	7 import com.google.gwt.dom.client.Document;

8 import com.google.gwt.dom.client.Element;	8 import com.google.gwt.dom.client.Element;

9	9

10 public class ContentExtractorTest extends DomDistillerJsTestCase {	10 public class ContentExtractorTest extends DomDistillerJsTestCase {

(...skipping 68 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
79 assertTrue(parser != null);	79 assertTrue(parser != null);

80 assertEquals(MARKUP_PARSER_TITLE, parser.getTitle());	80 assertEquals(MARKUP_PARSER_TITLE, parser.getTitle());

81	81

82 Document.get().setTitle(TITLE_TEXT);	82 Document.get().setTitle(TITLE_TEXT);

83	83

84 ContentExtractor extractor = new ContentExtractor(mRoot);	84 ContentExtractor extractor = new ContentExtractor(mRoot);

85 assertEquals("OpenGraph title should be picked over document.title",	85 assertEquals("OpenGraph title should be picked over document.title",

86 MARKUP_PARSER_TITLE, extractor.extractTitle());	86 MARKUP_PARSER_TITLE, extractor.extractTitle());

87 }	87 }

88	88

89 public void testImageWithSrcset() {	89 public void testImage() {

90 // Test the absolute and different kinds of relative URLs for image sources,	90 // Test the absolute and different kinds of relative URLs for image sources,

91 // and also add an extra comma (,) as malformed srcset syntax for robustness.	91 // and also add an extra comma (,) as malformed srcset syntax for robustness.

	92 // Also test images in WebImage, WebText, and WebTable.
	wychen 2015/12/09 05:17:40: TODO(wychen): how do I get images inside a WebText? mdjones 2015/12/09 17:40:16: On 2015/12/09 05:17:40, wychen wrote: > TODO(wychen): how do I get images inside a WebText? This is an unsolved problem. Since tables are currently treated as a singular element, it can be argued that we simply ignore images contained in them, or at least leave them unaltered. If we decide to start traversing into tables, we would have to make a number of changes to our extraction logic and possibly eval data. A mix of both is complicated; we can discuss this more offline.
92 final String html =	93 final String html =

93 "<h1>" + CONTENT_TEXT + "</h1>" +	94 "<h1>" + CONTENT_TEXT + "</h1>" +

94 "<img src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +	95 "<img id=\"a\" style=\"a\" align=\"left\" src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +

	96 "<img id=\"a\" style=\"a\" align=\"left\" src=\"image2\">" +

95 "<table role=\"grid\"><tbody><tr><td>" +	97 "<table role=\"grid\"><tbody><tr><td>" +

96 "<img src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +	98 "<img id=\"a\" style=\"a\" align=\"left\" src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +

	99 "<img id=\"a\" style=\"a\" align=\"left\" src=\"/image2\">" +

97 "</td></tr></tbody></table>" +	100 "</td></tr></tbody></table>" +

98 "<p>" + CONTENT_TEXT + "</p>";	101 "<p>" + CONTENT_TEXT + "</p>";

99	102

100 final String expected =	103 final String expected =

101 "<h1>" + CONTENT_TEXT + "</h1>" +	104 "<h1>" + CONTENT_TEXT + "</h1>" +

102 "<img src=\"http://example.com/path/image\" " +	105 "<img src=\"http://example.com/path/image\" " +

103 "srcset=\"http://example.com/path/image200 200w, http://example.org/image400 400w\">" +	106 "srcset=\"http://example.com/path/image200 200w, http://example.org/image400 400w\">" +

	107 "<img src=\"http://example.com/path/image2\">" +

104 "<table role=\"grid\"><tbody><tr><td>" +	108 "<table role=\"grid\"><tbody><tr><td>" +

105 "<img src=\"http://example.com/image\" " +	109 "<img src=\"http://example.com/image\" " +

106 "srcset=\"https://example.com/image2x 2x, http://example.com/image4x 4x, \">" +	110 "srcset=\"https://example.com/image2x 2x, http://example.com/image4x 4x, \">" +

	111 "<img src=\"http://example.com/image2\">" +

107 "</td></tr></tbody></table>" +	112 "</td></tr></tbody></table>" +

108 "<p>" + CONTENT_TEXT + "</p>";	113 "<p>" + CONTENT_TEXT + "</p>";

109	114

110 mHead.setInnerHTML("<base href=\"http://example.com/path/\">");	115 mHead.setInnerHTML("<base href=\"http://example.com/path/\">");

111 mBody.setInnerHTML(html);	116 mBody.setInnerHTML(html);

112	117

113 ContentExtractor extractor = new ContentExtractor(mRoot);	118 ContentExtractor extractor = new ContentExtractor(mRoot);

114 String extractedContent = extractor.extractContent();	119 String extractedContent = extractor.extractContent();

115	120

116 assertEquals(expected,	121 assertEquals(expected,

(...skipping 27 matching lines...) Expand all Loading...
144 }	149 }

145	150

146 public void testRemoveStyleAttributes() {	151 public void testRemoveStyleAttributes() {

147 String html =	152 String html =

148 "<h1 style=\"font-weight: folder\">" +	153 "<h1 style=\"font-weight: folder\">" +

149 CONTENT_TEXT +	154 CONTENT_TEXT +

150 "</h1>" +	155 "</h1>" +

151 "<p style=\"\">" +	156 "<p style=\"\">" +

152 CONTENT_TEXT +	157 CONTENT_TEXT +

153 "</p>" +	158 "</p>" +

	159 "<img style=\"align: left\" src=\"/test.png\">" +

154 "<table style=\"position: absolute\">" +	160 "<table style=\"position: absolute\">" +

155 "<tbody style=\"font-size: 2\">" +	161 "<tbody style=\"font-size: 2\">" +

156 "<tr style=\"z-index: 0\">" +	162 "<tr style=\"z-index: 0\">" +

157 "<th style=\"top: 0px\">" + CONTENT_TEXT + "</th>" +	163 "<th style=\"top: 0px\">" + CONTENT_TEXT +

	164 "<img style=\"align: left\" src=\"/test.png\">" +

	165 "</th>" +

158 "<th style=\"width: 20px\">" + CONTENT_TEXT + "</th>" +	166 "<th style=\"width: 20px\">" + CONTENT_TEXT + "</th>" +

159 "</tr><tr style=\"left: 0\">" +	167 "</tr><tr style=\"left: 0\">" +

160 "<td style=\"display: block\">" + CONTENT_TEXT + "</td>" +	168 "<td style=\"display: block\">" + CONTENT_TEXT + "</td>" +

161 "<td style=\"color: #123\">" + CONTENT_TEXT + "</td>" +	169 "<td style=\"color: #123\">" + CONTENT_TEXT + "</td>" +

162 "</tr>" +	170 "</tr>" +

163 "</tbody>" +	171 "</tbody>" +

164 "</table>";	172 "</table>";

165	173

166 final String expected =	174 final String expected =

167 "<h1>" +	175 "<h1>" +

168 CONTENT_TEXT +	176 CONTENT_TEXT +

169 "</h1>" +	177 "</h1>" +

170 "<p>" +	178 "<p>" +

171 CONTENT_TEXT +	179 CONTENT_TEXT +

172 "</p>" +	180 "</p>" +

	181 "<img src=\"http://example.com/test.png\">" +

173 "<table>" +	182 "<table>" +

174 "<tbody>" +	183 "<tbody>" +

175 "<tr>" +	184 "<tr>" +

176 "<th>" + CONTENT_TEXT + "</th>" +	185 "<th>" + CONTENT_TEXT +

	186 "<img src=\"http://example.com/test.png\">" +

	187 "</th>" +

177 "<th>" + CONTENT_TEXT + "</th>" +	188 "<th>" + CONTENT_TEXT + "</th>" +

178 "</tr><tr>" +	189 "</tr><tr>" +

179 "<td>" + CONTENT_TEXT + "</td>" +	190 "<td>" + CONTENT_TEXT + "</td>" +

180 "<td>" + CONTENT_TEXT + "</td>" +	191 "<td>" + CONTENT_TEXT + "</td>" +

181 "</tr>" +	192 "</tr>" +

182 "</tbody>" +	193 "</tbody>" +

183 "</table>";	194 "</table>";

184	195

	196 mHead.setInnerHTML("<base href=\"http://example.com/\">");

185 mBody.setInnerHTML(html);	197 mBody.setInnerHTML(html);

186	198

187 ContentExtractor extractor = new ContentExtractor(mRoot);	199 ContentExtractor extractor = new ContentExtractor(mRoot);

188 String extractedContent = extractor.extractContent();	200 String extractedContent = extractor.extractContent();

189 assertEquals(expected,	201 assertEquals(expected,

190 TestUtil.removeAllDirAttributes(extractedContent));	202 TestUtil.removeAllDirAttributes(extractedContent));

191 }	203 }

192	204

193 public void testPreserveOrderedList() {	205 public void testPreserveOrderedList() {

194 Element outerListTag = Document.get().createElement("OL");	206 Element outerListTag = Document.get().createElement("OL");

(...skipping 379 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
574	586

575 final String htmlArticle =	587 final String htmlArticle =

576 "<h1>" + CONTENT_TEXT + "</h1>" +	588 "<h1>" + CONTENT_TEXT + "</h1>" +

577 "<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";	589 "<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";

578 final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;	590 final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;

579	591

580 // Non-article schema.org types should not use the fast path.	592 // Non-article schema.org types should not use the fast path.

581 assertExtractor(expected, htmlArticle);	593 assertExtractor(expected, htmlArticle);

582 }	594 }

583 }	595 }

OLD	NEW