test/com/dom_distiller/client/SchemaOrgParserTest.java - Issue 240073007: recognize and parse Schema.org Markup

Side by Side Diff: test/com/dom_distiller/client/SchemaOrgParserTest.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: addressed all comments Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2014 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 package com.dom_distiller.client;

	6

	7 import com.google.gwt.dom.client.Document;

	8 import com.google.gwt.dom.client.Element;

	9 import com.google.gwt.dom.client.ImageElement;

	10 import com.google.gwt.dom.client.MetaElement;

	11 import com.google.gwt.dom.client.NodeList;

	12

	13 import com.google.gwt.junit.client.GWTTestCase;

	14

	15 public class SchemaOrgParserTest extends GWTTestCase {

	16 @Override

	17 public String getModuleName() {

	18 return "com.dom_distiller.DomDistillerJUnit";

	19 }

	20

	21 public void testImageWithEmbeddedPublisher() {

	22 Element rootDiv = TestUtil.createDiv(0);

	23 setItemScopeAndType(rootDiv, "ImageObject");

	24 mBody.appendChild(rootDiv);

	25

	26 String expectedTitle = "Testcase for IMAGE";

	27 Element h = TestUtil.createHeading(1, expectedTitle);

	28 setItemProp(h, "headline");

	29 rootDiv.appendChild(h);

	30

	31 String expectedDescription = "Testing IMAGE with embedded publisher";

	32 h = TestUtil.createHeading(2, expectedDescription);

	33 setItemProp(h, "description");
	cjhopman 2014/04/18 01:17:01
	Note: the following comment applies to all of these test cases.

	You can construct an HTML string for the structure that you're building and then use setInnerHTML() (instead of programmatically creating the structure).

	Pro: It's easier to see/understand what structure you are building.
	Con: It's harder to refer to values inside that structure (things like 'expectedTitle' might end up being specified in two places).

	I'm just trying to think about ways to make it clearer what the structure of the tree you are building is. If you don't like the HTML string approach, here are some other possibilities:

	How about moving all the appendChild calls to be next to each other (basically in the order that they would be in the HTML string)? I guess this would require unique names for each thing you build here.

	Maybe even something as simple as a comment at the top of the function describing the structure would be fine.

	kuan 2014/04/18 23:34:38
	On 2014/04/18 01:17:01, cjhopman wrote:
	> Note: the following comment applies to all of these test cases
	>
	> You can construct an html string for the structure that you're building and then
	> use setInnerHTML() (instead of programmatically creating the structure).
	>
	> Pro: It's easier to see/understand what structure you are building
	> Con: It's harder to refer to values inside that structure (things like
	> 'expectedTitle' might end up being specified in two places)
	>
	> I'm just trying to think about ways to make it more clear what the structure of
	> the tree you are building is. If you don't like the html string approach, here's
	> some other possibilities:
	>
	> how about moving all the appendChild calls to be next to each other (basically
	> in the order that they would be in the html string)? I guess this would require
	> unique names for each thing you build here.
	>
	> maybe even something as simple as a comment at the top of the function
	> describing the structure would be fine.

	Done. I changed to using an HTML string and setInnerHTML(), with the perk of using expected*.
	34 rootDiv.appendChild(h);

	35

	36 // This should extract the "href" attribute of the <a> tag.

	37 String expectedUrl = "http://test_image_with_embedded_item.html";

	38 Element link = TestUtil.createAnchor(expectedUrl, "test results");

	39 setItemProp(link, "contentUrl");

	40 rootDiv.appendChild(link);

	41

	42 Element div = TestUtil.createDiv(1);

	43 setItemProp(div, "publisher");

	44 setItemScopeAndType(div, "Organization");

	45 div.appendChild(TestUtil.createText("Publisher: "));

	46 String expectedPublisher = "Whatever Image Incorporated";

	47 Element span = TestUtil.createSpan(expectedPublisher);

	48 setItemProp(span, "name");

	49 div.appendChild(span);

	50 rootDiv.appendChild(div);

	51

	52 div = TestUtil.createDiv(2);

	53 String expectedCopyrightYear = "1999-2022";

	54 span = TestUtil.createSpan(expectedCopyrightYear);

	55 setItemProp(span, "copyrightYear");

	56 div.appendChild(span);

	57

	58 String expectedCopyrightHolder = "Whoever Image Copyrighted";

	59 span = TestUtil.createSpan(expectedCopyrightHolder);

	60 setItemProp(span, "copyrightHolder");

	61 div.appendChild(span);

	62 rootDiv.appendChild(div);

	63

	64 String expectedFormat = "jpeg";

	65 span = TestUtil.createSpan(expectedFormat);

	66 setItemProp(span, "encodingFormat");

	67 rootDiv.appendChild(span);

	68

	69 String expectedCaption = "A test for IMAGE with embedded publisher";

	70 span = TestUtil.createSpan(expectedCaption);

	71 setItemProp(span, "caption");

	72 rootDiv.appendChild(span);

	73

	74 // This should extract the "content" attribute of the <meta> tag.

	75 Element meta = TestUtil.createMetaName("no_name", "true");

	76 setItemProp(meta, "representativeOfPage");

	77 rootDiv.appendChild(meta);

	78

	79 meta = TestUtil.createMetaName("no_name", "600");

	80 setItemProp(meta, "width");

	81 rootDiv.appendChild(meta);

	82

	83 meta = TestUtil.createMetaName("no_name", "400");

	84 setItemProp(meta, "height");

	85 rootDiv.appendChild(meta);

	86

	87 SchemaOrgParser parser = new SchemaOrgParser(mRoot);

	88 assertEquals("IMAGE", parser.getType());

	89 assertEquals(expectedTitle, parser.getTitle());

	90 assertEquals(expectedDescription, parser.getDescription());

	91 assertEquals("", parser.getUrl());

	92 assertEquals(expectedPublisher, parser.getPublisher());

	93 assertEquals(null, parser.getArticle());

	94 assertEquals("", parser.getAuthor());

	95 assertEquals(

	96 "Copyright " + expectedCopyrightYear + " " + expectedCopyrightHolder,

	97 parser.getCopyright());

	98 MarkupParser.Image[] images = parser.getImages();

	99 assertEquals(1, images.length);

	100 MarkupParser.Image image = images[0];

	101 assertEquals(expectedUrl, image.image);

	102 assertEquals(expectedUrl, image.url);

	103 assertEquals(null, image.secureUrl);

	104 assertEquals(expectedFormat, image.type);

	105 assertEquals(expectedCaption, image.caption);

	106 assertEquals(600, image.width);

	107 assertEquals(400, image.height);

	108 }

	109

	110 public void test2Images() {

	111 Element rootDiv = TestUtil.createDiv(0);

	112 setItemScopeAndType(rootDiv, "ImageObject");

	113 mBody.appendChild(rootDiv);

	114

	115 String expectedTitle1 = "Testcase for 1st IMAGE";

	116 Element h = TestUtil.createHeading(1, expectedTitle1);

	117 setItemProp(h, "headline");

	118 rootDiv.appendChild(h);

	119

	120 String expectedDescription1 = "Testing 1st IMAGE";

	121 h = TestUtil.createHeading(2, expectedDescription1);

	122 setItemProp(h, "description");

	123 rootDiv.appendChild(h);

	124

	125 // This should extract the "href" attribute of the <a> tag.

	126 String expectedUrl1 = "http://test_1st image.html";

	127 Element link = TestUtil.createAnchor(expectedUrl1, "1st test results");

	128 setItemProp(link, "contentUrl");

	129 rootDiv.appendChild(link);

	130

	131 String expectedPublisher1 = "Whatever 1st Image Incorporated";

	132 Element div = TestUtil.createDiv(1);

	133 setItemProp(div, "publisher");

	134 div.setInnerHTML(expectedPublisher1);

	135 rootDiv.appendChild(div);

	136

	137 div = TestUtil.createDiv(2);

	138 String expectedCopyrightYear1 = "1000-1999";

	139 Element span = TestUtil.createSpan(expectedCopyrightYear1);

	140 setItemProp(span, "copyrightYear");

	141 div.appendChild(span);

	142

	143 String expectedCopyrightHolder1 = "Whoever 1st Image Copyrighted";

	144 span = TestUtil.createSpan(expectedCopyrightHolder1);

	145 setItemProp(span, "copyrightHolder");

	146 div.appendChild(span);

	147 rootDiv.appendChild(div);

	148

	149 String expectedFormat1 = "jpeg";

	150 span = TestUtil.createSpan(expectedFormat1);

	151 setItemProp(span, "encodingFormat");

	152 rootDiv.appendChild(span);

	153

	154 String expectedCaption1 = "A test for 1st IMAGE";

	155 span = TestUtil.createSpan(expectedCaption1);

	156 setItemProp(span, "caption");

	157 rootDiv.appendChild(span);

	158

	159 // This should extract the "content" attribute of the <meta> tag.

	160 Element meta = TestUtil.createMetaName("no_name", "false");

	161 setItemProp(meta, "representativeOfPage");

	162 rootDiv.appendChild(meta);

	163

	164 meta = TestUtil.createMetaName("no_name", "400");

	165 setItemProp(meta, "width");

	166 rootDiv.appendChild(meta);

	167

	168 meta = TestUtil.createMetaName("no_name", "300");

	169 setItemProp(meta, "height");

	170 rootDiv.appendChild(meta);

	171

	172 rootDiv = TestUtil.createDiv(10);

	173 setItemScopeAndType(rootDiv, "ImageObject");

	174 mBody.appendChild(rootDiv);

	175

	176 String expectedTitle2 = "Testcase for 2nd IMAGE";

	177 h = TestUtil.createHeading(2, expectedTitle2);

	178 setItemProp(h, "headline");

	179 rootDiv.appendChild(h);

	180

	181 String expectedDescription2 = "Testing 2nd IMAGE";

	182 h = TestUtil.createHeading(2, expectedDescription2);

	183 setItemProp(h, "description");

	184 rootDiv.appendChild(h);

	185

	186 // This should extract the "href" attribute of the <a> tag.

	187 String expectedUrl2 = "http://test_2nd mage.html";

	188 link = TestUtil.createAnchor(expectedUrl2, "2nd test results");

	189 setItemProp(link, "contentUrl");

	190 rootDiv.appendChild(link);

	191

	192 String expectedPublisher2 = "Whatever 2nd Image Incorporated";

	193 div = TestUtil.createDiv(11);

	194 setItemProp(div, "publisher");

	195 div.setInnerHTML(expectedPublisher2);

	196 rootDiv.appendChild(div);

	197

	198 div = TestUtil.createDiv(12);

	199 String expectedCopyrightYear2 = "2000-2999";

	200 span = TestUtil.createSpan(expectedCopyrightYear2);

	201 setItemProp(span, "copyrightYear");

	202 div.appendChild(span);

	203

	204 String expectedCopyrightHolder2 = "Whoever 2nd Image Copyrighted";

	205 span = TestUtil.createSpan(expectedCopyrightHolder2);

	206 setItemProp(span, "copyrightHolder");

	207 div.appendChild(span);

	208 rootDiv.appendChild(div);

	209

	210 String expectedFormat2 = "gif";

	211 span = TestUtil.createSpan(expectedFormat2);

	212 setItemProp(span, "encodingFormat");

	213 rootDiv.appendChild(span);

	214

	215 String expectedCaption2 = "A test for 2nd IMAGE";

	216 span = TestUtil.createSpan(expectedCaption2);

	217 setItemProp(span, "caption");

	218 rootDiv.appendChild(span);

	219

	220 // This should extract the "content" attribute of the <meta> tag.

	221 meta = TestUtil.createMetaName("no_name", "true");

	222 setItemProp(meta, "representativeOfPage");

	223 rootDiv.appendChild(meta);

	224

	225 meta = TestUtil.createMetaName("no_name", "1000");

	226 setItemProp(meta, "width");

	227 rootDiv.appendChild(meta);

	228

	229 meta = TestUtil.createMetaName("no_name", "600");

	230 setItemProp(meta, "height");

	231 rootDiv.appendChild(meta);

	232

	233 SchemaOrgParser parser = new SchemaOrgParser(mRoot);

	234 // The basic properties of Thing should be from the first image that was

	235 // inserted.

	236 assertEquals("IMAGE", parser.getType());

	237 assertEquals(expectedTitle1, parser.getTitle());

	238 assertEquals(expectedDescription1, parser.getDescription());

	239 assertEquals("", parser.getUrl());

	240 assertEquals(expectedPublisher1, parser.getPublisher());

	241 assertEquals(null, parser.getArticle());

	242 assertEquals("", parser.getAuthor());

	243 assertEquals("Copyright " + expectedCopyrightYear1 + " " + expectedCopyrightHolder1,

	244 parser.getCopyright());

	245

	246 MarkupParser.Image[] images = parser.getImages();

	247 assertEquals(2, images.length);

	248 // The 2nd image that was inserted is representative of page, so the

	249 // images should be swapped in |images|.

	250 MarkupParser.Image image = images[0];

	251 assertEquals(expectedUrl2, image.image);

	252 assertEquals(expectedUrl2, image.url);

	253 assertEquals(null, image.secureUrl);

	254 assertEquals(expectedFormat2, image.type);

	255 assertEquals(expectedCaption2, image.caption);

	256 assertEquals(1000, image.width);

	257 assertEquals(600, image.height);

	258 image = images[1];

	259 assertEquals(expectedUrl1, image.image);

	260 assertEquals(expectedUrl1, image.url);

	261 assertEquals(null, image.secureUrl);

	262 assertEquals(expectedFormat1, image.type);

	263 assertEquals(expectedCaption1, image.caption);

	264 assertEquals(400, image.width);

	265 assertEquals(300, image.height);

	266 }

	267

	268 public void testArticleWithEmbeddedAuthorAndPublisher() {

	269 Element rootDiv = TestUtil.createDiv(0);

	270 setItemScopeAndType(rootDiv, "Article");

	271 mBody.appendChild(rootDiv);

	272

	273 String expectedTitle = "Testcase for ARTICLE";

	274 Element h = TestUtil.createHeading(1, expectedTitle);

	275 setItemProp(h, "headline");

	276 rootDiv.appendChild(h);

	277

	278 String expectedDescription = "Testing ARTICLE with embedded author and publisher";

	279 h = TestUtil.createHeading(2, expectedDescription);

	280 setItemProp(h, "description");

	281 rootDiv.appendChild(h);

	282

	283 // This should extract the "href" attribute of the <a> tag.

	284 String expectedUrl = "http://test_article_with_embedded_items.html";

	285 Element link = TestUtil.createAnchor(expectedUrl, "test results");

	286 setItemProp(link, "url");

	287 rootDiv.appendChild(link);

	288

	289 // This should extract the "src" attribute of the <img> tag.

	290 String expectedImage = "http://test_article_with_embedded_items.jpeg";

	291 ImageElement image = TestUtil.createImage();

	292 image.setSrc(expectedImage);

	293 setItemProp(image, "image");

	294 rootDiv.appendChild(image);

	295

	296 Element div = TestUtil.createDiv(1);

	297 setItemProp(div, "author");

	298 setItemScopeAndType(div, "Person");

	299 div.appendChild(TestUtil.createText("Author: "));

	300 String expectedAuthor = "Whoever authored";

	301 Element span = TestUtil.createSpan(expectedAuthor);

	302 setItemProp(span, "name");

	303 div.appendChild(span);

	304 rootDiv.appendChild(div);

	305

	306 div = TestUtil.createDiv(2);

	307 setItemProp(div, "publisher");

	308 setItemScopeAndType(div, "Organization");

	309 div.appendChild(TestUtil.createText("Publisher: "));

	310 String expectedPublisher = "Whatever Article Incorporated";

	311 span = TestUtil.createSpan(expectedPublisher);

	312 setItemProp(span, "name");

	313 div.appendChild(span);

	314 rootDiv.appendChild(div);

	315

	316 String expectedDatePublished = "April 15, 2014";

	317 span = TestUtil.createSpan(expectedDatePublished);

	318 setItemProp(span, "datePublished");

	319 rootDiv.appendChild(span);

	320

	321 // This should extract the "datetime" attribute of the <time> tag.

	322 String expectedTimeModified = "2014-04-16T23:59";

	323 Element time = Document.get().createElement("time");

	324 time.setInnerHTML("April 16, 2014 11:59pm");

	325 time.setAttribute("datetime", expectedTimeModified);

	326 setItemProp(time, "dateModified");

	327 rootDiv.appendChild(time);

	328

	329 String expectedCopyrightYear = "2000-2014";

	330 span = TestUtil.createSpan(expectedCopyrightYear);

	331 setItemProp(span, "copyrightYear");

	332 rootDiv.appendChild(span);

	333

	334 String expectedCopyrightHolder = "Whoever Article Copyrighted";

	335 span = TestUtil.createSpan(expectedCopyrightHolder);

	336 setItemProp(span, "copyrightHolder");

	337 rootDiv.appendChild(span);

	338

	339 String expectedSection = "Romance thriller";

	340 span = TestUtil.createSpan(expectedSection);

	341 setItemProp(span, "articleSection");

	342 rootDiv.appendChild(span);

	343

	344 SchemaOrgParser parser = new SchemaOrgParser(mRoot);

	345 assertEquals("ARTICLE", parser.getType());

	346 assertEquals(expectedTitle, parser.getTitle());

	347 assertEquals(expectedDescription, parser.getDescription());

	348 assertEquals(expectedUrl, parser.getUrl());

	349 assertEquals(expectedAuthor, parser.getAuthor());

	350 assertEquals(expectedPublisher, parser.getPublisher());

	351 assertEquals(

	352 "Copyright " + expectedCopyrightYear + " " + expectedCopyrightHolder,

	353 parser.getCopyright());

	354 MarkupParser.Image[] images = parser.getImages();

	355 assertEquals(1, images.length);

	356 assertEquals(expectedImage, images[0].image);

	357 assertEquals(expectedImage, images[0].url);

	358 MarkupParser.Article article = parser.getArticle();

	359 assertEquals(expectedDatePublished, article.publishedTime);

	360 assertEquals(expectedTimeModified, article.modifiedTime);

	361 assertEquals(null, article.expirationTime);

	362 assertEquals(expectedSection, article.section);

	363 assertEquals(1, article.authors.length);

	364 assertEquals(expectedAuthor, article.authors[0]);

	365 }

	366

	367 public void testItemscopeInHTMLTag() {

	368 setItemScopeAndType(mRoot, "Article");

	369

	370 String expectedTitle = "Testcase for ItemScope in HTML tag";

	371 Element h = TestUtil.createHeading(1, expectedTitle);

	372 setItemProp(h, "headline");

	373 mBody.appendChild(h);

	374

	375 SchemaOrgParser parser = new SchemaOrgParser(mRoot);

	376 assertEquals("ARTICLE", parser.getType());

	377 assertEquals(expectedTitle, parser.getTitle());

	378 assertTrue(parser.getArticle() != null);

	379

	380 // Remove "itemscope" and "itemtype" attributes in <html> tag, so that

	381 // other testcases won't be affected.

	382 mRoot.removeAttribute("ITEMSCOPE");

	383 mRoot.removeAttribute("ITEMTYPE");

	384 }

	385

	386 @Override

	387 protected void gwtSetUp() throws Exception {

	388 // Get root element.

	389 mRoot = Document.get().getDocumentElement();

	390

	391 // Get <body> element.

	392 NodeList<Element> bodies = mRoot.getElementsByTagName("BODY");

	393 if (bodies.getLength() != 1)

	394 throw new Exception("There shouldn't be more than 1 <body> tag");

	395 mBody = bodies.getItem(0);

	396

	397 // Remove all meta tags, otherwise a testcase may run with the meta tags

	398 // set up in a previous testcase, resulting in unexpected results.

	399 NodeList<Element> allMeta = mRoot.getElementsByTagName("META");

	400 for (int i = allMeta.getLength() - 1; i >= 0; i--) {

	401 allMeta.getItem(i).removeFromParent();

	402 }

	403

	404 // Remove all div tags, otherwise a testcase may run with the div tags

	405 // set up in a previous testcase, resulting in unexpected results.

	406 NodeList<Element> allDiv = mRoot.getElementsByTagName("DIV");

	407 for (int i = allDiv.getLength() - 1; i >= 0; i--) {

	408 allDiv.getItem(i).removeFromParent();

	409 }

	410 }

	411

	412 private void setItemScopeAndType(Element e, String type) {

	413 e.setAttribute("ITEMSCOPE", "");

	414 e.setAttribute("ITEMTYPE", "http://schema.org/" + type);

	415 }

	416

	417 private void setItemProp(Element e, String name) {

	418 e.setAttribute("itemprop", name);

	419 }

	420

	421 private Element mRoot;

	422 private Element mBody;

	423 }

OLD	NEW