test/com/dom_distiller/client/SchemaOrgParserTest.java - Issue 240073007: recognize and parse Schema.org Markup

Unified Diff: test/com/dom_distiller/client/SchemaOrgParserTest.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« src/com/dom_distiller/client/SchemaOrgParser.java ('K') | « src/com/dom_distiller/client/SchemaOrgParser.java ('k') | test/com/dom_distiller/client/TestUtil.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: test/com/dom_distiller/client/SchemaOrgParserTest.java

diff --git a/test/com/dom_distiller/client/SchemaOrgParserTest.java b/test/com/dom_distiller/client/SchemaOrgParserTest.java

new file mode 100644

index 0000000000000000000000000000000000000000..f3f5470b78bdb466100aa2a75d221ed8541a8a80

--- /dev/null

+++ b/test/com/dom_distiller/client/SchemaOrgParserTest.java

@@ -0,0 +1,423 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package com.dom_distiller.client;

+import com.google.gwt.dom.client.Document;

+import com.google.gwt.dom.client.Element;

+import com.google.gwt.dom.client.ImageElement;

+import com.google.gwt.dom.client.MetaElement;

+import com.google.gwt.dom.client.NodeList;

+import com.google.gwt.junit.client.GWTTestCase;

+public class SchemaOrgParserTest extends GWTTestCase {

+    /** Returns the name of the GWT module this test case runs in. */
+    @Override
+    public String getModuleName() {
+        final String moduleName = "com.dom_distiller.DomDistillerJUnit";
+        return moduleName;
+    }

+    /**
+     * Builds one schema.org ImageObject item whose publisher is a nested Organization item, then
+     * checks the values SchemaOrgParser reports for it.
+     */
+    public void testImageWithEmbeddedPublisher() {
+        // Expected values shared by the markup below and the assertions at the end.
+        final String expectedTitle = "Testcase for IMAGE";
+        final String expectedDescription = "Testing IMAGE with embedded publisher";
+        final String expectedUrl = "http://test_image_with_embedded_item.html";
+        final String expectedPublisher = "Whatever Image Incorporated";
+        final String expectedCopyrightYear = "1999-2022";
+        final String expectedCopyrightHolder = "Whoever Image Copyrighted";
+        final String expectedFormat = "jpeg";
+        final String expectedCaption = "A test for IMAGE with embedded publisher";
+        final String expectedCopyright =
+                "Copyright " + expectedCopyrightYear + " " + expectedCopyrightHolder;
+
+        // Top-level ImageObject item, attached to <body> before its properties are added.
+        final Element imageItem = TestUtil.createDiv(0);
+        setItemScopeAndType(imageItem, "ImageObject");
+        mBody.appendChild(imageItem);
+
+        final Element headline = TestUtil.createHeading(1, expectedTitle);
+        setItemProp(headline, "headline");
+        imageItem.appendChild(headline);
+
+        final Element description = TestUtil.createHeading(2, expectedDescription);
+        setItemProp(description, "description");
+        imageItem.appendChild(description);
+
+        // The parser should take the value from the "href" attribute of the <a> tag.
+        final Element contentLink = TestUtil.createAnchor(expectedUrl, "test results");
+        setItemProp(contentLink, "contentUrl");
+        imageItem.appendChild(contentLink);
+
+        // Publisher is itself an item (Organization) embedded in the image item.
+        final Element publisherItem = TestUtil.createDiv(1);
+        setItemProp(publisherItem, "publisher");
+        setItemScopeAndType(publisherItem, "Organization");
+        publisherItem.appendChild(TestUtil.createText("Publisher: "));
+        final Element publisherName = TestUtil.createSpan(expectedPublisher);
+        setItemProp(publisherName, "name");
+        publisherItem.appendChild(publisherName);
+        imageItem.appendChild(publisherItem);
+
+        // Copyright year and holder live together inside a plain wrapper <div>.
+        final Element copyrightWrapper = TestUtil.createDiv(2);
+        final Element copyrightYear = TestUtil.createSpan(expectedCopyrightYear);
+        setItemProp(copyrightYear, "copyrightYear");
+        copyrightWrapper.appendChild(copyrightYear);
+        final Element copyrightHolder = TestUtil.createSpan(expectedCopyrightHolder);
+        setItemProp(copyrightHolder, "copyrightHolder");
+        copyrightWrapper.appendChild(copyrightHolder);
+        imageItem.appendChild(copyrightWrapper);
+
+        final Element format = TestUtil.createSpan(expectedFormat);
+        setItemProp(format, "encodingFormat");
+        imageItem.appendChild(format);
+
+        final Element caption = TestUtil.createSpan(expectedCaption);
+        setItemProp(caption, "caption");
+        imageItem.appendChild(caption);
+
+        // The parser should take these values from the "content" attribute of the <meta> tags.
+        final Element representative = TestUtil.createMetaName("no_name", "true");
+        setItemProp(representative, "representativeOfPage");
+        imageItem.appendChild(representative);
+
+        final Element width = TestUtil.createMetaName("no_name", "600");
+        setItemProp(width, "width");
+        imageItem.appendChild(width);
+
+        final Element height = TestUtil.createMetaName("no_name", "400");
+        setItemProp(height, "height");
+        imageItem.appendChild(height);
+
+        // Parse the whole document and verify the page-level properties.
+        final SchemaOrgParser parser = new SchemaOrgParser(mRoot);
+        assertEquals("IMAGE", parser.getType());
+        assertEquals(expectedTitle, parser.getTitle());
+        assertEquals(expectedDescription, parser.getDescription());
+        assertEquals("", parser.getUrl());
+        assertEquals(expectedPublisher, parser.getPublisher());
+        assertEquals(null, parser.getArticle());
+        assertEquals("", parser.getAuthor());
+        assertEquals(expectedCopyright, parser.getCopyright());
+
+        // Verify the single extracted image.
+        final MarkupParser.Image[] parsedImages = parser.getImages();
+        assertEquals(1, parsedImages.length);
+        final MarkupParser.Image parsed = parsedImages[0];
+        assertEquals(expectedUrl, parsed.image);
+        assertEquals(expectedUrl, parsed.url);
+        assertEquals(null, parsed.secureUrl);
+        assertEquals(expectedFormat, parsed.type);
+        assertEquals(expectedCaption, parsed.caption);
+        assertEquals(600, parsed.width);
+        assertEquals(400, parsed.height);
+    }

+ public void test2Images() {

+ Element rootDiv = TestUtil.createDiv(0);

+ setItemScopeAndType(rootDiv, "ImageObject");

+ mBody.appendChild(rootDiv);

+ String expectedTitle1 = "Testcase for 1st IMAGE";

+ Element h = TestUtil.createHeading(1, expectedTitle1);

+ setItemProp(h, "headline");

+ rootDiv.appendChild(h);

+ String expectedDescription1 = "Testing 1st IMAGE";

+ h = TestUtil.createHeading(2, expectedDescription1);

+ setItemProp(h, "description");

+ rootDiv.appendChild(h);

+ // This should extract the "href" attribute of the <a> tag.

+ String expectedUrl1 = "http://test_1st image.html";

+ Element link = TestUtil.createAnchor(expectedUrl1, "1st test results");

+ setItemProp(link, "contentUrl");

+ rootDiv.appendChild(link);

+ String expectedPublisher1 = "Whatever 1st Image Incorporated";

+ Element div = TestUtil.createDiv(1);

+ setItemProp(div, "publisher");

+ div.setInnerHTML(expectedPublisher1);

+ rootDiv.appendChild(div);

+ div = TestUtil.createDiv(2);

+ String expectedCopyrightYear1 = "1000-1999";

+ Element span = TestUtil.createSpan(expectedCopyrightYear1);

+ setItemProp(span, "copyrightYear");

+ div.appendChild(span);

+ String expectedCopyrightHolder1 = "Whoever 1st Image Copyrighted";

+ span = TestUtil.createSpan(expectedCopyrightHolder1);

+ setItemProp(span, "copyrightHolder");

+ div.appendChild(span);

+ rootDiv.appendChild(div);

+ String expectedFormat1 = "jpeg";

+ span = TestUtil.createSpan(expectedFormat1);

+ setItemProp(span, "encodingFormat");

+ rootDiv.appendChild(span);

+ String expectedCaption1 = "A test for 1st IMAGE";

+ span = TestUtil.createSpan(expectedCaption1);

+ setItemProp(span, "caption");

+ rootDiv.appendChild(span);

+ // This should extract the "content" attribute of the <meta> tag.

+ Element meta = TestUtil.createMetaName("no_name", "false");

+ setItemProp(meta, "representativeOfPage");

+ rootDiv.appendChild(meta);

+ meta = TestUtil.createMetaName("no_name", "400");

+ setItemProp(meta, "width");

+ rootDiv.appendChild(meta);

+ meta = TestUtil.createMetaName("no_name", "300");

+ setItemProp(meta, "height");

+ rootDiv.appendChild(meta);

+ rootDiv = TestUtil.createDiv(10);

+ setItemScopeAndType(rootDiv, "ImageObject");

+ mBody.appendChild(rootDiv);

+ String expectedTitle2 = "Testcase for 2nd IMAGE";

+ h = TestUtil.createHeading(2, expectedTitle2);

+ setItemProp(h, "headline");

+ rootDiv.appendChild(h);

+ String expectedDescription2 = "Testing 2nd IMAGE";

+ h = TestUtil.createHeading(2, expectedDescription2);

+ setItemProp(h, "description");

+ rootDiv.appendChild(h);

+ // This should extract the "href" attribute of the <a> tag.

+ String expectedUrl2 = "http://test_2nd mage.html";

+ link = TestUtil.createAnchor(expectedUrl2, "2nd test results");

+ setItemProp(link, "contentUrl");

+ rootDiv.appendChild(link);

+ String expectedPublisher2 = "Whatever 2nd Image Incorporated";

+ div = TestUtil.createDiv(11);

+ setItemProp(div, "publisher");

+ div.setInnerHTML(expectedPublisher2);

+ rootDiv.appendChild(div);

+ div = TestUtil.createDiv(12);

+ String expectedCopyrightYear2 = "2000-2999";

+ span = TestUtil.createSpan(expectedCopyrightYear2);

+ setItemProp(span, "copyrightYear");

+ div.appendChild(span);

+ String expectedCopyrightHolder2 = "Whoever 2nd Image Copyrighted";

+ span = TestUtil.createSpan(expectedCopyrightHolder2);

+ setItemProp(span, "copyrightHolder");

+ div.appendChild(span);

+ rootDiv.appendChild(div);

+ String expectedFormat2 = "gif";

+ span = TestUtil.createSpan(expectedFormat2);

+ setItemProp(span, "encodingFormat");

+ rootDiv.appendChild(span);

+ String expectedCaption2 = "A test for 2nd IMAGE";

+ span = TestUtil.createSpan(expectedCaption2);

+ setItemProp(span, "caption");

+ rootDiv.appendChild(span);

+ // This should extract the "content" attribute of the <meta> tag.

+ meta = TestUtil.createMetaName("no_name", "true");

+ setItemProp(meta, "representativeOfPage");

+ rootDiv.appendChild(meta);

+ meta = TestUtil.createMetaName("no_name", "1000");

+ setItemProp(meta, "width");

+ rootDiv.appendChild(meta);

+ meta = TestUtil.createMetaName("no_name", "600");

+ setItemProp(meta, "height");

+ rootDiv.appendChild(meta);

+ SchemaOrgParser parser = new SchemaOrgParser(mRoot);

+ // The basic properties of Thing should be from the first image that was

+ // inserted.

+ assertEquals("IMAGE", parser.getType());

+ assertEquals(expectedTitle1, parser.getTitle());

+ assertEquals(expectedDescription1, parser.getDescription());

+ assertEquals("", parser.getUrl());

+ assertEquals(expectedPublisher1, parser.getPublisher());

+ assertEquals(null, parser.getArticle());

+ assertEquals("", parser.getAuthor());

+ assertEquals("Copyright " + expectedCopyrightYear1 + " " + expectedCopyrightHolder1,

+ parser.getCopyright());

+ MarkupParser.Image[] images = parser.getImages();

+ assertEquals(2, images.length);

+ // The 2nd image that was inserted is representative of page, so the

+ // images should be swapped in |images|.

+ MarkupParser.Image image = images[0];

+ assertEquals(expectedUrl2, image.image);

+ assertEquals(expectedUrl2, image.url);

+ assertEquals(null, image.secureUrl);

+ assertEquals(expectedFormat2, image.type);

+ assertEquals(expectedCaption2, image.caption);

+ assertEquals(1000, image.width);

+ assertEquals(600, image.height);

+ image = images[1];

+ assertEquals(expectedUrl1, image.image);

+ assertEquals(expectedUrl1, image.url);

+ assertEquals(null, image.secureUrl);

+ assertEquals(expectedFormat1, image.type);

+ assertEquals(expectedCaption1, image.caption);

+ assertEquals(400, image.width);

+ assertEquals(300, image.height);

+ }

+    /**
+     * Builds one schema.org Article item whose author (Person) and publisher (Organization) are
+     * nested items, then checks the page-level, image and article values SchemaOrgParser reports.
+     */
+    public void testArticleWithEmbeddedAuthorAndPublisher() {
+        // Expected values shared by the markup below and the assertions at the end.
+        final String expectedTitle = "Testcase for ARTICLE";
+        final String expectedDescription = "Testing ARTICLE with embedded author and publisher";
+        final String expectedUrl = "http://test_article_with_embedded_items.html";
+        final String expectedImage = "http://test_article_with_embedded_items.jpeg";
+        final String expectedAuthor = "Whoever authored";
+        final String expectedPublisher = "Whatever Article Incorporated";
+        final String expectedDatePublished = "April 15, 2014";
+        final String expectedTimeModified = "2014-04-16T23:59";
+        final String expectedCopyrightYear = "2000-2014";
+        final String expectedCopyrightHolder = "Whoever Article Copyrighted";
+        final String expectedSection = "Romance thriller";
+        final String expectedCopyright =
+                "Copyright " + expectedCopyrightYear + " " + expectedCopyrightHolder;
+
+        // Top-level Article item, attached to <body> before its properties are added.
+        final Element articleItem = TestUtil.createDiv(0);
+        setItemScopeAndType(articleItem, "Article");
+        mBody.appendChild(articleItem);
+
+        final Element headline = TestUtil.createHeading(1, expectedTitle);
+        setItemProp(headline, "headline");
+        articleItem.appendChild(headline);
+
+        final Element description = TestUtil.createHeading(2, expectedDescription);
+        setItemProp(description, "description");
+        articleItem.appendChild(description);
+
+        // The parser should take the value from the "href" attribute of the <a> tag.
+        final Element urlLink = TestUtil.createAnchor(expectedUrl, "test results");
+        setItemProp(urlLink, "url");
+        articleItem.appendChild(urlLink);
+
+        // The parser should take the value from the "src" attribute of the <img> tag.
+        final ImageElement img = TestUtil.createImage();
+        img.setSrc(expectedImage);
+        setItemProp(img, "image");
+        articleItem.appendChild(img);
+
+        // Author is itself an item (Person) embedded in the article item.
+        final Element authorItem = TestUtil.createDiv(1);
+        setItemProp(authorItem, "author");
+        setItemScopeAndType(authorItem, "Person");
+        authorItem.appendChild(TestUtil.createText("Author: "));
+        final Element authorName = TestUtil.createSpan(expectedAuthor);
+        setItemProp(authorName, "name");
+        authorItem.appendChild(authorName);
+        articleItem.appendChild(authorItem);
+
+        // Publisher is itself an item (Organization) embedded in the article item.
+        final Element publisherItem = TestUtil.createDiv(2);
+        setItemProp(publisherItem, "publisher");
+        setItemScopeAndType(publisherItem, "Organization");
+        publisherItem.appendChild(TestUtil.createText("Publisher: "));
+        final Element publisherName = TestUtil.createSpan(expectedPublisher);
+        setItemProp(publisherName, "name");
+        publisherItem.appendChild(publisherName);
+        articleItem.appendChild(publisherItem);
+
+        final Element datePublished = TestUtil.createSpan(expectedDatePublished);
+        setItemProp(datePublished, "datePublished");
+        articleItem.appendChild(datePublished);
+
+        // The parser should take the value from the "datetime" attribute of the <time> tag,
+        // not from its human-readable text.
+        final Element dateModified = Document.get().createElement("time");
+        dateModified.setInnerHTML("April 16, 2014 11:59pm");
+        dateModified.setAttribute("datetime", expectedTimeModified);
+        setItemProp(dateModified, "dateModified");
+        articleItem.appendChild(dateModified);
+
+        final Element copyrightYear = TestUtil.createSpan(expectedCopyrightYear);
+        setItemProp(copyrightYear, "copyrightYear");
+        articleItem.appendChild(copyrightYear);
+
+        final Element copyrightHolder = TestUtil.createSpan(expectedCopyrightHolder);
+        setItemProp(copyrightHolder, "copyrightHolder");
+        articleItem.appendChild(copyrightHolder);
+
+        final Element section = TestUtil.createSpan(expectedSection);
+        setItemProp(section, "articleSection");
+        articleItem.appendChild(section);
+
+        // Parse the whole document and verify the page-level properties.
+        final SchemaOrgParser parser = new SchemaOrgParser(mRoot);
+        assertEquals("ARTICLE", parser.getType());
+        assertEquals(expectedTitle, parser.getTitle());
+        assertEquals(expectedDescription, parser.getDescription());
+        assertEquals(expectedUrl, parser.getUrl());
+        assertEquals(expectedAuthor, parser.getAuthor());
+        assertEquals(expectedPublisher, parser.getPublisher());
+        assertEquals(expectedCopyright, parser.getCopyright());
+
+        // Verify the single extracted image.
+        final MarkupParser.Image[] parsedImages = parser.getImages();
+        assertEquals(1, parsedImages.length);
+        assertEquals(expectedImage, parsedImages[0].image);
+        assertEquals(expectedImage, parsedImages[0].url);
+
+        // Verify the article-specific properties.
+        final MarkupParser.Article parsedArticle = parser.getArticle();
+        assertEquals(expectedDatePublished, parsedArticle.publishedTime);
+        assertEquals(expectedTimeModified, parsedArticle.modifiedTime);
+        assertEquals(null, parsedArticle.expirationTime);
+        assertEquals(expectedSection, parsedArticle.section);
+        assertEquals(1, parsedArticle.authors.length);
+        assertEquals(expectedAuthor, parsedArticle.authors[0]);
+    }

+    /**
+     * Checks that an item declared directly on the root (<html>) element is picked up: the root
+     * is marked as a schema.org Article and a headline heading is placed in <body>.
+     *
+     * <p>The test mutates the shared document root, so all cleanup runs in a {@code finally}
+     * block. Otherwise a failing assertion would leave the itemscope/itemtype attributes on the
+     * root and affect later test cases.
+     */
+    public void testItemscopeInHTMLTag() {
+        final String expectedTitle = "Testcase for ItemScope in HTML tag";
+        final Element h = TestUtil.createHeading(1, expectedTitle);
+        setItemProp(h, "headline");
+
+        try {
+            setItemScopeAndType(mRoot, "Article");
+            mBody.appendChild(h);
+
+            SchemaOrgParser parser = new SchemaOrgParser(mRoot);
+            assertEquals("ARTICLE", parser.getType());
+            assertEquals(expectedTitle, parser.getTitle());
+            assertNotNull(parser.getArticle());
+        } finally {
+            // Remove "itemscope" and "itemtype" attributes in <html> tag, so that
+            // other testcases won't be affected, even when an assertion above fails.
+            mRoot.removeAttribute("ITEMSCOPE");
+            mRoot.removeAttribute("ITEMTYPE");
+            // gwtSetUp() only strips <meta> and <div> elements, so the heading added to <body>
+            // here has to be detached explicitly.
+            h.removeFromParent();
+        }
+    }

+    /**
+     * Prepares the shared document for a test case: caches the root and <body> elements in
+     * {@code mRoot} and {@code mBody}, then detaches all <meta> and <div> elements so markup
+     * left behind by a previous test case cannot influence this one.
+     *
+     * @throws IllegalStateException if the document does not contain exactly one <body> element
+     * @throws Exception declared by the overridden method's signature
+     */
+    @Override
+    protected void gwtSetUp() throws Exception {
+        // Get root element.
+        mRoot = Document.get().getDocumentElement();
+
+        // Get <body> element. Both zero and several <body> tags are rejected, so the message
+        // reports the actual count instead of assuming there were too many.
+        NodeList<Element> bodies = mRoot.getElementsByTagName("BODY");
+        if (bodies.getLength() != 1) {
+            throw new IllegalStateException(
+                    "Expected exactly 1 <body> tag, but found " + bodies.getLength());
+        }
+        mBody = bodies.getItem(0);
+
+        // Remove all meta tags, otherwise a testcase may run with the meta tags
+        // set up in a previous testcase, resulting in unexpected results.
+        removeAllByTagName("META");
+
+        // Remove all div tags, otherwise a testcase may run with the div tags
+        // set up in a previous testcase, resulting in unexpected results.
+        removeAllByTagName("DIV");
+    }
+
+    /** Detaches every element under {@code mRoot} that has the given tag name. */
+    private void removeAllByTagName(String tagName) {
+        NodeList<Element> matches = mRoot.getElementsByTagName(tagName);
+        // Walk from the end so removals cannot shift the indices still to be visited
+        // (the list is presumably live, as DOM node lists usually are).
+        for (int i = matches.getLength() - 1; i >= 0; i--) {
+            matches.getItem(i).removeFromParent();
+        }
+    }

+ private void setItemScopeAndType(Element e, String type) {

+ e.setAttribute("ITEMSCOPE", "");

+ e.setAttribute("ITEMTYPE", "http://schema.org/" + type);

+ }

+ private void setItemProp(Element e, String name) {

+ e.setAttribute("itemprop", name);

+ }

+ // Root element of the test document; assigned from Document.get().getDocumentElement() in gwtSetUp().

+ private Element mRoot;

+ // The document's single <body> element; assigned in gwtSetUp() and used by tests as the parent for test markup.

+ private Element mBody;