src/com/dom_distiller/client/SchemaOrgParserAccessor.java - Issue 240073007: recognize and parse Schema.org Markup

Unified Diff: src/com/dom_distiller/client/SchemaOrgParserAccessor.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: addressed missed-out comments Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« src/com/dom_distiller/client/SchemaOrgParser.java ('K') | « src/com/dom_distiller/client/SchemaOrgParser.java ('k') | test/com/dom_distiller/client/SchemaOrgParserAccessorTest.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: src/com/dom_distiller/client/SchemaOrgParserAccessor.java

diff --git a/src/com/dom_distiller/client/SchemaOrgParserAccessor.java b/src/com/dom_distiller/client/SchemaOrgParserAccessor.java

new file mode 100644

index 0000000000000000000000000000000000000000..ed6ddfd96bbab7c20a2dd298a79af7c384ba8120

--- /dev/null

+++ b/src/com/dom_distiller/client/SchemaOrgParserAccessor.java

@@ -0,0 +1,146 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package com.dom_distiller.client;

+import java.util.ArrayList;

+import java.util.List;

+import com.google.gwt.dom.client.Element;

+/**

+ * This class instantiates SchemaOrgParser and implements MarkupParser.Parser interface to provide

+ * access to properties that SchemaOrgParser has parsed.

+ */

+public class SchemaOrgParserAccessor implements MarkupParser.Parser {

+ private final SchemaOrgParser parser;

+    /**
+     * Creates an accessor backed by a new SchemaOrgParser that is constructed from |root|.
+     *
+     * @param root the element passed to the SchemaOrgParser constructor.
+     */
+    public SchemaOrgParserAccessor(Element root) {
+        this.parser = new SchemaOrgParser(root);
+    }

+    /**
+     * Returns the "headline" string property found by the parser; when that is empty, falls
+     * back to the "name" property of the first article, if there is one.
+     */
+    @Override
+    public String getTitle() {
+        final String headline = parser.findStringProperty(SchemaOrgParser.HEADLINE_PROP);
+        if (!headline.isEmpty()) {
+            return headline;
+        }
+        // No headline: fall back to the first article's name, when an article exists.
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle == null) {
+            return headline;
+        }
+        return firstArticle.getStringProperty(SchemaOrgParser.NAME_PROP);
+    }

+    /**
+     * Returns the string form of SchemaOrgParser.Type.ARTICLE if the parser finds an article,
+     * and an empty string otherwise.
+     */
+    @Override
+    public String getType() {
+        // TODO(kuan): consolidate/standardize types returned from all 3 parsers in MarkupParser.
+        if (parser.findFirstArticle() == null) {
+            return "";
+        }
+        return SchemaOrgParser.Type.ARTICLE.toString();
+    }

+    /**
+     * Returns the URL_PROP string property of the first article, or an empty string if the
+     * parser finds no article.
+     */
+    @Override
+    public String getUrl() {
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle == null) {
+            return "";
+        }
+        return firstArticle.getStringProperty(SchemaOrgParser.URL_PROP);
+    }

+    /**
+     * Collects the images of all parsed item scopes, ordered so that the dominant image comes
+     * first.
+     *
+     * The first non-article image that is representative of the page is inserted at the front
+     * of the list. The image of the first article that has one is held back until every item
+     * scope has been visited: it is prepended when no representative image was found, and
+     * appended otherwise.
+     *
+     * @return the ordered images, or null if there are no item scopes or none of them has an
+     *         image.
+     */
+    @Override
+    public MarkupParser.Image[] getImages() {
+        List<SchemaOrgParser.ThingItem> itemScopes = parser.getItemScopes();
+        if (itemScopes.isEmpty()) return null;
+
+        List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();
+        boolean hasRepresentativeImage = false;
+        MarkupParser.Image imageOfArticle = null;
+
+        for (int i = 0; i < itemScopes.size(); i++) {
+            SchemaOrgParser.ThingItem item = itemScopes.get(i);
+            MarkupParser.Image image = item.getImage();
+            if (image == null) continue;
+
+            // If |image| is from an article with the "image" property, remember it for now;
+            // it'll be added to the list later when its position in the list can be determined.
+            if (imageOfArticle == null && item.getType() == SchemaOrgParser.Type.ARTICLE) {
+                imageOfArticle = image;
+                continue;
+            }
+
+            // Otherwise, |image| is from an ImageObject, insert it at beginning of list if it's
+            // the first image that's representative of page.
+            if (!hasRepresentativeImage && item.isImageRepresentativeOfPage()) {
+                hasRepresentativeImage = true;
+                // Image should be the dominant, i.e. first, one.
+                images.add(0, image);
+            } else {
+                images.add(image);
+            }
+        }
+
+        // The loop must be finished before |imageOfArticle| is placed: only then is it known
+        // whether any image is representative of page. Prepend |imageOfArticle| to list if
+        // there's no image representative of page; append it otherwise.
+        if (imageOfArticle != null) {
+            if (!hasRepresentativeImage) images.add(0, imageOfArticle);
+            else images.add(imageOfArticle);
+        }
+
+        if (images.isEmpty()) return null;
+        return images.toArray(new MarkupParser.Image[images.size()]);
+    }

+    /**
+     * Returns the DESCRIPTION_PROP string property of the first article, or an empty string if
+     * the parser finds no article.
+     */
+    @Override
+    public String getDescription() {
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle == null) {
+            return "";
+        }
+        return firstArticle.getStringProperty(SchemaOrgParser.DESCRIPTION_PROP);
+    }

+    /**
+     * Returns the "publisher" property of the first article; when that is empty, returns the
+     * article's "copyrightHolder" property instead. Returns an empty string if the parser finds
+     * no article.
+     */
+    @Override
+    public String getPublisher() {
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle == null) {
+            return "";
+        }
+        final String publisher = firstArticle.getStringProperty(SchemaOrgParser.PUBLISHER_PROP);
+        if (!publisher.isEmpty()) {
+            return publisher;
+        }
+        // No "publisher" property: use the same article's "copyrightHolder" property.
+        return firstArticle.getStringProperty(SchemaOrgParser.COPYRIGHT_HOLDER_PROP);
+    }

+    /**
+     * Builds a copyright string from the first article: its copyright year and copyright holder
+     * are combined with SchemaOrgParser.concat and, when the result is non-empty, prefixed with
+     * "Copyright ". Returns an empty string if the parser finds no article.
+     */
+    @Override
+    public String getCopyright() {
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle == null) {
+            return "";
+        }
+        final String year = firstArticle.getStringProperty(SchemaOrgParser.COPYRIGHT_YEAR_PROP);
+        final String holder =
+                firstArticle.getStringProperty(SchemaOrgParser.COPYRIGHT_HOLDER_PROP);
+        final String yearAndHolder = SchemaOrgParser.concat(year, holder);
+        if (yearAndHolder.isEmpty()) {
+            return yearAndHolder;
+        }
+        return "Copyright " + yearAndHolder;
+    }

+    /**
+     * Returns the author, trying in order: the "author" property of the first article, its
+     * "creator" property, and finally whatever the parser reports from the "rel=author" tag.
+     */
+    @Override
+    public String getAuthor() {
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle != null) {
+            final String author = firstArticle.getStringProperty(SchemaOrgParser.AUTHOR_PROP);
+            if (!author.isEmpty()) {
+                return author;
+            }
+            // No "author" property: try the "creator" property.
+            final String creator = firstArticle.getStringProperty(SchemaOrgParser.CREATOR_PROP);
+            if (!creator.isEmpty()) {
+                return creator;
+            }
+        }
+        // Neither property is available: use the "rel=author" tag.
+        return parser.getAuthorFromRel();
+    }

+    /**
+     * Returns the MarkupParser.Article produced by the first article item, or null if the
+     * parser finds no article.
+     */
+    @Override
+    public MarkupParser.Article getArticle() {
+        final SchemaOrgParser.ThingItem firstArticle = parser.findFirstArticle();
+        if (firstArticle == null) {
+            return null;
+        }
+        return firstArticle.getArticle();
+    }

+    /** Never reports an opt-out: this implementation unconditionally returns false. */
+    @Override
+    public boolean optOut() {
+        return false;
+    }