src/com/dom_distiller/client/SchemaOrgParserAccessor.java - Issue 240073007: recognize and parse Schema.org Markup

Unified Diff: src/com/dom_distiller/client/SchemaOrgParserAccessor.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: rm 1 more unused prop in image Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« src/com/dom_distiller/client/SchemaOrgParser.java ('K') | « src/com/dom_distiller/client/SchemaOrgParser.java ('k') | test/com/dom_distiller/client/SchemaOrgParserAccessorTest.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: src/com/dom_distiller/client/SchemaOrgParserAccessor.java

diff --git a/src/com/dom_distiller/client/SchemaOrgParserAccessor.java b/src/com/dom_distiller/client/SchemaOrgParserAccessor.java

new file mode 100644

index 0000000000000000000000000000000000000000..27cfefc55bfc981a91d344776c9311ea36f3cf7e

--- /dev/null

+++ b/src/com/dom_distiller/client/SchemaOrgParserAccessor.java

@@ -0,0 +1,178 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package com.dom_distiller.client;

+import java.util.ArrayList;

+import java.util.List;

+import com.google.gwt.dom.client.Element;

+/**

+ * This class instantiates SchemaOrgParser and implements MarkupParser.Parser interface to provide

+ * access to properties that SchemaOrgParser has parsed.

+ */

+public class SchemaOrgParserAccessor implements MarkupParser.Parser {

+ private final SchemaOrgParser parser;

+ /**

+ * Creates an accessor that instantiates a SchemaOrgParser for |root| and exposes what it has

+ * parsed through the MarkupParser.Parser interface.

+ */

+ public SchemaOrgParserAccessor(Element root) {

+ parser = new SchemaOrgParser(root);

+ }

+ @Override

+ public String getTitle() {

+ String title = "";

+ List<SchemaOrgParser.ThingItem> itemScopes = parser.getItemScopes();

+ // Get the "headline" property of the first article that has it.

+ for (int i = 0; i < itemScopes.size() && title.isEmpty(); i++) {

+ SchemaOrgParser.ThingItem item = itemScopes.get(i);

+ if (item.getType() == SchemaOrgParser.Type.ARTICLE) {

cjhopman 2014/04/29 17:04:19 Iterating through the articles seems pretty common

kuan 2014/04/29 23:26:43 Done.

+ title = item.getStringProperty(SchemaOrgParser.HEADLINE_PROP);

+ }

+ // If there's no "headline" property, use "name" property.

+ for (int i = 0; i < itemScopes.size() && title.isEmpty(); i++) {

+ SchemaOrgParser.ThingItem item = itemScopes.get(i);

+ if (item.getType() == SchemaOrgParser.Type.ARTICLE) {

+ title = item.getStringProperty(SchemaOrgParser.NAME_PROP);

+ }

+ return title;

+ }

+ @Override

+ public String getType() {

+ // TODO(kuan): consolidate/standardize types returned from all 3 parsers in MarkupParser.

+ // Returns the ARTICLE type string if there's an article; otherwise returns an empty string.

+ return parser.findFirstArticle() != null ? SchemaOrgParser.Type.ARTICLE.toString() : "";

+ }

+ @Override

+ public String getUrl() {

+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();

+ return item != null ? item.getStringProperty(SchemaOrgParser.URL_PROP) : "";

+ }

+ @Override

+ public MarkupParser.Image[] getImages() {

+ List<SchemaOrgParser.ThingItem> itemScopes = parser.getItemScopes();

+ if (itemScopes.isEmpty()) return null;

+ List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();

+ boolean hasRepresentativeImage = false;

+ MarkupParser.Image imageOfArticle = null;

+ SchemaOrgParser.ImageItem associatedImageOfArticle = null;

+ for (int i = 0; i < itemScopes.size(); i++) {

+ SchemaOrgParser.ThingItem item = itemScopes.get(i);

+ MarkupParser.Image image = null;

+ if (item.getType() == SchemaOrgParser.Type.ARTICLE) {

+ SchemaOrgParser.ArticleItem articleItem = (SchemaOrgParser.ArticleItem) item;

+ // If article has an associated image or the "image" property, remember them for

+ // now; they'll be added to the list later when the position in the list can be

+ // determined.

+ if (associatedImageOfArticle == null) {

+ associatedImageOfArticle = articleItem.getRepresentativeImageItem();

+ if (associatedImageOfArticle != null) continue;

+ }

+ image = articleItem.getImage();

+ if (image == null) continue;

+ if (imageOfArticle == null) {

+ imageOfArticle = image;

+ } else {

+ images.add(image);

+ }

+ } else if (item.getType() == SchemaOrgParser.Type.IMAGE) {

+ SchemaOrgParser.ImageItem imageItem = (SchemaOrgParser.ImageItem) item;

+ image = imageItem.getImage();

+ // Insert |image| at beginning of list if it's the first image that's

+ // representative of page or it's the associated image of the first article.

+ if (!hasRepresentativeImage && (imageItem == associatedImageOfArticle ||

+ imageItem.isRepresentativeOfPage())) {

+ hasRepresentativeImage = true;

+ images.add(0, image);

+ } else {

+ images.add(image);

+ }

+ // Prepend |imageOfArticle| to list if there's no image representative of page; append it

+ // otherwise.

+ if (imageOfArticle != null) {

+ if (!hasRepresentativeImage) images.add(0, imageOfArticle);

+ else images.add(imageOfArticle);

+ }

+ if (images.isEmpty()) return null;

+ return images.toArray(new MarkupParser.Image[images.size()]);

+ }

+ @Override

+ public String getDescription() {

+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();

+ return item != null ? item.getStringProperty(SchemaOrgParser.DESCRIPTION_PROP) : "";

+ }

+ @Override

+ public String getPublisher() {

+ // Returns the "publisher" property of the first article, or "copyrightHolder" if that's empty.

+ String publisher = "";

+ SchemaOrgParser.ArticleItem article = parser.findFirstArticle();

+ if (article != null) {

+ publisher = article.getPersonOrOrganizationName(SchemaOrgParser.PUBLISHER_PROP);

+ if (publisher.isEmpty()) {

+ publisher = article.getPersonOrOrganizationName(

+ SchemaOrgParser.COPYRIGHT_HOLDER_PROP);

+ }

+ return publisher;

+ }

+ @Override

+ public String getCopyright() {

+ // Returns "Copyright " followed by the copyright year and copyright holder of the first

+ // article; returns "" if there's no article or the concatenated string is empty.

+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();

+ if (item == null) return "";

+ String copyright = SchemaOrgParser.concat(

+ item.getStringProperty(SchemaOrgParser.COPYRIGHT_YEAR_PROP),

+ item.getPersonOrOrganizationName(SchemaOrgParser.COPYRIGHT_HOLDER_PROP));

+ return copyright.isEmpty() ? copyright : "Copyright " + copyright;

+ }

+ @Override

+ public String getAuthor() {

+ String author = "";

+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();

+ if (item != null) {

+ author = item.getPersonOrOrganizationName(SchemaOrgParser.AUTHOR_PROP);

+ // If there's no "author" property, use "creator" property.

+ if (author.isEmpty()) {

+ author = item.getPersonOrOrganizationName(SchemaOrgParser.CREATOR_PROP);

+ }

+ // If no author name was found above, fall back to the author from the "rel=author" tag.

+ return author.isEmpty() ? parser.getAuthorFromRel() : author;

+ }

+ @Override

+ public MarkupParser.Article getArticle() {

+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();

+ return item != null ? item.getArticle() : null;

+ }

+ @Override

+ public boolean optOut() {

+ return false;

+ }