Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(73)

Unified Diff: src/com/dom_distiller/client/SchemaOrgParserAccessor.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: rm 1 more unused prop in image Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/com/dom_distiller/client/SchemaOrgParserAccessor.java
diff --git a/src/com/dom_distiller/client/SchemaOrgParserAccessor.java b/src/com/dom_distiller/client/SchemaOrgParserAccessor.java
new file mode 100644
index 0000000000000000000000000000000000000000..27cfefc55bfc981a91d344776c9311ea36f3cf7e
--- /dev/null
+++ b/src/com/dom_distiller/client/SchemaOrgParserAccessor.java
@@ -0,0 +1,178 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package com.dom_distiller.client;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.gwt.dom.client.Element;
+
+/**
+ * Adapter that exposes what a SchemaOrgParser has parsed through the MarkupParser.Parser
+ * interface.
+ *
+ * The constructor builds a SchemaOrgParser for a root element and every getter reads from that
+ * parser. Most getters use the item returned by parser.findFirstArticle() and return an empty
+ * string (null in the case of getArticle()) when it returns null; getAuthor() additionally
+ * falls back to parser.getAuthorFromRel().
+ */
+public class SchemaOrgParserAccessor implements MarkupParser.Parser {
+ private final SchemaOrgParser parser; // assigned once in the constructor; all getters read it
+
+ /**
+ * Creates the accessor and instantiates the SchemaOrgParser that all getters of this
+ * MarkupParser.Parser implementation read from.
+ *
+ * @param root element passed to the SchemaOrgParser constructor
+ */
+ public SchemaOrgParserAccessor(Element root) {
+ parser = new SchemaOrgParser(root);
+ }
+ /**
+ * Returns a title taken from the ARTICLE item scopes, or "" if none of them supplies one.
+ *
+ * The "headline" property is read from each article in list order until a non-empty value
+ * is found; only if the title is still empty is the "name" property tried the same way.
+ * NOTE(review): both loops call title.isEmpty() on the previously fetched value, so this
+ * assumes getStringProperty() never returns null - confirm.
+ *
+ * @return the first non-empty headline, else the first non-empty name, else ""
+ */
+ @Override
+ public String getTitle() {
+ String title = "";
+ List<SchemaOrgParser.ThingItem> itemScopes = parser.getItemScopes();
+
+ // Get the "headline" property of the first article that has it.
+ for (int i = 0; i < itemScopes.size() && title.isEmpty(); i++) {
+ SchemaOrgParser.ThingItem item = itemScopes.get(i);
+ if (item.getType() == SchemaOrgParser.Type.ARTICLE) {
cjhopman 2014/04/29 17:04:19 Iterating through the articles seems pretty common
kuan 2014/04/29 23:26:43 Done.
+ title = item.getStringProperty(SchemaOrgParser.HEADLINE_PROP);
+ }
+ }
+
+ // If there's no "headline" property, use "name" property.
+ for (int i = 0; i < itemScopes.size() && title.isEmpty(); i++) {
+ SchemaOrgParser.ThingItem item = itemScopes.get(i);
+ if (item.getType() == SchemaOrgParser.Type.ARTICLE) {
+ title = item.getStringProperty(SchemaOrgParser.NAME_PROP);
+ }
+ }
+
+ return title;
+ }
+ /**
+ * Returns the string form of SchemaOrgParser.Type.ARTICLE when parser.findFirstArticle()
+ * returns an article, or "" when it returns null.
+ */
+ @Override
+ public String getType() {
+ // TODO(kuan): consolidate/standardize types returned from all 3 parsers in MarkupParser.
+ // Returns ARTICLE if there's an article.
+ return parser.findFirstArticle() != null ? SchemaOrgParser.Type.ARTICLE.toString() : "";
+ }
+ /**
+ * Returns the URL_PROP string property of the item returned by parser.findFirstArticle(),
+ * or "" when there is no such item.
+ */
+ @Override
+ public String getUrl() {
+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();
+ return item != null ? item.getStringProperty(SchemaOrgParser.URL_PROP) : "";
+ }
+ /**
+ * Collects the images of all ARTICLE and IMAGE item scopes into an array.
+ *
+ * Ordering rules, as implemented by the single pass below:
+ * - The first IMAGE item that either is the remembered associated image item of an article
+ * or reports isRepresentativeOfPage() is inserted at index 0; every other IMAGE item is
+ * appended.
+ * - The first non-null getImage() result taken from an article is held back and placed
+ * after the loop: at index 0 if no IMAGE item was inserted at the front, at the end
+ * otherwise. Later non-null article images are appended as they are met.
+ * - The article that supplies the associated image item does not contribute its own
+ * getImage() result (the loop continues to the next item).
+ *
+ * NOTE(review): the identity comparison against associatedImageOfArticle can only match if
+ * the article is visited before its image item; presumably getItemScopes() lists a parent
+ * scope before the scopes nested in it - confirm.
+ *
+ * @return the ordered images, or null if there are no item scopes or no image was collected
+ */
+ @Override
+ public MarkupParser.Image[] getImages() {
+ List<SchemaOrgParser.ThingItem> itemScopes = parser.getItemScopes();
+ if (itemScopes.isEmpty()) return null;
+
+ List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();
+ boolean hasRepresentativeImage = false; // true once an IMAGE item was inserted at index 0
+ MarkupParser.Image imageOfArticle = null; // first non-null article image, placed after loop
+ SchemaOrgParser.ImageItem associatedImageOfArticle = null; // first non-null one found
+ // Single pass over all item scopes in list order.
+ for (int i = 0; i < itemScopes.size(); i++) {
+ SchemaOrgParser.ThingItem item = itemScopes.get(i);
+ MarkupParser.Image image = null;
+ if (item.getType() == SchemaOrgParser.Type.ARTICLE) {
+ SchemaOrgParser.ArticleItem articleItem = (SchemaOrgParser.ArticleItem) item;
+ // If article has an associated image or the "image" property, remember them for
+ // now; they'll be added to the list later when the position in the list can be
+ // determined.
+ if (associatedImageOfArticle == null) {
+ associatedImageOfArticle = articleItem.getRepresentativeImageItem();
+ if (associatedImageOfArticle != null) continue; // skips this article's own image
+ }
+ image = articleItem.getImage();
+ if (image == null) continue;
+ if (imageOfArticle == null) {
+ imageOfArticle = image;
+ } else {
+ images.add(image);
+ }
+ } else if (item.getType() == SchemaOrgParser.Type.IMAGE) {
+ SchemaOrgParser.ImageItem imageItem = (SchemaOrgParser.ImageItem) item;
+ image = imageItem.getImage(); // NOTE(review): no null check here, unlike above - confirm
+ // Insert |image| at beginning of list if it's the first image that's
+ // representative of page or it's the associated image of the first article.
+ if (!hasRepresentativeImage && (imageItem == associatedImageOfArticle ||
+ imageItem.isRepresentativeOfPage())) {
+ hasRepresentativeImage = true;
+ images.add(0, image);
+ } else {
+ images.add(image);
+ }
+ }
+ }
+
+ // Prepend |imageOfArticle| to list if there's no image representative of page; append it
+ // otherwise.
+ if (imageOfArticle != null) {
+ if (!hasRepresentativeImage) images.add(0, imageOfArticle);
+ else images.add(imageOfArticle);
+ }
+
+ if (images.isEmpty()) return null;
+
+ return images.toArray(new MarkupParser.Image[images.size()]);
+ }
+ /**
+ * Returns the DESCRIPTION_PROP string property of the item returned by
+ * parser.findFirstArticle(), or "" when there is no such item.
+ */
+ @Override
+ public String getDescription() {
+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();
+ return item != null ? item.getStringProperty(SchemaOrgParser.DESCRIPTION_PROP) : "";
+ }
+ /**
+ * Returns the publisher name of the item returned by parser.findFirstArticle().
+ *
+ * PUBLISHER_PROP is looked up first via getPersonOrOrganizationName(); if that yields an
+ * empty string, COPYRIGHT_HOLDER_PROP is looked up instead. Returns "" when there is no
+ * article.
+ */
+ @Override
+ public String getPublisher() {
+ // Returns either the "publisher" or "copyrightHolder" property of the first article.
+ String publisher = "";
+ SchemaOrgParser.ArticleItem article = parser.findFirstArticle();
+ if (article != null) {
+ publisher = article.getPersonOrOrganizationName(SchemaOrgParser.PUBLISHER_PROP);
+ if (publisher.isEmpty()) {
+ publisher = article.getPersonOrOrganizationName(
+ SchemaOrgParser.COPYRIGHT_HOLDER_PROP);
+ }
+ }
+ return publisher;
+ }
+ /**
+ * Returns a copyright notice built from the item returned by parser.findFirstArticle().
+ *
+ * The COPYRIGHT_YEAR_PROP string and the COPYRIGHT_HOLDER_PROP name are combined with
+ * SchemaOrgParser.concat(); a non-empty result is prefixed with "Copyright ". Returns ""
+ * when there is no article or the combined string is empty.
+ */
+ @Override
+ public String getCopyright() {
+ // Returns a concatenated string of copyright year and copyright holder of the first article
+ // that has these properties, delimited by a whitespace.
+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();
+ if (item == null) return "";
+ String copyright = SchemaOrgParser.concat(
+ item.getStringProperty(SchemaOrgParser.COPYRIGHT_YEAR_PROP),
+ item.getPersonOrOrganizationName(SchemaOrgParser.COPYRIGHT_HOLDER_PROP));
+ return copyright.isEmpty() ? copyright : "Copyright " + copyright;
+ }
+ /**
+ * Returns the author name, trying three sources in order: AUTHOR_PROP and then
+ * CREATOR_PROP of the item returned by parser.findFirstArticle(), and finally
+ * parser.getAuthorFromRel() when both are empty or there is no article.
+ */
+ @Override
+ public String getAuthor() {
+ String author = "";
+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();
+ if (item != null) {
+ author = item.getPersonOrOrganizationName(SchemaOrgParser.AUTHOR_PROP);
+ // If there's no "author" property, use "creator" property.
+ if (author.isEmpty()) {
+ author = item.getPersonOrOrganizationName(SchemaOrgParser.CREATOR_PROP);
+ }
+ }
+ // Otherwise, use "rel=author" tag.
+ return author.isEmpty() ? parser.getAuthorFromRel() : author;
+ }
+ /**
+ * Returns getArticle() of the item returned by parser.findFirstArticle(), or null when
+ * there is no such item.
+ */
+ @Override
+ public MarkupParser.Article getArticle() {
+ SchemaOrgParser.ArticleItem item = parser.findFirstArticle();
+ return item != null ? item.getArticle() : null;
+ }
+ /** Always returns false. */
+ @Override
+ public boolean optOut() {
+ return false;
+ }
+}

Powered by Google App Engine
This is Rietveld 408576698