Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(60)

Unified Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: rm 1 more unused prop in image Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/com/dom_distiller/client/SchemaOrgParser.java
diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java
new file mode 100644
index 0000000000000000000000000000000000000000..0ee9e1faffddd6fdf7a9900934d5818d603f5472
--- /dev/null
+++ b/src/com/dom_distiller/client/SchemaOrgParser.java
@@ -0,0 +1,455 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package com.dom_distiller.client;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.gwt.dom.client.AnchorElement;
+import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.ImageElement;
+import com.google.gwt.dom.client.MetaElement;
+import com.google.gwt.dom.client.Node;
+import com.google.gwt.dom.client.NodeList;
+
+/**
+ * This class recognizes and parses schema.org markup tags, and returns the properties that matter
+ * to distilled content.
+ * Schema.org markup (http://schema.org) is based on the microdata format
+ * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html).
+ * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.
+ * In addition, for each type that we support, we also parse more specific properties:
+ * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,
+ * date modified, author, article section
+ * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,
+ * encoding format, caption, representative of page, width, height
+ * - Person: family name, given name
+ * - Organization: legal name.
+ * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or
+ * publisher of article or publisher of image could be a Schema.Org Person or Organization type;
+ * in fact, this is the reason we support Person and Organization types.
+ */
+public class SchemaOrgParser {
+ // Names of the "itemprop" properties that this parser recognizes.
+ // Registered for every item by the ThingItem constructor.
+ static final String NAME_PROP = "name";
+ static final String URL_PROP = "url";
+ static final String DESCRIPTION_PROP = "description";
+ static final String IMAGE_PROP = "image";
+ // Registered by ArticleItem.
+ static final String HEADLINE_PROP = "headline";
+ static final String PUBLISHER_PROP = "publisher";
+ static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";
+ static final String COPYRIGHT_YEAR_PROP = "copyrightYear";
+ // Registered by ImageItem.
+ static final String CONTENT_URL_PROP = "contentUrl";
+ static final String ENCODING_FORMAT_PROP = "encodingFormat";
+ static final String CAPTION_PROP = "caption";
+ static final String REPRESENTATIVE_PROP = "representativeOfPage";
+ static final String WIDTH_PROP = "width";
+ static final String HEIGHT_PROP = "height";
+ // Registered by ArticleItem.
+ static final String DATE_PUBLISHED_PROP = "datePublished";
+ static final String DATE_MODIFIED_PROP = "dateModified";
+ static final String AUTHOR_PROP = "author";
+ static final String CREATOR_PROP = "creator";
+ static final String SECTION_PROP = "articleSection";
+ static final String ASSOCIATED_MEDIA_PROP = "associatedMedia";
+ static final String ENCODING_PROP = "encoding";
+ // Registered by PersonItem.
+ static final String FAMILY_NAME_PROP = "familyName";
+ static final String GIVEN_NAME_PROP = "givenName";
+ // Registered by OrganizationItem.
+ static final String LEGAL_NAME_PROP = "legalName";
+ // Value of the "rel" attribute that getAuthorFromRelAttribute() looks for.
+ static final String AUTHOR_REL = "author";
+
+ // The kinds of items this parser distinguishes. Each "itemtype" URL listed in |sTypeUrls| maps
+ // to one of the first four values; getItemType() returns UNSUPPORTED for any other URL.
+ enum Type { // All these types are extended from Thing, directly or indirectly.
+ IMAGE,
+ ARTICLE,
+ PERSON,
+ ORGANIZATION,
+ UNSUPPORTED,
+ }
+
+ /**
+  * Base class for a parsed item. It holds the item's type plus two property tables: one for
+  * properties whose value is a string, and one for properties whose value is an embedded item.
+  * Only property names that were registered through addStringPropertyName() or
+  * addItemPropertyName() can receive a value; values for any other name are dropped.
+  */
+ static class ThingItem {
+     private final Type mType;
+     // Registered string properties; the value is "" until one has been stored.
+     private final Map<String, String> mStringProperties;
+     // Registered item properties; the value is null until one has been stored.
+     private final Map<String, ThingItem> mItemProperties;
+
+     ThingItem(Type type) {
+         mType = type;
+         mStringProperties = new HashMap<String, String>();
+         mItemProperties = new HashMap<String, ThingItem>();
+
+         // The basic properties that every item accepts.
+         addStringPropertyName(NAME_PROP);
+         addStringPropertyName(URL_PROP);
+         addStringPropertyName(DESCRIPTION_PROP);
+         addStringPropertyName(IMAGE_PROP);
+     }
+
+     // Registers |name| as a string property, with an empty value.
+     final void addStringPropertyName(String name) {
+         mStringProperties.put(name, "");
+     }
+
+     // Registers |name| as an item property, with no value.
+     final void addItemPropertyName(String name) {
+         mItemProperties.put(name, null);
+     }
+
+     // Returns the stored string for |name|, or "" if |name| is not a registered string
+     // property or has no value yet.
+     final String getStringProperty(String name) {
+         return !mStringProperties.containsKey(name) ? "" : mStringProperties.get(name);
+     }
+
+     // Returns the stored item for |name|, or null if |name| is not a registered item property
+     // or has no value yet.
+     final ThingItem getItemProperty(String name) {
+         return mItemProperties.get(name);
+     }
+
+     final Type getType() { return mType; }
+
+     final boolean isSupported() { return mType != Type.UNSUPPORTED; }
+
+     // Store |value| for property with |name|, unless the property already has a non-empty
+     // value, in which case |value| will be ignored. This means we only keep the first value.
+     // Values for unregistered names are ignored as well.
+     final void putStringValue(String name, String value) {
+         if (mStringProperties.containsKey(name) && mStringProperties.get(name).isEmpty()) {
+             mStringProperties.put(name, value);
+         }
+     }
+
+     // Store |value| for property with |name|, unless the property already has a non-null value,
+     // in which case, |value| will be ignored. This means we only keep the first value.
+     // Values for unregistered names are ignored as well.
+     final void putItemValue(String name, ThingItem value) {
+         // The null check is what makes "first value wins" hold; without it a later embedded
+         // item would silently replace an earlier one.
+         if (mItemProperties.containsKey(name) && mItemProperties.get(name) == null) {
+             mItemProperties.put(name, value);
+         }
+     }
+ }
+
+ // Every supported item found while parsing, in the order encountered.
+ private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>();
+ // Text of the first rel="author" element found, or "" if there was none.
+ private String mAuthorFromRel = "";
+ // Maps each recognized "itemtype" URL to the Type used to represent it.
+ private static final Map<String, Type> sTypeUrls;
+
+ static {
+     Map<String, Type> typeUrls = new HashMap<String, Type>();
+
+     // Article and its recognized subtypes.
+     typeUrls.put("http://schema.org/Article", Type.ARTICLE);
+     typeUrls.put("http://schema.org/BlogPosting", Type.ARTICLE);
+     typeUrls.put("http://schema.org/NewsArticle", Type.ARTICLE);
+     typeUrls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE);
+     typeUrls.put("http://schema.org/TechArticle", Type.ARTICLE);
+
+     // Images.
+     typeUrls.put("http://schema.org/ImageObject", Type.IMAGE);
+
+     // People.
+     typeUrls.put("http://schema.org/Person", Type.PERSON);
+
+     // Organization and its recognized subtypes.
+     typeUrls.put("http://schema.org/Organization", Type.ORGANIZATION);
+     typeUrls.put("http://schema.org/Corporation", Type.ORGANIZATION);
+     typeUrls.put("http://schema.org/EducationalOrganization", Type.ORGANIZATION);
+     typeUrls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZATION);
+     typeUrls.put("http://schema.org/NGO", Type.ORGANIZATION);
+
+     sTypeUrls = typeUrls;
+ }
+
+ /**
+  * The object that extracts and verifies Schema.org markup tags from |root|.
+  * Parsing happens eagerly: the constructor walks |root| and all of its descendant elements,
+  * so the results are available from the getters as soon as construction returns.
+  */
+ public SchemaOrgParser(Element root) {
+ // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?
+ // If parse lazily, all get* methods will need to check for parsed state and, if necessary,
+ // parse before returning the requested properties.
+ // Note that the <html> element can also be the start of a Schema.org item, and hence needs
+ // to be parsed.
+ // A null parent item means |root| is not nested inside any item.
+ parse(root, null);
+ }
+
+ // Returns the first item of type ARTICLE among the collected items, or null if there is none.
+ final ArticleItem findFirstArticle() {
+     for (ThingItem candidate : mItemScopes) {
+         if (candidate.mType != Type.ARTICLE) continue;
+         return (ArticleItem) candidate;
+     }
+     return null;
+ }
+
+ final List<ThingItem> getItemScopes() { return mItemScopes; }
+
+ final String getAuthorFromRel() { return mAuthorFromRel; }
+
+ // Joins |first| and |second|, inserting a single space between them only when both are
+ // non-empty.
+ static String concat(String first, String second) {
+     boolean needsSeparator = !first.isEmpty() && !second.isEmpty();
+     return needsSeparator ? first + " " + second : first + second;
+ }
+
+ private void parse(Element e, ThingItem parentItem) {
+ ThingItem newItem = null;
+ boolean isItemScope = isItemScope(e);
+ // A non-null |parentItem| means we're currently parsing the elements for a schema.org type.
+ String[] propertyNames = parentItem != null ? getItemProp(e) : new String[0];
+
+ if (isItemScope) {
+ // The "itemscope" and "itemtype" attributes of |e| indicate the start of an item.
+ // Create the corresponding extended-ThingItem, and add it to the list if:
+ // 1) its type is supported, and
+ // 2) if the parent is an unsupported type, it's not an "itemprop" attribute of the
+ // parent, based on the rule that an item is a top-level item if its element doesn't
+ // have an itemprop attribute.
+ newItem = createItemForElement(e);
+ if (newItem != null && newItem.isSupported() &&
+ (parentItem == null || parentItem.isSupported() || propertyNames.length == 0)) {
+ mItemScopes.add(newItem);
+ }
+ }
+
+ // If parent is a supported type, parse the element for >= 1 properties in "itemprop"
+ // attribute.
+ if (propertyNames.length > 0 && parentItem.isSupported() &&
+ (newItem == null || newItem.isSupported())) {
+ for (int i = 0; i < propertyNames.length; i++) {
+ // If a new item was created above, the property value of this "itemprop" attribute
+ // is an embedded item, so add it to the parent item.
+ if (newItem != null) {
+ parentItem.putItemValue(propertyNames[i], newItem);
+ } else {
+ // Otherwise, extract the property value from the tag itself, and add it to the
+ // parent item.
+ parentItem.putStringValue(propertyNames[i], getPropertyValue(e));
+ }
+ }
+ }
+
+ // If <a> or <link> tags specify rel="author", extract it.
+ if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute(e);
+
+ // Now, parse each child element recursively.
+ NodeList<Node> children = e.getChildNodes();
+ for (int i = 0; i < children.getLength(); i++) {
+ Node child = children.getItem(i);
+ if (child.getNodeType() != Node.ELEMENT_NODE) continue;
+ parse(Element.as(child), newItem != null ? newItem : parentItem);
+ }
+ }
+
+ // Maps the "itemtype" attribute of |e| to a Type. The attribute value is looked up as an
+ // exact (case-sensitive) string; anything not in |sTypeUrls| yields UNSUPPORTED.
+ private Type getItemType(Element e) {
+     Type knownType = sTypeUrls.get(e.getAttribute("ITEMTYPE"));
+     if (knownType == null) return Type.UNSUPPORTED;
+     return knownType;
+ }
+
+ // Creates the ThingItem subclass that corresponds to the "itemtype" of |e|.
+ private ThingItem createItemForElement(Element e) {
+     switch (getItemType(e)) {
+         case IMAGE:
+             return new ImageItem();
+         case ARTICLE:
+             return new ArticleItem();
+         case PERSON:
+             return new PersonItem();
+         case ORGANIZATION:
+             return new OrganizationItem();
+         case UNSUPPORTED:
+             return new UnsupportedItem();
+         default:
+             return null;
+     }
+ }
+
+ // Item for the IMAGE type. On top of the basic properties, it accepts the content url,
+ // encoding format, caption, representative-of-page flag, width and height.
+ static class ImageItem extends ThingItem {
+     ImageItem() {
+         super(Type.IMAGE);
+
+         addStringPropertyName(CONTENT_URL_PROP);
+         addStringPropertyName(ENCODING_FORMAT_PROP);
+         addStringPropertyName(CAPTION_PROP);
+         addStringPropertyName(REPRESENTATIVE_PROP);
+         addStringPropertyName(WIDTH_PROP);
+         addStringPropertyName(HEIGHT_PROP);
+     }
+
+     // Returns true if "representativeOfPage" is "true", compared case-insensitively.
+     final boolean isRepresentativeOfPage() {
+         return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true");
+     }
+
+     // Builds a MarkupParser.Image from the stored properties. The image location comes from
+     // "contentUrl", falling back to "url" when that is empty.
+     final MarkupParser.Image getImage() {
+         MarkupParser.Image image = new MarkupParser.Image();
+         image.image = getStringProperty(CONTENT_URL_PROP);
+         if (image.image.isEmpty()) image.image = getStringProperty(URL_PROP);
+         image.url = image.image;
+         image.type = getStringProperty(ENCODING_FORMAT_PROP);
+         image.caption = getStringProperty(CAPTION_PROP);
+         try {
+             image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10);
+         } catch (NumberFormatException ignored) {
+             // Width is absent or not a decimal integer; leave |image.width| untouched.
+         }
+         try {
+             image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
+         } catch (NumberFormatException ignored) {
+             // Height is absent or not a decimal integer; leave |image.height| untouched.
+         }
+         return image;
+     }
+ }
+
+ // Item for the ARTICLE type. Publisher, copyright holder, author and creator are accepted
+ // both as plain strings and as embedded items; associated media and encoding are accepted
+ // as embedded items only.
+ static class ArticleItem extends ThingItem {
+     ArticleItem() {
+         super(Type.ARTICLE);
+
+         // Properties that may hold an embedded item.
+         addItemPropertyName(PUBLISHER_PROP);
+         addItemPropertyName(COPYRIGHT_HOLDER_PROP);
+         addItemPropertyName(AUTHOR_PROP);
+         addItemPropertyName(CREATOR_PROP);
+         addItemPropertyName(ASSOCIATED_MEDIA_PROP);
+         addItemPropertyName(ENCODING_PROP);
+
+         // Properties that may hold a string.
+         addStringPropertyName(HEADLINE_PROP);
+         addStringPropertyName(PUBLISHER_PROP);
+         addStringPropertyName(COPYRIGHT_HOLDER_PROP);
+         addStringPropertyName(COPYRIGHT_YEAR_PROP);
+         addStringPropertyName(DATE_MODIFIED_PROP);
+         addStringPropertyName(DATE_PUBLISHED_PROP);
+         addStringPropertyName(AUTHOR_PROP);
+         addStringPropertyName(CREATOR_PROP);
+         addStringPropertyName(SECTION_PROP);
+     }
+
+     // Builds a MarkupParser.Article from the stored properties. The single author is taken
+     // from "author", falling back to "creator"; if neither yields a name, the authors array
+     // is empty.
+     final MarkupParser.Article getArticle() {
+         String authorName = getPersonOrOrganizationName(AUTHOR_PROP);
+         if (authorName.isEmpty()) authorName = getPersonOrOrganizationName(CREATOR_PROP);
+
+         MarkupParser.Article result = new MarkupParser.Article();
+         result.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);
+         result.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);
+         result.section = getStringProperty(SECTION_PROP);
+         if (authorName.isEmpty()) {
+             result.authors = new String[0];
+         } else {
+             result.authors = new String[] { authorName };
+         }
+         return result;
+     }
+
+     // Resolves |propertyName| to a name: the string value if there is one, otherwise the
+     // getName() of an embedded PersonItem or OrganizationItem, otherwise "".
+     final String getPersonOrOrganizationName(String propertyName) {
+         String text = getStringProperty(propertyName);
+         if (!text.isEmpty()) return text;
+
+         ThingItem embedded = getItemProperty(propertyName);
+         if (embedded == null) return text;
+
+         if (embedded.getType() == Type.PERSON) {
+             return ((PersonItem) embedded).getName();
+         }
+         if (embedded.getType() == Type.ORGANIZATION) {
+             return ((OrganizationItem) embedded).getName();
+         }
+         return text;
+     }
+
+     // Returns the embedded item of the "associatedMedia" property, or of "encoding" if the
+     // former is unset, provided that item is an image; otherwise returns null.
+     final ImageItem getRepresentativeImageItem() {
+         ThingItem media = getItemProperty(ASSOCIATED_MEDIA_PROP);
+         if (media == null) media = getItemProperty(ENCODING_PROP);
+         if (media == null || media.getType() != Type.IMAGE) return null;
+         return (ImageItem) media;
+     }
+
+     // Wraps the value of the "image" property in a MarkupParser.Image, or returns null when
+     // the property is empty.
+     final MarkupParser.Image getImage() {
+         String location = getStringProperty(IMAGE_PROP);
+         if (location.isEmpty()) return null;
+
+         MarkupParser.Image result = new MarkupParser.Image();
+         result.image = location;
+         result.url = location;
+         return result;
+     }
+ }
+
+ // Item for the PERSON type; additionally accepts the family name and given name.
+ private static class PersonItem extends ThingItem {
+     PersonItem() {
+         super(Type.PERSON);
+
+         addStringPropertyName(FAMILY_NAME_PROP);
+         addStringPropertyName(GIVEN_NAME_PROP);
+     }
+
+     // Returns the "name" property if set; otherwise the given name followed by the family
+     // name, separated by a space when both are present.
+     String getName() {
+         String fullName = getStringProperty(NAME_PROP);
+         if (!fullName.isEmpty()) return fullName;
+         return concat(getStringProperty(GIVEN_NAME_PROP), getStringProperty(FAMILY_NAME_PROP));
+     }
+ }
+
+ // Item for the ORGANIZATION type; additionally accepts the legal name.
+ private static class OrganizationItem extends ThingItem {
+     OrganizationItem() {
+         super(Type.ORGANIZATION);
+
+         addStringPropertyName(LEGAL_NAME_PROP);
+     }
+
+     // Returns the "name" property if set, otherwise the "legalName" property.
+     String getName() {
+         String displayName = getStringProperty(NAME_PROP);
+         if (displayName.isEmpty()) return getStringProperty(LEGAL_NAME_PROP);
+         return displayName;
+     }
+ }
+
+ // Placeholder created by createItemForElement() for an item whose "itemtype" is not in
+ // |sTypeUrls|. It registers nothing beyond the basic ThingItem properties.
+ private static class UnsupportedItem extends ThingItem {
+ UnsupportedItem() {
+ super(Type.UNSUPPORTED);
+ }
+ }
+
+ // An element starts an item only when it carries both the "itemscope" and the "itemtype"
+ // attributes.
+ private static boolean isItemScope(Element e) {
+     if (!e.hasAttribute("ITEMSCOPE")) return false;
+     return e.hasAttribute("ITEMTYPE");
+ }
+
+ // Returns the property names listed in the "itemprop" attribute of |e|, which may hold
+ // several whitespace-separated names. Returns an empty array when the attribute is empty.
+ private static String[] getItemProp(Element e) {
+     String attributeValue = e.getAttribute("ITEMPROP");
+     if (attributeValue.isEmpty()) return new String[0];
+
+     String[] names = StringUtil.split(attributeValue, "\\s+");
+     // Fall back to the raw attribute value if splitting produced nothing.
+     if (names.length == 0) names = new String[] { attributeValue };
+     return names;
+ }
+
+ // Per-tag lookup table consulted by getPropertyValue() and getAuthorFromRelAttribute().
+ private static final Map<String, String[]> sTagAttributesMap;
+
+ static {
+ // The key for |sTagAttributesMap| is the tag name, while the entry value is an array of
+ // attributes in the specified tag from which to extract information:
+ // - 0th attribute: contains the value for the property specified in itemprop
+ // - 1st attribute: if available, contains the value for the author property.
+ // Only the "A" and "LINK" entries carry a 1st attribute ("REL").
+ sTagAttributesMap = new HashMap<String, String[]>();
+ sTagAttributesMap.put("IMG", new String[] { "SRC" });
+ sTagAttributesMap.put("AUDIO", new String[] { "SRC" });
+ sTagAttributesMap.put("EMBED", new String[] { "SRC" });
+ sTagAttributesMap.put("IFRAME", new String[] { "SRC" });
+ sTagAttributesMap.put("SOURCE", new String[] { "SRC" });
+ sTagAttributesMap.put("TRACK", new String[] { "SRC" });
+ sTagAttributesMap.put("VIDEO", new String[] { "SRC" });
+ sTagAttributesMap.put("A", new String[] { "HREF", "REL" });
+ sTagAttributesMap.put("LINK", new String[] { "HREF", "REL" });
cjhopman 2014/04/29 17:04:19 I think this would be clearer if this was just a m
kuan 2014/04/29 23:26:43 Done. hehe.. back then when i changed to include
+ sTagAttributesMap.put("AREA", new String[] { "HREF" });
+ sTagAttributesMap.put("META", new String[] { "CONTENT" });
+ sTagAttributesMap.put("TIME", new String[] { "DATETIME" });
+ sTagAttributesMap.put("OBJECT", new String[] { "DATA" });
+ sTagAttributesMap.put("DATA", new String[] { "VALUE" });
+ sTagAttributesMap.put("METER", new String[] { "VALUE" });
+ }
+
+ // Returns the property value carried by |e|. For tags listed in |sTagAttributesMap| this is
+ // the 0th attribute of the tag's entry; for all other tags, or when that attribute is empty,
+ // it is the element's inner text.
+ private static String getPropertyValue(Element e) {
+     String[] attributes = sTagAttributesMap.get(e.getTagName());
+     String attributeValue = attributes != null ? e.getAttribute(attributes[0]) : "";
+     return attributeValue.isEmpty() ? e.getInnerText() : attributeValue;
+ }
+
+ // Extracts the author from |e| when it is a tag whose |sTagAttributesMap| entry has a 1st
+ // attribute ("A" and "LINK", both using "REL") and that attribute names the "author" link
+ // type. Returns the element's inner text in that case, and "" otherwise.
+ // The "rel" value is treated as a whitespace-separated list of tokens that are compared
+ // case-insensitively, so rel="Author" and rel="author nofollow" are recognized too.
+ private static String getAuthorFromRelAttribute(Element e) {
+     String[] attrs = sTagAttributesMap.get(e.getTagName());
+     if (attrs == null || attrs.length < 2) return "";
+
+     String rel = e.getAttribute(attrs[1]);
+     if (rel == null) return "";
+
+     for (String token : rel.trim().split("\\s+")) {
+         if (token.equalsIgnoreCase(AUTHOR_REL)) return e.getInnerText();
+     }
+     return "";
+ }
+}

Powered by Google App Engine
This is Rietveld 408576698