src/com/dom_distiller/client/SchemaOrgParser.java - Issue 240073007: recognize and parse Schema.org Markup

Unified Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: addressed comments Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/com/dom_distiller/client/SchemaOrgParser.java

diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java

new file mode 100644

index 0000000000000000000000000000000000000000..b26b707aa6f57d886cc7146ebdb14984e027cc3e

--- /dev/null

+++ b/src/com/dom_distiller/client/SchemaOrgParser.java

@@ -0,0 +1,470 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package com.dom_distiller.client;

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+import com.google.gwt.dom.client.AnchorElement;

+import com.google.gwt.dom.client.Element;

+import com.google.gwt.dom.client.ImageElement;

+import com.google.gwt.dom.client.MetaElement;

+import com.google.gwt.dom.client.Node;

+import com.google.gwt.dom.client.NodeList;

+/**

+ * This class recognizes and parses schema.org markup tags, and returns the properties that matter

+ * to distilled content.

+ * Schema.org markup (http://schema.org) is based on the microdata format

+ * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html).

+ * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.

+ * In addition, for each type that we support, we also parse more specific properties:

+ * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,

+ * date modified, author, article section

+ * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,

+ * encoding format, caption, representative of page, width, height

+ * - Person: family name, given name

+ * - Organization: legal name.

+ * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or

+ * publisher of article or publisher of image could be a Schema.Org Person or Organization type;

+ * in fact, this is the reason we support Person and Organization types.

+ */

+public class SchemaOrgParser {

+ static final String NAME_PROP = "name";

+ static final String URL_PROP = "url";

+ static final String DESCRIPTION_PROP = "description";

+ static final String IMAGE_PROP = "image";

+ static final String HEADLINE_PROP = "headline";

+ static final String PUBLISHER_PROP = "publisher";

+ static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";

+ static final String COPYRIGHT_YEAR_PROP = "copyrightYear";

+ static final String CONTENT_URL_PROP = "contentUrl";

+ static final String ENCODING_FORMAT_PROP = "encodingFormat";

+ static final String CAPTION_PROP = "caption";

+ static final String REPRESENTATIVE_PROP = "representativeOfPage";

+ static final String WIDTH_PROP = "width";

+ static final String HEIGHT_PROP = "height";

+ static final String DATE_PUBLISHED_PROP = "datePublished";

+ static final String DATE_MODIFIED_PROP = "dateModified";

+ static final String AUTHOR_PROP = "author";

+ static final String CREATOR_PROP = "creator";

+ static final String SECTION_PROP = "articleSection";

+ static final String ASSOCIATED_MEDIA_PROP = "associatedMedia";

+ static final String ENCODING_PROP = "encoding";

+ static final String FAMILY_NAME_PROP = "familyName";

+ static final String GIVEN_NAME_PROP = "givenName";

+ static final String LEGAL_NAME_PROP = "legalName";

+ static final String AUTHOR_REL = "author";

+ enum Type { // All these types are extended from Thing, directly or indirectly.

+ IMAGE,

+ ARTICLE,

+ PERSON,

+ ORGANIZATION,

+ UNSUPPORTED,

+ }

+ static class ThingItem {

+ private final Type mType;

+ private final Map<String, String> mStringProperties;

+ private final Map<String, ThingItem> mItemProperties;

+ ThingItem(Type type) {

+ mType = type;

+ mStringProperties = new HashMap<String, String>();

+ mItemProperties = new HashMap<String, ThingItem>();

+ addStringPropertyName(NAME_PROP);

+ addStringPropertyName(URL_PROP);

+ addStringPropertyName(DESCRIPTION_PROP);

+ addStringPropertyName(IMAGE_PROP);

+ }

+ final void addStringPropertyName(String name) {

+ mStringProperties.put(name, "");

+ }

+ final void addItemPropertyName(String name) {

+ mItemProperties.put(name, null);

+ }

+ final String getStringProperty(String name) {

+ return !mStringProperties.containsKey(name) ? "" : mStringProperties.get(name);

+ }

+ final ThingItem getItemProperty(String name) {

+ return !mItemProperties.containsKey(name) ? null : mItemProperties.get(name);

+ }

+ final Type getType() { return mType; }

+ final boolean isSupported() { return mType != Type.UNSUPPORTED; }

+ // Store |value| for property with |name|, unless the property already has a non-empty

+ // value, in which case |value| will be ignored. This means we only keep the first value.

+ final void putStringValue(String name, String value) {

+ if (mStringProperties.containsKey(name) && mStringProperties.get(name).isEmpty()) {

+ mStringProperties.put(name, value);

+ }

+ // Store |value| for property with |name|, unless the property already has a non-null value,

+ // in which case, |value| will be ignored. This means we only keep the first value.

+ final void putItemValue(String name, ThingItem value) {

+ if (mItemProperties.containsKey(name)) mItemProperties.put(name, value);

+ }

+ private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>();

+ private String mAuthorFromRel = "";

+ private static final Map<String, Type> sTypeUrls;

+ static {

+ sTypeUrls = new HashMap<String, Type>();

+ sTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE);

+ sTypeUrls.put("http://schema.org/Article", Type.ARTICLE);

+ sTypeUrls.put("http://schema.org/BlogPosting", Type.ARTICLE);

+ sTypeUrls.put("http://schema.org/NewsArticle", Type.ARTICLE);

+ sTypeUrls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE);

+ sTypeUrls.put("http://schema.org/TechArticle", Type.ARTICLE);

+ sTypeUrls.put("http://schema.org/Person", Type.PERSON);

+ sTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION);

+ sTypeUrls.put("http://schema.org/Corporation", Type.ORGANIZATION);

+ sTypeUrls.put("http://schema.org/EducationalOrganization", Type.ORGANIZATION);

+ sTypeUrls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZATION);

+ sTypeUrls.put("http://schema.org/NGO", Type.ORGANIZATION);

+ }

+ /**

+ * The object that extracts and verifies Schema.org markup tags from |root|.

+ */

+ public SchemaOrgParser(Element root) {

+ // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?

+ // If parse lazily, all get* methods will need to check for parsed state and, if necessary,

+ // parse before returning the requested properties.

+ // Note that the <html> element can also be the start of a Schema.org item, and hence needs

+ // to be parsed.

+ parse(root, null);

+ }

+ final List<ArticleItem> getArticleItems() {

+ List<ArticleItem> articles = new ArrayList<ArticleItem>();

+ for (int i = 0; i < mItemScopes.size(); i++) {

+ ThingItem item = mItemScopes.get(i);

+ if (item.mType == Type.ARTICLE) articles.add((ArticleItem) item);

+ }

+ return articles;

+ }

+ final List<ImageItem> getImageItems() {

+ List<ImageItem> images = new ArrayList<ImageItem>();

+ for (int i = 0; i < mItemScopes.size(); i++) {

+ ThingItem item = mItemScopes.get(i);

+ if (item.mType == Type.IMAGE) images.add((ImageItem) item);

+ }

+ return images;

+ }

+ final String getAuthorFromRel() { return mAuthorFromRel; }

+ private void parse(Element e, ThingItem parentItem) {

+ ThingItem newItem = null;

+ boolean isItemScope = isItemScope(e);

+ // A non-null |parentItem| means we're currently parsing the elements for a schema.org type.

+ String[] propertyNames = parentItem != null ? getItemProp(e) : new String[0];

+ if (isItemScope) {

+ // The "itemscope" and "itemtype" attributes of |e| indicate the start of an item.

+ // Create the corresponding extended-ThingItem, and add it to the list if:

+ // 1) its type is supported, and

+ // 2) if the parent is an unsupported type, it's not an "itemprop" attribute of the

+ // parent, based on the rule that an item is a top-level item if its element doesn't

+ // have an itemprop attribute.

+ newItem = createItemForElement(e);

+ if (newItem != null && newItem.isSupported() &&

+ (parentItem == null || parentItem.isSupported() || propertyNames.length == 0)) {

+ mItemScopes.add(newItem);

+ }

+ // If parent is a supported type, parse the element for >= 1 properties in "itemprop"

+ // attribute.

+ if (propertyNames.length > 0 && parentItem.isSupported() &&

+ (newItem == null || newItem.isSupported())) {

+ for (int i = 0; i < propertyNames.length; i++) {

+ // If a new item was created above, the property value of this "itemprop" attribute

+ // is an embedded item, so add it to the parent item.

+ if (newItem != null) {

+ parentItem.putItemValue(propertyNames[i], newItem);

+ } else {

+ // Otherwise, extract the property value from the tag itself, and add it to the

+ // parent item.

+ parentItem.putStringValue(propertyNames[i], getPropertyValue(e));

+ }

+ // As per http://schema.org/author (or http://schema.org/Article and search for "author"

+ // property), if <a> or <link> tags specify rel="author", extract it.

+ if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute(e);

+ // Now, parse each child element recursively.

+ NodeList<Node> children = e.getChildNodes();

+ for (int i = 0; i < children.getLength(); i++) {

+ Node child = children.getItem(i);

+ if (child.getNodeType() != Node.ELEMENT_NODE) continue;

+ parse(Element.as(child), newItem != null ? newItem : parentItem);

+ }

+ private Type getItemType(Element e) {

+ // "itemtype" attribute is case-sensitive.

+ String type = e.getAttribute("ITEMTYPE");

+ return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPORTED;

+ }

+ private ThingItem createItemForElement(Element e) {

+ ThingItem newItem = null;

+ Type type = getItemType(e);

+ switch (type) {

+ case IMAGE:

+ newItem = new ImageItem();

+ break;

+ case ARTICLE:

+ newItem = new ArticleItem();

+ break;

+ case PERSON:

+ newItem = new PersonItem();

+ break;

+ case ORGANIZATION:

+ newItem = new OrganizationItem();

+ break;

+ case UNSUPPORTED:

+ newItem = new UnsupportedItem();

+ break;

+ default:

+ return null;

+ }

+ return newItem;

+ }

+ static class ImageItem extends ThingItem {

+ ImageItem() {

+ super(Type.IMAGE);

+ addStringPropertyName(CONTENT_URL_PROP);

+ addStringPropertyName(ENCODING_FORMAT_PROP);

+ addStringPropertyName(CAPTION_PROP);

+ addStringPropertyName(REPRESENTATIVE_PROP);

+ addStringPropertyName(WIDTH_PROP);

+ addStringPropertyName(HEIGHT_PROP);

+ }

+ final boolean isRepresentativeOfPage() {

+ return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true");

+ }

+ final MarkupParser.Image getImage() {

+ MarkupParser.Image image = new MarkupParser.Image();

+ image.image = getStringProperty(CONTENT_URL_PROP);

+ if (image.image.isEmpty()) image.image = getStringProperty(URL_PROP);

+ image.url = image.image;

+ image.type = getStringProperty(ENCODING_FORMAT_PROP);

+ image.caption = getStringProperty(CAPTION_PROP);

+ try {

+ image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10);

+ } catch (Exception e) {

+ }

+ try {

+ image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);

+ } catch (Exception e) {

+ }

+ return image;

+ }

+ static class ArticleItem extends ThingItem {

+ ArticleItem() {

+ super(Type.ARTICLE);

+ addStringPropertyName(HEADLINE_PROP);

+ addStringPropertyName(PUBLISHER_PROP);

+ addStringPropertyName(COPYRIGHT_HOLDER_PROP);

+ addStringPropertyName(COPYRIGHT_YEAR_PROP);

+ addStringPropertyName(DATE_MODIFIED_PROP);

+ addStringPropertyName(DATE_PUBLISHED_PROP);

+ addStringPropertyName(AUTHOR_PROP);

+ addStringPropertyName(CREATOR_PROP);

+ addStringPropertyName(SECTION_PROP);

+ addItemPropertyName(PUBLISHER_PROP);

+ addItemPropertyName(COPYRIGHT_HOLDER_PROP);

+ addItemPropertyName(AUTHOR_PROP);

+ addItemPropertyName(CREATOR_PROP);

+ addItemPropertyName(ASSOCIATED_MEDIA_PROP);

+ addItemPropertyName(ENCODING_PROP);

+ }

+ final MarkupParser.Article getArticle() {

+ MarkupParser.Article article = new MarkupParser.Article();

+ article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);

+ article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);

+ article.section = getStringProperty(SECTION_PROP);

+ String author = getPersonOrOrganizationName(AUTHOR_PROP);

+ if (author.isEmpty()) author = getPersonOrOrganizationName(CREATOR_PROP);

+ article.authors = author.isEmpty() ? new String[0] : new String[] { author };

+ return article;

+ }

+ final String getCopyright() {

+ // Returns a concatenated string of copyright year and copyright holder of the article,

+ // delimited by a whitespace.

+ String copyright = concat(getStringProperty(COPYRIGHT_YEAR_PROP),

+ getPersonOrOrganizationName(COPYRIGHT_HOLDER_PROP));

+ return copyright.isEmpty() ? copyright : "Copyright " + copyright;

+ }

+ final String getPersonOrOrganizationName(String propertyName) {

+ // Returns either the string value of |propertyName| or the value returned by getName()

+ // of PersonItem or OrganizationItem.

+ String value = getStringProperty(propertyName);

+ if (!value.isEmpty()) return value;

+ ThingItem valueItem = getItemProperty(propertyName);

+ if (valueItem != null) {

+ if (valueItem.getType() == Type.PERSON) {

+ value = ((PersonItem) valueItem).getName();

+ } else if (valueItem.getType() == Type.ORGANIZATION) {

+ value = ((OrganizationItem) valueItem).getName();

+ }

+ return value;

+ }

+ final ImageItem getRepresentativeImageItem() {

+ // Returns the corrresponding ImageItem for "associatedMedia" or "encoding" property.

+ ThingItem imageItem = getItemProperty(ASSOCIATED_MEDIA_PROP);

+ if (imageItem == null) imageItem = getItemProperty(ENCODING_PROP);

+ return imageItem != null && imageItem.getType() == Type.IMAGE ?

+ (ImageItem) imageItem : null;

+ }

+ final MarkupParser.Image getImage() {

+ // Use value of "image" property to create a MarkupParser.Image.

+ String imageUrl = getStringProperty(IMAGE_PROP);

+ if (imageUrl.isEmpty()) return null;

+ MarkupParser.Image image = new MarkupParser.Image();

+ image.image = imageUrl;

+ image.url = imageUrl;

+ return image;

+ }

+ private static class PersonItem extends ThingItem {

+ PersonItem() {

+ super(Type.PERSON);

+ addStringPropertyName(FAMILY_NAME_PROP);

+ addStringPropertyName(GIVEN_NAME_PROP);

+ }

+ String getName() {

+ // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and

+ // FAMILY_NAME_PROP delimited by a whitespace.

+ String name = getStringProperty(NAME_PROP);

+ return !name.isEmpty() ? name :

+ concat(getStringProperty(GIVEN_NAME_PROP), getStringProperty(FAMILY_NAME_PROP));

+ }

+ private static class OrganizationItem extends ThingItem {

+ OrganizationItem() {

+ super(Type.ORGANIZATION);

+ addStringPropertyName(LEGAL_NAME_PROP);

+ }

+ String getName() {

+ // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.

+ String name = getStringProperty(NAME_PROP);

+ return !name.isEmpty() ? name : getStringProperty(LEGAL_NAME_PROP);

+ }

+ private static class UnsupportedItem extends ThingItem {

+ UnsupportedItem() {

+ super(Type.UNSUPPORTED);

+ }

+ private static boolean isItemScope(Element e) {

+ return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE");

+ }

+ private static String[] getItemProp(Element e) {

+ // "itemprop" attribute is case-sensitive, and can have multiple properties.

+ String itemprop = e.getAttribute("ITEMPROP");

+ if (itemprop.isEmpty()) return new String[0];

+ String[] splits = StringUtil.split(itemprop, "\\s+");

+ return splits.length > 0 ? splits : new String[] { itemprop };

+ }

+ private static final Map<String, String> sTagAttributeMap;

+ static {

+ // The key for |sTagAttributeMap| is the tag name, while the entry value is an array of

+ // attributes in the specified tag from which to extract information:

+ // - 0th attribute: contains the value for the property specified in itemprop

+ // - 1st attribute: if available, contains the value for the author property.

+ sTagAttributeMap = new HashMap<String, String>();

+ sTagAttributeMap.put("IMG", "SRC");

+ sTagAttributeMap.put("AUDIO", "SRC");

+ sTagAttributeMap.put("EMBED", "SRC");

+ sTagAttributeMap.put("IFRAME", "SRC");

+ sTagAttributeMap.put("SOURCE", "SRC");

+ sTagAttributeMap.put("TRACK", "SRC");

+ sTagAttributeMap.put("VIDEO", "SRC");

+ sTagAttributeMap.put("A", "HREF");

+ sTagAttributeMap.put("LINK", "HREF");

+ sTagAttributeMap.put("AREA", "HREF");

+ sTagAttributeMap.put("META", "CONTENT");

+ sTagAttributeMap.put("TIME", "DATETIME");

+ sTagAttributeMap.put("OBJECT", "DATA");

+ sTagAttributeMap.put("DATA", "VALUE");

+ sTagAttributeMap.put("METER", "VALUE");

+ }

+ // Extracts the property value from |e|. For some tags, the value is a specific attribute,

+ // while for others, it's the text between the start and end tags.

+ private static String getPropertyValue(Element e) {

+ String value = "";

+ String tagName = e.getTagName();

+ if (sTagAttributeMap.containsKey(tagName)) {

+ value = e.getAttribute(sTagAttributeMap.get(tagName));

+ }

+ if (value.isEmpty()) value = e.getInnerText();

+ return value;

+ }

+ // Extracts the author property from the "rel=author" attribute of an anchor or a link element.

+ private static String getAuthorFromRelAttribute(Element e) {

+ String author = "";

+ String tagName = e.getTagName();

+ if ((tagName.equalsIgnoreCase("A") || tagName.equalsIgnoreCase("LINK")) &&

+ e.getAttribute("REL").equalsIgnoreCase(AUTHOR_REL)) {

+ author = e.getInnerText();

+ }

+ return author;

+ }

+ private static String concat(String first, String second) {

+ String concat = first;

+ if (!concat.isEmpty() && !second.isEmpty()) concat += " ";

+ concat += second;

+ return concat;

+ }

« no previous file with comments | « src/com/dom_distiller/client/MarkupParser.java ('k') | src/com/dom_distiller/client/SchemaOrgParserAccessor.java » ('j') | no next file with comments »