src/com/dom_distiller/client/SchemaOrgParser.java - Issue 240073007: recognize and parse Schema.org Markup

Unified Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: addressed all comments Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « src/com/dom_distiller/client/MarkupParser.java ('k') | test/com/dom_distiller/client/SchemaOrgParserTest.java » ('j') | test/com/dom_distiller/client/SchemaOrgParserTest.java » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: src/com/dom_distiller/client/SchemaOrgParser.java

diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java

new file mode 100644

index 0000000000000000000000000000000000000000..7d2937ee5067952d8985884412a431dd57af3c80

--- /dev/null

+++ b/src/com/dom_distiller/client/SchemaOrgParser.java

@@ -0,0 +1,499 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+package com.dom_distiller.client;

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+import com.google.gwt.dom.client.AnchorElement;

+import com.google.gwt.dom.client.Element;

+import com.google.gwt.dom.client.ImageElement;

+import com.google.gwt.dom.client.MetaElement;

+import com.google.gwt.dom.client.NodeList;

+/**

+ * This class recognizes and parses Schema.org markup tags, and returns the properties that matter

+ * to distilled content.

+ * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.

+ * In addition, for each type that we support, we also parse more specific properties:

+ * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,

+ * date modified, author, article section

+ * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,

+ * encoding format, caption, representative of page, width, height

+ * - Person: family name, given name

+ * - Organization: legal name.

+ * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or

+ * publisher of article or publisher of image could be a Schema.Org Person or Organization type;

+ * in fact, this is the reason we support Person and Organization types.

+ */

+public class SchemaOrgParser implements MarkupParser.Parser {

+ private static final String NAME_PROP = "name";

+ private static final String URL_PROP = "url";

+ private static final String DESCRIPTION_PROP = "description";

+ private static final String IMAGE_PROP = "image";

+ private static final String HEADLINE_PROP = "headline";

+ private static final String PUBLISHER_PROP = "publisher";

+ private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";

+ private static final String COPYRIGHT_YEAR_PROP = "copyrightYear";

+ private static final String CONTENT_URL_PROP = "contentUrl";

+ private static final String ENCODING_FORMAT_PROP = "encodingFormat";

+ private static final String CAPTION_PROP = "caption";

+ private static final String REPRESENTATIVE_PROP = "representativeOfPage";

+ private static final String WIDTH_PROP = "width";

+ private static final String HEIGHT_PROP = "height";

+ private static final String DATE_PUBLISHED_PROP = "datePublished";

+ private static final String DATE_MODIFIED_PROP = "dateModified";

+ private static final String AUTHOR_PROP = "author";

+ private static final String SECTION_PROP = "articleSection";

+ private static final String FAMILY_NAME_PROP = "familyName";

+ private static final String GIVEN_NAME_PROP = "givenName";

+ private static final String LEGAL_NAME_PROP = "legalName";

+ private enum Type { // All these types are extended from Thing, directly or indirectly.

+ IMAGE,

+ ARTICLE,

+ PERSON,

+ ORGANIZATION,

+ UNSUPPORTED,

+ }

+ private static class ThingItem {

+ protected final Type mType;

cjhopman 2014/04/18 01:17:01 nit: I think all the fields here could be private

kuan 2014/04/18 23:34:38 Done.

+ protected final Element mRoot;

cjhopman 2014/04/18 01:17:01 mRoot looks unused.

kuan 2014/04/18 23:34:38 Done.

+ protected final String[] mStringPropertyNames;

+ protected final String[] mItemPropertyNames;

+ protected final String[] mStringProperties;

+ protected final ThingItem[] mItemProperties;

+ protected ThingItem(Type type, Element root,

+ String[] stringPropertyNames, String[] itemPropertyNames) {

+ mType = type;

+ mRoot = root;

+ mStringPropertyNames = stringPropertyNames;

+ mItemPropertyNames = itemPropertyNames;

+ mStringProperties = new String[mStringPropertyNames.length];

+ mItemProperties = new ThingItem[mItemPropertyNames.length];

+ }

+ // Base implementation of the string form used when this item is the value of another

+ // item's property: always the empty string. Subclasses override this to supply a name.

+ protected String toStringProperty() {

+ return "";

+ }

+ protected MarkupParser.Image getImage() {

+ // Use value of IMAGE_PROP to create a MarkupParser.Image.

+ String imageUrl = getStringProperty(IMAGE_PROP);

+ if (imageUrl.isEmpty()) return null;

+ MarkupParser.Image image = new MarkupParser.Image();

+ image.image = imageUrl;

+ image.url = imageUrl;

+ return image;

+ }

+ // Base implementation: a generic item carries no article information, so this always

+ // returns null.

+ protected MarkupParser.Article getArticle() {

+ return null;

+ }

+ protected final boolean isImageRepresentativeOfPage() {
+     // True only when REPRESENTATIVE_PROP holds the text "true", compared case-insensitively.
+     final String flag = getStringProperty(REPRESENTATIVE_PROP);
+     final boolean representative = flag.equalsIgnoreCase("true");
+     return representative;
+ }

+ protected final void putStringValue(String name, String value) {

+ for (int i = 0; i < mStringPropertyNames.length; i++) {

+ if (name.equals(mStringPropertyNames[i])) {

+ mStringProperties[i] = value;

+ break;

+ }

+ protected final void putItemValue(String name, ThingItem value) {

+ for (int i = 0; i < mItemPropertyNames.length; i++) {

+ if (name.equals(mItemPropertyNames[i])) {

+ mItemProperties[i] = value;

+ break;

+ }

+ protected final String getStringProperty(String name) {

+ // Check if property exists in |mStringProperties|.

+ for (int i = 0; i < mStringPropertyNames.length; i++) {

+ if (name.equals(mStringPropertyNames[i])) {

+ String value = mStringProperties[i];

+ if (value != null && !value.isEmpty()) return value;

+ break;

+ }

+ // Otherwise, repeat for |mItemProperties|.

+ for (int i = 0; i < mItemPropertyNames.length; i++) {

+ if (!name.equals(mItemPropertyNames[i])) continue;

+ if (mItemProperties[i] != null) return mItemProperties[i].toStringProperty();

+ break;

+ }

+ return "";

+ }

+ private final List<ThingItem> mItemScopes;

cjhopman 2014/04/18 01:17:01 this can be static and initialized in a static ini

kuan 2014/04/18 23:34:38 different instances of SchemaOrgParser have differ

+ private Element mRoot = null;

cjhopman 2014/04/18 01:17:01 is this used?

kuan 2014/04/18 23:34:38 Done.

+ private final Map<String, Type> mTypeUrls = new HashMap<String, Type>();

+ /**
+  * Creates a parser for |root| and immediately parses the Schema.org markup found under it.
+  */
+ public SchemaOrgParser(Element root) {
+     // Register the "itemtype" URLs of the supported types. The empty string is explicitly
+     // mapped to UNSUPPORTED.
+     mTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE);
+     mTypeUrls.put("http://schema.org/Article", Type.ARTICLE);
+     mTypeUrls.put("http://schema.org/Person", Type.PERSON);
+     mTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION);
+     mTypeUrls.put("", Type.UNSUPPORTED);
+
+     mItemScopes = new ArrayList<ThingItem>();
+     mRoot = root;
+
+     // TODO(kuan): Parsing all tags is pretty expensive; should we parse lazily instead?
+     // With lazy parsing, every get* method would have to check the parsed state and, if
+     // necessary, parse before returning the requested properties.
+     // The <html> element itself can be the start of a Schema.org item, so parsing begins at
+     // the root element rather than at its children.
+     parse(root, null);
+ }

+ @Override
+ public String getTitle() {
+     // Prefer the headline; fall back to the generic name when no headline was found.
+     final String headline = findStringProperty(HEADLINE_PROP);
+     if (!headline.isEmpty()) {
+         return headline;
+     }
+     return findStringProperty(NAME_PROP);
+ }

+ @Override
+ public String getType() {
+     // The type of the first parsed item is assumed to be the type of the page; with no
+     // parsed items there is no type to report.
+     return mItemScopes.isEmpty() ? null : mItemScopes.get(0).mType.toString();
+ }

+ // Returns the first non-empty URL_PROP value among the parsed items, or "" if none has one.

+ @Override

+ public String getUrl() {

+ return findStringProperty(URL_PROP);

+ }

+ @Override

+ public MarkupParser.Image[] getImages() {

+ if (mItemScopes.isEmpty()) return null;

+ List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();

+ for (int i = 0; i < mItemScopes.size(); i++) {

+ ThingItem item = mItemScopes.get(i);

+ MarkupParser.Image image = item.getImage();

+ if (image != null) {

+ if (item.isImageRepresentativeOfPage()) {

+ // Image should be the dominant, i.e. first, one.

+ images.add(0, image);

+ } else {

+ images.add(image);

+ }

+ if (images.isEmpty()) return null;

+ return images.toArray(new MarkupParser.Image[images.size()]);

+ }

+ // Returns the first non-empty DESCRIPTION_PROP value among the parsed items, or "" if none

+ // has one.

+ @Override

+ public String getDescription() {

+ return findStringProperty(DESCRIPTION_PROP);

+ }

+ // Returns the first non-empty PUBLISHER_PROP value among the parsed items, or "" if none

+ // has one.

+ @Override

+ public String getPublisher() {

+ return findStringProperty(PUBLISHER_PROP);

+ }

+ @Override
+ public String getCopyright() {
+     // Take the first item whose copyright year and/or copyright holder is set, join the two
+     // with a space, and prefix the result with "Copyright ". If no item has either property,
+     // the result is the empty string.
+     for (ThingItem item : mItemScopes) {
+         final String yearAndHolder = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP),
+                 item.getStringProperty(COPYRIGHT_HOLDER_PROP));
+         if (!yearAndHolder.isEmpty()) {
+             return "Copyright " + yearAndHolder;
+         }
+     }
+     return "";
+ }

+ // Returns the first non-empty AUTHOR_PROP value among the parsed items, or "" if none has one.

+ @Override

+ public String getAuthor() {

+ return findStringProperty(AUTHOR_PROP);

+ }

+ @Override
+ public MarkupParser.Article getArticle() {
+     // Walk the parsed items in order and return the first article any of them produces;
+     // null when there are no items or none of them yields an article.
+     for (ThingItem item : mItemScopes) {
+         final MarkupParser.Article candidate = item.getArticle();
+         if (candidate != null) {
+             return candidate;
+         }
+     }
+     return null;
+ }

+ // This parser never requests an opt-out: the result is always false.

+ @Override

+ public boolean optOut() {

+ return false;

+ }

+ private void parse(Element e, ThingItem parentItem) {

+ ThingItem newItem = null;

+ boolean isItemScope = isItemscope(e);

+ if (isItemScope) {

+ // The "itemscope" and "itemtype" attributes of |e| indicate the start of an item.

+ // If the type is supported, create the corresponding extended-ThingItem and recursively

+ // parse it.

+ newItem = createItemForElement(e);

cjhopman 2014/04/18 01:17:01 It looks like we might handle nesting of elements

kuan 2014/04/18 23:34:38 Done. before, i coded it based on the assumption,

+ if (newItem != null) {

+ mItemScopes.add(newItem);

+ Element child = e.getFirstChildElement();

+ if (child != null) parse(child, newItem);

+ }

+ // A non-null |parentItem| means we're currently parsing the elements for a Schema.org type.

+ // Check if the current element has an "itemprop" attribute.

+ if (parentItem != null) {

+ String propertyName = getItemprop(e);

+ if (!propertyName.isEmpty()) {

+ // If a new item was created above, the property value of this "itemprop" attribute

+ // is an embedded item, so add it to the parent item.

+ if (newItem != null) {

+ parentItem.putItemValue(propertyName, newItem);

+ } else {

+ // Otherwise, extract the property value from the tag itself, and add it to the

+ // parent item.

+ parentItem.putStringValue(propertyName, getPropertyValue(e));

+ }

cjhopman 2014/04/18 01:17:01 Clarification of what I meant with "both cases wou

kuan 2014/04/18 23:34:38 Done. fyi, can't use forEach for NodeList.

+ // If |e| is an itemscope, its children would have been parsed by the parse() call above,

+ // so only recurse into immediate children otherwise.

+ if (!isItemScope) {

+ Element child = e.getFirstChildElement();

+ if (child != null) parse(child, parentItem);

+ }

+ // Parse the next available sibling element.

+ Element next = e.getNextSiblingElement();

+ if (next != null) parse(next, parentItem);

+ }

+ private Type getItemType(Element e) {

+ String type = e.getAttribute("ITEMTYPE");

+ return mTypeUrls.containsKey(type) ? mTypeUrls.get(type) : Type.UNSUPPORTED;

+ }

+ // Creates the ThingItem subclass that matches the item type of |e|, or returns null when
+ // the type is not supported.
+ private ThingItem createItemForElement(Element e) {
+     switch (getItemType(e)) {
+         case IMAGE:
+             return new ImageItem(e);
+         case ARTICLE:
+             return new ArticleItem(e);
+         case PERSON:
+             return new PersonItem(e);
+         case ORGANIZATION:
+             return new OrganizationItem(e);
+         case UNSUPPORTED:
+         default:
+             return null;
+     }
+ }

+ // Returns the first item that has the requested property value.

+ private String findStringProperty(String name) {

+ if (mItemScopes.isEmpty()) return "";

+ for (int i = 0; i < mItemScopes.size(); i++) {

+ String value = mItemScopes.get(i).getStringProperty(name);

+ if (!value.isEmpty()) return value;

+ }

+ return "";

+ }

+ private static class ImageItem extends ThingItem {

+ private static final String[] mStringPropertyNames = {

+ NAME_PROP,

+ URL_PROP,

+ DESCRIPTION_PROP,

+ IMAGE_PROP,

+ HEADLINE_PROP,

+ PUBLISHER_PROP,

+ COPYRIGHT_HOLDER_PROP,

+ COPYRIGHT_YEAR_PROP,

+ CONTENT_URL_PROP,

+ ENCODING_FORMAT_PROP,

+ CAPTION_PROP,

+ REPRESENTATIVE_PROP,

+ WIDTH_PROP,

+ HEIGHT_PROP,

+ };

+ private static final String[] mItemPropertyNames = {

+ PUBLISHER_PROP,

+ COPYRIGHT_HOLDER_PROP,

+ };

+ protected ImageItem(Element elem) {

+ super(Type.IMAGE, elem, mStringPropertyNames, mItemPropertyNames);

+ }

+ // Builds a MarkupParser.Image from this ImageObject's properties. The image location is
+ // CONTENT_URL_PROP, falling back to NAME_PROP when no content url is set. Width and height
+ // are filled in only when their property values parse as base-10 integers.
+ @Override
+ protected MarkupParser.Image getImage() {
+     MarkupParser.Image image = new MarkupParser.Image();
+     String url = getStringProperty(CONTENT_URL_PROP);
+     image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP);
+     image.url = image.image;
+     image.type = getStringProperty(ENCODING_FORMAT_PROP);
+     image.caption = getStringProperty(CAPTION_PROP);
+     try {
+         image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10);
+     } catch (NumberFormatException ignored) {
+         // Width is absent (empty string) or not an integer: leave |image.width| as
+         // initialized by MarkupParser.Image rather than failing the whole image.
+     }
+     try {
+         image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
+     } catch (NumberFormatException ignored) {
+         // Height is absent (empty string) or not an integer: leave |image.height| as
+         // initialized by MarkupParser.Image rather than failing the whole image.
+     }
+     return image;
+ }

+ private static class ArticleItem extends ThingItem {

+ private static final String[] mStringPropertyNames = {

+ NAME_PROP,

+ URL_PROP,

+ DESCRIPTION_PROP,

+ IMAGE_PROP,

+ HEADLINE_PROP,

+ PUBLISHER_PROP,

+ COPYRIGHT_HOLDER_PROP,

+ COPYRIGHT_YEAR_PROP,

+ DATE_MODIFIED_PROP,

+ DATE_PUBLISHED_PROP,

+ AUTHOR_PROP,

+ SECTION_PROP,

+ };

+ private static final String[] mItemPropertyNames = {

+ PUBLISHER_PROP,

+ COPYRIGHT_HOLDER_PROP,

+ AUTHOR_PROP,

+ };

+ protected ArticleItem(Element elem) {

+ super(Type.ARTICLE, elem, mStringPropertyNames, mItemPropertyNames);

+ }

+ @Override

+ protected MarkupParser.Article getArticle() {

+ MarkupParser.Article article = new MarkupParser.Article();

+ article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);

+ article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);

+ article.section = getStringProperty(SECTION_PROP);

+ String author = getStringProperty(AUTHOR_PROP);

+ article.authors = author.isEmpty() ? new String[0] : new String[] { author };

+ return article;

+ }

+ private static class PersonItem extends ThingItem {

+ private static final String[] mStringPropertyNames = {

+ NAME_PROP,

+ URL_PROP,

+ DESCRIPTION_PROP,

+ IMAGE_PROP,

+ FAMILY_NAME_PROP,

+ GIVEN_NAME_PROP,

+ };

+ protected PersonItem(Element elem) {

+ super(Type.PERSON, elem, mStringPropertyNames, new String[0]);

+ }

+ // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and

+ // FAILY_NAME_PROP delimited by a whitespace.

+ @Override

+ protected String toStringProperty() {

+ String fullname = getStringProperty(NAME_PROP);

+ if (fullname.isEmpty()) {

+ fullname = concat(getStringProperty(GIVEN_NAME_PROP),

+ getStringProperty(FAMILY_NAME_PROP));

+ }

+ return fullname;

+ }

+ private static class OrganizationItem extends ThingItem {

+ private static final String[] mStringPropertyNames = {

+ NAME_PROP,

+ URL_PROP,

+ DESCRIPTION_PROP,

+ IMAGE_PROP,

+ LEGAL_NAME_PROP,

+ };

+ protected OrganizationItem(Element elem) {

+ super(Type.ORGANIZATION, elem, mStringPropertyNames, new String[0]);

+ }

+ // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.

+ @Override

+ protected String toStringProperty() {

+ String name = getStringProperty(NAME_PROP);

+ if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP);

+ return name;

+ }

+ // An element starts a Schema.org item only when it carries both the "itemscope" and the
+ // "itemtype" attributes.
+ private static boolean isItemscope(Element e) {
+     if (!e.hasAttribute("ITEMSCOPE")) {
+         return false;
+     }
+     return e.hasAttribute("ITEMTYPE");
+ }

+ private static String getItemprop(Element e) {

+ // Returns the "itemprop" attribute of |e| exactly as written, with no case folding.
+ // NOTE(review): the original note said the attribute is case-sensitive; presumably this
+ // refers to the property name held in the attribute value - confirm.

+ return e.getAttribute("ITEMPROP");

+ }

+ // Extracts the property value from |e|. For some tags, the value is a specific attribute,

+ // while for others, it's the text between the start and end tags.

+ private static String getPropertyValue(Element e) {

+ String propertyValue = null;

+ if (e.hasTagName("A")) {

+ propertyValue = AnchorElement.as(e).getHref();

+ } else if (e.hasTagName("IMG")) {

+ propertyValue = ImageElement.as(e).getSrc();

+ } else if (e.hasTagName("META")) {

+ propertyValue = MetaElement.as(e).getContent();

+ } else if (e.hasTagName("TIME")) {

+ propertyValue = e.getAttribute("datetime");

+ }

+ if (propertyValue == null || propertyValue.isEmpty()) propertyValue = e.getInnerText();

+ return propertyValue;

+ }

+ private static String concat(String first, String second) {

+ String concat = first;

+ if (!concat.isEmpty() && !second.isEmpty()) concat += " ";

+ concat += second;

+ return concat;

+ }