Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(64)

Unified Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/com/dom_distiller/client/SchemaOrgParser.java
diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java
new file mode 100644
index 0000000000000000000000000000000000000000..cc00c4823b8282187ea2f9b170b7994e8e3f8f82
--- /dev/null
+++ b/src/com/dom_distiller/client/SchemaOrgParser.java
@@ -0,0 +1,517 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package com.dom_distiller.client;
+
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.gwt.dom.client.AnchorElement;
+import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.ImageElement;
+import com.google.gwt.dom.client.MetaElement;
+import com.google.gwt.dom.client.NodeList;
+
+/**
+ * This class recognizes and parses Schema.org markup tags, and returns the properties that matter
+ * to distilled content.
+ * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.
+ * In addition, for each type that we support, we also parse more specific properties:
+ * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,
+ * date modified, author, article section
+ * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,
+ * encoding format, caption, representative of page, width, height
+ * - Person: family name, given name
+ * - Organization: legal name.
+ * The value of a Schema.org property can itself be a Schema.org type, i.e. an embedded item. E.g.,
+ * the author or publisher of an article, or the publisher of an image, could be a Schema.org
+ * Person or Organization; in fact, this is the reason we support the Person and Organization types.
+ */
+public class SchemaOrgParser implements MarkupParser.Parser {
+ // Names of the "itemprop" properties that this parser stores.
+ // Listed for every supported item type (Article, ImageObject, Person, Organization):
+ private static final String NAME_PROP = "name";
+ private static final String URL_PROP = "url";
+ private static final String DESCRIPTION_PROP = "description";
+ private static final String IMAGE_PROP = "image";
+ // Listed for both Article and ImageObject:
+ private static final String HEADLINE_PROP = "headline";
+ private static final String PUBLISHER_PROP = "publisher";
+ private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";
+ private static final String COPYRIGHT_YEAR_PROP = "copyrightYear";
+ // Listed for ImageObject only:
+ private static final String CONTENT_URL_PROP = "contentUrl";
+ private static final String ENCODING_FORMAT_PROP = "encodingFormat";
+ private static final String CAPTION_PROP = "caption";
+ private static final String REPRESENTATIVE_PROP = "representativeOfPage";
+ private static final String WIDTH_PROP = "width";
+ private static final String HEIGHT_PROP = "height";
+ // Listed for Article only:
+ private static final String DATE_PUBLISHED_PROP = "datePublished";
+ private static final String DATE_MODIFIED_PROP = "dateModified";
+ private static final String AUTHOR_PROP = "author";
+ private static final String SECTION_PROP = "articleSection";
+ // Listed for Person only:
+ private static final String FAMILY_NAME_PROP = "familyName";
+ private static final String GIVEN_NAME_PROP = "givenName";
+ // Listed for Organization only:
+ private static final String LEGAL_NAME_PROP = "legalName";
+
+ // Schema.org item types distinguished by this parser. UNSUPPORTED is what getType(Element)
+ // yields when an element's "itemtype" matches none of the registered type URLs.
+ // The constant names are returned verbatim by getType() via toString(), so renaming one
+ // changes that method's output.
+ private enum Type { // All these types are extended from Thing, directly or indirectly.
+ IMAGE,
+ ARTICLE,
+ PERSON,
+ ORGANIZATION,
+ UNSUPPORTED,
+ }
+
+    // Base class for one parsed Schema.org item. Property values live in two arrays that run
+    // parallel to the name arrays handed to the constructor: plain string values, and embedded
+    // items (properties whose value is itself a Schema.org item).
+    private static class ThingItem {
+        protected final Type mType;
+        protected final Element mRoot;
+        protected final String[] mStringPropertyNames;
+        protected final String[] mItemPropertyNames;
+        // mStringProperties[i] is the value stored for mStringPropertyNames[i]; null until set.
+        protected final String[] mStringProperties;
+        // mItemProperties[i] is the embedded item stored for mItemPropertyNames[i]; null until set.
+        protected final ThingItem[] mItemProperties;
+
+        protected ThingItem(Type type, Element root,
+                String[] stringPropertyNames, String[] itemPropertyNames) {
+            mType = type;
+            mRoot = root;
+            mStringPropertyNames = stringPropertyNames;
+            mItemPropertyNames = itemPropertyNames;
+            mStringProperties = new String[stringPropertyNames.length];
+            mItemProperties = new ThingItem[itemPropertyNames.length];
+        }
+
+        // String form used when this item is the embedded value of another item's property.
+        // The base implementation has nothing to offer; subclasses override it.
+        protected String toStringProperty() {
+            return "";
+        }
+
+        // Builds an image from the value of IMAGE_PROP, or returns null if that value is empty.
+        protected MarkupParser.Image getImage() {
+            final String src = getStringProperty(IMAGE_PROP);
+            if (src.isEmpty()) return null;
+            final MarkupParser.Image result = new MarkupParser.Image();
+            result.image = src;
+            result.url = src;
+            return result;
+        }
+
+        // Only article items produce an article; the base implementation returns null.
+        protected MarkupParser.Article getArticle() {
+            return null;
+        }
+
+        // True if REPRESENTATIVE_PROP holds "true", compared without regard to case.
+        protected final boolean isImageRepresentativeOfPage() {
+            return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true");
+        }
+
+        // Stores |value| for string property |name|; names this item does not list are ignored.
+        protected final void putStringValue(String name, String value) {
+            final int index = indexOf(mStringPropertyNames, name);
+            if (index >= 0) mStringProperties[index] = value;
+        }
+
+        // Stores embedded item |value| for property |name|; names this item does not list are
+        // ignored.
+        protected final void putItemValue(String name, ThingItem value) {
+            final int index = indexOf(mItemPropertyNames, name);
+            if (index >= 0) mItemProperties[index] = value;
+        }
+
+        // Returns the non-empty string value stored for |name| if there is one; otherwise the
+        // string form of the embedded item stored for |name|; otherwise "".
+        protected final String getStringProperty(String name) {
+            final int stringIndex = indexOf(mStringPropertyNames, name);
+            if (stringIndex >= 0) {
+                final String stored = mStringProperties[stringIndex];
+                if (stored != null && !stored.isEmpty()) return stored;
+            }
+            final int itemIndex = indexOf(mItemPropertyNames, name);
+            if (itemIndex >= 0 && mItemProperties[itemIndex] != null) {
+                return mItemProperties[itemIndex].toStringProperty();
+            }
+            return "";
+        }
+
+        // Index of the first entry of |names| equal to |name|, or -1 if there is none.
+        private static int indexOf(String[] names, String name) {
+            for (int i = 0; i < names.length; i++) {
+                if (name.equals(names[i])) return i;
+            }
+            return -1;
+        }
+    }
+
+    // Every supported Schema.org item found while parsing, in the order the items were added.
+    private final List<ThingItem> mItemScopes;
+    // Element whose subtree is scanned for items; assigned exactly once, in the constructor.
+    private final Element mRoot;
+    // Maps each Type to the "itemtype" URL that identifies it.
+    private final Map<Type, String> mTypeUrls = new EnumMap<Type, String>(Type.class);
cjhopman 2014/04/17 17:35:26 This appears to only be used to lookup a type for
kuan 2014/04/18 00:19:03 Done.
+
+    /**
+     * Creates a parser for |root| and immediately extracts all supported Schema.org items from
+     * it.
+     */
+    public SchemaOrgParser(Element root) {
+        mItemScopes = new ArrayList<ThingItem>();
+        mRoot = root;
+
+        // Register the "itemtype" URL of every type; UNSUPPORTED is registered with no URL.
+        mTypeUrls.put(Type.IMAGE, "http://schema.org/ImageObject");
+        mTypeUrls.put(Type.ARTICLE, "http://schema.org/Article");
+        mTypeUrls.put(Type.PERSON, "http://schema.org/Person");
+        mTypeUrls.put(Type.ORGANIZATION, "http://schema.org/Organization");
+        mTypeUrls.put(Type.UNSUPPORTED, "");
+
+        // TODO(kuan): Parsing all tags is pretty expensive; should it be done lazily instead?
+        // With lazy parsing, every get* method would have to check the parsed state and, if
+        // necessary, parse before returning the requested properties.
+        parseRoot();
+    }
+
+    /**
+     * Returns the title: the "headline" property if findStringProperty() finds a non-empty one,
+     * otherwise whatever findStringProperty() yields for the "name" property.
+     */
+    @Override
+    public String getTitle() {
+        String title = findStringProperty(HEADLINE_PROP);
+        // Guard against a null result so that a page without any Schema.org item cannot cause
+        // a NullPointerException here.
+        if (title == null || title.isEmpty()) title = findStringProperty(NAME_PROP);
+        return title;
+    }
+
+    /** Returns the type name of the first parsed item, or null if no item was parsed. */
+    @Override
+    public String getType() {
+        // The first item is assumed to describe the page as a whole.
+        return mItemScopes.isEmpty() ? null : mItemScopes.get(0).mType.toString();
+    }
+
+    /** Returns the "url" property as found by findStringProperty(). */
+    @Override
+    public String getUrl() {
+        final String url = findStringProperty(URL_PROP);
+        return url;
+    }
+
+    /**
+     * Returns the images of all parsed items, or null if there are none. An image whose item is
+     * marked as representative of the page is placed first; the others keep parsing order.
+     */
+    @Override
+    public MarkupParser.Image[] getImages() {
+        final List<MarkupParser.Image> found = new ArrayList<MarkupParser.Image>();
+        for (ThingItem item : mItemScopes) {
+            final MarkupParser.Image image = item.getImage();
+            if (image == null) continue;
+            if (item.isImageRepresentativeOfPage()) {
+                // The dominant image goes to the front of the list.
+                found.add(0, image);
+            } else {
+                found.add(image);
+            }
+        }
+        // Covers both "no items parsed" and "no item had an image".
+        if (found.isEmpty()) return null;
+        return found.toArray(new MarkupParser.Image[found.size()]);
+    }
+
+    /** Returns the "description" property as found by findStringProperty(). */
+    @Override
+    public String getDescription() {
+        final String description = findStringProperty(DESCRIPTION_PROP);
+        return description;
+    }
+
+    /** Returns the "publisher" property as found by findStringProperty(). */
+    @Override
+    public String getPublisher() {
+        final String publisher = findStringProperty(PUBLISHER_PROP);
+        return publisher;
+    }
+
+    /**
+     * Returns "Copyright " followed by the copyright year and copyright holder (separated by a
+     * space) of the first item that has at least one of the two, or "" if no item has either.
+     */
+    @Override
+    public String getCopyright() {
+        for (ThingItem item : mItemScopes) {
+            final String yearAndHolder = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP),
+                    item.getStringProperty(COPYRIGHT_HOLDER_PROP));
+            if (!yearAndHolder.isEmpty()) return "Copyright " + yearAndHolder;
+        }
+        return "";
+    }
+
+    /** Returns the "author" property as found by findStringProperty(). */
+    @Override
+    public String getAuthor() {
+        final String author = findStringProperty(AUTHOR_PROP);
+        return author;
+    }
+
+    /** Returns the article of the first parsed item that provides one, or null if none does. */
+    @Override
+    public MarkupParser.Article getArticle() {
+        for (ThingItem item : mItemScopes) {
+            final MarkupParser.Article article = item.getArticle();
+            if (article != null) return article;
+        }
+        return null;
+    }
+
+ @Override
+ public boolean optOut() {
+ // This parser never reports an opt-out: the result is unconditionally false.
+ return false;
+ }
+
+ private void parseRoot() {
+ // The <html> element can also be the start of a Schema.org item, and hence needs to be
+ // parsed.
+
+ // Use a boolean array for |skipChildren|, instead of the boolean primitive, so that it
+ // can be updated in checkIfElementIsSupported().
+ boolean[] skipChildren = new boolean[] { false };
+ checkIfElementIsSupported(mRoot, skipChildren);
+ if (skipChildren[0]) return; // Skipping children of root means there's nothing more to do.
+ // Recursively parse each element that is an Schema.org type.
+ parse(mRoot, null);
+ }
+
+    // Visits the descendants of |root| in document order. Each element is checked for being the
+    // start of a Schema.org item and, if |currItem| is non-null, for carrying an "itemprop" that
+    // belongs to |currItem|.
+    // @param currItem the item whose properties are being collected, or null when |root| is not
+    //        inside any item.
+    private void parse(Element root, ThingItem currItem) {
cjhopman 2014/04/17 17:35:26 This function and its uses will be simplified if c
kuan 2014/04/18 00:19:03 Done.
+        NodeList<Element> allElems = root.getElementsByTagName("*");
+        for (int i = 0; i < allElems.getLength(); i++) {
cjhopman 2014/04/17 17:35:26 The way that the tree is parsed is hard for me to
kuan 2014/04/18 00:19:03 Done. what's the worst case behavior?
cjhopman 2014/04/18 01:17:01 The previous version had O(n^2) worst case complex
+            Element e = allElems.getItem(i);
+            // One-element array used as an out-parameter, so that checkIfElementIsSupported()
+            // can report whether |e| started an item.
+            boolean[] skipChildren = new boolean[] { false };
+
+            ThingItem newItem = checkIfElementIsSupported(e, skipChildren);
+
+            // If we're currently parsing a Schema.org type and |e| has an "itemprop" attribute
+            // that we care for, extract and store its value.
+            if (currItem != null) extractProperty(e, currItem, newItem);
+
+            // If |e| started an item, its descendants were either parsed by the recursive
+            // parse() call (supported type) or must be ignored (unsupported type). They follow
+            // |e| contiguously in document order, so jump over exactly that many entries.
+            // Looking for |e|'s next sibling instead would wrongly stop the whole walk when |e|
+            // is a last child, dropping elements that come after |e|'s parent.
+            if (skipChildren[0]) i += e.getElementsByTagName("*").getLength();
+        } // for all elements
+    }
+
+ // Checks whether |e| starts a Schema.org item, i.e. has both "itemscope" and "itemtype"
+ // attributes. If it does and the type is supported, creates the matching ThingItem subclass,
+ // appends it to |mItemScopes|, parses |e|'s descendants into it, and returns it.
+ // Returns null if |e| does not start an item or if its type is unsupported.
+ // @param skipChildren out-parameter: skipChildren[0] is set to true if |e| starts an item,
+ //        whether its type is supported or not; it is left untouched otherwise.
+ private ThingItem checkIfElementIsSupported(Element e, boolean[] skipChildren) {
+ // Without both "itemscope" and "itemtype" attributes, |e| is not the start of an item.
+ if (!e.hasAttribute("ITEMSCOPE") || !e.hasAttribute("ITEMTYPE")) return null;
cjhopman 2014/04/17 17:35:26 This should be a different function so you don't h
kuan 2014/04/18 00:19:03 Done.
+
+ skipChildren[0] = true; // Set for every item start, before the type is even looked at.
+ ThingItem newItem = null;
+ Type type = getType(e);
+ switch (type) {
+ case IMAGE:
+ newItem = new ImageItem(e);
+ break;
+ case ARTICLE:
+ newItem = new ArticleItem(e);
+ break;
+ case PERSON:
+ newItem = new PersonItem(e);
+ break;
+ case ORGANIZATION:
+ newItem = new OrganizationItem(e);
+ break;
+ case UNSUPPORTED:
+ default:
+ // Unsupported type: no item is created and |e|'s descendants are not parsed.
+ return null;
+ }
+
+ // The item is registered before its descendants are parsed, so an outer item precedes the
+ // items embedded in it in |mItemScopes|.
+ mItemScopes.add(newItem);
+ parse(e, newItem);
cjhopman 2014/04/17 17:35:26 This parse() call makes it harder for me to reason
kuan 2014/04/18 00:19:03 Done. i'm not sure if i code it the way u want re
cjhopman 2014/04/18 01:17:01 See the new comment in parse() for what I meant by
+ return newItem;
+ }
+
+ private Type getType(Element e) {
+ String type = e.getAttribute("ITEMTYPE");
+ Set<Map.Entry<Type, String>> typeUrls = mTypeUrls.entrySet();
+ Iterator<Map.Entry<Type, String>> iter = typeUrls.iterator();
cjhopman 2014/04/17 17:35:26 I think you can do: for (Map.Entry<Type, String>
kuan 2014/04/18 00:19:03 Done. since it's now a HashMap of <String, Type>,
+ while (iter.hasNext()) {
+ Map.Entry<Type, String> typeUrl = iter.next();
+ if (typeUrl.getValue().equalsIgnoreCase(type)) return typeUrl.getKey();
+ }
+ return Type.UNSUPPORTED;
+ }
+
+    // Stores the property (or properties) named by the "itemprop" attribute of |e| in |currItem|.
+    // The attribute may list several property names separated by whitespace; the same value is
+    // stored under each of them. Does nothing if |e| has no "itemprop" attribute.
+    // @param currItem ThingItem-extended item for the current Schema.org type being parsed.
+    // @param embeddedItem ThingItem-extended item for the Schema.org type created for |e|, i.e.
+    //        |e| had specified a Schema.org type; null if |e| is a plain element.
+    private void extractProperty(Element e, ThingItem currItem, ThingItem embeddedItem) {
cjhopman 2014/04/17 17:35:26 This function does a lot (and most of what it does
kuan 2014/04/18 00:19:03 Done.
+        // Property names in "itemprop" are case-sensitive.
+        String itemprop = e.getAttribute("ITEMPROP");
+        if (itemprop == null) return;
+        itemprop = itemprop.trim();
+        if (itemprop.isEmpty()) return;
+        // After trim() the string is non-empty and has no leading separator, so split() yields
+        // only non-empty names.
+        String[] names = itemprop.split("\\s+");
+
+        if (embeddedItem != null) { // This "itemprop" attribute is an embedded item.
+            for (String name : names) currItem.putItemValue(name, embeddedItem);
+            return;
+        }
+
+        // Extract the value from the tag: some tags carry it in an attribute.
+        String value = null;
+        if (e.hasTagName("A")) {
+            value = AnchorElement.as(e).getHref();
+        } else if (e.hasTagName("IMG")) {
+            value = ImageElement.as(e).getSrc();
+        } else if (e.hasTagName("META")) {
+            value = MetaElement.as(e).getContent();
+        } else if (e.hasTagName("TIME")) {
+            value = e.getAttribute("datetime");
+        }
+        // For all other tags, or when that attribute is missing or empty, use the inner text.
+        if (value == null || value.isEmpty()) value = e.getInnerText();
+        for (String name : names) currItem.putStringValue(name, value);
+    }
+
+    // Returns the value of property |name| from the first parsed item that has a non-empty value
+    // for it, or "" if there is no such item (including when no item was parsed at all).
+    // Never returns null, so callers may call isEmpty() on the result directly.
+    private String findStringProperty(String name) {
+        for (int i = 0; i < mItemScopes.size(); i++) {
+            String value = mItemScopes.get(i).getStringProperty(name);
+            if (!value.isEmpty()) return value;
+        }
+        return "";
+    }
+
+    // A Schema.org ImageObject item.
+    private static class ImageItem extends ThingItem {
+        // Properties stored as plain strings. Named differently from the base class's instance
+        // fields so that they do not shadow them.
+        private static final String[] STRING_PROPERTY_NAMES = {
+            NAME_PROP,
+            URL_PROP,
+            DESCRIPTION_PROP,
+            IMAGE_PROP,
+            HEADLINE_PROP,
+            PUBLISHER_PROP,
+            COPYRIGHT_HOLDER_PROP,
+            COPYRIGHT_YEAR_PROP,
+            CONTENT_URL_PROP,
+            ENCODING_FORMAT_PROP,
+            CAPTION_PROP,
+            REPRESENTATIVE_PROP,
+            WIDTH_PROP,
+            HEIGHT_PROP,
+        };
+
+        // Properties whose value may also be an embedded item.
+        private static final String[] ITEM_PROPERTY_NAMES = {
+            PUBLISHER_PROP,
+            COPYRIGHT_HOLDER_PROP,
+        };
+
+        protected ImageItem(Element elem) {
+            super(Type.IMAGE, elem, STRING_PROPERTY_NAMES, ITEM_PROPERTY_NAMES);
+        }
+
+        // Builds an image from this item's properties. The image URL is the value of
+        // CONTENT_URL_PROP or, if that is empty, the value of NAME_PROP. Never returns null.
+        @Override
+        protected MarkupParser.Image getImage() {
+            MarkupParser.Image image = new MarkupParser.Image();
+            String url = getStringProperty(CONTENT_URL_PROP);
+            image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP);
+            image.url = image.image;
+            image.type = getStringProperty(ENCODING_FORMAT_PROP);
+            image.caption = getStringProperty(CAPTION_PROP);
+            try {
+                image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10);
+            } catch (NumberFormatException ignored) {
+                // Width is missing or not a decimal integer: leave image.width as initialized.
+            }
+            try {
+                image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
+            } catch (NumberFormatException ignored) {
+                // Height is missing or not a decimal integer: leave image.height as initialized.
+            }
+            return image;
+        }
+    }
+
+    // A Schema.org Article item.
+    private static class ArticleItem extends ThingItem {
+        // Properties stored as plain strings.
+        private static final String[] STRING_PROPERTY_NAMES = {
+            NAME_PROP,
+            URL_PROP,
+            DESCRIPTION_PROP,
+            IMAGE_PROP,
+            HEADLINE_PROP,
+            PUBLISHER_PROP,
+            COPYRIGHT_HOLDER_PROP,
+            COPYRIGHT_YEAR_PROP,
+            DATE_MODIFIED_PROP,
+            DATE_PUBLISHED_PROP,
+            AUTHOR_PROP,
+            SECTION_PROP,
+        };
+
+        // Properties whose value may also be an embedded item.
+        private static final String[] ITEM_PROPERTY_NAMES = {
+            PUBLISHER_PROP,
+            COPYRIGHT_HOLDER_PROP,
+            AUTHOR_PROP,
+        };
+
+        protected ArticleItem(Element elem) {
+            super(Type.ARTICLE, elem, STRING_PROPERTY_NAMES, ITEM_PROPERTY_NAMES);
+        }
+
+        // Builds an article from the date, section and author properties. |authors| holds the
+        // single author value, or is an empty array when there is no author.
+        @Override
+        protected MarkupParser.Article getArticle() {
+            final MarkupParser.Article result = new MarkupParser.Article();
+            result.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);
+            result.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);
+            result.section = getStringProperty(SECTION_PROP);
+            final String author = getStringProperty(AUTHOR_PROP);
+            if (author.isEmpty()) {
+                result.authors = new String[0];
+            } else {
+                result.authors = new String[] { author };
+            }
+            return result;
+        }
+    }
+
+    // A Schema.org Person item.
+    private static class PersonItem extends ThingItem {
+        // Properties stored as plain strings.
+        private static final String[] STRING_PROPERTY_NAMES = {
+            NAME_PROP,
+            URL_PROP,
+            DESCRIPTION_PROP,
+            IMAGE_PROP,
+            FAMILY_NAME_PROP,
+            GIVEN_NAME_PROP,
+        };
+
+        // A Person lists no properties that can hold an embedded item.
+        protected PersonItem(Element elem) {
+            super(Type.PERSON, elem, STRING_PROPERTY_NAMES, new String[0]);
+        }
+
+        // Returns the value of NAME_PROP if it is non-empty; otherwise the values of
+        // GIVEN_NAME_PROP and FAMILY_NAME_PROP joined by a single space.
+        @Override
+        protected String toStringProperty() {
+            final String name = getStringProperty(NAME_PROP);
+            if (!name.isEmpty()) return name;
+            return concat(getStringProperty(GIVEN_NAME_PROP),
+                    getStringProperty(FAMILY_NAME_PROP));
+        }
+    }
+
+    // A Schema.org Organization item.
+    private static class OrganizationItem extends ThingItem {
+        // Properties stored as plain strings.
+        private static final String[] STRING_PROPERTY_NAMES = {
+            NAME_PROP,
+            URL_PROP,
+            DESCRIPTION_PROP,
+            IMAGE_PROP,
+            LEGAL_NAME_PROP,
+        };
+
+        // An Organization lists no properties that can hold an embedded item.
+        protected OrganizationItem(Element elem) {
+            super(Type.ORGANIZATION, elem, STRING_PROPERTY_NAMES, new String[0]);
+        }
+
+        // Returns the value of NAME_PROP if it is non-empty, otherwise that of LEGAL_NAME_PROP.
+        @Override
+        protected String toStringProperty() {
+            final String name = getStringProperty(NAME_PROP);
+            return name.isEmpty() ? getStringProperty(LEGAL_NAME_PROP) : name;
+        }
+    }
+
+    // Joins |first| and |second| with a single space, omitting the space when either is empty.
+    private static String concat(String first, String second) {
+        if (first.isEmpty() || second.isEmpty()) return first + second;
+        return first + " " + second;
+    }
+}

Powered by Google App Engine
This is Rietveld 408576698