Index: src/com/dom_distiller/client/SchemaOrgParser.java |
diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java |
new file mode 100644 |
index 0000000000000000000000000000000000000000..7d2937ee5067952d8985884412a431dd57af3c80 |
--- /dev/null |
+++ b/src/com/dom_distiller/client/SchemaOrgParser.java |
@@ -0,0 +1,499 @@ |
+// Copyright 2014 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+package com.dom_distiller.client; |
+ |
+import java.util.ArrayList; |
+import java.util.HashMap; |
+import java.util.List; |
+import java.util.Map; |
+ |
+import com.google.gwt.dom.client.AnchorElement; |
+import com.google.gwt.dom.client.Element; |
+import com.google.gwt.dom.client.ImageElement; |
+import com.google.gwt.dom.client.MetaElement; |
+import com.google.gwt.dom.client.NodeList; |
+ |
+/** |
+ * This class recognizes and parses Schema.org markup tags, and returns the properties that matter |
+ * to distilled content. |
+ * For the basic Schema.org Thing type, the basic properties are: name, url, description, image. |
+ * In addition, for each type that we support, we also parse more specific properties: |
+ * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published, |
+ * date modified, author, article section |
+ * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url, |
+ * encoding format, caption, representative of page, width, height |
+ * - Person: family name, given name |
+ * - Organization: legal name. |
+ * The value of a Schema.org property can itself be a Schema.org type, i.e. an embedded item. E.g., |
+ * the author or publisher of an article, or the publisher of an image, could be a Schema.org |
+ * Person or Organization; in fact, this is the reason we support the Person and Organization types. |
+ */ |
+public class SchemaOrgParser implements MarkupParser.Parser { |
+    // Properties listed for every supported type (the basic Thing properties). |
+    private static final String NAME_PROP = "name"; |
+    private static final String URL_PROP = "url"; |
+    private static final String DESCRIPTION_PROP = "description"; |
+    private static final String IMAGE_PROP = "image"; |
+ |
+    // Properties listed for both Article and ImageObject. |
+    private static final String HEADLINE_PROP = "headline"; |
+    private static final String PUBLISHER_PROP = "publisher"; |
+    private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; |
+    private static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; |
+ |
+    // Properties listed for ImageObject only. |
+    private static final String CONTENT_URL_PROP = "contentUrl"; |
+    private static final String ENCODING_FORMAT_PROP = "encodingFormat"; |
+    private static final String CAPTION_PROP = "caption"; |
+    private static final String REPRESENTATIVE_PROP = "representativeOfPage"; |
+    private static final String WIDTH_PROP = "width"; |
+    private static final String HEIGHT_PROP = "height"; |
+ |
+    // Properties listed for Article only. |
+    private static final String DATE_PUBLISHED_PROP = "datePublished"; |
+    private static final String DATE_MODIFIED_PROP = "dateModified"; |
+    private static final String AUTHOR_PROP = "author"; |
+    private static final String SECTION_PROP = "articleSection"; |
+ |
+    // Properties listed for Person only. |
+    private static final String FAMILY_NAME_PROP = "familyName"; |
+    private static final String GIVEN_NAME_PROP = "givenName"; |
+ |
+    // Property listed for Organization only. |
+    private static final String LEGAL_NAME_PROP = "legalName"; |
+ |
+    /** |
+     * The item types this parser distinguishes.  All these types are extended from Thing, |
+     * directly or indirectly. |
+     */ |
+    private enum Type { |
+        IMAGE, |
+        ARTICLE, |
+        PERSON, |
+        ORGANIZATION, |
+        UNSUPPORTED, |
+    } |
+ |
+ private static class ThingItem { |
+ protected final Type mType; |
cjhopman
2014/04/18 01:17:01
nit: I think all the fields here could be private
kuan
2014/04/18 23:34:38
Done.
|
+ protected final Element mRoot; |
cjhopman
2014/04/18 01:17:01
mRoot looks unused.
kuan
2014/04/18 23:34:38
Done.
|
+ protected final String[] mStringPropertyNames; |
+ protected final String[] mItemPropertyNames; |
+ protected final String[] mStringProperties; |
+ protected final ThingItem[] mItemProperties; |
+ |
+        /** |
+         * Creates an item of |type| for the element |root|.  |stringPropertyNames| lists the |
+         * properties that can hold a text value and |itemPropertyNames| the properties that can |
+         * hold an embedded item; one value slot is allocated per name, initially unset. |
+         */ |
+        protected ThingItem(Type type, Element root, |
+                String[] stringPropertyNames, String[] itemPropertyNames) { |
+            mType = type; |
+            mRoot = root; |
+            // Each value array is parallel to its array of names. |
+            mStringPropertyNames = stringPropertyNames; |
+            mStringProperties = new String[stringPropertyNames.length]; |
+            mItemPropertyNames = itemPropertyNames; |
+            mItemProperties = new ThingItem[itemPropertyNames.length]; |
+        } |
+ |
+ protected String toStringProperty() { |
+ return ""; |
+ } |
+ |
+ protected MarkupParser.Image getImage() { |
+ // Use value of IMAGE_PROP to create a MarkupParser.Image. |
+ String imageUrl = getStringProperty(IMAGE_PROP); |
+ if (imageUrl.isEmpty()) return null; |
+ MarkupParser.Image image = new MarkupParser.Image(); |
+ image.image = imageUrl; |
+ image.url = imageUrl; |
+ return image; |
+ } |
+ |
+ protected MarkupParser.Article getArticle() { |
+ return null; |
+ } |
+ |
+        /** |
+         * Returns true if the value of REPRESENTATIVE_PROP is "true", compared case-insensitively. |
+         */ |
+        protected final boolean isImageRepresentativeOfPage() { |
+            return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true"); |
+        } |
+ |
+ protected final void putStringValue(String name, String value) { |
+ for (int i = 0; i < mStringPropertyNames.length; i++) { |
+ if (name.equals(mStringPropertyNames[i])) { |
+ mStringProperties[i] = value; |
+ break; |
+ } |
+ } |
+ } |
+ |
+ protected final void putItemValue(String name, ThingItem value) { |
+ for (int i = 0; i < mItemPropertyNames.length; i++) { |
+ if (name.equals(mItemPropertyNames[i])) { |
+ mItemProperties[i] = value; |
+ break; |
+ } |
+ } |
+ } |
+ |
+        /** |
+         * Returns the value of property |name| as text.  A non-empty text value takes precedence; |
+         * otherwise, if an embedded item is stored under |name|, that item's toStringProperty() |
+         * is returned.  Returns an empty string if neither is available. |
+         */ |
+        protected final String getStringProperty(String name) { |
+            // Look for a non-empty text value first. |
+            for (int s = 0; s < mStringPropertyNames.length; s++) { |
+                if (!name.equals(mStringPropertyNames[s])) continue; |
+                String text = mStringProperties[s]; |
+                if (text != null && !text.isEmpty()) return text; |
+                break; |
+            } |
+            // No usable text value, so fall back to an embedded item with the same name. |
+            for (int k = 0; k < mItemPropertyNames.length; k++) { |
+                if (name.equals(mItemPropertyNames[k])) { |
+                    ThingItem embedded = mItemProperties[k]; |
+                    return embedded != null ? embedded.toStringProperty() : ""; |
+                } |
+            } |
+            return ""; |
+        } |
+ } |
+ |
+ private final List<ThingItem> mItemScopes; |
cjhopman
2014/04/18 01:17:01
this can be static and initialized in a static ini
kuan
2014/04/18 23:34:38
different instances of SchemaOrgParser have differ
|
+ private Element mRoot = null; |
cjhopman
2014/04/18 01:17:01
is this used?
kuan
2014/04/18 23:34:38
Done.
|
+ private final Map<String, Type> mTypeUrls = new HashMap<String, Type>(); |
+ |
+    /** |
+     * Creates the object that extracts and verifies Schema.org markup tags from |root|.  The |
+     * markup is parsed right away, as part of construction. |
+     */ |
+    public SchemaOrgParser(Element root) { |
+        mItemScopes = new ArrayList<ThingItem>(); |
+        mRoot = root; |
+ |
+        // The "itemtype" URLs that we recognize, and the type each one maps to. |
+        mTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE); |
+        mTypeUrls.put("http://schema.org/Article", Type.ARTICLE); |
+        mTypeUrls.put("http://schema.org/Person", Type.PERSON); |
+        mTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION); |
+        mTypeUrls.put("", Type.UNSUPPORTED); |
+ |
+        // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily? |
+        // If parse lazily, all get* methods will need to check for parsed state and, if necessary, |
+        // parse before returning the requested properties. |
+        // Note that the <html> element can also be the start of a Schema.org item, and hence needs |
+        // to be parsed. |
+        parse(root, null); |
+    } |
+ |
+    /** Returns the first headline found, or else the first name found, or else "". */ |
+    @Override |
+    public String getTitle() { |
+        final String headline = findStringProperty(HEADLINE_PROP); |
+        return headline.isEmpty() ? findStringProperty(NAME_PROP) : headline; |
+    } |
+ |
+ @Override |
+ public String getType() { |
+ if (mItemScopes.isEmpty()) return null; |
+ // Assume the type of the first item is the page type. |
+ return mItemScopes.get(0).mType.toString(); |
+ } |
+ |
+    /** Returns the first non-empty URL_PROP value among the parsed items, or "". */ |
+    @Override |
+    public String getUrl() { |
+        final String url = findStringProperty(URL_PROP); |
+        return url; |
+    } |
+ |
+ @Override |
+ public MarkupParser.Image[] getImages() { |
+ if (mItemScopes.isEmpty()) return null; |
+ List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>(); |
+ for (int i = 0; i < mItemScopes.size(); i++) { |
+ ThingItem item = mItemScopes.get(i); |
+ MarkupParser.Image image = item.getImage(); |
+ if (image != null) { |
+ if (item.isImageRepresentativeOfPage()) { |
+ // Image should be the dominant, i.e. first, one. |
+ images.add(0, image); |
+ } else { |
+ images.add(image); |
+ } |
+ } |
+ } |
+ if (images.isEmpty()) return null; |
+ return images.toArray(new MarkupParser.Image[images.size()]); |
+ } |
+ |
+    /** Returns the first non-empty DESCRIPTION_PROP value among the parsed items, or "". */ |
+    @Override |
+    public String getDescription() { |
+        final String description = findStringProperty(DESCRIPTION_PROP); |
+        return description; |
+    } |
+ |
+    /** Returns the first non-empty PUBLISHER_PROP value among the parsed items, or "". */ |
+    @Override |
+    public String getPublisher() { |
+        final String publisher = findStringProperty(PUBLISHER_PROP); |
+        return publisher; |
+    } |
+ |
+    /** |
+     * Returns "Copyright " followed by the copyright year and copyright holder, delimited by a |
+     * whitespace, of the first item that has at least one of these properties.  Returns an empty |
+     * string if no item has either. |
+     */ |
+    @Override |
+    public String getCopyright() { |
+        for (ThingItem item : mItemScopes) { |
+            String notice = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP), |
+                    item.getStringProperty(COPYRIGHT_HOLDER_PROP)); |
+            if (!notice.isEmpty()) return "Copyright " + notice; |
+        } |
+        return ""; |
+    } |
+ |
+    /** Returns the first non-empty AUTHOR_PROP value among the parsed items, or "". */ |
+    @Override |
+    public String getAuthor() { |
+        final String author = findStringProperty(AUTHOR_PROP); |
+        return author; |
+    } |
+ |
+    /** |
+     * Returns the article information of the first parsed item that provides any, or null if no |
+     * item does. |
+     */ |
+    @Override |
+    public MarkupParser.Article getArticle() { |
+        for (ThingItem item : mItemScopes) { |
+            MarkupParser.Article found = item.getArticle(); |
+            if (found != null) return found; |
+        } |
+        return null; |
+    } |
+ |
+    /** Always returns false. */ |
+    @Override |
+    public boolean optOut() { |
+        final boolean optedOut = false; |
+        return optedOut; |
+    } |
+ |
+    /** |
+     * Parses |e| and all of its descendant elements: creates an item for every element that |
+     * starts a supported Schema.org type, and records each "itemprop" value on the item whose |
+     * scope it lies in.  Only the subtree rooted at |e| is visited; siblings of |e| are not. |
+     * |
+     * |parentItem| is the item whose scope |e| lies in, or null if |e| is not inside any item |
+     * that we keep track of. |
+     */ |
+    private void parse(Element e, ThingItem parentItem) { |
+        ThingItem newItem = null; |
+        boolean isItemScope = isItemscope(e); |
+        if (isItemScope) { |
+            // The "itemscope" and "itemtype" attributes of |e| indicate the start of an item. |
+            // If the type is supported, create the corresponding extended-ThingItem; the |
+            // children of |e| are parsed further below with the new item as their parent. |
+            newItem = createItemForElement(e); |
cjhopman
2014/04/18 01:17:01
It looks like we might handle nesting of elements
kuan
2014/04/18 23:34:38
Done. before, i coded it based on the assumption,
|
+            if (newItem != null) mItemScopes.add(newItem); |
+        } |
+ |
+        // A non-null |parentItem| means we're currently parsing the elements for a Schema.org type. |
+        // Check if the current element has a "itemprop" attribute. |
+        if (parentItem != null) { |
+            String propertyName = getItemprop(e); |
+            if (!propertyName.isEmpty()) { |
+                // If a new item was created above, the property value of this "itemprop" attribute |
+                // is an embedded item, so add it to the parent item. |
+                if (newItem != null) { |
+                    parentItem.putItemValue(propertyName, newItem); |
+                } else { |
+                    // Otherwise, extract the property value from the tag itself, and add it to the |
+                    // parent item. |
+                    parentItem.putStringValue(propertyName, getPropertyValue(e)); |
+                } |
+            } |
+        } |
+ |
cjhopman
2014/04/18 01:17:01
Clarification of what I meant with "both cases wou
kuan
2014/04/18 23:34:38
Done. fyi, can't use forEach for NodeList.
|
+        // Decide which item the children of |e| belong to: |
+        // - if |e| is an itemscope of a supported type, they belong to the new item; |
+        // - if |e| is an itemscope of an unsupported type, they belong to an item that we don't |
+        //   keep track of, so they get no parent (|newItem| is null here); they are still |
+        //   visited so that supported items nested inside them are found; |
+        // - otherwise, they are still in the scope of |parentItem|. |
+        ThingItem childParent = isItemScope ? newItem : parentItem; |
+ |
+        // Loop over the children instead of recursing from sibling to sibling, so that the |
+        // recursion depth is bounded by the depth of the tree and not by the number of elements. |
+        for (Element child = e.getFirstChildElement(); child != null; |
+                child = child.getNextSiblingElement()) { |
+            parse(child, childParent); |
+        } |
+    } |
+ |
+    /** |
+     * Returns the type that the "ITEMTYPE" attribute of |e| maps to in |mTypeUrls|, or |
+     * Type.UNSUPPORTED if the attribute value is not a key of that map. |
+     */ |
+    private Type getItemType(Element e) { |
+        Type known = mTypeUrls.get(e.getAttribute("ITEMTYPE")); |
+        return known != null ? known : Type.UNSUPPORTED; |
+    } |
+ |
+    /** |
+     * Creates the ThingItem subclass that matches the item type of |e|.  Returns null if the type |
+     * is unsupported. |
+     */ |
+    private ThingItem createItemForElement(Element e) { |
+        switch (getItemType(e)) { |
+            case IMAGE: |
+                return new ImageItem(e); |
+            case ARTICLE: |
+                return new ArticleItem(e); |
+            case PERSON: |
+                return new PersonItem(e); |
+            case ORGANIZATION: |
+                return new OrganizationItem(e); |
+            case UNSUPPORTED: |
+            default: |
+                return null; |
+        } |
+    } |
+ |
+    // Returns the first non-empty value of property |name| among the parsed items, searched in |
+    // the order the items were added, or an empty string if no item has one. |
+    private String findStringProperty(String name) { |
+        for (ThingItem item : mItemScopes) { |
+            String candidate = item.getStringProperty(name); |
+            if (!candidate.isEmpty()) return candidate; |
+        } |
+        return ""; |
+    } |
+ |
+    /** |
+     * The item for the ImageObject type.  Its publisher and copyright holder can also be given as |
+     * embedded items. |
+     */ |
+    private static class ImageItem extends ThingItem { |
+        // These are named differently from the inherited instance fields |mStringPropertyNames| |
+        // and |mItemPropertyNames|, so that they don't hide them inside this class. |
+        private static final String[] STRING_PROPERTY_NAMES = { |
+            NAME_PROP, |
+            URL_PROP, |
+            DESCRIPTION_PROP, |
+            IMAGE_PROP, |
+            HEADLINE_PROP, |
+            PUBLISHER_PROP, |
+            COPYRIGHT_HOLDER_PROP, |
+            COPYRIGHT_YEAR_PROP, |
+            CONTENT_URL_PROP, |
+            ENCODING_FORMAT_PROP, |
+            CAPTION_PROP, |
+            REPRESENTATIVE_PROP, |
+            WIDTH_PROP, |
+            HEIGHT_PROP, |
+        }; |
+ |
+        private static final String[] ITEM_PROPERTY_NAMES = { |
+            PUBLISHER_PROP, |
+            COPYRIGHT_HOLDER_PROP, |
+        }; |
+ |
+        protected ImageItem(Element elem) { |
+            super(Type.IMAGE, elem, STRING_PROPERTY_NAMES, ITEM_PROPERTY_NAMES); |
+        } |
+ |
+        /** |
+         * Builds a MarkupParser.Image from this item's properties.  The image location is the |
+         * value of CONTENT_URL_PROP or, if that is empty, the value of NAME_PROP.  The width and |
+         * height are only set if their property values are valid decimal integers.  Never |
+         * returns null. |
+         */ |
+        @Override |
+        protected MarkupParser.Image getImage() { |
+            MarkupParser.Image image = new MarkupParser.Image(); |
+            String url = getStringProperty(CONTENT_URL_PROP); |
+            image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP); |
+            image.url = image.image; |
+            image.type = getStringProperty(ENCODING_FORMAT_PROP); |
+            image.caption = getStringProperty(CAPTION_PROP); |
+            try { |
+                image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10); |
+            } catch (NumberFormatException ignored) { |
+                // The width is missing or not a number; leave the field at its initial value. |
+            } |
+            try { |
+                image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10); |
+            } catch (NumberFormatException ignored) { |
+                // The height is missing or not a number; leave the field at its initial value. |
+            } |
+            return image; |
+        } |
+    } |
+ |
+    /** |
+     * The item for the Article type.  Its publisher, copyright holder and author can also be |
+     * given as embedded items. |
+     */ |
+    private static class ArticleItem extends ThingItem { |
+        private static final String[] STRING_PROPERTY_NAMES = { |
+            NAME_PROP, |
+            URL_PROP, |
+            DESCRIPTION_PROP, |
+            IMAGE_PROP, |
+            HEADLINE_PROP, |
+            PUBLISHER_PROP, |
+            COPYRIGHT_HOLDER_PROP, |
+            COPYRIGHT_YEAR_PROP, |
+            DATE_MODIFIED_PROP, |
+            DATE_PUBLISHED_PROP, |
+            AUTHOR_PROP, |
+            SECTION_PROP, |
+        }; |
+ |
+        private static final String[] ITEM_PROPERTY_NAMES = { |
+            PUBLISHER_PROP, |
+            COPYRIGHT_HOLDER_PROP, |
+            AUTHOR_PROP, |
+        }; |
+ |
+        protected ArticleItem(Element elem) { |
+            super(Type.ARTICLE, elem, STRING_PROPERTY_NAMES, ITEM_PROPERTY_NAMES); |
+        } |
+ |
+        /** |
+         * Builds a MarkupParser.Article from the published time, modified time, section and |
+         * author of this item.  |authors| has one entry if there is an author, none otherwise. |
+         */ |
+        @Override |
+        protected MarkupParser.Article getArticle() { |
+            final String author = getStringProperty(AUTHOR_PROP); |
+            final String[] authors; |
+            if (author.isEmpty()) { |
+                authors = new String[0]; |
+            } else { |
+                authors = new String[] { author }; |
+            } |
+ |
+            MarkupParser.Article result = new MarkupParser.Article(); |
+            result.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); |
+            result.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); |
+            result.section = getStringProperty(SECTION_PROP); |
+            result.authors = authors; |
+            return result; |
+        } |
+    } |
+ |
+    /** The item for the Person type.  None of its properties can be an embedded item. */ |
+    private static class PersonItem extends ThingItem { |
+        private static final String[] STRING_PROPERTY_NAMES = { |
+            NAME_PROP, |
+            URL_PROP, |
+            DESCRIPTION_PROP, |
+            IMAGE_PROP, |
+            FAMILY_NAME_PROP, |
+            GIVEN_NAME_PROP, |
+        }; |
+ |
+        protected PersonItem(Element elem) { |
+            super(Type.PERSON, elem, STRING_PROPERTY_NAMES, new String[0]); |
+        } |
+ |
+        // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and |
+        // FAMILY_NAME_PROP delimited by a whitespace. |
+        @Override |
+        protected String toStringProperty() { |
+            final String name = getStringProperty(NAME_PROP); |
+            if (!name.isEmpty()) return name; |
+            return concat(getStringProperty(GIVEN_NAME_PROP), |
+                    getStringProperty(FAMILY_NAME_PROP)); |
+        } |
+    } |
+ |
+    /** The item for the Organization type.  None of its properties can be an embedded item. */ |
+    private static class OrganizationItem extends ThingItem { |
+        private static final String[] STRING_PROPERTY_NAMES = { |
+            NAME_PROP, |
+            URL_PROP, |
+            DESCRIPTION_PROP, |
+            IMAGE_PROP, |
+            LEGAL_NAME_PROP, |
+        }; |
+ |
+        protected OrganizationItem(Element elem) { |
+            super(Type.ORGANIZATION, elem, STRING_PROPERTY_NAMES, new String[0]); |
+        } |
+ |
+        // Returns the value of NAME_PROP or, if that is empty, the value of LEGAL_NAME_PROP. |
+        @Override |
+        protected String toStringProperty() { |
+            final String name = getStringProperty(NAME_PROP); |
+            return name.isEmpty() ? getStringProperty(LEGAL_NAME_PROP) : name; |
+        } |
+    } |
+ |
+    // Returns true if |e| has both the "ITEMSCOPE" and the "ITEMTYPE" attributes. |
+    private static boolean isItemscope(Element e) { |
+        if (!e.hasAttribute("ITEMSCOPE")) return false; |
+        return e.hasAttribute("ITEMTYPE"); |
+    } |
+ |
+ private static String getItemprop(Element e) { |
+ // "itemprop" attribute is case-sensitive. |
+ return e.getAttribute("ITEMPROP"); |
+ } |
+ |
+ // Extracts the property value from |e|. For some tags, the value is a specific attribute, |
+ // while for others, it's the text between the start and end tags. |
+ private static String getPropertyValue(Element e) { |
+ String propertyValue = null; |
+ if (e.hasTagName("A")) { |
+ propertyValue = AnchorElement.as(e).getHref(); |
+ } else if (e.hasTagName("IMG")) { |
+ propertyValue = ImageElement.as(e).getSrc(); |
+ } else if (e.hasTagName("META")) { |
+ propertyValue = MetaElement.as(e).getContent(); |
+ } else if (e.hasTagName("TIME")) { |
+ propertyValue = e.getAttribute("datetime"); |
+ } |
+ if (propertyValue == null || propertyValue.isEmpty()) propertyValue = e.getInnerText(); |
+ return propertyValue; |
+ } |
+ |
+ private static String concat(String first, String second) { |
+ String concat = first; |
+ if (!concat.isEmpty() && !second.isEmpty()) concat += " "; |
+ concat += second; |
+ return concat; |
+ } |
+} |