Chromium Code Reviews| Index: src/com/dom_distiller/client/SchemaOrgParser.java |
| diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..cc00c4823b8282187ea2f9b170b7994e8e3f8f82 |
| --- /dev/null |
| +++ b/src/com/dom_distiller/client/SchemaOrgParser.java |
| @@ -0,0 +1,517 @@ |
| +// Copyright 2014 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +package com.dom_distiller.client; |
| + |
| +import java.util.ArrayList; |
| +import java.util.EnumMap; |
| +import java.util.Iterator; |
| +import java.util.List; |
| +import java.util.Map; |
| +import java.util.Set; |
| + |
| +import com.google.gwt.dom.client.AnchorElement; |
| +import com.google.gwt.dom.client.Element; |
| +import com.google.gwt.dom.client.ImageElement; |
| +import com.google.gwt.dom.client.MetaElement; |
| +import com.google.gwt.dom.client.NodeList; |
| + |
| +/** |
| + * This class recognizes and parses Schema.org markup tags, and returns the properties that matter |
| + * to distilled content. |
| + * For the basic Schema.org Thing type, the basic properties are: name, url, description, image. |
| + * In addition, for each type that we support, we also parse more specific properties: |
| + * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published, |
| + * date modified, author, article section |
| + * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url, |
| + * encoding format, caption, representative of page, width, height |
| + * - Person: family name, given name |
| + * - Organization: legal name. |
| + * The value of a Schema.org property can be a Schema.org type, i.e. embedded. E.g., the author or |
| + * publisher of an article or publisher of an image could be a Schema.org Person or Organization type; |
| + * in fact, this is the reason we support Person and Organization types. |
| + */ |
| +public class SchemaOrgParser implements MarkupParser.Parser { |
| + private static final String NAME_PROP = "name"; |
| + private static final String URL_PROP = "url"; |
| + private static final String DESCRIPTION_PROP = "description"; |
| + private static final String IMAGE_PROP = "image"; |
| + private static final String HEADLINE_PROP = "headline"; |
| + private static final String PUBLISHER_PROP = "publisher"; |
| + private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; |
| + private static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; |
| + private static final String CONTENT_URL_PROP = "contentUrl"; |
| + private static final String ENCODING_FORMAT_PROP = "encodingFormat"; |
| + private static final String CAPTION_PROP = "caption"; |
| + private static final String REPRESENTATIVE_PROP = "representativeOfPage"; |
| + private static final String WIDTH_PROP = "width"; |
| + private static final String HEIGHT_PROP = "height"; |
| + private static final String DATE_PUBLISHED_PROP = "datePublished"; |
| + private static final String DATE_MODIFIED_PROP = "dateModified"; |
| + private static final String AUTHOR_PROP = "author"; |
| + private static final String SECTION_PROP = "articleSection"; |
| + private static final String FAMILY_NAME_PROP = "familyName"; |
| + private static final String GIVEN_NAME_PROP = "givenName"; |
| + private static final String LEGAL_NAME_PROP = "legalName"; |
| + |
| + private enum Type { // All these types are extended from Thing, directly or indirectly. |
| + IMAGE, |
| + ARTICLE, |
| + PERSON, |
| + ORGANIZATION, |
| + UNSUPPORTED, |
| + } |
| + |
| + private static class ThingItem { |
| + protected final Type mType; |
| + protected final Element mRoot; |
| + protected final String[] mStringPropertyNames; |
| + protected final String[] mItemPropertyNames; |
| + protected final String[] mStringProperties; |
| + protected final ThingItem[] mItemProperties; |
| + |
| + protected ThingItem(Type type, Element root, |
| + String[] stringPropertyNames, String[] itemPropertyNames) { |
| + mType = type; |
| + mRoot = root; |
| + mStringPropertyNames = stringPropertyNames; |
| + mItemPropertyNames = itemPropertyNames; |
| + mStringProperties = new String[mStringPropertyNames.length]; |
| + mItemProperties = new ThingItem[mItemPropertyNames.length]; |
| + } |
| + |
| + protected String toStringProperty() { |
| + return ""; |
| + } |
| + |
| + protected MarkupParser.Image getImage() { |
| + // Use value of IMAGE_PROP to create a MarkupParser.Image. |
| + String imageUrl = getStringProperty(IMAGE_PROP); |
| + if (imageUrl.isEmpty()) return null; |
| + MarkupParser.Image image = new MarkupParser.Image(); |
| + image.image = imageUrl; |
| + image.url = imageUrl; |
| + return image; |
| + } |
| + |
| + protected MarkupParser.Article getArticle() { |
| + return null; |
| + } |
| + |
| + protected final boolean isImageRepresentativeOfPage() { |
| + String value = getStringProperty(REPRESENTATIVE_PROP); |
| + return value.equalsIgnoreCase("true"); |
| + } |
| + |
| + protected final void putStringValue(String name, String value) { |
| + for (int i = 0; i < mStringPropertyNames.length; i++) { |
| + if (name.equals(mStringPropertyNames[i])) { |
| + mStringProperties[i] = value; |
| + break; |
| + } |
| + } |
| + } |
| + |
| + protected final void putItemValue(String name, ThingItem value) { |
| + for (int i = 0; i < mItemPropertyNames.length; i++) { |
| + if (name.equals(mItemPropertyNames[i])) { |
| + mItemProperties[i] = value; |
| + break; |
| + } |
| + } |
| + } |
| + |
| + protected final String getStringProperty(String name) { |
| + // Check if property exists in |mStringProperties|. |
| + for (int i = 0; i < mStringPropertyNames.length; i++) { |
| + if (name.equals(mStringPropertyNames[i])) { |
| + String value = mStringProperties[i]; |
| + if (value != null && !value.isEmpty()) return value; |
| + break; |
| + } |
| + } |
| + // Otherwise, repeat for |mItemProperties|. |
| + for (int i = 0; i < mItemPropertyNames.length; i++) { |
| + if (!name.equals(mItemPropertyNames[i])) continue; |
| + if (mItemProperties[i] != null) return mItemProperties[i].toStringProperty(); |
| + break; |
| + } |
| + return ""; |
| + } |
| + } |
| + |
| + private final List<ThingItem> mItemScopes; |
| + private Element mRoot = null; |
| + private final Map<Type, String> mTypeUrls = new EnumMap<Type, String>(Type.class); |
|
cjhopman
2014/04/17 17:35:26
This appears to only be used to lookup a type for
kuan
2014/04/18 00:19:03
Done.
|
| + |
| + /** |
| + * The object that extracts and verifies Schema.org markup tags from |root|. |
| + */ |
| + public SchemaOrgParser(Element root) { |
| + mRoot = root; |
| + mItemScopes = new ArrayList<ThingItem>(); |
| + |
| + mTypeUrls.put(Type.IMAGE, "http://schema.org/ImageObject"); |
| + mTypeUrls.put(Type.ARTICLE, "http://schema.org/Article"); |
| + mTypeUrls.put(Type.PERSON, "http://schema.org/Person"); |
| + mTypeUrls.put(Type.ORGANIZATION, "http://schema.org/Organization"); |
| + mTypeUrls.put(Type.UNSUPPORTED, ""); |
| + |
| + // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily? |
| + // If parse lazily, all get* methods will need to check for parsed state and, if necessary, |
| + // parse before returning the requested properties. |
| + parseRoot(); |
| + } |
| + |
| + @Override |
| + public String getTitle() { |
| + String title = findStringProperty(HEADLINE_PROP); |
| + if (title.isEmpty()) title = findStringProperty(NAME_PROP); |
| + return title; |
| + } |
| + |
| + @Override |
| + public String getType() { |
| + if (mItemScopes.isEmpty()) return null; |
| + // Assume the type of the first item is the page type. |
| + return mItemScopes.get(0).mType.toString(); |
| + } |
| + |
| + @Override |
| + public String getUrl() { |
| + return findStringProperty(URL_PROP); |
| + } |
| + |
| + @Override |
| + public MarkupParser.Image[] getImages() { |
| + if (mItemScopes.isEmpty()) return null; |
| + List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>(); |
| + for (int i = 0; i < mItemScopes.size(); i++) { |
| + ThingItem item = mItemScopes.get(i); |
| + MarkupParser.Image image = item.getImage(); |
| + if (image != null) { |
| + if (item.isImageRepresentativeOfPage()) { |
| + // Image should be the dominant, i.e. first, one. |
| + images.add(0, image); |
| + } else { |
| + images.add(image); |
| + } |
| + } |
| + } |
| + if (images.isEmpty()) return null; |
| + return images.toArray(new MarkupParser.Image[images.size()]); |
| + } |
| + |
| + @Override |
| + public String getDescription() { |
| + return findStringProperty(DESCRIPTION_PROP); |
| + } |
| + |
| + @Override |
| + public String getPublisher() { |
| + return findStringProperty(PUBLISHER_PROP); |
| + } |
| + |
| + @Override |
| + public String getCopyright() { |
| + if (mItemScopes.isEmpty()) return ""; |
| + // Returns a concatenated string of copyright year and copyright holder of the first item |
| + // that has these properties, delimited by a whitespace. |
| + String copyright = ""; |
| + for (int i = 0; i < mItemScopes.size() && copyright.isEmpty(); i++) { |
| + ThingItem item = mItemScopes.get(i); |
| + copyright = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP), |
| + item.getStringProperty(COPYRIGHT_HOLDER_PROP)); |
| + } |
| + return copyright.isEmpty() ? copyright : "Copyright " + copyright; |
| + } |
| + |
| + @Override |
| + public String getAuthor() { |
| + return findStringProperty(AUTHOR_PROP); |
| + } |
| + |
| + @Override |
| + public MarkupParser.Article getArticle() { |
| + if (mItemScopes.isEmpty()) return null; |
| + // Returns the first article. |
| + MarkupParser.Article article = null; |
| + for (int i = 0; i < mItemScopes.size() && article == null; i++) { |
| + article = mItemScopes.get(i).getArticle(); |
| + } |
| + return article; |
| + } |
| + |
| + @Override |
| + public boolean optOut() { |
| + return false; |
| + } |
| + |
| + private void parseRoot() { |
| + // The <html> element can also be the start of a Schema.org item, and hence needs to be |
| + // parsed. |
| + |
| + // Use a boolean array for |skipChildren|, instead of the boolean primitive, so that it |
| + // can be updated in checkIfElementIsSupported(). |
| + boolean[] skipChildren = new boolean[] { false }; |
| + checkIfElementIsSupported(mRoot, skipChildren); |
| + if (skipChildren[0]) return; // Skipping children of root means there's nothing more to do. |
| + // Recursively parse each element that is a Schema.org type. |
| + parse(mRoot, null); |
| + } |
| + |
| + // Walks all descendants of |root| in document order. Each element is passed to |
| + // checkIfElementIsSupported(), which recursively parses it if it starts a supported |
| + // Schema.org item. If |currItem| is non-null, the "itemprop" value of each visited element is |
| + // stored into it. |
| + // @param root element whose descendants are parsed. |
| + // @param currItem item for the Schema.org type currently being parsed, or null if none. |
| + private void parse(Element root, ThingItem currItem) { |

cjhopman
2014/04/17 17:35:26
This function and its uses will be simplified if c
kuan
2014/04/18 00:19:03
Done.
|
| + NodeList<Element> allElems = root.getElementsByTagName("*"); |
| + for (int i = 0; i < allElems.getLength(); i++) { |

cjhopman
2014/04/17 17:35:26
The way that the tree is parsed is hard for me to
kuan
2014/04/18 00:19:03
Done. what's the worst case behavior?
cjhopman
2014/04/18 01:17:01
The previous version had O(n^2) worst case complex
|
| + Element e = allElems.getItem(i); |
| + // See comments in parseRoot() for using boolean array for |skipChildren|. |
| + boolean[] skipChildren = new boolean[] { false }; |
| + |
| + ThingItem newItem = checkIfElementIsSupported(e, skipChildren); |
| + |
| + // If we're currently parsing a Schema.org type, if it has an "itemprop" attribute that |
| + // we care for, extract and store its value. |
| + if (currItem != null) extractProperty(e, currItem, newItem); |
| + |
| + // If current element has "itemscope" and "itemtype" attributes and is a supported type, |
| + // its children would have been parsed by |newItem| via the recursive parse() call. |
| + // If it's an unsupported type, its children should be ignored. In both cases, we |
| + // should skip all descendants of the current element. |allElems| is in document order, |
| + // so these descendants immediately follow the current element; advance past them to |
| + // the first element outside its subtree, which need not be its next sibling: when the |
| + // current element is a last child, parsing must resume at a later element of an ancestor. |
| + if (skipChildren[0]) { |
| + int next = i + 1; |
| + while (next < allElems.getLength() && e.isOrHasChild(allElems.getItem(next))) next++; |
| + i = next - 1; // Minus one because it'll be incremented in the outer for loop. |
| + } |
| + } // for all elements |
| + } |
| + |
| + // If |e| has "itemscope" and "itemtype" attributes and a supported type, a ThingItem-extended |
| + // object is created based on the type. |
| + // Returns this object after it has recursively parsed |e|'s children, returns null otherwise. |
| + // @param skipChildren[0] is set to true if |e| specifies a Schema.org type, supported or not. |
| + private ThingItem checkIfElementIsSupported(Element e, boolean[] skipChildren) { |
| + // If element has "itemscope" and "itemtype" attributes, it's the start of an item. |
| + // If the type is what we care for, instantiate the corresponding extended ThingItem and |
| + // recursively parse it. |
| + if (!e.hasAttribute("ITEMSCOPE") || !e.hasAttribute("ITEMTYPE")) return null; |

cjhopman
2014/04/17 17:35:26
This should be a different function so you don't h
kuan
2014/04/18 00:19:03
Done.
|
| + |
| + skipChildren[0] = true; // Caller must skip the children of this item, whether or not its type is supported. |
| + ThingItem newItem = null; |
| + Type type = getType(e); |
| + switch (type) { |
| + case IMAGE: |
| + newItem = new ImageItem(e); |
| + break; |
| + case ARTICLE: |
| + newItem = new ArticleItem(e); |
| + break; |
| + case PERSON: |
| + newItem = new PersonItem(e); |
| + break; |
| + case ORGANIZATION: |
| + newItem = new OrganizationItem(e); |
| + break; |
| + case UNSUPPORTED: |
| + default: |
| + return null; |
| + } |
| + |
| + mItemScopes.add(newItem); |
| + parse(e, newItem); |

cjhopman
2014/04/17 17:35:26
This parse() call makes it harder for me to reason
kuan
2014/04/18 00:19:03
Done. i'm not sure if i code it the way u want re
cjhopman
2014/04/18 01:17:01
See the new comment in parse() for what I meant by
|
| + return newItem; |
| + } |
| + |
| + private Type getType(Element e) { |
| + String type = e.getAttribute("ITEMTYPE"); |
| + Set<Map.Entry<Type, String>> typeUrls = mTypeUrls.entrySet(); |
| + Iterator<Map.Entry<Type, String>> iter = typeUrls.iterator(); |
|
cjhopman
2014/04/17 17:35:26
I think you can do:
for (Map.Entry<Type, String>
kuan
2014/04/18 00:19:03
Done. since it's now a HashMap of <String, Type>,
|
| + while (iter.hasNext()) { |
| + Map.Entry<Type, String> typeUrl = iter.next(); |
| + if (typeUrl.getValue().equalsIgnoreCase(type)) return typeUrl.getKey(); |
| + } |
| + return Type.UNSUPPORTED; |
| + } |
| + |
| + // Extract the value of the "itemprop" attribute in |e|. |
| + // @param currItem ThingItem-extended item for the current Schema.org type being parsed. |
| + // @param embeddedItem ThingItem-extended item for the Schema.org type created for |e|, i.e. |e| |
| + // had specified a Schema.org type. |
| + private void extractProperty(Element e, ThingItem currItem, ThingItem embeddedItem) { |
|
cjhopman
2014/04/17 17:35:26
This function does a lot (and most of what it does
kuan
2014/04/18 00:19:03
Done.
|
| + // "itemprop" attribute is case-sensitive. |
| + String name = e.getAttribute("ITEMPROP"); |
| + if (name == null || name.isEmpty()) return; |
| + if (embeddedItem != null) { // This "itemprop" attribute is an embedded item. |
| + currItem.putItemValue(name, embeddedItem); |
| + } else { // Extract value from the tag. |
| + String value = null; |
| + if (e.hasTagName("A")) { |
| + value = AnchorElement.as(e).getHref(); |
| + } else if (e.hasTagName("IMG")) { |
| + value = ImageElement.as(e).getSrc(); |
| + } else if (e.hasTagName("META")) { |
| + value = MetaElement.as(e).getContent(); |
| + } else if (e.hasTagName("TIME")) { |
| + value = e.getAttribute("datetime"); |
| + } |
| + if (value == null || value.isEmpty()) value = e.getInnerText(); |
| + currItem.putStringValue(name, value); |
| + } |
| + } |
| + |
| + // Returns the value of property |name| from the first item that has a non-empty value for it, |
| + // or an empty string if there is no such item (including when no item was parsed at all). |
| + // Never returns null, so callers such as getTitle() can safely call isEmpty() on the result. |
| + private String findStringProperty(String name) { |
| + for (int i = 0; i < mItemScopes.size(); i++) { |
| + String value = mItemScopes.get(i).getStringProperty(name); |
| + if (!value.isEmpty()) return value; |
| + } |
| + return ""; |
| + } |
| + |
| + // ThingItem for the Schema.org ImageObject type. |
| + private static class ImageItem extends ThingItem { |
| + private static final String[] mStringPropertyNames = { |
| + NAME_PROP, |
| + URL_PROP, |
| + DESCRIPTION_PROP, |
| + IMAGE_PROP, |
| + HEADLINE_PROP, |
| + PUBLISHER_PROP, |
| + COPYRIGHT_HOLDER_PROP, |
| + COPYRIGHT_YEAR_PROP, |
| + CONTENT_URL_PROP, |
| + ENCODING_FORMAT_PROP, |
| + CAPTION_PROP, |
| + REPRESENTATIVE_PROP, |
| + WIDTH_PROP, |
| + HEIGHT_PROP, |
| + }; |
| + |
| + private static final String[] mItemPropertyNames = { |
| + PUBLISHER_PROP, |
| + COPYRIGHT_HOLDER_PROP, |
| + }; |
| + |
| + protected ImageItem(Element elem) { |
| + super(Type.IMAGE, elem, mStringPropertyNames, mItemPropertyNames); |
| + } |
| + |
| + // Builds a MarkupParser.Image from this item's properties. The image URL is the value of |
| + // CONTENT_URL_PROP, falling back to the value of NAME_PROP when that is empty. |
| + @Override |
| + protected MarkupParser.Image getImage() { |
| + MarkupParser.Image image = new MarkupParser.Image(); |
| + String url = getStringProperty(CONTENT_URL_PROP); |
| + image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP); |
| + image.url = image.image; |
| + image.type = getStringProperty(ENCODING_FORMAT_PROP); |
| + image.caption = getStringProperty(CAPTION_PROP); |
| + try { |
| + image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10); |
| + } catch (NumberFormatException ignored) { |
| + // Width is absent or not an integer; leave |image.width| unset. |
| + } |
| + try { |
| + image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10); |
| + } catch (NumberFormatException ignored) { |
| + // Height is absent or not an integer; leave |image.height| unset. |
| + } |
| + return image; |
| + } |
| + } |
| + |
| + private static class ArticleItem extends ThingItem { |
| + private static final String[] mStringPropertyNames = { |
| + NAME_PROP, |
| + URL_PROP, |
| + DESCRIPTION_PROP, |
| + IMAGE_PROP, |
| + HEADLINE_PROP, |
| + PUBLISHER_PROP, |
| + COPYRIGHT_HOLDER_PROP, |
| + COPYRIGHT_YEAR_PROP, |
| + DATE_MODIFIED_PROP, |
| + DATE_PUBLISHED_PROP, |
| + AUTHOR_PROP, |
| + SECTION_PROP, |
| + }; |
| + |
| + private static final String[] mItemPropertyNames = { |
| + PUBLISHER_PROP, |
| + COPYRIGHT_HOLDER_PROP, |
| + AUTHOR_PROP, |
| + }; |
| + |
| + protected ArticleItem(Element elem) { |
| + super(Type.ARTICLE, elem, mStringPropertyNames, mItemPropertyNames); |
| + } |
| + |
| + @Override |
| + protected MarkupParser.Article getArticle() { |
| + MarkupParser.Article article = new MarkupParser.Article(); |
| + article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); |
| + article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); |
| + article.section = getStringProperty(SECTION_PROP); |
| + String author = getStringProperty(AUTHOR_PROP); |
| + article.authors = author.isEmpty() ? new String[0] : new String[] { author }; |
| + return article; |
| + } |
| + } |
| + |
| + private static class PersonItem extends ThingItem { |
| + private static final String[] mStringPropertyNames = { |
| + NAME_PROP, |
| + URL_PROP, |
| + DESCRIPTION_PROP, |
| + IMAGE_PROP, |
| + FAMILY_NAME_PROP, |
| + GIVEN_NAME_PROP, |
| + }; |
| + |
| + protected PersonItem(Element elem) { |
| + super(Type.PERSON, elem, mStringPropertyNames, new String[0]); |
| + } |
| + |
| + // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and |
| + // FAMILY_NAME_PROP delimited by a whitespace. |
| + @Override |
| + protected String toStringProperty() { |
| + String fullname = getStringProperty(NAME_PROP); |
| + if (fullname.isEmpty()) { |
| + fullname = concat(getStringProperty(GIVEN_NAME_PROP), |
| + getStringProperty(FAMILY_NAME_PROP)); |
| + } |
| + return fullname; |
| + } |
| + } |
| + |
| + private static class OrganizationItem extends ThingItem { |
| + private static final String[] mStringPropertyNames = { |
| + NAME_PROP, |
| + URL_PROP, |
| + DESCRIPTION_PROP, |
| + IMAGE_PROP, |
| + LEGAL_NAME_PROP, |
| + }; |
| + |
| + protected OrganizationItem(Element elem) { |
| + super(Type.ORGANIZATION, elem, mStringPropertyNames, new String[0]); |
| + } |
| + |
| + // Returns either the value of NAME_PROP or LEGAL_NAME_PROP. |
| + @Override |
| + protected String toStringProperty() { |
| + String name = getStringProperty(NAME_PROP); |
| + if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP); |
| + return name; |
| + } |
| + } |
| + |
| + private static String concat(String first, String second) { |
| + String concat = first; |
| + if (!concat.isEmpty() && !second.isEmpty()) concat += " "; |
| + concat += second; |
| + return concat; |
| + } |
| +} |