Index: src/com/dom_distiller/client/SchemaOrgParser.java |
diff --git a/src/com/dom_distiller/client/SchemaOrgParser.java b/src/com/dom_distiller/client/SchemaOrgParser.java |
new file mode 100644 |
index 0000000000000000000000000000000000000000..3bbba9d9b2c70269db634a98d22e6f2a52df2d19 |
--- /dev/null |
+++ b/src/com/dom_distiller/client/SchemaOrgParser.java |
@@ -0,0 +1,544 @@ |
+// Copyright 2014 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+package com.dom_distiller.client; |
+ |
+import java.util.ArrayList; |
+import java.util.HashMap; |
+import java.util.List; |
+import java.util.Map; |
+ |
+import com.google.gwt.dom.client.AnchorElement; |
+import com.google.gwt.dom.client.Element; |
+import com.google.gwt.dom.client.ImageElement; |
+import com.google.gwt.dom.client.MetaElement; |
+import com.google.gwt.dom.client.Node; |
+import com.google.gwt.dom.client.NodeList; |
+ |
+/** |
+ * This class recognizes and parses schema.org markup tags, and returns the properties that matter |
+ * to distilled content. |
+ * Schema.org markup (http://schema.org) is based on the microdata format |
+ * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html). |
+ * For the basic Schema.org Thing type, the basic properties are: name, url, description, image. |
+ * In addition, for each type that we support, we also parse more specific properties: |
+ * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published, |
+ * date modified, author, article section |
+ * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url, |
+ * encoding format, caption, representative of page, width, height |
+ * - Person: family name, given name |
+ * - Organization: legal name. |
+ * The value of a schema.org property can itself be a schema.org item, i.e. an embedded item. |
+ * For example, the author or publisher of an article, or the publisher of an image, could be a |
+ * schema.org Person or Organization; this is the reason we support the Person and Organization |
+ * types. |
+ */ |
+public class SchemaOrgParser { |
+ // Names of the "itemprop" properties that this parser recognizes. |
+ // Listed as string properties by every supported item type. |
+ static final String NAME_PROP = "name"; |
+ static final String URL_PROP = "url"; |
+ static final String DESCRIPTION_PROP = "description"; |
+ static final String IMAGE_PROP = "image"; |
+ // Listed by both ImageItem and ArticleItem. |
+ static final String HEADLINE_PROP = "headline"; |
+ static final String PUBLISHER_PROP = "publisher"; |
+ static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; |
+ static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; |
+ // Listed by ImageItem only. |
+ static final String CONTENT_URL_PROP = "contentUrl"; |
+ static final String ENCODING_FORMAT_PROP = "encodingFormat"; |
+ static final String CAPTION_PROP = "caption"; |
+ static final String REPRESENTATIVE_PROP = "representativeOfPage"; |
+ static final String WIDTH_PROP = "width"; |
+ static final String HEIGHT_PROP = "height"; |
+ // Listed by ArticleItem only. |
+ static final String DATE_PUBLISHED_PROP = "datePublished"; |
+ static final String DATE_MODIFIED_PROP = "dateModified"; |
+ static final String AUTHOR_PROP = "author"; |
+ static final String CREATOR_PROP = "creator"; |
+ static final String SECTION_PROP = "articleSection"; |
+ // Item-valued properties of ArticleItem. |
+ static final String ASSOCIATED_MEDIA_PROP = "associatedMedia"; |
+ static final String ENCODING_PROP = "encoding"; |
+ // Listed by PersonItem only. |
+ static final String FAMILY_NAME_PROP = "familyName"; |
+ static final String GIVEN_NAME_PROP = "givenName"; |
+ // Listed by OrganizationItem only. |
+ static final String LEGAL_NAME_PROP = "legalName"; |
+ // Value of the "rel" attribute that marks an <a> or <link> element as naming the author. |
+ static final String AUTHOR_REL = "author"; |
+ |
+ // The kinds of schema.org items this parser distinguishes. "itemtype" URLs are mapped to |
+ // these values through |sTypeUrls|; any URL not in that map is treated as UNSUPPORTED. |
+ enum Type { // All these types are extended from Thing, directly or indirectly. |
+ IMAGE, |
+ ARTICLE, |
+ PERSON, |
+ ORGANIZATION, |
+ UNSUPPORTED, |
+ } |
+ |
+ static class ThingItem { |
+ // The kind of schema.org item this object represents. |
+ private final Type mType; |
+ // Names of the properties whose values are stored as strings. |
+ private final String[] mStringPropertyNames; |
cjhopman
2014/04/25 20:52:34
How this all works together is rather confusing to
kuan
2014/04/29 00:23:10
Done. i was using map b4, but changed to arrays w
|
+ // One { property name, string property to read from the embedded item } pair per entry. |
+ private final String[][] mItemPropertyNames; |
+ // Values parallel to |mStringPropertyNames|; an entry is null until a value is stored. |
+ private final String[] mStringProperties; |
+ // Embedded items parallel to |mItemPropertyNames|; an entry is null until an item is stored. |
+ private final ThingItem[] mItemProperties; |
+ |
+ // |stringPropertyNames| and |itemPropertyNames| are names of properties that this |
cjhopman
2014/04/25 20:52:34
nit: Use javadoc comment format
kuan
2014/04/29 00:23:10
comments r not needed now.
|
+ // ThingItem extracts from the page. |
+ // @param type is the kind of item being created. |
+ // @param stringPropertyNames is a String[] of property names whose values are String type. |
+ // @param itemPropertyNames is an array of String pairs, one pair per property, where: |
+ // - 1st element: name of a property whose value is an embedded ThingItem |
+ // - 2nd element: name of the string property that getStringProperty() retrieves from |
+ // that embedded ThingItem when the 1st element is requested. |
+ // Both arrays are stored by reference (not copied); the value arrays start out all null. |
+ ThingItem(Type type, String[] stringPropertyNames, String[][] itemPropertyNames) { |
+ mType = type; |
+ mStringPropertyNames = stringPropertyNames; |
+ mItemPropertyNames = itemPropertyNames; |
+ mStringProperties = new String[mStringPropertyNames.length]; |
+ mItemProperties = new ThingItem[mItemPropertyNames.length]; |
+ } |
+ |
+ // Base implementation: returns null. ImageItem and ArticleItem override this to build a |
+ // MarkupParser.Image from their properties. |
+ MarkupParser.Image getImage() { |
+ return null; |
+ } |
+ |
+ // Base implementation: returns null. ArticleItem overrides this to build a |
+ // MarkupParser.Article from its properties. |
+ MarkupParser.Article getArticle() { |
cjhopman
2014/04/25 20:52:34
Let's get rid of these functions that only make se
kuan
2014/04/29 00:23:10
Done. i was doing it this way to avoid casting Th
|
+ return null; |
+ } |
+ |
+ // Returns the string value of property |name|, or "" if no value is available. |
+ // A non-empty value stored directly on this item wins; otherwise, if |name| is an |
+ // item-valued property with an embedded item stored, the value is read from that item |
+ // using the sub-property paired with |name| in |mItemPropertyNames|. |
+ // Never returns null, as long as the embedded item's getStringProperty() doesn't. |
+ String getStringProperty(String name) { |
+ // Check if property exists in |mStringProperties|. |
+ for (int i = 0; i < mStringPropertyNames.length; i++) { |
+ if (name.equals(mStringPropertyNames[i])) { |
+ String value = mStringProperties[i]; |
+ if (value != null && !value.isEmpty()) return value; |
+ // Name matched but no usable value: stop scanning and fall through to the items. |
+ break; |
+ } |
+ } |
+ // Otherwise, repeat for |mItemProperties|. |
cjhopman
2014/04/25 20:52:34
This is weird that we go looking into the properti
kuan
2014/04/29 00:23:10
Done. author, publisher and copyright holder can
|
+ for (int i = 0; i < mItemPropertyNames.length; i++) { |
+ if (!name.equals(mItemPropertyNames[i][0])) continue; |
+ if (mItemProperties[i] != null) { |
+ // Delegate to the embedded item, asking for the paired sub-property. |
+ return mItemProperties[i].getStringProperty(mItemPropertyNames[i][1]); |
+ } |
+ break; |
+ } |
+ return ""; |
+ } |
+ |
+ // Returns the embedded item stored for item-valued property |name|. The result is null |
+ // when |name| is not an item-valued property of this item or nothing has been stored yet. |
+ final ThingItem getItemProperty(String name) { |
+     int slot = 0; |
+     for (String[] pair : mItemPropertyNames) { |
+         if (name.equals(pair[0])) return mItemProperties[slot]; |
+         slot++; |
+     } |
+     return null; |
+ } |
+ |
+ // Returns the kind of item this object was constructed as. |
+ final Type getType() { return mType; } |
+ |
+ // Returns true for every item type except Type.UNSUPPORTED. |
+ final boolean isSupported() { return mType != Type.UNSUPPORTED; } |
+ |
+ // Returns true if the "representativeOfPage" property of this item is "true", ignoring case. |
+ final boolean isImageRepresentativeOfPage() { |
+     return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true"); |
+ } |
+ |
+ // Records |value| as the string value of property |name|. Does nothing if |name| is not |
+ // one of this item's string properties. |
+ // @param override: when true, |value| always replaces the stored value; when false, |value| |
+ // is stored only if the property currently has no value or an empty one. |
+ final void putStringValue(String name, String value, boolean override) { |
+     for (int idx = 0; idx < mStringPropertyNames.length; idx++) { |
+         if (!name.equals(mStringPropertyNames[idx])) continue; |
+         String current = mStringProperties[idx]; |
+         boolean vacant = current == null || current.isEmpty(); |
+         if (override || vacant) mStringProperties[idx] = value; |
+         return; |
+     } |
+ } |
+ |
+ // Records |value| as the embedded item of property |name|. Only the first item is kept: if |
+ // the property already holds a non-null item, |value| is dropped. Does nothing if |name| is |
+ // not one of this item's item-valued properties. |
+ final void putItemValue(String name, ThingItem value) { |
+     for (int idx = 0; idx < mItemPropertyNames.length; idx++) { |
+         if (!name.equals(mItemPropertyNames[idx][0])) continue; |
+         if (mItemProperties[idx] == null) mItemProperties[idx] = value; |
+         return; |
+     } |
+ } |
+ } |
+ |
+ // Supported items collected by parse(), in the order they were added. |
+ private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>(); |
+ // Inner text of the first element found with rel="author", or "" if none was found. |
+ private String mAuthorFromRel = ""; |
+ // Maps an "itemtype" URL to the item type it is parsed as. |
+ private static final Map<String, Type> sTypeUrls; |
+ // Maps a tag name to the attributes from which property values are read for that tag. |
+ private static final Map<String, String[]> sTagAttributesMap; |
+ // Shared empty property lists, passed to ThingItem by item types that have no such properties. |
+ private static final String[] sEmptyStringPropertyNames = { |
+ // Intentionally empty, declared so that it's initialized statically. |
+ }; |
+ private static final String[][] sEmptyItemPropertyNames = { |
+ // Intentionally empty, declared so that it's initialized statically. |
+ }; |
+ |
+ // Populates the two static lookup tables used while parsing. |
+ static { |
+ // Each supported "itemtype" URL and the item type it is parsed as. Several schema.org |
+ // types share one item type (e.g. BlogPosting and NewsArticle are both parsed as ARTICLE). |
+ sTypeUrls = new HashMap<String, Type>(); |
+ sTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE); |
+ sTypeUrls.put("http://schema.org/Article", Type.ARTICLE); |
+ sTypeUrls.put("http://schema.org/BlogPosting", Type.ARTICLE); |
+ sTypeUrls.put("http://schema.org/NewsArticle", Type.ARTICLE); |
+ sTypeUrls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE); |
+ sTypeUrls.put("http://schema.org/TechArticle", Type.ARTICLE); |
+ sTypeUrls.put("http://schema.org/Person", Type.PERSON); |
+ sTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION); |
+ sTypeUrls.put("http://schema.org/Corporation", Type.ORGANIZATION); |
+ sTypeUrls.put("http://schema.org/EducationalOrganization", Type.ORGANIZATION); |
+ sTypeUrls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZATION); |
+ sTypeUrls.put("http://schema.org/NGO", Type.ORGANIZATION); |
+ |
+ // The key for |sTagAttributesMap| is the tag name, while the entry value is an array of |
+ // attributes in the specified tag from which to extract information: |
+ // - 0th attribute: contains the value for the property specified in itemprop |
+ // - 1st attribute: if available, contains the value for the author property. |
+ // Tags that are not listed here have their property value taken from their inner text. |
+ sTagAttributesMap = new HashMap<String, String[]>(); |
cjhopman
2014/04/25 20:52:34
nit: move this (and the sTagAttributesMap declarat
kuan
2014/04/29 00:23:10
Done.
|
+ sTagAttributesMap.put("IMG", new String[] { "SRC" }); |
+ sTagAttributesMap.put("AUDIO", new String[] { "SRC" }); |
+ sTagAttributesMap.put("EMBED", new String[] { "SRC" }); |
+ sTagAttributesMap.put("IFRAME", new String[] { "SRC" }); |
+ sTagAttributesMap.put("SOURCE", new String[] { "SRC" }); |
+ sTagAttributesMap.put("TRACK", new String[] { "SRC" }); |
+ sTagAttributesMap.put("VIDEO", new String[] { "SRC" }); |
+ sTagAttributesMap.put("A", new String[] { "HREF", "REL" }); |
cjhopman
2014/04/25 20:52:34
I can't find documentation anywhere that says to u
kuan
2014/04/29 00:23:10
http://schema.org/author, or from http://schema.or
cjhopman
2014/04/29 17:04:19
That's interesting. Maybe you could add a comment
kuan
2014/04/29 23:26:43
Done. comment is in parse() where the rel attribu
|
+ sTagAttributesMap.put("LINK", new String[] { "HREF", "REL" }); |
+ sTagAttributesMap.put("AREA", new String[] { "HREF" }); |
+ sTagAttributesMap.put("META", new String[] { "CONTENT" }); |
+ sTagAttributesMap.put("TIME", new String[] { "DATETIME" }); |
+ sTagAttributesMap.put("OBJECT", new String[] { "DATA" }); |
+ sTagAttributesMap.put("DATA", new String[] { "VALUE" }); |
+ sTagAttributesMap.put("METER", new String[] { "VALUE" }); |
+ } |
+ |
+ /** |
+ * Creates a parser and immediately parses |root| and all of its descendant elements for |
+ * Schema.org markup. The results are available from the find* and get* methods afterwards. |
+ * |
+ * @param root the element at which parsing starts; it is itself examined for markup. |
+ */ |
+ public SchemaOrgParser(Element root) { |
+ // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily? |
+ // If parse lazily, all get* methods will need to check for parsed state and, if necessary, |
+ // parse before returning the requested properties. |
+ // Note that the <html> element can also be the start of a Schema.org item, and hence needs |
+ // to be parsed. |
+ // |root| has no enclosing item, hence the null parent. |
+ parse(root, null); |
+ } |
+ |
+ // Returns the first item that has the requested property value. |
cjhopman
2014/04/25 20:52:34
I don't think that this is something we would ever
kuan
2014/04/29 00:23:10
Done. Image also has headline, but i assume we on
|
+ String findStringProperty(String name) { |
+     // Items are consulted in the order they were collected; the first non-empty value wins. |
+     for (ThingItem item : mItemScopes) { |
+         String found = item.getStringProperty(name); |
+         if (!found.isEmpty()) return found; |
+     } |
+     // No collected item (or none at all) has a value for |name|. |
+     return ""; |
+ } |
+ |
+ // Returns the first collected item whose type is Type.ARTICLE, or null if there is none. |
+ ThingItem findFirstArticle() { |
cjhopman
2014/04/25 20:52:34
This should return an ArticleItem
kuan
2014/04/29 00:23:10
Done.
|
+ for (int i = 0; i < mItemScopes.size(); i++) { |
+ ThingItem item = mItemScopes.get(i); |
+ if (item.mType == Type.ARTICLE) return item; |
+ } |
+ return null; |
+ } |
+ |
+ // Returns the list of collected items. This is the parser's own list, not a copy. |
+ final List<ThingItem> getItemScopes() { return mItemScopes; } |
+ |
+ // Returns the author extracted from a rel="author" element, or "" if none was found. |
+ final String getAuthorFromRel() { return mAuthorFromRel; } |
+ |
+ // Joins |first| and |second|, inserting a single space between them only when both are |
+ // non-empty. |
+ static String concat(String first, String second) { |
+     boolean needsSeparator = !first.isEmpty() && !second.isEmpty(); |
+     return needsSeparator ? first + " " + second : first + second; |
+ } |
+ |
+ // Recursively walks the subtree rooted at |e|, collecting supported items into |mItemScopes|, |
+ // storing property values on the enclosing item, and recording the first rel="author" text. |
+ // @param e the element to examine; its child elements are visited afterwards. |
+ // @param parentItem the item in whose scope |e| lies, or null if |e| is not inside any item. |
+ private void parse(Element e, ThingItem parentItem) { |
+     // A non-null |parentItem| means we're currently parsing the elements for a schema.org |
+     // type; only then can an "itemprop" attribute on |e| name properties of that item. |
+     String[] propertyNames = parentItem != null ? getItemProp(e) : new String[0]; |
+ |
+     ThingItem newItem = null; |
+     if (isItemScope(e)) { |
+         // The "itemscope" and "itemtype" attributes of |e| indicate the start of an item. |
+         // Create the corresponding extended-ThingItem, and add it to the list if: |
+         // 1) its type is supported, and |
+         // 2) if the parent is an unsupported type, it's not an "itemprop" attribute of the |
+         //    parent, based on the rule that an item is a top-level item if its element |
+         //    doesn't have an itemprop attribute. |
+         newItem = createItemForElement(e); |
+         if (newItem != null && newItem.isSupported() && |
+                 (parentItem == null || parentItem.isSupported() || |
+                         propertyNames.length == 0)) { |
+             mItemScopes.add(newItem); |
+         } |
+     } |
+ |
+     // If parent is a supported type, parse the element for >= 1 properties in "itemprop" |
+     // attribute. (|propertyNames| is non-empty only when |parentItem| is non-null.) |
+     if (propertyNames.length > 0 && parentItem.isSupported() && |
+             (newItem == null || newItem.isSupported())) { |
+         if (newItem != null) { |
+             // A new item was created above, so the value of every property named in this |
+             // "itemprop" attribute is that embedded item; add it to the parent item. |
+             for (String propertyName : propertyNames) { |
+                 parentItem.putItemValue(propertyName, newItem); |
+             } |
+         } else { |
+             // Otherwise, the value comes from the tag itself. It is the same for every |
+             // property name, so extract it once instead of once per name. |
+             String value = getPropertyValue(e); |
+             for (String propertyName : propertyNames) { |
+                 parentItem.putStringValue(propertyName, value, false); |
+             } |
+         } |
+     } |
+ |
+     // If <a> or <link> tags specify rel="author", extract it. Only the first one is kept. |
+     if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute(e); |
+ |
+     // Now, parse each child element recursively. Children of a new item are parsed in that |
+     // item's scope; otherwise they stay in the scope of |parentItem|. |
+     ThingItem scope = newItem != null ? newItem : parentItem; |
+     NodeList<Node> children = e.getChildNodes(); |
+     for (int i = 0; i < children.getLength(); i++) { |
+         Node child = children.getItem(i); |
+         if (child.getNodeType() != Node.ELEMENT_NODE) continue; |
+         parse(Element.as(child), scope); |
+     } |
+ } |
+ |
+ // Maps the "itemtype" attribute of |e| to an item type. Type.UNSUPPORTED is returned for any |
+ // value that is not a key of |sTypeUrls|. |
+ private Type getItemType(Element e) { |
+     // "itemtype" attribute is case-sensitive. |
+     Type known = sTypeUrls.get(e.getAttribute("ITEMTYPE")); |
+     return known != null ? known : Type.UNSUPPORTED; |
+ } |
+ |
+ // Creates the ThingItem subclass matching the item type of |e|. An UnsupportedItem is |
+ // created for Type.UNSUPPORTED; null is returned only for a type not handled below. |
+ private ThingItem createItemForElement(Element e) { |
+     switch (getItemType(e)) { |
+         case IMAGE: |
+             return new ImageItem(); |
+         case ARTICLE: |
+             return new ArticleItem(); |
+         case PERSON: |
+             return new PersonItem(); |
+         case ORGANIZATION: |
+             return new OrganizationItem(); |
+         case UNSUPPORTED: |
+             return new UnsupportedItem(); |
+         default: |
+             return null; |
+     } |
+ } |
+ |
+ // An item of Type.IMAGE. Its publisher and copyright holder may be embedded items, in which |
+ // case their "name" is used as the string value. |
+ private static class ImageItem extends ThingItem { |
+     private static final String[] sStringPropertyNames = { |
+         NAME_PROP, |
+         URL_PROP, |
+         DESCRIPTION_PROP, |
+         IMAGE_PROP, |
+         HEADLINE_PROP, |
+         PUBLISHER_PROP, |
+         COPYRIGHT_HOLDER_PROP, |
+         COPYRIGHT_YEAR_PROP, |
+         CONTENT_URL_PROP, |
+         ENCODING_FORMAT_PROP, |
+         CAPTION_PROP, |
+         REPRESENTATIVE_PROP, |
+         WIDTH_PROP, |
+         HEIGHT_PROP, |
+     }; |
+ |
+     private static final String[][] sItemPropertyNames = { |
+         new String[] { PUBLISHER_PROP, NAME_PROP }, |
+         new String[] { COPYRIGHT_HOLDER_PROP, NAME_PROP }, |
+     }; |
+ |
+     ImageItem() { |
+         super(Type.IMAGE, sStringPropertyNames, sItemPropertyNames); |
+     } |
+ |
+     // Same as the base implementation, except that "contentUrl" and "url" are treated as one |
+     // property: asking for either returns "contentUrl" if non-empty, otherwise "url". |
+     @Override |
+     String getStringProperty(String propertyName) { |
+         if (!propertyName.equals(CONTENT_URL_PROP) && !propertyName.equals(URL_PROP)) { |
+             return super.getStringProperty(propertyName); |
+         } |
+         // Returns either the value of CONTENT_URL_PROP or URL_PROP. |
+         String url = super.getStringProperty(CONTENT_URL_PROP); |
+         return url.isEmpty() ? super.getStringProperty(URL_PROP) : url; |
+     } |
+ |
+     // Builds a MarkupParser.Image from this item's properties. The width and height fields |
+     // are assigned only when the corresponding property parses as a base-10 integer. |
+     @Override |
+     MarkupParser.Image getImage() { |
+         MarkupParser.Image image = new MarkupParser.Image(); |
+         image.image = getStringProperty(CONTENT_URL_PROP); |
+         image.url = image.image; |
+         image.type = getStringProperty(ENCODING_FORMAT_PROP); |
+         image.caption = getStringProperty(CAPTION_PROP); |
+         try { |
+             image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10); |
+         } catch (NumberFormatException ignored) { |
+             // Width is missing or not a number; leave the field at its initial value. |
+         } |
+         try { |
+             image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10); |
+         } catch (NumberFormatException ignored) { |
+             // Height is missing or not a number; leave the field at its initial value. |
+         } |
+         return image; |
+     } |
+ } |
+ |
+ // An item of Type.ARTICLE. Publisher, copyright holder, author and creator may be embedded |
+ // items whose "name" is used as the string value; "associatedMedia" and "encoding" are |
+ // embedded items whose "contentUrl" is used. |
+ private static class ArticleItem extends ThingItem { |
+ private static final String[] sStringPropertyNames = { |
+ NAME_PROP, |
+ URL_PROP, |
+ DESCRIPTION_PROP, |
+ IMAGE_PROP, |
+ HEADLINE_PROP, |
+ PUBLISHER_PROP, |
+ COPYRIGHT_HOLDER_PROP, |
+ COPYRIGHT_YEAR_PROP, |
+ DATE_MODIFIED_PROP, |
+ DATE_PUBLISHED_PROP, |
+ AUTHOR_PROP, |
+ CREATOR_PROP, |
+ SECTION_PROP, |
+ }; |
+ |
+ // Each pair is { item-valued property, string property read from the embedded item }. |
+ private static final String[][] sItemPropertyNames = { |
+ new String[] { PUBLISHER_PROP, NAME_PROP }, |
+ new String[] { COPYRIGHT_HOLDER_PROP, NAME_PROP }, |
+ new String[] { AUTHOR_PROP, NAME_PROP }, |
+ new String[] { CREATOR_PROP, NAME_PROP }, |
+ new String[] { ASSOCIATED_MEDIA_PROP, CONTENT_URL_PROP }, |
+ new String[] { ENCODING_PROP, CONTENT_URL_PROP }, |
+ }; |
+ |
+ ArticleItem() { |
+ super(Type.ARTICLE, sStringPropertyNames, sItemPropertyNames); |
+ } |
+ |
+ // NOTE: this getter has a side effect. When an embedded "associatedMedia" or "encoding" |
+ // item exists, it is modified and null is returned; only otherwise is an image built from |
+ // the "image" property (null again if that property is empty). |
+ @Override |
+ MarkupParser.Image getImage() { |
+ // If "associatedMedia" or "encoding" property exists, set the "representativeOfPage" |
+ // property of the corresponding ImageItem to "true", so that that image (which will |
+ // be picked up when looping through |mItemScopes|) will be the dominant one. |
+ ThingItem imageItem = getItemProperty(ASSOCIATED_MEDIA_PROP); |
+ if (imageItem == null) imageItem = getItemProperty(ENCODING_PROP); |
+ if (imageItem != null) { |
+ // Has an effect only if |imageItem| lists "representativeOfPage" as a string property. |
+ imageItem.putStringValue(REPRESENTATIVE_PROP, "true", true); |
cjhopman
2014/04/25 20:52:34
This is strange, and only seems to work if I do ge
kuan
2014/04/29 00:23:10
Done. i've removed setting representativeOfPage a
|
+ return null; |
+ } |
+ |
+ // Use value of IMAGE_PROP to create a MarkupParser.Image. |
+ String imageUrl = getStringProperty(IMAGE_PROP); |
+ if (imageUrl.isEmpty()) return null; |
+ MarkupParser.Image image = new MarkupParser.Image(); |
+ image.image = imageUrl; |
+ image.url = imageUrl; |
+ return image; |
+ } |
+ |
+ // Builds a MarkupParser.Article from this item's properties. The author falls back to the |
+ // "creator" property when "author" is empty; at most one author is reported. |
+ @Override |
+ MarkupParser.Article getArticle() { |
+ MarkupParser.Article article = new MarkupParser.Article(); |
+ article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); |
+ article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); |
+ article.section = getStringProperty(SECTION_PROP); |
+ String author = getStringProperty(AUTHOR_PROP); |
+ if (author.isEmpty()) author = getStringProperty(CREATOR_PROP); |
+ article.authors = author.isEmpty() ? new String[0] : new String[] { author }; |
+ return article; |
+ } |
+ } |
+ |
+ // An item of Type.PERSON. It has no item-valued properties. |
+ private static class PersonItem extends ThingItem { |
+ private static final String[] sStringPropertyNames = { |
+ NAME_PROP, |
+ URL_PROP, |
+ DESCRIPTION_PROP, |
+ IMAGE_PROP, |
+ FAMILY_NAME_PROP, |
+ GIVEN_NAME_PROP, |
+ }; |
+ |
+ PersonItem() { |
+ super(Type.PERSON, sStringPropertyNames, sEmptyItemPropertyNames); |
+ } |
+ |
+ // Same as the base implementation, except for "name": when no name is stored, the name is |
+ // assembled from the given name and the family name. |
+ @Override |
+ String getStringProperty(String propertyName) { |
cjhopman
2014/04/25 20:52:34
Overriding getStringProperty like this is a little
kuan
2014/04/29 00:23:10
Done. i've made getStringProperty final, but impl
|
+ if (!propertyName.equals(NAME_PROP)) return super.getStringProperty(propertyName); |
+ // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and |
+ // FAMILY_NAME_PROP delimited by a whitespace. |
+ String fullname = super.getStringProperty(NAME_PROP); |
+ if (fullname.isEmpty()) { |
+ fullname = concat(super.getStringProperty(GIVEN_NAME_PROP), |
+ super.getStringProperty(FAMILY_NAME_PROP)); |
+ } |
+ return fullname; |
+ } |
+ } |
+ |
+ // An item of Type.ORGANIZATION. It has no item-valued properties. |
+ private static class OrganizationItem extends ThingItem { |
+     private static final String[] sStringPropertyNames = { |
+         NAME_PROP, |
+         URL_PROP, |
+         DESCRIPTION_PROP, |
+         IMAGE_PROP, |
+         LEGAL_NAME_PROP, |
+     }; |
+ |
+     OrganizationItem() { |
+         super(Type.ORGANIZATION, sStringPropertyNames, sEmptyItemPropertyNames); |
+     } |
+ |
+     // Same as the base implementation, except for "name": when no name is stored, the legal |
+     // name is returned in its place. |
+     @Override |
+     String getStringProperty(String propertyName) { |
+         if (propertyName.equals(NAME_PROP)) { |
+             String primary = super.getStringProperty(NAME_PROP); |
+             return primary.isEmpty() ? super.getStringProperty(LEGAL_NAME_PROP) : primary; |
+         } |
+         return super.getStringProperty(propertyName); |
+     } |
+ } |
+ |
+ // Placeholder for an item whose "itemtype" is not recognized. It has no properties at all, |
+ // so nothing can be stored on it or read from it. |
+ private static class UnsupportedItem extends ThingItem { |
+ UnsupportedItem(){ |
+ super(Type.UNSUPPORTED, sEmptyStringPropertyNames, sEmptyItemPropertyNames); |
+ } |
+ } |
+ |
+ // Returns true if |e| starts an item, i.e. carries both the "itemscope" and the "itemtype" |
+ // attributes. |
+ private static boolean isItemScope(Element e) { |
+ return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE"); |
+ } |
+ |
+ // Returns the property names listed in the "itemprop" attribute of |e|, or an empty array if |
+ // the attribute value is empty. |
+ private static String[] getItemProp(Element e) { |
+     // "itemprop" attribute is case-sensitive, and can have multiple properties. |
+     String raw = e.getAttribute("ITEMPROP"); |
+     if (raw.isEmpty()) return new String[0]; |
+     String[] names = StringUtil.split(raw, "\\s+"); |
+     // If splitting yields nothing, treat the whole attribute value as a single name. |
+     if (names.length == 0) names = new String[] { raw }; |
+     return names; |
+ } |
+ |
+ // Reads the property value carried by |e|. For the tags listed in |sTagAttributesMap| the |
+ // value is the first attribute listed for that tag; for any other tag, or when that |
+ // attribute is empty, the value is the text between the start and end tags. |
+ private static String getPropertyValue(Element e) { |
+     String[] attrs = sTagAttributesMap.get(e.getTagName()); |
+     String fromAttribute = attrs != null ? e.getAttribute(attrs[0]) : ""; |
+     return fromAttribute.isEmpty() ? e.getInnerText() : fromAttribute; |
+ } |
+ |
+ // Extracts the author from |e| when |e| is a tag for which |sTagAttributesMap| lists a second |
+ // attribute (the "REL" attribute of <a> and <link>) and that attribute contains the "author" |
+ // link type. Link types are matched case-insensitively, and the attribute may list several |
+ // whitespace-separated link types (e.g. rel="author nofollow"). |
+ // @return the inner text of |e| if it is marked as author, otherwise "". |
+ private static String getAuthorFromRelAttribute(Element e) { |
+     String[] attrs = sTagAttributesMap.get(e.getTagName()); |
+     if (attrs == null || attrs.length < 2) return ""; |
+     String rel = e.getAttribute(attrs[1]); |
+     // Whole-value match first, so a plain rel="author" never depends on how splitting behaves. |
+     if (rel.equalsIgnoreCase(AUTHOR_REL)) return e.getInnerText(); |
+     // Otherwise look for "author" among the individual link types. |
+     // NOTE(review): assumes StringUtil.split() splits on the regex, as getItemProp() does. |
+     for (String linkType : StringUtil.split(rel, "\\s+")) { |
+         if (linkType.equalsIgnoreCase(AUTHOR_REL)) return e.getInnerText(); |
+     } |
+     return ""; |
+ } |
+} |