OLD | NEW |
(Empty) | |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 package com.dom_distiller.client; |
| 6 |
| 7 import java.util.ArrayList; |
| 8 import java.util.HashMap; |
| 9 import java.util.List; |
| 10 import java.util.Map; |
| 11 |
| 12 import com.google.gwt.dom.client.AnchorElement; |
| 13 import com.google.gwt.dom.client.Element; |
| 14 import com.google.gwt.dom.client.ImageElement; |
| 15 import com.google.gwt.dom.client.MetaElement; |
| 16 import com.google.gwt.dom.client.Node; |
| 17 import com.google.gwt.dom.client.NodeList; |
| 18 |
| 19 /** |
| 20 * This class recognizes and parses schema.org markup tags, and returns the prop
erties that matter |
| 21 * to distilled content. |
| 22 * Schema.org markup (http://schema.org) is based on the microdata format |
| 23 * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html). |
| 24 * For the basic Schema.org Thing type, the basic properties are: name, url, des
cription, image. |
| 25 * In addition, for each type that we support, we also parse more specific prope
rties: |
| 26 * - Article: headline (i.e. title), publisher, copyright year, copyright holder
, date published, |
| 27 * date modified, author, article section |
| 28 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright ho
lder, content url, |
| 29 * encoding format, caption, representative of page, width, heigh
t |
| 30 * - Person: family name, given name |
| 31 * - Organization: legal name. |
| 32 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded.
E.g., the author or |
| 33 * publisher of article or publisher of image could be a Schema.Org Person or Or
ganization type; |
| 34 * in fact, this is the reason we support Person and Organization types. |
| 35 */ |
| 36 public class SchemaOrgParser { |
| 37 static final String NAME_PROP = "name"; |
| 38 static final String URL_PROP = "url"; |
| 39 static final String DESCRIPTION_PROP = "description"; |
| 40 static final String IMAGE_PROP = "image"; |
| 41 static final String HEADLINE_PROP = "headline"; |
| 42 static final String PUBLISHER_PROP = "publisher"; |
| 43 static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; |
| 44 static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; |
| 45 static final String CONTENT_URL_PROP = "contentUrl"; |
| 46 static final String ENCODING_FORMAT_PROP = "encodingFormat"; |
| 47 static final String CAPTION_PROP = "caption"; |
| 48 static final String REPRESENTATIVE_PROP = "representativeOfPage"; |
| 49 static final String WIDTH_PROP = "width"; |
| 50 static final String HEIGHT_PROP = "height"; |
| 51 static final String DATE_PUBLISHED_PROP = "datePublished"; |
| 52 static final String DATE_MODIFIED_PROP = "dateModified"; |
| 53 static final String AUTHOR_PROP = "author"; |
| 54 static final String CREATOR_PROP = "creator"; |
| 55 static final String SECTION_PROP = "articleSection"; |
| 56 static final String ASSOCIATED_MEDIA_PROP = "associatedMedia"; |
| 57 static final String ENCODING_PROP = "encoding"; |
| 58 static final String FAMILY_NAME_PROP = "familyName"; |
| 59 static final String GIVEN_NAME_PROP = "givenName"; |
| 60 static final String LEGAL_NAME_PROP = "legalName"; |
| 61 static final String AUTHOR_REL = "author"; |
| 62 |
| 63 enum Type { // All these types are extended from Thing, directly or indirec
tly. |
| 64 IMAGE, |
| 65 ARTICLE, |
| 66 PERSON, |
| 67 ORGANIZATION, |
| 68 UNSUPPORTED, |
| 69 } |
| 70 |
| 71 static class ThingItem { |
| 72 private final Type mType; |
| 73 private final Map<String, String> mStringProperties; |
| 74 private final Map<String, ThingItem> mItemProperties; |
| 75 |
| 76 ThingItem(Type type) { |
| 77 mType = type; |
| 78 mStringProperties = new HashMap<String, String>(); |
| 79 mItemProperties = new HashMap<String, ThingItem>(); |
| 80 |
| 81 addStringPropertyName(NAME_PROP); |
| 82 addStringPropertyName(URL_PROP); |
| 83 addStringPropertyName(DESCRIPTION_PROP); |
| 84 addStringPropertyName(IMAGE_PROP); |
| 85 } |
| 86 |
| 87 final void addStringPropertyName(String name) { |
| 88 mStringProperties.put(name, ""); |
| 89 } |
| 90 |
| 91 final void addItemPropertyName(String name) { |
| 92 mItemProperties.put(name, null); |
| 93 } |
| 94 |
| 95 final String getStringProperty(String name) { |
| 96 return !mStringProperties.containsKey(name) ? "" : mStringProperties
.get(name); |
| 97 } |
| 98 |
| 99 final ThingItem getItemProperty(String name) { |
| 100 return !mItemProperties.containsKey(name) ? null : mItemProperties.g
et(name); |
| 101 } |
| 102 |
| 103 final Type getType() { return mType; } |
| 104 |
| 105 final boolean isSupported() { return mType != Type.UNSUPPORTED; } |
| 106 |
| 107 // Store |value| for property with |name|, unless the property already h
as a non-empty |
| 108 // value, in which case |value| will be ignored. This means we only kee
p the first value. |
| 109 final void putStringValue(String name, String value) { |
| 110 if (mStringProperties.containsKey(name) && mStringProperties.get(nam
e).isEmpty()) { |
| 111 mStringProperties.put(name, value); |
| 112 } |
| 113 } |
| 114 |
| 115 // Store |value| for property with |name|, unless the property already h
as a non-null value, |
| 116 // in which case, |value| will be ignored. This means we only keep the
first value. |
| 117 final void putItemValue(String name, ThingItem value) { |
| 118 if (mItemProperties.containsKey(name)) mItemProperties.put(name, val
ue); |
| 119 } |
| 120 } |
| 121 |
| 122 private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>(); |
| 123 private String mAuthorFromRel = ""; |
| 124 private static final Map<String, Type> sTypeUrls; |
| 125 |
| 126 static { |
| 127 sTypeUrls = new HashMap<String, Type>(); |
| 128 sTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE); |
| 129 sTypeUrls.put("http://schema.org/Article", Type.ARTICLE); |
| 130 sTypeUrls.put("http://schema.org/BlogPosting", Type.ARTICLE); |
| 131 sTypeUrls.put("http://schema.org/NewsArticle", Type.ARTICLE); |
| 132 sTypeUrls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE); |
| 133 sTypeUrls.put("http://schema.org/TechArticle", Type.ARTICLE); |
| 134 sTypeUrls.put("http://schema.org/Person", Type.PERSON); |
| 135 sTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION); |
| 136 sTypeUrls.put("http://schema.org/Corporation", Type.ORGANIZATION); |
| 137 sTypeUrls.put("http://schema.org/EducationalOrganization", Type.ORGANIZA
TION); |
| 138 sTypeUrls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZAT
ION); |
| 139 sTypeUrls.put("http://schema.org/NGO", Type.ORGANIZATION); |
| 140 } |
| 141 |
| 142 /** |
| 143 * The object that extracts and verifies Schema.org markup tags from |root|. |
| 144 */ |
| 145 public SchemaOrgParser(Element root) { |
| 146 // TODO(kuan): Parsing all tags is pretty expensive, should we do so onl
y lazily? |
| 147 // If parse lazily, all get* methods will need to check for parsed state
and, if necessary, |
| 148 // parse before returning the requested properties. |
| 149 // Note that the <html> element can also be the start of a Schema.org it
em, and hence needs |
| 150 // to be parsed. |
| 151 parse(root, null); |
| 152 } |
| 153 |
| 154 final List<ArticleItem> getArticleItems() { |
| 155 List<ArticleItem> articles = new ArrayList<ArticleItem>(); |
| 156 for (int i = 0; i < mItemScopes.size(); i++) { |
| 157 ThingItem item = mItemScopes.get(i); |
| 158 if (item.mType == Type.ARTICLE) articles.add((ArticleItem) item); |
| 159 } |
| 160 return articles; |
| 161 } |
| 162 |
| 163 final List<ImageItem> getImageItems() { |
| 164 List<ImageItem> images = new ArrayList<ImageItem>(); |
| 165 for (int i = 0; i < mItemScopes.size(); i++) { |
| 166 ThingItem item = mItemScopes.get(i); |
| 167 if (item.mType == Type.IMAGE) images.add((ImageItem) item); |
| 168 } |
| 169 return images; |
| 170 } |
| 171 |
| 172 final String getAuthorFromRel() { return mAuthorFromRel; } |
| 173 |
| 174 private void parse(Element e, ThingItem parentItem) { |
| 175 ThingItem newItem = null; |
| 176 boolean isItemScope = isItemScope(e); |
| 177 // A non-null |parentItem| means we're currently parsing the elements fo
r a schema.org type. |
| 178 String[] propertyNames = parentItem != null ? getItemProp(e) : new Strin
g[0]; |
| 179 |
| 180 if (isItemScope) { |
| 181 // The "itemscope" and "itemtype" attributes of |e| indicate the sta
rt of an item. |
| 182 // Create the corresponding extended-ThingItem, and add it to the li
st if: |
| 183 // 1) its type is supported, and |
| 184 // 2) if the parent is an unsupported type, it's not an "itemprop" a
ttribute of the |
| 185 // parent, based on the rule that an item is a top-level item if
its element doesn't |
| 186 // have an itemprop attribute. |
| 187 newItem = createItemForElement(e); |
| 188 if (newItem != null && newItem.isSupported() && |
| 189 (parentItem == null || parentItem.isSupported() || propertyNames
.length == 0)) { |
| 190 mItemScopes.add(newItem); |
| 191 } |
| 192 } |
| 193 |
| 194 // If parent is a supported type, parse the element for >= 1 properties
in "itemprop" |
| 195 // attribute. |
| 196 if (propertyNames.length > 0 && parentItem.isSupported() && |
| 197 (newItem == null || newItem.isSupported())) { |
| 198 for (int i = 0; i < propertyNames.length; i++) { |
| 199 // If a new item was created above, the property value of this "
itemprop" attribute |
| 200 // is an embedded item, so add it to the parent item. |
| 201 if (newItem != null) { |
| 202 parentItem.putItemValue(propertyNames[i], newItem); |
| 203 } else { |
| 204 // Otherwise, extract the property value from the tag itself,
and add it to the |
| 205 // parent item. |
| 206 parentItem.putStringValue(propertyNames[i], getPropertyValue(
e)); |
| 207 } |
| 208 } |
| 209 } |
| 210 |
| 211 // As per http://schema.org/author (or http://schema.org/Article and sea
rch for "author" |
| 212 // property), if <a> or <link> tags specify rel="author", extract it. |
| 213 if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute
(e); |
| 214 |
| 215 // Now, parse each child element recursively. |
| 216 NodeList<Node> children = e.getChildNodes(); |
| 217 for (int i = 0; i < children.getLength(); i++) { |
| 218 Node child = children.getItem(i); |
| 219 if (child.getNodeType() != Node.ELEMENT_NODE) continue; |
| 220 parse(Element.as(child), newItem != null ? newItem : parentItem); |
| 221 } |
| 222 } |
| 223 |
| 224 private Type getItemType(Element e) { |
| 225 // "itemtype" attribute is case-sensitive. |
| 226 String type = e.getAttribute("ITEMTYPE"); |
| 227 return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPOR
TED; |
| 228 } |
| 229 |
| 230 private ThingItem createItemForElement(Element e) { |
| 231 ThingItem newItem = null; |
| 232 Type type = getItemType(e); |
| 233 switch (type) { |
| 234 case IMAGE: |
| 235 newItem = new ImageItem(); |
| 236 break; |
| 237 case ARTICLE: |
| 238 newItem = new ArticleItem(); |
| 239 break; |
| 240 case PERSON: |
| 241 newItem = new PersonItem(); |
| 242 break; |
| 243 case ORGANIZATION: |
| 244 newItem = new OrganizationItem(); |
| 245 break; |
| 246 case UNSUPPORTED: |
| 247 newItem = new UnsupportedItem(); |
| 248 break; |
| 249 default: |
| 250 return null; |
| 251 } |
| 252 return newItem; |
| 253 } |
| 254 |
| 255 static class ImageItem extends ThingItem { |
| 256 ImageItem() { |
| 257 super(Type.IMAGE); |
| 258 |
| 259 addStringPropertyName(CONTENT_URL_PROP); |
| 260 addStringPropertyName(ENCODING_FORMAT_PROP); |
| 261 addStringPropertyName(CAPTION_PROP); |
| 262 addStringPropertyName(REPRESENTATIVE_PROP); |
| 263 addStringPropertyName(WIDTH_PROP); |
| 264 addStringPropertyName(HEIGHT_PROP); |
| 265 } |
| 266 |
| 267 final boolean isRepresentativeOfPage() { |
| 268 return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true
"); |
| 269 } |
| 270 |
| 271 final MarkupParser.Image getImage() { |
| 272 MarkupParser.Image image = new MarkupParser.Image(); |
| 273 image.image = getStringProperty(CONTENT_URL_PROP); |
| 274 if (image.image.isEmpty()) image.image = getStringProperty(URL_PROP)
; |
| 275 image.url = image.image; |
| 276 image.type = getStringProperty(ENCODING_FORMAT_PROP); |
| 277 image.caption = getStringProperty(CAPTION_PROP); |
| 278 try { |
| 279 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10
); |
| 280 } catch (Exception e) { |
| 281 } |
| 282 try { |
| 283 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP),
10); |
| 284 } catch (Exception e) { |
| 285 } |
| 286 return image; |
| 287 } |
| 288 } |
| 289 |
| 290 static class ArticleItem extends ThingItem { |
| 291 ArticleItem() { |
| 292 super(Type.ARTICLE); |
| 293 |
| 294 addStringPropertyName(HEADLINE_PROP); |
| 295 addStringPropertyName(PUBLISHER_PROP); |
| 296 addStringPropertyName(COPYRIGHT_HOLDER_PROP); |
| 297 addStringPropertyName(COPYRIGHT_YEAR_PROP); |
| 298 addStringPropertyName(DATE_MODIFIED_PROP); |
| 299 addStringPropertyName(DATE_PUBLISHED_PROP); |
| 300 addStringPropertyName(AUTHOR_PROP); |
| 301 addStringPropertyName(CREATOR_PROP); |
| 302 addStringPropertyName(SECTION_PROP); |
| 303 |
| 304 addItemPropertyName(PUBLISHER_PROP); |
| 305 addItemPropertyName(COPYRIGHT_HOLDER_PROP); |
| 306 addItemPropertyName(AUTHOR_PROP); |
| 307 addItemPropertyName(CREATOR_PROP); |
| 308 addItemPropertyName(ASSOCIATED_MEDIA_PROP); |
| 309 addItemPropertyName(ENCODING_PROP); |
| 310 } |
| 311 |
| 312 final MarkupParser.Article getArticle() { |
| 313 MarkupParser.Article article = new MarkupParser.Article(); |
| 314 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); |
| 315 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); |
| 316 article.section = getStringProperty(SECTION_PROP); |
| 317 String author = getPersonOrOrganizationName(AUTHOR_PROP); |
| 318 if (author.isEmpty()) author = getPersonOrOrganizationName(CREATOR_P
ROP); |
| 319 article.authors = author.isEmpty() ? new String[0] : new String[] {
author }; |
| 320 return article; |
| 321 } |
| 322 |
| 323 final String getCopyright() { |
| 324 // Returns a concatenated string of copyright year and copyright hol
der of the article, |
| 325 // delimited by a whitespace. |
| 326 String copyright = concat(getStringProperty(COPYRIGHT_YEAR_PROP), |
| 327 getPersonOrOrganizationName(COPYRIGHT_HOLD
ER_PROP)); |
| 328 return copyright.isEmpty() ? copyright : "Copyright " + copyright; |
| 329 } |
| 330 |
| 331 final String getPersonOrOrganizationName(String propertyName) { |
| 332 // Returns either the string value of |propertyName| or the value re
turned by getName() |
| 333 // of PersonItem or OrganizationItem. |
| 334 String value = getStringProperty(propertyName); |
| 335 if (!value.isEmpty()) return value; |
| 336 |
| 337 ThingItem valueItem = getItemProperty(propertyName); |
| 338 if (valueItem != null) { |
| 339 if (valueItem.getType() == Type.PERSON) { |
| 340 value = ((PersonItem) valueItem).getName(); |
| 341 } else if (valueItem.getType() == Type.ORGANIZATION) { |
| 342 value = ((OrganizationItem) valueItem).getName(); |
| 343 } |
| 344 } |
| 345 return value; |
| 346 } |
| 347 |
| 348 final ImageItem getRepresentativeImageItem() { |
| 349 // Returns the corrresponding ImageItem for "associatedMedia" or "en
coding" property. |
| 350 ThingItem imageItem = getItemProperty(ASSOCIATED_MEDIA_PROP); |
| 351 if (imageItem == null) imageItem = getItemProperty(ENCODING_PROP); |
| 352 return imageItem != null && imageItem.getType() == Type.IMAGE ? |
| 353 (ImageItem) imageItem : null; |
| 354 } |
| 355 |
| 356 final MarkupParser.Image getImage() { |
| 357 // Use value of "image" property to create a MarkupParser.Image. |
| 358 String imageUrl = getStringProperty(IMAGE_PROP); |
| 359 if (imageUrl.isEmpty()) return null; |
| 360 MarkupParser.Image image = new MarkupParser.Image(); |
| 361 image.image = imageUrl; |
| 362 image.url = imageUrl; |
| 363 return image; |
| 364 } |
| 365 } |
| 366 |
| 367 private static class PersonItem extends ThingItem { |
| 368 PersonItem() { |
| 369 super(Type.PERSON); |
| 370 |
| 371 addStringPropertyName(FAMILY_NAME_PROP); |
| 372 addStringPropertyName(GIVEN_NAME_PROP); |
| 373 } |
| 374 |
| 375 String getName() { |
| 376 // Returns either the value of NAME_PROP, or concatenated values of
GIVEN_NAME_PROP and |
| 377 // FAMILY_NAME_PROP delimited by a whitespace. |
| 378 String name = getStringProperty(NAME_PROP); |
| 379 return !name.isEmpty() ? name : |
| 380 concat(getStringProperty(GIVEN_NAME_PROP), getStringProperty
(FAMILY_NAME_PROP)); |
| 381 } |
| 382 } |
| 383 |
| 384 private static class OrganizationItem extends ThingItem { |
| 385 OrganizationItem() { |
| 386 super(Type.ORGANIZATION); |
| 387 |
| 388 addStringPropertyName(LEGAL_NAME_PROP); |
| 389 } |
| 390 |
| 391 String getName() { |
| 392 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP. |
| 393 String name = getStringProperty(NAME_PROP); |
| 394 return !name.isEmpty() ? name : getStringProperty(LEGAL_NAME_PROP); |
| 395 } |
| 396 } |
| 397 |
| 398 private static class UnsupportedItem extends ThingItem { |
| 399 UnsupportedItem() { |
| 400 super(Type.UNSUPPORTED); |
| 401 } |
| 402 } |
| 403 |
| 404 private static boolean isItemScope(Element e) { |
| 405 return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE"); |
| 406 } |
| 407 |
| 408 private static String[] getItemProp(Element e) { |
| 409 // "itemprop" attribute is case-sensitive, and can have multiple propert
ies. |
| 410 String itemprop = e.getAttribute("ITEMPROP"); |
| 411 if (itemprop.isEmpty()) return new String[0]; |
| 412 String[] splits = StringUtil.split(itemprop, "\\s+"); |
| 413 return splits.length > 0 ? splits : new String[] { itemprop }; |
| 414 } |
| 415 |
| 416 private static final Map<String, String> sTagAttributeMap; |
| 417 |
| 418 static { |
| 419 // The key for |sTagAttributeMap| is the tag name, while the entry value
is an array of |
| 420 // attributes in the specified tag from which to extract information: |
| 421 // - 0th attribute: contains the value for the property specified in ite
mprop |
| 422 // - 1st attribute: if available, contains the value for the author prop
erty. |
| 423 sTagAttributeMap = new HashMap<String, String>(); |
| 424 sTagAttributeMap.put("IMG", "SRC"); |
| 425 sTagAttributeMap.put("AUDIO", "SRC"); |
| 426 sTagAttributeMap.put("EMBED", "SRC"); |
| 427 sTagAttributeMap.put("IFRAME", "SRC"); |
| 428 sTagAttributeMap.put("SOURCE", "SRC"); |
| 429 sTagAttributeMap.put("TRACK", "SRC"); |
| 430 sTagAttributeMap.put("VIDEO", "SRC"); |
| 431 sTagAttributeMap.put("A", "HREF"); |
| 432 sTagAttributeMap.put("LINK", "HREF"); |
| 433 sTagAttributeMap.put("AREA", "HREF"); |
| 434 sTagAttributeMap.put("META", "CONTENT"); |
| 435 sTagAttributeMap.put("TIME", "DATETIME"); |
| 436 sTagAttributeMap.put("OBJECT", "DATA"); |
| 437 sTagAttributeMap.put("DATA", "VALUE"); |
| 438 sTagAttributeMap.put("METER", "VALUE"); |
| 439 } |
| 440 |
| 441 // Extracts the property value from |e|. For some tags, the value is a spec
ific attribute, |
| 442 // while for others, it's the text between the start and end tags. |
| 443 private static String getPropertyValue(Element e) { |
| 444 String value = ""; |
| 445 String tagName = e.getTagName(); |
| 446 if (sTagAttributeMap.containsKey(tagName)) { |
| 447 value = e.getAttribute(sTagAttributeMap.get(tagName)); |
| 448 } |
| 449 if (value.isEmpty()) value = e.getInnerText(); |
| 450 return value; |
| 451 } |
| 452 |
| 453 // Extracts the author property from the "rel=author" attribute of an anchor
or a link element. |
| 454 private static String getAuthorFromRelAttribute(Element e) { |
| 455 String author = ""; |
| 456 String tagName = e.getTagName(); |
| 457 if ((tagName.equalsIgnoreCase("A") || tagName.equalsIgnoreCase("LINK"))
&& |
| 458 e.getAttribute("REL").equalsIgnoreCase(AUTHOR_REL)) { |
| 459 author = e.getInnerText(); |
| 460 } |
| 461 return author; |
| 462 } |
| 463 |
| 464 private static String concat(String first, String second) { |
| 465 String concat = first; |
| 466 if (!concat.isEmpty() && !second.isEmpty()) concat += " "; |
| 467 concat += second; |
| 468 return concat; |
| 469 } |
| 470 } |
OLD | NEW |