Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: addressed comments Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package com.dom_distiller.client;
6
7 import java.util.ArrayList;
8 import java.util.HashMap;
9 import java.util.List;
10 import java.util.Map;
11
12 import com.google.gwt.dom.client.AnchorElement;
13 import com.google.gwt.dom.client.Element;
14 import com.google.gwt.dom.client.ImageElement;
15 import com.google.gwt.dom.client.MetaElement;
16 import com.google.gwt.dom.client.Node;
17 import com.google.gwt.dom.client.NodeList;
18
19 /**
20 * This class recognizes and parses schema.org markup tags, and returns the properties that matter
21 * to distilled content.
22 * Schema.org markup (http://schema.org) is based on the microdata format
23 * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html).
24 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.
25 * In addition, for each type that we support, we also parse more specific properties:
26 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,
27 * date modified, author, article section
28 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,
29 * encoding format, caption, representative of page, width, height
30 * - Person: family name, given name
31 * - Organization: legal name.
32 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or
33 * publisher of article or publisher of image could be a Schema.Org Person or Organization type;
34 * in fact, this is the reason we support Person and Organization types.
35 */
36 public class SchemaOrgParser {
// Names of the "itemprop" values this parser knows about.
// The first four are registered for every ThingItem by the ThingItem constructor.
37 static final String NAME_PROP = "name";
38 static final String URL_PROP = "url";
39 static final String DESCRIPTION_PROP = "description";
40 static final String IMAGE_PROP = "image";
// Registered by ArticleItem.
41 static final String HEADLINE_PROP = "headline";
42 static final String PUBLISHER_PROP = "publisher";
43 static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";
44 static final String COPYRIGHT_YEAR_PROP = "copyrightYear";
// Registered by ImageItem.
45 static final String CONTENT_URL_PROP = "contentUrl";
46 static final String ENCODING_FORMAT_PROP = "encodingFormat";
47 static final String CAPTION_PROP = "caption";
48 static final String REPRESENTATIVE_PROP = "representativeOfPage";
49 static final String WIDTH_PROP = "width";
50 static final String HEIGHT_PROP = "height";
// Registered by ArticleItem.
51 static final String DATE_PUBLISHED_PROP = "datePublished";
52 static final String DATE_MODIFIED_PROP = "dateModified";
53 static final String AUTHOR_PROP = "author";
54 static final String CREATOR_PROP = "creator";
55 static final String SECTION_PROP = "articleSection";
56 static final String ASSOCIATED_MEDIA_PROP = "associatedMedia";
57 static final String ENCODING_PROP = "encoding";
// Registered by PersonItem.
58 static final String FAMILY_NAME_PROP = "familyName";
59 static final String GIVEN_NAME_PROP = "givenName";
// Registered by OrganizationItem.
60 static final String LEGAL_NAME_PROP = "legalName";
// Value of the "rel" attribute that marks an <a> or <link> element as naming the author.
61 static final String AUTHOR_REL = "author";
62
63 enum Type { // All these types are extended from Thing, directly or indirectly.
64 IMAGE, // http://schema.org/ImageObject
65 ARTICLE, // Article, BlogPosting, NewsArticle, ScholarlyArticle, TechArticle
66 PERSON, // http://schema.org/Person
67 ORGANIZATION, // Organization, Corporation, EducationalOrganization, GovernmentOrganization, NGO
68 UNSUPPORTED, // any "itemtype" value that is not a key of sTypeUrls
69 }
70
71 static class ThingItem {
72 private final Type mType;
73 private final Map<String, String> mStringProperties;
74 private final Map<String, ThingItem> mItemProperties;
75
76 ThingItem(Type type) {
77 mType = type;
78 mStringProperties = new HashMap<String, String>();
79 mItemProperties = new HashMap<String, ThingItem>();
80
81 addStringPropertyName(NAME_PROP);
82 addStringPropertyName(URL_PROP);
83 addStringPropertyName(DESCRIPTION_PROP);
84 addStringPropertyName(IMAGE_PROP);
85 }
86
87 final void addStringPropertyName(String name) {
88 mStringProperties.put(name, "");
89 }
90
91 final void addItemPropertyName(String name) {
92 mItemProperties.put(name, null);
93 }
94
95 final String getStringProperty(String name) {
96 return !mStringProperties.containsKey(name) ? "" : mStringProperties .get(name);
97 }
98
99 final ThingItem getItemProperty(String name) {
100 return !mItemProperties.containsKey(name) ? null : mItemProperties.g et(name);
101 }
102
103 final Type getType() { return mType; }
104
105 final boolean isSupported() { return mType != Type.UNSUPPORTED; }
106
107 // Store |value| for property with |name|, unless the property already h as a non-empty
108 // value, in which case |value| will be ignored. This means we only kee p the first value.
109 final void putStringValue(String name, String value) {
110 if (mStringProperties.containsKey(name) && mStringProperties.get(nam e).isEmpty()) {
111 mStringProperties.put(name, value);
112 }
113 }
114
115 // Store |value| for property with |name|, unless the property already h as a non-null value,
116 // in which case, |value| will be ignored. This means we only keep the first value.
117 final void putItemValue(String name, ThingItem value) {
118 if (mItemProperties.containsKey(name)) mItemProperties.put(name, val ue);
119 }
120 }
121
// Items collected by parse().
private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>();
// Author text taken from the first rel="author" element that yields a non-empty value.
private String mAuthorFromRel = "";
// Maps an "itemtype" URL to the Type used to represent it.
private static final Map<String, Type> sTypeUrls;

static {
    Map<String, Type> urls = new HashMap<String, Type>();

    // Images.
    urls.put("http://schema.org/ImageObject", Type.IMAGE);

    // Article and the more specific article types that are treated alike.
    urls.put("http://schema.org/Article", Type.ARTICLE);
    urls.put("http://schema.org/BlogPosting", Type.ARTICLE);
    urls.put("http://schema.org/NewsArticle", Type.ARTICLE);
    urls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE);
    urls.put("http://schema.org/TechArticle", Type.ARTICLE);

    // People.
    urls.put("http://schema.org/Person", Type.PERSON);

    // Organization and the more specific organization types that are treated alike.
    urls.put("http://schema.org/Organization", Type.ORGANIZATION);
    urls.put("http://schema.org/Corporation", Type.ORGANIZATION);
    urls.put("http://schema.org/EducationalOrganization", Type.ORGANIZATION);
    urls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZATION);
    urls.put("http://schema.org/NGO", Type.ORGANIZATION);

    sTypeUrls = urls;
}
141
/**
 * Creates a parser and immediately walks |root| and all of its descendant elements, collecting
 * the Schema.org items found along the way.
 */
public SchemaOrgParser(Element root) {
    // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?
    // A lazy parse would require every get* method to check whether parsing has happened yet
    // and, if not, to parse before returning the requested properties.
    // |root| itself is parsed too (not just its children), because the <html> element can be
    // the start of a Schema.org item.
    parse(root, null);
}
153
154 final List<ArticleItem> getArticleItems() {
155 List<ArticleItem> articles = new ArrayList<ArticleItem>();
156 for (int i = 0; i < mItemScopes.size(); i++) {
157 ThingItem item = mItemScopes.get(i);
158 if (item.mType == Type.ARTICLE) articles.add((ArticleItem) item);
159 }
160 return articles;
161 }
162
163 final List<ImageItem> getImageItems() {
164 List<ImageItem> images = new ArrayList<ImageItem>();
165 for (int i = 0; i < mItemScopes.size(); i++) {
166 ThingItem item = mItemScopes.get(i);
167 if (item.mType == Type.IMAGE) images.add((ImageItem) item);
168 }
169 return images;
170 }
171
172 final String getAuthorFromRel() { return mAuthorFromRel; }
173
174 private void parse(Element e, ThingItem parentItem) {
175 ThingItem newItem = null;
176 boolean isItemScope = isItemScope(e);
177 // A non-null |parentItem| means we're currently parsing the elements fo r a schema.org type.
178 String[] propertyNames = parentItem != null ? getItemProp(e) : new Strin g[0];
179
180 if (isItemScope) {
181 // The "itemscope" and "itemtype" attributes of |e| indicate the sta rt of an item.
182 // Create the corresponding extended-ThingItem, and add it to the li st if:
183 // 1) its type is supported, and
184 // 2) if the parent is an unsupported type, it's not an "itemprop" a ttribute of the
185 // parent, based on the rule that an item is a top-level item if its element doesn't
186 // have an itemprop attribute.
187 newItem = createItemForElement(e);
188 if (newItem != null && newItem.isSupported() &&
189 (parentItem == null || parentItem.isSupported() || propertyNames .length == 0)) {
190 mItemScopes.add(newItem);
191 }
192 }
193
194 // If parent is a supported type, parse the element for >= 1 properties in "itemprop"
195 // attribute.
196 if (propertyNames.length > 0 && parentItem.isSupported() &&
197 (newItem == null || newItem.isSupported())) {
198 for (int i = 0; i < propertyNames.length; i++) {
199 // If a new item was created above, the property value of this " itemprop" attribute
200 // is an embedded item, so add it to the parent item.
201 if (newItem != null) {
202 parentItem.putItemValue(propertyNames[i], newItem);
203 } else {
204 // Otherwise, extract the property value from the tag itself, and add it to the
205 // parent item.
206 parentItem.putStringValue(propertyNames[i], getPropertyValue( e));
207 }
208 }
209 }
210
211 // As per http://schema.org/author (or http://schema.org/Article and sea rch for "author"
212 // property), if <a> or <link> tags specify rel="author", extract it.
213 if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute (e);
214
215 // Now, parse each child element recursively.
216 NodeList<Node> children = e.getChildNodes();
217 for (int i = 0; i < children.getLength(); i++) {
218 Node child = children.getItem(i);
219 if (child.getNodeType() != Node.ELEMENT_NODE) continue;
220 parse(Element.as(child), newItem != null ? newItem : parentItem);
221 }
222 }
223
224 private Type getItemType(Element e) {
225 // "itemtype" attribute is case-sensitive.
226 String type = e.getAttribute("ITEMTYPE");
227 return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPOR TED;
228 }
229
230 private ThingItem createItemForElement(Element e) {
231 ThingItem newItem = null;
232 Type type = getItemType(e);
233 switch (type) {
234 case IMAGE:
235 newItem = new ImageItem();
236 break;
237 case ARTICLE:
238 newItem = new ArticleItem();
239 break;
240 case PERSON:
241 newItem = new PersonItem();
242 break;
243 case ORGANIZATION:
244 newItem = new OrganizationItem();
245 break;
246 case UNSUPPORTED:
247 newItem = new UnsupportedItem();
248 break;
249 default:
250 return null;
251 }
252 return newItem;
253 }
254
255 static class ImageItem extends ThingItem {
256 ImageItem() {
257 super(Type.IMAGE);
258
259 addStringPropertyName(CONTENT_URL_PROP);
260 addStringPropertyName(ENCODING_FORMAT_PROP);
261 addStringPropertyName(CAPTION_PROP);
262 addStringPropertyName(REPRESENTATIVE_PROP);
263 addStringPropertyName(WIDTH_PROP);
264 addStringPropertyName(HEIGHT_PROP);
265 }
266
267 final boolean isRepresentativeOfPage() {
268 return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true ");
269 }
270
271 final MarkupParser.Image getImage() {
272 MarkupParser.Image image = new MarkupParser.Image();
273 image.image = getStringProperty(CONTENT_URL_PROP);
274 if (image.image.isEmpty()) image.image = getStringProperty(URL_PROP) ;
275 image.url = image.image;
276 image.type = getStringProperty(ENCODING_FORMAT_PROP);
277 image.caption = getStringProperty(CAPTION_PROP);
278 try {
279 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 );
280 } catch (Exception e) {
281 }
282 try {
283 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
284 } catch (Exception e) {
285 }
286 return image;
287 }
288 }
289
290 static class ArticleItem extends ThingItem {
291 ArticleItem() {
292 super(Type.ARTICLE);
293
294 addStringPropertyName(HEADLINE_PROP);
295 addStringPropertyName(PUBLISHER_PROP);
296 addStringPropertyName(COPYRIGHT_HOLDER_PROP);
297 addStringPropertyName(COPYRIGHT_YEAR_PROP);
298 addStringPropertyName(DATE_MODIFIED_PROP);
299 addStringPropertyName(DATE_PUBLISHED_PROP);
300 addStringPropertyName(AUTHOR_PROP);
301 addStringPropertyName(CREATOR_PROP);
302 addStringPropertyName(SECTION_PROP);
303
304 addItemPropertyName(PUBLISHER_PROP);
305 addItemPropertyName(COPYRIGHT_HOLDER_PROP);
306 addItemPropertyName(AUTHOR_PROP);
307 addItemPropertyName(CREATOR_PROP);
308 addItemPropertyName(ASSOCIATED_MEDIA_PROP);
309 addItemPropertyName(ENCODING_PROP);
310 }
311
312 final MarkupParser.Article getArticle() {
313 MarkupParser.Article article = new MarkupParser.Article();
314 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);
315 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);
316 article.section = getStringProperty(SECTION_PROP);
317 String author = getPersonOrOrganizationName(AUTHOR_PROP);
318 if (author.isEmpty()) author = getPersonOrOrganizationName(CREATOR_P ROP);
319 article.authors = author.isEmpty() ? new String[0] : new String[] { author };
320 return article;
321 }
322
323 final String getCopyright() {
324 // Returns a concatenated string of copyright year and copyright hol der of the article,
325 // delimited by a whitespace.
326 String copyright = concat(getStringProperty(COPYRIGHT_YEAR_PROP),
327 getPersonOrOrganizationName(COPYRIGHT_HOLD ER_PROP));
328 return copyright.isEmpty() ? copyright : "Copyright " + copyright;
329 }
330
331 final String getPersonOrOrganizationName(String propertyName) {
332 // Returns either the string value of |propertyName| or the value re turned by getName()
333 // of PersonItem or OrganizationItem.
334 String value = getStringProperty(propertyName);
335 if (!value.isEmpty()) return value;
336
337 ThingItem valueItem = getItemProperty(propertyName);
338 if (valueItem != null) {
339 if (valueItem.getType() == Type.PERSON) {
340 value = ((PersonItem) valueItem).getName();
341 } else if (valueItem.getType() == Type.ORGANIZATION) {
342 value = ((OrganizationItem) valueItem).getName();
343 }
344 }
345 return value;
346 }
347
348 final ImageItem getRepresentativeImageItem() {
349 // Returns the corrresponding ImageItem for "associatedMedia" or "en coding" property.
350 ThingItem imageItem = getItemProperty(ASSOCIATED_MEDIA_PROP);
351 if (imageItem == null) imageItem = getItemProperty(ENCODING_PROP);
352 return imageItem != null && imageItem.getType() == Type.IMAGE ?
353 (ImageItem) imageItem : null;
354 }
355
356 final MarkupParser.Image getImage() {
357 // Use value of "image" property to create a MarkupParser.Image.
358 String imageUrl = getStringProperty(IMAGE_PROP);
359 if (imageUrl.isEmpty()) return null;
360 MarkupParser.Image image = new MarkupParser.Image();
361 image.image = imageUrl;
362 image.url = imageUrl;
363 return image;
364 }
365 }
366
367 private static class PersonItem extends ThingItem {
368 PersonItem() {
369 super(Type.PERSON);
370
371 addStringPropertyName(FAMILY_NAME_PROP);
372 addStringPropertyName(GIVEN_NAME_PROP);
373 }
374
375 String getName() {
376 // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and
377 // FAMILY_NAME_PROP delimited by a whitespace.
378 String name = getStringProperty(NAME_PROP);
379 return !name.isEmpty() ? name :
380 concat(getStringProperty(GIVEN_NAME_PROP), getStringProperty (FAMILY_NAME_PROP));
381 }
382 }
383
384 private static class OrganizationItem extends ThingItem {
385 OrganizationItem() {
386 super(Type.ORGANIZATION);
387
388 addStringPropertyName(LEGAL_NAME_PROP);
389 }
390
391 String getName() {
392 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.
393 String name = getStringProperty(NAME_PROP);
394 return !name.isEmpty() ? name : getStringProperty(LEGAL_NAME_PROP);
395 }
396 }
397
398 private static class UnsupportedItem extends ThingItem {
399 UnsupportedItem() {
400 super(Type.UNSUPPORTED);
401 }
402 }
403
404 private static boolean isItemScope(Element e) {
405 return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE");
406 }
407
408 private static String[] getItemProp(Element e) {
409 // "itemprop" attribute is case-sensitive, and can have multiple propert ies.
410 String itemprop = e.getAttribute("ITEMPROP");
411 if (itemprop.isEmpty()) return new String[0];
412 String[] splits = StringUtil.split(itemprop, "\\s+");
413 return splits.length > 0 ? splits : new String[] { itemprop };
414 }
415
// Maps a tag name to the name of the attribute that holds the property value for elements of
// that tag.  Tags that are not listed here have no such attribute.
private static final Map<String, String> sTagAttributeMap;

static {
    Map<String, String> attributes = new HashMap<String, String>();

    // Value is in "src".
    attributes.put("IMG", "SRC");
    attributes.put("AUDIO", "SRC");
    attributes.put("EMBED", "SRC");
    attributes.put("IFRAME", "SRC");
    attributes.put("SOURCE", "SRC");
    attributes.put("TRACK", "SRC");
    attributes.put("VIDEO", "SRC");

    // Value is in "href".
    attributes.put("A", "HREF");
    attributes.put("LINK", "HREF");
    attributes.put("AREA", "HREF");

    // Value is in a tag-specific attribute.
    attributes.put("META", "CONTENT");
    attributes.put("TIME", "DATETIME");
    attributes.put("OBJECT", "DATA");
    attributes.put("DATA", "VALUE");
    attributes.put("METER", "VALUE");

    sTagAttributeMap = attributes;
}
440
441 // Extracts the property value from |e|. For some tags, the value is a spec ific attribute,
442 // while for others, it's the text between the start and end tags.
443 private static String getPropertyValue(Element e) {
444 String value = "";
445 String tagName = e.getTagName();
446 if (sTagAttributeMap.containsKey(tagName)) {
447 value = e.getAttribute(sTagAttributeMap.get(tagName));
448 }
449 if (value.isEmpty()) value = e.getInnerText();
450 return value;
451 }
452
453 // Extracts the author property from the "rel=author" attribute of an anchor or a link element.
454 private static String getAuthorFromRelAttribute(Element e) {
455 String author = "";
456 String tagName = e.getTagName();
457 if ((tagName.equalsIgnoreCase("A") || tagName.equalsIgnoreCase("LINK")) &&
458 e.getAttribute("REL").equalsIgnoreCase(AUTHOR_REL)) {
459 author = e.getInnerText();
460 }
461 return author;
462 }
463
464 private static String concat(String first, String second) {
465 String concat = first;
466 if (!concat.isEmpty() && !second.isEmpty()) concat += " ";
467 concat += second;
468 return concat;
469 }
470 }
OLDNEW
« no previous file with comments | « src/com/dom_distiller/client/MarkupParser.java ('k') | src/com/dom_distiller/client/SchemaOrgParserAccessor.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698