Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(814)

Side by Side Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: rm 1 more unused prop in image Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package com.dom_distiller.client;
6
7 import java.util.ArrayList;
8 import java.util.HashMap;
9 import java.util.List;
10 import java.util.Map;
11
12 import com.google.gwt.dom.client.AnchorElement;
13 import com.google.gwt.dom.client.Element;
14 import com.google.gwt.dom.client.ImageElement;
15 import com.google.gwt.dom.client.MetaElement;
16 import com.google.gwt.dom.client.Node;
17 import com.google.gwt.dom.client.NodeList;
18
19 /**
20 * This class recognizes and parses schema.org markup tags, and returns the properties that matter
21 * to distilled content.
22 * Schema.org markup (http://schema.org) is based on the microdata format
23 * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html).
24 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.
25 * In addition, for each type that we support, we also parse more specific properties:
26 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,
27 * date modified, author, article section
28 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,
29 * encoding format, caption, representative of page, width, height
30 * - Person: family name, given name
31 * - Organization: legal name.
32 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or
33 * publisher of article or publisher of image could be a Schema.Org Person or Organization type;
34 * in fact, this is the reason we support Person and Organization types.
35 */
36 public class SchemaOrgParser {
37 static final String NAME_PROP = "name";
38 static final String URL_PROP = "url";
39 static final String DESCRIPTION_PROP = "description";
40 static final String IMAGE_PROP = "image";
41 static final String HEADLINE_PROP = "headline";
42 static final String PUBLISHER_PROP = "publisher";
43 static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";
44 static final String COPYRIGHT_YEAR_PROP = "copyrightYear";
45 static final String CONTENT_URL_PROP = "contentUrl";
46 static final String ENCODING_FORMAT_PROP = "encodingFormat";
47 static final String CAPTION_PROP = "caption";
48 static final String REPRESENTATIVE_PROP = "representativeOfPage";
49 static final String WIDTH_PROP = "width";
50 static final String HEIGHT_PROP = "height";
51 static final String DATE_PUBLISHED_PROP = "datePublished";
52 static final String DATE_MODIFIED_PROP = "dateModified";
53 static final String AUTHOR_PROP = "author";
54 static final String CREATOR_PROP = "creator";
55 static final String SECTION_PROP = "articleSection";
56 static final String ASSOCIATED_MEDIA_PROP = "associatedMedia";
57 static final String ENCODING_PROP = "encoding";
58 static final String FAMILY_NAME_PROP = "familyName";
59 static final String GIVEN_NAME_PROP = "givenName";
60 static final String LEGAL_NAME_PROP = "legalName";
61 static final String AUTHOR_REL = "author";
62
/**
 * The kinds of Schema.org items this parser distinguishes.  All these types are extended from
 * Thing, directly or indirectly.
 */
enum Type {
    /** Items created as ImageItem. */
    IMAGE,
    /** Items created as ArticleItem. */
    ARTICLE,
    /** Items created as PersonItem. */
    PERSON,
    /** Items created as OrganizationItem. */
    ORGANIZATION,
    /** Any item whose "itemtype" is not in the table of recognized type URLs. */
    UNSUPPORTED,
}
70
71 static class ThingItem {
72 private final Type mType;
73 private final Map<String, String> mStringProperties;
74 private final Map<String, ThingItem> mItemProperties;
75
76 ThingItem(Type type) {
77 mType = type;
78 mStringProperties = new HashMap<String, String>();
79 mItemProperties = new HashMap<String, ThingItem>();
80
81 addStringPropertyName(NAME_PROP);
82 addStringPropertyName(URL_PROP);
83 addStringPropertyName(DESCRIPTION_PROP);
84 addStringPropertyName(IMAGE_PROP);
85 }
86
87 final void addStringPropertyName(String name) {
88 mStringProperties.put(name, "");
89 }
90
91 final void addItemPropertyName(String name) {
92 mItemProperties.put(name, null);
93 }
94
95 final String getStringProperty(String name) {
96 return !mStringProperties.containsKey(name) ? "" : mStringProperties .get(name);
97 }
98
99 final ThingItem getItemProperty(String name) {
100 return !mItemProperties.containsKey(name) ? null : mItemProperties.g et(name);
101 }
102
103 final Type getType() { return mType; }
104
105 final boolean isSupported() { return mType != Type.UNSUPPORTED; }
106
107 // Store |value| for property with |name|, unless the property already h as a non-empty
108 // value, in which case |value| will be ignored. This means we only kee p the first value.
109 final void putStringValue(String name, String value) {
110 if (mStringProperties.containsKey(name) && mStringProperties.get(nam e).isEmpty()) {
111 mStringProperties.put(name, value);
112 }
113 }
114
115 // Store |value| for property with |name|, unless the property already h as a non-null value,
116 // in which case, |value| will be ignored. This means we only keep the first value.
117 final void putItemValue(String name, ThingItem value) {
118 if (mItemProperties.containsKey(name)) mItemProperties.put(name, val ue);
119 }
120 }
121
// Supported items collected by parse(), in the order they were encountered.
private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>();
// Author text taken from a rel="author" tag; stays empty until one yields text.
private String mAuthorFromRel = "";
// Maps the exact "itemtype" URL to the item type it is parsed as.
private static final Map<String, Type> sTypeUrls;

static {
    final String[] articleUrls = {
        "http://schema.org/Article",
        "http://schema.org/BlogPosting",
        "http://schema.org/NewsArticle",
        "http://schema.org/ScholarlyArticle",
        "http://schema.org/TechArticle",
    };
    final String[] organizationUrls = {
        "http://schema.org/Organization",
        "http://schema.org/Corporation",
        "http://schema.org/EducationalOrganization",
        "http://schema.org/GovernmentOrganization",
        "http://schema.org/NGO",
    };

    Map<String, Type> urls = new HashMap<String, Type>();
    urls.put("http://schema.org/ImageObject", Type.IMAGE);
    for (String url : articleUrls) {
        urls.put(url, Type.ARTICLE);
    }
    urls.put("http://schema.org/Person", Type.PERSON);
    for (String url : organizationUrls) {
        urls.put(url, Type.ORGANIZATION);
    }
    sTypeUrls = urls;
}
141
142 /**
143 * The object that extracts and verifies Schema.org markup tags from |root|.
144 */
145 public SchemaOrgParser(Element root) {
146 // TODO(kuan): Parsing all tags is pretty expensive, should we do so onl y lazily?
147 // If parse lazily, all get* methods will need to check for parsed state and, if necessary,
148 // parse before returning the requested properties.
149 // Note that the <html> element can also be the start of a Schema.org it em, and hence needs
150 // to be parsed.
151 parse(root, null);
152 }
153
154 final ArticleItem findFirstArticle() {
155 for (int i = 0; i < mItemScopes.size(); i++) {
156 ThingItem item = mItemScopes.get(i);
157 if (item.mType == Type.ARTICLE) return (ArticleItem) item;
158 }
159 return null;
160 }
161
162 final List<ThingItem> getItemScopes() { return mItemScopes; }
163
164 final String getAuthorFromRel() { return mAuthorFromRel; }
165
166 static String concat(String first, String second) {
167 String concat = first;
168 if (!concat.isEmpty() && !second.isEmpty()) concat += " ";
169 concat += second;
170 return concat;
171 }
172
173 private void parse(Element e, ThingItem parentItem) {
174 ThingItem newItem = null;
175 boolean isItemScope = isItemScope(e);
176 // A non-null |parentItem| means we're currently parsing the elements fo r a schema.org type.
177 String[] propertyNames = parentItem != null ? getItemProp(e) : new Strin g[0];
178
179 if (isItemScope) {
180 // The "itemscope" and "itemtype" attributes of |e| indicate the sta rt of an item.
181 // Create the corresponding extended-ThingItem, and add it to the li st if:
182 // 1) its type is supported, and
183 // 2) if the parent is an unsupported type, it's not an "itemprop" a ttribute of the
184 // parent, based on the rule that an item is a top-level item if its element doesn't
185 // have an itemprop attribute.
186 newItem = createItemForElement(e);
187 if (newItem != null && newItem.isSupported() &&
188 (parentItem == null || parentItem.isSupported() || propertyNames .length == 0)) {
189 mItemScopes.add(newItem);
190 }
191 }
192
193 // If parent is a supported type, parse the element for >= 1 properties in "itemprop"
194 // attribute.
195 if (propertyNames.length > 0 && parentItem.isSupported() &&
196 (newItem == null || newItem.isSupported())) {
197 for (int i = 0; i < propertyNames.length; i++) {
198 // If a new item was created above, the property value of this " itemprop" attribute
199 // is an embedded item, so add it to the parent item.
200 if (newItem != null) {
201 parentItem.putItemValue(propertyNames[i], newItem);
202 } else {
203 // Otherwise, extract the property value from the tag itself, and add it to the
204 // parent item.
205 parentItem.putStringValue(propertyNames[i], getPropertyValue( e));
206 }
207 }
208 }
209
210 // If <a> or <link> tags specify rel="author", extract it.
211 if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute (e);
212
213 // Now, parse each child element recursively.
214 NodeList<Node> children = e.getChildNodes();
215 for (int i = 0; i < children.getLength(); i++) {
216 Node child = children.getItem(i);
217 if (child.getNodeType() != Node.ELEMENT_NODE) continue;
218 parse(Element.as(child), newItem != null ? newItem : parentItem);
219 }
220 }
221
222 private Type getItemType(Element e) {
223 // "itemtype" attribute is case-sensitive.
224 String type = e.getAttribute("ITEMTYPE");
225 return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPOR TED;
226 }
227
228 private ThingItem createItemForElement(Element e) {
229 ThingItem newItem = null;
230 Type type = getItemType(e);
231 switch (type) {
232 case IMAGE:
233 newItem = new ImageItem();
234 break;
235 case ARTICLE:
236 newItem = new ArticleItem();
237 break;
238 case PERSON:
239 newItem = new PersonItem();
240 break;
241 case ORGANIZATION:
242 newItem = new OrganizationItem();
243 break;
244 case UNSUPPORTED:
245 newItem = new UnsupportedItem();
246 break;
247 default:
248 return null;
249 }
250 return newItem;
251 }
252
253 static class ImageItem extends ThingItem {
254 ImageItem() {
255 super(Type.IMAGE);
256
257 addStringPropertyName(CONTENT_URL_PROP);
258 addStringPropertyName(ENCODING_FORMAT_PROP);
259 addStringPropertyName(CAPTION_PROP);
260 addStringPropertyName(REPRESENTATIVE_PROP);
261 addStringPropertyName(WIDTH_PROP);
262 addStringPropertyName(HEIGHT_PROP);
263 }
264
265 final boolean isRepresentativeOfPage() {
266 return getStringProperty(REPRESENTATIVE_PROP).equalsIgnoreCase("true ");
267 }
268
269 final MarkupParser.Image getImage() {
270 MarkupParser.Image image = new MarkupParser.Image();
271 image.image = getStringProperty(CONTENT_URL_PROP);
272 if (image.image.isEmpty()) image.image = getStringProperty(URL_PROP) ;
273 image.url = image.image;
274 image.type = getStringProperty(ENCODING_FORMAT_PROP);
275 image.caption = getStringProperty(CAPTION_PROP);
276 try {
277 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 );
278 } catch (Exception e) {
279 }
280 try {
281 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
282 } catch (Exception e) {
283 }
284 return image;
285 }
286 }
287
288 static class ArticleItem extends ThingItem {
289 ArticleItem() {
290 super(Type.ARTICLE);
291
292 addStringPropertyName(HEADLINE_PROP);
293 addStringPropertyName(PUBLISHER_PROP);
294 addStringPropertyName(COPYRIGHT_HOLDER_PROP);
295 addStringPropertyName(COPYRIGHT_YEAR_PROP);
296 addStringPropertyName(DATE_MODIFIED_PROP);
297 addStringPropertyName(DATE_PUBLISHED_PROP);
298 addStringPropertyName(AUTHOR_PROP);
299 addStringPropertyName(CREATOR_PROP);
300 addStringPropertyName(SECTION_PROP);
301
302 addItemPropertyName(PUBLISHER_PROP);
303 addItemPropertyName(COPYRIGHT_HOLDER_PROP);
304 addItemPropertyName(AUTHOR_PROP);
305 addItemPropertyName(CREATOR_PROP);
306 addItemPropertyName(ASSOCIATED_MEDIA_PROP);
307 addItemPropertyName(ENCODING_PROP);
308 }
309
310 final MarkupParser.Article getArticle() {
311 MarkupParser.Article article = new MarkupParser.Article();
312 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);
313 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);
314 article.section = getStringProperty(SECTION_PROP);
315 String author = getPersonOrOrganizationName(AUTHOR_PROP);
316 if (author.isEmpty()) author = getPersonOrOrganizationName(CREATOR_P ROP);
317 article.authors = author.isEmpty() ? new String[0] : new String[] { author };
318 return article;
319 }
320
321 final String getPersonOrOrganizationName(String propertyName) {
322 // Returns either the string value of |propertyName| or the value re turned by getName()
323 // of PersonItem or OrganizationItem.
324 String value = getStringProperty(propertyName);
325 if (!value.isEmpty()) return value;
326
327 ThingItem valueItem = getItemProperty(propertyName);
328 if (valueItem != null) {
329 if (valueItem.getType() == Type.PERSON) {
330 value = ((PersonItem) valueItem).getName();
331 } else if (valueItem.getType() == Type.ORGANIZATION) {
332 value = ((OrganizationItem) valueItem).getName();
333 }
334 }
335 return value;
336 }
337
338 final ImageItem getRepresentativeImageItem() {
339 // Returns the corrresponding ImageItem for "associatedMedia" or "en coding" property.
340 ThingItem imageItem = getItemProperty(ASSOCIATED_MEDIA_PROP);
341 if (imageItem == null) imageItem = getItemProperty(ENCODING_PROP);
342 return imageItem != null && imageItem.getType() == Type.IMAGE ?
343 (ImageItem) imageItem : null;
344 }
345
346 final MarkupParser.Image getImage() {
347 // Use value of "image" property to create a MarkupParser.Image.
348 String imageUrl = getStringProperty(IMAGE_PROP);
349 if (imageUrl.isEmpty()) return null;
350 MarkupParser.Image image = new MarkupParser.Image();
351 image.image = imageUrl;
352 image.url = imageUrl;
353 return image;
354 }
355 }
356
357 private static class PersonItem extends ThingItem {
358 PersonItem() {
359 super(Type.PERSON);
360
361 addStringPropertyName(FAMILY_NAME_PROP);
362 addStringPropertyName(GIVEN_NAME_PROP);
363 }
364
365 String getName() {
366 // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and
367 // FAMILY_NAME_PROP delimited by a whitespace.
368 String name = getStringProperty(NAME_PROP);
369 return !name.isEmpty() ? name :
370 concat(getStringProperty(GIVEN_NAME_PROP), getStringProperty (FAMILY_NAME_PROP));
371 }
372 }
373
374 private static class OrganizationItem extends ThingItem {
375 OrganizationItem() {
376 super(Type.ORGANIZATION);
377
378 addStringPropertyName(LEGAL_NAME_PROP);
379 }
380
381 String getName() {
382 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.
383 String name = getStringProperty(NAME_PROP);
384 return !name.isEmpty() ? name : getStringProperty(LEGAL_NAME_PROP);
385 }
386 }
387
388 private static class UnsupportedItem extends ThingItem {
389 UnsupportedItem() {
390 super(Type.UNSUPPORTED);
391 }
392 }
393
394 private static boolean isItemScope(Element e) {
395 return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE");
396 }
397
398 private static String[] getItemProp(Element e) {
399 // "itemprop" attribute is case-sensitive, and can have multiple propert ies.
400 String itemprop = e.getAttribute("ITEMPROP");
401 if (itemprop.isEmpty()) return new String[0];
402 String[] splits = StringUtil.split(itemprop, "\\s+");
403 return splits.length > 0 ? splits : new String[] { itemprop };
404 }
405
// Keyed by tag name.  Each value is the array of attribute names to read from that tag:
// - entry 0: the attribute holding the value of the property named in "itemprop"
// - entry 1: present only for some tags; the attribute checked for the author relation.
private static final Map<String, String[]> sTagAttributesMap;

static {
    sTagAttributesMap = new HashMap<String, String[]>();

    final String[] srcTags = { "IMG", "AUDIO", "EMBED", "IFRAME", "SOURCE", "TRACK", "VIDEO" };
    for (String tag : srcTags) {
        sTagAttributesMap.put(tag, new String[] { "SRC" });
    }

    final String[] hrefAndRelTags = { "A", "LINK" };
    for (String tag : hrefAndRelTags) {
        sTagAttributesMap.put(tag, new String[] { "HREF", "REL" });
    }
    // Review thread recorded at this point (both comments are truncated in the capture):
    //   cjhopman 2014/04/29 17:04:19: "I think this would be clearer if this was just a m"
    //   kuan 2014/04/29 23:26:43: "Done. hehe.. back then when i changed to include"

    sTagAttributesMap.put("AREA", new String[] { "HREF" });
    sTagAttributesMap.put("META", new String[] { "CONTENT" });
    sTagAttributesMap.put("TIME", new String[] { "DATETIME" });
    sTagAttributesMap.put("OBJECT", new String[] { "DATA" });

    final String[] valueTags = { "DATA", "METER" };
    for (String tag : valueTags) {
        sTagAttributesMap.put(tag, new String[] { "VALUE" });
    }
}
430
431 // Extracts the property value from |e|. For some tags, the value is a spec ific attribute,
432 // while for others, it's the text between the start and end tags.
433 private static String getPropertyValue(Element e) {
434 String value = "";
435 String tagName = e.getTagName();
436 if (sTagAttributesMap.containsKey(tagName)) {
437 value = e.getAttribute(sTagAttributesMap.get(tagName)[0]);
438 }
439 if (value.isEmpty()) value = e.getInnerText();
440 return value;
441 }
442
443 // Extracts the author property from |e|'s "rel=author" attribute.
444 private static String getAuthorFromRelAttribute(Element e) {
445 String author = "";
446 String tagName = e.getTagName();
447 if (sTagAttributesMap.containsKey(tagName)) {
448 String[] attrs = sTagAttributesMap.get(tagName);
449 if (attrs.length > 1 && e.getAttribute(attrs[1]).equals(AUTHOR_REL)) {
450 author = e.getInnerText();
451 }
452 }
453 return author;
454 }
455 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698