Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: fine-tune prev bug fix Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package com.dom_distiller.client;
6
7 import java.util.ArrayList;
8 import java.util.HashMap;
9 import java.util.List;
10 import java.util.Map;
11
12 import com.google.gwt.dom.client.AnchorElement;
13 import com.google.gwt.dom.client.Element;
14 import com.google.gwt.dom.client.ImageElement;
15 import com.google.gwt.dom.client.MetaElement;
16 import com.google.gwt.dom.client.Node;
17 import com.google.gwt.dom.client.NodeList;
18
19 /**
20 * This class recognizes and parses schema.org markup tags, and returns the properties that matter
21 * to distilled content.
22 * Schema.org markup (http://schema.org) is based on the microdata format
23 * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html).
24 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.
25 * In addition, for each type that we support, we also parse more specific properties:
26 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,
27 * date modified, author, article section
28 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,
29 * encoding format, caption, representative of page, width, height
30 * - Person: family name, given name
31 * - Organization: legal name.
32 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or
33 * publisher of article or publisher of image could be a Schema.Org Person or Organization type;
34 * in fact, this is the reason we support Person and Organization types.
35 */
36 public class SchemaOrgParser implements MarkupParser.Parser {
cjhopman 2014/04/21 16:52:22 Can we split this class into two parts: 1. A Sche
kuan 2014/04/23 15:32:36 Done.
37 private static final String NAME_PROP = "name";
38 private static final String URL_PROP = "url";
39 private static final String DESCRIPTION_PROP = "description";
40 private static final String IMAGE_PROP = "image";
41 private static final String HEADLINE_PROP = "headline";
42 private static final String PUBLISHER_PROP = "publisher";
43 private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";
44 private static final String COPYRIGHT_YEAR_PROP = "copyrightYear";
45 private static final String CONTENT_URL_PROP = "contentUrl";
46 private static final String ENCODING_FORMAT_PROP = "encodingFormat";
47 private static final String CAPTION_PROP = "caption";
48 private static final String REPRESENTATIVE_PROP = "representativeOfPage";
49 private static final String WIDTH_PROP = "width";
50 private static final String HEIGHT_PROP = "height";
51 private static final String DATE_PUBLISHED_PROP = "datePublished";
52 private static final String DATE_MODIFIED_PROP = "dateModified";
53 private static final String AUTHOR_PROP = "author";
54 private static final String SECTION_PROP = "articleSection";
55 private static final String FAMILY_NAME_PROP = "familyName";
56 private static final String GIVEN_NAME_PROP = "givenName";
57 private static final String LEGAL_NAME_PROP = "legalName";
58 private static final String AUTHOR_REL = "author";
59
60 private enum Type { // All these types are extended from Thing, directly or indirectly.
61 IMAGE,
62 ARTICLE,
63 PERSON,
64 ORGANIZATION,
65 UNSUPPORTED,
66 }
67
68 private static class ThingItem {
69 private final Type mType;
70 private final String[] mStringPropertyNames;
71 private final String[] mItemPropertyNames;
72 private final String[] mStringProperties;
73 private final ThingItem[] mItemProperties;
74
75 ThingItem(Type type, String[] stringPropertyNames, String[] itemProperty Names) {
76 mType = type;
77 mStringPropertyNames = stringPropertyNames;
78 mItemPropertyNames = itemPropertyNames;
79 mStringProperties = new String[mStringPropertyNames.length];
80 mItemProperties = new ThingItem[mItemPropertyNames.length];
81 }
82
83 String toStringProperty() {
84 return "";
85 }
86
87 MarkupParser.Image getImage() {
88 // Use value of IMAGE_PROP to create a MarkupParser.Image.
89 String imageUrl = getStringProperty(IMAGE_PROP);
90 if (imageUrl.isEmpty()) return null;
91 MarkupParser.Image image = new MarkupParser.Image();
92 image.image = imageUrl;
93 image.url = imageUrl;
94 return image;
95 }
96
97 MarkupParser.Article getArticle() {
98 return null;
99 }
100
101 final boolean isSupported() { return mType != Type.UNSUPPORTED; }
102
103 final boolean isImageRepresentativeOfPage() {
104 String value = getStringProperty(REPRESENTATIVE_PROP);
105 return value.equalsIgnoreCase("true");
106 }
107
108 // Store |value| for property with |name|, unless the property already h as a non-empty
109 // value, in which case, |value| will be ignored. This means we only ke ep the first value.
110 final void putStringValue(String name, String value) {
111 for (int i = 0; i < mStringPropertyNames.length; i++) {
112 if (name.equals(mStringPropertyNames[i])) {
113 String existing = mStringProperties[i];
114 if (existing == null || existing.isEmpty()) mStringPropertie s[i] = value;
115 break;
116 }
117 }
118 }
119
120 // Store |value| for property with |name|, unless the property already h as a non-null value,
121 // in which case, |value| will be ignored. This means we only keep the first value.
122 final void putItemValue(String name, ThingItem value) {
123 for (int i = 0; i < mItemPropertyNames.length; i++) {
124 if (name.equals(mItemPropertyNames[i])) {
125 if (mItemProperties[i] == null) mItemProperties[i] = value;
126 break;
127 }
128 }
129 }
130
131 final String getStringProperty(String name) {
132 // Check if property exists in |mStringProperties|.
133 for (int i = 0; i < mStringPropertyNames.length; i++) {
134 if (name.equals(mStringPropertyNames[i])) {
135 String value = mStringProperties[i];
136 if (value != null && !value.isEmpty()) return value;
137 break;
138 }
139 }
140 // Otherwise, repeat for |mItemProperties|.
141 for (int i = 0; i < mItemPropertyNames.length; i++) {
142 if (!name.equals(mItemPropertyNames[i])) continue;
143 if (mItemProperties[i] != null) return mItemProperties[i].toStri ngProperty();
144 break;
145 }
146 return "";
147 }
148 }
149
150 private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>();
151 private String mAuthorFromRel = "";
152 private static final Map<String, Type> sTypeUrls;
153 private static final Map<String, String[]> sTagAttributesMap;
154 private static final String[] sEmptyPropertyNames = {
155 // Intentionally empty, declared so that it's initialized statically.
156 };
157
158 static {
159 sTypeUrls = new HashMap<String, Type>();
160 sTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE);
161 sTypeUrls.put("http://schema.org/Article", Type.ARTICLE);
cjhopman 2014/04/21 16:52:22 We should probably recognize schema.org/NewsArticl
kuan 2014/04/23 15:32:36 Done.
162 sTypeUrls.put("http://schema.org/Person", Type.PERSON);
163 sTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION);
cjhopman 2014/04/21 16:52:22 There are a whole bunch of subtypes of Organizatio
kuan 2014/04/23 15:32:36 i added more subtypes, but i'm not sure if they're
164
165 // The key for |sTagAttributesMap| is the tag name, while the entry value is an array of
166 // attributes in the specified tag from which to extract information:
167 // - 0th attribute: contains the value for the property specified in itemprop
168 // - 1st attribute: if available, contains the value for the author property.
169 sTagAttributesMap = new HashMap<String, String[]>();
170 sTagAttributesMap.put("IMG", new String[] { "SRC" });
171 sTagAttributesMap.put("AUDIO", new String[] { "SRC" });
172 sTagAttributesMap.put("EMBED", new String[] { "SRC" });
173 sTagAttributesMap.put("IFRAME", new String[] { "SRC" });
174 sTagAttributesMap.put("SOURCE", new String[] { "SRC" });
175 sTagAttributesMap.put("TRACK", new String[] { "SRC" });
176 sTagAttributesMap.put("VIDEO", new String[] { "SRC" });
177 sTagAttributesMap.put("A", new String[] { "HREF", "REL" });
178 sTagAttributesMap.put("LINK", new String[] { "HREF", "REL" });
179 sTagAttributesMap.put("AREA", new String[] { "HREF" });
180 sTagAttributesMap.put("META", new String[] { "CONTENT" });
181 sTagAttributesMap.put("TIME", new String[] { "DATETIME" });
182 sTagAttributesMap.put("OBJECT", new String[] { "DATA" });
183 sTagAttributesMap.put("DATA", new String[] { "VALUE" });
184 sTagAttributesMap.put("METER", new String[] { "VALUE" });
185 }
186
/**
 * Creates a parser and immediately extracts the Schema.org markup found under |root|.
 */
public SchemaOrgParser(Element root) {
    // TODO(kuan): Walking every tag is fairly costly; consider parsing lazily. If so, each
    // get* method would have to check whether parsing has happened and trigger it if not
    // before returning the requested properties.
    // The <html> element itself may start a Schema.org item, so |root| is parsed as well,
    // with no parent item.
    parse(root, null);
}
198
199 @Override
200 public String getTitle() {
201 String title = findStringProperty(HEADLINE_PROP);
202 if (title.isEmpty()) title = findStringProperty(NAME_PROP);
cjhopman 2014/04/21 16:52:22 This seems to mean that we will get the first item
kuan 2014/04/23 15:32:36 Done.
203 return title;
204 }
205
206 @Override
207 public String getType() {
208 if (mItemScopes.isEmpty()) return "";
209 // Assume the type of the first item is the page type.
210 return mItemScopes.get(0).mType.toString();
cjhopman 2014/04/21 16:52:22 Does schema.org specify that this is a good way to
kuan 2014/04/23 15:32:36 Done. for the record, as per our off-line discuss
211 }
212
213 @Override
214 public String getUrl() {
215 return findStringProperty(URL_PROP);
cjhopman 2014/04/21 16:52:22 I don't really understand what we expect the url f
kuan 2014/04/23 15:32:36 Done.
216 }
217
218 @Override
219 public MarkupParser.Image[] getImages() {
cjhopman 2014/04/21 16:52:22 We should be careful about what this returns, we o
kuan 2014/04/23 15:32:36 Done. per our discussion off-line, i won't impl t
220 if (mItemScopes.isEmpty()) return null;
221 List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();
222 for (int i = 0; i < mItemScopes.size(); i++) {
223 ThingItem item = mItemScopes.get(i);
224 MarkupParser.Image image = item.getImage();
225 if (image != null) {
226 if (item.isImageRepresentativeOfPage()) {
227 // Image should be the dominant, i.e. first, one.
228 images.add(0, image);
229 } else {
230 images.add(image);
231 }
232 }
233 }
234 if (images.isEmpty()) return null;
235 return images.toArray(new MarkupParser.Image[images.size()]);
236 }
237
238 @Override
239 public String getDescription() {
240 return findStringProperty(DESCRIPTION_PROP);
cjhopman 2014/04/21 16:52:22 Again, this should probably only be the descriptio
kuan 2014/04/23 15:32:36 Done.
241 }
242
243 @Override
244 public String getPublisher() {
245 return findStringProperty(PUBLISHER_PROP);
cjhopman 2014/04/21 16:52:22 I think we would only want the publisher property
kuan 2014/04/23 15:32:36 Done.
246 }
247
248 @Override
249 public String getCopyright() {
250 if (mItemScopes.isEmpty()) return "";
251 // Returns a concatenated string of copyright year and copyright holder of the first item
252 // that has these properties, delimited by a whitespace.
253 String copyright = "";
254 for (int i = 0; i < mItemScopes.size() && copyright.isEmpty(); i++) {
255 ThingItem item = mItemScopes.get(i);
256 copyright = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP),
257 item.getStringProperty(COPYRIGHT_HOLDER_PROP));
258 }
259 return copyright.isEmpty() ? copyright : "Copyright " + copyright;
260 }
261
262 @Override
263 public String getAuthor() {
264 String author = findStringProperty(AUTHOR_PROP);
265 return author.isEmpty() ? mAuthorFromRel : author;
266 }
267
268 @Override
269 public MarkupParser.Article getArticle() {
270 if (mItemScopes.isEmpty()) return null;
271 // Returns the first article.
272 MarkupParser.Article article = null;
273 for (int i = 0; i < mItemScopes.size() && article == null; i++) {
274 article = mItemScopes.get(i).getArticle();
275 }
276 return article;
277 }
278
279 @Override
280 public boolean optOut() {
281 return false;
282 }
283
284 private void parse(Element e, ThingItem parentItem) {
285 ThingItem newItem = null;
286 boolean isItemScope = isItemScope(e);
287 // A non-null |parentItem| means we're currently parsing the elements fo r a schema.org type.
288 String[] propertyNames = parentItem != null ? getItemProp(e) : new Strin g[0];
289
290 if (isItemScope) {
291 // The "itemscope" and "itemtype" attributes of |e| indicate the sta rt of an item.
292 // Create the corresponding extended-ThingItem, and add it to the li st if:
293 // 1) its type is supported, and
294 // 2) if the parent is an unsupported type, it's not an "itemprop" a ttribute of the
295 // parent, based on the rule that an item is a top-level item if its element doesn't
296 // have an itemprop attribute.
297 newItem = createItemForElement(e);
298 if (newItem != null && newItem.isSupported() &&
299 (parentItem == null || parentItem.isSupported() || propertyNames .length == 0)) {
300 mItemScopes.add(newItem);
301 }
302 }
303
304 // If parent is a supported type, parse the element for >= 1 properties in "itemprop"
305 // attribute.
306 if (propertyNames.length > 0 && parentItem.isSupported() &&
307 (newItem == null || newItem.isSupported())) {
308 for (int i = 0; i < propertyNames.length; i++) {
309 // If a new item was created above, the property value of this " itemprop" attribute
310 // is an embedded item, so add it to the parent item.
311 if (newItem != null) {
312 parentItem.putItemValue(propertyNames[i], newItem);
313 } else {
314 // Otherwise, extract the property value from the tag itself, and add it to the
315 // parent item.
316 parentItem.putStringValue(propertyNames[i], getPropertyValue( e));
317 }
318 }
319 }
320
321 // If <a> or <link> tags specify rel="author", extract it.
cjhopman 2014/04/25 20:52:34 Where does this rel="author" stuff come from? I ca
322 if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute (e);
323
324 // Now, parse each child element recursively.
325 NodeList<Node> children = e.getChildNodes();
326 for (int i = 0; i < children.getLength(); i++) {
327 Node child = children.getItem(i);
328 if (child.getNodeType() != Node.ELEMENT_NODE) continue;
329 parse(Element.as(child), newItem != null ? newItem : parentItem);
330 }
331 }
332
333 private Type getItemType(Element e) {
334 // "itemtype" attribute is case-sensitive.
335 String type = e.getAttribute("ITEMTYPE");
336 return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPOR TED;
337 }
338
339 private ThingItem createItemForElement(Element e) {
340 ThingItem newItem = null;
341 Type type = getItemType(e);
342 switch (type) {
343 case IMAGE:
344 newItem = new ImageItem();
345 break;
346 case ARTICLE:
347 newItem = new ArticleItem();
348 break;
349 case PERSON:
350 newItem = new PersonItem();
351 break;
352 case ORGANIZATION:
353 newItem = new OrganizationItem();
354 break;
355 case UNSUPPORTED:
356 newItem = new UnsupportedItem();
357 break;
358 default:
359 return null;
360 }
361 return newItem;
362 }
363
364 // Returns the first item that has the requested property value.
365 private String findStringProperty(String name) {
366 if (mItemScopes.isEmpty()) return "";
367 for (int i = 0; i < mItemScopes.size(); i++) {
368 String value = mItemScopes.get(i).getStringProperty(name);
369 if (!value.isEmpty()) return value;
370 }
371 return "";
372 }
373
374 private static class ImageItem extends ThingItem {
375 private static final String[] sStringPropertyNames = {
376 NAME_PROP,
377 URL_PROP,
378 DESCRIPTION_PROP,
379 IMAGE_PROP,
380 HEADLINE_PROP,
381 PUBLISHER_PROP,
382 COPYRIGHT_HOLDER_PROP,
383 COPYRIGHT_YEAR_PROP,
384 CONTENT_URL_PROP,
385 ENCODING_FORMAT_PROP,
386 CAPTION_PROP,
387 REPRESENTATIVE_PROP,
388 WIDTH_PROP,
389 HEIGHT_PROP,
390 };
391
392 private static final String[] sItemPropertyNames = {
393 PUBLISHER_PROP,
394 COPYRIGHT_HOLDER_PROP,
395 };
396
397 ImageItem() {
398 super(Type.IMAGE, sStringPropertyNames, sItemPropertyNames);
399 }
400
401 @Override
402 MarkupParser.Image getImage() {
403 MarkupParser.Image image = new MarkupParser.Image();
404 String url = getStringProperty(CONTENT_URL_PROP);
405 image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP);
406 image.url = image.image;
407 image.type = getStringProperty(ENCODING_FORMAT_PROP);
408 image.caption = getStringProperty(CAPTION_PROP);
409 try {
410 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 );
411 } catch (Exception e) {
412 }
413 try {
414 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
415 } catch (Exception e) {
416 }
417 return image;
418 }
419 }
420
421 private static class ArticleItem extends ThingItem {
422 private static final String[] sStringPropertyNames = {
423 NAME_PROP,
424 URL_PROP,
425 DESCRIPTION_PROP,
426 IMAGE_PROP,
427 HEADLINE_PROP,
428 PUBLISHER_PROP,
429 COPYRIGHT_HOLDER_PROP,
430 COPYRIGHT_YEAR_PROP,
431 DATE_MODIFIED_PROP,
432 DATE_PUBLISHED_PROP,
433 AUTHOR_PROP,
434 SECTION_PROP,
435 };
436
437 private static final String[] sItemPropertyNames = {
438 PUBLISHER_PROP,
439 COPYRIGHT_HOLDER_PROP,
440 AUTHOR_PROP,
441 };
442
443 ArticleItem() {
444 super(Type.ARTICLE, sStringPropertyNames, sItemPropertyNames);
445 }
446
447 @Override
448 MarkupParser.Article getArticle() {
449 MarkupParser.Article article = new MarkupParser.Article();
450 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);
451 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);
452 article.section = getStringProperty(SECTION_PROP);
453 String author = getStringProperty(AUTHOR_PROP);
454 article.authors = author.isEmpty() ? new String[0] : new String[] { author };
455 return article;
456 }
457 }
458
459 private static class PersonItem extends ThingItem {
460 private static final String[] sStringPropertyNames = {
461 NAME_PROP,
462 URL_PROP,
463 DESCRIPTION_PROP,
464 IMAGE_PROP,
465 FAMILY_NAME_PROP,
466 GIVEN_NAME_PROP,
467 };
468
469 PersonItem() {
470 super(Type.PERSON, sStringPropertyNames, sEmptyPropertyNames);
471 }
472
473 // Returns either the value of NAME_PROP, or concatenated values of GIVE N_NAME_PROP and
474 // FAILY_NAME_PROP delimited by a whitespace.
cjhopman 2014/04/21 16:52:22 s/FAILY/FAMILY
kuan 2014/04/23 15:32:36 Done.
475 @Override
476 String toStringProperty() {
477 String fullname = getStringProperty(NAME_PROP);
478 if (fullname.isEmpty()) {
479 fullname = concat(getStringProperty(GIVEN_NAME_PROP),
480 getStringProperty(FAMILY_NAME_PROP));
481 }
482 return fullname;
483 }
484 }
485
486 private static class OrganizationItem extends ThingItem {
487 private static final String[] sStringPropertyNames = {
488 NAME_PROP,
489 URL_PROP,
490 DESCRIPTION_PROP,
491 IMAGE_PROP,
492 LEGAL_NAME_PROP,
493 };
494
495 OrganizationItem() {
496 super(Type.ORGANIZATION, sStringPropertyNames, sEmptyPropertyNames);
497 }
498
499 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.
500 @Override
501 String toStringProperty() {
502 String name = getStringProperty(NAME_PROP);
503 if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP);
504 return name;
505 }
506 }
507
508 private static class UnsupportedItem extends ThingItem {
509 UnsupportedItem(){
510 super(Type.UNSUPPORTED, sEmptyPropertyNames, sEmptyPropertyNames);
511 }
512 }
513
514 private static boolean isItemScope(Element e) {
515 return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE");
516 }
517
518 private static String[] getItemProp(Element e) {
519 // "itemprop" attribute is case-sensitive, and can have multiple propert ies.
520 String itemprop = e.getAttribute("ITEMPROP");
521 if (itemprop.isEmpty()) return new String[0];
522 String[] splits = StringUtil.split(itemprop, "\\s+");
523 return splits.length > 0 ? splits : new String[] { itemprop };
524 }
525
526 // Extracts the property value from |e|. For some tags, the value is a spec ific attribute,
527 // while for others, it's the text between the start and end tags.
528 private static String getPropertyValue(Element e) {
529 String value = "";
530 String tagName = e.getTagName();
531 if (sTagAttributesMap.containsKey(tagName)) {
532 value = e.getAttribute(sTagAttributesMap.get(tagName)[0]);
533 }
534 if (value.isEmpty()) value = e.getInnerText();
535 return value;
536 }
537
538 // Extracts the author property from |e|'s "rel=author" attribute.
539 private static String getAuthorFromRelAttribute(Element e) {
540 String author = "";
541 String tagName = e.getTagName();
542 if (sTagAttributesMap.containsKey(tagName)) {
543 String[] attrs = sTagAttributesMap.get(tagName);
544 if (attrs.length > 1 && e.getAttribute(attrs[1]).equals(AUTHOR_REL)) {
545 author = e.getInnerText();
546 }
547 }
548 return author;
549 }
550
551 private static String concat(String first, String second) {
552 String concat = first;
553 if (!concat.isEmpty() && !second.isEmpty()) concat += " ";
554 concat += second;
555 return concat;
556 }
557 }
OLDNEW
« no previous file with comments | « src/com/dom_distiller/client/MarkupParser.java ('k') | test/com/dom_distiller/client/SchemaOrgParserTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698