Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(326)

Side by Side Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package com.dom_distiller.client;
6
7 import java.util.ArrayList;
8 import java.util.EnumMap;
9 import java.util.Iterator;
10 import java.util.List;
11 import java.util.Map;
12 import java.util.Set;
13
14 import com.google.gwt.dom.client.AnchorElement;
15 import com.google.gwt.dom.client.Element;
16 import com.google.gwt.dom.client.ImageElement;
17 import com.google.gwt.dom.client.MetaElement;
18 import com.google.gwt.dom.client.NodeList;
19
20 /**
21 * This class recognizes and parses Schema.org markup tags, and returns the properties that matter
22 * to distilled content.
23 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.
24 * In addition, for each type that we support, we also parse more specific properties:
25 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,
26 * date modified, author, article section
27 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,
28 * encoding format, caption, representative of page, width, height
29 * - Person: family name, given name
30 * - Organization: legal name.
31 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or
32 * publisher of article or publisher of image could be a Schema.Org Person or Organization type;
33 * in fact, this is the reason we support Person and Organization types.
34 */
35 public class SchemaOrgParser implements MarkupParser.Parser {
36 private static final String NAME_PROP = "name";
37 private static final String URL_PROP = "url";
38 private static final String DESCRIPTION_PROP = "description";
39 private static final String IMAGE_PROP = "image";
40 private static final String HEADLINE_PROP = "headline";
41 private static final String PUBLISHER_PROP = "publisher";
42 private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";
43 private static final String COPYRIGHT_YEAR_PROP = "copyrightYear";
44 private static final String CONTENT_URL_PROP = "contentUrl";
45 private static final String ENCODING_FORMAT_PROP = "encodingFormat";
46 private static final String CAPTION_PROP = "caption";
47 private static final String REPRESENTATIVE_PROP = "representativeOfPage";
48 private static final String WIDTH_PROP = "width";
49 private static final String HEIGHT_PROP = "height";
50 private static final String DATE_PUBLISHED_PROP = "datePublished";
51 private static final String DATE_MODIFIED_PROP = "dateModified";
52 private static final String AUTHOR_PROP = "author";
53 private static final String SECTION_PROP = "articleSection";
54 private static final String FAMILY_NAME_PROP = "familyName";
55 private static final String GIVEN_NAME_PROP = "givenName";
56 private static final String LEGAL_NAME_PROP = "legalName";
57
58 private enum Type { // All these types are extended from Thing, directly or indirectly.
59 IMAGE,
60 ARTICLE,
61 PERSON,
62 ORGANIZATION,
63 UNSUPPORTED,
64 }
65
66 private static class ThingItem {
67 protected final Type mType;
68 protected final Element mRoot;
69 protected final String[] mStringPropertyNames;
70 protected final String[] mItemPropertyNames;
71 protected final String[] mStringProperties;
72 protected final ThingItem[] mItemProperties;
73
74 protected ThingItem(Type type, Element root,
75 String[] stringPropertyNames, String[] itemPropertyN ames) {
76 mType = type;
77 mRoot = root;
78 mStringPropertyNames = stringPropertyNames;
79 mItemPropertyNames = itemPropertyNames;
80 mStringProperties = new String[mStringPropertyNames.length];
81 mItemProperties = new ThingItem[mItemPropertyNames.length];
82 }
83
84 protected String toStringProperty() {
85 return "";
86 }
87
88 protected MarkupParser.Image getImage() {
89 // Use value of IMAGE_PROP to create a MarkupParser.Image.
90 String imageUrl = getStringProperty(IMAGE_PROP);
91 if (imageUrl.isEmpty()) return null;
92 MarkupParser.Image image = new MarkupParser.Image();
93 image.image = imageUrl;
94 image.url = imageUrl;
95 return image;
96 }
97
98 protected MarkupParser.Article getArticle() {
99 return null;
100 }
101
102 protected final boolean isImageRepresentativeOfPage() {
103 String value = getStringProperty(REPRESENTATIVE_PROP);
104 return value.equalsIgnoreCase("true");
105 }
106
107 protected final void putStringValue(String name, String value) {
108 for (int i = 0; i < mStringPropertyNames.length; i++) {
109 if (name.equals(mStringPropertyNames[i])) {
110 mStringProperties[i] = value;
111 break;
112 }
113 }
114 }
115
116 protected final void putItemValue(String name, ThingItem value) {
117 for (int i = 0; i < mItemPropertyNames.length; i++) {
118 if (name.equals(mItemPropertyNames[i])) {
119 mItemProperties[i] = value;
120 break;
121 }
122 }
123 }
124
125 protected final String getStringProperty(String name) {
126 // Check if property exists in |mStringProperties|.
127 for (int i = 0; i < mStringPropertyNames.length; i++) {
128 if (name.equals(mStringPropertyNames[i])) {
129 String value = mStringProperties[i];
130 if (value != null && !value.isEmpty()) return value;
131 break;
132 }
133 }
134 // Otherwise, repeat for |mItemProperties|.
135 for (int i = 0; i < mItemPropertyNames.length; i++) {
136 if (!name.equals(mItemPropertyNames[i])) continue;
137 if (mItemProperties[i] != null) return mItemProperties[i].toStri ngProperty();
138 break;
139 }
140 return "";
141 }
142 }
143
144 private final List<ThingItem> mItemScopes;
145 private Element mRoot = null;
146 private final Map<Type, String> mTypeUrls = new EnumMap<Type, String>(Type.c lass);
cjhopman 2014/04/17 17:35:26 This appears to only be used to lookup a type for
kuan 2014/04/18 00:19:03 Done.
147
/**
 * Creates the object that extracts and verifies Schema.org markup tags from |root|.
 * The whole tree under |root| is parsed eagerly, before the constructor returns.
 */
public SchemaOrgParser(Element root) {
    mItemScopes = new ArrayList<ThingItem>();
    mRoot = root;

    // Register the "itemtype" URL of every type; UNSUPPORTED has no URL of its own.
    mTypeUrls.put(Type.IMAGE, "http://schema.org/ImageObject");
    mTypeUrls.put(Type.ARTICLE, "http://schema.org/Article");
    mTypeUrls.put(Type.PERSON, "http://schema.org/Person");
    mTypeUrls.put(Type.ORGANIZATION, "http://schema.org/Organization");
    mTypeUrls.put(Type.UNSUPPORTED, "");

    // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?
    // If parse lazily, all get* methods will need to check for parsed state and, if necessary,
    // parse before returning the requested properties.
    parseRoot();
}
166
167 @Override
168 public String getTitle() {
169 String title = findStringProperty(HEADLINE_PROP);
170 if (title.isEmpty()) title = findStringProperty(NAME_PROP);
171 return title;
172 }
173
174 @Override
175 public String getType() {
176 if (mItemScopes.isEmpty()) return null;
177 // Assume the type of the first item is the page type.
178 return mItemScopes.get(0).mType.toString();
179 }
180
181 @Override
182 public String getUrl() {
183 return findStringProperty(URL_PROP);
184 }
185
186 @Override
187 public MarkupParser.Image[] getImages() {
188 if (mItemScopes.isEmpty()) return null;
189 List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();
190 for (int i = 0; i < mItemScopes.size(); i++) {
191 ThingItem item = mItemScopes.get(i);
192 MarkupParser.Image image = item.getImage();
193 if (image != null) {
194 if (item.isImageRepresentativeOfPage()) {
195 // Image should be the dominant, i.e. first, one.
196 images.add(0, image);
197 } else {
198 images.add(image);
199 }
200 }
201 }
202 if (images.isEmpty()) return null;
203 return images.toArray(new MarkupParser.Image[images.size()]);
204 }
205
206 @Override
207 public String getDescription() {
208 return findStringProperty(DESCRIPTION_PROP);
209 }
210
211 @Override
212 public String getPublisher() {
213 return findStringProperty(PUBLISHER_PROP);
214 }
215
216 @Override
217 public String getCopyright() {
218 if (mItemScopes.isEmpty()) return "";
219 // Returns a concatenated string of copyright year and copyright holder of the first item
220 // that has these properties, delimited by a whitespace.
221 String copyright = "";
222 for (int i = 0; i < mItemScopes.size() && copyright.isEmpty(); i++) {
223 ThingItem item = mItemScopes.get(i);
224 copyright = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP),
225 item.getStringProperty(COPYRIGHT_HOLDER_PROP));
226 }
227 return copyright.isEmpty() ? copyright : "Copyright " + copyright;
228 }
229
230 @Override
231 public String getAuthor() {
232 return findStringProperty(AUTHOR_PROP);
233 }
234
235 @Override
236 public MarkupParser.Article getArticle() {
237 if (mItemScopes.isEmpty()) return null;
238 // Returns the first article.
239 MarkupParser.Article article = null;
240 for (int i = 0; i < mItemScopes.size() && article == null; i++) {
241 article = mItemScopes.get(i).getArticle();
242 }
243 return article;
244 }
245
246 @Override
247 public boolean optOut() {
248 return false;
249 }
250
251 private void parseRoot() {
252 // The <html> element can also be the start of a Schema.org item, and he nce needs to be
253 // parsed.
254
255 // Use a boolean array for |skipChildren|, instead of the boolean primit ive, so that it
256 // can be updated in checkIfElementIsSupported().
257 boolean[] skipChildren = new boolean[] { false };
258 checkIfElementIsSupported(mRoot, skipChildren);
259 if (skipChildren[0]) return; // Skipping children of root means there's nothing more to do.
260 // Recursively parse each element that is an Schema.org type.
261 parse(mRoot, null);
262 }
263
264 private void parse(Element root, ThingItem currItem) {
cjhopman 2014/04/17 17:35:26 This function and its uses will be simplified if c
kuan 2014/04/18 00:19:03 Done.
265 NodeList<Element> allElems = root.getElementsByTagName("*");
266 for (int i = 0; i < allElems.getLength(); i++) {
cjhopman 2014/04/17 17:35:26 The way that the tree is parsed is hard for me to
kuan 2014/04/18 00:19:03 Done. what's the worst case behavior?
cjhopman 2014/04/18 01:17:01 The previous version had O(n^2) worst case complex
267 Element e = allElems.getItem(i);
268 // See comments in parseRoot() for using boolean array for |skipChil dren|.
269 boolean[] skipChildren = new boolean[] { false };
270
271 ThingItem newItem = checkIfElementIsSupported(e, skipChildren);
272
273 // If we're currently parsing a Schema.org type, if it has an "itemp rop" attribute that
274 // we care for, extract and store its value.
275 if (currItem != null) extractProperty(e, currItem, newItem);
276
277 // If current element has "itemscope" and "itemtype" attributes and is a supported type,
278 // its children would have been parsed by |newItem| via the recursiv e parse() call.
279 // If it's an unsupported type, its children should be ignored. In both cases, we
280 // should skip these children to the next sibling of the current ele ment. So, determine
281 // the index of the next sibiling in |allElems|, so that the next it eration will jump to
282 // that element.
283 if (skipChildren[0]) {
284 Element next = e.getNextSiblingElement();
285 if (next != null) {
286 for (i++; i < allElems.getLength() && next != allElems.getIt em(i); i++) {}
287 i--; // Decrement because it'll be incremented in the outer for loop.
288 } else {
289 break; // No next sibling means there's no more elements to process.
290 }
291 }
292 } // for all elements
293 }
294
295 // If |e| has "itemscope" and "itemtype" attributes and a supported type, a ThingItem-extended
296 // object is created based on the type.
297 // Returns this object after it has recursively parsed |e|'s children, retur ns null otherwise.
298 // @param skipChildren[0] is set to true if |e| specifies a Schema.org type, supported or not.
299 private ThingItem checkIfElementIsSupported(Element e, boolean[] skipChildre n) {
300 // If element has "itemscope" and "itemtype" attributes, it's the start of an item.
301 // If the type is what we care for, instantiate the corresponding exten ded ThingItem and
302 // recursively parse it.
303 if (!e.hasAttribute("ITEMSCOPE") || !e.hasAttribute("ITEMTYPE")) return null;
cjhopman 2014/04/17 17:35:26 This should be a different function so you don't h
kuan 2014/04/18 00:19:03 Done.
304
305 skipChildren[0] = true; // Indicate to skip the children of this suppor ted element.
306 ThingItem newItem = null;
307 Type type = getType(e);
308 switch (type) {
309 case IMAGE:
310 newItem = new ImageItem(e);
311 break;
312 case ARTICLE:
313 newItem = new ArticleItem(e);
314 break;
315 case PERSON:
316 newItem = new PersonItem(e);
317 break;
318 case ORGANIZATION:
319 newItem = new OrganizationItem(e);
320 break;
321 case UNSUPPORTED:
322 default:
323 return null;
324 }
325
326 mItemScopes.add(newItem);
327 parse(e, newItem);
cjhopman 2014/04/17 17:35:26 This parse() call makes it harder for me to reason
kuan 2014/04/18 00:19:03 Done. i'm not sure if i code it the way u want re
cjhopman 2014/04/18 01:17:01 See the new comment in parse() for what I meant by
328 return newItem;
329 }
330
331 private Type getType(Element e) {
332 String type = e.getAttribute("ITEMTYPE");
333 Set<Map.Entry<Type, String>> typeUrls = mTypeUrls.entrySet();
334 Iterator<Map.Entry<Type, String>> iter = typeUrls.iterator();
cjhopman 2014/04/17 17:35:26 I think you can do: for (Map.Entry<Type, String>
kuan 2014/04/18 00:19:03 Done. since it's now a HashMap of <String, Type>,
335 while (iter.hasNext()) {
336 Map.Entry<Type, String> typeUrl = iter.next();
337 if (typeUrl.getValue().equalsIgnoreCase(type)) return typeUrl.getKey ();
338 }
339 return Type.UNSUPPORTED;
340 }
341
342 // Extract the value of the "itemprop" attribute in |e|.
343 // @param currItem ThingItem-extended item for the current Schema.org type b eing parsed.
344 // @param embeddedItem ThingItem-extended item for the Schema.org type creat ed for |e|, i.e. |e| // had specified a Schema.org type.
345 private void extractProperty(Element e, ThingItem currItem, ThingItem embedd edItem) {
cjhopman 2014/04/17 17:35:26 This function does a lot (and most of what it does
kuan 2014/04/18 00:19:03 Done.
346 // "itemprop" attribute is case-sensitive.
347 String name = e.getAttribute("ITEMPROP");
348 if (name == null || name.isEmpty()) return;
349 if (embeddedItem != null) { // This "itemprop" attribute is an embedded item.
350 currItem.putItemValue(name, embeddedItem);
351 } else { // Extract value from the tag.
352 String value = null;
353 if (e.hasTagName("A")) {
354 value = AnchorElement.as(e).getHref();
355 } else if (e.hasTagName("IMG")) {
356 value = ImageElement.as(e).getSrc();
357 } else if (e.hasTagName("META")) {
358 value = MetaElement.as(e).getContent();
359 } else if (e.hasTagName("TIME")) {
360 value = e.getAttribute("datetime");
361 }
362 if (value == null || value.isEmpty()) value = e.getInnerText();
363 currItem.putStringValue(name, value);
364 }
365 }
366
367 // Returns the first item that has the requested property value.
368 private String findStringProperty(String name) {
369 if (mItemScopes.isEmpty()) return null;
370 for (int i = 0; i < mItemScopes.size(); i++) {
371 String value = mItemScopes.get(i).getStringProperty(name);
372 if (!value.isEmpty()) return value;
373 }
374 return "";
375 }
376
377 private static class ImageItem extends ThingItem {
378 private static final String[] mStringPropertyNames = {
379 NAME_PROP,
380 URL_PROP,
381 DESCRIPTION_PROP,
382 IMAGE_PROP,
383 HEADLINE_PROP,
384 PUBLISHER_PROP,
385 COPYRIGHT_HOLDER_PROP,
386 COPYRIGHT_YEAR_PROP,
387 CONTENT_URL_PROP,
388 ENCODING_FORMAT_PROP,
389 CAPTION_PROP,
390 REPRESENTATIVE_PROP,
391 WIDTH_PROP,
392 HEIGHT_PROP,
393 };
394
395 private static final String[] mItemPropertyNames = {
396 PUBLISHER_PROP,
397 COPYRIGHT_HOLDER_PROP,
398 };
399
400 protected ImageItem(Element elem) {
401 super(Type.IMAGE, elem, mStringPropertyNames, mItemPropertyNames);
402 }
403
404 @Override
405 protected MarkupParser.Image getImage() {
406 MarkupParser.Image image = new MarkupParser.Image();
407 String url = getStringProperty(CONTENT_URL_PROP);
408 image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP);
409 image.url = image.image;
410 image.type = getStringProperty(ENCODING_FORMAT_PROP);
411 image.caption = getStringProperty(CAPTION_PROP);
412 try {
413 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 );
414 } catch (Exception e) {
415 }
416 try {
417 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);
418 } catch (Exception e) {
419 }
420 return image;
421 }
422 }
423
424 private static class ArticleItem extends ThingItem {
425 private static final String[] mStringPropertyNames = {
426 NAME_PROP,
427 URL_PROP,
428 DESCRIPTION_PROP,
429 IMAGE_PROP,
430 HEADLINE_PROP,
431 PUBLISHER_PROP,
432 COPYRIGHT_HOLDER_PROP,
433 COPYRIGHT_YEAR_PROP,
434 DATE_MODIFIED_PROP,
435 DATE_PUBLISHED_PROP,
436 AUTHOR_PROP,
437 SECTION_PROP,
438 };
439
440 private static final String[] mItemPropertyNames = {
441 PUBLISHER_PROP,
442 COPYRIGHT_HOLDER_PROP,
443 AUTHOR_PROP,
444 };
445
446 protected ArticleItem(Element elem) {
447 super(Type.ARTICLE, elem, mStringPropertyNames, mItemPropertyNames);
448 }
449
450 @Override
451 protected MarkupParser.Article getArticle() {
452 MarkupParser.Article article = new MarkupParser.Article();
453 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);
454 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);
455 article.section = getStringProperty(SECTION_PROP);
456 String author = getStringProperty(AUTHOR_PROP);
457 article.authors = author.isEmpty() ? new String[0] : new String[] { author };
458 return article;
459 }
460 }
461
462 private static class PersonItem extends ThingItem {
463 private static final String[] mStringPropertyNames = {
464 NAME_PROP,
465 URL_PROP,
466 DESCRIPTION_PROP,
467 IMAGE_PROP,
468 FAMILY_NAME_PROP,
469 GIVEN_NAME_PROP,
470 };
471
472 protected PersonItem(Element elem) {
473 super(Type.PERSON, elem, mStringPropertyNames, new String[0]);
474 }
475
476 // Returns either the value of NAME_PROP, or concatenated values of GIVE N_NAME_PROP and
477 // FAILY_NAME_PROP delimited by a whitespace.
478 @Override
479 protected String toStringProperty() {
480 String fullname = getStringProperty(NAME_PROP);
481 if (fullname.isEmpty()) {
482 fullname = concat(getStringProperty(GIVEN_NAME_PROP),
483 getStringProperty(FAMILY_NAME_PROP));
484 }
485 return fullname;
486 }
487 }
488
489 private static class OrganizationItem extends ThingItem {
490 private static final String[] mStringPropertyNames = {
491 NAME_PROP,
492 URL_PROP,
493 DESCRIPTION_PROP,
494 IMAGE_PROP,
495 LEGAL_NAME_PROP,
496 };
497
498 protected OrganizationItem(Element elem) {
499 super(Type.ORGANIZATION, elem, mStringPropertyNames, new String[0]);
500 }
501
502 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.
503 @Override
504 protected String toStringProperty() {
505 String name = getStringProperty(NAME_PROP);
506 if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP);
507 return name;
508 }
509 }
510
511 private static String concat(String first, String second) {
512 String concat = first;
513 if (!concat.isEmpty() && !second.isEmpty()) concat += " ";
514 concat += second;
515 return concat;
516 }
517 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698