OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 package com.dom_distiller.client; | |
6 | |
7 import java.util.ArrayList; | |
8 import java.util.EnumMap; | |
9 import java.util.Iterator; | |
10 import java.util.List; | |
11 import java.util.Map; | |
12 import java.util.Set; | |
13 | |
14 import com.google.gwt.dom.client.AnchorElement; | |
15 import com.google.gwt.dom.client.Element; | |
16 import com.google.gwt.dom.client.ImageElement; | |
17 import com.google.gwt.dom.client.MetaElement; | |
18 import com.google.gwt.dom.client.NodeList; | |
19 | |
20 /** | |
21 * This class recognizes and parses Schema.org markup tags, and returns the properties that matter | |
22 * to distilled content. | |
23 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image. | |
24 * In addition, for each type that we support, we also parse more specific properties: | |
25 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published, | |
26 * date modified, author, article section | |
27 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url, | |
28 * encoding format, caption, representative of page, width, height | |
29 * - Person: family name, given name | |
30 * - Organization: legal name. | |
31 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or | |
32 * publisher of article or publisher of image could be a Schema.Org Person or Organization type; | |
33 * in fact, this is the reason we support Person and Organization types. | |
34 */ | |
35 public class SchemaOrgParser implements MarkupParser.Parser { | |
36 private static final String NAME_PROP = "name"; | |
37 private static final String URL_PROP = "url"; | |
38 private static final String DESCRIPTION_PROP = "description"; | |
39 private static final String IMAGE_PROP = "image"; | |
40 private static final String HEADLINE_PROP = "headline"; | |
41 private static final String PUBLISHER_PROP = "publisher"; | |
42 private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; | |
43 private static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; | |
44 private static final String CONTENT_URL_PROP = "contentUrl"; | |
45 private static final String ENCODING_FORMAT_PROP = "encodingFormat"; | |
46 private static final String CAPTION_PROP = "caption"; | |
47 private static final String REPRESENTATIVE_PROP = "representativeOfPage"; | |
48 private static final String WIDTH_PROP = "width"; | |
49 private static final String HEIGHT_PROP = "height"; | |
50 private static final String DATE_PUBLISHED_PROP = "datePublished"; | |
51 private static final String DATE_MODIFIED_PROP = "dateModified"; | |
52 private static final String AUTHOR_PROP = "author"; | |
53 private static final String SECTION_PROP = "articleSection"; | |
54 private static final String FAMILY_NAME_PROP = "familyName"; | |
55 private static final String GIVEN_NAME_PROP = "givenName"; | |
56 private static final String LEGAL_NAME_PROP = "legalName"; | |
57 | |
58 private enum Type { // All these types are extended from Thing, directly or indirectly. | |
59 IMAGE, | |
60 ARTICLE, | |
61 PERSON, | |
62 ORGANIZATION, | |
63 UNSUPPORTED, | |
64 } | |
65 | |
66 private static class ThingItem { | |
67 protected final Type mType; | |
68 protected final Element mRoot; | |
69 protected final String[] mStringPropertyNames; | |
70 protected final String[] mItemPropertyNames; | |
71 protected final String[] mStringProperties; | |
72 protected final ThingItem[] mItemProperties; | |
73 | |
74 protected ThingItem(Type type, Element root, | |
75 String[] stringPropertyNames, String[] itemPropertyN ames) { | |
76 mType = type; | |
77 mRoot = root; | |
78 mStringPropertyNames = stringPropertyNames; | |
79 mItemPropertyNames = itemPropertyNames; | |
80 mStringProperties = new String[mStringPropertyNames.length]; | |
81 mItemProperties = new ThingItem[mItemPropertyNames.length]; | |
82 } | |
83 | |
84 protected String toStringProperty() { | |
85 return ""; | |
86 } | |
87 | |
88 protected MarkupParser.Image getImage() { | |
89 // Use value of IMAGE_PROP to create a MarkupParser.Image. | |
90 String imageUrl = getStringProperty(IMAGE_PROP); | |
91 if (imageUrl.isEmpty()) return null; | |
92 MarkupParser.Image image = new MarkupParser.Image(); | |
93 image.image = imageUrl; | |
94 image.url = imageUrl; | |
95 return image; | |
96 } | |
97 | |
98 protected MarkupParser.Article getArticle() { | |
99 return null; | |
100 } | |
101 | |
102 protected final boolean isImageRepresentativeOfPage() { | |
103 String value = getStringProperty(REPRESENTATIVE_PROP); | |
104 return value.equalsIgnoreCase("true"); | |
105 } | |
106 | |
107 protected final void putStringValue(String name, String value) { | |
108 for (int i = 0; i < mStringPropertyNames.length; i++) { | |
109 if (name.equals(mStringPropertyNames[i])) { | |
110 mStringProperties[i] = value; | |
111 break; | |
112 } | |
113 } | |
114 } | |
115 | |
116 protected final void putItemValue(String name, ThingItem value) { | |
117 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
118 if (name.equals(mItemPropertyNames[i])) { | |
119 mItemProperties[i] = value; | |
120 break; | |
121 } | |
122 } | |
123 } | |
124 | |
125 protected final String getStringProperty(String name) { | |
126 // Check if property exists in |mStringProperties|. | |
127 for (int i = 0; i < mStringPropertyNames.length; i++) { | |
128 if (name.equals(mStringPropertyNames[i])) { | |
129 String value = mStringProperties[i]; | |
130 if (value != null && !value.isEmpty()) return value; | |
131 break; | |
132 } | |
133 } | |
134 // Otherwise, repeat for |mItemProperties|. | |
135 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
136 if (!name.equals(mItemPropertyNames[i])) continue; | |
137 if (mItemProperties[i] != null) return mItemProperties[i].toStri ngProperty(); | |
138 break; | |
139 } | |
140 return ""; | |
141 } | |
142 } | |
143 | |
144 private final List<ThingItem> mItemScopes; | |
145 private Element mRoot = null; | |
146 private final Map<Type, String> mTypeUrls = new EnumMap<Type, String>(Type.c lass); | |
cjhopman
2014/04/17 17:35:26
This appears to only be used to lookup a type for
kuan
2014/04/18 00:19:03
Done.
| |
147 | |
/**
 * Creates the object that extracts and verifies Schema.org markup tags from |root|.
 * Parsing happens eagerly, inside this constructor.
 */
public SchemaOrgParser(Element root) {
    mItemScopes = new ArrayList<ThingItem>();
    mRoot = root;

    // Register the "itemtype" URL of every type; UNSUPPORTED is mapped to the empty string.
    mTypeUrls.put(Type.IMAGE, "http://schema.org/ImageObject");
    mTypeUrls.put(Type.ARTICLE, "http://schema.org/Article");
    mTypeUrls.put(Type.PERSON, "http://schema.org/Person");
    mTypeUrls.put(Type.ORGANIZATION, "http://schema.org/Organization");
    mTypeUrls.put(Type.UNSUPPORTED, "");

    // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?
    // If parse lazily, all get* methods will need to check for parsed state and, if necessary,
    // parse before returning the requested properties.
    parseRoot();
}
166 | |
167 @Override | |
168 public String getTitle() { | |
169 String title = findStringProperty(HEADLINE_PROP); | |
170 if (title.isEmpty()) title = findStringProperty(NAME_PROP); | |
171 return title; | |
172 } | |
173 | |
174 @Override | |
175 public String getType() { | |
176 if (mItemScopes.isEmpty()) return null; | |
177 // Assume the type of the first item is the page type. | |
178 return mItemScopes.get(0).mType.toString(); | |
179 } | |
180 | |
181 @Override | |
182 public String getUrl() { | |
183 return findStringProperty(URL_PROP); | |
184 } | |
185 | |
186 @Override | |
187 public MarkupParser.Image[] getImages() { | |
188 if (mItemScopes.isEmpty()) return null; | |
189 List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>(); | |
190 for (int i = 0; i < mItemScopes.size(); i++) { | |
191 ThingItem item = mItemScopes.get(i); | |
192 MarkupParser.Image image = item.getImage(); | |
193 if (image != null) { | |
194 if (item.isImageRepresentativeOfPage()) { | |
195 // Image should be the dominant, i.e. first, one. | |
196 images.add(0, image); | |
197 } else { | |
198 images.add(image); | |
199 } | |
200 } | |
201 } | |
202 if (images.isEmpty()) return null; | |
203 return images.toArray(new MarkupParser.Image[images.size()]); | |
204 } | |
205 | |
206 @Override | |
207 public String getDescription() { | |
208 return findStringProperty(DESCRIPTION_PROP); | |
209 } | |
210 | |
211 @Override | |
212 public String getPublisher() { | |
213 return findStringProperty(PUBLISHER_PROP); | |
214 } | |
215 | |
216 @Override | |
217 public String getCopyright() { | |
218 if (mItemScopes.isEmpty()) return ""; | |
219 // Returns a concatenated string of copyright year and copyright holder of the first item | |
220 // that has these properties, delimited by a whitespace. | |
221 String copyright = ""; | |
222 for (int i = 0; i < mItemScopes.size() && copyright.isEmpty(); i++) { | |
223 ThingItem item = mItemScopes.get(i); | |
224 copyright = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP), | |
225 item.getStringProperty(COPYRIGHT_HOLDER_PROP)); | |
226 } | |
227 return copyright.isEmpty() ? copyright : "Copyright " + copyright; | |
228 } | |
229 | |
230 @Override | |
231 public String getAuthor() { | |
232 return findStringProperty(AUTHOR_PROP); | |
233 } | |
234 | |
235 @Override | |
236 public MarkupParser.Article getArticle() { | |
237 if (mItemScopes.isEmpty()) return null; | |
238 // Returns the first article. | |
239 MarkupParser.Article article = null; | |
240 for (int i = 0; i < mItemScopes.size() && article == null; i++) { | |
241 article = mItemScopes.get(i).getArticle(); | |
242 } | |
243 return article; | |
244 } | |
245 | |
246 @Override | |
247 public boolean optOut() { | |
248 return false; | |
249 } | |
250 | |
251 private void parseRoot() { | |
252 // The <html> element can also be the start of a Schema.org item, and he nce needs to be | |
253 // parsed. | |
254 | |
255 // Use a boolean array for |skipChildren|, instead of the boolean primit ive, so that it | |
256 // can be updated in checkIfElementIsSupported(). | |
257 boolean[] skipChildren = new boolean[] { false }; | |
258 checkIfElementIsSupported(mRoot, skipChildren); | |
259 if (skipChildren[0]) return; // Skipping children of root means there's nothing more to do. | |
260 // Recursively parse each element that is an Schema.org type. | |
261 parse(mRoot, null); | |
262 } | |
263 | |
264 private void parse(Element root, ThingItem currItem) { | |
cjhopman
2014/04/17 17:35:26
This function and its uses will be simplified if c
kuan
2014/04/18 00:19:03
Done.
| |
265 NodeList<Element> allElems = root.getElementsByTagName("*"); | |
266 for (int i = 0; i < allElems.getLength(); i++) { | |
cjhopman
2014/04/17 17:35:26
The way that the tree is parsed is hard for me to
kuan
2014/04/18 00:19:03
Done. what's the worst case behavior?
cjhopman
2014/04/18 01:17:01
The previous version had O(n^2) worst case complex
| |
267 Element e = allElems.getItem(i); | |
268 // See comments in parseRoot() for using boolean array for |skipChil dren|. | |
269 boolean[] skipChildren = new boolean[] { false }; | |
270 | |
271 ThingItem newItem = checkIfElementIsSupported(e, skipChildren); | |
272 | |
273 // If we're currently parsing a Schema.org type, if it has an "itemp rop" attribute that | |
274 // we care for, extract and store its value. | |
275 if (currItem != null) extractProperty(e, currItem, newItem); | |
276 | |
277 // If current element has "itemscope" and "itemtype" attributes and is a supported type, | |
278 // its children would have been parsed by |newItem| via the recursiv e parse() call. | |
279 // If it's an unsupported type, its children should be ignored. In both cases, we | |
280 // should skip these children to the next sibling of the current ele ment. So, determine | |
281 // the index of the next sibiling in |allElems|, so that the next it eration will jump to | |
282 // that element. | |
283 if (skipChildren[0]) { | |
284 Element next = e.getNextSiblingElement(); | |
285 if (next != null) { | |
286 for (i++; i < allElems.getLength() && next != allElems.getIt em(i); i++) {} | |
287 i--; // Decrement because it'll be incremented in the outer for loop. | |
288 } else { | |
289 break; // No next sibling means there's no more elements to process. | |
290 } | |
291 } | |
292 } // for all elements | |
293 } | |
294 | |
295 // If |e| has "itemscope" and "itemtype" attributes and a supported type, a ThingItem-extended | |
296 // object is created based on the type. | |
297 // Returns this object after it has recursively parsed |e|'s children, retur ns null otherwise. | |
298 // @param skipChildren[0] is set to true if |e| specifies a Schema.org type, supported or not. | |
299 private ThingItem checkIfElementIsSupported(Element e, boolean[] skipChildre n) { | |
300 // If element has "itemscope" and "itemtype" attributes, it's the start of an item. | |
301 // If the type is what we care for, instantiate the corresponding exten ded ThingItem and | |
302 // recursively parse it. | |
303 if (!e.hasAttribute("ITEMSCOPE") || !e.hasAttribute("ITEMTYPE")) return null; | |
cjhopman
2014/04/17 17:35:26
This should be a different function so you don't h
kuan
2014/04/18 00:19:03
Done.
| |
304 | |
305 skipChildren[0] = true; // Indicate to skip the children of this suppor ted element. | |
306 ThingItem newItem = null; | |
307 Type type = getType(e); | |
308 switch (type) { | |
309 case IMAGE: | |
310 newItem = new ImageItem(e); | |
311 break; | |
312 case ARTICLE: | |
313 newItem = new ArticleItem(e); | |
314 break; | |
315 case PERSON: | |
316 newItem = new PersonItem(e); | |
317 break; | |
318 case ORGANIZATION: | |
319 newItem = new OrganizationItem(e); | |
320 break; | |
321 case UNSUPPORTED: | |
322 default: | |
323 return null; | |
324 } | |
325 | |
326 mItemScopes.add(newItem); | |
327 parse(e, newItem); | |
cjhopman
2014/04/17 17:35:26
This parse() call makes it harder for me to reason
kuan
2014/04/18 00:19:03
Done. i'm not sure if i code it the way u want re
cjhopman
2014/04/18 01:17:01
See the new comment in parse() for what I meant by
| |
328 return newItem; | |
329 } | |
330 | |
331 private Type getType(Element e) { | |
332 String type = e.getAttribute("ITEMTYPE"); | |
333 Set<Map.Entry<Type, String>> typeUrls = mTypeUrls.entrySet(); | |
334 Iterator<Map.Entry<Type, String>> iter = typeUrls.iterator(); | |
cjhopman
2014/04/17 17:35:26
I think you can do:
for (Map.Entry<Type, String>
kuan
2014/04/18 00:19:03
Done. since it's now a HashMap of <String, Type>,
| |
335 while (iter.hasNext()) { | |
336 Map.Entry<Type, String> typeUrl = iter.next(); | |
337 if (typeUrl.getValue().equalsIgnoreCase(type)) return typeUrl.getKey (); | |
338 } | |
339 return Type.UNSUPPORTED; | |
340 } | |
341 | |
342 // Extract the value of the "itemprop" attribute in |e|. | |
343 // @param currItem ThingItem-extended item for the current Schema.org type b eing parsed. | |
344 // @param embeddedItem ThingItem-extended item for the Schema.org type creat ed for |e|, i.e. |e| // had specified a Schema.org type. | |
345 private void extractProperty(Element e, ThingItem currItem, ThingItem embedd edItem) { | |
cjhopman
2014/04/17 17:35:26
This function does a lot (and most of what it does
kuan
2014/04/18 00:19:03
Done.
| |
346 // "itemprop" attribute is case-sensitive. | |
347 String name = e.getAttribute("ITEMPROP"); | |
348 if (name == null || name.isEmpty()) return; | |
349 if (embeddedItem != null) { // This "itemprop" attribute is an embedded item. | |
350 currItem.putItemValue(name, embeddedItem); | |
351 } else { // Extract value from the tag. | |
352 String value = null; | |
353 if (e.hasTagName("A")) { | |
354 value = AnchorElement.as(e).getHref(); | |
355 } else if (e.hasTagName("IMG")) { | |
356 value = ImageElement.as(e).getSrc(); | |
357 } else if (e.hasTagName("META")) { | |
358 value = MetaElement.as(e).getContent(); | |
359 } else if (e.hasTagName("TIME")) { | |
360 value = e.getAttribute("datetime"); | |
361 } | |
362 if (value == null || value.isEmpty()) value = e.getInnerText(); | |
363 currItem.putStringValue(name, value); | |
364 } | |
365 } | |
366 | |
367 // Returns the first item that has the requested property value. | |
368 private String findStringProperty(String name) { | |
369 if (mItemScopes.isEmpty()) return null; | |
370 for (int i = 0; i < mItemScopes.size(); i++) { | |
371 String value = mItemScopes.get(i).getStringProperty(name); | |
372 if (!value.isEmpty()) return value; | |
373 } | |
374 return ""; | |
375 } | |
376 | |
377 private static class ImageItem extends ThingItem { | |
378 private static final String[] mStringPropertyNames = { | |
379 NAME_PROP, | |
380 URL_PROP, | |
381 DESCRIPTION_PROP, | |
382 IMAGE_PROP, | |
383 HEADLINE_PROP, | |
384 PUBLISHER_PROP, | |
385 COPYRIGHT_HOLDER_PROP, | |
386 COPYRIGHT_YEAR_PROP, | |
387 CONTENT_URL_PROP, | |
388 ENCODING_FORMAT_PROP, | |
389 CAPTION_PROP, | |
390 REPRESENTATIVE_PROP, | |
391 WIDTH_PROP, | |
392 HEIGHT_PROP, | |
393 }; | |
394 | |
395 private static final String[] mItemPropertyNames = { | |
396 PUBLISHER_PROP, | |
397 COPYRIGHT_HOLDER_PROP, | |
398 }; | |
399 | |
400 protected ImageItem(Element elem) { | |
401 super(Type.IMAGE, elem, mStringPropertyNames, mItemPropertyNames); | |
402 } | |
403 | |
404 @Override | |
405 protected MarkupParser.Image getImage() { | |
406 MarkupParser.Image image = new MarkupParser.Image(); | |
407 String url = getStringProperty(CONTENT_URL_PROP); | |
408 image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP); | |
409 image.url = image.image; | |
410 image.type = getStringProperty(ENCODING_FORMAT_PROP); | |
411 image.caption = getStringProperty(CAPTION_PROP); | |
412 try { | |
413 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 ); | |
414 } catch (Exception e) { | |
415 } | |
416 try { | |
417 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10); | |
418 } catch (Exception e) { | |
419 } | |
420 return image; | |
421 } | |
422 } | |
423 | |
424 private static class ArticleItem extends ThingItem { | |
425 private static final String[] mStringPropertyNames = { | |
426 NAME_PROP, | |
427 URL_PROP, | |
428 DESCRIPTION_PROP, | |
429 IMAGE_PROP, | |
430 HEADLINE_PROP, | |
431 PUBLISHER_PROP, | |
432 COPYRIGHT_HOLDER_PROP, | |
433 COPYRIGHT_YEAR_PROP, | |
434 DATE_MODIFIED_PROP, | |
435 DATE_PUBLISHED_PROP, | |
436 AUTHOR_PROP, | |
437 SECTION_PROP, | |
438 }; | |
439 | |
440 private static final String[] mItemPropertyNames = { | |
441 PUBLISHER_PROP, | |
442 COPYRIGHT_HOLDER_PROP, | |
443 AUTHOR_PROP, | |
444 }; | |
445 | |
446 protected ArticleItem(Element elem) { | |
447 super(Type.ARTICLE, elem, mStringPropertyNames, mItemPropertyNames); | |
448 } | |
449 | |
450 @Override | |
451 protected MarkupParser.Article getArticle() { | |
452 MarkupParser.Article article = new MarkupParser.Article(); | |
453 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); | |
454 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); | |
455 article.section = getStringProperty(SECTION_PROP); | |
456 String author = getStringProperty(AUTHOR_PROP); | |
457 article.authors = author.isEmpty() ? new String[0] : new String[] { author }; | |
458 return article; | |
459 } | |
460 } | |
461 | |
462 private static class PersonItem extends ThingItem { | |
463 private static final String[] mStringPropertyNames = { | |
464 NAME_PROP, | |
465 URL_PROP, | |
466 DESCRIPTION_PROP, | |
467 IMAGE_PROP, | |
468 FAMILY_NAME_PROP, | |
469 GIVEN_NAME_PROP, | |
470 }; | |
471 | |
472 protected PersonItem(Element elem) { | |
473 super(Type.PERSON, elem, mStringPropertyNames, new String[0]); | |
474 } | |
475 | |
476 // Returns either the value of NAME_PROP, or concatenated values of GIVE N_NAME_PROP and | |
477 // FAILY_NAME_PROP delimited by a whitespace. | |
478 @Override | |
479 protected String toStringProperty() { | |
480 String fullname = getStringProperty(NAME_PROP); | |
481 if (fullname.isEmpty()) { | |
482 fullname = concat(getStringProperty(GIVEN_NAME_PROP), | |
483 getStringProperty(FAMILY_NAME_PROP)); | |
484 } | |
485 return fullname; | |
486 } | |
487 } | |
488 | |
489 private static class OrganizationItem extends ThingItem { | |
490 private static final String[] mStringPropertyNames = { | |
491 NAME_PROP, | |
492 URL_PROP, | |
493 DESCRIPTION_PROP, | |
494 IMAGE_PROP, | |
495 LEGAL_NAME_PROP, | |
496 }; | |
497 | |
498 protected OrganizationItem(Element elem) { | |
499 super(Type.ORGANIZATION, elem, mStringPropertyNames, new String[0]); | |
500 } | |
501 | |
502 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP. | |
503 @Override | |
504 protected String toStringProperty() { | |
505 String name = getStringProperty(NAME_PROP); | |
506 if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP); | |
507 return name; | |
508 } | |
509 } | |
510 | |
511 private static String concat(String first, String second) { | |
512 String concat = first; | |
513 if (!concat.isEmpty() && !second.isEmpty()) concat += " "; | |
514 concat += second; | |
515 return concat; | |
516 } | |
517 } | |
OLD | NEW |