OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 package com.dom_distiller.client; | |
6 | |
7 import java.util.ArrayList; | |
8 import java.util.HashMap; | |
9 import java.util.List; | |
10 import java.util.Map; | |
11 | |
12 import com.google.gwt.dom.client.AnchorElement; | |
13 import com.google.gwt.dom.client.Element; | |
14 import com.google.gwt.dom.client.ImageElement; | |
15 import com.google.gwt.dom.client.MetaElement; | |
16 import com.google.gwt.dom.client.Node; | |
17 import com.google.gwt.dom.client.NodeList; | |
18 | |
19 /** | |
20 * This class recognizes and parses schema.org markup tags, and returns the properties that matter | |
21 * to distilled content. | |
22 * Schema.org markup (http://schema.org) is based on the microdata format | |
23 * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html). | |
24 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image. | |
25 * In addition, for each type that we support, we also parse more specific properties: | |
26 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published, | |
27 * date modified, author, article section | |
28 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url, | |
29 * encoding format, caption, representative of page, width, height | |
30 * - Person: family name, given name | |
31 * - Organization: legal name. | |
32 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or | |
33 * publisher of article or publisher of image could be a Schema.Org Person or Organization type; | |
34 * in fact, this is the reason we support Person and Organization types. | |
35 */ | |
36 public class SchemaOrgParser { | |
37 static final String NAME_PROP = "name"; | |
38 static final String URL_PROP = "url"; | |
39 static final String DESCRIPTION_PROP = "description"; | |
40 static final String IMAGE_PROP = "image"; | |
41 static final String HEADLINE_PROP = "headline"; | |
42 static final String PUBLISHER_PROP = "publisher"; | |
43 static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; | |
44 static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; | |
45 static final String CONTENT_URL_PROP = "contentUrl"; | |
46 static final String ENCODING_FORMAT_PROP = "encodingFormat"; | |
47 static final String CAPTION_PROP = "caption"; | |
48 static final String REPRESENTATIVE_PROP = "representativeOfPage"; | |
49 static final String WIDTH_PROP = "width"; | |
50 static final String HEIGHT_PROP = "height"; | |
51 static final String DATE_PUBLISHED_PROP = "datePublished"; | |
52 static final String DATE_MODIFIED_PROP = "dateModified"; | |
53 static final String AUTHOR_PROP = "author"; | |
54 static final String CREATOR_PROP = "creator"; | |
55 static final String SECTION_PROP = "articleSection"; | |
56 static final String ASSOCIATED_MEDIA_PROP = "associatedMedia"; | |
57 static final String ENCODING_PROP = "encoding"; | |
58 static final String FAMILY_NAME_PROP = "familyName"; | |
59 static final String GIVEN_NAME_PROP = "givenName"; | |
60 static final String LEGAL_NAME_PROP = "legalName"; | |
61 static final String AUTHOR_REL = "author"; | |
62 | |
63 enum Type { // All these types are extended from Thing, directly or indirectly. | |
64 IMAGE, | |
65 ARTICLE, | |
66 PERSON, | |
67 ORGANIZATION, | |
68 UNSUPPORTED, | |
69 } | |
70 | |
71 static class ThingItem { | |
72 private final Type mType; | |
73 private final String[] mStringPropertyNames; | |
cjhopman
2014/04/25 20:52:34
How this all works together is rather confusing to
kuan
2014/04/29 00:23:10
Done. i was using map b4, but changed to arrays w
| |
74 private final String[][] mItemPropertyNames; | |
75 private final String[] mStringProperties; | |
76 private final ThingItem[] mItemProperties; | |
77 | |
78 // |stringPropertyNames| and |itemPropertyNames| are names of properties that this | |
cjhopman
2014/04/25 20:52:34
nit: Use javadoc comment format
kuan
2014/04/29 00:23:10
comments r not needed now.
| |
79 // ThingItem extracts from the page. | |
80 // @param stringPropertyNames is a String[] of property names whose valu es are String type. | |
81 // @param itemPropertyNames is a 2-dimensional array of String where: | |
82 // - 1st row: array of property names whose values are of extended-Thing Item type | |
83 // - 2nd row: array of property names whose values are to be retrieved f rom the | |
84 // corresponding extended-ThingItem object. | |
ThingItem(Type type, String[] stringPropertyNames, String[][] itemPropertyNames) {
    mType = type;

    // One value slot per declared string property, addressed by the same index as its name.
    mStringPropertyNames = stringPropertyNames;
    mStringProperties = new String[stringPropertyNames.length];

    // One embedded-item slot per row of |itemPropertyNames|.
    mItemPropertyNames = itemPropertyNames;
    mItemProperties = new ThingItem[itemPropertyNames.length];
}
92 | |
93 MarkupParser.Image getImage() { | |
94 return null; | |
95 } | |
96 | |
97 MarkupParser.Article getArticle() { | |
cjhopman
2014/04/25 20:52:34
Let's get rid of these functions that only make se
kuan
2014/04/29 00:23:10
Done. i was doing it this way to avoid casting Th
| |
98 return null; | |
99 } | |
100 | |
101 String getStringProperty(String name) { | |
102 // Check if property exists in |mStringProperties|. | |
103 for (int i = 0; i < mStringPropertyNames.length; i++) { | |
104 if (name.equals(mStringPropertyNames[i])) { | |
105 String value = mStringProperties[i]; | |
106 if (value != null && !value.isEmpty()) return value; | |
107 break; | |
108 } | |
109 } | |
110 // Otherwise, repeat for |mItemProperties|. | |
cjhopman
2014/04/25 20:52:34
This is weird that we go looking into the properti
kuan
2014/04/29 00:23:10
Done. author, publisher and copyright holder can
| |
111 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
112 if (!name.equals(mItemPropertyNames[i][0])) continue; | |
113 if (mItemProperties[i] != null) { | |
114 return mItemProperties[i].getStringProperty(mItemPropertyNam es[i][1]); | |
115 } | |
116 break; | |
117 } | |
118 return ""; | |
119 } | |
120 | |
121 final ThingItem getItemProperty(String name) { | |
122 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
123 if (name.equals(mItemPropertyNames[i][0])) return mItemPropertie s[i]; | |
124 } | |
125 return null; | |
126 } | |
127 | |
128 final Type getType() { return mType; } | |
129 | |
130 final boolean isSupported() { return mType != Type.UNSUPPORTED; } | |
131 | |
132 final boolean isImageRepresentativeOfPage() { | |
133 String value = getStringProperty(REPRESENTATIVE_PROP); | |
134 return value.equalsIgnoreCase("true"); | |
135 } | |
136 | |
137 // Store |value| for property with |name|. | |
138 // @param override: set to true to override the property's value, false to keep property's | |
139 // first non-empty value and ignore |value|. | |
140 final void putStringValue(String name, String value, boolean override) { | |
141 for (int i = 0; i < mStringPropertyNames.length; i++) { | |
142 if (name.equals(mStringPropertyNames[i])) { | |
143 String existing = override ? null : mStringProperties[i]; | |
144 if (existing == null || existing.isEmpty()) mStringPropertie s[i] = value; | |
145 break; | |
146 } | |
147 } | |
148 } | |
149 | |
150 // Store |value| for property with |name|, unless the property already h as a non-null value, | |
151 // in which case, |value| will be ignored. This means we only keep the first value. | |
152 final void putItemValue(String name, ThingItem value) { | |
153 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
154 if (name.equals(mItemPropertyNames[i][0])) { | |
155 if (mItemProperties[i] == null) mItemProperties[i] = value; | |
156 break; | |
157 } | |
158 } | |
159 } | |
160 } | |
161 | |
162 private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>(); | |
163 private String mAuthorFromRel = ""; | |
164 private static final Map<String, Type> sTypeUrls; | |
165 private static final Map<String, String[]> sTagAttributesMap; | |
166 private static final String[] sEmptyStringPropertyNames = { | |
167 // Intentionally empty, declared so that it's initialized statically. | |
168 }; | |
169 private static final String[][] sEmptyItemPropertyNames = { | |
170 // Intentionally empty, declared so that it's initialized statically. | |
171 }; | |
172 | |
173 static { | |
174 sTypeUrls = new HashMap<String, Type>(); | |
175 sTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE); | |
176 sTypeUrls.put("http://schema.org/Article", Type.ARTICLE); | |
177 sTypeUrls.put("http://schema.org/BlogPosting", Type.ARTICLE); | |
178 sTypeUrls.put("http://schema.org/NewsArticle", Type.ARTICLE); | |
179 sTypeUrls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE); | |
180 sTypeUrls.put("http://schema.org/TechArticle", Type.ARTICLE); | |
181 sTypeUrls.put("http://schema.org/Person", Type.PERSON); | |
182 sTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION); | |
183 sTypeUrls.put("http://schema.org/Corporation", Type.ORGANIZATION); | |
184 sTypeUrls.put("http://schema.org/EducationalOrganization", Type.ORGANIZA TION); | |
185 sTypeUrls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZAT ION); | |
186 sTypeUrls.put("http://schema.org/NGO", Type.ORGANIZATION); | |
187 | |
188 // The key for |sTagAttributesMap| is the tag name, while the entry valu e is an array of | |
189 // attributes in the specified tag from which to extract information: | |
190 // - 0th attribute: contains the value for the property specified in ite mprop | |
191 // - 1st attribute: if available, contains the value for the author prop erty. | |
192 sTagAttributesMap = new HashMap<String, String[]>(); | |
cjhopman
2014/04/25 20:52:34
nit: move this (and the sTagAttributesMap declarat
kuan
2014/04/29 00:23:10
Done.
| |
193 sTagAttributesMap.put("IMG", new String[] { "SRC" }); | |
194 sTagAttributesMap.put("AUDIO", new String[] { "SRC" }); | |
195 sTagAttributesMap.put("EMBED", new String[] { "SRC" }); | |
196 sTagAttributesMap.put("IFRAME", new String[] { "SRC" }); | |
197 sTagAttributesMap.put("SOURCE", new String[] { "SRC" }); | |
198 sTagAttributesMap.put("TRACK", new String[] { "SRC" }); | |
199 sTagAttributesMap.put("VIDEO", new String[] { "SRC" }); | |
200 sTagAttributesMap.put("A", new String[] { "HREF", "REL" }); | |
cjhopman
2014/04/25 20:52:34
I can't find documentation anywhere that says to u
kuan
2014/04/29 00:23:10
http://schema.org/author, or from http://schema.or
cjhopman
2014/04/29 17:04:19
That's interesting. Maybe you could add a comment
kuan
2014/04/29 23:26:43
Done. comment is in parse() where the rel attribu
| |
201 sTagAttributesMap.put("LINK", new String[] { "HREF", "REL" }); | |
202 sTagAttributesMap.put("AREA", new String[] { "HREF" }); | |
203 sTagAttributesMap.put("META", new String[] { "CONTENT" }); | |
204 sTagAttributesMap.put("TIME", new String[] { "DATETIME" }); | |
205 sTagAttributesMap.put("OBJECT", new String[] { "DATA" }); | |
206 sTagAttributesMap.put("DATA", new String[] { "VALUE" }); | |
207 sTagAttributesMap.put("METER", new String[] { "VALUE" }); | |
208 } | |
209 | |
210 /** | |
211 * The object that extracts and verifies Schema.org markup tags from |root|. | |
212 */ | |
213 public SchemaOrgParser(Element root) { | |
214 // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily? | |
215 // If parse lazily, all get* methods will need to check for parsed state and, if necessary, | |
216 // parse before returning the requested properties. | |
217 // Note that the <html> element can also be the start of a Schema.org item, and hence needs | |
218 // to be parsed. | |
219 parse(root, null); | |
220 } | |
221 | |
222 // Returns the first item that has the requested property value. | |
cjhopman
2014/04/25 20:52:34
I don't think that this is something we would ever
kuan
2014/04/29 00:23:10
Done. Image also has headline, but i assume we on
| |
223 String findStringProperty(String name) { | |
224 if (mItemScopes.isEmpty()) return ""; | |
225 for (int i = 0; i < mItemScopes.size(); i++) { | |
226 String value = mItemScopes.get(i).getStringProperty(name); | |
227 if (!value.isEmpty()) return value; | |
228 } | |
229 return ""; | |
230 } | |
231 | |
232 ThingItem findFirstArticle() { | |
cjhopman
2014/04/25 20:52:34
This should return an ArticleItem
kuan
2014/04/29 00:23:10
Done.
| |
233 for (int i = 0; i < mItemScopes.size(); i++) { | |
234 ThingItem item = mItemScopes.get(i); | |
235 if (item.mType == Type.ARTICLE) return item; | |
236 } | |
237 return null; | |
238 } | |
239 | |
240 final List<ThingItem> getItemScopes() { return mItemScopes; } | |
241 | |
242 final String getAuthorFromRel() { return mAuthorFromRel; } | |
243 | |
244 static String concat(String first, String second) { | |
245 String concat = first; | |
246 if (!concat.isEmpty() && !second.isEmpty()) concat += " "; | |
247 concat += second; | |
248 return concat; | |
249 } | |
250 | |
251 private void parse(Element e, ThingItem parentItem) { | |
252 ThingItem newItem = null; | |
253 boolean isItemScope = isItemScope(e); | |
254 // A non-null |parentItem| means we're currently parsing the elements fo r a schema.org type. | |
255 String[] propertyNames = parentItem != null ? getItemProp(e) : new Strin g[0]; | |
256 | |
257 if (isItemScope) { | |
258 // The "itemscope" and "itemtype" attributes of |e| indicate the sta rt of an item. | |
259 // Create the corresponding extended-ThingItem, and add it to the li st if: | |
260 // 1) its type is supported, and | |
261 // 2) if the parent is an unsupported type, it's not an "itemprop" a ttribute of the | |
262 // parent, based on the rule that an item is a top-level item if its element doesn't | |
263 // have an itemprop attribute. | |
264 newItem = createItemForElement(e); | |
265 if (newItem != null && newItem.isSupported() && | |
266 (parentItem == null || parentItem.isSupported() || propertyNames .length == 0)) { | |
267 mItemScopes.add(newItem); | |
268 } | |
269 } | |
270 | |
271 // If parent is a supported type, parse the element for >= 1 properties in "itemprop" | |
272 // attribute. | |
273 if (propertyNames.length > 0 && parentItem.isSupported() && | |
274 (newItem == null || newItem.isSupported())) { | |
275 for (int i = 0; i < propertyNames.length; i++) { | |
276 // If a new item was created above, the property value of this " itemprop" attribute | |
277 // is an embedded item, so add it to the parent item. | |
278 if (newItem != null) { | |
279 parentItem.putItemValue(propertyNames[i], newItem); | |
280 } else { | |
281 // Otherwise, extract the property value from the tag itself, and add it to the | |
282 // parent item. | |
283 parentItem.putStringValue(propertyNames[i], getPropertyValue( e), false); | |
284 } | |
285 } | |
286 } | |
287 | |
288 // If <a> or <link> tags specify rel="author", extract it. | |
289 if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute (e); | |
290 | |
291 // Now, parse each child element recursively. | |
292 NodeList<Node> children = e.getChildNodes(); | |
293 for (int i = 0; i < children.getLength(); i++) { | |
294 Node child = children.getItem(i); | |
295 if (child.getNodeType() != Node.ELEMENT_NODE) continue; | |
296 parse(Element.as(child), newItem != null ? newItem : parentItem); | |
297 } | |
298 } | |
299 | |
300 private Type getItemType(Element e) { | |
301 // "itemtype" attribute is case-sensitive. | |
302 String type = e.getAttribute("ITEMTYPE"); | |
303 return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPOR TED; | |
304 } | |
305 | |
306 private ThingItem createItemForElement(Element e) { | |
307 ThingItem newItem = null; | |
308 Type type = getItemType(e); | |
309 switch (type) { | |
310 case IMAGE: | |
311 newItem = new ImageItem(); | |
312 break; | |
313 case ARTICLE: | |
314 newItem = new ArticleItem(); | |
315 break; | |
316 case PERSON: | |
317 newItem = new PersonItem(); | |
318 break; | |
319 case ORGANIZATION: | |
320 newItem = new OrganizationItem(); | |
321 break; | |
322 case UNSUPPORTED: | |
323 newItem = new UnsupportedItem(); | |
324 break; | |
325 default: | |
326 return null; | |
327 } | |
328 return newItem; | |
329 } | |
330 | |
331 private static class ImageItem extends ThingItem { | |
332 private static final String[] sStringPropertyNames = { | |
333 NAME_PROP, | |
334 URL_PROP, | |
335 DESCRIPTION_PROP, | |
336 IMAGE_PROP, | |
337 HEADLINE_PROP, | |
338 PUBLISHER_PROP, | |
339 COPYRIGHT_HOLDER_PROP, | |
340 COPYRIGHT_YEAR_PROP, | |
341 CONTENT_URL_PROP, | |
342 ENCODING_FORMAT_PROP, | |
343 CAPTION_PROP, | |
344 REPRESENTATIVE_PROP, | |
345 WIDTH_PROP, | |
346 HEIGHT_PROP, | |
347 }; | |
348 | |
349 private static final String[][] sItemPropertyNames = { | |
350 new String[] { PUBLISHER_PROP, NAME_PROP }, | |
351 new String[] { COPYRIGHT_HOLDER_PROP, NAME_PROP }, | |
352 }; | |
353 | |
354 ImageItem() { | |
355 super(Type.IMAGE, sStringPropertyNames, sItemPropertyNames); | |
356 } | |
357 | |
358 @Override | |
359 String getStringProperty(String propertyName) { | |
360 if (!propertyName.equals(CONTENT_URL_PROP) && !propertyName.equals(U RL_PROP)) { | |
361 return super.getStringProperty(propertyName); | |
362 } | |
363 // Returns either the value of CONTENT_URL_PROP or URL_PROP. | |
364 String url = super.getStringProperty(CONTENT_URL_PROP); | |
365 return url.isEmpty() ? super.getStringProperty(URL_PROP) : url; | |
366 } | |
367 | |
368 @Override | |
369 MarkupParser.Image getImage() { | |
370 MarkupParser.Image image = new MarkupParser.Image(); | |
371 image.image = getStringProperty(CONTENT_URL_PROP); | |
372 image.url = image.image; | |
373 image.type = getStringProperty(ENCODING_FORMAT_PROP); | |
374 image.caption = getStringProperty(CAPTION_PROP); | |
375 try { | |
376 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 ); | |
377 } catch (Exception e) { | |
378 } | |
379 try { | |
380 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10); | |
381 } catch (Exception e) { | |
382 } | |
383 return image; | |
384 } | |
385 } | |
386 | |
387 private static class ArticleItem extends ThingItem { | |
388 private static final String[] sStringPropertyNames = { | |
389 NAME_PROP, | |
390 URL_PROP, | |
391 DESCRIPTION_PROP, | |
392 IMAGE_PROP, | |
393 HEADLINE_PROP, | |
394 PUBLISHER_PROP, | |
395 COPYRIGHT_HOLDER_PROP, | |
396 COPYRIGHT_YEAR_PROP, | |
397 DATE_MODIFIED_PROP, | |
398 DATE_PUBLISHED_PROP, | |
399 AUTHOR_PROP, | |
400 CREATOR_PROP, | |
401 SECTION_PROP, | |
402 }; | |
403 | |
404 private static final String[][] sItemPropertyNames = { | |
405 new String[] { PUBLISHER_PROP, NAME_PROP }, | |
406 new String[] { COPYRIGHT_HOLDER_PROP, NAME_PROP }, | |
407 new String[] { AUTHOR_PROP, NAME_PROP }, | |
408 new String[] { CREATOR_PROP, NAME_PROP }, | |
409 new String[] { ASSOCIATED_MEDIA_PROP, CONTENT_URL_PROP }, | |
410 new String[] { ENCODING_PROP, CONTENT_URL_PROP }, | |
411 }; | |
412 | |
413 ArticleItem() { | |
414 super(Type.ARTICLE, sStringPropertyNames, sItemPropertyNames); | |
415 } | |
416 | |
417 @Override | |
418 MarkupParser.Image getImage() { | |
419 // If "associatedMedia" or "encoding" property exists, set the "repr esentativeOfPage" | |
420 // property of the corresponding ImageItem to "true", so that that i mage (which will | |
421 // be picked up when looping through |mItemScopes|) will be the domi nant one. | |
422 ThingItem imageItem = getItemProperty(ASSOCIATED_MEDIA_PROP); | |
423 if (imageItem == null) imageItem = getItemProperty(ENCODING_PROP); | |
424 if (imageItem != null) { | |
425 imageItem.putStringValue(REPRESENTATIVE_PROP, "true", true); | |
cjhopman
2014/04/25 20:52:34
This is strange, and only seems to work if I do ge
kuan
2014/04/29 00:23:10
Done. i've removed setting representativeOfPage a
| |
426 return null; | |
427 } | |
428 | |
429 // Use value of IMAGE_PROP to create a MarkupParser.Image. | |
430 String imageUrl = getStringProperty(IMAGE_PROP); | |
431 if (imageUrl.isEmpty()) return null; | |
432 MarkupParser.Image image = new MarkupParser.Image(); | |
433 image.image = imageUrl; | |
434 image.url = imageUrl; | |
435 return image; | |
436 } | |
437 | |
438 @Override | |
439 MarkupParser.Article getArticle() { | |
440 MarkupParser.Article article = new MarkupParser.Article(); | |
441 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); | |
442 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); | |
443 article.section = getStringProperty(SECTION_PROP); | |
444 String author = getStringProperty(AUTHOR_PROP); | |
445 if (author.isEmpty()) author = getStringProperty(CREATOR_PROP); | |
446 article.authors = author.isEmpty() ? new String[0] : new String[] { author }; | |
447 return article; | |
448 } | |
449 } | |
450 | |
451 private static class PersonItem extends ThingItem { | |
452 private static final String[] sStringPropertyNames = { | |
453 NAME_PROP, | |
454 URL_PROP, | |
455 DESCRIPTION_PROP, | |
456 IMAGE_PROP, | |
457 FAMILY_NAME_PROP, | |
458 GIVEN_NAME_PROP, | |
459 }; | |
460 | |
461 PersonItem() { | |
462 super(Type.PERSON, sStringPropertyNames, sEmptyItemPropertyNames); | |
463 } | |
464 | |
465 @Override | |
466 String getStringProperty(String propertyName) { | |
cjhopman
2014/04/25 20:52:34
Overriding getStringProperty like this is a little
kuan
2014/04/29 00:23:10
Done. i've made getStringProperty final, but impl
| |
467 if (!propertyName.equals(NAME_PROP)) return super.getStringProperty( propertyName); | |
468 // Returns either the value of NAME_PROP, or concatenated values of GIVEN_NAME_PROP and | |
469 // FAMILY_NAME_PROP delimited by a whitespace. | |
470 String fullname = super.getStringProperty(NAME_PROP); | |
471 if (fullname.isEmpty()) { | |
472 fullname = concat(super.getStringProperty(GIVEN_NAME_PROP), | |
473 super.getStringProperty(FAMILY_NAME_PROP)); | |
474 } | |
475 return fullname; | |
476 } | |
477 } | |
478 | |
479 private static class OrganizationItem extends ThingItem { | |
480 private static final String[] sStringPropertyNames = { | |
481 NAME_PROP, | |
482 URL_PROP, | |
483 DESCRIPTION_PROP, | |
484 IMAGE_PROP, | |
485 LEGAL_NAME_PROP, | |
486 }; | |
487 | |
488 OrganizationItem() { | |
489 super(Type.ORGANIZATION, sStringPropertyNames, sEmptyItemPropertyNam es); | |
490 } | |
491 | |
492 @Override | |
493 String getStringProperty(String propertyName) { | |
494 if (!propertyName.equals(NAME_PROP)) return super.getStringProperty( propertyName); | |
495 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP. | |
496 String name = super.getStringProperty(NAME_PROP); | |
497 if (name.isEmpty()) name = super.getStringProperty(LEGAL_NAME_PROP); | |
498 return name; | |
499 } | |
500 } | |
501 | |
502 private static class UnsupportedItem extends ThingItem { | |
503 UnsupportedItem(){ | |
504 super(Type.UNSUPPORTED, sEmptyStringPropertyNames, sEmptyItemPropert yNames); | |
505 } | |
506 } | |
507 | |
508 private static boolean isItemScope(Element e) { | |
509 return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE"); | |
510 } | |
511 | |
512 private static String[] getItemProp(Element e) { | |
513 // "itemprop" attribute is case-sensitive, and can have multiple propert ies. | |
514 String itemprop = e.getAttribute("ITEMPROP"); | |
515 if (itemprop.isEmpty()) return new String[0]; | |
516 String[] splits = StringUtil.split(itemprop, "\\s+"); | |
517 return splits.length > 0 ? splits : new String[] { itemprop }; | |
518 } | |
519 | |
520 // Extracts the property value from |e|. For some tags, the value is a spec ific attribute, | |
521 // while for others, it's the text between the start and end tags. | |
522 private static String getPropertyValue(Element e) { | |
523 String value = ""; | |
524 String tagName = e.getTagName(); | |
525 if (sTagAttributesMap.containsKey(tagName)) { | |
526 value = e.getAttribute(sTagAttributesMap.get(tagName)[0]); | |
527 } | |
528 if (value.isEmpty()) value = e.getInnerText(); | |
529 return value; | |
530 } | |
531 | |
532 // Extracts the author property from |e|'s "rel=author" attribute. | |
533 private static String getAuthorFromRelAttribute(Element e) { | |
534 String author = ""; | |
535 String tagName = e.getTagName(); | |
536 if (sTagAttributesMap.containsKey(tagName)) { | |
537 String[] attrs = sTagAttributesMap.get(tagName); | |
538 if (attrs.length > 1 && e.getAttribute(attrs[1]).equals(AUTHOR_REL)) { | |
539 author = e.getInnerText(); | |
540 } | |
541 } | |
542 return author; | |
543 } | |
544 } | |
OLD | NEW |