OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 package com.dom_distiller.client; | |
6 | |
7 import java.util.ArrayList; | |
8 import java.util.HashMap; | |
9 import java.util.List; | |
10 import java.util.Map; | |
11 | |
12 import com.google.gwt.dom.client.AnchorElement; | |
13 import com.google.gwt.dom.client.Element; | |
14 import com.google.gwt.dom.client.ImageElement; | |
15 import com.google.gwt.dom.client.MetaElement; | |
16 import com.google.gwt.dom.client.Node; | |
17 import com.google.gwt.dom.client.NodeList; | |
18 | |
19 /** | |
20 * This class recognizes and parses schema.org markup tags, and returns the properties that matter | |
21 * to distilled content. | |
22 * Schema.org markup (http://schema.org) is based on the microdata format | |
23 * (http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html). | |
24 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image. | |
25 * In addition, for each type that we support, we also parse more specific properties: | |
26 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published, | |
27 * date modified, author, article section | |
28 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url, | |
29 * encoding format, caption, representative of page, width, height | |
30 * - Person: family name, given name | |
31 * - Organization: legal name. | |
32 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or | |
33 * publisher of article or publisher of image could be a Schema.Org Person or Organization type; | |
34 * in fact, this is the reason we support Person and Organization types. | |
35 */ | |
36 public class SchemaOrgParser implements MarkupParser.Parser { | |
cjhopman
2014/04/21 16:52:22
Can we split this class into two parts:
1. A Sche
kuan
2014/04/23 15:32:36
Done.
| |
37 private static final String NAME_PROP = "name"; | |
38 private static final String URL_PROP = "url"; | |
39 private static final String DESCRIPTION_PROP = "description"; | |
40 private static final String IMAGE_PROP = "image"; | |
41 private static final String HEADLINE_PROP = "headline"; | |
42 private static final String PUBLISHER_PROP = "publisher"; | |
43 private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder"; | |
44 private static final String COPYRIGHT_YEAR_PROP = "copyrightYear"; | |
45 private static final String CONTENT_URL_PROP = "contentUrl"; | |
46 private static final String ENCODING_FORMAT_PROP = "encodingFormat"; | |
47 private static final String CAPTION_PROP = "caption"; | |
48 private static final String REPRESENTATIVE_PROP = "representativeOfPage"; | |
49 private static final String WIDTH_PROP = "width"; | |
50 private static final String HEIGHT_PROP = "height"; | |
51 private static final String DATE_PUBLISHED_PROP = "datePublished"; | |
52 private static final String DATE_MODIFIED_PROP = "dateModified"; | |
53 private static final String AUTHOR_PROP = "author"; | |
54 private static final String SECTION_PROP = "articleSection"; | |
55 private static final String FAMILY_NAME_PROP = "familyName"; | |
56 private static final String GIVEN_NAME_PROP = "givenName"; | |
57 private static final String LEGAL_NAME_PROP = "legalName"; | |
58 private static final String AUTHOR_REL = "author"; | |
59 | |
60 private enum Type { // All these types are extended from Thing, directly or indirectly. | |
61 IMAGE, | |
62 ARTICLE, | |
63 PERSON, | |
64 ORGANIZATION, | |
65 UNSUPPORTED, | |
66 } | |
67 | |
68 private static class ThingItem { | |
69 private final Type mType; | |
70 private final String[] mStringPropertyNames; | |
71 private final String[] mItemPropertyNames; | |
72 private final String[] mStringProperties; | |
73 private final ThingItem[] mItemProperties; | |
74 | |
75 ThingItem(Type type, String[] stringPropertyNames, String[] itemProperty Names) { | |
76 mType = type; | |
77 mStringPropertyNames = stringPropertyNames; | |
78 mItemPropertyNames = itemPropertyNames; | |
79 mStringProperties = new String[mStringPropertyNames.length]; | |
80 mItemProperties = new ThingItem[mItemPropertyNames.length]; | |
81 } | |
82 | |
83 String toStringProperty() { | |
84 return ""; | |
85 } | |
86 | |
87 MarkupParser.Image getImage() { | |
88 // Use value of IMAGE_PROP to create a MarkupParser.Image. | |
89 String imageUrl = getStringProperty(IMAGE_PROP); | |
90 if (imageUrl.isEmpty()) return null; | |
91 MarkupParser.Image image = new MarkupParser.Image(); | |
92 image.image = imageUrl; | |
93 image.url = imageUrl; | |
94 return image; | |
95 } | |
96 | |
97 MarkupParser.Article getArticle() { | |
98 return null; | |
99 } | |
100 | |
101 final boolean isSupported() { return mType != Type.UNSUPPORTED; } | |
102 | |
103 final boolean isImageRepresentativeOfPage() { | |
104 String value = getStringProperty(REPRESENTATIVE_PROP); | |
105 return value.equalsIgnoreCase("true"); | |
106 } | |
107 | |
108 // Store |value| for property with |name|, unless the property already h as a non-empty | |
109 // value, in which case, |value| will be ignored. This means we only ke ep the first value. | |
110 final void putStringValue(String name, String value) { | |
111 for (int i = 0; i < mStringPropertyNames.length; i++) { | |
112 if (name.equals(mStringPropertyNames[i])) { | |
113 String existing = mStringProperties[i]; | |
114 if (existing == null || existing.isEmpty()) mStringPropertie s[i] = value; | |
115 break; | |
116 } | |
117 } | |
118 } | |
119 | |
120 // Store |value| for property with |name|, unless the property already h as a non-null value, | |
121 // in which case, |value| will be ignored. This means we only keep the first value. | |
122 final void putItemValue(String name, ThingItem value) { | |
123 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
124 if (name.equals(mItemPropertyNames[i])) { | |
125 if (mItemProperties[i] == null) mItemProperties[i] = value; | |
126 break; | |
127 } | |
128 } | |
129 } | |
130 | |
131 final String getStringProperty(String name) { | |
132 // Check if property exists in |mStringProperties|. | |
133 for (int i = 0; i < mStringPropertyNames.length; i++) { | |
134 if (name.equals(mStringPropertyNames[i])) { | |
135 String value = mStringProperties[i]; | |
136 if (value != null && !value.isEmpty()) return value; | |
137 break; | |
138 } | |
139 } | |
140 // Otherwise, repeat for |mItemProperties|. | |
141 for (int i = 0; i < mItemPropertyNames.length; i++) { | |
142 if (!name.equals(mItemPropertyNames[i])) continue; | |
143 if (mItemProperties[i] != null) return mItemProperties[i].toStri ngProperty(); | |
144 break; | |
145 } | |
146 return ""; | |
147 } | |
148 } | |
149 | |
150 private final List<ThingItem> mItemScopes = new ArrayList<ThingItem>(); | |
151 private String mAuthorFromRel = ""; | |
152 private static final Map<String, Type> sTypeUrls; | |
153 private static final Map<String, String[]> sTagAttributesMap; | |
154 private static final String[] sEmptyPropertyNames = { | |
155 // Intentionally empty, declared so that it's initialized statically. | |
156 }; | |
157 | |
static {
    // The "itemtype" value is looked up verbatim, so every URL that should be recognized has
    // to be listed.  schema.org subtypes inherit all properties of their parent type, so the
    // common subtypes of Article and Organization are parsed exactly like their parent.
    sTypeUrls = new HashMap<String, Type>();
    sTypeUrls.put("http://schema.org/ImageObject", Type.IMAGE);
    sTypeUrls.put("http://schema.org/Article", Type.ARTICLE);
    sTypeUrls.put("http://schema.org/BlogPosting", Type.ARTICLE);
    sTypeUrls.put("http://schema.org/NewsArticle", Type.ARTICLE);
    sTypeUrls.put("http://schema.org/ScholarlyArticle", Type.ARTICLE);
    sTypeUrls.put("http://schema.org/TechArticle", Type.ARTICLE);
    sTypeUrls.put("http://schema.org/Person", Type.PERSON);
    sTypeUrls.put("http://schema.org/Organization", Type.ORGANIZATION);
    sTypeUrls.put("http://schema.org/Corporation", Type.ORGANIZATION);
    sTypeUrls.put("http://schema.org/EducationalOrganization", Type.ORGANIZATION);
    sTypeUrls.put("http://schema.org/GovernmentOrganization", Type.ORGANIZATION);
    sTypeUrls.put("http://schema.org/NGO", Type.ORGANIZATION);

    // The key for |sTagAttributesMap| is the tag name, while the entry value is an array of
    // attributes in the specified tag from which to extract information:
    // - 0th attribute: contains the value for the property specified in itemprop
    // - 1st attribute: if available, the attribute that is compared against "author"; when it
    //   matches, the element's inner text is taken as the author.
    sTagAttributesMap = new HashMap<String, String[]>();
    sTagAttributesMap.put("IMG", new String[] { "SRC" });
    sTagAttributesMap.put("AUDIO", new String[] { "SRC" });
    sTagAttributesMap.put("EMBED", new String[] { "SRC" });
    sTagAttributesMap.put("IFRAME", new String[] { "SRC" });
    sTagAttributesMap.put("SOURCE", new String[] { "SRC" });
    sTagAttributesMap.put("TRACK", new String[] { "SRC" });
    sTagAttributesMap.put("VIDEO", new String[] { "SRC" });
    sTagAttributesMap.put("A", new String[] { "HREF", "REL" });
    sTagAttributesMap.put("LINK", new String[] { "HREF", "REL" });
    sTagAttributesMap.put("AREA", new String[] { "HREF" });
    sTagAttributesMap.put("META", new String[] { "CONTENT" });
    sTagAttributesMap.put("TIME", new String[] { "DATETIME" });
    sTagAttributesMap.put("OBJECT", new String[] { "DATA" });
    sTagAttributesMap.put("DATA", new String[] { "VALUE" });
    sTagAttributesMap.put("METER", new String[] { "VALUE" });
}
186 | |
/**
 * The object that extracts and verifies Schema.org markup tags from |root|.  All parsing
 * happens here, eagerly; the get* methods only read what was collected.
 */
public SchemaOrgParser(Element root) {
    // TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?
    // If parse lazily, all get* methods will need to check for parsed state and, if necessary,
    // parse before returning the requested properties.
    // Note that the <html> element can also be the start of a Schema.org item, and hence needs
    // to be parsed.
    parse(root, null);
}
198 | |
199 @Override | |
200 public String getTitle() { | |
201 String title = findStringProperty(HEADLINE_PROP); | |
202 if (title.isEmpty()) title = findStringProperty(NAME_PROP); | |
cjhopman
2014/04/21 16:52:22
This seems to mean that we will get the first item
kuan
2014/04/23 15:32:36
Done.
| |
203 return title; | |
204 } | |
205 | |
206 @Override | |
207 public String getType() { | |
208 if (mItemScopes.isEmpty()) return ""; | |
209 // Assume the type of the first item is the page type. | |
210 return mItemScopes.get(0).mType.toString(); | |
cjhopman
2014/04/21 16:52:22
Does schema.org specify that this is a good way to
kuan
2014/04/23 15:32:36
Done. for the record, as per our off-line discuss
| |
211 } | |
212 | |
213 @Override | |
214 public String getUrl() { | |
215 return findStringProperty(URL_PROP); | |
cjhopman
2014/04/21 16:52:22
I don't really understand what we expect the url f
kuan
2014/04/23 15:32:36
Done.
| |
216 } | |
217 | |
218 @Override | |
219 public MarkupParser.Image[] getImages() { | |
cjhopman
2014/04/21 16:52:22
We should be careful about what this returns, we o
kuan
2014/04/23 15:32:36
Done.
per our discussion off-line, i won't impl t
| |
220 if (mItemScopes.isEmpty()) return null; | |
221 List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>(); | |
222 for (int i = 0; i < mItemScopes.size(); i++) { | |
223 ThingItem item = mItemScopes.get(i); | |
224 MarkupParser.Image image = item.getImage(); | |
225 if (image != null) { | |
226 if (item.isImageRepresentativeOfPage()) { | |
227 // Image should be the dominant, i.e. first, one. | |
228 images.add(0, image); | |
229 } else { | |
230 images.add(image); | |
231 } | |
232 } | |
233 } | |
234 if (images.isEmpty()) return null; | |
235 return images.toArray(new MarkupParser.Image[images.size()]); | |
236 } | |
237 | |
238 @Override | |
239 public String getDescription() { | |
240 return findStringProperty(DESCRIPTION_PROP); | |
cjhopman
2014/04/21 16:52:22
Again, this should probably only be the descriptio
kuan
2014/04/23 15:32:36
Done.
| |
241 } | |
242 | |
243 @Override | |
244 public String getPublisher() { | |
245 return findStringProperty(PUBLISHER_PROP); | |
cjhopman
2014/04/21 16:52:22
I think we would only want the publisher property
kuan
2014/04/23 15:32:36
Done.
| |
246 } | |
247 | |
248 @Override | |
249 public String getCopyright() { | |
250 if (mItemScopes.isEmpty()) return ""; | |
251 // Returns a concatenated string of copyright year and copyright holder of the first item | |
252 // that has these properties, delimited by a whitespace. | |
253 String copyright = ""; | |
254 for (int i = 0; i < mItemScopes.size() && copyright.isEmpty(); i++) { | |
255 ThingItem item = mItemScopes.get(i); | |
256 copyright = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP), | |
257 item.getStringProperty(COPYRIGHT_HOLDER_PROP)); | |
258 } | |
259 return copyright.isEmpty() ? copyright : "Copyright " + copyright; | |
260 } | |
261 | |
262 @Override | |
263 public String getAuthor() { | |
264 String author = findStringProperty(AUTHOR_PROP); | |
265 return author.isEmpty() ? mAuthorFromRel : author; | |
266 } | |
267 | |
268 @Override | |
269 public MarkupParser.Article getArticle() { | |
270 if (mItemScopes.isEmpty()) return null; | |
271 // Returns the first article. | |
272 MarkupParser.Article article = null; | |
273 for (int i = 0; i < mItemScopes.size() && article == null; i++) { | |
274 article = mItemScopes.get(i).getArticle(); | |
275 } | |
276 return article; | |
277 } | |
278 | |
279 @Override | |
280 public boolean optOut() { | |
281 return false; | |
282 } | |
283 | |
284 private void parse(Element e, ThingItem parentItem) { | |
285 ThingItem newItem = null; | |
286 boolean isItemScope = isItemScope(e); | |
287 // A non-null |parentItem| means we're currently parsing the elements fo r a schema.org type. | |
288 String[] propertyNames = parentItem != null ? getItemProp(e) : new Strin g[0]; | |
289 | |
290 if (isItemScope) { | |
291 // The "itemscope" and "itemtype" attributes of |e| indicate the sta rt of an item. | |
292 // Create the corresponding extended-ThingItem, and add it to the li st if: | |
293 // 1) its type is supported, and | |
294 // 2) if the parent is an unsupported type, it's not an "itemprop" a ttribute of the | |
295 // parent, based on the rule that an item is a top-level item if its element doesn't | |
296 // have an itemprop attribute. | |
297 newItem = createItemForElement(e); | |
298 if (newItem != null && newItem.isSupported() && | |
299 (parentItem == null || parentItem.isSupported() || propertyNames .length == 0)) { | |
300 mItemScopes.add(newItem); | |
301 } | |
302 } | |
303 | |
304 // If parent is a supported type, parse the element for >= 1 properties in "itemprop" | |
305 // attribute. | |
306 if (propertyNames.length > 0 && parentItem.isSupported() && | |
307 (newItem == null || newItem.isSupported())) { | |
308 for (int i = 0; i < propertyNames.length; i++) { | |
309 // If a new item was created above, the property value of this " itemprop" attribute | |
310 // is an embedded item, so add it to the parent item. | |
311 if (newItem != null) { | |
312 parentItem.putItemValue(propertyNames[i], newItem); | |
313 } else { | |
314 // Otherwise, extract the property value from the tag itself, and add it to the | |
315 // parent item. | |
316 parentItem.putStringValue(propertyNames[i], getPropertyValue( e)); | |
317 } | |
318 } | |
319 } | |
320 | |
321 // If <a> or <link> tags specify rel="author", extract it. | |
cjhopman
2014/04/25 20:52:34
Where does this rel="author" stuff come from? I ca
| |
322 if (mAuthorFromRel.isEmpty()) mAuthorFromRel = getAuthorFromRelAttribute (e); | |
323 | |
324 // Now, parse each child element recursively. | |
325 NodeList<Node> children = e.getChildNodes(); | |
326 for (int i = 0; i < children.getLength(); i++) { | |
327 Node child = children.getItem(i); | |
328 if (child.getNodeType() != Node.ELEMENT_NODE) continue; | |
329 parse(Element.as(child), newItem != null ? newItem : parentItem); | |
330 } | |
331 } | |
332 | |
333 private Type getItemType(Element e) { | |
334 // "itemtype" attribute is case-sensitive. | |
335 String type = e.getAttribute("ITEMTYPE"); | |
336 return sTypeUrls.containsKey(type) ? sTypeUrls.get(type) : Type.UNSUPPOR TED; | |
337 } | |
338 | |
339 private ThingItem createItemForElement(Element e) { | |
340 ThingItem newItem = null; | |
341 Type type = getItemType(e); | |
342 switch (type) { | |
343 case IMAGE: | |
344 newItem = new ImageItem(); | |
345 break; | |
346 case ARTICLE: | |
347 newItem = new ArticleItem(); | |
348 break; | |
349 case PERSON: | |
350 newItem = new PersonItem(); | |
351 break; | |
352 case ORGANIZATION: | |
353 newItem = new OrganizationItem(); | |
354 break; | |
355 case UNSUPPORTED: | |
356 newItem = new UnsupportedItem(); | |
357 break; | |
358 default: | |
359 return null; | |
360 } | |
361 return newItem; | |
362 } | |
363 | |
364 // Returns the first item that has the requested property value. | |
365 private String findStringProperty(String name) { | |
366 if (mItemScopes.isEmpty()) return ""; | |
367 for (int i = 0; i < mItemScopes.size(); i++) { | |
368 String value = mItemScopes.get(i).getStringProperty(name); | |
369 if (!value.isEmpty()) return value; | |
370 } | |
371 return ""; | |
372 } | |
373 | |
374 private static class ImageItem extends ThingItem { | |
375 private static final String[] sStringPropertyNames = { | |
376 NAME_PROP, | |
377 URL_PROP, | |
378 DESCRIPTION_PROP, | |
379 IMAGE_PROP, | |
380 HEADLINE_PROP, | |
381 PUBLISHER_PROP, | |
382 COPYRIGHT_HOLDER_PROP, | |
383 COPYRIGHT_YEAR_PROP, | |
384 CONTENT_URL_PROP, | |
385 ENCODING_FORMAT_PROP, | |
386 CAPTION_PROP, | |
387 REPRESENTATIVE_PROP, | |
388 WIDTH_PROP, | |
389 HEIGHT_PROP, | |
390 }; | |
391 | |
392 private static final String[] sItemPropertyNames = { | |
393 PUBLISHER_PROP, | |
394 COPYRIGHT_HOLDER_PROP, | |
395 }; | |
396 | |
397 ImageItem() { | |
398 super(Type.IMAGE, sStringPropertyNames, sItemPropertyNames); | |
399 } | |
400 | |
401 @Override | |
402 MarkupParser.Image getImage() { | |
403 MarkupParser.Image image = new MarkupParser.Image(); | |
404 String url = getStringProperty(CONTENT_URL_PROP); | |
405 image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP); | |
406 image.url = image.image; | |
407 image.type = getStringProperty(ENCODING_FORMAT_PROP); | |
408 image.caption = getStringProperty(CAPTION_PROP); | |
409 try { | |
410 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 ); | |
411 } catch (Exception e) { | |
412 } | |
413 try { | |
414 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10); | |
415 } catch (Exception e) { | |
416 } | |
417 return image; | |
418 } | |
419 } | |
420 | |
421 private static class ArticleItem extends ThingItem { | |
422 private static final String[] sStringPropertyNames = { | |
423 NAME_PROP, | |
424 URL_PROP, | |
425 DESCRIPTION_PROP, | |
426 IMAGE_PROP, | |
427 HEADLINE_PROP, | |
428 PUBLISHER_PROP, | |
429 COPYRIGHT_HOLDER_PROP, | |
430 COPYRIGHT_YEAR_PROP, | |
431 DATE_MODIFIED_PROP, | |
432 DATE_PUBLISHED_PROP, | |
433 AUTHOR_PROP, | |
434 SECTION_PROP, | |
435 }; | |
436 | |
437 private static final String[] sItemPropertyNames = { | |
438 PUBLISHER_PROP, | |
439 COPYRIGHT_HOLDER_PROP, | |
440 AUTHOR_PROP, | |
441 }; | |
442 | |
443 ArticleItem() { | |
444 super(Type.ARTICLE, sStringPropertyNames, sItemPropertyNames); | |
445 } | |
446 | |
447 @Override | |
448 MarkupParser.Article getArticle() { | |
449 MarkupParser.Article article = new MarkupParser.Article(); | |
450 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP); | |
451 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP); | |
452 article.section = getStringProperty(SECTION_PROP); | |
453 String author = getStringProperty(AUTHOR_PROP); | |
454 article.authors = author.isEmpty() ? new String[0] : new String[] { author }; | |
455 return article; | |
456 } | |
457 } | |
458 | |
459 private static class PersonItem extends ThingItem { | |
460 private static final String[] sStringPropertyNames = { | |
461 NAME_PROP, | |
462 URL_PROP, | |
463 DESCRIPTION_PROP, | |
464 IMAGE_PROP, | |
465 FAMILY_NAME_PROP, | |
466 GIVEN_NAME_PROP, | |
467 }; | |
468 | |
469 PersonItem() { | |
470 super(Type.PERSON, sStringPropertyNames, sEmptyPropertyNames); | |
471 } | |
472 | |
473 // Returns either the value of NAME_PROP, or concatenated values of GIVE N_NAME_PROP and | |
474 // FAILY_NAME_PROP delimited by a whitespace. | |
cjhopman
2014/04/21 16:52:22
s/FAILY/FAMILY
kuan
2014/04/23 15:32:36
Done.
| |
475 @Override | |
476 String toStringProperty() { | |
477 String fullname = getStringProperty(NAME_PROP); | |
478 if (fullname.isEmpty()) { | |
479 fullname = concat(getStringProperty(GIVEN_NAME_PROP), | |
480 getStringProperty(FAMILY_NAME_PROP)); | |
481 } | |
482 return fullname; | |
483 } | |
484 } | |
485 | |
486 private static class OrganizationItem extends ThingItem { | |
487 private static final String[] sStringPropertyNames = { | |
488 NAME_PROP, | |
489 URL_PROP, | |
490 DESCRIPTION_PROP, | |
491 IMAGE_PROP, | |
492 LEGAL_NAME_PROP, | |
493 }; | |
494 | |
495 OrganizationItem() { | |
496 super(Type.ORGANIZATION, sStringPropertyNames, sEmptyPropertyNames); | |
497 } | |
498 | |
499 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP. | |
500 @Override | |
501 String toStringProperty() { | |
502 String name = getStringProperty(NAME_PROP); | |
503 if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP); | |
504 return name; | |
505 } | |
506 } | |
507 | |
508 private static class UnsupportedItem extends ThingItem { | |
509 UnsupportedItem(){ | |
510 super(Type.UNSUPPORTED, sEmptyPropertyNames, sEmptyPropertyNames); | |
511 } | |
512 } | |
513 | |
514 private static boolean isItemScope(Element e) { | |
515 return e.hasAttribute("ITEMSCOPE") && e.hasAttribute("ITEMTYPE"); | |
516 } | |
517 | |
518 private static String[] getItemProp(Element e) { | |
519 // "itemprop" attribute is case-sensitive, and can have multiple propert ies. | |
520 String itemprop = e.getAttribute("ITEMPROP"); | |
521 if (itemprop.isEmpty()) return new String[0]; | |
522 String[] splits = StringUtil.split(itemprop, "\\s+"); | |
523 return splits.length > 0 ? splits : new String[] { itemprop }; | |
524 } | |
525 | |
526 // Extracts the property value from |e|. For some tags, the value is a spec ific attribute, | |
527 // while for others, it's the text between the start and end tags. | |
528 private static String getPropertyValue(Element e) { | |
529 String value = ""; | |
530 String tagName = e.getTagName(); | |
531 if (sTagAttributesMap.containsKey(tagName)) { | |
532 value = e.getAttribute(sTagAttributesMap.get(tagName)[0]); | |
533 } | |
534 if (value.isEmpty()) value = e.getInnerText(); | |
535 return value; | |
536 } | |
537 | |
538 // Extracts the author property from |e|'s "rel=author" attribute. | |
539 private static String getAuthorFromRelAttribute(Element e) { | |
540 String author = ""; | |
541 String tagName = e.getTagName(); | |
542 if (sTagAttributesMap.containsKey(tagName)) { | |
543 String[] attrs = sTagAttributesMap.get(tagName); | |
544 if (attrs.length > 1 && e.getAttribute(attrs[1]).equals(AUTHOR_REL)) { | |
545 author = e.getInnerText(); | |
546 } | |
547 } | |
548 return author; | |
549 } | |
550 | |
551 private static String concat(String first, String second) { | |
552 String concat = first; | |
553 if (!concat.isEmpty() && !second.isEmpty()) concat += " "; | |
554 concat += second; | |
555 return concat; | |
556 } | |
557 } | |
OLD | NEW |