src/com/dom_distiller/client/SchemaOrgParser.java - Issue 240073007: recognize and parse Schema.org Markup

Side by Side Diff: src/com/dom_distiller/client/SchemaOrgParser.java

Issue 240073007: recognize and parse Schema.org Markup (Closed) Base URL: https://code.google.com/p/dom-distiller/@master

Patch Set: Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« src/com/dom_distiller/client/MarkupParser.java ('K') | « src/com/dom_distiller/client/MarkupParser.java ('k') | test/com/dom_distiller/client/SchemaOrgParserTest.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2014 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 package com.dom_distiller.client;

	6

	7 import java.util.ArrayList;

	8 import java.util.EnumMap;

	9 import java.util.Iterator;

	10 import java.util.List;

	11 import java.util.Map;

	12 import java.util.Set;

	13

	14 import com.google.gwt.dom.client.AnchorElement;

	15 import com.google.gwt.dom.client.Element;

	16 import com.google.gwt.dom.client.ImageElement;

	17 import com.google.gwt.dom.client.MetaElement;

	18 import com.google.gwt.dom.client.NodeList;

	19

	20 /**

	21 * This class recognizes and parses Schema.org markup tags, and returns the properties that matter

	22 * to distilled content.

	23 * For the basic Schema.org Thing type, the basic properties are: name, url, description, image.

	24 * In addition, for each type that we support, we also parse more specific properties:

	25 * - Article: headline (i.e. title), publisher, copyright year, copyright holder, date published,

	26 * date modified, author, article section

	27 * - ImageObject: headline (i.e. title), publisher, copyright year, copyright holder, content url,

	28 * encoding format, caption, representative of page, width, height

	29 * - Person: family name, given name

	30 * - Organization: legal name.

	31 * The value of a Schema.Org property can be a Schema.Org type, i.e. embedded. E.g., the author or

	32 * publisher of article or publisher of image could be a Schema.Org Person or Organization type;

	33 * in fact, this is the reason we support Person and Organization types.

	34 */

	35 public class SchemaOrgParser implements MarkupParser.Parser {

	36 private static final String NAME_PROP = "name";

	37 private static final String URL_PROP = "url";

	38 private static final String DESCRIPTION_PROP = "description";

	39 private static final String IMAGE_PROP = "image";

	40 private static final String HEADLINE_PROP = "headline";

	41 private static final String PUBLISHER_PROP = "publisher";

	42 private static final String COPYRIGHT_HOLDER_PROP = "copyrightHolder";

	43 private static final String COPYRIGHT_YEAR_PROP = "copyrightYear";

	44 private static final String CONTENT_URL_PROP = "contentUrl";

	45 private static final String ENCODING_FORMAT_PROP = "encodingFormat";

	46 private static final String CAPTION_PROP = "caption";

	47 private static final String REPRESENTATIVE_PROP = "representativeOfPage";

	48 private static final String WIDTH_PROP = "width";

	49 private static final String HEIGHT_PROP = "height";

	50 private static final String DATE_PUBLISHED_PROP = "datePublished";

	51 private static final String DATE_MODIFIED_PROP = "dateModified";

	52 private static final String AUTHOR_PROP = "author";

	53 private static final String SECTION_PROP = "articleSection";

	54 private static final String FAMILY_NAME_PROP = "familyName";

	55 private static final String GIVEN_NAME_PROP = "givenName";

	56 private static final String LEGAL_NAME_PROP = "legalName";

	57

	/**
	 * Kinds of Schema.org items this parser distinguishes. All these types are extended from
	 * Thing, directly or indirectly. UNSUPPORTED is the value getType(Element) falls back to when
	 * an element's "itemtype" matches none of the registered type URLs.
	 */
	private enum Type {
		IMAGE,
		ARTICLE,
		PERSON,
		ORGANIZATION,
		UNSUPPORTED,
	}

	65

	66 private static class ThingItem {

	67 protected final Type mType;

	68 protected final Element mRoot;

	69 protected final String[] mStringPropertyNames;

	70 protected final String[] mItemPropertyNames;

	71 protected final String[] mStringProperties;

	72 protected final ThingItem[] mItemProperties;

	73

	74 protected ThingItem(Type type, Element root,

	75 String[] stringPropertyNames, String[] itemPropertyN ames) {

	76 mType = type;

	77 mRoot = root;

	78 mStringPropertyNames = stringPropertyNames;

	79 mItemPropertyNames = itemPropertyNames;

	80 mStringProperties = new String[mStringPropertyNames.length];

	81 mItemProperties = new ThingItem[mItemPropertyNames.length];

	82 }

	83

	84 protected String toStringProperty() {

	85 return "";

	86 }

	87

	88 protected MarkupParser.Image getImage() {

	89 // Use value of IMAGE_PROP to create a MarkupParser.Image.

	90 String imageUrl = getStringProperty(IMAGE_PROP);

	91 if (imageUrl.isEmpty()) return null;

	92 MarkupParser.Image image = new MarkupParser.Image();

	93 image.image = imageUrl;

	94 image.url = imageUrl;

	95 return image;

	96 }

	97

	98 protected MarkupParser.Article getArticle() {

	99 return null;

	100 }

	101

	102 protected final boolean isImageRepresentativeOfPage() {

	103 String value = getStringProperty(REPRESENTATIVE_PROP);

	104 return value.equalsIgnoreCase("true");

	105 }

	106

	107 protected final void putStringValue(String name, String value) {

	108 for (int i = 0; i < mStringPropertyNames.length; i++) {

	109 if (name.equals(mStringPropertyNames[i])) {

	110 mStringProperties[i] = value;

	111 break;

	112 }

	113 }

	114 }

	115

	116 protected final void putItemValue(String name, ThingItem value) {

	117 for (int i = 0; i < mItemPropertyNames.length; i++) {

	118 if (name.equals(mItemPropertyNames[i])) {

	119 mItemProperties[i] = value;

	120 break;

	121 }

	122 }

	123 }

	124

	125 protected final String getStringProperty(String name) {

	126 // Check if property exists in \|mStringProperties\|.

	127 for (int i = 0; i < mStringPropertyNames.length; i++) {

	128 if (name.equals(mStringPropertyNames[i])) {

	129 String value = mStringProperties[i];

	130 if (value != null && !value.isEmpty()) return value;

	131 break;

	132 }

	133 }

	134 // Otherwise, repeat for \|mItemProperties\|.

	135 for (int i = 0; i < mItemPropertyNames.length; i++) {

	136 if (!name.equals(mItemPropertyNames[i])) continue;

	137 if (mItemProperties[i] != null) return mItemProperties[i].toStri ngProperty();

	138 break;

	139 }

	140 return "";

	141 }

	142 }

	143

	144 private final List<ThingItem> mItemScopes;

	145 private Element mRoot = null;

	146 private final Map<Type, String> mTypeUrls = new EnumMap<Type, String>(Type.c lass);
	cjhopman 2014/04/17 17:35:26 This appears to only be used to lookup a type for This appears to only be used to lookup a type for a string, so it should probably be a Map<String, Type> kuan 2014/04/18 00:19:03 Done. Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > This appears to only be used to lookup a type for a string, so it should > probably be a Map<String, Type> Done.
	147

	/**
	 * The object that extracts and verifies Schema.org markup tags from |root|.
	 * Parsing happens eagerly: the constructor registers the supported "itemtype" URLs and then
	 * calls parseRoot() before returning.
	 *
	 * @param root the element at which parsing starts.
	 */
	public SchemaOrgParser(Element root) {
		mRoot = root;
		mItemScopes = new ArrayList<ThingItem>();

		// Register the "itemtype" URL of each Type; getType(Element) matches against these values.
		mTypeUrls.put(Type.IMAGE, "http://schema.org/ImageObject");
		mTypeUrls.put(Type.ARTICLE, "http://schema.org/Article");
		mTypeUrls.put(Type.PERSON, "http://schema.org/Person");
		mTypeUrls.put(Type.ORGANIZATION, "http://schema.org/Organization");
		mTypeUrls.put(Type.UNSUPPORTED, "");

		// TODO(kuan): Parsing all tags is pretty expensive, should we do so only lazily?
		// If parse lazily, all get* methods will need to check for parsed state and, if necessary,
		// parse before returning the requested properties.
		parseRoot();
	}

	166

	167 @Override

	168 public String getTitle() {

	169 String title = findStringProperty(HEADLINE_PROP);

	170 if (title.isEmpty()) title = findStringProperty(NAME_PROP);

	171 return title;

	172 }

	173

	174 @Override

	175 public String getType() {

	176 if (mItemScopes.isEmpty()) return null;

	177 // Assume the type of the first item is the page type.

	178 return mItemScopes.get(0).mType.toString();

	179 }

	180

	/**
	 * Returns the "url" property as found by findStringProperty(): the first non-empty value
	 * among the parsed items, "" if no item has one, or null if no item was parsed.
	 */
	@Override
	public String getUrl() {
		return findStringProperty(URL_PROP);
	}

	185

	186 @Override

	187 public MarkupParser.Image[] getImages() {

	188 if (mItemScopes.isEmpty()) return null;

	189 List<MarkupParser.Image> images = new ArrayList<MarkupParser.Image>();

	190 for (int i = 0; i < mItemScopes.size(); i++) {

	191 ThingItem item = mItemScopes.get(i);

	192 MarkupParser.Image image = item.getImage();

	193 if (image != null) {

	194 if (item.isImageRepresentativeOfPage()) {

	195 // Image should be the dominant, i.e. first, one.

	196 images.add(0, image);

	197 } else {

	198 images.add(image);

	199 }

	200 }

	201 }

	202 if (images.isEmpty()) return null;

	203 return images.toArray(new MarkupParser.Image[images.size()]);

	204 }

	205

	/**
	 * Returns the "description" property as found by findStringProperty(): the first non-empty
	 * value among the parsed items, "" if no item has one, or null if no item was parsed.
	 */
	@Override
	public String getDescription() {
		return findStringProperty(DESCRIPTION_PROP);
	}

	210

	/**
	 * Returns the "publisher" property as found by findStringProperty(): the first non-empty
	 * value among the parsed items, "" if no item has one, or null if no item was parsed.
	 * When an item stores the publisher as an embedded item rather than a string, that
	 * embedded item's string form is used (see ThingItem.getStringProperty()).
	 */
	@Override
	public String getPublisher() {
		return findStringProperty(PUBLISHER_PROP);
	}

	215

	216 @Override

	217 public String getCopyright() {

	218 if (mItemScopes.isEmpty()) return "";

	219 // Returns a concatenated string of copyright year and copyright holder of the first item

	220 // that has these properties, delimited by a whitespace.

	221 String copyright = "";

	222 for (int i = 0; i < mItemScopes.size() && copyright.isEmpty(); i++) {

	223 ThingItem item = mItemScopes.get(i);

	224 copyright = concat(item.getStringProperty(COPYRIGHT_YEAR_PROP),

	225 item.getStringProperty(COPYRIGHT_HOLDER_PROP));

	226 }

	227 return copyright.isEmpty() ? copyright : "Copyright " + copyright;

	228 }

	229

	/**
	 * Returns the "author" property as found by findStringProperty(): the first non-empty
	 * value among the parsed items, "" if no item has one, or null if no item was parsed.
	 */
	@Override
	public String getAuthor() {
		return findStringProperty(AUTHOR_PROP);
	}

	234

	235 @Override

	236 public MarkupParser.Article getArticle() {

	237 if (mItemScopes.isEmpty()) return null;

	238 // Returns the first article.

	239 MarkupParser.Article article = null;

	240 for (int i = 0; i < mItemScopes.size() && article == null; i++) {

	241 article = mItemScopes.get(i).getArticle();

	242 }

	243 return article;

	244 }

	245

	/** Always returns false. */
	@Override
	public boolean optOut() {
		return false;
	}

	250

	251 private void parseRoot() {

	252 // The <html> element can also be the start of a Schema.org item, and he nce needs to be

	253 // parsed.

	254

	255 // Use a boolean array for \|skipChildren\|, instead of the boolean primit ive, so that it

	256 // can be updated in checkIfElementIsSupported().

	257 boolean[] skipChildren = new boolean[] { false };

	258 checkIfElementIsSupported(mRoot, skipChildren);

	259 if (skipChildren[0]) return; // Skipping children of root means there's nothing more to do.

	260 // Recursively parse each element that is an Schema.org type.

	261 parse(mRoot, null);

	262 }

	263

	264 private void parse(Element root, ThingItem currItem) {
	cjhopman 2014/04/17 17:35:26 This function and its uses will be simplified if c This function and its uses will be simplified if currItem is the closest ancestor item of the element passed in. (I.e. it is never the item started at the element passed in). kuan 2014/04/18 00:19:03 Done. Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > This function and its uses will be simplified if currItem is the closest > ancestor item of the element passed in. (I.e. it is never the item started at > the element passed in). Done.
	265 NodeList<Element> allElems = root.getElementsByTagName("*");

	266 for (int i = 0; i < allElems.getLength(); i++) {
	cjhopman 2014/04/17 17:35:26 The way that the tree is parsed is hard for me to The way that the tree is parsed is hard for me to follow. Why not walk the tree recursively rather than iterating over all the elements? (The current approach also has bad worst case behavior) kuan 2014/04/18 00:19:03 Done. what's the worst case behavior? Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > The way that the tree is parsed is hard for me to follow. Why not walk the tree > recursively rather than iterating over all the elements? (The current approach > also has bad worst case behavior) Done. what's the worst case behavior? cjhopman 2014/04/18 01:17:01 The previous version had O(n^2) worst case complex Show quoted text On 2014/04/18 00:19:03, kuan wrote: > On 2014/04/17 17:35:26, cjhopman wrote: > > The way that the tree is parsed is hard for me to follow. Why not walk the > tree > > recursively rather than iterating over all the elements? (The current approach > > also has bad worst case behavior) > > Done. what's the worst case behavior? The previous version had O(n^2) worst case complexity (this would happen if you had a bunch of nested elements with each one starting a new nested item).
	267 Element e = allElems.getItem(i);

	268 // See comments in parseRoot() for using boolean array for \|skipChil dren\|.

	269 boolean[] skipChildren = new boolean[] { false };

	270

	271 ThingItem newItem = checkIfElementIsSupported(e, skipChildren);

	272

	273 // If we're currently parsing a Schema.org type, if it has an "itemp rop" attribute that

	274 // we care for, extract and store its value.

	275 if (currItem != null) extractProperty(e, currItem, newItem);

	276

	277 // If current element has "itemscope" and "itemtype" attributes and is a supported type,

	278 // its children would have been parsed by \|newItem\| via the recursiv e parse() call.

	279 // If it's an unsupported type, its children should be ignored. In both cases, we

	280 // should skip these children to the next sibling of the current ele ment. So, determine

	281 // the index of the next sibiling in \|allElems\|, so that the next it eration will jump to

	282 // that element.

	283 if (skipChildren[0]) {

	284 Element next = e.getNextSiblingElement();

	285 if (next != null) {

	286 for (i++; i < allElems.getLength() && next != allElems.getIt em(i); i++) {}

	287 i--; // Decrement because it'll be incremented in the outer for loop.

	288 } else {

	289 break; // No next sibling means there's no more elements to process.

	290 }

	291 }

	292 } // for all elements

	293 }

	294

	295 // If |e| has "itemscope" and "itemtype" attributes and a supported type, a ThingItem-extended

	296 // object is created based on the type.

	297 // Returns this object after it has recursively parsed |e|'s children, returns null otherwise.

	298 // @param skipChildren[0] is set to true if |e| specifies a Schema.org type, supported or not.

	299 private ThingItem checkIfElementIsSupported(Element e, boolean[] skipChildre n) {

	300 // If element has "itemscope" and "itemtype" attributes, it's the start of an item.

	301 // If the type is what we care for, instantiate the corresponding exten ded ThingItem and

	302 // recursively parse it.

	303 if (!e.hasAttribute("ITEMSCOPE") \|\| !e.hasAttribute("ITEMTYPE")) return null;
	cjhopman 2014/04/17 17:35:26 This should be a different function so you don't h This should be a different function so you don't have to pass around a one-element boolean array to get the result of this check. kuan 2014/04/18 00:19:03 Done. Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > This should be a different function so you don't have to pass around a > one-element boolean array to get the result of this check. Done.
	304

	305 skipChildren[0] = true; // Indicate to skip the children of this suppor ted element.

	306 ThingItem newItem = null;

	307 Type type = getType(e);

	308 switch (type) {

	309 case IMAGE:

	310 newItem = new ImageItem(e);

	311 break;

	312 case ARTICLE:

	313 newItem = new ArticleItem(e);

	314 break;

	315 case PERSON:

	316 newItem = new PersonItem(e);

	317 break;

	318 case ORGANIZATION:

	319 newItem = new OrganizationItem(e);

	320 break;

	321 case UNSUPPORTED:

	322 default:

	323 return null;

	324 }

	325

	326 mItemScopes.add(newItem);

	327 parse(e, newItem);
	cjhopman 2014/04/17 17:35:26 This parse() call makes it harder for me to reason This parse() call makes it harder for me to reason about the tree parsing, could we move this recursion out of this function (and then into parse() directly)? (and actually if change parse to recurse over the tree like my comment above, both cases would advance the same way through the recursion, just with different args). kuan 2014/04/18 00:19:03 Done. i'm not sure if i code it the way u want re Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > This parse() call makes it harder for me to reason about the tree parsing, could > we move this recursion out of this function (and then into parse() directly)? > (and actually if change parse to recurse over the tree like my comment above, > both cases would advance the same way through the recursion, just with different > args). Done. i'm not sure if i code it the way u want regarding "both cases would advance the same way through the recursion, just with different args". cjhopman 2014/04/18 01:17:01 See the new comment in parse() for what I meant by Show quoted text On 2014/04/18 00:19:03, kuan wrote: > On 2014/04/17 17:35:26, cjhopman wrote: > > This parse() call makes it harder for me to reason about the tree parsing, > could > > we move this recursion out of this function (and then into parse() directly)? > > (and actually if change parse to recurse over the tree like my comment above, > > both cases would advance the same way through the recursion, just with > different > > args). > > Done. i'm not sure if i code it the way u want regarding "both cases would > advance the same way through the recursion, just with different args". See the new comment in parse() for what I meant by that.
	328 return newItem;

	329 }

	330

	331 private Type getType(Element e) {

	332 String type = e.getAttribute("ITEMTYPE");

	333 Set<Map.Entry<Type, String>> typeUrls = mTypeUrls.entrySet();

	334 Iterator<Map.Entry<Type, String>> iter = typeUrls.iterator();
	cjhopman 2014/04/17 17:35:26 I think you can do: for (Map.Entry<Type, String> I think you can do: for (Map.Entry<Type, String> typeUrl : mTypeUrls.entrySet()) { if (typeUrl...) return ...; } kuan 2014/04/18 00:19:03 Done. since it's now a HashMap of <String, Type>, Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > I think you can do: > > for (Map.Entry<Type, String> typeUrl : mTypeUrls.entrySet()) { > if (typeUrl...) return ...; > } Done. since it's now a HashMap of <String, Type>, i just need to use containsKey() and get().
	335 while (iter.hasNext()) {

	336 Map.Entry<Type, String> typeUrl = iter.next();

	337 if (typeUrl.getValue().equalsIgnoreCase(type)) return typeUrl.getKey ();

	338 }

	339 return Type.UNSUPPORTED;

	340 }

	341

	342 // Extract the value of the "itemprop" attribute in |e|.

	343 // @param currItem ThingItem-extended item for the current Schema.org type being parsed.

	344 // @param embeddedItem ThingItem-extended item for the Schema.org type created for |e|, i.e. |e| had specified a Schema.org type.

	345 private void extractProperty(Element e, ThingItem currItem, ThingItem embedd edItem) {
	cjhopman 2014/04/17 17:35:26 This function does a lot (and most of what it does This function does a lot (and most of what it does isn't reflected in the name). It gets the itemprop and then adds the embedded item to current item or gets the element's property value and then adds that property to the current item. This would be clearer as two functions: getItemprop(Element) getPropertyValue(Element) and then handle the putItem/putString separately elsewhere. If you keep this function close to what it currently is, the name should at least indicate that it will add the extracted property to currItem. kuan 2014/04/18 00:19:03 Done. Show quoted text On 2014/04/17 17:35:26, cjhopman wrote: > This function does a lot (and most of what it does isn't reflected in the name). > It gets the itemprop and then adds the embedded item to current item or gets the > element's property value and then adds that property to the current item. > > This would be clearer as two functions: > getItemprop(Element) > getPropertyValue(Element) > > and then handle the putItem/putString separately elsewhere. > > If you keep this function close to what it currently is, the name should at > least indicate that it will add the extracted property to currItem. Done.
	346 // "itemprop" attribute is case-sensitive.

	347 String name = e.getAttribute("ITEMPROP");

	348 if (name == null \|\| name.isEmpty()) return;

	349 if (embeddedItem != null) { // This "itemprop" attribute is an embedded item.

	350 currItem.putItemValue(name, embeddedItem);

	351 } else { // Extract value from the tag.

	352 String value = null;

	353 if (e.hasTagName("A")) {

	354 value = AnchorElement.as(e).getHref();

	355 } else if (e.hasTagName("IMG")) {

	356 value = ImageElement.as(e).getSrc();

	357 } else if (e.hasTagName("META")) {

	358 value = MetaElement.as(e).getContent();

	359 } else if (e.hasTagName("TIME")) {

	360 value = e.getAttribute("datetime");

	361 }

	362 if (value == null \|\| value.isEmpty()) value = e.getInnerText();

	363 currItem.putStringValue(name, value);

	364 }

	365 }

	366

	367 // Returns the first item that has the requested property value.

	368 private String findStringProperty(String name) {

	369 if (mItemScopes.isEmpty()) return null;

	370 for (int i = 0; i < mItemScopes.size(); i++) {

	371 String value = mItemScopes.get(i).getStringProperty(name);

	372 if (!value.isEmpty()) return value;

	373 }

	374 return "";

	375 }

	376

	377 private static class ImageItem extends ThingItem {

	378 private static final String[] mStringPropertyNames = {

	379 NAME_PROP,

	380 URL_PROP,

	381 DESCRIPTION_PROP,

	382 IMAGE_PROP,

	383 HEADLINE_PROP,

	384 PUBLISHER_PROP,

	385 COPYRIGHT_HOLDER_PROP,

	386 COPYRIGHT_YEAR_PROP,

	387 CONTENT_URL_PROP,

	388 ENCODING_FORMAT_PROP,

	389 CAPTION_PROP,

	390 REPRESENTATIVE_PROP,

	391 WIDTH_PROP,

	392 HEIGHT_PROP,

	393 };

	394

	395 private static final String[] mItemPropertyNames = {

	396 PUBLISHER_PROP,

	397 COPYRIGHT_HOLDER_PROP,

	398 };

	399

	400 protected ImageItem(Element elem) {

	401 super(Type.IMAGE, elem, mStringPropertyNames, mItemPropertyNames);

	402 }

	403

	404 @Override

	405 protected MarkupParser.Image getImage() {

	406 MarkupParser.Image image = new MarkupParser.Image();

	407 String url = getStringProperty(CONTENT_URL_PROP);

	408 image.image = !url.isEmpty() ? url : getStringProperty(NAME_PROP);

	409 image.url = image.image;

	410 image.type = getStringProperty(ENCODING_FORMAT_PROP);

	411 image.caption = getStringProperty(CAPTION_PROP);

	412 try {

	413 image.width = Integer.parseInt(getStringProperty(WIDTH_PROP), 10 );

	414 } catch (Exception e) {

	415 }

	416 try {

	417 image.height = Integer.parseInt(getStringProperty(HEIGHT_PROP), 10);

	418 } catch (Exception e) {

	419 }

	420 return image;

	421 }

	422 }

	423

	424 private static class ArticleItem extends ThingItem {

	425 private static final String[] mStringPropertyNames = {

	426 NAME_PROP,

	427 URL_PROP,

	428 DESCRIPTION_PROP,

	429 IMAGE_PROP,

	430 HEADLINE_PROP,

	431 PUBLISHER_PROP,

	432 COPYRIGHT_HOLDER_PROP,

	433 COPYRIGHT_YEAR_PROP,

	434 DATE_MODIFIED_PROP,

	435 DATE_PUBLISHED_PROP,

	436 AUTHOR_PROP,

	437 SECTION_PROP,

	438 };

	439

	440 private static final String[] mItemPropertyNames = {

	441 PUBLISHER_PROP,

	442 COPYRIGHT_HOLDER_PROP,

	443 AUTHOR_PROP,

	444 };

	445

	446 protected ArticleItem(Element elem) {

	447 super(Type.ARTICLE, elem, mStringPropertyNames, mItemPropertyNames);

	448 }

	449

	450 @Override

	451 protected MarkupParser.Article getArticle() {

	452 MarkupParser.Article article = new MarkupParser.Article();

	453 article.publishedTime = getStringProperty(DATE_PUBLISHED_PROP);

	454 article.modifiedTime = getStringProperty(DATE_MODIFIED_PROP);

	455 article.section = getStringProperty(SECTION_PROP);

	456 String author = getStringProperty(AUTHOR_PROP);

	457 article.authors = author.isEmpty() ? new String[0] : new String[] { author };

	458 return article;

	459 }

	460 }

	461

	462 private static class PersonItem extends ThingItem {

	463 private static final String[] mStringPropertyNames = {

	464 NAME_PROP,

	465 URL_PROP,

	466 DESCRIPTION_PROP,

	467 IMAGE_PROP,

	468 FAMILY_NAME_PROP,

	469 GIVEN_NAME_PROP,

	470 };

	471

	472 protected PersonItem(Element elem) {

	473 super(Type.PERSON, elem, mStringPropertyNames, new String[0]);

	474 }

	475

	476 // Returns either the value of NAME_PROP, or concatenated values of GIVE N_NAME_PROP and

	477 // FAILY_NAME_PROP delimited by a whitespace.

	478 @Override

	479 protected String toStringProperty() {

	480 String fullname = getStringProperty(NAME_PROP);

	481 if (fullname.isEmpty()) {

	482 fullname = concat(getStringProperty(GIVEN_NAME_PROP),

	483 getStringProperty(FAMILY_NAME_PROP));

	484 }

	485 return fullname;

	486 }

	487 }

	488

	489 private static class OrganizationItem extends ThingItem {

	490 private static final String[] mStringPropertyNames = {

	491 NAME_PROP,

	492 URL_PROP,

	493 DESCRIPTION_PROP,

	494 IMAGE_PROP,

	495 LEGAL_NAME_PROP,

	496 };

	497

	498 protected OrganizationItem(Element elem) {

	499 super(Type.ORGANIZATION, elem, mStringPropertyNames, new String[0]);

	500 }

	501

	502 // Returns either the value of NAME_PROP or LEGAL_NAME_PROP.

	503 @Override

	504 protected String toStringProperty() {

	505 String name = getStringProperty(NAME_PROP);

	506 if (name.isEmpty()) name = getStringProperty(LEGAL_NAME_PROP);

	507 return name;

	508 }

	509 }

	510

	511 private static String concat(String first, String second) {

	512 String concat = first;

	513 if (!concat.isEmpty() && !second.isEmpty()) concat += " ";

	514 concat += second;

	515 return concat;

	516 }

	517 }

OLD	NEW