java/org/chromium/distiller/extractors/embeds/ImageExtractor.java - Issue 2638823002: Support <picture> in image extraction

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2638823002: Support <picture> in image extraction (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « java/org/chromium/distiller/DomUtil.java ('k') | java/org/chromium/distiller/webdocument/WebImage.java » ('j') | java/org/chromium/distiller/webdocument/WebImage.java » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller.extractors.embeds;	5 package org.chromium.distiller.extractors.embeds;

6	6

7 import com.google.gwt.dom.client.Document;	7 import com.google.gwt.dom.client.Document;

8 import com.google.gwt.dom.client.Element;	8 import com.google.gwt.dom.client.Element;

9 import com.google.gwt.dom.client.ImageElement;	9 import com.google.gwt.dom.client.ImageElement;

10 import com.google.gwt.dom.client.NodeList;	10 import com.google.gwt.dom.client.NodeList;

(...skipping 11 matching lines...) Expand all Loading...
22 */	22 */

23 public class ImageExtractor implements EmbedExtractor {	23 public class ImageExtractor implements EmbedExtractor {

24 private static final Set<String> relevantTags = new HashSet<>();	24 private static final Set<String> relevantTags = new HashSet<>();

25 private String imgSrc;	25 private String imgSrc;

26 private int width;	26 private int width;

27 private int height;	27 private int height;

28	28

29 static {	29 static {

30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.	30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.

31 relevantTags.add("IMG");	31 relevantTags.add("IMG");

	32 relevantTags.add("PICTURE");

32 relevantTags.add("FIGURE");	33 relevantTags.add("FIGURE");

33 }	34 }

34	35

35 private static final String[] LAZY_IMAGE_ATTRIBUTES =	36 private static final String[] LAZY_IMAGE_ATTRIBUTES =

36 {"data-src", "data-original", "datasrc", "data-url"};	37 {"data-src", "data-original", "datasrc", "data-url"};

37	38

38 @Override	39 @Override

39 public Set<String> getRelevantTagNames() {	40 public Set<String> getRelevantTagNames() {

40 return relevantTags;	41 return relevantTags;

41 }	42 }

42	43

43 @Override	44 @Override

44 public WebImage extract(Element e) {	45 public WebImage extract(Element e) {

45 if (!relevantTags.contains(e.getTagName())) {	46 if (!relevantTags.contains(e.getTagName())) {

46 return null;	47 return null;

47 }	48 }

48 imgSrc = "";	49 imgSrc = "";

49	50

50 if ("IMG".equals(e.getTagName())) {	51 if ("IMG".equals(e.getTagName())) {

51 extractImageAttributes(ImageElement.as(e));	52 extractImageAttributes(ImageElement.as(e));

52 return new WebImage(e, width, height, imgSrc);	53 return new WebImage(e, width, height, imgSrc);

	54 } else if ("PICTURE".equals(e.getTagName())) {
	mdjones 2017/01/17 17:41:46: How would you feel about having some sort of utility function in this class called "getImageElement"? That way we could eliminate the duplicate "img"/"picture" tag checks here and below. It's not that hard to read now, but adding another tag (maybe div) would further complicate this. wdyt? wychen 2017/01/18 17:29:54 (in reply): Done.
	55 return new WebImage(e, width, height, imgSrc);
	mdjones 2017/01/17 17:41:47: nit: indented too far. wychen 2017/01/18 17:29:54 (in reply): Done.
53 } else if ("FIGURE".equals(e.getTagName())) {	56 } else if ("FIGURE".equals(e.getTagName())) {

54 Element img = getFirstElementByTagName(e, "IMG");	57 Element img = getFirstElementByTagName(e, "PICTURE");

	58 if (img == null) {

	59 img = getFirstElementByTagName(e, "IMG");

	60 }

55 if (img != null) {	61 if (img != null) {
	mdjones 2017/01/17 17:41:46: nit: early return instead of nesting? wychen 2017/01/18 17:29:54 (in reply): Done.
56 extractImageAttributes(ImageElement.as(img));	62 if ("IMG".equals(img.getTagName())) {

	63 extractImageAttributes(ImageElement.as(img));

	64 }

57 Element figcaption;	65 Element figcaption;

58 Element cap = getFirstElementByTagName(e, "FIGCAPTION");	66 Element cap = getFirstElementByTagName(e, "FIGCAPTION");

59 if (cap != null) {	67 if (cap != null) {

60 // We look for links because some sites put non-caption	68 // We look for links because some sites put non-caption

61 // elements into <figcaption>. For example: image credit	69 // elements into <figcaption>. For example: image credit

62 // could contain a link. So we get the whole DOM structure within	70 // could contain a link. So we get the whole DOM structure within

63 // <figcaption> only when it contains links, otherwise we get the innerText.	71 // <figcaption> only when it contains links, otherwise we get the innerText.

64 figcaption = getFirstElementByTagName(cap, "A") != null ?	72 figcaption = getFirstElementByTagName(cap, "A") != null ?

65 cap : createFigcaptionElement(cap);	73 cap : createFigcaptionElement(cap);

66 } else {	74 } else {

(...skipping 38 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
105 }	113 }

106 return null;	114 return null;

107 }	115 }

108	116

109 private Element createFigcaptionElement(Element element) {	117 private Element createFigcaptionElement(Element element) {

110 Element figcaption = Document.get().createElement("FIGCAPTION");	118 Element figcaption = Document.get().createElement("FIGCAPTION");

111 figcaption.setInnerText(DomUtil.getInnerText(element));	119 figcaption.setInnerText(DomUtil.getInnerText(element));

112 return figcaption;	120 return figcaption;

113 }	121 }

114 }	122 }

OLD	NEW