java/org/chromium/distiller/extractors/embeds/ImageExtractor.java - Issue 2638823002: Support <picture> in image extraction

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2638823002: Support <picture> in image extraction (Closed)

Patch Set: support lazy loading in <picture> Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller.extractors.embeds;	5 package org.chromium.distiller.extractors.embeds;

6	6

7 import com.google.gwt.dom.client.Document;	7 import com.google.gwt.dom.client.Document;

8 import com.google.gwt.dom.client.Element;	8 import com.google.gwt.dom.client.Element;

9 import com.google.gwt.dom.client.ImageElement;	9 import com.google.gwt.dom.client.ImageElement;

10 import com.google.gwt.dom.client.NodeList;	10 import com.google.gwt.dom.client.NodeList;

(...skipping 11 matching lines...) Expand all Loading...
22 */	22 */

23 public class ImageExtractor implements EmbedExtractor {	23 public class ImageExtractor implements EmbedExtractor {

24 private static final Set<String> relevantTags = new HashSet<>();	24 private static final Set<String> relevantTags = new HashSet<>();

25 private String imgSrc;	25 private String imgSrc;

26 private int width;	26 private int width;

27 private int height;	27 private int height;

28	28

29 static {	29 static {

30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.	30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.

31 relevantTags.add("IMG");	31 relevantTags.add("IMG");

	32 relevantTags.add("PICTURE");

32 relevantTags.add("FIGURE");	33 relevantTags.add("FIGURE");

33 }	34 }

34	35

35 private static final String[] LAZY_IMAGE_ATTRIBUTES =	36 private static final String[] LAZY_IMAGE_ATTRIBUTES =

36 {"data-src", "data-original", "datasrc", "data-url"};	37 {"data-src", "data-original", "datasrc", "data-url"};

37	38

38 @Override	39 @Override

39 public Set<String> getRelevantTagNames() {	40 public Set<String> getRelevantTagNames() {

40 return relevantTags;	41 return relevantTags;

41 }	42 }

42	43

43 @Override	44 @Override

44 public WebImage extract(Element e) {	45 public WebImage extract(Element e) {

45 if (!relevantTags.contains(e.getTagName())) {	46 if (!relevantTags.contains(e.getTagName())) {

46 return null;	47 return null;

47 }	48 }

48 imgSrc = "";	49 imgSrc = "";

49	50

50 if ("IMG".equals(e.getTagName())) {	51 ImageElement ie = ImageElement.as(DomUtil.getFirstElementByTagNameInc(e, "IMG"));

51 extractImageAttributes(ImageElement.as(e));	52

52 return new WebImage(e, width, height, imgSrc);	53 if ("FIGURE".equals(e.getTagName())) {

53 } else if ("FIGURE".equals(e.getTagName())) {	54 Element img = DomUtil.getFirstElementByTagName(e, "PICTURE");

54 Element img = getFirstElementByTagName(e, "IMG");	55 if (img == null) {

55 if (img != null) {	56 img = DomUtil.getFirstElementByTagName(e, "IMG");

56 extractImageAttributes(ImageElement.as(img));

57 Element figcaption;

58 Element cap = getFirstElementByTagName(e, "FIGCAPTION");

59 if (cap != null) {

60 // We look for links because some sites put non-caption

61 // elements into <figcaption>. For example: image credit

62 // could contain a link. So we get the whole DOM structure within

63 // <figcaption> only when it contains links, otherwise we get the innerText.

64 figcaption = getFirstElementByTagName(cap, "A") != null ?

65 cap : createFigcaptionElement(cap);

66 } else {

67 figcaption = createFigcaptionElement(e);

68 }

69 return new WebFigure(img, width, height, imgSrc, figcaption);

70 }	57 }

	58 if (img == null) {

	59 return null;

	60 }

	61 extractImageAttributes(ie);

	62 Element figcaption;

	63 Element cap = DomUtil.getFirstElementByTagName(e, "FIGCAPTION");

	64 if (cap != null) {

	65 // We look for links because some sites put non-caption

	66 // elements into <figcaption>. For example: image credit

	67 // could contain a link. So we get the whole DOM structure within

	68 // <figcaption> only when it contains links, otherwise we get the innerText.

	69 figcaption = DomUtil.getFirstElementByTagName(cap, "A") != null ?

	70 cap : createFigcaptionElement(cap);

	71 } else {

	72 figcaption = createFigcaptionElement(e);

	73 }

	74 return new WebFigure(img, width, height, imgSrc, figcaption);

71 }	75 }

72 return null;	76

	77 extractImageAttributes(ie);

	78 return new WebImage(e, width, height, imgSrc);

73 }	79 }

74	80

75 private void extractImageAttributes(ImageElement imageElement) {	81 private void extractImageAttributes(ImageElement imageElement) {

76 // This will get the absolute URL of the image and	82 // This will get the absolute URL of the image and

77 // the displayed image dimension.	83 // the displayed image dimension.

78 // Try to get lazily-loaded images before falling back to get the src attribute.	84 // Try to get lazily-loaded images before falling back to get the src attribute.

79 for (String attr : LAZY_IMAGE_ATTRIBUTES) {	85 for (String attr : LAZY_IMAGE_ATTRIBUTES) {

80 imgSrc = imageElement.getAttribute(attr);	86 imgSrc = imageElement.getAttribute(attr);

81 if (!imgSrc.isEmpty())	87 if (!imgSrc.isEmpty())

82 break;	88 break;

83 }	89 }

84 if (!imgSrc.isEmpty()) {	90 if (!imgSrc.isEmpty()) {

85 // We cannot trust the dimension if the image is not loaded yet.	91 // We cannot trust the dimension if the image is not loaded yet.

86 // In some cases there are 1x1 placeholder images.	92 // In some cases there are 1x1 placeholder images.

87 width = 0;	93 width = 0;

88 height = 0;	94 height = 0;

89 } else {	95 } else {

90 imgSrc = imageElement.getSrc();	96 imgSrc = imageElement.getSrc();

91 // As an ImageElement is manipulated here, it is possible	97 // As an ImageElement is manipulated here, it is possible

92 // to get the real dimensions.	98 // to get the real dimensions.

93 width = imageElement.getWidth();	99 width = imageElement.getWidth();

94 height = imageElement.getHeight();	100 height = imageElement.getHeight();

95 }	101 }

96 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {	102 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {

97 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);	103 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);

98 }	104 }

99 }	105 }

100	106

101 private Element getFirstElementByTagName(Element e, String tagName) {

102 NodeList<Element> elements = e.getElementsByTagName(tagName);

103 if (elements.getLength() > 0) {

104 return elements.getItem(0);

105 }

106 return null;

107 }

108

109 private Element createFigcaptionElement(Element element) {	107 private Element createFigcaptionElement(Element element) {

110 Element figcaption = Document.get().createElement("FIGCAPTION");	108 Element figcaption = Document.get().createElement("FIGCAPTION");

111 figcaption.setInnerText(DomUtil.getInnerText(element));	109 figcaption.setInnerText(DomUtil.getInnerText(element));

112 return figcaption;	110 return figcaption;

113 }	111 }

114 }	112 }

OLD	NEW

« no previous file with comments | « java/org/chromium/distiller/DomUtil.java ('k') | java/org/chromium/distiller/webdocument/WebImage.java » ('j') | no next file with comments »