java/org/chromium/distiller/extractors/embeds/ImageExtractor.java - Issue 2020403002: Add support for figure element

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2020403002: Add support for figure element (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: mdjones' comments addressed Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller.extractors.embeds;	5 package org.chromium.distiller.extractors.embeds;

6	6

	7 import com.google.gwt.dom.client.Document;

7 import com.google.gwt.dom.client.Element;	8 import com.google.gwt.dom.client.Element;

8 import com.google.gwt.dom.client.ImageElement;	9 import com.google.gwt.dom.client.ImageElement;

	10 import com.google.gwt.dom.client.NodeList;

	11 import org.chromium.distiller.DomUtil;

9 import org.chromium.distiller.LogUtil;	12 import org.chromium.distiller.LogUtil;

	13 import org.chromium.distiller.webdocument.WebFigure;

10 import org.chromium.distiller.webdocument.WebImage;	14 import org.chromium.distiller.webdocument.WebImage;

11	15

12 import java.util.HashSet;	16 import java.util.HashSet;

13 import java.util.Set;	17 import java.util.Set;

14	18

15 /**	19 /**

16 * This class treats images as another type of embed and provides heuristics for lead image	20 * This class treats images as another type of embed and provides heuristics for lead image

17 * candidacy.	21 * candidacy.

18 */	22 */

19 public class ImageExtractor implements EmbedExtractor {	23 public class ImageExtractor implements EmbedExtractor {

20 private static final Set<String> relevantTags = new HashSet<>();	24 private static final Set<String> relevantTags = new HashSet<>();

	25 private String imgSrc;

	26 private int width;

	27 private int height;

	28

21 static {	29 static {

22 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.	30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.

23 relevantTags.add("IMG");	31 relevantTags.add("IMG");

	32 relevantTags.add("FIGURE");

24 }	33 }

	34

25 private static final String[] LAZY_IMAGE_ATTRIBUTES =	35 private static final String[] LAZY_IMAGE_ATTRIBUTES =

26 {"data-src", "data-original", "datasrc", "data-url"};	36 {"data-src", "data-original", "datasrc", "data-url"};

27	37

28 @Override	38 @Override

29 public Set<String> getRelevantTagNames() {	39 public Set<String> getRelevantTagNames() {

30 return relevantTags;	40 return relevantTags;

31 }	41 }

32	42

33 @Override	43 @Override

34 public WebImage extract(Element e) {	44 public WebImage extract(Element e) {

35 if (!relevantTags.contains(e.getTagName())) {	45 if (!relevantTags.contains(e.getTagName())) {

36 return null;	46 return null;

37 }	47 }

38 String imgSrc = "";	48 imgSrc = "";

39 // Getting OffSetWidth/Height as default values, even they are	49

40 // affected by padding, border, etc.

41 int width = e.getOffsetWidth();

42 int height = e.getOffsetHeight();

43 if ("IMG".equals(e.getTagName())) {	50 if ("IMG".equals(e.getTagName())) {

44 // This will get the absolute URL of the image and	51 extractImageAttributes(ImageElement.as(e));

45 // the displayed image dimension.	52 return new WebImage(e, width, height, imgSrc);

46 ImageElement imageElement = ImageElement.as(e);	53 } else if ("FIGURE".equals(e.getTagName())) {

47 // Try to get lazily-loaded images before falling back to get the src attribute.	54 Element img = getFirstElementByTagName(e, "IMG");

48 for(String attr: LAZY_IMAGE_ATTRIBUTES) {	55 if (img != null) {

49 imgSrc = imageElement.getAttribute(attr);	56 extractImageAttributes(ImageElement.as(img));

50 if (!imgSrc.isEmpty())	57 Element figcaption;

51 break;	58 Element cap = getFirstElementByTagName(e, "FIGCAPTION");

52 }	59 if (cap != null) {

53 if (!imgSrc.isEmpty()) {	60 // We look for links because some sites put non-caption

54 // We cannot trust the dimension if the image is not loaded yet.	61 // elements into <figcaption>. For example: image credit

55 // In some cases there are 1x1 placeholder images.	62 // could contain a link. So we get the whole DOM structure within

56 width = 0;	63 // <figcaption> only when it contains links, otherwise we get the innerText.

57 height = 0;	64 figcaption = getFirstElementByTagName(cap, "A") != null ?

58 } else {	65 cap : createFigcaptionElement(cap);

59 imgSrc = imageElement.getSrc();	66 } else {

60 // As an ImageElement is manipulated here, it is possible	67 figcaption = createFigcaptionElement(e);

61 // to get the real dimensions.	68 }

62 width = imageElement.getWidth();	69 return new WebFigure(img, width, height, imgSrc, figcaption);

63 height = imageElement.getHeight();

64 }	70 }

65 }	71 }

	72 return null;

	73 }

66	74

	75 private void extractImageAttributes(ImageElement imageElement) {

	76 // This will get the absolute URL of the image and

	77 // the displayed image dimension.

	78 // Try to get lazily-loaded images before falling back to get the src attribute.

	79 for (String attr : LAZY_IMAGE_ATTRIBUTES) {

	80 imgSrc = imageElement.getAttribute(attr);

	81 if (!imgSrc.isEmpty())

	82 break;

	83 }

	84 if (!imgSrc.isEmpty()) {

	85 // We cannot trust the dimension if the image is not loaded yet.

	86 // In some cases there are 1x1 placeholder images.

	87 width = 0;

	88 height = 0;

	89 } else {

	90 imgSrc = imageElement.getSrc();

	91 // As an ImageElement is manipulated here, it is possible

	92 // to get the real dimensions.

	93 width = imageElement.getWidth();

	94 height = imageElement.getHeight();

	95 }

67 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {	96 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {

68 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);	97 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);

69 }	98 }

70 return new WebImage(e, width, height, imgSrc);	99 }

	100

	101 private Element getFirstElementByTagName(Element e, String tagName) {

	102 NodeList<Element> elements = e.getElementsByTagName(tagName);

	103 if (elements.getLength() > 0) {

	104 return elements.getItem(0);

	105 }

	106 return null;

	107 }

	108

	109 private Element createFigcaptionElement(Element element) {

	110 Element figcaption = Document.get().createElement("FIGCAPTION");

	111 figcaption.setInnerText(element.getInnerText());

	112 return figcaption;

71 }	113 }

72 }	114 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/WebFigure.java » ('j') | no next file with comments »