java/org/chromium/distiller/extractors/embeds/ImageExtractor.java - Issue 2729143003: Handle image lazy loading on Wikipedia

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2729143003: Handle image lazy loading on Wikipedia (Closed)

Patch Set: fixup non-matching case Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller.extractors.embeds;	5 package org.chromium.distiller.extractors.embeds;

6	6

7 import com.google.gwt.dom.client.Document;	7 import com.google.gwt.dom.client.Document;

8 import com.google.gwt.dom.client.Element;	8 import com.google.gwt.dom.client.Element;

9 import com.google.gwt.dom.client.ImageElement;	9 import com.google.gwt.dom.client.ImageElement;

10 import com.google.gwt.dom.client.NodeList;	10 import com.google.gwt.dom.client.NodeList;

11 import org.chromium.distiller.DomUtil;	11 import org.chromium.distiller.DomUtil;

	12 import org.chromium.distiller.JavaScript;

12 import org.chromium.distiller.LogUtil;	13 import org.chromium.distiller.LogUtil;

13 import org.chromium.distiller.webdocument.WebFigure;	14 import org.chromium.distiller.webdocument.WebFigure;

14 import org.chromium.distiller.webdocument.WebImage;	15 import org.chromium.distiller.webdocument.WebImage;

15	16

16 import java.util.HashSet;	17 import java.util.HashSet;

17 import java.util.Set;	18 import java.util.Set;

18	19

19 /**	20 /**

20 * This class treats images as another type of embed and provides heuristics for lead image	21 * This class treats images as another type of embed and provides heuristics for lead image

21 * candidacy.	22 * candidacy.

22 */	23 */

23 public class ImageExtractor implements EmbedExtractor {	24 public class ImageExtractor implements EmbedExtractor {

24 private static final Set<String> relevantTags = new HashSet<>();	25 private static final Set<String> relevantTags = new HashSet<>();

25 private String imgSrc;	26 private String imgSrc;

26 private int width;	27 private int width;

27 private int height;	28 private int height;

28	29

29 static {	30 static {

30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.	31 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.

31 relevantTags.add("IMG");	32 relevantTags.add("IMG");

32 relevantTags.add("PICTURE");	33 relevantTags.add("PICTURE");

33 relevantTags.add("FIGURE");	34 relevantTags.add("FIGURE");

	35 relevantTags.add("SPAN");

34 }	36 }

35	37

36 private static final String[] LAZY_IMAGE_ATTRIBUTES =	38 private static final String[] LAZY_IMAGE_ATTRIBUTES =

37 {"data-src", "data-original", "datasrc", "data-url"};	39 {"data-src", "data-original", "datasrc", "data-url"};

38	40

39 @Override	41 @Override

40 public Set<String> getRelevantTagNames() {	42 public Set<String> getRelevantTagNames() {

41 return relevantTags;	43 return relevantTags;

42 }	44 }

43	45

(...skipping 24 matching lines...) Expand all Loading...
68 // <figcaption> only when it contains links, otherwise we get the innerText.	70 // <figcaption> only when it contains links, otherwise we get the innerText.

69 NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF] ");	71 NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF] ");

70 figcaption = links.getLength() > 0 ?	72 figcaption = links.getLength() > 0 ?

71 cap : createFigcaptionElement(cap);	73 cap : createFigcaptionElement(cap);

72 } else {	74 } else {

73 figcaption = createFigcaptionElement(e);	75 figcaption = createFigcaptionElement(e);

74 }	76 }

75 return new WebFigure(img, width, height, imgSrc, figcaption);	77 return new WebFigure(img, width, height, imgSrc, figcaption);

76 }	78 }

77	79

	80 if ("SPAN".equals(e.getTagName())) {

	81 if (!e.getAttribute("class").contains("lazy-image-placeholder")) {

	82 return null;

	83 }

	84 // Image lazy loading on Wikipedia.

	85 ie = Document.get().createImageElement();

	86 imgSrc = e.getAttribute("data-src");

	87 width = JavaScript.parseInt(e.getAttribute("data-width"));

	88 height = JavaScript.parseInt(e.getAttribute("data-height"));

	89 ie.setAttribute("srcset", e.getAttribute("data-srcset"));

	90 return new WebImage(ie, width, height, imgSrc);

	91 }

	92

78 extractImageAttributes(ie);	93 extractImageAttributes(ie);

79 return new WebImage(e, width, height, imgSrc);	94 return new WebImage(e, width, height, imgSrc);

80 }	95 }

81	96

82 private void extractImageAttributes(ImageElement imageElement) {	97 private void extractImageAttributes(ImageElement imageElement) {

83 // This will get the absolute URL of the image and	98 // This will get the absolute URL of the image and

84 // the displayed image dimension.	99 // the displayed image dimension.

85 // Try to get lazily-loaded images before falling back to get the src attribute.	100 // Try to get lazily-loaded images before falling back to get the src attribute.

86 for (String attr : LAZY_IMAGE_ATTRIBUTES) {	101 for (String attr : LAZY_IMAGE_ATTRIBUTES) {

87 imgSrc = imageElement.getAttribute(attr);	102 imgSrc = imageElement.getAttribute(attr);

(...skipping 16 matching lines...) Expand all Loading...
104 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);	119 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);

105 }	120 }

106 }	121 }

107	122

108 private Element createFigcaptionElement(Element element) {	123 private Element createFigcaptionElement(Element element) {

109 Element figcaption = Document.get().createElement("FIGCAPTION");	124 Element figcaption = Document.get().createElement("FIGCAPTION");

110 figcaption.setInnerText(DomUtil.getInnerText(element));	125 figcaption.setInnerText(DomUtil.getInnerText(element));

111 return figcaption;	126 return figcaption;

112 }	127 }

113 }	128 }

OLD	NEW

« no previous file with comments | « no previous file | javatests/org/chromium/distiller/ContentExtractorTest.java » ('j') | no next file with comments »