OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller.extractors.embeds; | 5 package org.chromium.distiller.extractors.embeds; |
6 | 6 |
| 7 import com.google.gwt.dom.client.Document; |
7 import com.google.gwt.dom.client.Element; | 8 import com.google.gwt.dom.client.Element; |
8 import com.google.gwt.dom.client.ImageElement; | 9 import com.google.gwt.dom.client.ImageElement; |
| 10 import com.google.gwt.dom.client.NodeList; |
| 11 import org.chromium.distiller.DomUtil; |
9 import org.chromium.distiller.LogUtil; | 12 import org.chromium.distiller.LogUtil; |
| 13 import org.chromium.distiller.webdocument.WebFigure; |
10 import org.chromium.distiller.webdocument.WebImage; | 14 import org.chromium.distiller.webdocument.WebImage; |
11 | 15 |
12 import java.util.HashSet; | 16 import java.util.HashSet; |
13 import java.util.Set; | 17 import java.util.Set; |
14 | 18 |
15 /** | 19 /** |
16 * This class treats images as another type of embed and provides heuristics for lead image | 20 * This class treats images as another type of embed and provides heuristics for lead image |
17 * candidacy. | 21 * candidacy. |
18 */ | 22 */ |
19 public class ImageExtractor implements EmbedExtractor { | 23 public class ImageExtractor implements EmbedExtractor { |
20 private static final Set<String> relevantTags = new HashSet<>(); | 24 private static final Set<String> relevantTags = new HashSet<>(); |
| 25 private String imgSrc; |
| 26 private int width; |
| 27 private int height; |
| 28 |
21 static { | 29 static { |
22 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. | 30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. |
23 relevantTags.add("IMG"); | 31 relevantTags.add("IMG"); |
| 32 relevantTags.add("FIGURE"); |
24 } | 33 } |
| 34 |
25 private static final String[] LAZY_IMAGE_ATTRIBUTES = | 35 private static final String[] LAZY_IMAGE_ATTRIBUTES = |
26 {"data-src", "data-original", "datasrc", "data-url"}; | 36 {"data-src", "data-original", "datasrc", "data-url"}; |
27 | 37 |
28 @Override | 38 @Override |
29 public Set<String> getRelevantTagNames() { | 39 public Set<String> getRelevantTagNames() { |
30 return relevantTags; | 40 return relevantTags; |
31 } | 41 } |
32 | 42 |
33 @Override | 43 @Override |
34 public WebImage extract(Element e) { | 44 public WebImage extract(Element e) { |
35 if (!relevantTags.contains(e.getTagName())) { | 45 if (!relevantTags.contains(e.getTagName())) { |
36 return null; | 46 return null; |
37 } | 47 } |
38 String imgSrc = ""; | 48 imgSrc = ""; |
39 // Getting OffSetWidth/Height as default values, even they are | 49 |
40 // affected by padding, border, etc. | |
41 int width = e.getOffsetWidth(); | |
42 int height = e.getOffsetHeight(); | |
43 if ("IMG".equals(e.getTagName())) { | 50 if ("IMG".equals(e.getTagName())) { |
44 // This will get the absolute URL of the image and | 51 extractImageAttributes(ImageElement.as(e)); |
45 // the displayed image dimension. | 52 return new WebImage(e, width, height, imgSrc); |
46 ImageElement imageElement = ImageElement.as(e); | 53 } else if ("FIGURE".equals(e.getTagName())) { |
47 // Try to get lazily-loaded images before falling back to get the src attribute. | 54 Element img = getFirstElementByTagName(e, "IMG"); |
48 for(String attr: LAZY_IMAGE_ATTRIBUTES) { | 55 if (img != null) { |
49 imgSrc = imageElement.getAttribute(attr); | 56 extractImageAttributes(ImageElement.as(img)); |
50 if (!imgSrc.isEmpty()) | 57 Element figcaption; |
51 break; | 58 Element cap = getFirstElementByTagName(e, "FIGCAPTION"); |
52 } | 59 if (cap != null) { |
53 if (!imgSrc.isEmpty()) { | 60 // We look for links because some sites put non-caption |
54 // We cannot trust the dimension if the image is not loaded yet. | 61 // elements into <figcaption>. For example: image credit |
55 // In some cases there are 1x1 placeholder images. | 62 // could contain a link. So we get the whole DOM structure within |
56 width = 0; | 63 // <figcaption> only when it contains links, otherwise we get the innerText. |
57 height = 0; | 64 figcaption = getFirstElementByTagName(cap, "A") != null ? |
58 } else { | 65 cap : createFigcaptionElement(cap); |
59 imgSrc = imageElement.getSrc(); | 66 } else { |
60 // As an ImageElement is manipulated here, it is possible | 67 figcaption = createFigcaptionElement(e); |
61 // to get the real dimensions. | 68 } |
62 width = imageElement.getWidth(); | 69 return new WebFigure(img, width, height, imgSrc, figcaption); |
63 height = imageElement.getHeight(); | |
64 } | 70 } |
65 } | 71 } |
| 72 return null; |
| 73 } |
66 | 74 |
| 75 private void extractImageAttributes(ImageElement imageElement) { |
| 76 // This will get the absolute URL of the image and |
| 77 // the displayed image dimension. |
| 78 // Try to get lazily-loaded images before falling back to get the src attribute. |
| 79 for (String attr : LAZY_IMAGE_ATTRIBUTES) { |
| 80 imgSrc = imageElement.getAttribute(attr); |
| 81 if (!imgSrc.isEmpty()) |
| 82 break; |
| 83 } |
| 84 if (!imgSrc.isEmpty()) { |
| 85 // We cannot trust the dimension if the image is not loaded yet. |
| 86 // In some cases there are 1x1 placeholder images. |
| 87 width = 0; |
| 88 height = 0; |
| 89 } else { |
| 90 imgSrc = imageElement.getSrc(); |
| 91 // As an ImageElement is manipulated here, it is possible |
| 92 // to get the real dimensions. |
| 93 width = imageElement.getWidth(); |
| 94 height = imageElement.getHeight(); |
| 95 } |
67 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { | 96 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { |
68 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); | 97 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); |
69 } | 98 } |
70 return new WebImage(e, width, height, imgSrc); | 99 } |
| 100 |
| 101 private Element getFirstElementByTagName(Element e, String tagName) { |
| 102 NodeList<Element> elements = e.getElementsByTagName(tagName); |
| 103 if (elements.getLength() > 0) { |
| 104 return elements.getItem(0); |
| 105 } |
| 106 return null; |
| 107 } |
| 108 |
| 109 private Element createFigcaptionElement(Element element) { |
| 110 Element figcaption = Document.get().createElement("FIGCAPTION"); |
| 111 figcaption.setInnerText(element.getInnerText()); |
| 112 return figcaption; |
71 } | 113 } |
72 } | 114 } |
OLD | NEW |