Index: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
index 5b4eb0037ded139f59cadf45ba8df2cb4622e4f6..540ae995d9eff91db60471bc757c33db9555b05e 100644 |
--- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
+++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
@@ -4,9 +4,13 @@ |
package org.chromium.distiller.extractors.embeds; |
+import com.google.gwt.dom.client.Document; |
import com.google.gwt.dom.client.Element; |
import com.google.gwt.dom.client.ImageElement; |
+import com.google.gwt.dom.client.NodeList; |
+import org.chromium.distiller.DomUtil; |
import org.chromium.distiller.LogUtil; |
+import org.chromium.distiller.webdocument.WebFigure; |
import org.chromium.distiller.webdocument.WebImage; |
import java.util.HashSet; |
@@ -18,12 +22,18 @@ import java.util.Set; |
*/ |
public class ImageExtractor implements EmbedExtractor { |
private static final Set<String> relevantTags = new HashSet<>(); |
+ private String imgSrc; |
+ private int width; |
+ private int height; |
+ |
static { |
// TODO(mdjones): Add "DIV" to this list for css images and possibly captions. |
relevantTags.add("IMG"); |
+ relevantTags.add("FIGURE"); |
} |
+ |
private static final String[] LAZY_IMAGE_ATTRIBUTES = |
- {"data-src", "data-original", "datasrc", "data-url"}; |
+ {"data-src", "data-original", "datasrc", "data-url"}; |
@Override |
public Set<String> getRelevantTagNames() { |
@@ -35,38 +45,70 @@ public class ImageExtractor implements EmbedExtractor { |
if (!relevantTags.contains(e.getTagName())) { |
return null; |
} |
- String imgSrc = ""; |
- // Getting OffSetWidth/Height as default values, even they are |
- // affected by padding, border, etc. |
- int width = e.getOffsetWidth(); |
- int height = e.getOffsetHeight(); |
+ imgSrc = ""; |
+ |
if ("IMG".equals(e.getTagName())) { |
- // This will get the absolute URL of the image and |
- // the displayed image dimension. |
- ImageElement imageElement = ImageElement.as(e); |
- // Try to get lazily-loaded images before falling back to get the src attribute. |
- for(String attr: LAZY_IMAGE_ATTRIBUTES) { |
- imgSrc = imageElement.getAttribute(attr); |
- if (!imgSrc.isEmpty()) |
- break; |
- } |
- if (!imgSrc.isEmpty()) { |
- // We cannot trust the dimension if the image is not loaded yet. |
- // In some cases there are 1x1 placeholder images. |
- width = 0; |
- height = 0; |
- } else { |
- imgSrc = imageElement.getSrc(); |
- // As an ImageElement is manipulated here, it is possible |
- // to get the real dimensions. |
- width = imageElement.getWidth(); |
- height = imageElement.getHeight(); |
+ extractImageAttributes(ImageElement.as(e)); |
+ return new WebImage(e, width, height, imgSrc); |
+ } else if ("FIGURE".equals(e.getTagName())) { |
+ Element img = getFirstElementByTagName(e, "IMG"); |
+ if (img != null) { |
+ extractImageAttributes(ImageElement.as(img)); |
+ Element figcaption; |
+ Element cap = getFirstElementByTagName(e, "FIGCAPTION"); |
+ if (cap != null) { |
+ // We look for links because some sites put non-caption |
+ // elements into <figcaption>. For example: image credit |
+ // could contain a link. So we get the whole DOM structure within |
+ // <figcaption> only when it contains links, otherwise we get the innerText. |
+ figcaption = getFirstElementByTagName(cap, "A") != null ? |
+ cap : createFigcaptionElement(cap); |
+ } else { |
+ figcaption = createFigcaptionElement(e); |
+ } |
+ return new WebFigure(img, width, height, imgSrc, figcaption); |
} |
} |
+ return null; |
+ } |
+ private void extractImageAttributes(ImageElement imageElement) { |
+ // This will get the absolute URL of the image and |
+ // the displayed image dimension. |
+ // Try to get lazily-loaded images before falling back to get the src attribute. |
+ for (String attr : LAZY_IMAGE_ATTRIBUTES) { |
+ imgSrc = imageElement.getAttribute(attr); |
+ if (!imgSrc.isEmpty()) |
+ break; |
+ } |
+ if (!imgSrc.isEmpty()) { |
+ // We cannot trust the dimension if the image is not loaded yet. |
+ // In some cases there are 1x1 placeholder images. |
+ width = 0; |
+ height = 0; |
+ } else { |
+ imgSrc = imageElement.getSrc(); |
+ // As an ImageElement is manipulated here, it is possible |
+ // to get the real dimensions. |
+ width = imageElement.getWidth(); |
+ height = imageElement.getHeight(); |
+ } |
if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { |
LogUtil.logToConsole("Extracted WebImage: " + imgSrc); |
} |
- return new WebImage(e, width, height, imgSrc); |
+ } |
+ |
+ private Element getFirstElementByTagName(Element e, String tagName) { |
+ NodeList<Element> elements = e.getElementsByTagName(tagName); |
+ if (elements.getLength() > 0) { |
+ return elements.getItem(0); |
+ } |
+ return null; |
+ } |
+ |
+ private Element createFigcaptionElement(Element element) { |
+ Element figcaption = Document.get().createElement("FIGCAPTION"); |
+ figcaption.setInnerText(element.getInnerText()); |
+ return figcaption; |
} |
} |