| Index: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
|
| diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
|
| index 5b4eb0037ded139f59cadf45ba8df2cb4622e4f6..2e525f4e501358e6c0674e4bbc47a634e42736b6 100644
|
| --- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
|
| +++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
|
| @@ -6,7 +6,10 @@ package org.chromium.distiller.extractors.embeds;
|
|
|
| import com.google.gwt.dom.client.Element;
|
| import com.google.gwt.dom.client.ImageElement;
|
| +import com.google.gwt.dom.client.NodeList;
|
| +import org.chromium.distiller.DomUtil;
|
| import org.chromium.distiller.LogUtil;
|
| +import org.chromium.distiller.webdocument.WebFigure;
|
| import org.chromium.distiller.webdocument.WebImage;
|
|
|
| import java.util.HashSet;
|
| @@ -18,12 +21,18 @@ import java.util.Set;
|
| */
|
| public class ImageExtractor implements EmbedExtractor {
|
| private static final Set<String> relevantTags = new HashSet<>();
|
| + private String imgSrc; // image URL; written by extractImageAttributes(), read when the WebImage/WebFigure is built
|
| + private int width; // set by extractImageAttributes(); 0 when a lazy-load URL was used
|
| + private int height; // set by extractImageAttributes(); 0 when a lazy-load URL was used. NOTE(review): these three are per-instance scratch state - confirm that is intended
|
| +
|
| static {
|
| // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
|
| relevantTags.add("IMG");
|
| + relevantTags.add("FIGURE"); // a FIGURE is turned into a WebFigure built from its first IMG descendant
|
| }
|
| +
|
| private static final String[] LAZY_IMAGE_ATTRIBUTES =
|
| - {"data-src", "data-original", "datasrc", "data-url"};
|
| + {"data-src", "data-original", "datasrc", "data-url"};
|
|
|
| @Override
|
| public Set<String> getRelevantTagNames() {
|
| @@ -35,38 +44,60 @@ public class ImageExtractor implements EmbedExtractor {
|
| if (!relevantTags.contains(e.getTagName())) {
|
| return null;
|
| }
|
| - String imgSrc = "";
|
| - // Getting OffSetWidth/Height as default values, even they are
|
| - // affected by padding, border, etc.
|
| - int width = e.getOffsetWidth();
|
| - int height = e.getOffsetHeight();
|
| + imgSrc = "";
|
| +
|
| if ("IMG".equals(e.getTagName())) {
|
| - // This will get the absolute URL of the image and
|
| - // the displayed image dimension.
|
| - ImageElement imageElement = ImageElement.as(e);
|
| - // Try to get lazily-loaded images before falling back to get the src attribute.
|
| - for(String attr: LAZY_IMAGE_ATTRIBUTES) {
|
| - imgSrc = imageElement.getAttribute(attr);
|
| - if (!imgSrc.isEmpty())
|
| - break;
|
| - }
|
| - if (!imgSrc.isEmpty()) {
|
| - // We cannot trust the dimension if the image is not loaded yet.
|
| - // In some cases there are 1x1 placeholder images.
|
| - width = 0;
|
| - height = 0;
|
| - } else {
|
| - imgSrc = imageElement.getSrc();
|
| - // As an ImageElement is manipulated here, it is possible
|
| - // to get the real dimensions.
|
| - width = imageElement.getWidth();
|
| - height = imageElement.getHeight();
|
| + extractImageAttributes(ImageElement.as(e));
|
| + return new WebImage(e, width, height, imgSrc);
|
| + } else if ("FIGURE".equals(e.getTagName())) {
|
| + Element img = getFirstElementByTagName(e, "IMG");
|
| + if (img != null) {
|
| + String caption;
|
| + extractImageAttributes(ImageElement.as(img));
|
| + Element cap = getFirstElementByTagName(e, "FIGCAPTION");
|
| + if (cap != null) {
|
| + caption = getFirstElementByTagName(cap, "A") != null ?
|
| + cap.getInnerHTML() : DomUtil.escapeHTML(cap.getInnerText());
|
| + } else {
|
| + caption = DomUtil.escapeHTML(e.getInnerText());
|
| + }
|
| + return new WebFigure(img, width, height, imgSrc, caption);
|
| }
|
| }
|
| + return null;
|
| + }
|
|
|
| + private void extractImageAttributes(ImageElement imageElement) {
|
| + // Fills the instance fields imgSrc, width and height from the given image element;
|
| + // nothing is returned, the caller reads those fields afterwards.
|
| + // Lazy-load attributes are tried in LAZY_IMAGE_ATTRIBUTES order; the first non-empty value wins.
|
| + for (String attr : LAZY_IMAGE_ATTRIBUTES) {
|
| + imgSrc = imageElement.getAttribute(attr);
|
| + if (!imgSrc.isEmpty())
|
| + break;
|
| + }
|
| + if (!imgSrc.isEmpty()) {
|
| + // A lazy-load URL was found, so the real image may not be loaded yet and the element's
|
| + // dimensions cannot be trusted (some pages use 1x1 placeholder images): report 0x0.
|
| + width = 0;
|
| + height = 0;
|
| + } else {
|
| + imgSrc = imageElement.getSrc();
|
| + // No lazy-load attribute was set: fall back to the src and to the
|
| + // dimensions reported by the ImageElement itself.
|
| + width = imageElement.getWidth();
|
| + height = imageElement.getHeight();
|
| + }
|
| if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
|
| LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
|
| }
|
| - return new WebImage(e, width, height, imgSrc);
|
| + }
|
| +
|
| + private Element getFirstElementByTagName(Element e, String tagName) { // first match under e, or null when there is none
|
| + NodeList<Element> elements = e.getElementsByTagName(tagName); // presumably descendants in document order - confirm against the GWT docs
|
| + if (elements.getLength() > 0) {
|
| + return elements.getItem(0);
|
| + }
|
| + return null; // the lookup returned an empty list
|
| + }
|
| }
|
|
|