Chromium Code Reviews| Index: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| index 5b4eb0037ded139f59cadf45ba8df2cb4622e4f6..c879e5bfcb32e960c07d9fe3b54db1366ff6d4cd 100644 |
| --- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| +++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| @@ -4,9 +4,13 @@ |
| package org.chromium.distiller.extractors.embeds; |
| +import com.google.gwt.dom.client.Document; |
| import com.google.gwt.dom.client.Element; |
| import com.google.gwt.dom.client.ImageElement; |
| +import com.google.gwt.dom.client.NodeList; |
| +import org.chromium.distiller.DomUtil; |
| import org.chromium.distiller.LogUtil; |
| +import org.chromium.distiller.webdocument.WebFigure; |
| import org.chromium.distiller.webdocument.WebImage; |
| import java.util.HashSet; |
| @@ -18,12 +22,18 @@ import java.util.Set; |
| */ |
| public class ImageExtractor implements EmbedExtractor { |
| private static final Set<String> relevantTags = new HashSet<>(); |
| + private String imgSrc; |
| + private int width; |
| + private int height; |
| + |
| static { |
| // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. |
| relevantTags.add("IMG"); |
| + relevantTags.add("FIGURE"); |
| } |
| + |
| private static final String[] LAZY_IMAGE_ATTRIBUTES = |
| - {"data-src", "data-original", "datasrc", "data-url"}; |
| + {"data-src", "data-original", "datasrc", "data-url"}; |
| @Override |
| public Set<String> getRelevantTagNames() { |
| @@ -35,38 +45,63 @@ public class ImageExtractor implements EmbedExtractor { |
| if (!relevantTags.contains(e.getTagName())) { |
| return null; |
| } |
| - String imgSrc = ""; |
| - // Getting OffSetWidth/Height as default values, even they are |
| - // affected by padding, border, etc. |
| - int width = e.getOffsetWidth(); |
| - int height = e.getOffsetHeight(); |
| + imgSrc = ""; |
| + |
| if ("IMG".equals(e.getTagName())) { |
| - // This will get the absolute URL of the image and |
| - // the displayed image dimension. |
| - ImageElement imageElement = ImageElement.as(e); |
| - // Try to get lazily-loaded images before falling back to get the src attribute. |
| - for(String attr: LAZY_IMAGE_ATTRIBUTES) { |
| - imgSrc = imageElement.getAttribute(attr); |
| - if (!imgSrc.isEmpty()) |
| - break; |
| - } |
| - if (!imgSrc.isEmpty()) { |
| - // We cannot trust the dimension if the image is not loaded yet. |
| - // In some cases there are 1x1 placeholder images. |
| - width = 0; |
| - height = 0; |
| - } else { |
| - imgSrc = imageElement.getSrc(); |
| - // As an ImageElement is manipulated here, it is possible |
| - // to get the real dimensions. |
| - width = imageElement.getWidth(); |
| - height = imageElement.getHeight(); |
| + extractImageAttributes(ImageElement.as(e)); |
| + return new WebImage(e, width, height, imgSrc); |
| + } else if ("FIGURE".equals(e.getTagName())) { |
| + Element img = getFirstElementByTagName(e, "IMG"); |
| + if (img != null) { |
| + extractImageAttributes(ImageElement.as(img)); |
| + Element figcaption = Document.get().createElement("FIGCAPTION"); |
|
wychen
2016/08/05 02:13:03
If we need to keep the structure, directly pass th
|
| + Element cap = getFirstElementByTagName(e, "FIGCAPTION"); |
| + if (cap != null) { |
| + if (getFirstElementByTagName(cap, "A") != null) { |
| + figcaption.setInnerHTML(cap.getInnerHTML()); |
| + } else { |
| + figcaption.setInnerText(cap.getInnerText()); |
| + } |
| + } else { |
| + figcaption.setInnerText(e.getInnerText()); |
| + } |
| + return new WebFigure(img, width, height, imgSrc, figcaption); |
| } |
| } |
| + return null; |
| + } |
| + private void extractImageAttributes(ImageElement imageElement) { |
| + // This will get the absolute URL of the image and |
| + // the displayed image dimension. |
| + // Try to get lazily-loaded images before falling back to get the src attribute. |
| + for (String attr : LAZY_IMAGE_ATTRIBUTES) { |
| + imgSrc = imageElement.getAttribute(attr); |
| + if (!imgSrc.isEmpty()) |
| + break; |
| + } |
| + if (!imgSrc.isEmpty()) { |
| + // We cannot trust the dimension if the image is not loaded yet. |
| + // In some cases there are 1x1 placeholder images. |
| + width = 0; |
| + height = 0; |
| + } else { |
| + imgSrc = imageElement.getSrc(); |
| + // As an ImageElement is manipulated here, it is possible |
| + // to get the real dimensions. |
| + width = imageElement.getWidth(); |
| + height = imageElement.getHeight(); |
| + } |
| if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { |
| LogUtil.logToConsole("Extracted WebImage: " + imgSrc); |
| } |
| - return new WebImage(e, width, height, imgSrc); |
| + } |
| + |
| + private Element getFirstElementByTagName(Element e, String tagName) { |
| + NodeList<Element> elements = e.getElementsByTagName(tagName); |
| + if (elements.getLength() > 0) { |
| + return elements.getItem(0); |
| + } |
| + return null; |
| } |
| } |