Chromium Code Reviews| Index: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| index c9b527a117c8c5d99d483b94de5ff2bd6d486f9c..23b10640758eb9dfc760af960481899fe3cbd326 100644 |
| --- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| +++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java |
| @@ -6,6 +6,8 @@ package org.chromium.distiller.extractors.embeds; |
| import com.google.gwt.dom.client.Element; |
| import com.google.gwt.dom.client.ImageElement; |
| +import com.google.gwt.dom.client.NodeList; |
| +import org.chromium.distiller.webdocument.WebFigure; |
| import org.chromium.distiller.webdocument.WebImage; |
| import java.util.HashSet; |
| @@ -17,9 +19,14 @@ import java.util.Set; |
| */ |
| public class ImageExtractor implements EmbedExtractor { |
| private static final Set<String> relevantTags = new HashSet<>(); |
| + private String src; |
| + private int width; |
| + private int height; |
| + |
| static { |
| // TODO(mdjones): Add "DIV" to this list for CSS images and possibly captions. |
| relevantTags.add("IMG"); |
| + relevantTags.add("FIGURE"); |
| } |
| @Override |
| @@ -32,22 +39,41 @@ public class ImageExtractor implements EmbedExtractor { |
| if (!relevantTags.contains(e.getTagName())) { |
| return null; |
| } |
| - String imgSrc = ""; |
| // Use offsetWidth/offsetHeight as the default values, even though they are |
| // affected by padding, border, etc. |
| - int width = e.getOffsetWidth(); |
| - int height = e.getOffsetHeight(); |
| + width = e.getOffsetWidth(); |
| + height = e.getOffsetHeight(); |
| + src = ""; |
| + |
| if ("IMG".equals(e.getTagName())) { |
| - // This will get the absolute URL of the image and |
| - // the displayed image dimension. |
| - ImageElement imageElement = ImageElement.as(e); |
| - imgSrc = imageElement.getSrc(); |
| - // As an ImageElement is manipulated here, it is possible |
| - // to get the real dimensions. |
| - width = imageElement.getWidth(); |
| - height = imageElement.getHeight(); |
| + extractImageAttributes(ImageElement.as(e)); |
| + return new WebImage(e, width, height, src); |
| + } else if ("FIGURE".equals(e.getTagName())) { |
| + Element img = getFirstElementByTagName(e, "IMG"); |
| + if (img != null) { |
| + String caption = ""; |
| + extractImageAttributes(ImageElement.as(img)); |
| + Element cap = getFirstElementByTagName(e, "FIGCAPTION"); |
|
wychen
2016/06/02 23:48:49
Sadly some web sites don't follow the spec.
For e
marcelorcorrea
2016/06/06 20:38:30
I see your point. I thought about doing that too,
wychen
2016/06/06 21:49:05
Sounds good.
|
| + if (cap != null) { |
| + caption = cap.getInnerText(); |
|
wychen
2016/06/02 23:48:48
Some sites put non-caption elements into <figcapti
wychen
2016/06/02 23:48:49
Another issue: image credit could contain a link.
marcelorcorrea
2016/06/06 20:38:30
Do you think it would be better if we kept the lin
wychen
2016/06/06 21:49:05
I'd like to keep the link, but retaining the DOM t
|
| + } |
| + return new WebFigure(img, width, height, src, caption); |
| + } |
| } |
| + return null; |
| + } |
| - return new WebImage(e, width, height, imgSrc); |
| + private void extractImageAttributes(ImageElement img) { |
| + src = img.getSrc(); |
| + width = img.getWidth(); |
| + height = img.getHeight(); |
| + } |
| + |
| + private Element getFirstElementByTagName(Element e, String tagName) { |
| + NodeList<Element> elements = e.getElementsByTagName(tagName); |
| + if (elements.getLength() > 0) { |
| + return elements.getItem(0); |
| + } |
| + return null; |
| } |
| } |