Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(243)

Unified Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2020403002: Add support for figure element (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: mdjones' comments addressed Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/WebFigure.java » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
index 5b4eb0037ded139f59cadf45ba8df2cb4622e4f6..540ae995d9eff91db60471bc757c33db9555b05e 100644
--- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
+++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
@@ -4,9 +4,13 @@
package org.chromium.distiller.extractors.embeds;
+import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.ImageElement;
+import com.google.gwt.dom.client.NodeList;
+import org.chromium.distiller.DomUtil;
import org.chromium.distiller.LogUtil;
+import org.chromium.distiller.webdocument.WebFigure;
import org.chromium.distiller.webdocument.WebImage;
import java.util.HashSet;
@@ -18,12 +22,18 @@ import java.util.Set;
*/
public class ImageExtractor implements EmbedExtractor {
private static final Set<String> relevantTags = new HashSet<>();
+ private String imgSrc;
+ private int width;
+ private int height;
+
static {
// TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
relevantTags.add("IMG");
+ relevantTags.add("FIGURE");
}
+
private static final String[] LAZY_IMAGE_ATTRIBUTES =
- {"data-src", "data-original", "datasrc", "data-url"};
+ {"data-src", "data-original", "datasrc", "data-url"};
@Override
public Set<String> getRelevantTagNames() {
@@ -35,38 +45,70 @@ public class ImageExtractor implements EmbedExtractor {
if (!relevantTags.contains(e.getTagName())) {
return null;
}
- String imgSrc = "";
- // Getting OffSetWidth/Height as default values, even they are
- // affected by padding, border, etc.
- int width = e.getOffsetWidth();
- int height = e.getOffsetHeight();
+ imgSrc = "";
+
if ("IMG".equals(e.getTagName())) {
- // This will get the absolute URL of the image and
- // the displayed image dimension.
- ImageElement imageElement = ImageElement.as(e);
- // Try to get lazily-loaded images before falling back to get the src attribute.
- for(String attr: LAZY_IMAGE_ATTRIBUTES) {
- imgSrc = imageElement.getAttribute(attr);
- if (!imgSrc.isEmpty())
- break;
- }
- if (!imgSrc.isEmpty()) {
- // We cannot trust the dimension if the image is not loaded yet.
- // In some cases there are 1x1 placeholder images.
- width = 0;
- height = 0;
- } else {
- imgSrc = imageElement.getSrc();
- // As an ImageElement is manipulated here, it is possible
- // to get the real dimensions.
- width = imageElement.getWidth();
- height = imageElement.getHeight();
+ extractImageAttributes(ImageElement.as(e));
+ return new WebImage(e, width, height, imgSrc);
+ } else if ("FIGURE".equals(e.getTagName())) {
+ Element img = getFirstElementByTagName(e, "IMG");
+ if (img != null) {
+ extractImageAttributes(ImageElement.as(img));
+ Element figcaption;
+ Element cap = getFirstElementByTagName(e, "FIGCAPTION");
+ if (cap != null) {
+ // We look for links because some sites put non-caption elements
+ // into <figcaption>; for example, an image credit could contain a link.
+ // So the whole DOM structure within <figcaption> is kept only when it
+ // contains links; otherwise only its innerText is used.
+ figcaption = getFirstElementByTagName(cap, "A") != null ?
+ cap : createFigcaptionElement(cap);
+ } else {
+ figcaption = createFigcaptionElement(e);
+ }
+ return new WebFigure(img, width, height, imgSrc, figcaption);
}
}
+ return null;
+ }
+ private void extractImageAttributes(ImageElement imageElement) {
+ // This will get the absolute URL of the image and
+ // the displayed image dimension.
+ // Try to get lazily-loaded images before falling back to get the src attribute.
+ for (String attr : LAZY_IMAGE_ATTRIBUTES) {
+ imgSrc = imageElement.getAttribute(attr);
+ if (!imgSrc.isEmpty())
+ break;
+ }
+ if (!imgSrc.isEmpty()) {
+ // We cannot trust the dimension if the image is not loaded yet.
+ // In some cases there are 1x1 placeholder images.
+ width = 0;
+ height = 0;
+ } else {
+ imgSrc = imageElement.getSrc();
+ // As an ImageElement is manipulated here, it is possible
+ // to get the real dimensions.
+ width = imageElement.getWidth();
+ height = imageElement.getHeight();
+ }
if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
}
- return new WebImage(e, width, height, imgSrc);
+ }
+
+ private Element getFirstElementByTagName(Element e, String tagName) {
+ NodeList<Element> elements = e.getElementsByTagName(tagName);
+ if (elements.getLength() > 0) {
+ return elements.getItem(0);
+ }
+ return null;
+ }
+
+ private Element createFigcaptionElement(Element element) {
+ Element figcaption = Document.get().createElement("FIGCAPTION");
+ figcaption.setInnerText(element.getInnerText());
+ return figcaption;
}
}
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/WebFigure.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698