java/org/chromium/distiller/extractors/embeds/ImageExtractor.java - Issue 2020403002: Add support for figure element

Unified Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2020403002: Add support for figure element (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: comments addressed Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

index c9b527a117c8c5d99d483b94de5ff2bd6d486f9c..23b10640758eb9dfc760af960481899fe3cbd326 100644

--- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

+++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

@@ -6,6 +6,8 @@ package org.chromium.distiller.extractors.embeds;

import com.google.gwt.dom.client.Element;

import com.google.gwt.dom.client.ImageElement;

+import com.google.gwt.dom.client.NodeList;

+import org.chromium.distiller.webdocument.WebFigure;

import org.chromium.distiller.webdocument.WebImage;

import java.util.HashSet;

@@ -17,9 +19,14 @@ import java.util.Set;

public class ImageExtractor implements EmbedExtractor {

private static final Set<String> relevantTags = new HashSet<>();

+ private String src;

+ private int width;

+ private int height;

static {

// TODO(mdjones): Add "DIV" to this list for css images and possibly captions.

relevantTags.add("IMG");

+ relevantTags.add("FIGURE");

}

@Override

@@ -32,22 +39,41 @@ public class ImageExtractor implements EmbedExtractor {

if (!relevantTags.contains(e.getTagName())) {

return null;

}

- String imgSrc = "";

// Use offsetWidth/offsetHeight as default values, even though they are

// affected by padding, border, etc.

- int width = e.getOffsetWidth();

- int height = e.getOffsetHeight();

+ width = e.getOffsetWidth();

+ height = e.getOffsetHeight();

+ src = "";

if ("IMG".equals(e.getTagName())) {

- // This will get the absolute URL of the image and

- // the displayed image dimension.

- ImageElement imageElement = ImageElement.as(e);

- imgSrc = imageElement.getSrc();

- // As an ImageElement is manipulated here, it is possible

- // to get the real dimensions.

- width = imageElement.getWidth();

- height = imageElement.getHeight();

+ extractImageAttributes(ImageElement.as(e));

+ return new WebImage(e, width, height, src);

+ } else if ("FIGURE".equals(e.getTagName())) {

+ Element img = getFirstElementByTagName(e, "IMG");

+ if (img != null) {

+ String caption = "";

+ extractImageAttributes(ImageElement.as(img));

+ Element cap = getFirstElementByTagName(e, "FIGCAPTION");

wychen 2016/06/02 23:48:49 Sadly some web sites don't follow the spec. For e

marcelorcorrea 2016/06/06 20:38:30 I see your point. I thought about doing that too,

wychen 2016/06/06 21:49:05 Sounds good.

+ if (cap != null) {

+ caption = cap.getInnerText();

wychen 2016/06/02 23:48:48 Some sites put non-caption elements into <figcapti

wychen 2016/06/02 23:48:49 Another issue: image credit could contain a link.

marcelorcorrea 2016/06/06 20:38:30 Do you think it would be better if we kept the lin

wychen 2016/06/06 21:49:05 I'd like to keep the link, but retaining the DOM t

+ }

+ return new WebFigure(img, width, height, src, caption);

+ }

}

+ return null;

+ }

- return new WebImage(e, width, height, imgSrc);

+ private void extractImageAttributes(ImageElement img) {

+ src = img.getSrc();

+ width = img.getWidth();

+ height = img.getHeight();

+ }

+ private Element getFirstElementByTagName(Element e, String tagName) {

+ NodeList<Element> elements = e.getElementsByTagName(tagName);

+ if (elements.getLength() > 0) {

+ return elements.getItem(0);

+ }

+ return null;

}

« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/WebFigure.java » ('j') | java/org/chromium/distiller/webdocument/WebFigure.java » ('J')