Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(391)

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2020403002: Add support for figure element (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: comments addressed Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller.extractors.embeds; 5 package org.chromium.distiller.extractors.embeds;
6 6
7 import com.google.gwt.dom.client.Element; 7 import com.google.gwt.dom.client.Element;
8 import com.google.gwt.dom.client.ImageElement; 8 import com.google.gwt.dom.client.ImageElement;
9 import com.google.gwt.dom.client.NodeList;
10 import org.chromium.distiller.webdocument.WebFigure;
9 import org.chromium.distiller.webdocument.WebImage; 11 import org.chromium.distiller.webdocument.WebImage;
10 12
11 import java.util.HashSet; 13 import java.util.HashSet;
12 import java.util.Set; 14 import java.util.Set;
13 15
14 /** 16 /**
15 * This class treats images as another type of embed and provides heuristics for lead image 17 * This class treats images as another type of embed and provides heuristics for lead image
16 * candidacy. 18 * candidacy.
17 */ 19 */
18 public class ImageExtractor implements EmbedExtractor { 20 public class ImageExtractor implements EmbedExtractor {
19 private static final Set<String> relevantTags = new HashSet<>(); 21 private static final Set<String> relevantTags = new HashSet<>();
22 private String src;
23 private int width;
24 private int height;
25
20 static { 26 static {
21 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. 27 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
22 relevantTags.add("IMG"); 28 relevantTags.add("IMG");
29 relevantTags.add("FIGURE");
23 } 30 }
24 31
25 @Override 32 @Override
26 public Set<String> getRelevantTagNames() { 33 public Set<String> getRelevantTagNames() {
27 return relevantTags; 34 return relevantTags;
28 } 35 }
29 36
30 @Override 37 @Override
31 public WebImage extract(Element e) { 38 public WebImage extract(Element e) {
32 if (!relevantTags.contains(e.getTagName())) { 39 if (!relevantTags.contains(e.getTagName())) {
33 return null; 40 return null;
34 } 41 }
35 String imgSrc = "";
36 // Getting OffSetWidth/Height as default values, even though they are 42 // Getting OffSetWidth/Height as default values, even though they are
37 // affected by padding, border, etc. 43 // affected by padding, border, etc.
38 int width = e.getOffsetWidth(); 44 width = e.getOffsetWidth();
39 int height = e.getOffsetHeight(); 45 height = e.getOffsetHeight();
46 src = "";
47
40 if ("IMG".equals(e.getTagName())) { 48 if ("IMG".equals(e.getTagName())) {
41 // This will get the absolute URL of the image and 49 extractImageAttributes(ImageElement.as(e));
42 // the displayed image dimension. 50 return new WebImage(e, width, height, src);
43 ImageElement imageElement = ImageElement.as(e); 51 } else if ("FIGURE".equals(e.getTagName())) {
44 imgSrc = imageElement.getSrc(); 52 Element img = getFirstElementByTagName(e, "IMG");
45 // As an ImageElement is manipulated here, it is possible 53 if (img != null) {
46 // to get the real dimensions. 54 String caption = "";
47 width = imageElement.getWidth(); 55 extractImageAttributes(ImageElement.as(img));
48 height = imageElement.getHeight(); 56 Element cap = getFirstElementByTagName(e, "FIGCAPTION");
wychen 2016/06/02 23:48:49 Sadly some web sites don't follow the spec. For e
marcelorcorrea 2016/06/06 20:38:30 I see your point. I thought about doing that too,
wychen 2016/06/06 21:49:05 Sounds good.
57 if (cap != null) {
58 caption = cap.getInnerText();
wychen 2016/06/02 23:48:48 Some sites put non-caption elements into <figcapti
wychen 2016/06/02 23:48:49 Another issue: image credit could contain a link.
marcelorcorrea 2016/06/06 20:38:30 Do you think it would be better if we kept the lin
wychen 2016/06/06 21:49:05 I'd like to keep the link, but retaining the DOM t
59 }
60 return new WebFigure(img, width, height, src, caption);
61 }
49 } 62 }
63 return null;
64 }
50 65
51 return new WebImage(e, width, height, imgSrc); 66 private void extractImageAttributes(ImageElement img) {
67 src = img.getSrc();
68 width = img.getWidth();
69 height = img.getHeight();
70 }
71
72 private Element getFirstElementByTagName(Element e, String tagName) {
73 NodeList<Element> elements = e.getElementsByTagName(tagName);
74 if (elements.getLength() > 0) {
75 return elements.getItem(0);
76 }
77 return null;
52 } 78 }
53 } 79 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698