Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(370)

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2020403002: Add support for figure element (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: mdjones' comments addressed Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/WebFigure.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller.extractors.embeds; 5 package org.chromium.distiller.extractors.embeds;
6 6
7 import com.google.gwt.dom.client.Document;
7 import com.google.gwt.dom.client.Element; 8 import com.google.gwt.dom.client.Element;
8 import com.google.gwt.dom.client.ImageElement; 9 import com.google.gwt.dom.client.ImageElement;
10 import com.google.gwt.dom.client.NodeList;
11 import org.chromium.distiller.DomUtil;
9 import org.chromium.distiller.LogUtil; 12 import org.chromium.distiller.LogUtil;
13 import org.chromium.distiller.webdocument.WebFigure;
10 import org.chromium.distiller.webdocument.WebImage; 14 import org.chromium.distiller.webdocument.WebImage;
11 15
12 import java.util.HashSet; 16 import java.util.HashSet;
13 import java.util.Set; 17 import java.util.Set;
14 18
15 /** 19 /**
16 * This class treats images as another type of embed and provides heuristics for lead image 20 * This class treats images as another type of embed and provides heuristics for lead image
17 * candidacy. 21 * candidacy.
18 */ 22 */
19 public class ImageExtractor implements EmbedExtractor { 23 public class ImageExtractor implements EmbedExtractor {
20 private static final Set<String> relevantTags = new HashSet<>(); 24 private static final Set<String> relevantTags = new HashSet<>();
25 private String imgSrc;
26 private int width;
27 private int height;
28
21 static { 29 static {
22 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. 30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
23 relevantTags.add("IMG"); 31 relevantTags.add("IMG");
32 relevantTags.add("FIGURE");
24 } 33 }
34
25 private static final String[] LAZY_IMAGE_ATTRIBUTES = 35 private static final String[] LAZY_IMAGE_ATTRIBUTES =
26 {"data-src", "data-original", "datasrc", "data-url"}; 36 {"data-src", "data-original", "datasrc", "data-url"};
27 37
28 @Override 38 @Override
29 public Set<String> getRelevantTagNames() { 39 public Set<String> getRelevantTagNames() {
30 return relevantTags; 40 return relevantTags;
31 } 41 }
32 42
33 @Override 43 @Override
34 public WebImage extract(Element e) { 44 public WebImage extract(Element e) {
35 if (!relevantTags.contains(e.getTagName())) { 45 if (!relevantTags.contains(e.getTagName())) {
36 return null; 46 return null;
37 } 47 }
38 String imgSrc = ""; 48 imgSrc = "";
39 // Getting OffSetWidth/Height as default values, even they are 49
40 // affected by padding, border, etc.
41 int width = e.getOffsetWidth();
42 int height = e.getOffsetHeight();
43 if ("IMG".equals(e.getTagName())) { 50 if ("IMG".equals(e.getTagName())) {
44 // This will get the absolute URL of the image and 51 extractImageAttributes(ImageElement.as(e));
45 // the displayed image dimension. 52 return new WebImage(e, width, height, imgSrc);
46 ImageElement imageElement = ImageElement.as(e); 53 } else if ("FIGURE".equals(e.getTagName())) {
47 // Try to get lazily-loaded images before falling back to get the src attribute. 54 Element img = getFirstElementByTagName(e, "IMG");
48 for(String attr: LAZY_IMAGE_ATTRIBUTES) { 55 if (img != null) {
49 imgSrc = imageElement.getAttribute(attr); 56 extractImageAttributes(ImageElement.as(img));
50 if (!imgSrc.isEmpty()) 57 Element figcaption;
51 break; 58 Element cap = getFirstElementByTagName(e, "FIGCAPTION");
52 } 59 if (cap != null) {
53 if (!imgSrc.isEmpty()) { 60 // We look for links because some sites put non-caption
54 // We cannot trust the dimension if the image is not loaded yet. 61 // elements into <figcaption>. For example: image credit
55 // In some cases there are 1x1 placeholder images. 62 // could contain a link. So we get the whole DOM structure within
56 width = 0; 63 // <figcaption> only when it contains links, otherwise we get the innerText.
57 height = 0; 64 figcaption = getFirstElementByTagName(cap, "A") != null ?
58 } else { 65 cap : createFigcaptionElement(cap);
59 imgSrc = imageElement.getSrc(); 66 } else {
60 // As an ImageElement is manipulated here, it is possible 67 figcaption = createFigcaptionElement(e);
61 // to get the real dimensions. 68 }
62 width = imageElement.getWidth(); 69 return new WebFigure(img, width, height, imgSrc, figcaption);
63 height = imageElement.getHeight();
64 } 70 }
65 } 71 }
72 return null;
73 }
66 74
75 private void extractImageAttributes(ImageElement imageElement) {
76 // This will get the absolute URL of the image and
77 // the displayed image dimension.
78 // Try to get lazily-loaded images before falling back to get the src attribute.
79 for (String attr : LAZY_IMAGE_ATTRIBUTES) {
80 imgSrc = imageElement.getAttribute(attr);
81 if (!imgSrc.isEmpty())
82 break;
83 }
84 if (!imgSrc.isEmpty()) {
85 // We cannot trust the dimension if the image is not loaded yet.
86 // In some cases there are 1x1 placeholder images.
87 width = 0;
88 height = 0;
89 } else {
90 imgSrc = imageElement.getSrc();
91 // As an ImageElement is manipulated here, it is possible
92 // to get the real dimensions.
93 width = imageElement.getWidth();
94 height = imageElement.getHeight();
95 }
67 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { 96 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
68 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); 97 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
69 } 98 }
70 return new WebImage(e, width, height, imgSrc); 99 }
100
101 private Element getFirstElementByTagName(Element e, String tagName) {
102 NodeList<Element> elements = e.getElementsByTagName(tagName);
103 if (elements.getLength() > 0) {
104 return elements.getItem(0);
105 }
106 return null;
107 }
108
109 private Element createFigcaptionElement(Element element) {
110 Element figcaption = Document.get().createElement("FIGCAPTION");
111 figcaption.setInnerText(element.getInnerText());
112 return figcaption;
71 } 113 }
72 } 114 }
OLDNEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/WebFigure.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698