Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(96)

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2638823002: Support <picture> in image extraction (Closed)
Patch Set: support lazy loading in <picture> Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller.extractors.embeds; 5 package org.chromium.distiller.extractors.embeds;
6 6
7 import com.google.gwt.dom.client.Document; 7 import com.google.gwt.dom.client.Document;
8 import com.google.gwt.dom.client.Element; 8 import com.google.gwt.dom.client.Element;
9 import com.google.gwt.dom.client.ImageElement; 9 import com.google.gwt.dom.client.ImageElement;
10 import com.google.gwt.dom.client.NodeList; 10 import com.google.gwt.dom.client.NodeList;
(...skipping 11 matching lines...) Expand all
22 */ 22 */
23 public class ImageExtractor implements EmbedExtractor { 23 public class ImageExtractor implements EmbedExtractor {
24 private static final Set<String> relevantTags = new HashSet<>(); 24 private static final Set<String> relevantTags = new HashSet<>();
25 private String imgSrc; 25 private String imgSrc;
26 private int width; 26 private int width;
27 private int height; 27 private int height;
28 28
29 static { 29 static {
30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. 30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
31 relevantTags.add("IMG"); 31 relevantTags.add("IMG");
32 relevantTags.add("PICTURE");
32 relevantTags.add("FIGURE"); 33 relevantTags.add("FIGURE");
33 } 34 }
34 35
35 private static final String[] LAZY_IMAGE_ATTRIBUTES = 36 private static final String[] LAZY_IMAGE_ATTRIBUTES =
36 {"data-src", "data-original", "datasrc", "data-url"}; 37 {"data-src", "data-original", "datasrc", "data-url"};
37 38
38 @Override 39 @Override
39 public Set<String> getRelevantTagNames() { 40 public Set<String> getRelevantTagNames() {
40 return relevantTags; 41 return relevantTags;
41 } 42 }
42 43
43 @Override 44 @Override
44 public WebImage extract(Element e) { 45 public WebImage extract(Element e) {
45 if (!relevantTags.contains(e.getTagName())) { 46 if (!relevantTags.contains(e.getTagName())) {
46 return null; 47 return null;
47 } 48 }
48 imgSrc = ""; 49 imgSrc = "";
49 50
50 if ("IMG".equals(e.getTagName())) { 51 ImageElement ie = ImageElement.as(DomUtil.getFirstElementByTagNameInc(e, "IMG"));
51 extractImageAttributes(ImageElement.as(e)); 52
52 return new WebImage(e, width, height, imgSrc); 53 if ("FIGURE".equals(e.getTagName())) {
53 } else if ("FIGURE".equals(e.getTagName())) { 54 Element img = DomUtil.getFirstElementByTagName(e, "PICTURE");
54 Element img = getFirstElementByTagName(e, "IMG"); 55 if (img == null) {
55 if (img != null) { 56 img = DomUtil.getFirstElementByTagName(e, "IMG");
56 extractImageAttributes(ImageElement.as(img));
57 Element figcaption;
58 Element cap = getFirstElementByTagName(e, "FIGCAPTION");
59 if (cap != null) {
60 // We look for links because some sites put non-caption
61 // elements into <figcaption>. For example: image credit
62 // could contain a link. So we get the whole DOM structure within
63 // <figcaption> only when it contains links, otherwise we get the innerText.
64 figcaption = getFirstElementByTagName(cap, "A") != null ?
65 cap : createFigcaptionElement(cap);
66 } else {
67 figcaption = createFigcaptionElement(e);
68 }
69 return new WebFigure(img, width, height, imgSrc, figcaption);
70 } 57 }
58 if (img == null) {
59 return null;
60 }
61 extractImageAttributes(ie);
62 Element figcaption;
63 Element cap = DomUtil.getFirstElementByTagName(e, "FIGCAPTION");
64 if (cap != null) {
65 // We look for links because some sites put non-caption
66 // elements into <figcaption>. For example: image credit
67 // could contain a link. So we get the whole DOM structure within
68 // <figcaption> only when it contains links, otherwise we get the innerText.
69 figcaption = DomUtil.getFirstElementByTagName(cap, "A") != null ?
70 cap : createFigcaptionElement(cap);
71 } else {
72 figcaption = createFigcaptionElement(e);
73 }
74 return new WebFigure(img, width, height, imgSrc, figcaption);
71 } 75 }
72 return null; 76
77 extractImageAttributes(ie);
78 return new WebImage(e, width, height, imgSrc);
73 } 79 }
74 80
75 private void extractImageAttributes(ImageElement imageElement) { 81 private void extractImageAttributes(ImageElement imageElement) {
76 // This will get the absolute URL of the image and 82 // This will get the absolute URL of the image and
77 // the displayed image dimension. 83 // the displayed image dimension.
78 // Try to get lazily-loaded images before falling back to get the src attribute. 84 // Try to get lazily-loaded images before falling back to get the src attribute.
79 for (String attr : LAZY_IMAGE_ATTRIBUTES) { 85 for (String attr : LAZY_IMAGE_ATTRIBUTES) {
80 imgSrc = imageElement.getAttribute(attr); 86 imgSrc = imageElement.getAttribute(attr);
81 if (!imgSrc.isEmpty()) 87 if (!imgSrc.isEmpty())
82 break; 88 break;
83 } 89 }
84 if (!imgSrc.isEmpty()) { 90 if (!imgSrc.isEmpty()) {
85 // We cannot trust the dimension if the image is not loaded yet. 91 // We cannot trust the dimension if the image is not loaded yet.
86 // In some cases there are 1x1 placeholder images. 92 // In some cases there are 1x1 placeholder images.
87 width = 0; 93 width = 0;
88 height = 0; 94 height = 0;
89 } else { 95 } else {
90 imgSrc = imageElement.getSrc(); 96 imgSrc = imageElement.getSrc();
91 // As an ImageElement is manipulated here, it is possible 97 // As an ImageElement is manipulated here, it is possible
92 // to get the real dimensions. 98 // to get the real dimensions.
93 width = imageElement.getWidth(); 99 width = imageElement.getWidth();
94 height = imageElement.getHeight(); 100 height = imageElement.getHeight();
95 } 101 }
96 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { 102 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
97 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); 103 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
98 } 104 }
99 } 105 }
100 106
101 private Element getFirstElementByTagName(Element e, String tagName) {
102 NodeList<Element> elements = e.getElementsByTagName(tagName);
103 if (elements.getLength() > 0) {
104 return elements.getItem(0);
105 }
106 return null;
107 }
108
109 private Element createFigcaptionElement(Element element) { 107 private Element createFigcaptionElement(Element element) {
110 Element figcaption = Document.get().createElement("FIGCAPTION"); 108 Element figcaption = Document.get().createElement("FIGCAPTION");
111 figcaption.setInnerText(DomUtil.getInnerText(element)); 109 figcaption.setInnerText(DomUtil.getInnerText(element));
112 return figcaption; 110 return figcaption;
113 } 111 }
114 } 112 }
OLDNEW
« no previous file with comments | « java/org/chromium/distiller/DomUtil.java ('k') | java/org/chromium/distiller/webdocument/WebImage.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698