Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(667)

Side by Side Diff: java/org/chromium/distiller/extractors/embeds/ImageExtractor.java

Issue 2729143003: Handle image lazy loading on Wikipedia (Closed)
Patch Set: fixup non-matching case Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | javatests/org/chromium/distiller/ContentExtractorTest.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller.extractors.embeds; 5 package org.chromium.distiller.extractors.embeds;
6 6
7 import com.google.gwt.dom.client.Document; 7 import com.google.gwt.dom.client.Document;
8 import com.google.gwt.dom.client.Element; 8 import com.google.gwt.dom.client.Element;
9 import com.google.gwt.dom.client.ImageElement; 9 import com.google.gwt.dom.client.ImageElement;
10 import com.google.gwt.dom.client.NodeList; 10 import com.google.gwt.dom.client.NodeList;
11 import org.chromium.distiller.DomUtil; 11 import org.chromium.distiller.DomUtil;
12 import org.chromium.distiller.JavaScript;
12 import org.chromium.distiller.LogUtil; 13 import org.chromium.distiller.LogUtil;
13 import org.chromium.distiller.webdocument.WebFigure; 14 import org.chromium.distiller.webdocument.WebFigure;
14 import org.chromium.distiller.webdocument.WebImage; 15 import org.chromium.distiller.webdocument.WebImage;
15 16
16 import java.util.HashSet; 17 import java.util.HashSet;
17 import java.util.Set; 18 import java.util.Set;
18 19
19 /** 20 /**
20 * This class treats images as another type of embed and provides heuristics for lead image 21 * This class treats images as another type of embed and provides heuristics for lead image
21 * candidacy. 22 * candidacy.
22 */ 23 */
23 public class ImageExtractor implements EmbedExtractor { 24 public class ImageExtractor implements EmbedExtractor {
24 private static final Set<String> relevantTags = new HashSet<>(); 25 private static final Set<String> relevantTags = new HashSet<>();
25 private String imgSrc; 26 private String imgSrc;
26 private int width; 27 private int width;
27 private int height; 28 private int height;
28 29
29 static { 30 static {
30 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. 31 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
31 relevantTags.add("IMG"); 32 relevantTags.add("IMG");
32 relevantTags.add("PICTURE"); 33 relevantTags.add("PICTURE");
33 relevantTags.add("FIGURE"); 34 relevantTags.add("FIGURE");
35 relevantTags.add("SPAN");
34 } 36 }
35 37
36 private static final String[] LAZY_IMAGE_ATTRIBUTES = 38 private static final String[] LAZY_IMAGE_ATTRIBUTES =
37 {"data-src", "data-original", "datasrc", "data-url"}; 39 {"data-src", "data-original", "datasrc", "data-url"};
38 40
39 @Override 41 @Override
40 public Set<String> getRelevantTagNames() { 42 public Set<String> getRelevantTagNames() {
41 return relevantTags; 43 return relevantTags;
42 } 44 }
43 45
(...skipping 24 matching lines...) Expand all
68 // <figcaption> only when it contains links, otherwise we get the innerText. 70 // <figcaption> only when it contains links, otherwise we get the innerText.
69 NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF] "); 71 NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF] ");
70 figcaption = links.getLength() > 0 ? 72 figcaption = links.getLength() > 0 ?
71 cap : createFigcaptionElement(cap); 73 cap : createFigcaptionElement(cap);
72 } else { 74 } else {
73 figcaption = createFigcaptionElement(e); 75 figcaption = createFigcaptionElement(e);
74 } 76 }
75 return new WebFigure(img, width, height, imgSrc, figcaption); 77 return new WebFigure(img, width, height, imgSrc, figcaption);
76 } 78 }
77 79
80 if ("SPAN".equals(e.getTagName())) {
81 if (!e.getAttribute("class").contains("lazy-image-placeholder")) {
82 return null;
83 }
84 // Image lazy loading on Wikipedia.
85 ie = Document.get().createImageElement();
86 imgSrc = e.getAttribute("data-src");
87 width = JavaScript.parseInt(e.getAttribute("data-width"));
88 height = JavaScript.parseInt(e.getAttribute("data-height"));
89 ie.setAttribute("srcset", e.getAttribute("data-srcset"));
90 return new WebImage(ie, width, height, imgSrc);
91 }
92
78 extractImageAttributes(ie); 93 extractImageAttributes(ie);
79 return new WebImage(e, width, height, imgSrc); 94 return new WebImage(e, width, height, imgSrc);
80 } 95 }
81 96
82 private void extractImageAttributes(ImageElement imageElement) { 97 private void extractImageAttributes(ImageElement imageElement) {
83 // This will get the absolute URL of the image and 98 // This will get the absolute URL of the image and
84 // the displayed image dimension. 99 // the displayed image dimension.
85 // Try to get lazily-loaded images before falling back to get the src attribute. 100 // Try to get lazily-loaded images before falling back to get the src attribute.
86 for (String attr : LAZY_IMAGE_ATTRIBUTES) { 101 for (String attr : LAZY_IMAGE_ATTRIBUTES) {
87 imgSrc = imageElement.getAttribute(attr); 102 imgSrc = imageElement.getAttribute(attr);
(...skipping 16 matching lines...) Expand all
104 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); 119 LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
105 } 120 }
106 } 121 }
107 122
108 private Element createFigcaptionElement(Element element) { 123 private Element createFigcaptionElement(Element element) {
109 Element figcaption = Document.get().createElement("FIGCAPTION"); 124 Element figcaption = Document.get().createElement("FIGCAPTION");
110 figcaption.setInnerText(DomUtil.getInnerText(element)); 125 figcaption.setInnerText(DomUtil.getInnerText(element));
111 return figcaption; 126 return figcaption;
112 } 127 }
113 } 128 }
OLDNEW
« no previous file with comments | « no previous file | javatests/org/chromium/distiller/ContentExtractorTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698