OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller.extractors.embeds; | 5 package org.chromium.distiller.extractors.embeds; |
6 | 6 |
7 import com.google.gwt.dom.client.Document; | 7 import com.google.gwt.dom.client.Document; |
8 import com.google.gwt.dom.client.Element; | 8 import com.google.gwt.dom.client.Element; |
9 import com.google.gwt.dom.client.ImageElement; | 9 import com.google.gwt.dom.client.ImageElement; |
10 import com.google.gwt.dom.client.NodeList; | 10 import com.google.gwt.dom.client.NodeList; |
11 import org.chromium.distiller.DomUtil; | 11 import org.chromium.distiller.DomUtil; |
| 12 import org.chromium.distiller.JavaScript; |
12 import org.chromium.distiller.LogUtil; | 13 import org.chromium.distiller.LogUtil; |
13 import org.chromium.distiller.webdocument.WebFigure; | 14 import org.chromium.distiller.webdocument.WebFigure; |
14 import org.chromium.distiller.webdocument.WebImage; | 15 import org.chromium.distiller.webdocument.WebImage; |
15 | 16 |
16 import java.util.HashSet; | 17 import java.util.HashSet; |
17 import java.util.Set; | 18 import java.util.Set; |
18 | 19 |
19 /** | 20 /** |
20 * This class treats images as another type of embed and provides heuristics for
lead image | 21 * This class treats images as another type of embed and provides heuristics for
lead image |
21 * candidacy. | 22 * candidacy. |
22 */ | 23 */ |
23 public class ImageExtractor implements EmbedExtractor { | 24 public class ImageExtractor implements EmbedExtractor { |
/** Upper-case tag names of the elements this extractor is interested in. */
private static final Set<String> relevantTags = new HashSet<>();

// State of the image currently being extracted; these are written during extraction
// and then handed to the WebImage / WebFigure that is returned.
private String imgSrc;
private int width;
private int height;

static {
    // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
    // "SPAN" is included so that lazy-image placeholder spans can be picked up.
    for (String tagName : new String[] {"IMG", "PICTURE", "FIGURE", "SPAN"}) {
        relevantTags.add(tagName);
    }
}

/**
 * Attribute names that are probed, in this order, for the URL of a lazily-loaded image
 * before falling back to the regular src attribute.
 */
private static final String[] LAZY_IMAGE_ATTRIBUTES = new String[] {
    "data-src", "data-original", "datasrc", "data-url"
};
38 | 40 |
39 @Override | 41 @Override |
40 public Set<String> getRelevantTagNames() { | 42 public Set<String> getRelevantTagNames() { |
41 return relevantTags; | 43 return relevantTags; |
42 } | 44 } |
43 | 45 |
(...skipping 24 matching lines...) Expand all Loading... |
68 // <figcaption> only when it contains links, otherwise we get th
e innerText. | 70 // <figcaption> only when it contains links, otherwise we get th
e innerText. |
69 NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF]
"); | 71 NodeList<Element> links = DomUtil.querySelectorAll(cap, "A[HREF]
"); |
70 figcaption = links.getLength() > 0 ? | 72 figcaption = links.getLength() > 0 ? |
71 cap : createFigcaptionElement(cap); | 73 cap : createFigcaptionElement(cap); |
72 } else { | 74 } else { |
73 figcaption = createFigcaptionElement(e); | 75 figcaption = createFigcaptionElement(e); |
74 } | 76 } |
75 return new WebFigure(img, width, height, imgSrc, figcaption); | 77 return new WebFigure(img, width, height, imgSrc, figcaption); |
76 } | 78 } |
77 | 79 |
| 80 if ("SPAN".equals(e.getTagName())) { |
| 81 if (!e.getAttribute("class").contains("lazy-image-placeholder")) { |
| 82 return null; |
| 83 } |
| 84 // Image lazy loading on Wikipedia. |
| 85 ie = Document.get().createImageElement(); |
| 86 imgSrc = e.getAttribute("data-src"); |
| 87 width = JavaScript.parseInt(e.getAttribute("data-width")); |
| 88 height = JavaScript.parseInt(e.getAttribute("data-height")); |
| 89 ie.setAttribute("srcset", e.getAttribute("data-srcset")); |
| 90 return new WebImage(ie, width, height, imgSrc); |
| 91 } |
| 92 |
78 extractImageAttributes(ie); | 93 extractImageAttributes(ie); |
79 return new WebImage(e, width, height, imgSrc); | 94 return new WebImage(e, width, height, imgSrc); |
80 } | 95 } |
81 | 96 |
82 private void extractImageAttributes(ImageElement imageElement) { | 97 private void extractImageAttributes(ImageElement imageElement) { |
83 // This will get the absolute URL of the image and | 98 // This will get the absolute URL of the image and |
84 // the displayed image dimension. | 99 // the displayed image dimension. |
85 // Try to get lazily-loaded images before falling back to get the src at
tribute. | 100 // Try to get lazily-loaded images before falling back to get the src at
tribute. |
86 for (String attr : LAZY_IMAGE_ATTRIBUTES) { | 101 for (String attr : LAZY_IMAGE_ATTRIBUTES) { |
87 imgSrc = imageElement.getAttribute(attr); | 102 imgSrc = imageElement.getAttribute(attr); |
(...skipping 16 matching lines...) Expand all Loading... |
104 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); | 119 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); |
105 } | 120 } |
106 } | 121 } |
107 | 122 |
108 private Element createFigcaptionElement(Element element) { | 123 private Element createFigcaptionElement(Element element) { |
109 Element figcaption = Document.get().createElement("FIGCAPTION"); | 124 Element figcaption = Document.get().createElement("FIGCAPTION"); |
110 figcaption.setInnerText(DomUtil.getInnerText(element)); | 125 figcaption.setInnerText(DomUtil.getInnerText(element)); |
111 return figcaption; | 126 return figcaption; |
112 } | 127 } |
113 } | 128 } |
OLD | NEW |