Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller.extractors.embeds; | 5 package org.chromium.distiller.extractors.embeds; |
| 6 | 6 |
| 7 import com.google.gwt.dom.client.Element; | 7 import com.google.gwt.dom.client.Element; |
| 8 import com.google.gwt.dom.client.ImageElement; | 8 import com.google.gwt.dom.client.ImageElement; |
| 9 import com.google.gwt.dom.client.NodeList; | |
| 10 import org.chromium.distiller.webdocument.WebFigure; | |
| 11 import org.chromium.distiller.webdocument.WebImage; | |
| 9 import org.chromium.distiller.LogUtil; | 12 import org.chromium.distiller.LogUtil; |
|
wychen
2016/06/07 17:39:19
The sorting is case sensitive. Capital ones come first.
| |
| 10 import org.chromium.distiller.webdocument.WebImage; | |
| 11 | 13 |
| 12 import java.util.HashSet; | 14 import java.util.HashSet; |
| 13 import java.util.Set; | 15 import java.util.Set; |
| 14 | 16 |
| 15 /** | 17 /** |
| 16 * This class treats images as another type of embed and provides heuristics for lead image | 18 * This class treats images as another type of embed and provides heuristics for lead image |
| 17 * candidacy. | 19 * candidacy. |
| 18 */ | 20 */ |
| 19 public class ImageExtractor implements EmbedExtractor { | 21 public class ImageExtractor implements EmbedExtractor { |
| 20 private static final Set<String> relevantTags = new HashSet<>(); | 22 private static final Set<String> relevantTags = new HashSet<>(); |
| 23 private String imgSrc; | |
| 24 private int width; | |
| 25 private int height; | |
| 26 | |
| 21 static { | 27 static { |
| 22 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. | 28 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. |
| 23 relevantTags.add("IMG"); | 29 relevantTags.add("IMG"); |
| 30 relevantTags.add("FIGURE"); | |
| 24 } | 31 } |
| 32 | |
| 25 private static final String[] LAZY_IMAGE_ATTRIBUTES = | 33 private static final String[] LAZY_IMAGE_ATTRIBUTES = |
| 26 {"data-src", "data-original", "datasrc", "data-url"}; | 34 {"data-src", "data-original", "datasrc", "data-url"}; |
| 27 | 35 |
| 28 @Override | 36 @Override |
| 29 public Set<String> getRelevantTagNames() { | 37 public Set<String> getRelevantTagNames() { |
| 30 return relevantTags; | 38 return relevantTags; |
| 31 } | 39 } |
| 32 | 40 |
| 33 @Override | 41 @Override |
| 34 public WebImage extract(Element e) { | 42 public WebImage extract(Element e) { |
| 35 if (!relevantTags.contains(e.getTagName())) { | 43 if (!relevantTags.contains(e.getTagName())) { |
| 36 return null; | 44 return null; |
| 37 } | 45 } |
| 38 String imgSrc = ""; | 46 imgSrc = ""; |
| 39 // Getting OffSetWidth/Height as default values, even they are | 47 |
| 40 // affected by padding, border, etc. | |
| 41 int width = e.getOffsetWidth(); | |
| 42 int height = e.getOffsetHeight(); | |
| 43 if ("IMG".equals(e.getTagName())) { | 48 if ("IMG".equals(e.getTagName())) { |
| 44 // This will get the absolute URL of the image and | 49 extractImageAttributes(ImageElement.as(e)); |
| 45 // the displayed image dimension. | 50 return new WebImage(e, width, height, imgSrc); |
| 46 ImageElement imageElement = ImageElement.as(e); | 51 } else if ("FIGURE".equals(e.getTagName())) { |
| 47 // Try to get lazily-loaded images before falling back to get the src attribute. | 52 Element img = getFirstElementByTagName(e, "IMG"); |
| 48 for(String attr: LAZY_IMAGE_ATTRIBUTES) { | 53 if (img != null) { |
| 49 imgSrc = imageElement.getAttribute(attr); | 54 String caption; |
| 50 if (!imgSrc.isEmpty()) | 55 extractImageAttributes(ImageElement.as(img)); |
| 51 break; | 56 Element cap = getFirstElementByTagName(e, "FIGCAPTION"); |
| 52 } | 57 if (cap != null) { |
| 53 if (!imgSrc.isEmpty()) { | 58 caption = getFirstElementByTagName(cap, "A") != null ? |
| 54 // We cannot trust the dimension if the image is not loaded yet. | 59 cap.getInnerHTML() : cap.getInnerText(); |
|
wychen
2016/06/07 17:39:19
This innerText should be escaped. Or a vulnerability...
marcelorcorrea
2016/06/08 14:34:15
Done.
wychen
2016/06/20 18:54:20
WebTable stores an element, and use DomUtil.genera
| |
| 55 // In some cases there are 1x1 placeholder images. | 60 } else { |
| 56 width = 0; | 61 caption = e.getInnerText(); |
| 57 height = 0; | 62 } |
| 58 } else { | 63 return new WebFigure(img, width, height, imgSrc, caption); |
| 59 imgSrc = imageElement.getSrc(); | |
| 60 // As an ImageElement is manipulated here, it is possible | |
| 61 // to get the real dimensions. | |
| 62 width = imageElement.getWidth(); | |
| 63 height = imageElement.getHeight(); | |
| 64 } | 64 } |
| 65 } | 65 } |
| 66 return null; | |
| 67 } | |
| 66 | 68 |
| 69 private void extractImageAttributes(ImageElement imageElement) { | |
| 70 // This will get the absolute URL of the image and | |
| 71 // the displayed image dimension. | |
| 72 // Try to get lazily-loaded images before falling back to get the src attribute. | |
| 73 for (String attr : LAZY_IMAGE_ATTRIBUTES) { | |
| 74 imgSrc = imageElement.getAttribute(attr); | |
| 75 if (!imgSrc.isEmpty()) | |
| 76 break; | |
| 77 } | |
| 78 if (!imgSrc.isEmpty()) { | |
| 79 // We cannot trust the dimension if the image is not loaded yet. | |
| 80 // In some cases there are 1x1 placeholder images. | |
| 81 width = 0; | |
| 82 height = 0; | |
| 83 } else { | |
| 84 imgSrc = imageElement.getSrc(); | |
| 85 // As an ImageElement is manipulated here, it is possible | |
| 86 // to get the real dimensions. | |
| 87 width = imageElement.getWidth(); | |
| 88 height = imageElement.getHeight(); | |
| 89 } | |
| 67 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { | 90 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) { |
| 68 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); | 91 LogUtil.logToConsole("Extracted WebImage: " + imgSrc); |
| 69 } | 92 } |
| 70 return new WebImage(e, width, height, imgSrc); | 93 } |
| 94 | |
| 95 private Element getFirstElementByTagName(Element e, String tagName) { | |
| 96 NodeList<Element> elements = e.getElementsByTagName(tagName); | |
| 97 if (elements.getLength() > 0) { | |
| 98 return elements.getItem(0); | |
| 99 } | |
| 100 return null; | |
| 71 } | 101 } |
| 72 } | 102 } |
| OLD | NEW |