Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller.extractors.embeds; | 5 package org.chromium.distiller.extractors.embeds; |
| 6 | 6 |
| 7 import com.google.gwt.dom.client.Element; | 7 import com.google.gwt.dom.client.Element; |
| 8 import com.google.gwt.dom.client.ImageElement; | 8 import com.google.gwt.dom.client.ImageElement; |
| 9 import com.google.gwt.dom.client.NodeList; | |
| 10 import org.chromium.distiller.webdocument.WebFigure; | |
| 9 import org.chromium.distiller.webdocument.WebImage; | 11 import org.chromium.distiller.webdocument.WebImage; |
| 10 | 12 |
| 11 import java.util.HashSet; | 13 import java.util.HashSet; |
| 12 import java.util.Set; | 14 import java.util.Set; |
| 13 | 15 |
| 14 /** | 16 /** |
| 15 * This class treats images as another type of embed and provides heuristics for lead image | 17 * This class treats images as another type of embed and provides heuristics for lead image |
| 16 * candidacy. | 18 * candidacy. |
| 17 */ | 19 */ |
| 18 public class ImageExtractor implements EmbedExtractor { | 20 public class ImageExtractor implements EmbedExtractor { |
| 19 private static final Set<String> relevantTags = new HashSet<>(); | 21 private static final Set<String> relevantTags = new HashSet<>(); |
| 22 private String src; | |
| 23 private int width; | |
| 24 private int height; | |
| 25 | |
| 20 static { | 26 static { |
| 21 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. | 27 // TODO(mdjones): Add "DIV" to this list for css images and possibly captions. |
| 22 relevantTags.add("IMG"); | 28 relevantTags.add("IMG"); |
| 29 relevantTags.add("FIGURE"); | |
| 23 } | 30 } |
| 24 | 31 |
| 25 @Override | 32 @Override |
| 26 public Set<String> getRelevantTagNames() { | 33 public Set<String> getRelevantTagNames() { |
| 27 return relevantTags; | 34 return relevantTags; |
| 28 } | 35 } |
| 29 | 36 |
| 30 @Override | 37 @Override |
| 31 public WebImage extract(Element e) { | 38 public WebImage extract(Element e) { |
| 32 if (!relevantTags.contains(e.getTagName())) { | 39 if (!relevantTags.contains(e.getTagName())) { |
| 33 return null; | 40 return null; |
| 34 } | 41 } |
| 35 String imgSrc = ""; | |
| 36 // Getting OffSetWidth/Height as default values, even though they are | 42 // Getting OffSetWidth/Height as default values, even though they are |
| 37 // affected by padding, border, etc. | 43 // affected by padding, border, etc. |
| 38 int width = e.getOffsetWidth(); | 44 width = e.getOffsetWidth(); |
| 39 int height = e.getOffsetHeight(); | 45 height = e.getOffsetHeight(); |
| 46 src = ""; | |
| 47 | |
| 40 if ("IMG".equals(e.getTagName())) { | 48 if ("IMG".equals(e.getTagName())) { |
| 41 // This will get the absolute URL of the image and | 49 extractImageAttributes(ImageElement.as(e)); |
| 42 // the displayed image dimension. | 50 return new WebImage(e, width, height, src); |
| 43 ImageElement imageElement = ImageElement.as(e); | 51 } else if ("FIGURE".equals(e.getTagName())) { |
| 44 imgSrc = imageElement.getSrc(); | 52 Element img = getFirstElementByTagName(e, "IMG"); |
| 45 // As an ImageElement is manipulated here, it is possible | 53 if (img != null) { |
| 46 // to get the real dimensions. | 54 String caption = ""; |
| 47 width = imageElement.getWidth(); | 55 extractImageAttributes(ImageElement.as(img)); |
| 48 height = imageElement.getHeight(); | 56 Element cap = getFirstElementByTagName(e, "FIGCAPTION"); |
|
wychen
2016/06/02 23:48:49
Sadly some web sites don't follow the spec.
For e
marcelorcorrea
2016/06/06 20:38:30
I see your point. I thought about doing that too,
wychen
2016/06/06 21:49:05
Sounds good.
| |
| 57 if (cap != null) { | |
| 58 caption = cap.getInnerText(); | |
|
wychen
2016/06/02 23:48:48
Some sites put non-caption elements into <figcapti
wychen
2016/06/02 23:48:49
Another issue: image credit could contain a link.
marcelorcorrea
2016/06/06 20:38:30
Do you think it would be better if we kept the lin
wychen
2016/06/06 21:49:05
I'd like to keep the link, but retaining the DOM t
| |
| 59 } | |
| 60 return new WebFigure(img, width, height, src, caption); | |
| 61 } | |
| 49 } | 62 } |
| 63 return null; | |
| 64 } | |
| 50 | 65 |
| 51 return new WebImage(e, width, height, imgSrc); | 66 private void extractImageAttributes(ImageElement img) { |
| 67 src = img.getSrc(); | |
| 68 width = img.getWidth(); | |
| 69 height = img.getHeight(); | |
| 70 } | |
| 71 | |
| 72 private Element getFirstElementByTagName(Element e, String tagName) { | |
| 73 NodeList<Element> elements = e.getElementsByTagName(tagName); | |
| 74 if (elements.getLength() > 0) { | |
| 75 return elements.getItem(0); | |
| 76 } | |
| 77 return null; | |
| 52 } | 78 } |
| 53 } | 79 } |
| OLD | NEW |