Chromium Code Reviews| Index: java/org/chromium/distiller/ContentExtractor.java |
| diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java |
| index 4a8f8bd8dbee9acff98fbbc9084f7f5d8b4626fe..3a08dd44cac51952c2bcb68d9a5f0fa52a08ae73 100644 |
| --- a/java/org/chromium/distiller/ContentExtractor.java |
| +++ b/java/org/chromium/distiller/ContentExtractor.java |
| @@ -167,23 +167,40 @@ public class ContentExtractor { |
| */ |
| private Element getArticleElement(Element root) { |
| NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
| + List<Element> visibleElements = getVisibleElements(allArticles); |
| // Having multiple article elements usually indicates a bad case for this shortcut. |
| // TODO(wychen): some sites exclude things like title and author in article element. |
| - if (allArticles.getLength() == 1) { |
| - return allArticles.getItem(0); |
| + if (visibleElements.size() == 1) { |
| + return visibleElements.get(0); |
| } |
| // Note that the CSS property matching is case sensitive, and "Article" is the correct |
| // capitalization. |
| String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; |
| allArticles = DomUtil.querySelectorAll(root, query); |
| + visibleElements = getVisibleElements(allArticles); |
| // It is commonly seen that the article is wrapped separately or in multiple layers. |
| - if (allArticles.getLength() > 0) { |
| - return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); |
| + if (visibleElements.size() > 0) { |
| + return Element.as(DomUtil.getNearestCommonAncestor(visibleElements)); |
| } |
| return null; |
| } |
| /** |
| + * Returns the elements of {@code nodeList} that DomUtil.isVisible() reports as visible.
| + * @return A new list of the visible elements, in their original order.
| + */ |
| + private List<Element> getVisibleElements(NodeList<Element> nodeList) { |
| + List<Element> visibleElements = new ArrayList<>(); |
| + for (int i = 0; i < nodeList.getLength(); i ++) { |
|
mdjones
2015/10/21 18:15:02
nit: i++
|
| + Element element = nodeList.getItem(i); |
| + if (DomUtil.isVisible(element)) { |
| + visibleElements.add(element); |
| + } |
| + } |
| + return visibleElements; |
| + } |
| + |
| + /** |
| * Converts the original HTML page into a WebDocument for analysis. |
| */ |
| private WebDocumentInfo createWebDocumentInfoFromPage() { |