Index: java/org/chromium/distiller/DomUtil.java |
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java |
index 5d3ace5179e4350fb1b70bb19cd738a2616c68a0..47e68efcd3182a5ea89833e4952e33d7b678ac02 100644 |
--- a/java/org/chromium/distiller/DomUtil.java |
+++ b/java/org/chromium/distiller/DomUtil.java |
@@ -102,6 +102,52 @@ public class DomUtil { |
opacity == 0.0F); |
} |
+ /** |
+ * Checks an element's visibility based on its offset dimensions. |
+ * |
+ * @param e The element to check. |
+ * @return true only if both the offset height and the offset width are greater than zero. |
+ */ |
+ public static boolean isVisibleByItsOffset(Element e) { |
wychen
2016/06/02 05:56:01
Rename to IsVisibleByOffset()?
marcelorcorrea
2016/06/03 16:21:57
Done.
|
+ return e.getOffsetHeight() > 0 && e.getOffsetWidth() > 0; |
+ } |
+ |
+ /** |
+ * Locate the element holding the main article, if any. |
+ * |
+ * @param root The element whose descendants are searched. |
+ * @return An element of article (not necessarily the html5 article element), or null when |
+ *         no visible candidate is found. |
+ */ |
+ public static Element getArticleElement(Element root) { |
+ NodeList<Element> articleTags = root.getElementsByTagName("ARTICLE"); |
+ List<Element> visibleTags = getVisibleElements(articleTags); |
+ // More than one visible article element usually indicates a bad case for this shortcut, |
+ // so it is only taken when exactly one is found. |
+ // TODO(wychen): some sites exclude things like title and author in article element. |
+ if (visibleTags.size() == 1) { |
+ return visibleTags.get(0); |
+ } |
+ |
+ // Fall back to elements whose itemtype attribute mentions Article or Post. The attribute |
+ // value match in the selector is case sensitive, and "Article" is the correct |
+ // capitalization. |
+ String selector = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; |
+ NodeList<Element> annotated = DomUtil.querySelectorAll(root, selector); |
+ List<Element> visibleAnnotated = getVisibleElements(annotated); |
+ if (visibleAnnotated.isEmpty()) { |
+ return null; |
+ } |
+ // It is commonly seen that the article is wrapped separately or in multiple layers, so |
+ // the nearest common ancestor of all visible matches is returned. |
+ return Element.as(DomUtil.getNearestCommonAncestor(visibleAnnotated)); |
+ } |
+ |
+ /** |
+ * Filter a node list down to its visible elements. |
+ * |
+ * @param nodeList The elements to filter. |
+ * @return A new list holding, in their original order, the elements that pass both |
+ *         isVisible() and isVisibleByItsOffset(). |
+ */ |
+ public static List<Element> getVisibleElements(NodeList<Element> nodeList) { |
+ List<Element> result = new ArrayList<>(); |
+ for (int index = 0; index < nodeList.getLength(); index++) { |
+ Element candidate = nodeList.getItem(index); |
+ if (!DomUtil.isVisible(candidate) || !DomUtil.isVisibleByItsOffset(candidate)) { |
+ continue; |
+ } |
+ result.add(candidate); |
+ } |
+ return result; |
+ } |
+ |
/* |
* We want to use jsni for direct access to javascript's innerText. This avoids GWT's |
* implementation of Element::getInnerText(), which is intentionally different to mimic an old |
@@ -171,11 +217,11 @@ public class DomUtil { |
/** |
* Get the nearest common ancestor of nodes. |
+ * |
+ * @param ns The nodes to inspect. Any list whose entries are Nodes is accepted, so a |
+ *           List<Element> can be passed directly. |
+ * @return null if ns is empty, the sole entry if ns holds exactly one node, otherwise the |
+ *         result of folding the two-node getNearestCommonAncestor() over all entries. |
*/ |
- public static Node getNearestCommonAncestor(final NodeList ns) { |
- if (ns.getLength() == 0) return null; |
- Node parent = ns.getItem(0); |
- for (int i = 1; i < ns.getLength(); i++) { |
- parent = getNearestCommonAncestor(parent, ns.getItem(i)); |
+ public static Node getNearestCommonAncestor(final List<? extends Node> ns) { |
+ if (ns.isEmpty()) return null; |
+ Node parent = ns.get(0); |
+ for (int i = 1; i < ns.size(); i++) { |
+ parent = getNearestCommonAncestor(parent, ns.get(i)); |
} |
return parent; |
} |