Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(502)

Unified Diff: java/org/chromium/distiller/DomUtil.java

Issue 1411603004: Discard hidden articles when using fast path (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: nit fixed 4 Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/DomUtil.java
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 5d3ace5179e4350fb1b70bb19cd738a2616c68a0..49dc8ef47db5dc57d8d17dee8cca19907cbff0a2 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -102,6 +102,57 @@ public class DomUtil {
opacity == 0.0F);
}
+ /**
+ * Checks whether a given element is visible, judging by its offset properties.
+ *
+ * The element is treated as visible if it has an offset parent, or if either its
+ * offset height or its offset width is non-zero.
+ *
+ * @param e The element to check.
+ * @return false only when the offset parent is null and both offset dimensions are zero;
+ *         true otherwise.
+ */
+ public static boolean isVisibleByOffset(Element e) {
+ // Detect whether any of the ancestors has "display: none".
+ // Using offsetParent alone wouldn't work because it's also null when position is fixed.
+ // Using offsetHeight/Width alone makes sense in production, but we have too many
+ // zero-sized elements in our tests.
+ return e.getOffsetParent() != null || e.getOffsetHeight() != 0 || e.getOffsetWidth() != 0;
+ }
+
+ /**
+ * Get the element of the main article, if any.
+ *
+ * First collects the ARTICLE elements of root and, if exactly one of them is visible
+ * (as decided by getVisibleElements()), returns it. Otherwise queries root through
+ * DomUtil.querySelectorAll() for elements that carry an itemscope attribute and whose
+ * itemtype contains "Article" or "Post", and returns the nearest common ancestor of the
+ * visible ones.
+ *
+ * @param root The element to search in.
+ * @return An element of article (not necessarily the html5 article element), or null if
+ *         neither lookup yields a visible candidate.
+ */
+ public static Element getArticleElement(Element root) {
+ NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
+ List<Element> visibleElements = getVisibleElements(allArticles);
+ // Having multiple article elements usually indicates a bad case for this shortcut.
+ // TODO(wychen): some sites exclude things like title and author in article element.
+ if (visibleElements.size() == 1) {
+ return visibleElements.get(0);
+ }
+ // Note that the attribute value matching in the CSS selector is case sensitive, and
+ // "Article" is the correct capitalization.
+ String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
+ allArticles = DomUtil.querySelectorAll(root, query);
+ visibleElements = getVisibleElements(allArticles);
+ // It is commonly seen that the article is wrapped separately or in multiple layers.
+ if (visibleElements.size() > 0) {
+ return Element.as(DomUtil.getNearestCommonAncestor(visibleElements));
+ }
+ return null;
+ }
+
+ /**
+ * Get a list of visible elements.
+ *
+ * An element is kept only if it passes both DomUtil.isVisible() and
+ * DomUtil.isVisibleByOffset(). Kept elements appear in the same order as in nodeList.
+ *
+ * @param nodeList The elements to filter.
+ * @return A new list holding the visible elements of nodeList; empty if there are none.
+ */
+ public static List<Element> getVisibleElements(NodeList<Element> nodeList) {
+ List<Element> visibleElements = new ArrayList<>();
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ Element element = nodeList.getItem(i);
+ if (DomUtil.isVisible(element) &&
+ DomUtil.isVisibleByOffset(element)) {
+ visibleElements.add(element);
+ }
+ }
+ return visibleElements;
+ }
+
/*
* We want to use jsni for direct access to javascript's innerText. This avoids GWT's
* implementation of Element::getInnerText(), which is intentionally different to mimic an old
@@ -171,11 +222,11 @@ public class DomUtil {
/**
* Get the nearest common ancestor of nodes.
*
* Starts from the first node and repeatedly combines the running result with each
* following node via the two-argument getNearestCommonAncestor().
*
* @param ns The nodes whose common ancestor is wanted.
* @return null if ns is empty; the sole node if ns holds exactly one; otherwise the result
*         of applying the pairwise nearest common ancestor across all nodes in order.
*/
- public static Node getNearestCommonAncestor(final NodeList ns) {
- if (ns.getLength() == 0) return null;
- Node parent = ns.getItem(0);
- for (int i = 1; i < ns.getLength(); i++) {
- parent = getNearestCommonAncestor(parent, ns.getItem(i));
+ public static Node getNearestCommonAncestor(final List<Element> ns) {
+ if (ns.size() == 0) return null;
+ Node parent = ns.get(0);
+ for (int i = 1; i < ns.size(); i++) {
+ parent = getNearestCommonAncestor(parent, ns.get(i));
}
return parent;
}
« no previous file with comments | « java/org/chromium/distiller/ContentExtractor.java ('k') | javatests/org/chromium/distiller/ContentExtractorTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698