Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(479)

Unified Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1411603004: Discard hidden articles when using fast path (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: Fixed inconsistent indentation Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: java/org/chromium/distiller/ContentExtractor.java
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index 4a8f8bd8dbee9acff98fbbc9084f7f5d8b4626fe..3a08dd44cac51952c2bcb68d9a5f0fa52a08ae73 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -167,23 +167,40 @@ public class ContentExtractor {
*/
private Element getArticleElement(Element root) {
NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
+ List<Element> visibleElements = getVisibleElements(allArticles);
// Having multiple article elements usually indicates a bad case for this shortcut.
// TODO(wychen): some sites exclude things like title and author in article element.
- if (allArticles.getLength() == 1) {
- return allArticles.getItem(0);
+ if (visibleElements.size() == 1) {
+ return visibleElements.get(0);
}
// Note that the CSS property matching is case sensitive, and "Article" is the correct
// capitalization.
String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
allArticles = DomUtil.querySelectorAll(root, query);
+ visibleElements = getVisibleElements(allArticles);
// It is commonly seen that the article is wrapped separately or in multiple layers.
- if (allArticles.getLength() > 0) {
- return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
+ if (visibleElements.size() > 0) {
+ return Element.as(DomUtil.getNearestCommonAncestor(visibleElements));
}
return null;
}
/**
+ * Filters the given node list down to the elements for which DomUtil.isVisible() is true.
+ * @return A new list holding only the visible elements, in their original order.
+ */
+ private List<Element> getVisibleElements(NodeList<Element> nodeList) {
+ List<Element> visibleElements = new ArrayList<>();
+ for (int i = 0; i < nodeList.getLength(); i ++) {
mdjones 2015/10/21 18:15:02 nit: i++
+ Element element = nodeList.getItem(i);
+ if (DomUtil.isVisible(element)) {
+ visibleElements.add(element);
+ }
+ }
+ return visibleElements;
+ }
+
+ /**
* Converts the original HTML page into a WebDocument for analysis.
*/
private WebDocumentInfo createWebDocumentInfoFromPage() {
« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')

Powered by Google App Engine
This is Rietveld 408576698