| Index: java/org/chromium/distiller/ContentExtractor.java
|
| diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
|
| index 4a8f8bd8dbee9acff98fbbc9084f7f5d8b4626fe..9e6c4f60e979672e39d0133ed2de863b4c856811 100644
|
| --- a/java/org/chromium/distiller/ContentExtractor.java
|
| +++ b/java/org/chromium/distiller/ContentExtractor.java
|
| @@ -167,23 +167,40 @@ public class ContentExtractor {
|
| */
|
| private Element getArticleElement(Element root) {
|
| NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
|
| + List<Element> visibleElements = getVisibleElements(allArticles);
|
| // Having multiple article elements usually indicates a bad case for this shortcut.
|
| // TODO(wychen): some sites exclude things like title and author in article element.
|
| - if (allArticles.getLength() == 1) {
|
| - return allArticles.getItem(0);
|
| + if (visibleElements.size() == 1) {
|
| + return visibleElements.get(0);
|
| }
|
| // Note that the CSS property matching is case sensitive, and "Article" is the correct
|
| // capitalization.
|
| String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
|
| allArticles = DomUtil.querySelectorAll(root, query);
|
| + visibleElements = getVisibleElements(allArticles);
|
| // It is commonly seen that the article is wrapped separately or in multiple layers.
|
| - if (allArticles.getLength() > 0) {
|
| - return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
|
| + if (visibleElements.size() > 0) {
|
| + return Element.as(DomUtil.getNearestCommonAncestor(visibleElements));
|
| }
|
| return null;
|
| }
|
|
|
| /**
|
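| + * Returns the elements of the given node list that are visible, in their original order.
|
| + * @return A new list holding each element of nodeList for which DomUtil.isVisible() is true.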
|
| + */
|
| + private List<Element> getVisibleElements(NodeList<Element> nodeList) {
|
| + List<Element> visibleElements = new ArrayList<>();
|
| + for (int i = 0; i < nodeList.getLength(); i++) {
|
| + Element element = nodeList.getItem(i);
|
| + if (DomUtil.isVisible(element)) {
|
| + visibleElements.add(element);
|
| + }
|
| + }
|
| + return visibleElements;
|
| + }
|
| +
|
| + /**
|
| * Converts the original HTML page into a WebDocument for analysis.
|
| */
|
| private WebDocumentInfo createWebDocumentInfoFromPage() {
|
|
|