Index: java/org/chromium/distiller/ContentExtractor.java |
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java |
index 4a8f8bd8dbee9acff98fbbc9084f7f5d8b4626fe..7d16a774dd4360a4ce2015ed5296b466d386e969 100644 |
--- a/java/org/chromium/distiller/ContentExtractor.java |
+++ b/java/org/chromium/distiller/ContentExtractor.java |
@@ -162,35 +162,13 @@ public class ContentExtractor { |
} |
/** |
- * Get the element of the main article, if any. |
- * @return An element of article (not necessarily the html5 article element). |
- */ |
- private Element getArticleElement(Element root) { |
- NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
- // Having multiple article elements usually indicates a bad case for this shortcut. |
- // TODO(wychen): some sites exclude things like title and author in article element. |
- if (allArticles.getLength() == 1) { |
- return allArticles.getItem(0); |
- } |
- // Note that the CSS property matching is case sensitive, and "Article" is the correct |
- // capitalization. |
- String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; |
- allArticles = DomUtil.querySelectorAll(root, query); |
- // It is commonly seen that the article is wrapped separately or in multiple layers. |
- if (allArticles.getLength() > 0) { |
- return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); |
- } |
- return null; |
- } |
- |
- /** |
* Converts the original HTML page into a WebDocument for analysis. |
*/ |
private WebDocumentInfo createWebDocumentInfoFromPage() { |
WebDocumentInfo info = new WebDocumentInfo(); |
WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
DomConverter converter = new DomConverter(documentBuilder); |
- Element walkerRoot = getArticleElement(documentElement); |
+ Element walkerRoot = DomUtil.getArticleElement(documentElement); |
if (walkerRoot == null) { |
walkerRoot = documentElement; |
} |