| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
| 8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
| 9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
| 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
| (...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 155 | 155 |
| 156 /** | 156 /** |
| 157 * Get a list of the content image URLs in the provided document. | 157 * Get a list of the content image URLs in the provided document. |
| 158 * @return A list of image URLs. | 158 * @return A list of image URLs. |
| 159 */ | 159 */ |
| 160 public List<String> getImageUrls() { | 160 public List<String> getImageUrls() { |
| 161 return imageUrls; | 161 return imageUrls; |
| 162 } | 162 } |
| 163 | 163 |
| 164 /** | 164 /** |
| 165 * Get the element of the main article, if any. | |
| 166 * @return An element of article (not necessarily the html5 article element). | |
| 167 */ | |
| 168 private Element getArticleElement(Element root) { | |
| 169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); | |
| 170 // Having multiple article elements usually indicates a bad case for this shortcut. | |
| 171 // TODO(wychen): some sites exclude things like title and author in article element. | |
| 172 if (allArticles.getLength() == 1) { | |
| 173 return allArticles.getItem(0); | |
| 174 } | |
| 175 // Note that the CSS property matching is case sensitive, and "Article" is the correct | |
| 176 // capitalization. | |
| 177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*
=\"Post\"]"; | |
| 178 allArticles = DomUtil.querySelectorAll(root, query); | |
| 179 // It is commonly seen that the article is wrapped separately or in multiple layers. | |
| 180 if (allArticles.getLength() > 0) { | |
| 181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); | |
| 182 } | |
| 183 return null; | |
| 184 } | |
| 185 | |
| 186 /** | |
| 187 * Converts the original HTML page into a WebDocument for analysis. | 165 * Converts the original HTML page into a WebDocument for analysis. |
| 188 */ | 166 */ |
| 189 private WebDocumentInfo createWebDocumentInfoFromPage() { | 167 private WebDocumentInfo createWebDocumentInfoFromPage() { |
| 190 WebDocumentInfo info = new WebDocumentInfo(); | 168 WebDocumentInfo info = new WebDocumentInfo(); |
| 191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); | 169 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
| 192 DomConverter converter = new DomConverter(documentBuilder); | 170 DomConverter converter = new DomConverter(documentBuilder); |
| 193 Element walkerRoot = getArticleElement(documentElement); | 171 Element walkerRoot = DomUtil.getArticleElement(documentElement); |
| 194 if (walkerRoot == null) { | 172 if (walkerRoot == null) { |
| 195 walkerRoot = documentElement; | 173 walkerRoot = documentElement; |
| 196 } | 174 } |
| 197 new DomWalker(converter).walk(walkerRoot); | 175 new DomWalker(converter).walk(walkerRoot); |
| 198 info.document = documentBuilder.toWebDocument(); | 176 info.document = documentBuilder.toWebDocument(); |
| 199 ensureTitleInitialized(); | 177 ensureTitleInitialized(); |
| 200 info.hiddenElements = converter.getHiddenElements(); | 178 info.hiddenElements = converter.getHiddenElements(); |
| 201 | 179 |
| 202 return info; | 180 return info; |
| 203 } | 181 } |
| 204 | 182 |
| 205 /** | 183 /** |
| 206 * Implements the actual analysis of the page content, identifying the core elements of the | 184 * Implements the actual analysis of the page content, identifying the core elements of the |
| 208 * | 186 * |
| 209 * @param document the WebDocument representation of the page extracted from the DOM. | 187 * @param document the WebDocument representation of the page extracted from the DOM. |
| 210 */ | 188 */ |
| 211 private void processDocument(WebDocument document) { | 189 private void processDocument(WebDocument document) { |
| 212 TextDocument textDocument = document.createTextDocumentView(); | 190 TextDocument textDocument = document.createTextDocumentView(); |
| 213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
| 214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); | 192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); |
| 215 textDocument.applyToModel(); | 193 textDocument.applyToModel(); |
| 216 } | 194 } |
| 217 } | 195 } |
| OLD | NEW |