OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
(...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
155 | 155 |
156 /** | 156 /** |
157 * Get a list of the content image URLs in the provided document. | 157 * Get a list of the content image URLs in the provided document. |
158 * @return A list of image URLs. | 158 * @return A list of image URLs. |
159 */ | 159 */ |
160 public List<String> getImageUrls() { | 160 public List<String> getImageUrls() { |
161 return imageUrls; | 161 return imageUrls; |
162 } | 162 } |
163 | 163 |
164 /** | 164 /** |
165 * Get the element of the main article, if any. | |
166 * @return An element of article (not necessarily the html5 article element)
. | |
167 */ | |
168 private Element getArticleElement(Element root) { | |
169 NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); | |
170 // Having multiple article elements usually indicates a bad case for thi
s shortcut. | |
171 // TODO(wychen): some sites exclude things like title and author in arti
cle element. | |
172 if (allArticles.getLength() == 1) { | |
173 return allArticles.getItem(0); | |
174 } | |
175 // Note that the CSS property matching is case sensitive, and "Article"
is the correct | |
176 // capitalization. | |
177 String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*
=\"Post\"]"; | |
178 allArticles = DomUtil.querySelectorAll(root, query); | |
179 // It is commonly seen that the article is wrapped separately or in mult
iple layers. | |
180 if (allArticles.getLength() > 0) { | |
181 return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); | |
182 } | |
183 return null; | |
184 } | |
185 | |
186 /** | |
187 * Converts the original HTML page into a WebDocument for analysis. | 165 * Converts the original HTML page into a WebDocument for analysis. |
188 */ | 166 */ |
189 private WebDocumentInfo createWebDocumentInfoFromPage() { | 167 private WebDocumentInfo createWebDocumentInfoFromPage() { |
190 WebDocumentInfo info = new WebDocumentInfo(); | 168 WebDocumentInfo info = new WebDocumentInfo(); |
191 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); | 169 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
192 DomConverter converter = new DomConverter(documentBuilder); | 170 DomConverter converter = new DomConverter(documentBuilder); |
193 Element walkerRoot = getArticleElement(documentElement); | 171 Element walkerRoot = DomUtil.getArticleElement(documentElement); |
194 if (walkerRoot == null) { | 172 if (walkerRoot == null) { |
195 walkerRoot = documentElement; | 173 walkerRoot = documentElement; |
196 } | 174 } |
197 new DomWalker(converter).walk(walkerRoot); | 175 new DomWalker(converter).walk(walkerRoot); |
198 info.document = documentBuilder.toWebDocument(); | 176 info.document = documentBuilder.toWebDocument(); |
199 ensureTitleInitialized(); | 177 ensureTitleInitialized(); |
200 info.hiddenElements = converter.getHiddenElements(); | 178 info.hiddenElements = converter.getHiddenElements(); |
201 | 179 |
202 return info; | 180 return info; |
203 } | 181 } |
204 | 182 |
205 /** | 183 /** |
206 * Implements the actual analysis of the page content, identifying the core
elements of the | 184 * Implements the actual analysis of the page content, identifying the core
elements of the |
207 * page. | 185 * page. |
208 * | 186 * |
209 * @param document the WebDocument representation of the page extracted from
the DOM. | 187 * @param document the WebDocument representation of the page extracted from
the DOM. |
210 */ | 188 */ |
211 private void processDocument(WebDocument document) { | 189 private void processDocument(WebDocument document) { |
212 TextDocument textDocument = document.createTextDocumentView(); | 190 TextDocument textDocument = document.createTextDocumentView(); |
213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); | 192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); |
215 textDocument.applyToModel(); | 193 textDocument.applyToModel(); |
216 } | 194 } |
217 } | 195 } |
OLD | NEW |