OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
(...skipping 26 matching lines...) |
37 private List<String> imageUrls; | 37 private List<String> imageUrls; |
38 private String textDirection; | 38 private String textDirection; |
39 | 39 |
40 private class WebDocumentInfo { | 40 private class WebDocumentInfo { |
41 WebDocument document; | 41 WebDocument document; |
42 Set<Node> hiddenElements; | 42 Set<Node> hiddenElements; |
43 } | 43 } |
44 | 44 |
45 public ContentExtractor(Element root) { | 45 public ContentExtractor(Element root) { |
46 documentElement = root; | 46 documentElement = root; |
47 candidateTitles = new LinkedList<String>(); | 47 candidateTitles = new LinkedList<>(); |
48 mTimingInfo = TimingInfo.create(); | 48 mTimingInfo = TimingInfo.create(); |
49 mStatisticsInfo = StatisticsInfo.create(); | 49 mStatisticsInfo = StatisticsInfo.create(); |
50 | 50 |
51 double startTime = DomUtil.getTime(); | 51 double startTime = DomUtil.getTime(); |
52 parser = new MarkupParser(root, mTimingInfo); | 52 parser = new MarkupParser(root, mTimingInfo); |
53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); | 53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); |
54 textDirection = ""; | 54 textDirection = ""; |
55 } | 55 } |
56 | 56 |
57 // Grabs a list of candidate titles in descending priority order: | 57 // Grabs a list of candidate titles in descending priority order: |
(...skipping 99 matching lines...) |
157 public List<String> getImageUrls() { | 157 public List<String> getImageUrls() { |
158 return imageUrls; | 158 return imageUrls; |
159 } | 159 } |
160 | 160 |
161 /** | 161 /** |
162 * Converts the original HTML page into a WebDocument for analysis. | 162 * Converts the original HTML page into a WebDocument for analysis. |
163 */ | 163 */ |
164 private WebDocumentInfo createWebDocumentInfoFromPage() { | 164 private WebDocumentInfo createWebDocumentInfoFromPage() { |
165 WebDocumentInfo info = new WebDocumentInfo(); | 165 WebDocumentInfo info = new WebDocumentInfo(); |
166 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); | 166 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
| 167 |
| 168 NodeList<Element> mobileViewport = DomUtil.querySelectorAll(documentElement, |
| 169 "meta[name=\"viewport\"][content*=\"width=device-width\"]"); |
167 DomConverter converter = new DomConverter(documentBuilder); | 170 DomConverter converter = new DomConverter(documentBuilder); |
| 171 converter.setIsMobileFriendly(mobileViewport.getLength() > 0); |
| 172 |
168 Element walkerRoot = DomUtil.getArticleElement(documentElement); | 173 Element walkerRoot = DomUtil.getArticleElement(documentElement); |
169 if (walkerRoot == null) { | 174 converter.setHasArticleElement(walkerRoot != null); |
| 175 |
| 176 if (walkerRoot != null) { |
| 177 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_BOILER_PIPE_PHASES)) { |
| 178 LogUtil.logToConsole("Extracted article element: " + walkerRoot); |
| 179 } |
| 180 } else { |
170 walkerRoot = documentElement; | 181 walkerRoot = documentElement; |
171 } | 182 } |
172 new DomWalker(converter).walk(walkerRoot); | 183 new DomWalker(converter).walk(walkerRoot); |
173 info.document = documentBuilder.toWebDocument(); | 184 info.document = documentBuilder.toWebDocument(); |
174 ensureTitleInitialized(); | 185 ensureTitleInitialized(); |
175 info.hiddenElements = converter.getHiddenElements(); | 186 info.hiddenElements = converter.getHiddenElements(); |
176 | 187 |
177 return info; | 188 return info; |
178 } | 189 } |
179 | 190 |
180 /** | 191 /** |
181 * Implements the actual analysis of the page content, identifying the core elements of the | 192 * Implements the actual analysis of the page content, identifying the core elements of the |
182 * page. | 193 * page. |
183 * | 194 * |
184 * @param document the WebDocument representation of the page extracted from the DOM. | 195 * @param document the WebDocument representation of the page extracted from the DOM. |
185 */ | 196 */ |
186 private void processDocument(WebDocument document) { | 197 private void processDocument(WebDocument document) { |
187 TextDocument textDocument = document.createTextDocumentView(); | 198 TextDocument textDocument = document.createTextDocumentView(); |
188 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 199 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
189 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); | 200 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); |
190 textDocument.applyToModel(); | 201 textDocument.applyToModel(); |
191 } | 202 } |
192 } | 203 } |
OLD | NEW |