Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(221)

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 2267403008: Fix partially hidden article (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: speed optimization Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/DomConverter.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import org.chromium.distiller.document.TextDocument; 7 import org.chromium.distiller.document.TextDocument;
8 import org.chromium.distiller.document.TextDocumentStatistics; 8 import org.chromium.distiller.document.TextDocumentStatistics;
9 import org.chromium.distiller.extractors.ArticleExtractor; 9 import org.chromium.distiller.extractors.ArticleExtractor;
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
(...skipping 26 matching lines...) Expand all
37 private List<String> imageUrls; 37 private List<String> imageUrls;
38 private String textDirection; 38 private String textDirection;
39 39
40 private class WebDocumentInfo { 40 private class WebDocumentInfo {
41 WebDocument document; 41 WebDocument document;
42 Set<Node> hiddenElements; 42 Set<Node> hiddenElements;
43 } 43 }
44 44
45 public ContentExtractor(Element root) { 45 public ContentExtractor(Element root) {
46 documentElement = root; 46 documentElement = root;
47 candidateTitles = new LinkedList<String>(); 47 candidateTitles = new LinkedList<>();
48 mTimingInfo = TimingInfo.create(); 48 mTimingInfo = TimingInfo.create();
49 mStatisticsInfo = StatisticsInfo.create(); 49 mStatisticsInfo = StatisticsInfo.create();
50 50
51 double startTime = DomUtil.getTime(); 51 double startTime = DomUtil.getTime();
52 parser = new MarkupParser(root, mTimingInfo); 52 parser = new MarkupParser(root, mTimingInfo);
53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); 53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);
54 textDirection = ""; 54 textDirection = "";
55 } 55 }
56 56
57 // Grabs a list of candidate titles in descending priority order: 57 // Grabs a list of candidate titles in descending priority order:
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after
157 public List<String> getImageUrls() { 157 public List<String> getImageUrls() {
158 return imageUrls; 158 return imageUrls;
159 } 159 }
160 160
161 /** 161 /**
162 * Converts the original HTML page into a WebDocument for analysis. 162 * Converts the original HTML page into a WebDocument for analysis.
163 */ 163 */
164 private WebDocumentInfo createWebDocumentInfoFromPage() { 164 private WebDocumentInfo createWebDocumentInfoFromPage() {
165 WebDocumentInfo info = new WebDocumentInfo(); 165 WebDocumentInfo info = new WebDocumentInfo();
166 WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); 166 WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
167
168 NodeList<Element> mobileViewport = DomUtil.querySelectorAll(documentElement,
169 "meta[name=\"viewport\"][content*=\"width=device-width\"]");
167 DomConverter converter = new DomConverter(documentBuilder); 170 DomConverter converter = new DomConverter(documentBuilder);
171 converter.setIsMobileFriendly(mobileViewport.getLength() > 0);
172
168 Element walkerRoot = DomUtil.getArticleElement(documentElement); 173 Element walkerRoot = DomUtil.getArticleElement(documentElement);
169 if (walkerRoot == null) { 174 converter.setHasArticleElement(walkerRoot != null);
175
176 if (walkerRoot != null) {
177 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_BOILER_PIPE_PHASES)) {
178 LogUtil.logToConsole("Extracted article element: " + walkerRoot);
179 }
180 } else {
170 walkerRoot = documentElement; 181 walkerRoot = documentElement;
171 } 182 }
172 new DomWalker(converter).walk(walkerRoot); 183 new DomWalker(converter).walk(walkerRoot);
173 info.document = documentBuilder.toWebDocument(); 184 info.document = documentBuilder.toWebDocument();
174 ensureTitleInitialized(); 185 ensureTitleInitialized();
175 info.hiddenElements = converter.getHiddenElements(); 186 info.hiddenElements = converter.getHiddenElements();
176 187
177 return info; 188 return info;
178 } 189 }
179 190
180 /** 191 /**
181 * Implements the actual analysis of the page content, identifying the core elements of the 192 * Implements the actual analysis of the page content, identifying the core elements of the
182 * page. 193 * page.
183 * 194 *
184 * @param document the WebDocument representation of the page extracted from the DOM. 195 * @param document the WebDocument representation of the page extracted from the DOM.
185 */ 196 */
186 private void processDocument(WebDocument document) { 197 private void processDocument(WebDocument document) {
187 TextDocument textDocument = document.createTextDocumentView(); 198 TextDocument textDocument = document.createTextDocumentView();
188 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); 199 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
189 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); 200 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
190 textDocument.applyToModel(); 201 textDocument.applyToModel();
191 } 202 }
192 } 203 }
OLD | NEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/DomConverter.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698