Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(33)

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1705123002: Add support for Schema.org/Recipe Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: wychen's comments addressed Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import org.chromium.distiller.document.TextDocument; 7 import org.chromium.distiller.document.TextDocument;
8 import org.chromium.distiller.document.TextDocumentStatistics; 8 import org.chromium.distiller.document.TextDocumentStatistics;
9 import org.chromium.distiller.extractors.ArticleExtractor; 9 import org.chromium.distiller.extractors.ArticleExtractor;
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
79 ensureTitleInitialized(); 79 ensureTitleInitialized();
80 assert candidateTitles.size() > 0; 80 assert candidateTitles.size() > 0;
81 return candidateTitles.get(0); 81 return candidateTitles.get(0);
82 } 82 }
83 83
84 public String extractContent() { 84 public String extractContent() {
85 return extractContent(false); 85 return extractContent(false);
86 } 86 }
87 87
88 public String extractContent(boolean textOnly) { 88 public String extractContent(boolean textOnly) {
89
wychen 2016/07/24 23:06:34 nit: extra line
89 double now = DomUtil.getTime(); 90 double now = DomUtil.getTime();
91 String structuredData = parser.getStructuredData();
92 LogUtil.addTimingInfo(now, mTimingInfo, "parser.getStructuredData()");
wychen 2016/07/24 23:06:33 Maybe just "getStructuredData" for consistency.
93 if (!structuredData.isEmpty()) {
94 return structuredData;
95 }
96 now = DomUtil.getTime();
90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); 97 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();
91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); 98 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);
92 99
93 now = DomUtil.getTime(); 100 now = DomUtil.getTime();
94 processDocument(documentInfo.document); 101 processDocument(documentInfo.document);
95 RelevantElements.process(documentInfo.document); 102 RelevantElements.process(documentInfo.document);
96 LeadImageFinder.process(documentInfo.document); 103 LeadImageFinder.process(documentInfo.document);
97 NestedElementRetainer.process(documentInfo.document); 104 NestedElementRetainer.process(documentInfo.document);
98 105
99 List<WebImage> images = documentInfo.document.getContentImages(); 106 List<WebImage> images = documentInfo.document.getContentImages();
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after
186 * 193 *
187 * @param document the WebDocument representation of the page extracted from the DOM. 194 * @param document the WebDocument representation of the page extracted from the DOM.
188 */ 195 */
189 private void processDocument(WebDocument document) { 196 private void processDocument(WebDocument document) {
190 TextDocument textDocument = document.createTextDocumentView(); 197 TextDocument textDocument = document.createTextDocumentView();
191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); 198 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument)); 199 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument));
193 textDocument.applyToModel(); 200 textDocument.applyToModel();
194 } 201 }
195 } 202 }
OLD | NEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')

Powered by Google App Engine
This is Rietveld 408576698