java/org/chromium/distiller/ContentExtractor.java - Issue 1705123002: Add support for Schema.org/Recipe

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1705123002: Add support for Schema.org/Recipe — Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: wychen's comments addressed — Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import org.chromium.distiller.document.TextDocument;	7 import org.chromium.distiller.document.TextDocument;

8 import org.chromium.distiller.document.TextDocumentStatistics;	8 import org.chromium.distiller.document.TextDocumentStatistics;

9 import org.chromium.distiller.extractors.ArticleExtractor;	9 import org.chromium.distiller.extractors.ArticleExtractor;

10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;	10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;

(...skipping 68 matching lines...)
79 ensureTitleInitialized();	79 ensureTitleInitialized();

80 assert candidateTitles.size() > 0;	80 assert candidateTitles.size() > 0;

81 return candidateTitles.get(0);	81 return candidateTitles.get(0);

82 }	82 }

83	83

84 public String extractContent() {	84 public String extractContent() {

85 return extractContent(false);	85 return extractContent(false);

86 }	86 }

87	87

88 public String extractContent(boolean textOnly) {	88 public String extractContent(boolean textOnly) {

	89
	wychen 2016/07/24 23:06:34 nit: extra line
89 double now = DomUtil.getTime();	90 double now = DomUtil.getTime();

	91 String structuredData = parser.getStructuredData();

	92 LogUtil.addTimingInfo(now, mTimingInfo, "parser.getStructuredData()");
	wychen 2016/07/24 23:06:33 Maybe just "getStructuredData" for consistency.
	93 if (!structuredData.isEmpty()) {

	94 return structuredData;

	95 }

	96 now = DomUtil.getTime();

90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();	97 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();

91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);	98 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

92	99

93 now = DomUtil.getTime();	100 now = DomUtil.getTime();

94 processDocument(documentInfo.document);	101 processDocument(documentInfo.document);

95 RelevantElements.process(documentInfo.document);	102 RelevantElements.process(documentInfo.document);

96 LeadImageFinder.process(documentInfo.document);	103 LeadImageFinder.process(documentInfo.document);

97 NestedElementRetainer.process(documentInfo.document);	104 NestedElementRetainer.process(documentInfo.document);

98	105

99 List<WebImage> images = documentInfo.document.getContentImages();	106 List<WebImage> images = documentInfo.document.getContentImages();

(...skipping 86 matching lines...)
186 *	193 *

187 * @param document the WebDocument representation of the page extracted from the DOM.	194 * @param document the WebDocument representation of the page extracted from the DOM.

188 */	195 */

189 private void processDocument(WebDocument document) {	196 private void processDocument(WebDocument document) {

190 TextDocument textDocument = document.createTextDocumentView();	197 TextDocument textDocument = document.createTextDocumentView();

191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);	198 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);

192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument));	199 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument));

193 textDocument.applyToModel();	200 textDocument.applyToModel();

194 }	201 }

195 }	202 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')