java/org/chromium/distiller/ContentExtractor.java - Issue 1705123002: Add support for Schema.org/Recipe

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1705123002: Add support for Schema.org/Recipe Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « no previous file | java/org/chromium/distiller/DomUtil.java » ('j') | java/org/chromium/distiller/DomUtil.java » ('J')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import org.chromium.distiller.document.TextDocument;	7 import org.chromium.distiller.document.TextDocument;

8 import org.chromium.distiller.document.TextDocumentStatistics;	8 import org.chromium.distiller.document.TextDocumentStatistics;

9 import org.chromium.distiller.extractors.ArticleExtractor;	9 import org.chromium.distiller.extractors.ArticleExtractor;

10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;	10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;

(...skipping 68 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
79 ensureTitleInitialized();	79 ensureTitleInitialized();

80 assert candidateTitles.size() > 0;	80 assert candidateTitles.size() > 0;

81 return candidateTitles.get(0);	81 return candidateTitles.get(0);

82 }	82 }

83	83

84 public String extractContent() {	84 public String extractContent() {

85 return extractContent(false);	85 return extractContent(false);

86 }	86 }

87	87

88 public String extractContent(boolean textOnly) {	88 public String extractContent(boolean textOnly) {

	89

	90 String structuredData = parser.getStructuredData();
	wychen 2016/03/14 22:58:42 Might make sense to measure the time spent in this section and record in TimingInfo.
	91 if (!structuredData.isEmpty()) {

	92 return structuredData;

	93 }

	94

89 double now = DomUtil.getTime();	95 double now = DomUtil.getTime();

90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();	96 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();

91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);	97 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

92	98

93 now = DomUtil.getTime();	99 now = DomUtil.getTime();

94 processDocument(documentInfo.document);	100 processDocument(documentInfo.document);

95 RelevantElements.process(documentInfo.document);	101 RelevantElements.process(documentInfo.document);

96 LeadImageFinder.process(documentInfo.document);	102 LeadImageFinder.process(documentInfo.document);

97 NestedElementRetainer.process(documentInfo.document);	103 NestedElementRetainer.process(documentInfo.document);

98	104

(...skipping 109 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
208 *	214 *

209 * @param document the WebDocument representation of the page extracted from the DOM.	215 * @param document the WebDocument representation of the page extracted from the DOM.

210 */	216 */

211 private void processDocument(WebDocument document) {	217 private void processDocument(WebDocument document) {

212 TextDocument textDocument = document.createTextDocumentView();	218 TextDocument textDocument = document.createTextDocumentView();

213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);	219 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);

214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument));	220 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument));

215 textDocument.applyToModel();	221 textDocument.applyToModel();

216 }	222 }

217 }	223 }

OLD	NEW