OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
79 ensureTitleInitialized(); | 79 ensureTitleInitialized(); |
80 assert candidateTitles.size() > 0; | 80 assert candidateTitles.size() > 0; |
81 return candidateTitles.get(0); | 81 return candidateTitles.get(0); |
82 } | 82 } |
83 | 83 |
84 public String extractContent() { | 84 public String extractContent() { |
85 return extractContent(false); | 85 return extractContent(false); |
86 } | 86 } |
87 | 87 |
88 public String extractContent(boolean textOnly) { | 88 public String extractContent(boolean textOnly) { |
| 89 |
89 double now = DomUtil.getTime(); | 90 double now = DomUtil.getTime(); |
| 91 String structuredData = parser.getStructuredData(); |
| 92 LogUtil.addTimingInfo(now, mTimingInfo, "parser.getStructuredData()"); |
| 93 if (!structuredData.isEmpty()) { |
| 94 return structuredData; |
| 95 } |
| 96 now = DomUtil.getTime(); |
90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); | 97 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); |
91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); | 98 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); |
92 | 99 |
93 now = DomUtil.getTime(); | 100 now = DomUtil.getTime(); |
94 processDocument(documentInfo.document); | 101 processDocument(documentInfo.document); |
95 RelevantElements.process(documentInfo.document); | 102 RelevantElements.process(documentInfo.document); |
96 LeadImageFinder.process(documentInfo.document); | 103 LeadImageFinder.process(documentInfo.document); |
97 NestedElementRetainer.process(documentInfo.document); | 104 NestedElementRetainer.process(documentInfo.document); |
98 | 105 |
99 List<WebImage> images = documentInfo.document.getContentImages(); | 106 List<WebImage> images = documentInfo.document.getContentImages(); |
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
208 * | 215 * |
209 * @param document the WebDocument representation of the page extracted from the DOM. | 216 * @param document the WebDocument representation of the page extracted from the DOM. |
210 */ | 217 */ |
211 private void processDocument(WebDocument document) { | 218 private void processDocument(WebDocument document) { |
212 TextDocument textDocument = document.createTextDocumentView(); | 219 TextDocument textDocument = document.createTextDocumentView(); |
213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 220 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); | 221 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); |
215 textDocument.applyToModel(); | 222 textDocument.applyToModel(); |
216 } | 223 } |
217 } | 224 } |
OLD | NEW |