| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
| 8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
| 9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
| 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
| (...skipping 16 matching lines...) Expand all Loading... |
| 27 import java.util.LinkedList; | 27 import java.util.LinkedList; |
| 28 import java.util.List; | 28 import java.util.List; |
| 29 import java.util.Set; | 29 import java.util.Set; |
| 30 | 30 |
| 31 public class ContentExtractor { | 31 public class ContentExtractor { |
| 32 private final Element documentElement; | 32 private final Element documentElement; |
| 33 private final List<String> candidateTitles; | 33 private final List<String> candidateTitles; |
| 34 private final TimingInfo mTimingInfo; | 34 private final TimingInfo mTimingInfo; |
| 35 private final StatisticsInfo mStatisticsInfo; | 35 private final StatisticsInfo mStatisticsInfo; |
| 36 private final MarkupParser parser; | 36 private final MarkupParser parser; |
| 37 private final List<String> imageUrls; | 37 private List<String> imageUrls; |
| 38 private String textDirection; | 38 private String textDirection; |
| 39 | 39 |
| 40 private class WebDocumentInfo { | 40 private class WebDocumentInfo { |
| 41 WebDocument document; | 41 WebDocument document; |
| 42 Set<Node> hiddenElements; | 42 Set<Node> hiddenElements; |
| 43 } | 43 } |
| 44 | 44 |
| 45 public ContentExtractor(Element root) { | 45 public ContentExtractor(Element root) { |
| 46 documentElement = root; | 46 documentElement = root; |
| 47 candidateTitles = new LinkedList<String>(); | 47 candidateTitles = new LinkedList<String>(); |
| 48 mTimingInfo = TimingInfo.create(); | 48 mTimingInfo = TimingInfo.create(); |
| 49 mStatisticsInfo = StatisticsInfo.create(); | 49 mStatisticsInfo = StatisticsInfo.create(); |
| 50 imageUrls = new ArrayList<String>(); | |
| 51 | 50 |
| 52 double startTime = DomUtil.getTime(); | 51 double startTime = DomUtil.getTime(); |
| 53 parser = new MarkupParser(root, mTimingInfo); | 52 parser = new MarkupParser(root, mTimingInfo); |
| 54 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); | 53 mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); |
| 55 textDirection = ""; | 54 textDirection = ""; |
| 56 } | 55 } |
| 57 | 56 |
| 58 // Grabs a list of candidate titles in descending priority order: | 57 // Grabs a list of candidate titles in descending priority order: |
| 59 // 1) meta-information | 58 // 1) meta-information |
| 60 // 2) The document's title element, modified based on some readability heuristics | 59 // 2) The document's title element, modified based on some readability heuristics |
| (...skipping 28 matching lines...) Expand all Loading... |
| 89 double now = DomUtil.getTime(); | 88 double now = DomUtil.getTime(); |
| 90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); | 89 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); |
| 91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); | 90 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); |
| 92 | 91 |
| 93 now = DomUtil.getTime(); | 92 now = DomUtil.getTime(); |
| 94 processDocument(documentInfo.document); | 93 processDocument(documentInfo.document); |
| 95 RelevantElements.process(documentInfo.document); | 94 RelevantElements.process(documentInfo.document); |
| 96 LeadImageFinder.process(documentInfo.document); | 95 LeadImageFinder.process(documentInfo.document); |
| 97 NestedElementRetainer.process(documentInfo.document); | 96 NestedElementRetainer.process(documentInfo.document); |
| 98 | 97 |
| 99 List<WebImage> images = documentInfo.document.getContentImages(); | |
| 100 for (WebImage wi : images) { | |
| 101 imageUrls.add(wi.getSrc()); | |
| 102 } | |
| 103 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); | 98 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); |
| 104 | 99 |
| 105 now = DomUtil.getTime(); | 100 now = DomUtil.getTime(); |
| 106 String html = documentInfo.document.generateOutput(textOnly); | 101 String html = documentInfo.document.generateOutput(textOnly); |
| 107 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); | 102 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); |
| 108 | 103 |
| 104 imageUrls = documentInfo.document.getImageUrls(); |
| 105 |
| 109 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) { | 106 if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) { |
| 110 for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) { | 107 for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) { |
| 111 TimingEntry entry = mTimingInfo.getOtherTimes(i); | 108 TimingEntry entry = mTimingInfo.getOtherTimes(i); |
| 112 LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime()); | 109 LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime()); |
| 113 } | 110 } |
| 114 | 111 |
| 115 LogUtil.logToConsole( | 112 LogUtil.logToConsole( |
| 116 "Timing: MarkupParsingTime = " + | 113 "Timing: MarkupParsingTime = " + |
| 117 mTimingInfo.getMarkupParsingTime() + | 114 mTimingInfo.getMarkupParsingTime() + |
| 118 "\nTiming: DocumentConstructionTime = " + | 115 "\nTiming: DocumentConstructionTime = " + |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 186 * | 183 * |
| 187 * @param document the WebDocument representation of the page extracted from the DOM. | 184 * @param document the WebDocument representation of the page extracted from the DOM. |
| 188 */ | 185 */ |
| 189 private void processDocument(WebDocument document) { | 186 private void processDocument(WebDocument document) { |
| 190 TextDocument textDocument = document.createTextDocumentView(); | 187 TextDocument textDocument = document.createTextDocumentView(); |
| 191 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 188 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
| 192 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); | 189 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); |
| 193 textDocument.applyToModel(); | 190 textDocument.applyToModel(); |
| 194 } | 191 } |
| 195 } | 192 } |
| OLD | NEW |