| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import org.chromium.distiller.document.TextDocument; | 7 import org.chromium.distiller.document.TextDocument; |
| 8 import org.chromium.distiller.document.TextDocumentStatistics; | 8 import org.chromium.distiller.document.TextDocumentStatistics; |
| 9 import org.chromium.distiller.extractors.ArticleExtractor; | 9 import org.chromium.distiller.extractors.ArticleExtractor; |
| 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; | 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
| 11 import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry; | 11 import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry; |
| 12 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; | 12 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; |
| 13 import org.chromium.distiller.webdocument.DomConverter; | 13 import org.chromium.distiller.webdocument.DomConverter; |
| 14 import org.chromium.distiller.webdocument.WebDocument; | 14 import org.chromium.distiller.webdocument.WebDocument; |
| 15 import org.chromium.distiller.webdocument.WebDocumentBuilder; | 15 import org.chromium.distiller.webdocument.WebDocumentBuilder; |
| 16 import org.chromium.distiller.webdocument.WebImage; | 16 import org.chromium.distiller.webdocument.WebImage; |
| 17 import org.chromium.distiller.webdocument.filters.RelevantElements; | 17 import org.chromium.distiller.webdocument.filters.RelevantElements; |
| 18 import org.chromium.distiller.webdocument.filters.LeadImageFinder; | 18 import org.chromium.distiller.webdocument.filters.LeadImageFinder; |
| 19 import org.chromium.distiller.webdocument.filters.NestedElementRetainer; |
| 19 | 20 |
| 20 import com.google.gwt.dom.client.Document; | 21 import com.google.gwt.dom.client.Document; |
| 21 import com.google.gwt.dom.client.Element; | 22 import com.google.gwt.dom.client.Element; |
| 22 import com.google.gwt.dom.client.Node; | 23 import com.google.gwt.dom.client.Node; |
| 23 import com.google.gwt.dom.client.NodeList; | 24 import com.google.gwt.dom.client.NodeList; |
| 24 | 25 |
| 25 import java.util.ArrayList; | 26 import java.util.ArrayList; |
| 26 import java.util.LinkedList; | 27 import java.util.LinkedList; |
| 27 import java.util.List; | 28 import java.util.List; |
| 28 import java.util.Set; | 29 import java.util.Set; |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 86 | 87 |
| 87 public String extractContent(boolean textOnly) { | 88 public String extractContent(boolean textOnly) { |
| 88 double now = DomUtil.getTime(); | 89 double now = DomUtil.getTime(); |
| 89 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); | 90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); |
| 90 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); | 91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); |
| 91 | 92 |
| 92 now = DomUtil.getTime(); | 93 now = DomUtil.getTime(); |
| 93 processDocument(documentInfo.document); | 94 processDocument(documentInfo.document); |
| 94 RelevantElements.process(documentInfo.document); | 95 RelevantElements.process(documentInfo.document); |
| 95 LeadImageFinder.process(documentInfo.document); | 96 LeadImageFinder.process(documentInfo.document); |
| 97 NestedElementRetainer.process(documentInfo.document); |
| 96 | 98 |
| 97 List<WebImage> images = documentInfo.document.getContentImages(); | 99 List<WebImage> images = documentInfo.document.getContentImages(); |
| 98 for (WebImage wi : images) { | 100 for (WebImage wi : images) { |
| 99 imageUrls.add(wi.getSrc()); | 101 imageUrls.add(wi.getSrc()); |
| 100 } | 102 } |
| 101 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); | 103 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); |
| 102 | 104 |
| 103 now = DomUtil.getTime(); | 105 now = DomUtil.getTime(); |
| 104 String html = documentInfo.document.generateOutput(textOnly); | 106 String html = documentInfo.document.generateOutput(textOnly); |
| 105 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); | 107 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 206 * | 208 * |
| 207 * @param document the WebDocument representation of the page extracted from
the DOM. | 209 * @param document the WebDocument representation of the page extracted from
the DOM. |
| 208 */ | 210 */ |
| 209 private void processDocument(WebDocument document) { | 211 private void processDocument(WebDocument document) { |
| 210 TextDocument textDocument = document.createTextDocumentView(); | 212 TextDocument textDocument = document.createTextDocumentView(); |
| 211 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); | 213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
| 212 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); | 214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(
textDocument)); |
| 213 textDocument.applyToModel(); | 215 textDocument.applyToModel(); |
| 214 } | 216 } |
| 215 } | 217 } |
| OLD | NEW |