| Index: components/dom_distiller/standalone/content_extractor.cc
|
| diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc
|
| deleted file mode 100644
|
| index 1643b12ca84c2625539b558e39b3f4821998d4aa..0000000000000000000000000000000000000000
|
| --- a/components/dom_distiller/standalone/content_extractor.cc
|
| +++ /dev/null
|
| @@ -1,419 +0,0 @@
|
| -// Copyright 2014 The Chromium Authors. All rights reserved.
|
| -// Use of this source code is governed by a BSD-style license that can be
|
| -// found in the LICENSE file.
|
| -
|
| -#include <sstream>
|
| -
|
| -#include "base/command_line.h"
|
| -#include "base/files/scoped_temp_dir.h"
|
| -#include "base/id_map.h"
|
| -#include "base/message_loop/message_loop.h"
|
| -#include "base/path_service.h"
|
| -#include "base/run_loop.h"
|
| -#include "base/strings/string_number_conversions.h"
|
| -#include "base/strings/string_split.h"
|
| -#include "components/dom_distiller/content/distiller_page_web_contents.h"
|
| -#include "components/dom_distiller/core/article_entry.h"
|
| -#include "components/dom_distiller/core/distilled_page_prefs.h"
|
| -#include "components/dom_distiller/core/distiller.h"
|
| -#include "components/dom_distiller/core/dom_distiller_service.h"
|
| -#include "components/dom_distiller/core/dom_distiller_store.h"
|
| -#include "components/dom_distiller/core/proto/distilled_article.pb.h"
|
| -#include "components/dom_distiller/core/proto/distilled_page.pb.h"
|
| -#include "components/dom_distiller/core/task_tracker.h"
|
| -#include "components/leveldb_proto/proto_database.h"
|
| -#include "components/leveldb_proto/proto_database_impl.h"
|
| -#include "components/pref_registry/testing_pref_service_syncable.h"
|
| -#include "content/public/browser/browser_context.h"
|
| -#include "content/public/browser/browser_thread.h"
|
| -#include "content/public/test/content_browser_test.h"
|
| -#include "content/shell/browser/shell.h"
|
| -#include "google/protobuf/io/coded_stream.h"
|
| -#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
|
| -#include "net/dns/mock_host_resolver.h"
|
| -#include "third_party/dom_distiller_js/dom_distiller.pb.h"
|
| -#include "ui/base/resource/resource_bundle.h"
|
| -
|
| -using content::ContentBrowserTest;
|
| -
|
| -namespace dom_distiller {
|
| -
|
| -namespace {
|
| -
|
| -typedef base::hash_map<std::string, std::string> UrlToDomainMap;
|
| -
|
| -}
|
| -
|
| -// Factory for creating a Distiller that creates different DomDistillerOptions
|
| -// for different URLs, i.e. a specific kOriginalDomain option for each URL.
|
| -class TestDistillerFactoryImpl : public DistillerFactory {
|
| - public:
|
| - TestDistillerFactoryImpl(
|
| - scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
|
| - const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
|
| - const UrlToDomainMap& url_to_domain_map)
|
| - : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
|
| - dom_distiller_options_(dom_distiller_options),
|
| - url_to_domain_map_(url_to_domain_map) {
|
| - }
|
| -
|
| - ~TestDistillerFactoryImpl() override {}
|
| -
|
| - scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
|
| - dom_distiller::proto::DomDistillerOptions options;
|
| - options = dom_distiller_options_;
|
| - UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec());
|
| - if (it != url_to_domain_map_.end()) options.set_original_domain(it->second);
|
| - scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
|
| - *distiller_url_fetcher_factory_, options));
|
| - return distiller.Pass();
|
| - }
|
| -
|
| - private:
|
| - scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
|
| - dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
|
| - UrlToDomainMap url_to_domain_map_;
|
| -};
|
| -
|
| -namespace {
|
| -
|
| -// The url to distill.
|
| -const char* kUrlSwitch = "url";
|
| -
|
| -// A space-separated list of urls to distill.
|
| -const char* kUrlsSwitch = "urls";
|
| -
|
| -// Indicates that DNS resolution should be disabled for this test.
|
| -const char* kDisableDnsSwitch = "disable-dns";
|
| -
|
| -// Will write the distilled output to the given file instead of to stdout.
|
| -const char* kOutputFile = "output-file";
|
| -
|
| -// Indicates to output a serialized protocol buffer instead of human-readable
|
| -// output.
|
| -const char* kShouldOutputBinary = "output-binary";
|
| -
|
| -// Indicates to output only the text of the article and not the enclosing html.
|
| -const char* kExtractTextOnly = "extract-text-only";
|
| -
|
| -// Indicates to include debug output.
|
| -const char* kDebugLevel = "debug-level";
|
| -
|
| -// The original domain of the page if |kUrlSwitch| is a file.
|
| -const char* kOriginalDomain = "original-domain";
|
| -
|
| -// A semi-colon-separated (i.e. ';') list of original domains corresponding to
|
| -// "kUrlsSwitch".
|
| -const char* kOriginalDomains = "original-domains";
|
| -
|
| -// Maximum number of concurrent started extractor requests.
|
| -const int kMaxExtractorTasks = 8;
|
| -
|
| -scoped_ptr<DomDistillerService> CreateDomDistillerService(
|
| - content::BrowserContext* context,
|
| - const base::FilePath& db_path,
|
| - const UrlToDomainMap& url_to_domain_map) {
|
| - scoped_refptr<base::SequencedTaskRunner> background_task_runner =
|
| - content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
|
| - content::BrowserThread::GetBlockingPool()->GetSequenceToken());
|
| -
|
| - // TODO(cjhopman): use an in-memory database instead of an on-disk one with
|
| - // temporary directory.
|
| - scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
|
| - new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
|
| - background_task_runner));
|
| - scoped_ptr<DomDistillerStore> dom_distiller_store(
|
| - new DomDistillerStore(db.Pass(), db_path));
|
| -
|
| - scoped_ptr<DistillerPageFactory> distiller_page_factory(
|
| - new DistillerPageWebContentsFactory(context));
|
| - scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
|
| - new DistillerURLFetcherFactory(context->GetRequestContext()));
|
| -
|
| - dom_distiller::proto::DomDistillerOptions options;
|
| - if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
|
| - options.set_extract_text_only(true);
|
| - }
|
| - int debug_level = 0;
|
| - if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
|
| - base::StringToInt(
|
| - base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
|
| - kDebugLevel),
|
| - &debug_level)) {
|
| - options.set_debug_level(debug_level);
|
| - }
|
| - scoped_ptr<DistillerFactory> distiller_factory(
|
| - new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
|
| - options,
|
| - url_to_domain_map));
|
| -
|
| - // Setting up PrefService for DistilledPagePrefs.
|
| - user_prefs::TestingPrefServiceSyncable* pref_service =
|
| - new user_prefs::TestingPrefServiceSyncable();
|
| - DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
|
| -
|
| - return scoped_ptr<DomDistillerService>(new DomDistillerService(
|
| - dom_distiller_store.Pass(),
|
| - distiller_factory.Pass(),
|
| - distiller_page_factory.Pass(),
|
| - scoped_ptr<DistilledPagePrefs>(new DistilledPagePrefs(pref_service))));
|
| -}
|
| -
|
| -void AddComponentsTestResources() {
|
| - base::FilePath pak_file;
|
| - base::FilePath pak_dir;
|
| - PathService::Get(base::DIR_MODULE, &pak_dir);
|
| - pak_file =
|
| - pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
|
| - ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
|
| - pak_file, ui::SCALE_FACTOR_NONE);
|
| -}
|
| -
|
| -bool WriteProtobufWithSize(
|
| - const google::protobuf::MessageLite& message,
|
| - google::protobuf::io::ZeroCopyOutputStream* output_stream) {
|
| - google::protobuf::io::CodedOutputStream coded_output(output_stream);
|
| -
|
| - // Write the size.
|
| - const int size = message.ByteSize();
|
| - coded_output.WriteLittleEndian32(size);
|
| - message.SerializeWithCachedSizes(&coded_output);
|
| - return !coded_output.HadError();
|
| -}
|
| -
|
| -std::string GetReadableArticleString(
|
| - const DistilledArticleProto& article_proto) {
|
| - std::stringstream output;
|
| - output << "Article Title: " << article_proto.title() << std::endl;
|
| - output << "# of pages: " << article_proto.pages_size() << std::endl;
|
| - for (int i = 0; i < article_proto.pages_size(); ++i) {
|
| - if (i > 0) output << std::endl;
|
| - const DistilledPageProto& page = article_proto.pages(i);
|
| - output << "Page " << i << std::endl;
|
| - output << "URL: " << page.url() << std::endl;
|
| - output << "Content: " << page.html() << std::endl;
|
| - if (page.has_debug_info() && page.debug_info().has_log())
|
| - output << "Log: " << page.debug_info().log() << std::endl;
|
| - if (page.has_pagination_info()) {
|
| - if (page.pagination_info().has_next_page()) {
|
| - output << "Next Page: " << page.pagination_info().next_page()
|
| - << std::endl;
|
| - }
|
| - if (page.pagination_info().has_prev_page()) {
|
| - output << "Prev Page: " << page.pagination_info().prev_page()
|
| - << std::endl;
|
| - }
|
| - }
|
| - }
|
| - return output.str();
|
| -}
|
| -
|
| -} // namespace
|
| -
|
| -class ContentExtractionRequest : public ViewRequestDelegate {
|
| - public:
|
| - void Start(DomDistillerService* service, const gfx::Size& render_view_size,
|
| - base::Closure finished_callback) {
|
| - finished_callback_ = finished_callback;
|
| - viewer_handle_ =
|
| - service->ViewUrl(this,
|
| - service->CreateDefaultDistillerPage(render_view_size),
|
| - url_);
|
| - }
|
| -
|
| - DistilledArticleProto GetArticleCopy() {
|
| - return *article_proto_;
|
| - }
|
| -
|
| - static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
|
| - const base::CommandLine& command_line,
|
| - UrlToDomainMap* url_to_domain_map) {
|
| - ScopedVector<ContentExtractionRequest> requests;
|
| - if (command_line.HasSwitch(kUrlSwitch)) {
|
| - GURL url;
|
| - std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
|
| - url = GURL(url_string);
|
| - if (url.is_valid()) {
|
| - requests.push_back(new ContentExtractionRequest(url));
|
| - if (command_line.HasSwitch(kOriginalDomain)) {
|
| - (*url_to_domain_map)[url.spec()] =
|
| - command_line.GetSwitchValueASCII(kOriginalDomain);
|
| - }
|
| - }
|
| - } else if (command_line.HasSwitch(kUrlsSwitch)) {
|
| - std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
|
| - std::vector<std::string> urls;
|
| - base::SplitString(urls_string, ' ', &urls);
|
| - // Check for original-domains switch, which must exactly pair up with
|
| - // |kUrlsSwitch| i.e. number of domains must be same as that of urls.
|
| - std::vector<std::string> domains;
|
| - if (command_line.HasSwitch(kOriginalDomains)) {
|
| - std::string domains_string =
|
| - command_line.GetSwitchValueASCII( kOriginalDomains);
|
| - base::SplitString(domains_string, ';', &domains);
|
| - if (domains.size() != urls.size()) domains.clear();
|
| - }
|
| - for (size_t i = 0; i < urls.size(); ++i) {
|
| - GURL url(urls[i]);
|
| - if (url.is_valid()) {
|
| - requests.push_back(new ContentExtractionRequest(url));
|
| - // Only regard non-empty domain.
|
| - if (!domains.empty() && !domains[i].empty()) {
|
| - (*url_to_domain_map)[url.spec()] = domains[i];
|
| - }
|
| - } else {
|
| - ADD_FAILURE() << "Bad url";
|
| - }
|
| - }
|
| - }
|
| - if (requests.empty()) {
|
| - ADD_FAILURE() << "No valid url provided";
|
| - }
|
| -
|
| - return requests.Pass();
|
| - }
|
| -
|
| - private:
|
| - ContentExtractionRequest(const GURL& url) : url_(url) {}
|
| -
|
| - void OnArticleUpdated(ArticleDistillationUpdate article_update) override {}
|
| -
|
| - void OnArticleReady(const DistilledArticleProto* article_proto) override {
|
| - article_proto_ = article_proto;
|
| - CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
|
| - base::MessageLoop::current()->PostTask(
|
| - FROM_HERE,
|
| - finished_callback_);
|
| - }
|
| -
|
| - const DistilledArticleProto* article_proto_;
|
| - scoped_ptr<ViewerHandle> viewer_handle_;
|
| - GURL url_;
|
| - base::Closure finished_callback_;
|
| -};
|
| -
|
| -class ContentExtractor : public ContentBrowserTest {
|
| - public:
|
| - ContentExtractor()
|
| - : pending_tasks_(0),
|
| - max_tasks_(kMaxExtractorTasks),
|
| - next_request_(0),
|
| - output_data_(),
|
| - protobuf_output_stream_(
|
| - new google::protobuf::io::StringOutputStream(&output_data_)) {}
|
| -
|
| - // Change behavior of the default host resolver to avoid DNS lookup errors, so
|
| - // we can make network calls.
|
| - void SetUpOnMainThread() override {
|
| - if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
|
| - EnableDNSLookupForThisTest();
|
| - }
|
| - CHECK(db_dir_.CreateUniqueTempDir());
|
| - AddComponentsTestResources();
|
| - }
|
| -
|
| - void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
|
| -
|
| - protected:
|
| - // Creates the DomDistillerService and creates and starts the extraction
|
| - // request.
|
| - void Start() {
|
| - const base::CommandLine& command_line =
|
| - *base::CommandLine::ForCurrentProcess();
|
| - UrlToDomainMap url_to_domain_map;
|
| - requests_ = ContentExtractionRequest::CreateForCommandLine(
|
| - command_line, &url_to_domain_map);
|
| - content::BrowserContext* context =
|
| - shell()->web_contents()->GetBrowserContext();
|
| - service_ = CreateDomDistillerService(context,
|
| - db_dir_.path(),
|
| - url_to_domain_map);
|
| - PumpQueue();
|
| - }
|
| -
|
| - void PumpQueue() {
|
| - while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
|
| - requests_[next_request_]->Start(
|
| - service_.get(),
|
| - shell()->web_contents()->GetContainerBounds().size(),
|
| - base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
|
| - ++next_request_;
|
| - ++pending_tasks_;
|
| - }
|
| - }
|
| -
|
| - private:
|
| - // Change behavior of the default host resolver to allow DNS lookup
|
| - // to proceed instead of being blocked by the test infrastructure.
|
| - void EnableDNSLookupForThisTest() {
|
| - // mock_host_resolver_override_ takes ownership of the resolver.
|
| - scoped_refptr<net::RuleBasedHostResolverProc> resolver =
|
| - new net::RuleBasedHostResolverProc(host_resolver());
|
| - resolver->AllowDirectLookup("*");
|
| - mock_host_resolver_override_.reset(
|
| - new net::ScopedDefaultHostResolverProc(resolver.get()));
|
| - }
|
| -
|
| - // We need to reset the DNS lookup when we finish, or the test will fail.
|
| - void DisableDNSLookupForThisTest() {
|
| - mock_host_resolver_override_.reset();
|
| - }
|
| -
|
| - void FinishRequest() {
|
| - --pending_tasks_;
|
| - if (next_request_ == requests_.size() && pending_tasks_ == 0) {
|
| - Finish();
|
| - } else {
|
| - PumpQueue();
|
| - }
|
| - }
|
| -
|
| - void DoArticleOutput() {
|
| - const base::CommandLine& command_line =
|
| - *base::CommandLine::ForCurrentProcess();
|
| - for (size_t i = 0; i < requests_.size(); ++i) {
|
| - const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
|
| - if (command_line.HasSwitch(kShouldOutputBinary)) {
|
| - WriteProtobufWithSize(article, protobuf_output_stream_.get());
|
| - } else {
|
| - output_data_ += GetReadableArticleString(article) + "\n";
|
| - }
|
| - }
|
| -
|
| - if (command_line.HasSwitch(kOutputFile)) {
|
| - base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile);
|
| - ASSERT_EQ(
|
| - (int)output_data_.size(),
|
| - base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
|
| - } else {
|
| - VLOG(0) << output_data_;
|
| - }
|
| - }
|
| -
|
| - void Finish() {
|
| - DoArticleOutput();
|
| - requests_.clear();
|
| - service_.reset();
|
| - base::MessageLoop::current()->PostTask(
|
| - FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
|
| - }
|
| -
|
| - size_t pending_tasks_;
|
| - size_t max_tasks_;
|
| - size_t next_request_;
|
| -
|
| - base::ScopedTempDir db_dir_;
|
| - scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
|
| - scoped_ptr<DomDistillerService> service_;
|
| - ScopedVector<ContentExtractionRequest> requests_;
|
| -
|
| - std::string output_data_;
|
| - scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
|
| -};
|
| -
|
| -IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
|
| - Start();
|
| - base::RunLoop().Run();
|
| -}
|
| -
|
| -} // namespace dom_distiller
|
|
|