Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(317)

Unified Diff: components/dom_distiller/standalone/content_extractor.cc

Issue 917663002: Rename content_extractor.cc to content_extractor_browsertest.cc (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/dom_distiller/standalone/content_extractor.cc
diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc
deleted file mode 100644
index 1643b12ca84c2625539b558e39b3f4821998d4aa..0000000000000000000000000000000000000000
--- a/components/dom_distiller/standalone/content_extractor.cc
+++ /dev/null
@@ -1,419 +0,0 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include <sstream>
-
-#include "base/command_line.h"
-#include "base/files/scoped_temp_dir.h"
-#include "base/id_map.h"
-#include "base/message_loop/message_loop.h"
-#include "base/path_service.h"
-#include "base/run_loop.h"
-#include "base/strings/string_number_conversions.h"
-#include "base/strings/string_split.h"
-#include "components/dom_distiller/content/distiller_page_web_contents.h"
-#include "components/dom_distiller/core/article_entry.h"
-#include "components/dom_distiller/core/distilled_page_prefs.h"
-#include "components/dom_distiller/core/distiller.h"
-#include "components/dom_distiller/core/dom_distiller_service.h"
-#include "components/dom_distiller/core/dom_distiller_store.h"
-#include "components/dom_distiller/core/proto/distilled_article.pb.h"
-#include "components/dom_distiller/core/proto/distilled_page.pb.h"
-#include "components/dom_distiller/core/task_tracker.h"
-#include "components/leveldb_proto/proto_database.h"
-#include "components/leveldb_proto/proto_database_impl.h"
-#include "components/pref_registry/testing_pref_service_syncable.h"
-#include "content/public/browser/browser_context.h"
-#include "content/public/browser/browser_thread.h"
-#include "content/public/test/content_browser_test.h"
-#include "content/shell/browser/shell.h"
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
-#include "net/dns/mock_host_resolver.h"
-#include "third_party/dom_distiller_js/dom_distiller.pb.h"
-#include "ui/base/resource/resource_bundle.h"
-
-using content::ContentBrowserTest;
-
-namespace dom_distiller {
-
-namespace {
-
-// Associates a URL spec with the original domain to report for that URL.
-using UrlToDomainMap = base::hash_map<std::string, std::string>;
-
-}  // namespace
-
-// DistillerFactory used by this test. Every Distiller it hands out starts from
-// the same base DomDistillerOptions; when the requested URL has an entry in
-// the URL-to-domain map, that entry is applied as the original_domain option.
-class TestDistillerFactoryImpl : public DistillerFactory {
- public:
-  TestDistillerFactoryImpl(
-      scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
-      const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
-      const UrlToDomainMap& url_to_domain_map)
-      : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
-        dom_distiller_options_(dom_distiller_options),
-        url_to_domain_map_(url_to_domain_map) {}
-
-  ~TestDistillerFactoryImpl() override {}
-
-  // Builds a DistillerImpl configured for |url|.
-  scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
-    // Work on a copy of the shared options so the per-URL override stays
-    // local to this call.
-    dom_distiller::proto::DomDistillerOptions url_options(
-        dom_distiller_options_);
-    const UrlToDomainMap::const_iterator match =
-        url_to_domain_map_.find(url.spec());
-    if (match != url_to_domain_map_.end()) {
-      url_options.set_original_domain(match->second);
-    }
-    return scoped_ptr<Distiller>(
-        new DistillerImpl(*distiller_url_fetcher_factory_, url_options));
-  }
-
- private:
-  // Passed by reference to every DistillerImpl created here.
-  scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
-  // Base options copied into each created distiller.
-  dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
-  // URL spec -> original domain overrides.
-  UrlToDomainMap url_to_domain_map_;
-};
-
-namespace {
-
-// Command-line switch names understood by this test. They are declared as
-// constant character arrays (rather than mutable pointers to constant
-// characters) so the names themselves cannot be reassigned.
-
-// The url to distill.
-const char kUrlSwitch[] = "url";
-
-// A space-separated list of urls to distill.
-const char kUrlsSwitch[] = "urls";
-
-// Indicates that DNS resolution should be disabled for this test.
-const char kDisableDnsSwitch[] = "disable-dns";
-
-// Will write the distilled output to the given file instead of to stdout.
-const char kOutputFile[] = "output-file";
-
-// Indicates to output a serialized protocol buffer instead of human-readable
-// output.
-const char kShouldOutputBinary[] = "output-binary";
-
-// Indicates to output only the text of the article and not the enclosing html.
-const char kExtractTextOnly[] = "extract-text-only";
-
-// Indicates to include debug output.
-const char kDebugLevel[] = "debug-level";
-
-// The original domain of the page if |kUrlSwitch| is a file.
-const char kOriginalDomain[] = "original-domain";
-
-// A semicolon-separated (i.e. ';') list of original domains corresponding to
-// |kUrlsSwitch|.
-const char kOriginalDomains[] = "original-domains";
-
-// Maximum number of concurrent started extractor requests.
-const int kMaxExtractorTasks = 8;
-
-// Assembles the DomDistillerService used by the test: a leveldb-backed store
-// rooted at |db_path|, a WebContents-based page factory bound to |context|,
-// and a TestDistillerFactoryImpl carrying the command-line derived options
-// together with |url_to_domain_map|.
-scoped_ptr<DomDistillerService> CreateDomDistillerService(
-    content::BrowserContext* context,
-    const base::FilePath& db_path,
-    const UrlToDomainMap& url_to_domain_map) {
-  scoped_refptr<base::SequencedTaskRunner> background_task_runner =
-      content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
-          content::BrowserThread::GetBlockingPool()->GetSequenceToken());
-
-  // TODO(cjhopman): use an in-memory database instead of an on-disk one with
-  // temporary directory.
-  scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
-      new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
-          background_task_runner));
-  scoped_ptr<DomDistillerStore> dom_distiller_store(
-      new DomDistillerStore(db.Pass(), db_path));
-
-  scoped_ptr<DistillerPageFactory> distiller_page_factory(
-      new DistillerPageWebContentsFactory(context));
-  scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
-      new DistillerURLFetcherFactory(context->GetRequestContext()));
-
-  // Translate the relevant command-line switches into distiller options.
-  const base::CommandLine& command_line =
-      *base::CommandLine::ForCurrentProcess();
-  dom_distiller::proto::DomDistillerOptions options;
-  if (command_line.HasSwitch(kExtractTextOnly)) {
-    options.set_extract_text_only(true);
-  }
-  if (command_line.HasSwitch(kDebugLevel)) {
-    // The debug level is only applied when the switch value parses as an int.
-    int debug_level = 0;
-    if (base::StringToInt(command_line.GetSwitchValueASCII(kDebugLevel),
-                          &debug_level)) {
-      options.set_debug_level(debug_level);
-    }
-  }
-  scoped_ptr<DistillerFactory> distiller_factory(
-      new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
-                                   options,
-                                   url_to_domain_map));
-
-  // Setting up PrefService for DistilledPagePrefs.
-  // NOTE(review): |pref_service| is handed on as a raw pointer and nothing in
-  // this function deletes it; confirm whether that is an intentional
-  // test-only leak or whether an owner should be introduced.
-  user_prefs::TestingPrefServiceSyncable* pref_service =
-      new user_prefs::TestingPrefServiceSyncable();
-  DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
-  scoped_ptr<DistilledPagePrefs> distilled_page_prefs(
-      new DistilledPagePrefs(pref_service));
-
-  return scoped_ptr<DomDistillerService>(
-      new DomDistillerService(dom_distiller_store.Pass(),
-                              distiller_factory.Pass(),
-                              distiller_page_factory.Pass(),
-                              distilled_page_prefs.Pass()));
-}
-
-// Adds components_tests_resources.pak, resolved relative to the
-// base::DIR_MODULE directory, to the shared ResourceBundle.
-void AddComponentsTestResources() {
-  base::FilePath pak_dir;
-  // Stop immediately if the directory cannot be resolved; otherwise the pak
-  // path below would silently be built from an empty directory.
-  CHECK(PathService::Get(base::DIR_MODULE, &pak_dir));
-  const base::FilePath pak_file =
-      pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
-  ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
-      pak_file, ui::SCALE_FACTOR_NONE);
-}
-
-// Writes |message| to |output_stream| as a little-endian 32-bit byte count
-// followed by the serialized message. Returns false if the coded stream
-// reported an error.
-bool WriteProtobufWithSize(
-    const google::protobuf::MessageLite& message,
-    google::protobuf::io::ZeroCopyOutputStream* output_stream) {
-  google::protobuf::io::CodedOutputStream encoder(output_stream);
-
-  // ByteSize() must run before SerializeWithCachedSizes(), which relies on
-  // the sizes cached by that call.
-  const int message_size = message.ByteSize();
-  encoder.WriteLittleEndian32(message_size);
-  message.SerializeWithCachedSizes(&encoder);
-
-  const bool had_error = encoder.HadError();
-  return !had_error;
-}
-
-// Renders |article_proto| as human-readable text: the title and page count,
-// then for each page its index, URL, HTML content, and, when present, the
-// debug log and the next/previous page links.
-std::string GetReadableArticleString(
-    const DistilledArticleProto& article_proto) {
-  std::stringstream readable;
-  const int page_count = article_proto.pages_size();
-  readable << "Article Title: " << article_proto.title() << '\n'
-           << "# of pages: " << page_count << '\n';
-
-  for (int index = 0; index < page_count; ++index) {
-    // Separate consecutive pages with an empty line.
-    if (index != 0) {
-      readable << '\n';
-    }
-    const DistilledPageProto& page = article_proto.pages(index);
-    readable << "Page " << index << '\n'
-             << "URL: " << page.url() << '\n'
-             << "Content: " << page.html() << '\n';
-    if (page.has_debug_info() && page.debug_info().has_log()) {
-      readable << "Log: " << page.debug_info().log() << '\n';
-    }
-
-    if (!page.has_pagination_info()) {
-      continue;
-    }
-    if (page.pagination_info().has_next_page()) {
-      readable << "Next Page: " << page.pagination_info().next_page() << '\n';
-    }
-    if (page.pagination_info().has_prev_page()) {
-      readable << "Prev Page: " << page.pagination_info().prev_page() << '\n';
-    }
-  }
-  return readable.str();
-}
-
-} // namespace
-
-// One distillation request for a single URL. Acts as the ViewRequestDelegate
-// for the ViewerHandle returned by DomDistillerService::ViewUrl() and keeps a
-// pointer to the distilled article once it has been reported ready.
-class ContentExtractionRequest : public ViewRequestDelegate {
- public:
-  // Starts distilling |url_| through |service|. |finished_callback| is posted
-  // to the current message loop once the article is ready.
-  void Start(DomDistillerService* service, const gfx::Size& render_view_size,
-             base::Closure finished_callback) {
-    finished_callback_ = finished_callback;
-    viewer_handle_ =
-        service->ViewUrl(this,
-                         service->CreateDefaultDistillerPage(render_view_size),
-                         url_);
-  }
-
-  // Returns a copy of the distilled article. Only valid after
-  // OnArticleReady() has run; calling it earlier trips the CHECK instead of
-  // dereferencing an unset pointer.
-  DistilledArticleProto GetArticleCopy() {
-    CHECK(article_proto_) << "Article requested before distillation of "
-                          << url_ << " finished";
-    return *article_proto_;
-  }
-
-  // Builds one request per valid URL named by |kUrlSwitch| or |kUrlsSwitch|
-  // on |command_line|. Original domains given via |kOriginalDomain| or
-  // |kOriginalDomains| are recorded in |url_to_domain_map|, keyed by URL
-  // spec. Records a test failure if no valid URL was provided.
-  static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
-      const base::CommandLine& command_line,
-      UrlToDomainMap* url_to_domain_map) {
-    ScopedVector<ContentExtractionRequest> requests;
-    if (command_line.HasSwitch(kUrlSwitch)) {
-      GURL url;
-      std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
-      url = GURL(url_string);
-      if (url.is_valid()) {
-        requests.push_back(new ContentExtractionRequest(url));
-        if (command_line.HasSwitch(kOriginalDomain)) {
-          (*url_to_domain_map)[url.spec()] =
-              command_line.GetSwitchValueASCII(kOriginalDomain);
-        }
-      }
-    } else if (command_line.HasSwitch(kUrlsSwitch)) {
-      std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
-      std::vector<std::string> urls;
-      base::SplitString(urls_string, ' ', &urls);
-      // Check for original-domains switch, which must exactly pair up with
-      // |kUrlsSwitch| i.e. number of domains must be same as that of urls.
-      std::vector<std::string> domains;
-      if (command_line.HasSwitch(kOriginalDomains)) {
-        std::string domains_string =
-            command_line.GetSwitchValueASCII(kOriginalDomains);
-        base::SplitString(domains_string, ';', &domains);
-        // A mismatched list is ignored entirely, which also keeps the
-        // domains[i] lookup below in range.
-        if (domains.size() != urls.size()) domains.clear();
-      }
-      for (size_t i = 0; i < urls.size(); ++i) {
-        GURL url(urls[i]);
-        if (url.is_valid()) {
-          requests.push_back(new ContentExtractionRequest(url));
-          // Only regard non-empty domain.
-          if (!domains.empty() && !domains[i].empty()) {
-            (*url_to_domain_map)[url.spec()] = domains[i];
-          }
-        } else {
-          ADD_FAILURE() << "Bad url";
-        }
-      }
-    }
-    if (requests.empty()) {
-      ADD_FAILURE() << "No valid url provided";
-    }
-
-    return requests.Pass();
-  }
-
- private:
-  // Instances are only created through CreateForCommandLine(). The article
-  // pointer starts out null so that premature reads are detectable.
-  explicit ContentExtractionRequest(const GURL& url)
-      : article_proto_(nullptr), url_(url) {}
-
-  void OnArticleUpdated(ArticleDistillationUpdate article_update) override {}
-
-  // Remembers the distilled article, requires it to contain at least one
-  // page, and schedules |finished_callback_| on the current message loop.
-  void OnArticleReady(const DistilledArticleProto* article_proto) override {
-    article_proto_ = article_proto;
-    CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
-    base::MessageLoop::current()->PostTask(
-        FROM_HERE,
-        finished_callback_);
-  }
-
-  // Not owned. NOTE(review): presumably stays valid only while
-  // |viewer_handle_| is alive; confirm against the DomDistillerService
-  // contract.
-  const DistilledArticleProto* article_proto_;
-  scoped_ptr<ViewerHandle> viewer_handle_;
-  GURL url_;
-  base::Closure finished_callback_;
-};
-
-// Manual browser test fixture that distills the URL(s) named on the command
-// line and emits the resulting articles either as readable text or as
-// size-prefixed serialized protobufs. At most |max_tasks_| requests are in
-// flight at any time.
-class ContentExtractor : public ContentBrowserTest {
- public:
-  ContentExtractor()
-      : pending_tasks_(0),
-        max_tasks_(kMaxExtractorTasks),
-        next_request_(0),
-        output_data_(),
-        protobuf_output_stream_(
-            new google::protobuf::io::StringOutputStream(&output_data_)) {}
-
-  // Change behavior of the default host resolver to avoid DNS lookup errors, so
-  // we can make network calls.
-  void SetUpOnMainThread() override {
-    if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
-      EnableDNSLookupForThisTest();
-    }
-    CHECK(db_dir_.CreateUniqueTempDir());
-    AddComponentsTestResources();
-  }
-
-  void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
-
- protected:
-  // Creates the DomDistillerService and creates and starts the extraction
-  // request.
-  void Start() {
-    const base::CommandLine& command_line =
-        *base::CommandLine::ForCurrentProcess();
-    UrlToDomainMap url_to_domain_map;
-    requests_ = ContentExtractionRequest::CreateForCommandLine(
-        command_line, &url_to_domain_map);
-    content::BrowserContext* context =
-        shell()->web_contents()->GetBrowserContext();
-    service_ = CreateDomDistillerService(context,
-                                         db_dir_.path(),
-                                         url_to_domain_map);
-    PumpQueue();
-  }
-
-  // Starts queued requests until either |max_tasks_| are pending or every
-  // request has been started.
-  void PumpQueue() {
-    while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
-      requests_[next_request_]->Start(
-          service_.get(),
-          shell()->web_contents()->GetContainerBounds().size(),
-          base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
-      ++next_request_;
-      ++pending_tasks_;
-    }
-  }
-
- private:
-  // Change behavior of the default host resolver to allow DNS lookup
-  // to proceed instead of being blocked by the test infrastructure.
-  void EnableDNSLookupForThisTest() {
-    // mock_host_resolver_override_ takes ownership of the resolver.
-    scoped_refptr<net::RuleBasedHostResolverProc> resolver =
-        new net::RuleBasedHostResolverProc(host_resolver());
-    resolver->AllowDirectLookup("*");
-    mock_host_resolver_override_.reset(
-        new net::ScopedDefaultHostResolverProc(resolver.get()));
-  }
-
-  // We need to reset the DNS lookup when we finish, or the test will fail.
-  void DisableDNSLookupForThisTest() {
-    mock_host_resolver_override_.reset();
-  }
-
-  // Completion callback for one request: finishes the test once every request
-  // has been started and none is pending, otherwise starts more requests.
-  void FinishRequest() {
-    --pending_tasks_;
-    if (next_request_ == requests_.size() && pending_tasks_ == 0) {
-      Finish();
-    } else {
-      PumpQueue();
-    }
-  }
-
-  // Accumulates every article into |output_data_| (binary or readable,
-  // depending on |kShouldOutputBinary|) and then writes the result to the
-  // file named by |kOutputFile|, or logs it when no file was requested.
-  void DoArticleOutput() {
-    const base::CommandLine& command_line =
-        *base::CommandLine::ForCurrentProcess();
-    const bool output_binary = command_line.HasSwitch(kShouldOutputBinary);
-    for (size_t i = 0; i < requests_.size(); ++i) {
-      const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
-      if (output_binary) {
-        // Report serialization errors instead of silently emitting
-        // truncated output.
-        EXPECT_TRUE(
-            WriteProtobufWithSize(article, protobuf_output_stream_.get()))
-            << "Failed to serialize article " << i;
-      } else {
-        output_data_ += GetReadableArticleString(article) + "\n";
-      }
-    }
-
-    if (command_line.HasSwitch(kOutputFile)) {
-      base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile);
-      // The write must cover the whole buffer to count as successful.
-      const int expected_size = static_cast<int>(output_data_.size());
-      ASSERT_EQ(
-          expected_size,
-          base::WriteFile(filename, output_data_.c_str(), expected_size));
-    } else {
-      VLOG(0) << output_data_;
-    }
-  }
-
-  // Emits the output, releases the requests and the service, and posts the
-  // quit-when-idle closure to the current message loop.
-  void Finish() {
-    DoArticleOutput();
-    requests_.clear();
-    service_.reset();
-    base::MessageLoop::current()->PostTask(
-        FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
-  }
-
-  // Number of started requests that have not yet finished.
-  size_t pending_tasks_;
-  // Upper bound on |pending_tasks_|.
-  size_t max_tasks_;
-  // Index into |requests_| of the next request to start.
-  size_t next_request_;
-
-  base::ScopedTempDir db_dir_;
-  scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
-  scoped_ptr<DomDistillerService> service_;
-  ScopedVector<ContentExtractionRequest> requests_;
-
-  // Collected output; |protobuf_output_stream_| appends to this string.
-  std::string output_data_;
-  scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
-};
-
-// Manual entry point: starts extraction for the URLs given on the command
-// line, then runs a RunLoop while the requests complete.
-IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
-  Start();
-  base::RunLoop run_loop;
-  run_loop.Run();
-}
-
-} // namespace dom_distiller
« no previous file with comments | « components/components_tests.gyp ('k') | components/dom_distiller/standalone/content_extractor_browsertest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698