| Index: components/dom_distiller/standalone/content_extractor.cc
|
| diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc
|
| index e851a711e75c95a2a25941d52c1ca78dfd84cea6..0b609ed21064ede18d7f9ff8b60b0605628fcf2d 100644
|
| --- a/components/dom_distiller/standalone/content_extractor.cc
|
| +++ b/components/dom_distiller/standalone/content_extractor.cc
|
| @@ -10,6 +10,7 @@
|
| #include "base/path_service.h"
|
| #include "base/run_loop.h"
|
| #include "base/strings/string_number_conversions.h"
|
| +#include "base/strings/string_split.h"
|
| #include "components/dom_distiller/content/distiller_page_web_contents.h"
|
| #include "components/dom_distiller/core/distiller.h"
|
| #include "components/dom_distiller/core/dom_distiller_database.h"
|
| @@ -22,6 +23,8 @@
|
| #include "content/public/browser/browser_thread.h"
|
| #include "content/public/test/content_browser_test.h"
|
| #include "content/shell/browser/shell.h"
|
| +#include "google/protobuf/io/coded_stream.h"
|
| +#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
|
| #include "net/dns/mock_host_resolver.h"
|
| #include "third_party/dom_distiller_js/dom_distiller.pb.h"
|
| #include "ui/base/resource/resource_bundle.h"
|
| @@ -35,6 +38,9 @@ namespace {
|
| // The url to distill.
|
| const char* kUrlSwitch = "url";
|
|
|
| +// A space-separated list of urls to distill.
|
| +const char* kUrlsSwitch = "urls";
|
| +
|
| // Indicates that DNS resolution should be disabled for this test.
|
| const char* kDisableDnsSwitch = "disable-dns";
|
|
|
| @@ -51,6 +57,9 @@ const char* kExtractTextOnly = "extract-text-only";
|
| // Indicates to include debug output.
|
| const char* kDebugLevel = "debug-level";
|
|
|
| +// Maximum number of extractor requests allowed to be in flight at once.
|
| +const int kMaxExtractorTasks = 8;
|
| +
|
| scoped_ptr<DomDistillerService> CreateDomDistillerService(
|
| content::BrowserContext* context,
|
| const base::FilePath& db_path) {
|
| @@ -100,29 +109,30 @@ void AddComponentsResources() {
|
| pak_file, ui::SCALE_FACTOR_NONE);
|
| }
|
|
|
| -void LogArticle(const DistilledArticleProto& article_proto) {
|
| - std::stringstream output;
|
| - if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
|
| - output << article_proto.SerializeAsString();
|
| - } else {
|
| - output << "Article Title: " << article_proto.title() << std::endl;
|
| - output << "# of pages: " << article_proto.pages_size() << std::endl;
|
| - for (int i = 0; i < article_proto.pages_size(); ++i) {
|
| - const DistilledPageProto& page = article_proto.pages(i);
|
| - output << "Page " << i << std::endl;
|
| - output << "URL: " << page.url() << std::endl;
|
| - output << "Content: " << page.html() << std::endl;
|
| - }
|
| - }
|
| +bool WriteProtobufWithSize(
|
| + const google::protobuf::MessageLite& message,
|
| + google::protobuf::io::ZeroCopyOutputStream* output_stream) {
|
| + google::protobuf::io::CodedOutputStream coded_output(output_stream);
|
| + // Frame layout: the message's byte size as a little-endian 32-bit value,
|
| + // then the serialized message. Returns false if the stream hit an error.
|
| + const int size = message.ByteSize();
|
| + coded_output.WriteLittleEndian32(size);
|
| + message.SerializeWithCachedSizes(&coded_output);
|
| + return !coded_output.HadError();
|
| +}
|
|
|
| - std::string data = output.str();
|
| - if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
|
| - base::FilePath filename =
|
| - CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
|
| - base::WriteFile(filename, data.c_str(), data.size());
|
| - } else {
|
| - VLOG(0) << data;
|
| +// Formats |article_proto| as plain text: title, page count, then each page.
|
| +std::string GetReadableArticleString(
|
| +    const DistilledArticleProto& article_proto) {
|
| +  std::stringstream text;
|
| +  text << "Article Title: " << article_proto.title() << '\n'
|
| +       << "# of pages: " << article_proto.pages_size() << '\n';
|
| +  for (int i = 0; i < article_proto.pages_size(); ++i) {
|
| +    const DistilledPageProto& page = article_proto.pages(i);
|
| +    text << "Page " << i << '\n' << "URL: " << page.url() << '\n'
|
| +         << "Content: " << page.html() << '\n';
|
| }
|
| +  return text.str();
|
| }
|
|
|
| } // namespace
|
| @@ -139,19 +149,34 @@ class ContentExtractionRequest : public ViewRequestDelegate {
|
| return *article_proto_;
|
| }
|
|
|
| - static scoped_ptr<ContentExtractionRequest> CreateForCommandLine(
|
| +  // Builds one request per valid url from --url, or else from --urls.
|
| +  static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
|
| const CommandLine& command_line) {
|
| - GURL url;
|
| +    ScopedVector<ContentExtractionRequest> requests;
|
| if (command_line.HasSwitch(kUrlSwitch)) {
|
| +      GURL url;
|
| std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
|
| url = GURL(url_string);
|
| +      if (url.is_valid()) {
|
| +        requests.push_back(new ContentExtractionRequest(url));
|
| +      }
|
| +    } else if (command_line.HasSwitch(kUrlsSwitch)) {
|
| +      std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
|
| +      std::vector<std::string> urls;
|
| +      base::SplitString(urls_string, ' ', &urls);
|
| +      for (size_t i = 0; i < urls.size(); ++i) {
|
| +        GURL url(urls[i]);
|
| +        if (url.is_valid()) {
|
| +          requests.push_back(new ContentExtractionRequest(url));
|
| +        } else {
|
| +          ADD_FAILURE() << "Bad url: " << urls[i];
|
| +        }
|
| +      }
|
| }
|
| - if (!url.is_valid()) {
|
| +    if (requests.empty()) {
|
| ADD_FAILURE() << "No valid url provided";
|
| - return scoped_ptr<ContentExtractionRequest>();
|
| }
|
| - return scoped_ptr<ContentExtractionRequest>(
|
| - new ContentExtractionRequest(url));
|
| +    return requests.Pass();
|
| }
|
|
|
| private:
|
| @@ -175,6 +200,15 @@ class ContentExtractionRequest : public ViewRequestDelegate {
|
| };
|
|
|
| class ContentExtractor : public ContentBrowserTest {
|
| + public:
|
| +  // |protobuf_output_stream_| appends to |output_data_|, declared before it.
|
| +  ContentExtractor()
|
| +      : pending_tasks_(0),
|
| +        max_tasks_(kMaxExtractorTasks),
|
| +        next_request_(0),
|
| +        protobuf_output_stream_(
|
| +            new google::protobuf::io::StringOutputStream(&output_data_)) {}
|
| +
|
| // Change behavior of the default host resolver to avoid DNS lookup errors, so
|
| // we can make network calls.
|
| virtual void SetUpOnMainThread() OVERRIDE {
|
| @@ -198,10 +232,18 @@ class ContentExtractor : public ContentBrowserTest {
|
| service_ = CreateDomDistillerService(context,
|
| db_dir_.path());
|
| const CommandLine& command_line = *CommandLine::ForCurrentProcess();
|
| - request_ = ContentExtractionRequest::CreateForCommandLine(command_line);
|
| - request_->Start(
|
| - service_.get(),
|
| - base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
|
| + requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
|
| + PumpQueue();
|
| + }
|
| +
|
| +  void PumpQueue() {  // Starts requests while fewer than |max_tasks_| run.
|
| +    while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
|
| +      requests_[next_request_++]->Start(
|
| +          service_.get(),
|
| +          base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
|
| +      ++pending_tasks_;
|
| +    }
|
| +    if (pending_tasks_ == 0) Finish();  // Nothing to wait for; quit the loop.
|
| }
|
|
|
| private:
|
| @@ -221,18 +263,55 @@ class ContentExtractor : public ContentBrowserTest {
|
| mock_host_resolver_override_.reset();
|
| }
|
|
|
| +  // Request-done callback: refill the queue, or finish once nothing is left.
|
| +  void FinishRequest() {
|
| +    --pending_tasks_;
|
| +    if (next_request_ != requests_.size() || pending_tasks_ != 0)
|
| +      PumpQueue();
|
| +    else
|
| +      Finish();
|
| +  }
|
| +
|
| +  // Appends every article to |output_data_|, then writes it out or logs it.
|
| +  void DoArticleOutput() {
|
| +    const CommandLine& command_line = *CommandLine::ForCurrentProcess();
|
| +    for (size_t i = 0; i < requests_.size(); ++i) {
|
| +      const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
|
| +      if (command_line.HasSwitch(kShouldOutputBinary)) {
|
| +        EXPECT_TRUE(
|
| +            WriteProtobufWithSize(article, protobuf_output_stream_.get()));
|
| +      } else {
|
| +        output_data_ += GetReadableArticleString(article) + "\n";
|
| +      }
|
| +    }
|
| +    if (command_line.HasSwitch(kOutputFile)) {
|
| +      ASSERT_EQ(static_cast<int>(output_data_.size()),
|
| +                base::WriteFile(command_line.GetSwitchValuePath(kOutputFile),
|
| +                                output_data_.c_str(), output_data_.size()));
|
| +    } else {
|
| +      VLOG(0) << output_data_;
|
| +    }
|
| +  }
|
| +
|
| void Finish() {
|
| - LogArticle(request_->GetArticleCopy());
|
| - request_.reset();
|
| + DoArticleOutput();
|
| + requests_.clear();
|
| service_.reset();
|
| base::MessageLoop::current()->PostTask(
|
| FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
|
| }
|
|
|
| + size_t pending_tasks_;
|
| + size_t max_tasks_;
|
| + size_t next_request_;
|
| +
|
| base::ScopedTempDir db_dir_;
|
| scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
|
| scoped_ptr<DomDistillerService> service_;
|
| - scoped_ptr<ContentExtractionRequest> request_;
|
| + ScopedVector<ContentExtractionRequest> requests_;
|
| +
|
| + std::string output_data_;
|
| + scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
|
| };
|
|
|
| IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
|
|
|