Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(124)

Side by Side Diff: components/dom_distiller/standalone/content_extractor.cc

Issue 901793002: Add support for providing an external file for extracting content. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <sstream> 5 #include <sstream>
6 6
7 #include "base/command_line.h" 7 #include "base/command_line.h"
8 #include "base/files/file_path.h"
9 #include "base/files/file_util.h"
8 #include "base/files/scoped_temp_dir.h" 10 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h" 11 #include "base/id_map.h"
10 #include "base/message_loop/message_loop.h" 12 #include "base/message_loop/message_loop.h"
11 #include "base/path_service.h" 13 #include "base/path_service.h"
12 #include "base/run_loop.h" 14 #include "base/run_loop.h"
13 #include "base/strings/string_number_conversions.h" 15 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_split.h" 16 #include "base/strings/string_split.h"
15 #include "components/dom_distiller/content/distiller_page_web_contents.h" 17 #include "components/dom_distiller/content/distiller_page_web_contents.h"
16 #include "components/dom_distiller/core/article_entry.h" 18 #include "components/dom_distiller/core/article_entry.h"
17 #include "components/dom_distiller/core/distilled_page_prefs.h" 19 #include "components/dom_distiller/core/distilled_page_prefs.h"
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
102 // The original domain of the page if |kUrlSwitch| is a file. 104 // The original domain of the page if |kUrlSwitch| is a file.
103 const char* kOriginalDomain = "original-domain"; 105 const char* kOriginalDomain = "original-domain";
104 106
105 // A semi-colon-separated (i.e. ';') list of original domains corresponding to 107 // A semi-colon-separated (i.e. ';') list of original domains corresponding to
106 // "kUrlsSwitch". 108 // "kUrlsSwitch".
107 const char* kOriginalDomains = "original-domains"; 109 const char* kOriginalDomains = "original-domains";
108 110
109 // Maximum number of concurrent started extractor requests. 111 // Maximum number of concurrent started extractor requests.
110 const int kMaxExtractorTasks = 8; 112 const int kMaxExtractorTasks = 8;
111 113
114 // Path to an external content-extraction script (domdistiller.js). If this
115 // switch is passed, that script is used instead of the bundled version of
116 // the script.
117 const char* kExternalDomDistillerJs = "external-dom-distiller-js";
118
112 scoped_ptr<DomDistillerService> CreateDomDistillerService( 119 scoped_ptr<DomDistillerService> CreateDomDistillerService(
113 content::BrowserContext* context, 120 content::BrowserContext* context,
114 const base::FilePath& db_path, 121 const base::FilePath& db_path,
115 const UrlToDomainMap& url_to_domain_map) { 122 const UrlToDomainMap& url_to_domain_map) {
116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = 123 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( 124 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); 125 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
119 126
120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with 127 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
121 // temporary directory. 128 // temporary directory.
122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( 129 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( 130 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
124 background_task_runner)); 131 background_task_runner));
125 scoped_ptr<DomDistillerStore> dom_distiller_store( 132 scoped_ptr<DomDistillerStore> dom_distiller_store(
126 new DomDistillerStore(db.Pass(), db_path)); 133 new DomDistillerStore(db.Pass(), db_path));
127 134
128 scoped_ptr<DistillerPageFactory> distiller_page_factory( 135 scoped_ptr<DistillerPageFactory> distiller_page_factory;
129 new DistillerPageWebContentsFactory(context)); 136 if (base::CommandLine::ForCurrentProcess()->HasSwitch(
137 kExternalDomDistillerJs)) {
138 std::string external_script_path =
139 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
140 kExternalDomDistillerJs);
141 std::string script_content;
142 if (!base::ReadFileToString(base::FilePath(external_script_path),
143 &script_content)) {
144 ADD_FAILURE() << "Failed to read external script for distillation.";
145 return nullptr;
146 }
147 distiller_page_factory.reset(
148 new DistillerPageWebContentsFactory(context, script_content));
149 } else {
150 distiller_page_factory.reset(new DistillerPageWebContentsFactory(context));
151 }
152
130 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory( 153 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
131 new DistillerURLFetcherFactory(context->GetRequestContext())); 154 new DistillerURLFetcherFactory(context->GetRequestContext()));
132 155
133 dom_distiller::proto::DomDistillerOptions options; 156 dom_distiller::proto::DomDistillerOptions options;
134 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) { 157 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
135 options.set_extract_text_only(true); 158 options.set_extract_text_only(true);
136 } 159 }
137 int debug_level = 0; 160 int debug_level = 0;
138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && 161 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
139 base::StringToInt( 162 base::StringToInt(
(...skipping 269 matching lines...) Expand 10 before | Expand all | Expand 10 after
409 std::string output_data_; 432 std::string output_data_;
410 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; 433 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
411 }; 434 };
412 435
413 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { 436 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
414 Start(); 437 Start();
415 base::RunLoop().Run(); 438 base::RunLoop().Run();
416 } 439 }
417 440
418 } // namespace dom_distiller 441 } // namespace dom_distiller
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698