Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(62)

Side by Side Diff: components/dom_distiller/standalone/content_extractor.cc

Issue 687183003: add options to specify original domain(s) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <sstream> 5 #include <sstream>
6 6
7 #include "base/command_line.h" 7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h" 8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h"
9 #include "base/message_loop/message_loop.h" 10 #include "base/message_loop/message_loop.h"
10 #include "base/path_service.h" 11 #include "base/path_service.h"
11 #include "base/run_loop.h" 12 #include "base/run_loop.h"
12 #include "base/strings/string_number_conversions.h" 13 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_split.h" 14 #include "base/strings/string_split.h"
14 #include "components/dom_distiller/content/distiller_page_web_contents.h" 15 #include "components/dom_distiller/content/distiller_page_web_contents.h"
15 #include "components/dom_distiller/core/article_entry.h" 16 #include "components/dom_distiller/core/article_entry.h"
16 #include "components/dom_distiller/core/distilled_page_prefs.h" 17 #include "components/dom_distiller/core/distilled_page_prefs.h"
17 #include "components/dom_distiller/core/distiller.h" 18 #include "components/dom_distiller/core/distiller.h"
18 #include "components/dom_distiller/core/dom_distiller_service.h" 19 #include "components/dom_distiller/core/dom_distiller_service.h"
(...skipping 13 matching lines...) Expand all
32 #include "net/dns/mock_host_resolver.h" 33 #include "net/dns/mock_host_resolver.h"
33 #include "third_party/dom_distiller_js/dom_distiller.pb.h" 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
34 #include "ui/base/resource/resource_bundle.h" 35 #include "ui/base/resource/resource_bundle.h"
35 36
36 using content::ContentBrowserTest; 37 using content::ContentBrowserTest;
37 38
38 namespace dom_distiller { 39 namespace dom_distiller {
39 40
40 namespace { 41 namespace {
41 42
43 typedef base::hash_map<std::string, std::string> UrlToDomainMap;
44
45 }
46
47 // Factory for creating a Distiller that creates different DomDistillerOptions
48 // for different URLs, i.e. a specific kOriginalDomain option for each URL.
49 class TestDistillerFactoryImpl : public DistillerFactory {
50 public:
51 TestDistillerFactoryImpl(
52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
54 const UrlToDomainMap& url_to_domain_map)
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
56 dom_distiller_options_(dom_distiller_options),
57 url_to_domain_map_(url_to_domain_map) {
58 }
59
60 ~TestDistillerFactoryImpl() override {}
61
62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
63 dom_distiller::proto::DomDistillerOptions options;
64 options = dom_distiller_options_;
65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec());
66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second);
67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
68 *distiller_url_fetcher_factory_, options));
69 return distiller.Pass();
70 }
71
72 private:
73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
75 UrlToDomainMap url_to_domain_map_;
76 };
77
78 namespace {
79
42 // The url to distill. 80 // The url to distill.
43 const char* kUrlSwitch = "url"; 81 const char* kUrlSwitch = "url";
44 82
45 // A space-separated list of urls to distill. 83 // A space-separated list of urls to distill.
46 const char* kUrlsSwitch = "urls"; 84 const char* kUrlsSwitch = "urls";
47 85
48 // Indicates that DNS resolution should be disabled for this test. 86 // Indicates that DNS resolution should be disabled for this test.
49 const char* kDisableDnsSwitch = "disable-dns"; 87 const char* kDisableDnsSwitch = "disable-dns";
50 88
51 // Will write the distilled output to the given file instead of to stdout. 89 // Will write the distilled output to the given file instead of to stdout.
52 const char* kOutputFile = "output-file"; 90 const char* kOutputFile = "output-file";
53 91
54 // Indicates to output a serialized protocol buffer instead of human-readable 92 // Indicates to output a serialized protocol buffer instead of human-readable
55 // output. 93 // output.
56 const char* kShouldOutputBinary = "output-binary"; 94 const char* kShouldOutputBinary = "output-binary";
57 95
58 // Indicates to output only the text of the article and not the enclosing html. 96 // Indicates to output only the text of the article and not the enclosing html.
59 const char* kExtractTextOnly = "extract-text-only"; 97 const char* kExtractTextOnly = "extract-text-only";
60 98
61 // Indicates to include debug output. 99 // Indicates to include debug output.
62 const char* kDebugLevel = "debug-level"; 100 const char* kDebugLevel = "debug-level";
63 101
102 // The original domain of the page if |kUrlSwitch| is a file.
103 const char* kOriginalDomain = "original-domain";
104
105 // A semi-colon-separated (i.e. ';') list of original domains corresponding to
cjhopman 2014/10/29 17:13:41 Do space-separated instead (like --urls)
kuan 2014/10/29 17:26:53 hm.. i changed to use ';' in case the domain can't
106 // "kUrlsSwitch".
107 const char* kOriginalDomains = "original-domains";
108
64 // Maximum number of concurrent started extractor requests. 109 // Maximum number of concurrent started extractor requests.
65 const int kMaxExtractorTasks = 8; 110 const int kMaxExtractorTasks = 8;
66 111
67 scoped_ptr<DomDistillerService> CreateDomDistillerService( 112 scoped_ptr<DomDistillerService> CreateDomDistillerService(
68 content::BrowserContext* context, 113 content::BrowserContext* context,
69 const base::FilePath& db_path) { 114 const base::FilePath& db_path,
115 const UrlToDomainMap& url_to_domain_map) {
70 scoped_refptr<base::SequencedTaskRunner> background_task_runner = 116 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
71 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( 117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
72 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); 118 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
73 119
74 // TODO(cjhopman): use an in-memory database instead of an on-disk one with 120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
75 // temporary directory. 121 // temporary directory.
76 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( 122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
77 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( 123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
78 background_task_runner)); 124 background_task_runner));
79 scoped_ptr<DomDistillerStore> dom_distiller_store( 125 scoped_ptr<DomDistillerStore> dom_distiller_store(
(...skipping 10 matching lines...) Expand all
90 } 136 }
91 int debug_level = 0; 137 int debug_level = 0;
92 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && 138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
93 base::StringToInt( 139 base::StringToInt(
94 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( 140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
95 kDebugLevel), 141 kDebugLevel),
96 &debug_level)) { 142 &debug_level)) {
97 options.set_debug_level(debug_level); 143 options.set_debug_level(debug_level);
98 } 144 }
99 scoped_ptr<DistillerFactory> distiller_factory( 145 scoped_ptr<DistillerFactory> distiller_factory(
100 new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); 146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
147 options,
148 url_to_domain_map));
101 149
102 // Setting up PrefService for DistilledPagePrefs. 150 // Setting up PrefService for DistilledPagePrefs.
103 user_prefs::TestingPrefServiceSyncable* pref_service = 151 user_prefs::TestingPrefServiceSyncable* pref_service =
104 new user_prefs::TestingPrefServiceSyncable(); 152 new user_prefs::TestingPrefServiceSyncable();
105 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); 153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
106 154
107 return scoped_ptr<DomDistillerService>(new DomDistillerService( 155 return scoped_ptr<DomDistillerService>(new DomDistillerService(
108 dom_distiller_store.Pass(), 156 dom_distiller_store.Pass(),
109 distiller_factory.Pass(), 157 distiller_factory.Pass(),
110 distiller_page_factory.Pass(), 158 distiller_page_factory.Pass(),
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
170 service->ViewUrl(this, 218 service->ViewUrl(this,
171 service->CreateDefaultDistillerPage(render_view_size), 219 service->CreateDefaultDistillerPage(render_view_size),
172 url_); 220 url_);
173 } 221 }
174 222
175 DistilledArticleProto GetArticleCopy() { 223 DistilledArticleProto GetArticleCopy() {
176 return *article_proto_; 224 return *article_proto_;
177 } 225 }
178 226
179 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( 227 static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
180 const CommandLine& command_line) { 228 const CommandLine& command_line,
229 UrlToDomainMap* url_to_domain_map) {
181 ScopedVector<ContentExtractionRequest> requests; 230 ScopedVector<ContentExtractionRequest> requests;
182 if (command_line.HasSwitch(kUrlSwitch)) { 231 if (command_line.HasSwitch(kUrlSwitch)) {
183 GURL url; 232 GURL url;
184 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); 233 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
185 url = GURL(url_string); 234 url = GURL(url_string);
186 if (url.is_valid()) { 235 if (url.is_valid()) {
187 requests.push_back(new ContentExtractionRequest(url)); 236 requests.push_back(new ContentExtractionRequest(url));
237 if (command_line.HasSwitch(kOriginalDomain)) {
238 (*url_to_domain_map)[url.spec()] =
239 command_line.GetSwitchValueASCII(kOriginalDomain);
240 }
188 } 241 }
189 } else if (command_line.HasSwitch(kUrlsSwitch)) { 242 } else if (command_line.HasSwitch(kUrlsSwitch)) {
190 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); 243 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
191 std::vector<std::string> urls; 244 std::vector<std::string> urls;
192 base::SplitString(urls_string, ' ', &urls); 245 base::SplitString(urls_string, ' ', &urls);
246 // Check for original-domains switch, which must exactly pair up with
247 // |kUrlsSwitch| i.e. number of domains must be same as that of urls.
248 std::vector<std::string> domains;
249 if (command_line.HasSwitch(kOriginalDomains)) {
250 std::string domains_string =
251 command_line.GetSwitchValueASCII( kOriginalDomains);
252 base::SplitString(domains_string, ';', &domains);
253 if (domains.size() != urls.size()) domains.clear();
254 }
193 for (size_t i = 0; i < urls.size(); ++i) { 255 for (size_t i = 0; i < urls.size(); ++i) {
194 GURL url(urls[i]); 256 GURL url(urls[i]);
195 if (url.is_valid()) { 257 if (url.is_valid()) {
196 requests.push_back(new ContentExtractionRequest(url)); 258 requests.push_back(new ContentExtractionRequest(url));
259 // Only regard non-empty domain.
260 if (!domains.empty() && !domains[i].empty()) {
261 (*url_to_domain_map)[url.spec()] = domains[i];
262 }
197 } else { 263 } else {
198 ADD_FAILURE() << "Bad url"; 264 ADD_FAILURE() << "Bad url";
199 } 265 }
200 } 266 }
201 } 267 }
202 if (requests.empty()) { 268 if (requests.empty()) {
203 ADD_FAILURE() << "No valid url provided"; 269 ADD_FAILURE() << "No valid url provided";
204 } 270 }
205 271
206 return requests.Pass(); 272 return requests.Pass();
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
248 } 314 }
249 315
250 virtual void TearDownOnMainThread() override { 316 virtual void TearDownOnMainThread() override {
251 DisableDNSLookupForThisTest(); 317 DisableDNSLookupForThisTest();
252 } 318 }
253 319
254 protected: 320 protected:
255 // Creates the DomDistillerService and creates and starts the extraction 321 // Creates the DomDistillerService and creates and starts the extraction
256 // request. 322 // request.
257 void Start() { 323 void Start() {
324 const CommandLine& command_line = *CommandLine::ForCurrentProcess();
325 UrlToDomainMap url_to_domain_map;
326 requests_ = ContentExtractionRequest::CreateForCommandLine(
327 command_line, &url_to_domain_map);
258 content::BrowserContext* context = 328 content::BrowserContext* context =
259 shell()->web_contents()->GetBrowserContext(); 329 shell()->web_contents()->GetBrowserContext();
260 service_ = CreateDomDistillerService(context, 330 service_ = CreateDomDistillerService(context,
261 db_dir_.path()); 331 db_dir_.path(),
262 const CommandLine& command_line = *CommandLine::ForCurrentProcess(); 332 url_to_domain_map);
263 requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
264 PumpQueue(); 333 PumpQueue();
265 } 334 }
266 335
267 void PumpQueue() { 336 void PumpQueue() {
268 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { 337 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
269 requests_[next_request_]->Start( 338 requests_[next_request_]->Start(
270 service_.get(), 339 service_.get(),
271 shell()->web_contents()->GetContainerBounds().size(), 340 shell()->web_contents()->GetContainerBounds().size(),
272 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); 341 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
273 ++next_request_; 342 ++next_request_;
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
342 std::string output_data_; 411 std::string output_data_;
343 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; 412 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
344 }; 413 };
345 414
346 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { 415 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
347 Start(); 416 Start();
348 base::RunLoop().Run(); 417 base::RunLoop().Run();
349 } 418 }
350 419
351 } // namespace dom_distiller 420 } // namespace dom_distiller
OLDNEW
« components/dom_distiller/core/distiller.cc ('K') | « components/dom_distiller/core/task_tracker.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698