OLD | NEW |
---|---|
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <sstream> | 5 #include <sstream> |
6 | 6 |
7 #include "base/command_line.h" | 7 #include "base/command_line.h" |
8 #include "base/files/scoped_temp_dir.h" | 8 #include "base/files/scoped_temp_dir.h" |
9 #include "base/id_map.h" | |
9 #include "base/message_loop/message_loop.h" | 10 #include "base/message_loop/message_loop.h" |
10 #include "base/path_service.h" | 11 #include "base/path_service.h" |
11 #include "base/run_loop.h" | 12 #include "base/run_loop.h" |
12 #include "base/strings/string_number_conversions.h" | 13 #include "base/strings/string_number_conversions.h" |
13 #include "base/strings/string_split.h" | 14 #include "base/strings/string_split.h" |
14 #include "components/dom_distiller/content/distiller_page_web_contents.h" | 15 #include "components/dom_distiller/content/distiller_page_web_contents.h" |
15 #include "components/dom_distiller/core/article_entry.h" | 16 #include "components/dom_distiller/core/article_entry.h" |
16 #include "components/dom_distiller/core/distilled_page_prefs.h" | 17 #include "components/dom_distiller/core/distilled_page_prefs.h" |
17 #include "components/dom_distiller/core/distiller.h" | 18 #include "components/dom_distiller/core/distiller.h" |
18 #include "components/dom_distiller/core/dom_distiller_service.h" | 19 #include "components/dom_distiller/core/dom_distiller_service.h" |
(...skipping 13 matching lines...) Expand all Loading... | |
32 #include "net/dns/mock_host_resolver.h" | 33 #include "net/dns/mock_host_resolver.h" |
33 #include "third_party/dom_distiller_js/dom_distiller.pb.h" | 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" |
34 #include "ui/base/resource/resource_bundle.h" | 35 #include "ui/base/resource/resource_bundle.h" |
35 | 36 |
36 using content::ContentBrowserTest; | 37 using content::ContentBrowserTest; |
37 | 38 |
38 namespace dom_distiller { | 39 namespace dom_distiller { |
39 | 40 |
40 namespace { | 41 namespace { |
41 | 42 |
43 typedef base::hash_map<std::string, std::string> UrlToDomainMap; | |
44 | |
45 } | |
46 | |
47 // Factory for creating a Distiller that creates different DomDistillerOptions | |
48 // for different URLs, i.e. a specific kOriginalDomain option for each URL. | |
49 class TestDistillerFactoryImpl : public DistillerFactory { | |
50 public: | |
51 TestDistillerFactoryImpl( | |
52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, | |
53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, | |
54 const UrlToDomainMap& url_to_domain_map) | |
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), | |
56 dom_distiller_options_(dom_distiller_options), | |
57 url_to_domain_map_(url_to_domain_map) { | |
58 } | |
59 | |
60 ~TestDistillerFactoryImpl() override {} | |
61 | |
62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { | |
63 dom_distiller::proto::DomDistillerOptions options; | |
64 options = dom_distiller_options_; | |
65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); | |
66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); | |
67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | |
68 *distiller_url_fetcher_factory_, options)); | |
69 return distiller.Pass(); | |
70 } | |
71 | |
72 private: | |
73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; | |
74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; | |
75 UrlToDomainMap url_to_domain_map_; | |
76 }; | |
77 | |
78 namespace { | |
79 | |
42 // The url to distill. | 80 // The url to distill. |
43 const char* kUrlSwitch = "url"; | 81 const char* kUrlSwitch = "url"; |
44 | 82 |
45 // A space-separated list of urls to distill. | 83 // A space-separated list of urls to distill. |
46 const char* kUrlsSwitch = "urls"; | 84 const char* kUrlsSwitch = "urls"; |
47 | 85 |
48 // Indicates that DNS resolution should be disabled for this test. | 86 // Indicates that DNS resolution should be disabled for this test. |
49 const char* kDisableDnsSwitch = "disable-dns"; | 87 const char* kDisableDnsSwitch = "disable-dns"; |
50 | 88 |
51 // Will write the distilled output to the given file instead of to stdout. | 89 // Will write the distilled output to the given file instead of to stdout. |
52 const char* kOutputFile = "output-file"; | 90 const char* kOutputFile = "output-file"; |
53 | 91 |
54 // Indicates to output a serialized protocol buffer instead of human-readable | 92 // Indicates to output a serialized protocol buffer instead of human-readable |
55 // output. | 93 // output. |
56 const char* kShouldOutputBinary = "output-binary"; | 94 const char* kShouldOutputBinary = "output-binary"; |
57 | 95 |
58 // Indicates to output only the text of the article and not the enclosing html. | 96 // Indicates to output only the text of the article and not the enclosing html. |
59 const char* kExtractTextOnly = "extract-text-only"; | 97 const char* kExtractTextOnly = "extract-text-only"; |
60 | 98 |
61 // Indicates to include debug output. | 99 // Indicates to include debug output. |
62 const char* kDebugLevel = "debug-level"; | 100 const char* kDebugLevel = "debug-level"; |
63 | 101 |
102 // The original domain of the page if |kUrlSwitch| is a file. | |
103 const char* kOriginalDomain = "original-domain"; | |
104 | |
105 // A semicolon-separated (i.e. ';') list of original domains corresponding to
cjhopman
2014/10/29 17:13:41
Do space-separated instead (like --urls)
kuan
2014/10/29 17:26:53
hm.. i changed to use ';' in case the domain can't
| |
106 // "kUrlsSwitch". | |
107 const char* kOriginalDomains = "original-domains"; | |
108 | |
64 // Maximum number of concurrent started extractor requests. | 109 // Maximum number of concurrent started extractor requests. |
65 const int kMaxExtractorTasks = 8; | 110 const int kMaxExtractorTasks = 8; |
66 | 111 |
67 scoped_ptr<DomDistillerService> CreateDomDistillerService( | 112 scoped_ptr<DomDistillerService> CreateDomDistillerService( |
68 content::BrowserContext* context, | 113 content::BrowserContext* context, |
69 const base::FilePath& db_path) { | 114 const base::FilePath& db_path, |
115 const UrlToDomainMap& url_to_domain_map) { | |
70 scoped_refptr<base::SequencedTaskRunner> background_task_runner = | 116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = |
71 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( | 117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( |
72 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); | 118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); |
73 | 119 |
74 // TODO(cjhopman): use an in-memory database instead of an on-disk one with | 120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with |
75 // temporary directory. | 121 // temporary directory. |
76 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( | 122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( |
77 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( | 123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( |
78 background_task_runner)); | 124 background_task_runner)); |
79 scoped_ptr<DomDistillerStore> dom_distiller_store( | 125 scoped_ptr<DomDistillerStore> dom_distiller_store( |
(...skipping 10 matching lines...) Expand all Loading... | |
90 } | 136 } |
91 int debug_level = 0; | 137 int debug_level = 0; |
92 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && | 138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && |
93 base::StringToInt( | 139 base::StringToInt( |
94 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( | 140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( |
95 kDebugLevel), | 141 kDebugLevel), |
96 &debug_level)) { | 142 &debug_level)) { |
97 options.set_debug_level(debug_level); | 143 options.set_debug_level(debug_level); |
98 } | 144 } |
99 scoped_ptr<DistillerFactory> distiller_factory( | 145 scoped_ptr<DistillerFactory> distiller_factory( |
100 new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); | 146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), |
147 options, | |
148 url_to_domain_map)); | |
101 | 149 |
102 // Setting up PrefService for DistilledPagePrefs. | 150 // Setting up PrefService for DistilledPagePrefs. |
103 user_prefs::TestingPrefServiceSyncable* pref_service = | 151 user_prefs::TestingPrefServiceSyncable* pref_service = |
104 new user_prefs::TestingPrefServiceSyncable(); | 152 new user_prefs::TestingPrefServiceSyncable(); |
105 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); | 153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); |
106 | 154 |
107 return scoped_ptr<DomDistillerService>(new DomDistillerService( | 155 return scoped_ptr<DomDistillerService>(new DomDistillerService( |
108 dom_distiller_store.Pass(), | 156 dom_distiller_store.Pass(), |
109 distiller_factory.Pass(), | 157 distiller_factory.Pass(), |
110 distiller_page_factory.Pass(), | 158 distiller_page_factory.Pass(), |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
170 service->ViewUrl(this, | 218 service->ViewUrl(this, |
171 service->CreateDefaultDistillerPage(render_view_size), | 219 service->CreateDefaultDistillerPage(render_view_size), |
172 url_); | 220 url_); |
173 } | 221 } |
174 | 222 |
175 DistilledArticleProto GetArticleCopy() { | 223 DistilledArticleProto GetArticleCopy() { |
176 return *article_proto_; | 224 return *article_proto_; |
177 } | 225 } |
178 | 226 |
179 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( | 227 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( |
180 const CommandLine& command_line) { | 228 const CommandLine& command_line, |
229 UrlToDomainMap* url_to_domain_map) { | |
181 ScopedVector<ContentExtractionRequest> requests; | 230 ScopedVector<ContentExtractionRequest> requests; |
182 if (command_line.HasSwitch(kUrlSwitch)) { | 231 if (command_line.HasSwitch(kUrlSwitch)) { |
183 GURL url; | 232 GURL url; |
184 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); | 233 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); |
185 url = GURL(url_string); | 234 url = GURL(url_string); |
186 if (url.is_valid()) { | 235 if (url.is_valid()) { |
187 requests.push_back(new ContentExtractionRequest(url)); | 236 requests.push_back(new ContentExtractionRequest(url)); |
237 if (command_line.HasSwitch(kOriginalDomain)) { | |
238 (*url_to_domain_map)[url.spec()] = | |
239 command_line.GetSwitchValueASCII(kOriginalDomain); | |
240 } | |
188 } | 241 } |
189 } else if (command_line.HasSwitch(kUrlsSwitch)) { | 242 } else if (command_line.HasSwitch(kUrlsSwitch)) { |
190 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); | 243 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); |
191 std::vector<std::string> urls; | 244 std::vector<std::string> urls; |
192 base::SplitString(urls_string, ' ', &urls); | 245 base::SplitString(urls_string, ' ', &urls); |
246 // Check for the original-domains switch, whose entries must pair up exactly | |
247 // with |kUrlsSwitch|; if the counts differ, all domains are ignored. | |
248 std::vector<std::string> domains; | |
249 if (command_line.HasSwitch(kOriginalDomains)) { | |
250 std::string domains_string = | |
251 command_line.GetSwitchValueASCII( kOriginalDomains); | |
252 base::SplitString(domains_string, ';', &domains); | |
253 if (domains.size() != urls.size()) domains.clear(); | |
254 } | |
193 for (size_t i = 0; i < urls.size(); ++i) { | 255 for (size_t i = 0; i < urls.size(); ++i) { |
194 GURL url(urls[i]); | 256 GURL url(urls[i]); |
195 if (url.is_valid()) { | 257 if (url.is_valid()) { |
196 requests.push_back(new ContentExtractionRequest(url)); | 258 requests.push_back(new ContentExtractionRequest(url)); |
259 // Only record a mapping when the domain is non-empty. | |
260 if (!domains.empty() && !domains[i].empty()) { | |
261 (*url_to_domain_map)[url.spec()] = domains[i]; | |
262 } | |
197 } else { | 263 } else { |
198 ADD_FAILURE() << "Bad url"; | 264 ADD_FAILURE() << "Bad url"; |
199 } | 265 } |
200 } | 266 } |
201 } | 267 } |
202 if (requests.empty()) { | 268 if (requests.empty()) { |
203 ADD_FAILURE() << "No valid url provided"; | 269 ADD_FAILURE() << "No valid url provided"; |
204 } | 270 } |
205 | 271 |
206 return requests.Pass(); | 272 return requests.Pass(); |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
248 } | 314 } |
249 | 315 |
250 virtual void TearDownOnMainThread() override { | 316 virtual void TearDownOnMainThread() override { |
251 DisableDNSLookupForThisTest(); | 317 DisableDNSLookupForThisTest(); |
252 } | 318 } |
253 | 319 |
254 protected: | 320 protected: |
255 // Creates the DomDistillerService and creates and starts the extraction | 321 // Creates the DomDistillerService and creates and starts the extraction |
256 // request. | 322 // request. |
257 void Start() { | 323 void Start() { |
324 const CommandLine& command_line = *CommandLine::ForCurrentProcess(); | |
325 UrlToDomainMap url_to_domain_map; | |
326 requests_ = ContentExtractionRequest::CreateForCommandLine( | |
327 command_line, &url_to_domain_map); | |
258 content::BrowserContext* context = | 328 content::BrowserContext* context = |
259 shell()->web_contents()->GetBrowserContext(); | 329 shell()->web_contents()->GetBrowserContext(); |
260 service_ = CreateDomDistillerService(context, | 330 service_ = CreateDomDistillerService(context, |
261 db_dir_.path()); | 331 db_dir_.path(), |
262 const CommandLine& command_line = *CommandLine::ForCurrentProcess(); | 332 url_to_domain_map); |
263 requests_ = ContentExtractionRequest::CreateForCommandLine(command_line); | |
264 PumpQueue(); | 333 PumpQueue(); |
265 } | 334 } |
266 | 335 |
267 void PumpQueue() { | 336 void PumpQueue() { |
268 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { | 337 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { |
269 requests_[next_request_]->Start( | 338 requests_[next_request_]->Start( |
270 service_.get(), | 339 service_.get(), |
271 shell()->web_contents()->GetContainerBounds().size(), | 340 shell()->web_contents()->GetContainerBounds().size(), |
272 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); | 341 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); |
273 ++next_request_; | 342 ++next_request_; |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
342 std::string output_data_; | 411 std::string output_data_; |
343 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; | 412 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; |
344 }; | 413 }; |
345 | 414 |
346 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { | 415 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { |
347 Start(); | 416 Start(); |
348 base::RunLoop().Run(); | 417 base::RunLoop().Run(); |
349 } | 418 } |
350 | 419 |
351 } // namespace dom_distiller | 420 } // namespace dom_distiller |
OLD | NEW |