OLD | NEW |
| (Empty) |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <sstream> | |
6 | |
7 #include "base/command_line.h" | |
8 #include "base/files/scoped_temp_dir.h" | |
9 #include "base/id_map.h" | |
10 #include "base/message_loop/message_loop.h" | |
11 #include "base/path_service.h" | |
12 #include "base/run_loop.h" | |
13 #include "base/strings/string_number_conversions.h" | |
14 #include "base/strings/string_split.h" | |
15 #include "components/dom_distiller/content/distiller_page_web_contents.h" | |
16 #include "components/dom_distiller/core/article_entry.h" | |
17 #include "components/dom_distiller/core/distilled_page_prefs.h" | |
18 #include "components/dom_distiller/core/distiller.h" | |
19 #include "components/dom_distiller/core/dom_distiller_service.h" | |
20 #include "components/dom_distiller/core/dom_distiller_store.h" | |
21 #include "components/dom_distiller/core/proto/distilled_article.pb.h" | |
22 #include "components/dom_distiller/core/proto/distilled_page.pb.h" | |
23 #include "components/dom_distiller/core/task_tracker.h" | |
24 #include "components/leveldb_proto/proto_database.h" | |
25 #include "components/leveldb_proto/proto_database_impl.h" | |
26 #include "components/pref_registry/testing_pref_service_syncable.h" | |
27 #include "content/public/browser/browser_context.h" | |
28 #include "content/public/browser/browser_thread.h" | |
29 #include "content/public/test/content_browser_test.h" | |
30 #include "content/shell/browser/shell.h" | |
31 #include "google/protobuf/io/coded_stream.h" | |
32 #include "google/protobuf/io/zero_copy_stream_impl_lite.h" | |
33 #include "net/dns/mock_host_resolver.h" | |
34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" | |
35 #include "ui/base/resource/resource_bundle.h" | |
36 | |
37 using content::ContentBrowserTest; | |
38 | |
39 namespace dom_distiller { | |
40 | |
41 namespace { | |
42 | |
43 typedef base::hash_map<std::string, std::string> UrlToDomainMap; | |
44 | |
45 } | |
46 | |
47 // Factory for creating a Distiller that creates different DomDistillerOptions | |
48 // for different URLs, i.e. a specific kOriginalDomain option for each URL. | |
49 class TestDistillerFactoryImpl : public DistillerFactory { | |
50 public: | |
51 TestDistillerFactoryImpl( | |
52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, | |
53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, | |
54 const UrlToDomainMap& url_to_domain_map) | |
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), | |
56 dom_distiller_options_(dom_distiller_options), | |
57 url_to_domain_map_(url_to_domain_map) { | |
58 } | |
59 | |
60 ~TestDistillerFactoryImpl() override {} | |
61 | |
62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { | |
63 dom_distiller::proto::DomDistillerOptions options; | |
64 options = dom_distiller_options_; | |
65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); | |
66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); | |
67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | |
68 *distiller_url_fetcher_factory_, options)); | |
69 return distiller.Pass(); | |
70 } | |
71 | |
72 private: | |
73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; | |
74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; | |
75 UrlToDomainMap url_to_domain_map_; | |
76 }; | |
77 | |
78 namespace { | |
79 | |
// Command-line switch names understood by this test. They are declared as
// constant character arrays (rather than mutable `const char*` pointers) so
// that the names themselves cannot be reassigned at runtime.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Integer debug level to set on the distiller options; the switch is ignored
// if its value does not parse as an integer.
const char kDebugLevel[] = "debug-level";

// The original domain of the page if |kUrlSwitch| is a file.
const char kOriginalDomain[] = "original-domain";

// A semicolon-separated (i.e. ';') list of original domains corresponding to
// |kUrlsSwitch|.
const char kOriginalDomains[] = "original-domains";

// Maximum number of concurrently started extractor requests.
const int kMaxExtractorTasks = 8;
111 | |
112 scoped_ptr<DomDistillerService> CreateDomDistillerService( | |
113 content::BrowserContext* context, | |
114 const base::FilePath& db_path, | |
115 const UrlToDomainMap& url_to_domain_map) { | |
116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = | |
117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( | |
118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); | |
119 | |
120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with | |
121 // temporary directory. | |
122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( | |
123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( | |
124 background_task_runner)); | |
125 scoped_ptr<DomDistillerStore> dom_distiller_store( | |
126 new DomDistillerStore(db.Pass(), db_path)); | |
127 | |
128 scoped_ptr<DistillerPageFactory> distiller_page_factory( | |
129 new DistillerPageWebContentsFactory(context)); | |
130 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory( | |
131 new DistillerURLFetcherFactory(context->GetRequestContext())); | |
132 | |
133 dom_distiller::proto::DomDistillerOptions options; | |
134 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) { | |
135 options.set_extract_text_only(true); | |
136 } | |
137 int debug_level = 0; | |
138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && | |
139 base::StringToInt( | |
140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( | |
141 kDebugLevel), | |
142 &debug_level)) { | |
143 options.set_debug_level(debug_level); | |
144 } | |
145 scoped_ptr<DistillerFactory> distiller_factory( | |
146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), | |
147 options, | |
148 url_to_domain_map)); | |
149 | |
150 // Setting up PrefService for DistilledPagePrefs. | |
151 user_prefs::TestingPrefServiceSyncable* pref_service = | |
152 new user_prefs::TestingPrefServiceSyncable(); | |
153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); | |
154 | |
155 return scoped_ptr<DomDistillerService>(new DomDistillerService( | |
156 dom_distiller_store.Pass(), | |
157 distiller_factory.Pass(), | |
158 distiller_page_factory.Pass(), | |
159 scoped_ptr<DistilledPagePrefs>(new DistilledPagePrefs(pref_service)))); | |
160 } | |
161 | |
162 void AddComponentsTestResources() { | |
163 base::FilePath pak_file; | |
164 base::FilePath pak_dir; | |
165 PathService::Get(base::DIR_MODULE, &pak_dir); | |
166 pak_file = | |
167 pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak")); | |
168 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath( | |
169 pak_file, ui::SCALE_FACTOR_NONE); | |
170 } | |
171 | |
172 bool WriteProtobufWithSize( | |
173 const google::protobuf::MessageLite& message, | |
174 google::protobuf::io::ZeroCopyOutputStream* output_stream) { | |
175 google::protobuf::io::CodedOutputStream coded_output(output_stream); | |
176 | |
177 // Write the size. | |
178 const int size = message.ByteSize(); | |
179 coded_output.WriteLittleEndian32(size); | |
180 message.SerializeWithCachedSizes(&coded_output); | |
181 return !coded_output.HadError(); | |
182 } | |
183 | |
184 std::string GetReadableArticleString( | |
185 const DistilledArticleProto& article_proto) { | |
186 std::stringstream output; | |
187 output << "Article Title: " << article_proto.title() << std::endl; | |
188 output << "# of pages: " << article_proto.pages_size() << std::endl; | |
189 for (int i = 0; i < article_proto.pages_size(); ++i) { | |
190 if (i > 0) output << std::endl; | |
191 const DistilledPageProto& page = article_proto.pages(i); | |
192 output << "Page " << i << std::endl; | |
193 output << "URL: " << page.url() << std::endl; | |
194 output << "Content: " << page.html() << std::endl; | |
195 if (page.has_debug_info() && page.debug_info().has_log()) | |
196 output << "Log: " << page.debug_info().log() << std::endl; | |
197 if (page.has_pagination_info()) { | |
198 if (page.pagination_info().has_next_page()) { | |
199 output << "Next Page: " << page.pagination_info().next_page() | |
200 << std::endl; | |
201 } | |
202 if (page.pagination_info().has_prev_page()) { | |
203 output << "Prev Page: " << page.pagination_info().prev_page() | |
204 << std::endl; | |
205 } | |
206 } | |
207 } | |
208 return output.str(); | |
209 } | |
210 | |
211 } // namespace | |
212 | |
213 class ContentExtractionRequest : public ViewRequestDelegate { | |
214 public: | |
215 void Start(DomDistillerService* service, const gfx::Size& render_view_size, | |
216 base::Closure finished_callback) { | |
217 finished_callback_ = finished_callback; | |
218 viewer_handle_ = | |
219 service->ViewUrl(this, | |
220 service->CreateDefaultDistillerPage(render_view_size), | |
221 url_); | |
222 } | |
223 | |
224 DistilledArticleProto GetArticleCopy() { | |
225 return *article_proto_; | |
226 } | |
227 | |
228 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( | |
229 const base::CommandLine& command_line, | |
230 UrlToDomainMap* url_to_domain_map) { | |
231 ScopedVector<ContentExtractionRequest> requests; | |
232 if (command_line.HasSwitch(kUrlSwitch)) { | |
233 GURL url; | |
234 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); | |
235 url = GURL(url_string); | |
236 if (url.is_valid()) { | |
237 requests.push_back(new ContentExtractionRequest(url)); | |
238 if (command_line.HasSwitch(kOriginalDomain)) { | |
239 (*url_to_domain_map)[url.spec()] = | |
240 command_line.GetSwitchValueASCII(kOriginalDomain); | |
241 } | |
242 } | |
243 } else if (command_line.HasSwitch(kUrlsSwitch)) { | |
244 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); | |
245 std::vector<std::string> urls; | |
246 base::SplitString(urls_string, ' ', &urls); | |
247 // Check for original-domains switch, which must exactly pair up with | |
248 // |kUrlsSwitch| i.e. number of domains must be same as that of urls. | |
249 std::vector<std::string> domains; | |
250 if (command_line.HasSwitch(kOriginalDomains)) { | |
251 std::string domains_string = | |
252 command_line.GetSwitchValueASCII( kOriginalDomains); | |
253 base::SplitString(domains_string, ';', &domains); | |
254 if (domains.size() != urls.size()) domains.clear(); | |
255 } | |
256 for (size_t i = 0; i < urls.size(); ++i) { | |
257 GURL url(urls[i]); | |
258 if (url.is_valid()) { | |
259 requests.push_back(new ContentExtractionRequest(url)); | |
260 // Only regard non-empty domain. | |
261 if (!domains.empty() && !domains[i].empty()) { | |
262 (*url_to_domain_map)[url.spec()] = domains[i]; | |
263 } | |
264 } else { | |
265 ADD_FAILURE() << "Bad url"; | |
266 } | |
267 } | |
268 } | |
269 if (requests.empty()) { | |
270 ADD_FAILURE() << "No valid url provided"; | |
271 } | |
272 | |
273 return requests.Pass(); | |
274 } | |
275 | |
276 private: | |
277 ContentExtractionRequest(const GURL& url) : url_(url) {} | |
278 | |
279 void OnArticleUpdated(ArticleDistillationUpdate article_update) override {} | |
280 | |
281 void OnArticleReady(const DistilledArticleProto* article_proto) override { | |
282 article_proto_ = article_proto; | |
283 CHECK(article_proto->pages_size()) << "Failed extracting " << url_; | |
284 base::MessageLoop::current()->PostTask( | |
285 FROM_HERE, | |
286 finished_callback_); | |
287 } | |
288 | |
289 const DistilledArticleProto* article_proto_; | |
290 scoped_ptr<ViewerHandle> viewer_handle_; | |
291 GURL url_; | |
292 base::Closure finished_callback_; | |
293 }; | |
294 | |
295 class ContentExtractor : public ContentBrowserTest { | |
296 public: | |
297 ContentExtractor() | |
298 : pending_tasks_(0), | |
299 max_tasks_(kMaxExtractorTasks), | |
300 next_request_(0), | |
301 output_data_(), | |
302 protobuf_output_stream_( | |
303 new google::protobuf::io::StringOutputStream(&output_data_)) {} | |
304 | |
305 // Change behavior of the default host resolver to avoid DNS lookup errors, so | |
306 // we can make network calls. | |
307 void SetUpOnMainThread() override { | |
308 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) { | |
309 EnableDNSLookupForThisTest(); | |
310 } | |
311 CHECK(db_dir_.CreateUniqueTempDir()); | |
312 AddComponentsTestResources(); | |
313 } | |
314 | |
315 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); } | |
316 | |
317 protected: | |
318 // Creates the DomDistillerService and creates and starts the extraction | |
319 // request. | |
320 void Start() { | |
321 const base::CommandLine& command_line = | |
322 *base::CommandLine::ForCurrentProcess(); | |
323 UrlToDomainMap url_to_domain_map; | |
324 requests_ = ContentExtractionRequest::CreateForCommandLine( | |
325 command_line, &url_to_domain_map); | |
326 content::BrowserContext* context = | |
327 shell()->web_contents()->GetBrowserContext(); | |
328 service_ = CreateDomDistillerService(context, | |
329 db_dir_.path(), | |
330 url_to_domain_map); | |
331 PumpQueue(); | |
332 } | |
333 | |
334 void PumpQueue() { | |
335 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { | |
336 requests_[next_request_]->Start( | |
337 service_.get(), | |
338 shell()->web_contents()->GetContainerBounds().size(), | |
339 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); | |
340 ++next_request_; | |
341 ++pending_tasks_; | |
342 } | |
343 } | |
344 | |
345 private: | |
346 // Change behavior of the default host resolver to allow DNS lookup | |
347 // to proceed instead of being blocked by the test infrastructure. | |
348 void EnableDNSLookupForThisTest() { | |
349 // mock_host_resolver_override_ takes ownership of the resolver. | |
350 scoped_refptr<net::RuleBasedHostResolverProc> resolver = | |
351 new net::RuleBasedHostResolverProc(host_resolver()); | |
352 resolver->AllowDirectLookup("*"); | |
353 mock_host_resolver_override_.reset( | |
354 new net::ScopedDefaultHostResolverProc(resolver.get())); | |
355 } | |
356 | |
357 // We need to reset the DNS lookup when we finish, or the test will fail. | |
358 void DisableDNSLookupForThisTest() { | |
359 mock_host_resolver_override_.reset(); | |
360 } | |
361 | |
362 void FinishRequest() { | |
363 --pending_tasks_; | |
364 if (next_request_ == requests_.size() && pending_tasks_ == 0) { | |
365 Finish(); | |
366 } else { | |
367 PumpQueue(); | |
368 } | |
369 } | |
370 | |
371 void DoArticleOutput() { | |
372 const base::CommandLine& command_line = | |
373 *base::CommandLine::ForCurrentProcess(); | |
374 for (size_t i = 0; i < requests_.size(); ++i) { | |
375 const DistilledArticleProto& article = requests_[i]->GetArticleCopy(); | |
376 if (command_line.HasSwitch(kShouldOutputBinary)) { | |
377 WriteProtobufWithSize(article, protobuf_output_stream_.get()); | |
378 } else { | |
379 output_data_ += GetReadableArticleString(article) + "\n"; | |
380 } | |
381 } | |
382 | |
383 if (command_line.HasSwitch(kOutputFile)) { | |
384 base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile); | |
385 ASSERT_EQ( | |
386 (int)output_data_.size(), | |
387 base::WriteFile(filename, output_data_.c_str(), output_data_.size())); | |
388 } else { | |
389 VLOG(0) << output_data_; | |
390 } | |
391 } | |
392 | |
393 void Finish() { | |
394 DoArticleOutput(); | |
395 requests_.clear(); | |
396 service_.reset(); | |
397 base::MessageLoop::current()->PostTask( | |
398 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure()); | |
399 } | |
400 | |
401 size_t pending_tasks_; | |
402 size_t max_tasks_; | |
403 size_t next_request_; | |
404 | |
405 base::ScopedTempDir db_dir_; | |
406 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_; | |
407 scoped_ptr<DomDistillerService> service_; | |
408 ScopedVector<ContentExtractionRequest> requests_; | |
409 | |
410 std::string output_data_; | |
411 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; | |
412 }; | |
413 | |
414 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { | |
415 Start(); | |
416 base::RunLoop().Run(); | |
417 } | |
418 | |
419 } // namespace dom_distiller | |
OLD | NEW |