| OLD | NEW |
| (Empty) |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include <sstream> | |
| 6 | |
| 7 #include "base/command_line.h" | |
| 8 #include "base/files/scoped_temp_dir.h" | |
| 9 #include "base/id_map.h" | |
| 10 #include "base/message_loop/message_loop.h" | |
| 11 #include "base/path_service.h" | |
| 12 #include "base/run_loop.h" | |
| 13 #include "base/strings/string_number_conversions.h" | |
| 14 #include "base/strings/string_split.h" | |
| 15 #include "components/dom_distiller/content/distiller_page_web_contents.h" | |
| 16 #include "components/dom_distiller/core/article_entry.h" | |
| 17 #include "components/dom_distiller/core/distilled_page_prefs.h" | |
| 18 #include "components/dom_distiller/core/distiller.h" | |
| 19 #include "components/dom_distiller/core/dom_distiller_service.h" | |
| 20 #include "components/dom_distiller/core/dom_distiller_store.h" | |
| 21 #include "components/dom_distiller/core/proto/distilled_article.pb.h" | |
| 22 #include "components/dom_distiller/core/proto/distilled_page.pb.h" | |
| 23 #include "components/dom_distiller/core/task_tracker.h" | |
| 24 #include "components/leveldb_proto/proto_database.h" | |
| 25 #include "components/leveldb_proto/proto_database_impl.h" | |
| 26 #include "components/pref_registry/testing_pref_service_syncable.h" | |
| 27 #include "content/public/browser/browser_context.h" | |
| 28 #include "content/public/browser/browser_thread.h" | |
| 29 #include "content/public/test/content_browser_test.h" | |
| 30 #include "content/shell/browser/shell.h" | |
| 31 #include "google/protobuf/io/coded_stream.h" | |
| 32 #include "google/protobuf/io/zero_copy_stream_impl_lite.h" | |
| 33 #include "net/dns/mock_host_resolver.h" | |
| 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" | |
| 35 #include "ui/base/resource/resource_bundle.h" | |
| 36 | |
| 37 using content::ContentBrowserTest; | |
| 38 | |
| 39 namespace dom_distiller { | |
| 40 | |
| 41 namespace { | |
| 42 | |
| 43 typedef base::hash_map<std::string, std::string> UrlToDomainMap; | |
| 44 | |
| 45 } | |
| 46 | |
| 47 // Factory for creating a Distiller that creates different DomDistillerOptions | |
| 48 // for different URLs, i.e. a specific kOriginalDomain option for each URL. | |
| 49 class TestDistillerFactoryImpl : public DistillerFactory { | |
| 50 public: | |
| 51 TestDistillerFactoryImpl( | |
| 52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, | |
| 53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, | |
| 54 const UrlToDomainMap& url_to_domain_map) | |
| 55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), | |
| 56 dom_distiller_options_(dom_distiller_options), | |
| 57 url_to_domain_map_(url_to_domain_map) { | |
| 58 } | |
| 59 | |
| 60 ~TestDistillerFactoryImpl() override {} | |
| 61 | |
| 62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { | |
| 63 dom_distiller::proto::DomDistillerOptions options; | |
| 64 options = dom_distiller_options_; | |
| 65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); | |
| 66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); | |
| 67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | |
| 68 *distiller_url_fetcher_factory_, options)); | |
| 69 return distiller.Pass(); | |
| 70 } | |
| 71 | |
| 72 private: | |
| 73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; | |
| 74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; | |
| 75 UrlToDomainMap url_to_domain_map_; | |
| 76 }; | |
| 77 | |
| 78 namespace { | |
| 79 | |
// Command-line switches understood by this test. They are declared as constant
// character arrays rather than `const char*` pointers so the names themselves
// cannot be re-pointed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Integer debug level forwarded to the distiller options; a value that does
// not parse as an integer is ignored.
const char kDebugLevel[] = "debug-level";

// The original domain of the page if |kUrlSwitch| is a file.
const char kOriginalDomain[] = "original-domain";

// A semicolon-separated (i.e. ';') list of original domains corresponding to
// |kUrlsSwitch|. The list is only honored when it has exactly as many entries
// as there are urls.
const char kOriginalDomains[] = "original-domains";

// Maximum number of concurrently started extractor requests.
const int kMaxExtractorTasks = 8;
| 111 | |
| 112 scoped_ptr<DomDistillerService> CreateDomDistillerService( | |
| 113 content::BrowserContext* context, | |
| 114 const base::FilePath& db_path, | |
| 115 const UrlToDomainMap& url_to_domain_map) { | |
| 116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = | |
| 117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( | |
| 118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); | |
| 119 | |
| 120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with | |
| 121 // temporary directory. | |
| 122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( | |
| 123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( | |
| 124 background_task_runner)); | |
| 125 scoped_ptr<DomDistillerStore> dom_distiller_store( | |
| 126 new DomDistillerStore(db.Pass(), db_path)); | |
| 127 | |
| 128 scoped_ptr<DistillerPageFactory> distiller_page_factory( | |
| 129 new DistillerPageWebContentsFactory(context)); | |
| 130 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory( | |
| 131 new DistillerURLFetcherFactory(context->GetRequestContext())); | |
| 132 | |
| 133 dom_distiller::proto::DomDistillerOptions options; | |
| 134 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) { | |
| 135 options.set_extract_text_only(true); | |
| 136 } | |
| 137 int debug_level = 0; | |
| 138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && | |
| 139 base::StringToInt( | |
| 140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( | |
| 141 kDebugLevel), | |
| 142 &debug_level)) { | |
| 143 options.set_debug_level(debug_level); | |
| 144 } | |
| 145 scoped_ptr<DistillerFactory> distiller_factory( | |
| 146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), | |
| 147 options, | |
| 148 url_to_domain_map)); | |
| 149 | |
| 150 // Setting up PrefService for DistilledPagePrefs. | |
| 151 user_prefs::TestingPrefServiceSyncable* pref_service = | |
| 152 new user_prefs::TestingPrefServiceSyncable(); | |
| 153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); | |
| 154 | |
| 155 return scoped_ptr<DomDistillerService>(new DomDistillerService( | |
| 156 dom_distiller_store.Pass(), | |
| 157 distiller_factory.Pass(), | |
| 158 distiller_page_factory.Pass(), | |
| 159 scoped_ptr<DistilledPagePrefs>(new DistilledPagePrefs(pref_service)))); | |
| 160 } | |
| 161 | |
| 162 void AddComponentsTestResources() { | |
| 163 base::FilePath pak_file; | |
| 164 base::FilePath pak_dir; | |
| 165 PathService::Get(base::DIR_MODULE, &pak_dir); | |
| 166 pak_file = | |
| 167 pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak")); | |
| 168 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath( | |
| 169 pak_file, ui::SCALE_FACTOR_NONE); | |
| 170 } | |
| 171 | |
| 172 bool WriteProtobufWithSize( | |
| 173 const google::protobuf::MessageLite& message, | |
| 174 google::protobuf::io::ZeroCopyOutputStream* output_stream) { | |
| 175 google::protobuf::io::CodedOutputStream coded_output(output_stream); | |
| 176 | |
| 177 // Write the size. | |
| 178 const int size = message.ByteSize(); | |
| 179 coded_output.WriteLittleEndian32(size); | |
| 180 message.SerializeWithCachedSizes(&coded_output); | |
| 181 return !coded_output.HadError(); | |
| 182 } | |
| 183 | |
| 184 std::string GetReadableArticleString( | |
| 185 const DistilledArticleProto& article_proto) { | |
| 186 std::stringstream output; | |
| 187 output << "Article Title: " << article_proto.title() << std::endl; | |
| 188 output << "# of pages: " << article_proto.pages_size() << std::endl; | |
| 189 for (int i = 0; i < article_proto.pages_size(); ++i) { | |
| 190 if (i > 0) output << std::endl; | |
| 191 const DistilledPageProto& page = article_proto.pages(i); | |
| 192 output << "Page " << i << std::endl; | |
| 193 output << "URL: " << page.url() << std::endl; | |
| 194 output << "Content: " << page.html() << std::endl; | |
| 195 if (page.has_debug_info() && page.debug_info().has_log()) | |
| 196 output << "Log: " << page.debug_info().log() << std::endl; | |
| 197 if (page.has_pagination_info()) { | |
| 198 if (page.pagination_info().has_next_page()) { | |
| 199 output << "Next Page: " << page.pagination_info().next_page() | |
| 200 << std::endl; | |
| 201 } | |
| 202 if (page.pagination_info().has_prev_page()) { | |
| 203 output << "Prev Page: " << page.pagination_info().prev_page() | |
| 204 << std::endl; | |
| 205 } | |
| 206 } | |
| 207 } | |
| 208 return output.str(); | |
| 209 } | |
| 210 | |
| 211 } // namespace | |
| 212 | |
| 213 class ContentExtractionRequest : public ViewRequestDelegate { | |
| 214 public: | |
| 215 void Start(DomDistillerService* service, const gfx::Size& render_view_size, | |
| 216 base::Closure finished_callback) { | |
| 217 finished_callback_ = finished_callback; | |
| 218 viewer_handle_ = | |
| 219 service->ViewUrl(this, | |
| 220 service->CreateDefaultDistillerPage(render_view_size), | |
| 221 url_); | |
| 222 } | |
| 223 | |
| 224 DistilledArticleProto GetArticleCopy() { | |
| 225 return *article_proto_; | |
| 226 } | |
| 227 | |
| 228 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( | |
| 229 const base::CommandLine& command_line, | |
| 230 UrlToDomainMap* url_to_domain_map) { | |
| 231 ScopedVector<ContentExtractionRequest> requests; | |
| 232 if (command_line.HasSwitch(kUrlSwitch)) { | |
| 233 GURL url; | |
| 234 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); | |
| 235 url = GURL(url_string); | |
| 236 if (url.is_valid()) { | |
| 237 requests.push_back(new ContentExtractionRequest(url)); | |
| 238 if (command_line.HasSwitch(kOriginalDomain)) { | |
| 239 (*url_to_domain_map)[url.spec()] = | |
| 240 command_line.GetSwitchValueASCII(kOriginalDomain); | |
| 241 } | |
| 242 } | |
| 243 } else if (command_line.HasSwitch(kUrlsSwitch)) { | |
| 244 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); | |
| 245 std::vector<std::string> urls; | |
| 246 base::SplitString(urls_string, ' ', &urls); | |
| 247 // Check for original-domains switch, which must exactly pair up with | |
| 248 // |kUrlsSwitch| i.e. number of domains must be same as that of urls. | |
| 249 std::vector<std::string> domains; | |
| 250 if (command_line.HasSwitch(kOriginalDomains)) { | |
| 251 std::string domains_string = | |
| 252 command_line.GetSwitchValueASCII( kOriginalDomains); | |
| 253 base::SplitString(domains_string, ';', &domains); | |
| 254 if (domains.size() != urls.size()) domains.clear(); | |
| 255 } | |
| 256 for (size_t i = 0; i < urls.size(); ++i) { | |
| 257 GURL url(urls[i]); | |
| 258 if (url.is_valid()) { | |
| 259 requests.push_back(new ContentExtractionRequest(url)); | |
| 260 // Only regard non-empty domain. | |
| 261 if (!domains.empty() && !domains[i].empty()) { | |
| 262 (*url_to_domain_map)[url.spec()] = domains[i]; | |
| 263 } | |
| 264 } else { | |
| 265 ADD_FAILURE() << "Bad url"; | |
| 266 } | |
| 267 } | |
| 268 } | |
| 269 if (requests.empty()) { | |
| 270 ADD_FAILURE() << "No valid url provided"; | |
| 271 } | |
| 272 | |
| 273 return requests.Pass(); | |
| 274 } | |
| 275 | |
| 276 private: | |
| 277 ContentExtractionRequest(const GURL& url) : url_(url) {} | |
| 278 | |
| 279 void OnArticleUpdated(ArticleDistillationUpdate article_update) override {} | |
| 280 | |
| 281 void OnArticleReady(const DistilledArticleProto* article_proto) override { | |
| 282 article_proto_ = article_proto; | |
| 283 CHECK(article_proto->pages_size()) << "Failed extracting " << url_; | |
| 284 base::MessageLoop::current()->PostTask( | |
| 285 FROM_HERE, | |
| 286 finished_callback_); | |
| 287 } | |
| 288 | |
| 289 const DistilledArticleProto* article_proto_; | |
| 290 scoped_ptr<ViewerHandle> viewer_handle_; | |
| 291 GURL url_; | |
| 292 base::Closure finished_callback_; | |
| 293 }; | |
| 294 | |
| 295 class ContentExtractor : public ContentBrowserTest { | |
| 296 public: | |
| 297 ContentExtractor() | |
| 298 : pending_tasks_(0), | |
| 299 max_tasks_(kMaxExtractorTasks), | |
| 300 next_request_(0), | |
| 301 output_data_(), | |
| 302 protobuf_output_stream_( | |
| 303 new google::protobuf::io::StringOutputStream(&output_data_)) {} | |
| 304 | |
| 305 // Change behavior of the default host resolver to avoid DNS lookup errors, so | |
| 306 // we can make network calls. | |
| 307 void SetUpOnMainThread() override { | |
| 308 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) { | |
| 309 EnableDNSLookupForThisTest(); | |
| 310 } | |
| 311 CHECK(db_dir_.CreateUniqueTempDir()); | |
| 312 AddComponentsTestResources(); | |
| 313 } | |
| 314 | |
| 315 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); } | |
| 316 | |
| 317 protected: | |
| 318 // Creates the DomDistillerService and creates and starts the extraction | |
| 319 // request. | |
| 320 void Start() { | |
| 321 const base::CommandLine& command_line = | |
| 322 *base::CommandLine::ForCurrentProcess(); | |
| 323 UrlToDomainMap url_to_domain_map; | |
| 324 requests_ = ContentExtractionRequest::CreateForCommandLine( | |
| 325 command_line, &url_to_domain_map); | |
| 326 content::BrowserContext* context = | |
| 327 shell()->web_contents()->GetBrowserContext(); | |
| 328 service_ = CreateDomDistillerService(context, | |
| 329 db_dir_.path(), | |
| 330 url_to_domain_map); | |
| 331 PumpQueue(); | |
| 332 } | |
| 333 | |
| 334 void PumpQueue() { | |
| 335 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { | |
| 336 requests_[next_request_]->Start( | |
| 337 service_.get(), | |
| 338 shell()->web_contents()->GetContainerBounds().size(), | |
| 339 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); | |
| 340 ++next_request_; | |
| 341 ++pending_tasks_; | |
| 342 } | |
| 343 } | |
| 344 | |
| 345 private: | |
| 346 // Change behavior of the default host resolver to allow DNS lookup | |
| 347 // to proceed instead of being blocked by the test infrastructure. | |
| 348 void EnableDNSLookupForThisTest() { | |
| 349 // mock_host_resolver_override_ takes ownership of the resolver. | |
| 350 scoped_refptr<net::RuleBasedHostResolverProc> resolver = | |
| 351 new net::RuleBasedHostResolverProc(host_resolver()); | |
| 352 resolver->AllowDirectLookup("*"); | |
| 353 mock_host_resolver_override_.reset( | |
| 354 new net::ScopedDefaultHostResolverProc(resolver.get())); | |
| 355 } | |
| 356 | |
| 357 // We need to reset the DNS lookup when we finish, or the test will fail. | |
| 358 void DisableDNSLookupForThisTest() { | |
| 359 mock_host_resolver_override_.reset(); | |
| 360 } | |
| 361 | |
| 362 void FinishRequest() { | |
| 363 --pending_tasks_; | |
| 364 if (next_request_ == requests_.size() && pending_tasks_ == 0) { | |
| 365 Finish(); | |
| 366 } else { | |
| 367 PumpQueue(); | |
| 368 } | |
| 369 } | |
| 370 | |
| 371 void DoArticleOutput() { | |
| 372 const base::CommandLine& command_line = | |
| 373 *base::CommandLine::ForCurrentProcess(); | |
| 374 for (size_t i = 0; i < requests_.size(); ++i) { | |
| 375 const DistilledArticleProto& article = requests_[i]->GetArticleCopy(); | |
| 376 if (command_line.HasSwitch(kShouldOutputBinary)) { | |
| 377 WriteProtobufWithSize(article, protobuf_output_stream_.get()); | |
| 378 } else { | |
| 379 output_data_ += GetReadableArticleString(article) + "\n"; | |
| 380 } | |
| 381 } | |
| 382 | |
| 383 if (command_line.HasSwitch(kOutputFile)) { | |
| 384 base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile); | |
| 385 ASSERT_EQ( | |
| 386 (int)output_data_.size(), | |
| 387 base::WriteFile(filename, output_data_.c_str(), output_data_.size())); | |
| 388 } else { | |
| 389 VLOG(0) << output_data_; | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 void Finish() { | |
| 394 DoArticleOutput(); | |
| 395 requests_.clear(); | |
| 396 service_.reset(); | |
| 397 base::MessageLoop::current()->PostTask( | |
| 398 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure()); | |
| 399 } | |
| 400 | |
| 401 size_t pending_tasks_; | |
| 402 size_t max_tasks_; | |
| 403 size_t next_request_; | |
| 404 | |
| 405 base::ScopedTempDir db_dir_; | |
| 406 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_; | |
| 407 scoped_ptr<DomDistillerService> service_; | |
| 408 ScopedVector<ContentExtractionRequest> requests_; | |
| 409 | |
| 410 std::string output_data_; | |
| 411 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; | |
| 412 }; | |
| 413 | |
| 414 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { | |
| 415 Start(); | |
| 416 base::RunLoop().Run(); | |
| 417 } | |
| 418 | |
| 419 } // namespace dom_distiller | |
| OLD | NEW |