| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include <sstream> | 5 #include <sstream> |
| 6 | 6 |
| 7 #include "base/command_line.h" | 7 #include "base/command_line.h" |
| 8 #include "base/files/scoped_temp_dir.h" | 8 #include "base/files/scoped_temp_dir.h" |
| 9 #include "base/id_map.h" | 9 #include "base/id_map.h" |
| 10 #include "base/message_loop/message_loop.h" | 10 #include "base/message_loop/message_loop.h" |
| (...skipping 22 matching lines...) Expand all Loading... |
| 33 #include "net/dns/mock_host_resolver.h" | 33 #include "net/dns/mock_host_resolver.h" |
| 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" | 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" |
| 35 #include "ui/base/resource/resource_bundle.h" | 35 #include "ui/base/resource/resource_bundle.h" |
| 36 | 36 |
| 37 using content::ContentBrowserTest; | 37 using content::ContentBrowserTest; |
| 38 | 38 |
| 39 namespace dom_distiller { | 39 namespace dom_distiller { |
| 40 | 40 |
| 41 namespace { | 41 namespace { |
| 42 | 42 |
| 43 typedef base::hash_map<std::string, std::string> UrlToDomainMap; | 43 typedef base::hash_map<std::string, std::string> FileToUrlMap; |
| 44 | 44 |
| 45 } | 45 } |
| 46 | 46 |
| 47 // Factory for creating a Distiller that creates different DomDistillerOptions | 47 // Factory for creating a Distiller that creates different DomDistillerOptions |
| 48 // for different URLs, i.e. a specific kOriginalDomain option for each URL. | 48 // for different URLs, i.e. a specific kOriginalUrl option for each URL. |
| 49 class TestDistillerFactoryImpl : public DistillerFactory { | 49 class TestDistillerFactoryImpl : public DistillerFactory { |
| 50 public: | 50 public: |
| 51 TestDistillerFactoryImpl( | 51 TestDistillerFactoryImpl( |
| 52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, | 52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, |
| 53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, | 53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, |
| 54 const UrlToDomainMap& url_to_domain_map) | 54 const FileToUrlMap& file_to_url_map) |
| 55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), | 55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), |
| 56 dom_distiller_options_(dom_distiller_options), | 56 dom_distiller_options_(dom_distiller_options), |
| 57 url_to_domain_map_(url_to_domain_map) { | 57 file_to_url_map_(file_to_url_map) { |
| 58 } | 58 } |
| 59 | 59 |
| 60 ~TestDistillerFactoryImpl() override {} | 60 ~TestDistillerFactoryImpl() override {} |
| 61 | 61 |
| 62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { | 62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { |
| 63 dom_distiller::proto::DomDistillerOptions options; | 63 dom_distiller::proto::DomDistillerOptions options; |
| 64 options = dom_distiller_options_; | 64 options = dom_distiller_options_; |
| 65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); | 65 FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec()); |
| 66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); | 66 if (it != file_to_url_map_.end()) { |
| 67 options.set_original_url(it->second); |
| 68 } |
| 67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( | 69 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( |
| 68 *distiller_url_fetcher_factory_, options)); | 70 *distiller_url_fetcher_factory_, options)); |
| 69 return distiller.Pass(); | 71 return distiller.Pass(); |
| 70 } | 72 } |
| 71 | 73 |
| 72 private: | 74 private: |
| 73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; | 75 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; |
| 74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; | 76 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; |
| 75 UrlToDomainMap url_to_domain_map_; | 77 FileToUrlMap file_to_url_map_; |
| 76 }; | 78 }; |
| 77 | 79 |
| 78 namespace { | 80 namespace { |
| 79 | 81 |
| 80 // The url to distill. | 82 // The url to distill. |
| 81 const char* kUrlSwitch = "url"; | 83 const char* kUrlSwitch = "url"; |
| 82 | 84 |
| 83 // A space-separated list of urls to distill. | 85 // A space-separated list of urls to distill. |
| 84 const char* kUrlsSwitch = "urls"; | 86 const char* kUrlsSwitch = "urls"; |
| 85 | 87 |
| 86 // Indicates that DNS resolution should be disabled for this test. | 88 // Indicates that DNS resolution should be disabled for this test. |
| 87 const char* kDisableDnsSwitch = "disable-dns"; | 89 const char* kDisableDnsSwitch = "disable-dns"; |
| 88 | 90 |
| 89 // Will write the distilled output to the given file instead of to stdout. | 91 // Will write the distilled output to the given file instead of to stdout. |
| 90 const char* kOutputFile = "output-file"; | 92 const char* kOutputFile = "output-file"; |
| 91 | 93 |
| 92 // Indicates to output a serialized protocol buffer instead of human-readable | 94 // Indicates to output a serialized protocol buffer instead of human-readable |
| 93 // output. | 95 // output. |
| 94 const char* kShouldOutputBinary = "output-binary"; | 96 const char* kShouldOutputBinary = "output-binary"; |
| 95 | 97 |
| 96 // Indicates to output only the text of the article and not the enclosing html. | 98 // Indicates to output only the text of the article and not the enclosing html. |
| 97 const char* kExtractTextOnly = "extract-text-only"; | 99 const char* kExtractTextOnly = "extract-text-only"; |
| 98 | 100 |
| 99 // Indicates to include debug output. | 101 // Indicates to include debug output. |
| 100 const char* kDebugLevel = "debug-level"; | 102 const char* kDebugLevel = "debug-level"; |
| 101 | 103 |
| 102 // The original domain of the page if |kUrlSwitch| is a file. | 104 // The original URL of the page if |kUrlSwitch| is a file. |
| 103 const char* kOriginalDomain = "original-domain"; | 105 const char* kOriginalUrl = "original-url"; |
| 104 | 106 |
| 105 // A semi-colon-separated (i.e. ';') list of original domains corresponding to | 107 // A space-separated (i.e. ' ') list of original URLs corresponding to |
| 106 // "kUrlsSwitch". | 108 // "kUrlsSwitch". |
| 107 const char* kOriginalDomains = "original-domains"; | 109 const char* kOriginalUrls = "original-urls"; |
| 108 | 110 |
| 109 // Maximum number of concurrent started extractor requests. | 111 // Maximum number of concurrent started extractor requests. |
| 110 const int kMaxExtractorTasks = 8; | 112 const int kMaxExtractorTasks = 8; |
| 111 | 113 |
| 112 scoped_ptr<DomDistillerService> CreateDomDistillerService( | 114 scoped_ptr<DomDistillerService> CreateDomDistillerService( |
| 113 content::BrowserContext* context, | 115 content::BrowserContext* context, |
| 114 const base::FilePath& db_path, | 116 const base::FilePath& db_path, |
| 115 const UrlToDomainMap& url_to_domain_map) { | 117 const FileToUrlMap& file_to_url_map) { |
| 116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = | 118 scoped_refptr<base::SequencedTaskRunner> background_task_runner = |
| 117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( | 119 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( |
| 118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); | 120 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); |
| 119 | 121 |
| 120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with | 122 // TODO(cjhopman): use an in-memory database instead of an on-disk one with |
| 121 // temporary directory. | 123 // temporary directory. |
| 122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( | 124 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( |
| 123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( | 125 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( |
| 124 background_task_runner)); | 126 background_task_runner)); |
| 125 scoped_ptr<DomDistillerStore> dom_distiller_store( | 127 scoped_ptr<DomDistillerStore> dom_distiller_store( |
| (...skipping 12 matching lines...) Expand all Loading... |
| 138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && | 140 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && |
| 139 base::StringToInt( | 141 base::StringToInt( |
| 140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( | 142 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( |
| 141 kDebugLevel), | 143 kDebugLevel), |
| 142 &debug_level)) { | 144 &debug_level)) { |
| 143 options.set_debug_level(debug_level); | 145 options.set_debug_level(debug_level); |
| 144 } | 146 } |
| 145 scoped_ptr<DistillerFactory> distiller_factory( | 147 scoped_ptr<DistillerFactory> distiller_factory( |
| 146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), | 148 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), |
| 147 options, | 149 options, |
| 148 url_to_domain_map)); | 150 file_to_url_map)); |
| 149 | 151 |
| 150 // Setting up PrefService for DistilledPagePrefs. | 152 // Setting up PrefService for DistilledPagePrefs. |
| 151 user_prefs::TestingPrefServiceSyncable* pref_service = | 153 user_prefs::TestingPrefServiceSyncable* pref_service = |
| 152 new user_prefs::TestingPrefServiceSyncable(); | 154 new user_prefs::TestingPrefServiceSyncable(); |
| 153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); | 155 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); |
| 154 | 156 |
| 155 return scoped_ptr<DomDistillerService>(new DomDistillerService( | 157 return scoped_ptr<DomDistillerService>(new DomDistillerService( |
| 156 dom_distiller_store.Pass(), | 158 dom_distiller_store.Pass(), |
| 157 distiller_factory.Pass(), | 159 distiller_factory.Pass(), |
| 158 distiller_page_factory.Pass(), | 160 distiller_page_factory.Pass(), |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 220 service->CreateDefaultDistillerPage(render_view_size), | 222 service->CreateDefaultDistillerPage(render_view_size), |
| 221 url_); | 223 url_); |
| 222 } | 224 } |
| 223 | 225 |
| 224 DistilledArticleProto GetArticleCopy() { | 226 DistilledArticleProto GetArticleCopy() { |
| 225 return *article_proto_; | 227 return *article_proto_; |
| 226 } | 228 } |
| 227 | 229 |
| 228 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( | 230 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( |
| 229 const base::CommandLine& command_line, | 231 const base::CommandLine& command_line, |
| 230 UrlToDomainMap* url_to_domain_map) { | 232 FileToUrlMap* file_to_url_map) { |
| 231 ScopedVector<ContentExtractionRequest> requests; | 233 ScopedVector<ContentExtractionRequest> requests; |
| 232 if (command_line.HasSwitch(kUrlSwitch)) { | 234 if (command_line.HasSwitch(kUrlSwitch)) { |
| 233 GURL url; | 235 GURL url; |
| 234 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); | 236 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); |
| 235 url = GURL(url_string); | 237 url = GURL(url_string); |
| 236 if (url.is_valid()) { | 238 if (url.is_valid()) { |
| 237 requests.push_back(new ContentExtractionRequest(url)); | 239 requests.push_back(new ContentExtractionRequest(url)); |
| 238 if (command_line.HasSwitch(kOriginalDomain)) { | 240 if (command_line.HasSwitch(kOriginalUrl)) { |
| 239 (*url_to_domain_map)[url.spec()] = | 241 (*file_to_url_map)[url.spec()] = |
| 240 command_line.GetSwitchValueASCII(kOriginalDomain); | 242 command_line.GetSwitchValueASCII(kOriginalUrl); |
| 241 } | 243 } |
| 242 } | 244 } |
| 243 } else if (command_line.HasSwitch(kUrlsSwitch)) { | 245 } else if (command_line.HasSwitch(kUrlsSwitch)) { |
| 244 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); | 246 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); |
| 245 std::vector<std::string> urls; | 247 std::vector<std::string> urls; |
| 246 base::SplitString(urls_string, ' ', &urls); | 248 base::SplitString(urls_string, ' ', &urls); |
| 247 // Check for original-domains switch, which must exactly pair up with | 249 // Check for original-urls switch, which must exactly pair up with |
| 248 // |kUrlsSwitch| i.e. number of domains must be same as that of urls. | 250 // |kUrlsSwitch| i.e. number of original urls must be same as that of |
| 249 std::vector<std::string> domains; | 251 // urls. |
| 250 if (command_line.HasSwitch(kOriginalDomains)) { | 252 std::vector<std::string> original_urls; |
| 251 std::string domains_string = | 253 if (command_line.HasSwitch(kOriginalUrls)) { |
| 252 command_line.GetSwitchValueASCII( kOriginalDomains); | 254 std::string original_urls_string = |
| 253 base::SplitString(domains_string, ';', &domains); | 255 command_line.GetSwitchValueASCII(kOriginalUrls); |
| 254 if (domains.size() != urls.size()) domains.clear(); | 256 base::SplitString(original_urls_string, ' ', &original_urls); |
| 257 if (original_urls.size() != urls.size()) original_urls.clear(); |
| 255 } | 258 } |
| 256 for (size_t i = 0; i < urls.size(); ++i) { | 259 for (size_t i = 0; i < urls.size(); ++i) { |
| 257 GURL url(urls[i]); | 260 GURL url(urls[i]); |
| 258 if (url.is_valid()) { | 261 if (url.is_valid()) { |
| 259 requests.push_back(new ContentExtractionRequest(url)); | 262 requests.push_back(new ContentExtractionRequest(url)); |
| 260 // Only regard non-empty domain. | 263 // Only regard non-empty original urls. |
| 261 if (!domains.empty() && !domains[i].empty()) { | 264 if (!original_urls.empty() && !original_urls[i].empty()) { |
| 262 (*url_to_domain_map)[url.spec()] = domains[i]; | 265 (*file_to_url_map)[url.spec()] = original_urls[i]; |
| 263 } | 266 } |
| 264 } else { | 267 } else { |
| 265 ADD_FAILURE() << "Bad url"; | 268 ADD_FAILURE() << "Bad url"; |
| 266 } | 269 } |
| 267 } | 270 } |
| 268 } | 271 } |
| 269 if (requests.empty()) { | 272 if (requests.empty()) { |
| 270 ADD_FAILURE() << "No valid url provided"; | 273 ADD_FAILURE() << "No valid url provided"; |
| 271 } | 274 } |
| 272 | 275 |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 313 } | 316 } |
| 314 | 317 |
| 315 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); } | 318 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); } |
| 316 | 319 |
| 317 protected: | 320 protected: |
| 318 // Creates the DomDistillerService and creates and starts the extraction | 321 // Creates the DomDistillerService and creates and starts the extraction |
| 319 // request. | 322 // request. |
| 320 void Start() { | 323 void Start() { |
| 321 const base::CommandLine& command_line = | 324 const base::CommandLine& command_line = |
| 322 *base::CommandLine::ForCurrentProcess(); | 325 *base::CommandLine::ForCurrentProcess(); |
| 323 UrlToDomainMap url_to_domain_map; | 326 FileToUrlMap file_to_url_map; |
| 324 requests_ = ContentExtractionRequest::CreateForCommandLine( | 327 requests_ = ContentExtractionRequest::CreateForCommandLine( |
| 325 command_line, &url_to_domain_map); | 328 command_line, &file_to_url_map); |
| 326 content::BrowserContext* context = | 329 content::BrowserContext* context = |
| 327 shell()->web_contents()->GetBrowserContext(); | 330 shell()->web_contents()->GetBrowserContext(); |
| 328 service_ = CreateDomDistillerService(context, | 331 service_ = CreateDomDistillerService(context, |
| 329 db_dir_.path(), | 332 db_dir_.path(), |
| 330 url_to_domain_map); | 333 file_to_url_map); |
| 331 PumpQueue(); | 334 PumpQueue(); |
| 332 } | 335 } |
| 333 | 336 |
| 334 void PumpQueue() { | 337 void PumpQueue() { |
| 335 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { | 338 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { |
| 336 requests_[next_request_]->Start( | 339 requests_[next_request_]->Start( |
| 337 service_.get(), | 340 service_.get(), |
| 338 shell()->web_contents()->GetContainerBounds().size(), | 341 shell()->web_contents()->GetContainerBounds().size(), |
| 339 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); | 342 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); |
| 340 ++next_request_; | 343 ++next_request_; |
| (...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 410 std::string output_data_; | 413 std::string output_data_; |
| 411 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; | 414 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; |
| 412 }; | 415 }; |
| 413 | 416 |
| 414 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { | 417 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { |
| 415 Start(); | 418 Start(); |
| 416 base::RunLoop().Run(); | 419 base::RunLoop().Run(); |
| 417 } | 420 } |
| 418 | 421 |
| 419 } // namespace dom_distiller | 422 } // namespace dom_distiller |
| OLD | NEW |