OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <sstream> | 5 #include <sstream> |
6 | 6 |
7 #include "base/command_line.h" | 7 #include "base/command_line.h" |
8 #include "base/files/scoped_temp_dir.h" | 8 #include "base/files/scoped_temp_dir.h" |
| 9 #include "base/id_map.h" |
9 #include "base/message_loop/message_loop.h" | 10 #include "base/message_loop/message_loop.h" |
10 #include "base/path_service.h" | 11 #include "base/path_service.h" |
11 #include "base/run_loop.h" | 12 #include "base/run_loop.h" |
12 #include "base/strings/string_number_conversions.h" | 13 #include "base/strings/string_number_conversions.h" |
13 #include "base/strings/string_split.h" | 14 #include "base/strings/string_split.h" |
14 #include "components/dom_distiller/content/distiller_page_web_contents.h" | 15 #include "components/dom_distiller/content/distiller_page_web_contents.h" |
15 #include "components/dom_distiller/core/article_entry.h" | 16 #include "components/dom_distiller/core/article_entry.h" |
16 #include "components/dom_distiller/core/distilled_page_prefs.h" | 17 #include "components/dom_distiller/core/distilled_page_prefs.h" |
17 #include "components/dom_distiller/core/distiller.h" | 18 #include "components/dom_distiller/core/distiller.h" |
18 #include "components/dom_distiller/core/dom_distiller_service.h" | 19 #include "components/dom_distiller/core/dom_distiller_service.h" |
(...skipping 13 matching lines...) Expand all Loading... |
32 #include "net/dns/mock_host_resolver.h" | 33 #include "net/dns/mock_host_resolver.h" |
33 #include "third_party/dom_distiller_js/dom_distiller.pb.h" | 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" |
34 #include "ui/base/resource/resource_bundle.h" | 35 #include "ui/base/resource/resource_bundle.h" |
35 | 36 |
36 using content::ContentBrowserTest; | 37 using content::ContentBrowserTest; |
37 | 38 |
38 namespace dom_distiller { | 39 namespace dom_distiller { |
39 | 40 |
40 namespace { | 41 namespace { |
41 | 42 |
| 43 typedef base::hash_map<std::string, std::string> UrlToDomainMap; |
| 44 |
| 45 } |
| 46 |
| 47 // Factory that creates each Distiller with DomDistillerOptions specific to its |
| 48 // URL, i.e. the original domain (kOriginalDomain) mapped to that URL, if any. |
| 49 class TestDistillerFactoryImpl : public DistillerFactory { |
| 50 public: |
| 51 TestDistillerFactoryImpl( |
| 52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, |
| 53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, |
| 54 const UrlToDomainMap& url_to_domain_map) |
| 55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), |
| 56 dom_distiller_options_(dom_distiller_options), |
| 57 url_to_domain_map_(url_to_domain_map) { |
| 58 } |
| 59 |
| 60 ~TestDistillerFactoryImpl() override {} |
| 61 |
| 62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { |
| 63 dom_distiller::proto::DomDistillerOptions options; |
| 64 options = dom_distiller_options_; |
| 65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); |
| 66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); |
| 67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( |
| 68 *distiller_url_fetcher_factory_, options)); |
| 69 return distiller.Pass(); |
| 70 } |
| 71 |
| 72 private: |
| 73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; |
| 74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; |
| 75 UrlToDomainMap url_to_domain_map_; |
| 76 }; |
| 77 |
| 78 namespace { |
| 79 |
42 // The url to distill. | 80 // The url to distill. |
43 const char* kUrlSwitch = "url"; | 81 const char* kUrlSwitch = "url"; |
44 | 82 |
45 // A space-separated list of urls to distill. | 83 // A space-separated list of urls to distill. |
46 const char* kUrlsSwitch = "urls"; | 84 const char* kUrlsSwitch = "urls"; |
47 | 85 |
48 // Indicates that DNS resolution should be disabled for this test. | 86 // Indicates that DNS resolution should be disabled for this test. |
49 const char* kDisableDnsSwitch = "disable-dns"; | 87 const char* kDisableDnsSwitch = "disable-dns"; |
50 | 88 |
51 // Will write the distilled output to the given file instead of to stdout. | 89 // Will write the distilled output to the given file instead of to stdout. |
52 const char* kOutputFile = "output-file"; | 90 const char* kOutputFile = "output-file"; |
53 | 91 |
54 // Indicates to output a serialized protocol buffer instead of human-readable | 92 // Indicates to output a serialized protocol buffer instead of human-readable |
55 // output. | 93 // output. |
56 const char* kShouldOutputBinary = "output-binary"; | 94 const char* kShouldOutputBinary = "output-binary"; |
57 | 95 |
58 // Indicates to output only the text of the article and not the enclosing html. | 96 // Indicates to output only the text of the article and not the enclosing html. |
59 const char* kExtractTextOnly = "extract-text-only"; | 97 const char* kExtractTextOnly = "extract-text-only"; |
60 | 98 |
61 // Indicates to include debug output. | 99 // Indicates to include debug output. |
62 const char* kDebugLevel = "debug-level"; | 100 const char* kDebugLevel = "debug-level"; |
63 | 101 |
| 102 // The original domain of the page if |kUrlSwitch| is a file. |
| 103 const char* kOriginalDomain = "original-domain"; |
| 104 |
| 105 // A semicolon-separated (i.e. ';') list of original domains corresponding, in |
| 106 // order, to the urls in |kUrlsSwitch|. |
| 107 const char* kOriginalDomains = "original-domains"; |
| 108 |
64 // Maximum number of concurrent started extractor requests. | 109 // Maximum number of concurrent started extractor requests. |
65 const int kMaxExtractorTasks = 8; | 110 const int kMaxExtractorTasks = 8; |
66 | 111 |
67 scoped_ptr<DomDistillerService> CreateDomDistillerService( | 112 scoped_ptr<DomDistillerService> CreateDomDistillerService( |
68 content::BrowserContext* context, | 113 content::BrowserContext* context, |
69 const base::FilePath& db_path) { | 114 const base::FilePath& db_path, |
| 115 const UrlToDomainMap& url_to_domain_map) { |
70 scoped_refptr<base::SequencedTaskRunner> background_task_runner = | 116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = |
71 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( | 117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( |
72 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); | 118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); |
73 | 119 |
74 // TODO(cjhopman): use an in-memory database instead of an on-disk one with | 120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with |
75 // temporary directory. | 121 // temporary directory. |
76 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( | 122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( |
77 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( | 123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( |
78 background_task_runner)); | 124 background_task_runner)); |
79 scoped_ptr<DomDistillerStore> dom_distiller_store( | 125 scoped_ptr<DomDistillerStore> dom_distiller_store( |
(...skipping 10 matching lines...) Expand all Loading... |
90 } | 136 } |
91 int debug_level = 0; | 137 int debug_level = 0; |
92 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && | 138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && |
93 base::StringToInt( | 139 base::StringToInt( |
94 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( | 140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( |
95 kDebugLevel), | 141 kDebugLevel), |
96 &debug_level)) { | 142 &debug_level)) { |
97 options.set_debug_level(debug_level); | 143 options.set_debug_level(debug_level); |
98 } | 144 } |
99 scoped_ptr<DistillerFactory> distiller_factory( | 145 scoped_ptr<DistillerFactory> distiller_factory( |
100 new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); | 146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), |
| 147 options, |
| 148 url_to_domain_map)); |
101 | 149 |
102 // Setting up PrefService for DistilledPagePrefs. | 150 // Setting up PrefService for DistilledPagePrefs. |
103 user_prefs::TestingPrefServiceSyncable* pref_service = | 151 user_prefs::TestingPrefServiceSyncable* pref_service = |
104 new user_prefs::TestingPrefServiceSyncable(); | 152 new user_prefs::TestingPrefServiceSyncable(); |
105 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); | 153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); |
106 | 154 |
107 return scoped_ptr<DomDistillerService>(new DomDistillerService( | 155 return scoped_ptr<DomDistillerService>(new DomDistillerService( |
108 dom_distiller_store.Pass(), | 156 dom_distiller_store.Pass(), |
109 distiller_factory.Pass(), | 157 distiller_factory.Pass(), |
110 distiller_page_factory.Pass(), | 158 distiller_page_factory.Pass(), |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
170 service->ViewUrl(this, | 218 service->ViewUrl(this, |
171 service->CreateDefaultDistillerPage(render_view_size), | 219 service->CreateDefaultDistillerPage(render_view_size), |
172 url_); | 220 url_); |
173 } | 221 } |
174 | 222 |
175 DistilledArticleProto GetArticleCopy() { | 223 DistilledArticleProto GetArticleCopy() { |
176 return *article_proto_; | 224 return *article_proto_; |
177 } | 225 } |
178 | 226 |
179 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( | 227 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( |
180 const CommandLine& command_line) { | 228 const CommandLine& command_line, |
| 229 UrlToDomainMap* url_to_domain_map) { |
181 ScopedVector<ContentExtractionRequest> requests; | 230 ScopedVector<ContentExtractionRequest> requests; |
182 if (command_line.HasSwitch(kUrlSwitch)) { | 231 if (command_line.HasSwitch(kUrlSwitch)) { |
183 GURL url; | 232 GURL url; |
184 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); | 233 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); |
185 url = GURL(url_string); | 234 url = GURL(url_string); |
186 if (url.is_valid()) { | 235 if (url.is_valid()) { |
187 requests.push_back(new ContentExtractionRequest(url)); | 236 requests.push_back(new ContentExtractionRequest(url)); |
| 237 if (command_line.HasSwitch(kOriginalDomain)) { |
| 238 (*url_to_domain_map)[url.spec()] = |
| 239 command_line.GetSwitchValueASCII(kOriginalDomain); |
| 240 } |
188 } | 241 } |
189 } else if (command_line.HasSwitch(kUrlsSwitch)) { | 242 } else if (command_line.HasSwitch(kUrlsSwitch)) { |
190 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); | 243 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); |
191 std::vector<std::string> urls; | 244 std::vector<std::string> urls; |
192 base::SplitString(urls_string, ' ', &urls); | 245 base::SplitString(urls_string, ' ', &urls); |
| 246 // Check for the original-domains switch, which must pair up exactly with |
| 247 // |kUrlsSwitch|, i.e. the number of domains must equal the number of urls. |
| 248 std::vector<std::string> domains; |
| 249 if (command_line.HasSwitch(kOriginalDomains)) { |
| 250 std::string domains_string = |
| 251 command_line.GetSwitchValueASCII( kOriginalDomains); |
| 252 base::SplitString(domains_string, ';', &domains); |
| 253 if (domains.size() != urls.size()) domains.clear(); |
| 254 } |
193 for (size_t i = 0; i < urls.size(); ++i) { | 255 for (size_t i = 0; i < urls.size(); ++i) { |
194 GURL url(urls[i]); | 256 GURL url(urls[i]); |
195 if (url.is_valid()) { | 257 if (url.is_valid()) { |
196 requests.push_back(new ContentExtractionRequest(url)); | 258 requests.push_back(new ContentExtractionRequest(url)); |
| 259 // Only record non-empty domains. |
| 260 if (!domains.empty() && !domains[i].empty()) { |
| 261 (*url_to_domain_map)[url.spec()] = domains[i]; |
| 262 } |
197 } else { | 263 } else { |
198 ADD_FAILURE() << "Bad url"; | 264 ADD_FAILURE() << "Bad url"; |
199 } | 265 } |
200 } | 266 } |
201 } | 267 } |
202 if (requests.empty()) { | 268 if (requests.empty()) { |
203 ADD_FAILURE() << "No valid url provided"; | 269 ADD_FAILURE() << "No valid url provided"; |
204 } | 270 } |
205 | 271 |
206 return requests.Pass(); | 272 return requests.Pass(); |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
248 } | 314 } |
249 | 315 |
250 virtual void TearDownOnMainThread() override { | 316 virtual void TearDownOnMainThread() override { |
251 DisableDNSLookupForThisTest(); | 317 DisableDNSLookupForThisTest(); |
252 } | 318 } |
253 | 319 |
254 protected: | 320 protected: |
255 // Creates the DomDistillerService and creates and starts the extraction | 321 // Creates the DomDistillerService and creates and starts the extraction |
256 // request. | 322 // request. |
257 void Start() { | 323 void Start() { |
| 324 const CommandLine& command_line = *CommandLine::ForCurrentProcess(); |
| 325 UrlToDomainMap url_to_domain_map; |
| 326 requests_ = ContentExtractionRequest::CreateForCommandLine( |
| 327 command_line, &url_to_domain_map); |
258 content::BrowserContext* context = | 328 content::BrowserContext* context = |
259 shell()->web_contents()->GetBrowserContext(); | 329 shell()->web_contents()->GetBrowserContext(); |
260 service_ = CreateDomDistillerService(context, | 330 service_ = CreateDomDistillerService(context, |
261 db_dir_.path()); | 331 db_dir_.path(), |
262 const CommandLine& command_line = *CommandLine::ForCurrentProcess(); | 332 url_to_domain_map); |
263 requests_ = ContentExtractionRequest::CreateForCommandLine(command_line); | |
264 PumpQueue(); | 333 PumpQueue(); |
265 } | 334 } |
266 | 335 |
267 void PumpQueue() { | 336 void PumpQueue() { |
268 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { | 337 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { |
269 requests_[next_request_]->Start( | 338 requests_[next_request_]->Start( |
270 service_.get(), | 339 service_.get(), |
271 shell()->web_contents()->GetContainerBounds().size(), | 340 shell()->web_contents()->GetContainerBounds().size(), |
272 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); | 341 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); |
273 ++next_request_; | 342 ++next_request_; |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
342 std::string output_data_; | 411 std::string output_data_; |
343 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; | 412 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; |
344 }; | 413 }; |
345 | 414 |
346 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { | 415 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { |
347 Start(); | 416 Start(); |
348 base::RunLoop().Run(); | 417 base::RunLoop().Run(); |
349 } | 418 } |
350 | 419 |
351 } // namespace dom_distiller | 420 } // namespace dom_distiller |
OLD | NEW |