Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(446)

Side by Side Diff: components/dom_distiller/standalone/content_extractor.cc

Issue 887803002: Provide original URLs for next page detection in dom_distiller (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: rebase Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | third_party/dom_distiller_js/README.chromium » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <sstream> 5 #include <sstream>
6 6
7 #include "base/command_line.h" 7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h" 8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h" 9 #include "base/id_map.h"
10 #include "base/message_loop/message_loop.h" 10 #include "base/message_loop/message_loop.h"
(...skipping 22 matching lines...) Expand all
33 #include "net/dns/mock_host_resolver.h" 33 #include "net/dns/mock_host_resolver.h"
34 #include "third_party/dom_distiller_js/dom_distiller.pb.h" 34 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
35 #include "ui/base/resource/resource_bundle.h" 35 #include "ui/base/resource/resource_bundle.h"
36 36
37 using content::ContentBrowserTest; 37 using content::ContentBrowserTest;
38 38
39 namespace dom_distiller { 39 namespace dom_distiller {
40 40
41 namespace { 41 namespace {
42 42
43 typedef base::hash_map<std::string, std::string> UrlToDomainMap; 43 typedef base::hash_map<std::string, std::string> FileToUrlMap;
44 44
45 } 45 }
46 46
47 // Factory for creating a Distiller that creates different DomDistillerOptions 47 // Factory for creating a Distiller that creates different DomDistillerOptions
48 // for different URLs, i.e. a specific kOriginalDomain option for each URL. 48 // for different URLs, i.e. a specific kOriginalUrl option for each URL.
49 class TestDistillerFactoryImpl : public DistillerFactory { 49 class TestDistillerFactoryImpl : public DistillerFactory {
50 public: 50 public:
51 TestDistillerFactoryImpl( 51 TestDistillerFactoryImpl(
52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, 52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, 53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
54 const UrlToDomainMap& url_to_domain_map) 54 const FileToUrlMap& file_to_url_map)
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), 55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
56 dom_distiller_options_(dom_distiller_options), 56 dom_distiller_options_(dom_distiller_options),
57 url_to_domain_map_(url_to_domain_map) { 57 file_to_url_map_(file_to_url_map) {
58 } 58 }
59 59
60 ~TestDistillerFactoryImpl() override {} 60 ~TestDistillerFactoryImpl() override {}
61 61
62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { 62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
63 dom_distiller::proto::DomDistillerOptions options; 63 dom_distiller::proto::DomDistillerOptions options;
64 options = dom_distiller_options_; 64 options = dom_distiller_options_;
65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec()); 65 FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec());
66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second); 66 if (it != file_to_url_map_.end()) {
67 options.set_original_url(it->second);
68 }
67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl( 69 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
68 *distiller_url_fetcher_factory_, options)); 70 *distiller_url_fetcher_factory_, options));
69 return distiller.Pass(); 71 return distiller.Pass();
70 } 72 }
71 73
72 private: 74 private:
73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; 75 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_; 76 dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
75 UrlToDomainMap url_to_domain_map_; 77 FileToUrlMap file_to_url_map_;
76 }; 78 };
77 79
78 namespace { 80 namespace {
79 81
80 // The url to distill. 82 // The url to distill.
81 const char* kUrlSwitch = "url"; 83 const char* kUrlSwitch = "url";
82 84
83 // A space-separated list of urls to distill. 85 // A space-separated list of urls to distill.
84 const char* kUrlsSwitch = "urls"; 86 const char* kUrlsSwitch = "urls";
85 87
86 // Indicates that DNS resolution should be disabled for this test. 88 // Indicates that DNS resolution should be disabled for this test.
87 const char* kDisableDnsSwitch = "disable-dns"; 89 const char* kDisableDnsSwitch = "disable-dns";
88 90
89 // Will write the distilled output to the given file instead of to stdout. 91 // Will write the distilled output to the given file instead of to stdout.
90 const char* kOutputFile = "output-file"; 92 const char* kOutputFile = "output-file";
91 93
92 // Indicates to output a serialized protocol buffer instead of human-readable 94 // Indicates to output a serialized protocol buffer instead of human-readable
93 // output. 95 // output.
94 const char* kShouldOutputBinary = "output-binary"; 96 const char* kShouldOutputBinary = "output-binary";
95 97
96 // Indicates to output only the text of the article and not the enclosing html. 98 // Indicates to output only the text of the article and not the enclosing html.
97 const char* kExtractTextOnly = "extract-text-only"; 99 const char* kExtractTextOnly = "extract-text-only";
98 100
99 // Indicates to include debug output. 101 // Indicates to include debug output.
100 const char* kDebugLevel = "debug-level"; 102 const char* kDebugLevel = "debug-level";
101 103
102 // The original domain of the page if |kUrlSwitch| is a file. 104 // The original URL of the page if |kUrlSwitch| is a file.
103 const char* kOriginalDomain = "original-domain"; 105 const char* kOriginalUrl = "original-url";
104 106
105 // A semi-colon-separated (i.e. ';') list of original domains corresponding to 107 // A space-separated (i.e. ' ') list of original URLs corresponding to
106 // "kUrlsSwitch". 108 // "kUrlsSwitch".
107 const char* kOriginalDomains = "original-domains"; 109 const char* kOriginalUrls = "original-urls";
108 110
109 // Maximum number of concurrent started extractor requests. 111 // Maximum number of concurrent started extractor requests.
110 const int kMaxExtractorTasks = 8; 112 const int kMaxExtractorTasks = 8;
111 113
112 scoped_ptr<DomDistillerService> CreateDomDistillerService( 114 scoped_ptr<DomDistillerService> CreateDomDistillerService(
113 content::BrowserContext* context, 115 content::BrowserContext* context,
114 const base::FilePath& db_path, 116 const base::FilePath& db_path,
115 const UrlToDomainMap& url_to_domain_map) { 117 const FileToUrlMap& file_to_url_map) {
116 scoped_refptr<base::SequencedTaskRunner> background_task_runner = 118 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( 119 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
118 content::BrowserThread::GetBlockingPool()->GetSequenceToken()); 120 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
119 121
120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with 122 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
121 // temporary directory. 123 // temporary directory.
122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db( 124 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>( 125 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
124 background_task_runner)); 126 background_task_runner));
125 scoped_ptr<DomDistillerStore> dom_distiller_store( 127 scoped_ptr<DomDistillerStore> dom_distiller_store(
(...skipping 12 matching lines...) Expand all
138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) && 140 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
139 base::StringToInt( 141 base::StringToInt(
140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( 142 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
141 kDebugLevel), 143 kDebugLevel),
142 &debug_level)) { 144 &debug_level)) {
143 options.set_debug_level(debug_level); 145 options.set_debug_level(debug_level);
144 } 146 }
145 scoped_ptr<DistillerFactory> distiller_factory( 147 scoped_ptr<DistillerFactory> distiller_factory(
146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), 148 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
147 options, 149 options,
148 url_to_domain_map)); 150 file_to_url_map));
149 151
150 // Setting up PrefService for DistilledPagePrefs. 152 // Setting up PrefService for DistilledPagePrefs.
151 user_prefs::TestingPrefServiceSyncable* pref_service = 153 user_prefs::TestingPrefServiceSyncable* pref_service =
152 new user_prefs::TestingPrefServiceSyncable(); 154 new user_prefs::TestingPrefServiceSyncable();
153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry()); 155 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
154 156
155 return scoped_ptr<DomDistillerService>(new DomDistillerService( 157 return scoped_ptr<DomDistillerService>(new DomDistillerService(
156 dom_distiller_store.Pass(), 158 dom_distiller_store.Pass(),
157 distiller_factory.Pass(), 159 distiller_factory.Pass(),
158 distiller_page_factory.Pass(), 160 distiller_page_factory.Pass(),
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
220 service->CreateDefaultDistillerPage(render_view_size), 222 service->CreateDefaultDistillerPage(render_view_size),
221 url_); 223 url_);
222 } 224 }
223 225
224 DistilledArticleProto GetArticleCopy() { 226 DistilledArticleProto GetArticleCopy() {
225 return *article_proto_; 227 return *article_proto_;
226 } 228 }
227 229
228 static ScopedVector<ContentExtractionRequest> CreateForCommandLine( 230 static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
229 const base::CommandLine& command_line, 231 const base::CommandLine& command_line,
230 UrlToDomainMap* url_to_domain_map) { 232 FileToUrlMap* file_to_url_map) {
231 ScopedVector<ContentExtractionRequest> requests; 233 ScopedVector<ContentExtractionRequest> requests;
232 if (command_line.HasSwitch(kUrlSwitch)) { 234 if (command_line.HasSwitch(kUrlSwitch)) {
233 GURL url; 235 GURL url;
234 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); 236 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
235 url = GURL(url_string); 237 url = GURL(url_string);
236 if (url.is_valid()) { 238 if (url.is_valid()) {
237 requests.push_back(new ContentExtractionRequest(url)); 239 requests.push_back(new ContentExtractionRequest(url));
238 if (command_line.HasSwitch(kOriginalDomain)) { 240 if (command_line.HasSwitch(kOriginalUrl)) {
239 (*url_to_domain_map)[url.spec()] = 241 (*file_to_url_map)[url.spec()] =
240 command_line.GetSwitchValueASCII(kOriginalDomain); 242 command_line.GetSwitchValueASCII(kOriginalUrl);
241 } 243 }
242 } 244 }
243 } else if (command_line.HasSwitch(kUrlsSwitch)) { 245 } else if (command_line.HasSwitch(kUrlsSwitch)) {
244 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); 246 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
245 std::vector<std::string> urls; 247 std::vector<std::string> urls;
246 base::SplitString(urls_string, ' ', &urls); 248 base::SplitString(urls_string, ' ', &urls);
247 // Check for original-domains switch, which must exactly pair up with 249 // Check for original-urls switch, which must exactly pair up with
248 // |kUrlsSwitch| i.e. number of domains must be same as that of urls. 250 // |kUrlsSwitch| i.e. number of original urls must be same as that of
249 std::vector<std::string> domains; 251 // urls.
250 if (command_line.HasSwitch(kOriginalDomains)) { 252 std::vector<std::string> original_urls;
251 std::string domains_string = 253 if (command_line.HasSwitch(kOriginalUrls)) {
252 command_line.GetSwitchValueASCII( kOriginalDomains); 254 std::string original_urls_string =
253 base::SplitString(domains_string, ';', &domains); 255 command_line.GetSwitchValueASCII(kOriginalUrls);
254 if (domains.size() != urls.size()) domains.clear(); 256 base::SplitString(original_urls_string, ' ', &original_urls);
257 if (original_urls.size() != urls.size()) original_urls.clear();
255 } 258 }
256 for (size_t i = 0; i < urls.size(); ++i) { 259 for (size_t i = 0; i < urls.size(); ++i) {
257 GURL url(urls[i]); 260 GURL url(urls[i]);
258 if (url.is_valid()) { 261 if (url.is_valid()) {
259 requests.push_back(new ContentExtractionRequest(url)); 262 requests.push_back(new ContentExtractionRequest(url));
260 // Only regard non-empty domain. 263 // Only regard non-empty original urls.
261 if (!domains.empty() && !domains[i].empty()) { 264 if (!original_urls.empty() && !original_urls[i].empty()) {
262 (*url_to_domain_map)[url.spec()] = domains[i]; 265 (*file_to_url_map)[url.spec()] = original_urls[i];
263 } 266 }
264 } else { 267 } else {
265 ADD_FAILURE() << "Bad url"; 268 ADD_FAILURE() << "Bad url";
266 } 269 }
267 } 270 }
268 } 271 }
269 if (requests.empty()) { 272 if (requests.empty()) {
270 ADD_FAILURE() << "No valid url provided"; 273 ADD_FAILURE() << "No valid url provided";
271 } 274 }
272 275
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
313 } 316 }
314 317
315 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); } 318 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
316 319
317 protected: 320 protected:
318 // Creates the DomDistillerService and creates and starts the extraction 321 // Creates the DomDistillerService and creates and starts the extraction
319 // request. 322 // request.
320 void Start() { 323 void Start() {
321 const base::CommandLine& command_line = 324 const base::CommandLine& command_line =
322 *base::CommandLine::ForCurrentProcess(); 325 *base::CommandLine::ForCurrentProcess();
323 UrlToDomainMap url_to_domain_map; 326 FileToUrlMap file_to_url_map;
324 requests_ = ContentExtractionRequest::CreateForCommandLine( 327 requests_ = ContentExtractionRequest::CreateForCommandLine(
325 command_line, &url_to_domain_map); 328 command_line, &file_to_url_map);
326 content::BrowserContext* context = 329 content::BrowserContext* context =
327 shell()->web_contents()->GetBrowserContext(); 330 shell()->web_contents()->GetBrowserContext();
328 service_ = CreateDomDistillerService(context, 331 service_ = CreateDomDistillerService(context,
329 db_dir_.path(), 332 db_dir_.path(),
330 url_to_domain_map); 333 file_to_url_map);
331 PumpQueue(); 334 PumpQueue();
332 } 335 }
333 336
334 void PumpQueue() { 337 void PumpQueue() {
335 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { 338 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
336 requests_[next_request_]->Start( 339 requests_[next_request_]->Start(
337 service_.get(), 340 service_.get(),
338 shell()->web_contents()->GetContainerBounds().size(), 341 shell()->web_contents()->GetContainerBounds().size(),
339 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); 342 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
340 ++next_request_; 343 ++next_request_;
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
410 std::string output_data_; 413 std::string output_data_;
411 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; 414 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
412 }; 415 };
413 416
414 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { 417 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
415 Start(); 418 Start();
416 base::RunLoop().Run(); 419 base::RunLoop().Run();
417 } 420 }
418 421
419 } // namespace dom_distiller 422 } // namespace dom_distiller
OLDNEW
« no previous file with comments | « no previous file | third_party/dom_distiller_js/README.chromium » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698