Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(230)

Side by Side Diff: components/dom_distiller/standalone/content_extractor.cc

Issue 917663002: Rename content_extractor.cc to content_extractor_browsertest.cc (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <sstream>
6
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/id_map.h"
10 #include "base/message_loop/message_loop.h"
11 #include "base/path_service.h"
12 #include "base/run_loop.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_split.h"
15 #include "components/dom_distiller/content/distiller_page_web_contents.h"
16 #include "components/dom_distiller/core/article_entry.h"
17 #include "components/dom_distiller/core/distilled_page_prefs.h"
18 #include "components/dom_distiller/core/distiller.h"
19 #include "components/dom_distiller/core/dom_distiller_service.h"
20 #include "components/dom_distiller/core/dom_distiller_store.h"
21 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
22 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
23 #include "components/dom_distiller/core/task_tracker.h"
24 #include "components/leveldb_proto/proto_database.h"
25 #include "components/leveldb_proto/proto_database_impl.h"
26 #include "components/pref_registry/testing_pref_service_syncable.h"
27 #include "content/public/browser/browser_context.h"
28 #include "content/public/browser/browser_thread.h"
29 #include "content/public/test/content_browser_test.h"
30 #include "content/shell/browser/shell.h"
31 #include "google/protobuf/io/coded_stream.h"
32 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
33 #include "net/dns/mock_host_resolver.h"
34 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
35 #include "ui/base/resource/resource_bundle.h"
36
37 using content::ContentBrowserTest;
38
39 namespace dom_distiller {
40
41 namespace {
42
43 typedef base::hash_map<std::string, std::string> UrlToDomainMap;
44
45 }
46
47 // Factory for creating a Distiller that creates different DomDistillerOptions
48 // for different URLs, i.e. a specific kOriginalDomain option for each URL.
49 class TestDistillerFactoryImpl : public DistillerFactory {
50 public:
51 TestDistillerFactoryImpl(
52 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
53 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
54 const UrlToDomainMap& url_to_domain_map)
55 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
56 dom_distiller_options_(dom_distiller_options),
57 url_to_domain_map_(url_to_domain_map) {
58 }
59
60 ~TestDistillerFactoryImpl() override {}
61
62 scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
63 dom_distiller::proto::DomDistillerOptions options;
64 options = dom_distiller_options_;
65 UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec());
66 if (it != url_to_domain_map_.end()) options.set_original_domain(it->second);
67 scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
68 *distiller_url_fetcher_factory_, options));
69 return distiller.Pass();
70 }
71
72 private:
73 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
74 dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
75 UrlToDomainMap url_to_domain_map_;
76 };
77
78 namespace {
79
// Command-line switch names understood by this test. They are declared as
// constant character arrays (not "const char*") so that the names themselves
// cannot be re-pointed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Sets the integer debug level passed to the distiller options.
const char kDebugLevel[] = "debug-level";

// The original domain of the page if |kUrlSwitch| is a file.
const char kOriginalDomain[] = "original-domain";

// A semi-colon-separated (i.e. ';') list of original domains corresponding to
// "kUrlsSwitch".
const char kOriginalDomains[] = "original-domains";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
111
112 scoped_ptr<DomDistillerService> CreateDomDistillerService(
113 content::BrowserContext* context,
114 const base::FilePath& db_path,
115 const UrlToDomainMap& url_to_domain_map) {
116 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
117 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
118 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
119
120 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
121 // temporary directory.
122 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
123 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
124 background_task_runner));
125 scoped_ptr<DomDistillerStore> dom_distiller_store(
126 new DomDistillerStore(db.Pass(), db_path));
127
128 scoped_ptr<DistillerPageFactory> distiller_page_factory(
129 new DistillerPageWebContentsFactory(context));
130 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
131 new DistillerURLFetcherFactory(context->GetRequestContext()));
132
133 dom_distiller::proto::DomDistillerOptions options;
134 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
135 options.set_extract_text_only(true);
136 }
137 int debug_level = 0;
138 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
139 base::StringToInt(
140 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
141 kDebugLevel),
142 &debug_level)) {
143 options.set_debug_level(debug_level);
144 }
145 scoped_ptr<DistillerFactory> distiller_factory(
146 new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
147 options,
148 url_to_domain_map));
149
150 // Setting up PrefService for DistilledPagePrefs.
151 user_prefs::TestingPrefServiceSyncable* pref_service =
152 new user_prefs::TestingPrefServiceSyncable();
153 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
154
155 return scoped_ptr<DomDistillerService>(new DomDistillerService(
156 dom_distiller_store.Pass(),
157 distiller_factory.Pass(),
158 distiller_page_factory.Pass(),
159 scoped_ptr<DistilledPagePrefs>(new DistilledPagePrefs(pref_service))));
160 }
161
162 void AddComponentsTestResources() {
163 base::FilePath pak_file;
164 base::FilePath pak_dir;
165 PathService::Get(base::DIR_MODULE, &pak_dir);
166 pak_file =
167 pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
168 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
169 pak_file, ui::SCALE_FACTOR_NONE);
170 }
171
172 bool WriteProtobufWithSize(
173 const google::protobuf::MessageLite& message,
174 google::protobuf::io::ZeroCopyOutputStream* output_stream) {
175 google::protobuf::io::CodedOutputStream coded_output(output_stream);
176
177 // Write the size.
178 const int size = message.ByteSize();
179 coded_output.WriteLittleEndian32(size);
180 message.SerializeWithCachedSizes(&coded_output);
181 return !coded_output.HadError();
182 }
183
184 std::string GetReadableArticleString(
185 const DistilledArticleProto& article_proto) {
186 std::stringstream output;
187 output << "Article Title: " << article_proto.title() << std::endl;
188 output << "# of pages: " << article_proto.pages_size() << std::endl;
189 for (int i = 0; i < article_proto.pages_size(); ++i) {
190 if (i > 0) output << std::endl;
191 const DistilledPageProto& page = article_proto.pages(i);
192 output << "Page " << i << std::endl;
193 output << "URL: " << page.url() << std::endl;
194 output << "Content: " << page.html() << std::endl;
195 if (page.has_debug_info() && page.debug_info().has_log())
196 output << "Log: " << page.debug_info().log() << std::endl;
197 if (page.has_pagination_info()) {
198 if (page.pagination_info().has_next_page()) {
199 output << "Next Page: " << page.pagination_info().next_page()
200 << std::endl;
201 }
202 if (page.pagination_info().has_prev_page()) {
203 output << "Prev Page: " << page.pagination_info().prev_page()
204 << std::endl;
205 }
206 }
207 }
208 return output.str();
209 }
210
211 } // namespace
212
213 class ContentExtractionRequest : public ViewRequestDelegate {
214 public:
215 void Start(DomDistillerService* service, const gfx::Size& render_view_size,
216 base::Closure finished_callback) {
217 finished_callback_ = finished_callback;
218 viewer_handle_ =
219 service->ViewUrl(this,
220 service->CreateDefaultDistillerPage(render_view_size),
221 url_);
222 }
223
224 DistilledArticleProto GetArticleCopy() {
225 return *article_proto_;
226 }
227
228 static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
229 const base::CommandLine& command_line,
230 UrlToDomainMap* url_to_domain_map) {
231 ScopedVector<ContentExtractionRequest> requests;
232 if (command_line.HasSwitch(kUrlSwitch)) {
233 GURL url;
234 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
235 url = GURL(url_string);
236 if (url.is_valid()) {
237 requests.push_back(new ContentExtractionRequest(url));
238 if (command_line.HasSwitch(kOriginalDomain)) {
239 (*url_to_domain_map)[url.spec()] =
240 command_line.GetSwitchValueASCII(kOriginalDomain);
241 }
242 }
243 } else if (command_line.HasSwitch(kUrlsSwitch)) {
244 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
245 std::vector<std::string> urls;
246 base::SplitString(urls_string, ' ', &urls);
247 // Check for original-domains switch, which must exactly pair up with
248 // |kUrlsSwitch| i.e. number of domains must be same as that of urls.
249 std::vector<std::string> domains;
250 if (command_line.HasSwitch(kOriginalDomains)) {
251 std::string domains_string =
252 command_line.GetSwitchValueASCII( kOriginalDomains);
253 base::SplitString(domains_string, ';', &domains);
254 if (domains.size() != urls.size()) domains.clear();
255 }
256 for (size_t i = 0; i < urls.size(); ++i) {
257 GURL url(urls[i]);
258 if (url.is_valid()) {
259 requests.push_back(new ContentExtractionRequest(url));
260 // Only regard non-empty domain.
261 if (!domains.empty() && !domains[i].empty()) {
262 (*url_to_domain_map)[url.spec()] = domains[i];
263 }
264 } else {
265 ADD_FAILURE() << "Bad url";
266 }
267 }
268 }
269 if (requests.empty()) {
270 ADD_FAILURE() << "No valid url provided";
271 }
272
273 return requests.Pass();
274 }
275
276 private:
277 ContentExtractionRequest(const GURL& url) : url_(url) {}
278
279 void OnArticleUpdated(ArticleDistillationUpdate article_update) override {}
280
281 void OnArticleReady(const DistilledArticleProto* article_proto) override {
282 article_proto_ = article_proto;
283 CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
284 base::MessageLoop::current()->PostTask(
285 FROM_HERE,
286 finished_callback_);
287 }
288
289 const DistilledArticleProto* article_proto_;
290 scoped_ptr<ViewerHandle> viewer_handle_;
291 GURL url_;
292 base::Closure finished_callback_;
293 };
294
295 class ContentExtractor : public ContentBrowserTest {
296 public:
297 ContentExtractor()
298 : pending_tasks_(0),
299 max_tasks_(kMaxExtractorTasks),
300 next_request_(0),
301 output_data_(),
302 protobuf_output_stream_(
303 new google::protobuf::io::StringOutputStream(&output_data_)) {}
304
305 // Change behavior of the default host resolver to avoid DNS lookup errors, so
306 // we can make network calls.
307 void SetUpOnMainThread() override {
308 if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
309 EnableDNSLookupForThisTest();
310 }
311 CHECK(db_dir_.CreateUniqueTempDir());
312 AddComponentsTestResources();
313 }
314
315 void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
316
317 protected:
318 // Creates the DomDistillerService and creates and starts the extraction
319 // request.
320 void Start() {
321 const base::CommandLine& command_line =
322 *base::CommandLine::ForCurrentProcess();
323 UrlToDomainMap url_to_domain_map;
324 requests_ = ContentExtractionRequest::CreateForCommandLine(
325 command_line, &url_to_domain_map);
326 content::BrowserContext* context =
327 shell()->web_contents()->GetBrowserContext();
328 service_ = CreateDomDistillerService(context,
329 db_dir_.path(),
330 url_to_domain_map);
331 PumpQueue();
332 }
333
334 void PumpQueue() {
335 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
336 requests_[next_request_]->Start(
337 service_.get(),
338 shell()->web_contents()->GetContainerBounds().size(),
339 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
340 ++next_request_;
341 ++pending_tasks_;
342 }
343 }
344
345 private:
346 // Change behavior of the default host resolver to allow DNS lookup
347 // to proceed instead of being blocked by the test infrastructure.
348 void EnableDNSLookupForThisTest() {
349 // mock_host_resolver_override_ takes ownership of the resolver.
350 scoped_refptr<net::RuleBasedHostResolverProc> resolver =
351 new net::RuleBasedHostResolverProc(host_resolver());
352 resolver->AllowDirectLookup("*");
353 mock_host_resolver_override_.reset(
354 new net::ScopedDefaultHostResolverProc(resolver.get()));
355 }
356
357 // We need to reset the DNS lookup when we finish, or the test will fail.
358 void DisableDNSLookupForThisTest() {
359 mock_host_resolver_override_.reset();
360 }
361
362 void FinishRequest() {
363 --pending_tasks_;
364 if (next_request_ == requests_.size() && pending_tasks_ == 0) {
365 Finish();
366 } else {
367 PumpQueue();
368 }
369 }
370
371 void DoArticleOutput() {
372 const base::CommandLine& command_line =
373 *base::CommandLine::ForCurrentProcess();
374 for (size_t i = 0; i < requests_.size(); ++i) {
375 const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
376 if (command_line.HasSwitch(kShouldOutputBinary)) {
377 WriteProtobufWithSize(article, protobuf_output_stream_.get());
378 } else {
379 output_data_ += GetReadableArticleString(article) + "\n";
380 }
381 }
382
383 if (command_line.HasSwitch(kOutputFile)) {
384 base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile);
385 ASSERT_EQ(
386 (int)output_data_.size(),
387 base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
388 } else {
389 VLOG(0) << output_data_;
390 }
391 }
392
393 void Finish() {
394 DoArticleOutput();
395 requests_.clear();
396 service_.reset();
397 base::MessageLoop::current()->PostTask(
398 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
399 }
400
401 size_t pending_tasks_;
402 size_t max_tasks_;
403 size_t next_request_;
404
405 base::ScopedTempDir db_dir_;
406 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
407 scoped_ptr<DomDistillerService> service_;
408 ScopedVector<ContentExtractionRequest> requests_;
409
410 std::string output_data_;
411 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
412 };
413
414 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
415 Start();
416 base::RunLoop().Run();
417 }
418
419 } // namespace dom_distiller
OLDNEW
« no previous file with comments | « components/components_tests.gyp ('k') | components/dom_distiller/standalone/content_extractor_browsertest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698