Index: components/dom_distiller/standalone/content_extractor.cc |
diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc |
index b6a5a4e98995f031a480efbf158ac52d48a2b8c7..697152b92f50407e6f40283930643b456887a2d3 100644 |
--- a/components/dom_distiller/standalone/content_extractor.cc |
+++ b/components/dom_distiller/standalone/content_extractor.cc |
@@ -6,6 +6,7 @@ |
#include "base/command_line.h" |
#include "base/files/scoped_temp_dir.h" |
+#include "base/id_map.h" |
#include "base/message_loop/message_loop.h" |
#include "base/path_service.h" |
#include "base/run_loop.h" |
@@ -39,6 +40,43 @@ namespace dom_distiller { |
namespace { |
+// Maps the spec of a URL to be distilled to the original domain that was given |
+// for that URL on the command line. |
+typedef base::hash_map<std::string, std::string> UrlToDomainMap; |
+ |
+} |
+ |
+// DistillerFactory for the standalone extractor. Every Distiller it creates |
+// starts from the same base DomDistillerOptions; when the requested URL has an |
+// entry in the URL-to-domain map, that entry is set as the options' original |
+// domain, so each URL can carry its own kOriginalDomain value. |
+class TestDistillerFactoryImpl : public DistillerFactory { |
+ public: |
+  // Takes ownership of |distiller_url_fetcher_factory| and keeps copies of |
+  // |dom_distiller_options| and |url_to_domain_map|. |
+  TestDistillerFactoryImpl( |
+      scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory, |
+      const dom_distiller::proto::DomDistillerOptions& dom_distiller_options, |
+      const UrlToDomainMap& url_to_domain_map) |
+      : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), |
+        dom_distiller_options_(dom_distiller_options), |
+        url_to_domain_map_(url_to_domain_map) { |
+  } |
+ |
+  ~TestDistillerFactoryImpl() override {} |
+ |
+  // Builds a DistillerImpl for |url|. The lookup key is url.spec(); a URL with |
+  // no entry in the map gets the base options unchanged. |
+  scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override { |
+    dom_distiller::proto::DomDistillerOptions per_url_options( |
+        dom_distiller_options_); |
+    const UrlToDomainMap::const_iterator match = |
+        url_to_domain_map_.find(url.spec()); |
+    if (match != url_to_domain_map_.end()) { |
+      per_url_options.set_original_domain(match->second); |
+    } |
+    return scoped_ptr<Distiller>(new DistillerImpl( |
+        *distiller_url_fetcher_factory_, per_url_options)); |
+  } |
+ |
+ private: |
+  // Fetcher factory handed (by reference) to every DistillerImpl created. |
+  scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_; |
+  // Base options copied into each per-URL options object. |
+  dom_distiller::proto::DomDistillerOptions dom_distiller_options_; |
+  // URL spec -> original domain overrides. |
+  UrlToDomainMap url_to_domain_map_; |
+}; |
+ |
+namespace { |
+ |
// The url to distill. |
const char* kUrlSwitch = "url"; |
@@ -61,12 +99,20 @@ const char* kExtractTextOnly = "extract-text-only"; |
// Indicates to include debug output. |
const char* kDebugLevel = "debug-level"; |
+// The original domain of the page if |kUrlSwitch| is a file. |
+const char* kOriginalDomain = "original-domain"; |
+ |
+// A semicolon-separated (i.e. ';') list of original domains, one for each url |
+// in |kUrlsSwitch|. |
+const char* kOriginalDomains = "original-domains"; |
+ |
// Maximum number of concurrent started extractor requests. |
const int kMaxExtractorTasks = 8; |
scoped_ptr<DomDistillerService> CreateDomDistillerService( |
content::BrowserContext* context, |
- const base::FilePath& db_path) { |
+ const base::FilePath& db_path, |
+ const UrlToDomainMap& url_to_domain_map) { |
scoped_refptr<base::SequencedTaskRunner> background_task_runner = |
content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner( |
content::BrowserThread::GetBlockingPool()->GetSequenceToken()); |
@@ -97,7 +143,9 @@ scoped_ptr<DomDistillerService> CreateDomDistillerService( |
options.set_debug_level(debug_level); |
} |
scoped_ptr<DistillerFactory> distiller_factory( |
- new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); |
+ new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), |
+ options, |
+ url_to_domain_map)); |
// Setting up PrefService for DistilledPagePrefs. |
user_prefs::TestingPrefServiceSyncable* pref_service = |
@@ -177,7 +225,8 @@ class ContentExtractionRequest : public ViewRequestDelegate { |
} |
static ScopedVector<ContentExtractionRequest> CreateForCommandLine( |
- const CommandLine& command_line) { |
+ const CommandLine& command_line, |
+ UrlToDomainMap* url_to_domain_map) { |
ScopedVector<ContentExtractionRequest> requests; |
if (command_line.HasSwitch(kUrlSwitch)) { |
GURL url; |
@@ -185,15 +234,32 @@ class ContentExtractionRequest : public ViewRequestDelegate { |
url = GURL(url_string); |
if (url.is_valid()) { |
requests.push_back(new ContentExtractionRequest(url)); |
+ if (command_line.HasSwitch(kOriginalDomain)) { |
+ (*url_to_domain_map)[url.spec()] = |
+ command_line.GetSwitchValueASCII(kOriginalDomain); |
+ } |
} |
} else if (command_line.HasSwitch(kUrlsSwitch)) { |
std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); |
std::vector<std::string> urls; |
base::SplitString(urls_string, ' ', &urls); |
+ // Check for the original-domains switch, which must pair up exactly with |
+ // |kUrlsSwitch|, i.e. there must be as many domains as urls. If the counts |
+ // differ, all of the domains are ignored. |
+ std::vector<std::string> domains; |
+ if (command_line.HasSwitch(kOriginalDomains)) { |
+ std::string domains_string = |
+ command_line.GetSwitchValueASCII( kOriginalDomains); |
+ base::SplitString(domains_string, ';', &domains); |
+ if (domains.size() != urls.size()) domains.clear(); |
+ } |
for (size_t i = 0; i < urls.size(); ++i) { |
GURL url(urls[i]); |
if (url.is_valid()) { |
requests.push_back(new ContentExtractionRequest(url)); |
+ // Only record non-empty domains. |
+ if (!domains.empty() && !domains[i].empty()) { |
+ (*url_to_domain_map)[url.spec()] = domains[i]; |
+ } |
} else { |
ADD_FAILURE() << "Bad url"; |
} |
@@ -255,12 +321,15 @@ class ContentExtractor : public ContentBrowserTest { |
// Creates the DomDistillerService and creates and starts the extraction |
// request. |
void Start() { |
+ const CommandLine& command_line = *CommandLine::ForCurrentProcess(); |
+ // Filled in by CreateForCommandLine() below and then handed to the service, |
+ // so the requests have to be created before the service is. |
+ UrlToDomainMap url_to_domain_map; |
+ requests_ = ContentExtractionRequest::CreateForCommandLine( |
+ command_line, &url_to_domain_map); |
content::BrowserContext* context = |
shell()->web_contents()->GetBrowserContext(); |
service_ = CreateDomDistillerService(context, |
- db_dir_.path()); |
- const CommandLine& command_line = *CommandLine::ForCurrentProcess(); |
- requests_ = ContentExtractionRequest::CreateForCommandLine(command_line); |
+ db_dir_.path(), |
+ url_to_domain_map); |
PumpQueue(); |
} |