Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1053)

Unified Diff: components/dom_distiller/standalone/content_extractor.cc

Issue 887803002: Provide original URLs for next page detection in dom_distiller (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: rebase Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | third_party/dom_distiller_js/README.chromium » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/dom_distiller/standalone/content_extractor.cc
diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc
index 1643b12ca84c2625539b558e39b3f4821998d4aa..b9766f6885fa2d005197fa5d2d1a3fbdbe7f3f05 100644
--- a/components/dom_distiller/standalone/content_extractor.cc
+++ b/components/dom_distiller/standalone/content_extractor.cc
@@ -40,21 +40,21 @@ namespace dom_distiller {
namespace {
-typedef base::hash_map<std::string, std::string> UrlToDomainMap;
+typedef base::hash_map<std::string, std::string> FileToUrlMap;
}
// Factory for creating a Distiller that creates different DomDistillerOptions
-// for different URLs, i.e. a specific kOriginalDomain option for each URL.
+// for different URLs, i.e. a specific kOriginalUrl option for each URL.
class TestDistillerFactoryImpl : public DistillerFactory {
public:
TestDistillerFactoryImpl(
scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
- const UrlToDomainMap& url_to_domain_map)
+ const FileToUrlMap& file_to_url_map)
: distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
dom_distiller_options_(dom_distiller_options),
- url_to_domain_map_(url_to_domain_map) {
+ file_to_url_map_(file_to_url_map) {
}
~TestDistillerFactoryImpl() override {}
@@ -62,8 +62,10 @@ class TestDistillerFactoryImpl : public DistillerFactory {
scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
dom_distiller::proto::DomDistillerOptions options;
options = dom_distiller_options_;
- UrlToDomainMap::const_iterator it = url_to_domain_map_.find(url.spec());
- if (it != url_to_domain_map_.end()) options.set_original_domain(it->second);
+ FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec());
+ if (it != file_to_url_map_.end()) {
+ options.set_original_url(it->second);
+ }
scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
*distiller_url_fetcher_factory_, options));
return distiller.Pass();
@@ -72,7 +74,7 @@ class TestDistillerFactoryImpl : public DistillerFactory {
private:
scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
- UrlToDomainMap url_to_domain_map_;
+ FileToUrlMap file_to_url_map_;
};
namespace {
@@ -99,12 +101,12 @@ const char* kExtractTextOnly = "extract-text-only";
// Indicates to include debug output.
const char* kDebugLevel = "debug-level";
-// The original domain of the page if |kUrlSwitch| is a file.
-const char* kOriginalDomain = "original-domain";
+// The original URL of the page if |kUrlSwitch| is a file.
+const char* kOriginalUrl = "original-url";
-// A semi-colon-separated (i.e. ';') list of original domains corresponding to
+// A space-separated (i.e. ' ') list of original URLs corresponding to
// "kUrlsSwitch".
-const char* kOriginalDomains = "original-domains";
+const char* kOriginalUrls = "original-urls";
// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
@@ -112,7 +114,7 @@ const int kMaxExtractorTasks = 8;
scoped_ptr<DomDistillerService> CreateDomDistillerService(
content::BrowserContext* context,
const base::FilePath& db_path,
- const UrlToDomainMap& url_to_domain_map) {
+ const FileToUrlMap& file_to_url_map) {
scoped_refptr<base::SequencedTaskRunner> background_task_runner =
content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
content::BrowserThread::GetBlockingPool()->GetSequenceToken());
@@ -145,7 +147,7 @@ scoped_ptr<DomDistillerService> CreateDomDistillerService(
scoped_ptr<DistillerFactory> distiller_factory(
new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
options,
- url_to_domain_map));
+ file_to_url_map));
// Setting up PrefService for DistilledPagePrefs.
user_prefs::TestingPrefServiceSyncable* pref_service =
@@ -227,7 +229,7 @@ class ContentExtractionRequest : public ViewRequestDelegate {
static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
const base::CommandLine& command_line,
- UrlToDomainMap* url_to_domain_map) {
+ FileToUrlMap* file_to_url_map) {
ScopedVector<ContentExtractionRequest> requests;
if (command_line.HasSwitch(kUrlSwitch)) {
GURL url;
@@ -235,31 +237,32 @@ class ContentExtractionRequest : public ViewRequestDelegate {
url = GURL(url_string);
if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url));
- if (command_line.HasSwitch(kOriginalDomain)) {
- (*url_to_domain_map)[url.spec()] =
- command_line.GetSwitchValueASCII(kOriginalDomain);
+ if (command_line.HasSwitch(kOriginalUrl)) {
+ (*file_to_url_map)[url.spec()] =
+ command_line.GetSwitchValueASCII(kOriginalUrl);
}
}
} else if (command_line.HasSwitch(kUrlsSwitch)) {
std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
std::vector<std::string> urls;
base::SplitString(urls_string, ' ', &urls);
- // Check for original-domains switch, which must exactly pair up with
- // |kUrlsSwitch| i.e. number of domains must be same as that of urls.
- std::vector<std::string> domains;
- if (command_line.HasSwitch(kOriginalDomains)) {
- std::string domains_string =
- command_line.GetSwitchValueASCII( kOriginalDomains);
- base::SplitString(domains_string, ';', &domains);
- if (domains.size() != urls.size()) domains.clear();
+ // Check for the original-urls switch, which must pair up exactly with
+ // |kUrlsSwitch|, i.e. the number of original URLs must be the same as the
+ // number of URLs.
+ std::vector<std::string> original_urls;
+ if (command_line.HasSwitch(kOriginalUrls)) {
+ std::string original_urls_string =
+ command_line.GetSwitchValueASCII(kOriginalUrls);
+ base::SplitString(original_urls_string, ' ', &original_urls);
+ if (original_urls.size() != urls.size()) original_urls.clear();
}
for (size_t i = 0; i < urls.size(); ++i) {
GURL url(urls[i]);
if (url.is_valid()) {
requests.push_back(new ContentExtractionRequest(url));
- // Only regard non-empty domain.
- if (!domains.empty() && !domains[i].empty()) {
- (*url_to_domain_map)[url.spec()] = domains[i];
+ // Only regard non-empty original urls.
+ if (!original_urls.empty() && !original_urls[i].empty()) {
+ (*file_to_url_map)[url.spec()] = original_urls[i];
}
} else {
ADD_FAILURE() << "Bad url";
@@ -320,14 +323,14 @@ class ContentExtractor : public ContentBrowserTest {
void Start() {
const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess();
- UrlToDomainMap url_to_domain_map;
+ FileToUrlMap file_to_url_map;
requests_ = ContentExtractionRequest::CreateForCommandLine(
- command_line, &url_to_domain_map);
+ command_line, &file_to_url_map);
content::BrowserContext* context =
shell()->web_contents()->GetBrowserContext();
service_ = CreateDomDistillerService(context,
db_dir_.path(),
- url_to_domain_map);
+ file_to_url_map);
PumpQueue();
}
« no previous file with comments | « no previous file | third_party/dom_distiller_js/README.chromium » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698