Chromium Code Reviews| Index: components/precache/core/precache_fetcher.cc |
| diff --git a/components/precache/core/precache_fetcher.cc b/components/precache/core/precache_fetcher.cc |
| index 8f45bff3136821d26dcf92c8a563c76179d90e90..536420c67b568cd8ae183e3bfa3f3e9a5b787b67 100644 |
| --- a/components/precache/core/precache_fetcher.cc |
| +++ b/components/precache/core/precache_fetcher.cc |
| @@ -6,6 +6,7 @@ |
| #include <algorithm> |
| #include <limits> |
| +#include <set> |
| #include <utility> |
| #include <vector> |
| @@ -53,11 +54,14 @@ const int kNoTracking = |
| net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | |
| net::LOAD_DO_NOT_SEND_AUTH_DATA; |
| -namespace { |
| - |
| // The maximum number of URLFetcher requests that can be in flight in parallel. |
| +// Note that OnManifestFetchComplete and OnResourceFetchComplete perform |
| +// remove_if operations which are O(kMaxParallelFetches). Those should be |
| +// optimized before increasing this value significantly. |
| const int kMaxParallelFetches = 10; |
| +namespace { |
| + |
| // The maximum for the Precache.Fetch.ResponseBytes.* histograms. We set this to |
| // a number we expect to be in the 99th percentile for the histogram, give or |
| // take. |
| @@ -183,25 +187,26 @@ std::string GetResourceURLBase64Hash(const std::vector<GURL>& urls) { |
| // hosts in |hosts_to_fetch|, is added to |hosts_info|. |
| std::deque<ManifestHostInfo> RetrieveManifestInfo( |
| const base::WeakPtr<PrecacheDatabase>& precache_database, |
| - std::vector<std::string> hosts_to_fetch) { |
| + std::vector<std::pair<std::string, int64_t>> hosts_to_fetch) { |
| + VLOG(9) << "RetrieveManifestInfo"; |
| std::deque<ManifestHostInfo> hosts_info; |
| if (!precache_database) |
| return hosts_info; |
| for (const auto& host : hosts_to_fetch) { |
| - auto referrer_host_info = precache_database->GetReferrerHost(host); |
| + auto referrer_host_info = precache_database->GetReferrerHost(host.first); |
| if (referrer_host_info.id != PrecacheReferrerHostEntry::kInvalidId) { |
| std::vector<GURL> used_urls, unused_urls; |
| precache_database->GetURLListForReferrerHost(referrer_host_info.id, |
| &used_urls, &unused_urls); |
| hosts_info.push_back( |
| - ManifestHostInfo(referrer_host_info.manifest_id, host, |
| - GetResourceURLBase64Hash(used_urls), |
| + ManifestHostInfo(referrer_host_info.manifest_id, host.first, |
| + host.second, GetResourceURLBase64Hash(used_urls), |
| GetResourceURLBase64Hash(unused_urls))); |
| } else { |
| hosts_info.push_back( |
| - ManifestHostInfo(PrecacheReferrerHostEntry::kInvalidId, host, |
| - std::string(), std::string())); |
| + ManifestHostInfo(PrecacheReferrerHostEntry::kInvalidId, host.first, |
| + host.second, std::string(), std::string())); |
| } |
| } |
| return hosts_info; |
| @@ -209,6 +214,7 @@ std::deque<ManifestHostInfo> RetrieveManifestInfo( |
| PrecacheQuota RetrieveQuotaInfo( |
| const base::WeakPtr<PrecacheDatabase>& precache_database) { |
| + VLOG(9) << "RetrieveQuotaInfo"; |
| PrecacheQuota quota; |
| if (precache_database) { |
| quota = precache_database->GetQuota(); |
| @@ -363,30 +369,24 @@ void PrecacheFetcher::RecordCompletionStatistics( |
| base::TimeDelta::FromSeconds(1), |
| base::TimeDelta::FromHours(4), 50); |
| - // Number of manifests for which we have downloaded all resources. |
| - int manifests_completed = |
| - unfinished_work.num_manifest_urls() - remaining_manifest_urls_to_fetch; |
| + int num_total_resources = unfinished_work.num_resource_urls(); |
| + int percent_completed = |
| + num_total_resources == 0 |
| + ? 0 |
| + : (100 * (static_cast<double>(num_total_resources - |
| + remaining_resource_urls_to_fetch) / |
| + num_total_resources)); |
| - // If there are resource URLs left to fetch, the last manifest is not yet |
| - // completed. |
| - if (remaining_resource_urls_to_fetch > 0) |
| - --manifests_completed; |
| - |
| - DCHECK_GE(manifests_completed, 0); |
| - int percent_completed = unfinished_work.num_manifest_urls() == 0 |
| - ? 0 |
| - : (static_cast<double>(manifests_completed) / |
| - unfinished_work.num_manifest_urls() * 100); |
| + VLOG(6) << "Percent completed: " << percent_completed; |
|
bengr
2016/10/14 21:52:19
Do you need these VLOG statements?
twifkak
2016/10/14 22:41:45
They were very helpful during debugging, but I can…
bengr
2016/10/18 18:57:20
Chromium style is to remove logging statements, so…
twifkak
2016/10/18 19:52:27
Done.
|
| UMA_HISTOGRAM_PERCENTAGE("Precache.Fetch.PercentCompleted", |
| percent_completed); |
| - UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total", |
| - unfinished_work.total_bytes(), |
| - 1, kMaxResponseBytes, 100); |
| + UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total", |
| + unfinished_work.total_bytes(), 1, |
| + kMaxResponseBytes, 100); |
| UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Network", |
| - unfinished_work.network_bytes(), |
| - 1, kMaxResponseBytes, |
| - 100); |
| + unfinished_work.network_bytes(), 1, |
| + kMaxResponseBytes, 100); |
| } |
| // static |
| @@ -399,6 +399,7 @@ PrecacheFetcher::PrecacheFetcher( |
| net::URLRequestContextGetter* request_context, |
| const GURL& config_url, |
| const std::string& manifest_url_prefix, |
| + bool global_ranking, |
| std::unique_ptr<PrecacheUnfinishedWork> unfinished_work, |
| uint32_t experiment_id, |
| const base::WeakPtr<PrecacheDatabase>& precache_database, |
| @@ -407,6 +408,7 @@ PrecacheFetcher::PrecacheFetcher( |
| : request_context_(request_context), |
| config_url_(config_url), |
| manifest_url_prefix_(manifest_url_prefix), |
| + global_ranking_(global_ranking), |
| precache_database_(precache_database), |
| db_task_runner_(std::move(db_task_runner)), |
| precache_delegate_(precache_delegate), |
| @@ -426,8 +428,10 @@ PrecacheFetcher::PrecacheFetcher( |
| // keeping track of the current resource index. |
| for (const auto& resource : unfinished_work->resource()) { |
| if (resource.has_url() && resource.has_top_host_name()) { |
| + // Weight doesn't matter, as the resources have already been sorted by |
| + // this point. |
| resources_to_fetch_.emplace_back(GURL(resource.url()), |
| - resource.top_host_name()); |
| + resource.top_host_name(), 0); |
| } |
| } |
| unfinished_work_ = std::move(unfinished_work); |
| @@ -446,28 +450,26 @@ std::unique_ptr<PrecacheUnfinishedWork> PrecacheFetcher::CancelPrecaching() { |
| // If config fetch is incomplete, |top_hosts_to_fetch_| will be empty and |
| // top hosts should be left as is in |unfinished_work_|. |
| unfinished_work_->clear_top_host(); |
| + for (const auto& top_host : top_hosts_fetching_) { |
|
bengr
2016/10/14 21:52:19
Remove curly braces.
twifkak
2016/10/14 22:41:45
Done.
|
| + unfinished_work_->add_top_host()->set_hostname(top_host.hostname); |
| + } |
| for (const auto& top_host : top_hosts_to_fetch_) { |
|
bengr
2016/10/14 21:52:19
Remove curly braces.
twifkak
2016/10/14 22:41:45
Done.
|
| unfinished_work_->add_top_host()->set_hostname(top_host.hostname); |
| } |
| } |
| + for (const auto& resource : resources_fetching_) { |
| + auto new_resource = unfinished_work_->add_resource(); |
| + new_resource->set_url(resource.url.spec()); |
| + new_resource->set_top_host_name(resource.referrer); |
| + } |
| for (const auto& resource : resources_to_fetch_) { |
| auto new_resource = unfinished_work_->add_resource(); |
| - new_resource->set_url(resource.first.spec()); |
| - new_resource->set_top_host_name(resource.second); |
| - } |
| - for (const auto& it : pool_.elements()) { |
| - const Fetcher* fetcher = it.first; |
| - GURL config_url = |
| - config_url_.is_empty() ? GetDefaultConfigURL() : config_url_; |
| - if (fetcher->is_resource_request()) { |
| - auto resource = unfinished_work_->add_resource(); |
| - resource->set_url(fetcher->url().spec()); |
| - resource->set_top_host_name(fetcher->referrer()); |
| - } else if (fetcher->url() != config_url) { |
| - unfinished_work_->add_top_host()->set_hostname(fetcher->referrer()); |
| - } |
| + new_resource->set_url(resource.url.spec()); |
| + new_resource->set_top_host_name(resource.referrer); |
| } |
| + top_hosts_fetching_.clear(); |
| top_hosts_to_fetch_.clear(); |
| + resources_fetching_.clear(); |
| resources_to_fetch_.clear(); |
| pool_.DeleteAll(); |
| return std::move(unfinished_work_); |
| @@ -498,40 +500,43 @@ void PrecacheFetcher::Start() { |
| void PrecacheFetcher::StartNextResourceFetch() { |
| DCHECK(unfinished_work_->has_config_settings()); |
| while (!resources_to_fetch_.empty() && pool_.IsAvailable()) { |
| - const auto& resource = resources_to_fetch_.front(); |
| + ResourceInfo& resource = resources_to_fetch_.front(); |
| const size_t max_bytes = std::min( |
| quota_.remaining(), |
| std::min(unfinished_work_->config_settings().max_bytes_per_resource(), |
| unfinished_work_->config_settings().max_bytes_total() - |
| unfinished_work_->total_bytes())); |
| - VLOG(3) << "Fetching " << resource.first << " " << resource.second; |
| + VLOG(3) << "Fetching " << resource.url << " " << resource.referrer; |
| pool_.Add(base::MakeUnique<Fetcher>( |
| - request_context_.get(), resource.first, resource.second, |
| + request_context_.get(), resource.url, resource.referrer, |
| base::Bind(&PrecacheFetcher::OnResourceFetchComplete, AsWeakPtr()), |
| true /* is_resource_request */, max_bytes)); |
| + resources_fetching_.push_back(std::move(resource)); |
| resources_to_fetch_.pop_front(); |
| } |
| } |
| -void PrecacheFetcher::StartNextManifestFetch() { |
| - if (top_hosts_to_fetch_.empty() || !pool_.IsAvailable()) |
| - return; |
| - |
| - // We only fetch one manifest at a time to keep the size of |
| - // resources_to_fetch_ as small as possible. |
| - VLOG(3) << "Fetching " << top_hosts_to_fetch_.front().manifest_url; |
| - pool_.Add(base::MakeUnique<Fetcher>( |
| - request_context_.get(), top_hosts_to_fetch_.front().manifest_url, |
| - top_hosts_to_fetch_.front().hostname, |
| - base::Bind(&PrecacheFetcher::OnManifestFetchComplete, AsWeakPtr()), |
| - false /* is_resource_request */, std::numeric_limits<int32_t>::max())); |
| - top_hosts_to_fetch_.pop_front(); |
| +void PrecacheFetcher::StartNextManifestFetches() { |
| + // We fetch as many manifests at a time as possible, as we need all resource |
| + // URLs in memory in order to rank them. |
| + while (!top_hosts_to_fetch_.empty() && pool_.IsAvailable()) { |
| + ManifestHostInfo& top_host = top_hosts_to_fetch_.front(); |
| + VLOG(3) << "Fetching " << top_host.manifest_url; |
|
bengr
2016/10/14 21:52:19
Do we need the VLOG (here and below)?
twifkak
2016/10/14 22:41:45
This VLOG is not new. The ones I added are all --v…
|
| + pool_.Add(base::MakeUnique<Fetcher>( |
| + request_context_.get(), top_host.manifest_url, top_host.hostname, |
| + base::Bind(&PrecacheFetcher::OnManifestFetchComplete, AsWeakPtr(), |
| + top_host.visits), |
| + false /* is_resource_request */, std::numeric_limits<int32_t>::max())); |
| + top_hosts_fetching_.push_back(std::move(top_host)); |
| + top_hosts_to_fetch_.pop_front(); |
| + } |
| } |
| void PrecacheFetcher::NotifyDone( |
| size_t remaining_manifest_urls_to_fetch, |
| size_t remaining_resource_urls_to_fetch) { |
| + VLOG(9) << "NotifyDone"; |
| RecordCompletionStatistics(*unfinished_work_, |
| remaining_manifest_urls_to_fetch, |
| remaining_resource_urls_to_fetch); |
| @@ -539,29 +544,21 @@ void PrecacheFetcher::NotifyDone( |
| } |
| void PrecacheFetcher::StartNextFetch() { |
| + VLOG(9) << "StartNextFetch"; |
| DCHECK(unfinished_work_->has_config_settings()); |
| // If over the precache total size cap or daily quota, then stop prefetching. |
| if ((unfinished_work_->total_bytes() > |
| unfinished_work_->config_settings().max_bytes_total()) || |
| quota_.remaining() == 0) { |
| - size_t pending_manifests_in_pool = 0; |
| - size_t pending_resources_in_pool = 0; |
| - for (const auto& element_pair : pool_.elements()) { |
| - const Fetcher* fetcher = element_pair.first; |
| - if (fetcher->is_resource_request()) |
| - pending_resources_in_pool++; |
| - else if (fetcher->url() != config_url_) |
| - pending_manifests_in_pool++; |
| - } |
| pool_.DeleteAll(); |
| - NotifyDone(top_hosts_to_fetch_.size() + pending_manifests_in_pool, |
| - resources_to_fetch_.size() + pending_resources_in_pool); |
| + NotifyDone(top_hosts_to_fetch_.size() + top_hosts_fetching_.size(), |
| + resources_to_fetch_.size() + resources_fetching_.size()); |
| return; |
| } |
| StartNextResourceFetch(); |
| - StartNextManifestFetch(); |
| + StartNextManifestFetches(); |
| if (top_hosts_to_fetch_.empty() && resources_to_fetch_.empty() && |
| pool_.IsEmpty()) { |
| // There are no more URLs to fetch, so end the precache cycle. |
| @@ -572,6 +569,7 @@ void PrecacheFetcher::StartNextFetch() { |
| } |
| void PrecacheFetcher::OnConfigFetchComplete(const Fetcher& source) { |
| + VLOG(9) << "OnConfigFetchComplete"; |
| UpdateStats(source.response_bytes(), source.network_response_bytes()); |
| if (source.network_url_fetcher() == nullptr) { |
| pool_.DeleteAll(); // Cancel any other ongoing request. |
| @@ -589,9 +587,7 @@ void PrecacheFetcher::OnConfigFetchComplete(const Fetcher& source) { |
| void PrecacheFetcher::DetermineManifests() { |
| DCHECK(unfinished_work_->has_config_settings()); |
| - std::vector<std::string> top_hosts_to_fetch; |
| - std::unique_ptr<std::deque<ManifestHostInfo>> top_hosts_info( |
| - new std::deque<ManifestHostInfo>); |
| + std::vector<std::pair<std::string, int64_t>> top_hosts_to_fetch; |
| // Keep track of manifest URLs that are being fetched, in order to elide |
| // duplicates. |
| std::set<base::StringPiece> seen_top_hosts; |
| @@ -602,7 +598,7 @@ void PrecacheFetcher::DetermineManifests() { |
| if (rank > unfinished_work_->config_settings().top_sites_count()) |
| break; |
| if (seen_top_hosts.insert(host.hostname()).second) |
| - top_hosts_to_fetch.push_back(host.hostname()); |
| + top_hosts_to_fetch.emplace_back(host.hostname(), host.visits()); |
| } |
| // Attempt to fetch manifests for starting hosts up to the maximum top sites |
| @@ -613,12 +609,15 @@ void PrecacheFetcher::DetermineManifests() { |
| if (resources_to_fetch_.empty()) { |
| for (const std::string& host : |
| unfinished_work_->config_settings().forced_site()) { |
| + // We add a forced site with visits == 0, which means its resources will |
| + // be downloaded last. TODO(twifkak): Consider removing support for |
| + // forced_site. |
| if (seen_top_hosts.insert(host).second) |
| - top_hosts_to_fetch.push_back(host); |
| + top_hosts_to_fetch.emplace_back(host, 0); |
| } |
| } |
| - // We only fetch one manifest at a time to keep the size of |
| - // resources_to_fetch_ as small as possible. |
| + // We retrieve manifest usage and quota info from the local database before |
| + // fetching the manifests. |
| PostTaskAndReplyWithResult( |
| db_task_runner_.get(), FROM_HERE, |
| base::Bind(&RetrieveManifestInfo, precache_database_, |
| @@ -628,6 +627,7 @@ void PrecacheFetcher::DetermineManifests() { |
| void PrecacheFetcher::OnManifestInfoRetrieved( |
| std::deque<ManifestHostInfo> manifests_info) { |
| + VLOG(9) << "OnManifestInfoRetrieved"; |
| const std::string prefix = manifest_url_prefix_.empty() |
| ? GetDefaultManifestURLPrefix() |
| : manifest_url_prefix_; |
| @@ -636,7 +636,7 @@ void PrecacheFetcher::OnManifestInfoRetrieved( |
| // is invalid. |
| top_hosts_to_fetch_.clear(); |
| unfinished_work_->set_num_manifest_urls(manifests_info.size()); |
| - NotifyDone(manifests_info.size(), resources_to_fetch_.size()); |
| + NotifyDone(manifests_info.size(), resources_to_rank_.size()); |
| return; |
| } |
| @@ -666,6 +666,7 @@ void PrecacheFetcher::OnManifestInfoRetrieved( |
| } |
| void PrecacheFetcher::OnQuotaInfoRetrieved(const PrecacheQuota& quota) { |
| + VLOG(9) << "OnQuotaInfoRetrieved"; |
| quota_ = quota; |
| base::Time time_now = base::Time::Now(); |
| if (IsQuotaTimeExpired(quota_, time_now)) { |
| @@ -683,10 +684,12 @@ void PrecacheFetcher::OnQuotaInfoRetrieved(const PrecacheQuota& quota) { |
| ManifestHostInfo::ManifestHostInfo(int64_t manifest_id, |
| const std::string& hostname, |
| + int64_t visits, |
| const std::string& used_url_hash, |
| const std::string& unused_url_hash) |
| : manifest_id(manifest_id), |
| hostname(hostname), |
| + visits(visits), |
| used_url_hash(used_url_hash), |
| unused_url_hash(unused_url_hash) {} |
| @@ -696,7 +699,20 @@ ManifestHostInfo::ManifestHostInfo(ManifestHostInfo&&) = default; |
| ManifestHostInfo& ManifestHostInfo::operator=(ManifestHostInfo&&) = default; |
| -void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) { |
| +ResourceInfo::ResourceInfo(const GURL& url, |
| + const std::string& referrer, |
| + double weight) |
| + : url(url), referrer(referrer), weight(weight) {} |
| + |
| +ResourceInfo::~ResourceInfo() {} |
| + |
| +ResourceInfo::ResourceInfo(ResourceInfo&&) = default; |
| + |
| +ResourceInfo& ResourceInfo::operator=(ResourceInfo&&) = default; |
| + |
| +void PrecacheFetcher::OnManifestFetchComplete(int64_t host_visits, |
| + const Fetcher& source) { |
| + VLOG(9) << "OnManifestFetchComplete " << source.referrer(); |
| DCHECK(unfinished_work_->has_config_settings()); |
| UpdateStats(source.response_bytes(), source.network_response_bytes()); |
| if (source.network_url_fetcher() == nullptr) { |
| @@ -705,9 +721,10 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) { |
| PrecacheManifest manifest; |
| if (ParseProtoFromFetchResponse(*source.network_url_fetcher(), &manifest)) { |
| - const int32_t len = |
| - std::min(manifest.resource_size(), |
| - unfinished_work_->config_settings().top_resources_count()); |
| + int32_t len = manifest.resource_size(); |
| + if (!global_ranking_) |
|
bengr
2016/10/14 21:52:19
Add curly braces.
twifkak
2016/10/14 22:41:45
Done.
|
| + len = std::min( |
| + len, unfinished_work_->config_settings().top_resources_count()); |
| const uint64_t resource_bitset = |
| GetResourceBitset(manifest, experiment_id_); |
| for (int i = 0; i < len; ++i) { |
| @@ -715,7 +732,10 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) { |
| manifest.resource(i).has_url()) { |
| GURL url(manifest.resource(i).url()); |
| if (url.is_valid()) { |
| - resources_to_fetch_.emplace_back(url, source.referrer()); |
| + VLOG(9) << "Adding resource " << url.spec(); |
| + double weight = manifest.resource(i).weight_ratio() * host_visits; |
| + if (weight >= unfinished_work_->config_settings().min_weight()) |
| + resources_to_rank_.emplace_back(url, source.referrer(), weight); |
| } |
| } |
| } |
| @@ -726,7 +746,36 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) { |
| } |
| } |
| + top_hosts_fetching_.remove_if([&source](const ManifestHostInfo& top_host) { |
| + return top_host.manifest_url == source.url(); |
| + }); |
| + |
| pool_.Delete(source); |
| + |
| + if (top_hosts_to_fetch_.empty() && top_hosts_fetching_.empty()) { |
| + VLOG(9) << "Ranking resources."; |
| + // Done fetching manifests. Now sort resources_to_rank_ into |
| + // resources_to_fetch_, by descending weight. When StartNextFetch runs, it |
| + // will begin fetching resources. |
| + resources_to_fetch_ = std::move(resources_to_rank_); |
| + if (global_ranking_) { |
| + std::stable_sort( |
| + resources_to_fetch_.begin(), resources_to_fetch_.end(), |
| + [](const ResourceInfo& first, const ResourceInfo& second) { |
| + return first.weight > second.weight; |
| + }); |
| + } |
| + // Truncate to size |total_resources_count|. |
| + const size_t num_resources = std::min( |
| + resources_to_fetch_.size(), |
| + static_cast<size_t>( |
| + unfinished_work_->config_settings().total_resources_count())); |
| + resources_to_fetch_.erase(resources_to_fetch_.begin() + num_resources, |
| + resources_to_fetch_.end()); |
| + // Save denominator for PercentCompleted UMA. |
| + unfinished_work_->set_num_resource_urls(resources_to_fetch_.size()); |
| + } |
| + |
| StartNextFetch(); |
| } |
| @@ -739,6 +788,10 @@ void PrecacheFetcher::OnResourceFetchComplete(const Fetcher& source) { |
| source.url(), source.referrer(), base::Time::Now(), |
| source.was_cached(), source.response_bytes())); |
| + resources_fetching_.remove_if([&source](const ResourceInfo& resource) { |
| + return resource.url == source.url(); |
| + }); |
| + |
| pool_.Delete(source); |
| // The resource has already been put in the cache during the fetch process, so |