components/precache/core/precache_fetcher.cc - Issue 2403193002: Precache: Optionally rank resources-to-precache globally.

Unified Diff: components/precache/core/precache_fetcher.cc

Issue 2403193002: Precache: Optionally rank resources-to-precache globally. (Closed)

Patch Set: Code readability improvements per bengr. Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: components/precache/core/precache_fetcher.cc

diff --git a/components/precache/core/precache_fetcher.cc b/components/precache/core/precache_fetcher.cc

index 8f45bff3136821d26dcf92c8a563c76179d90e90..9a8fa6de738bddfb9b97cb12d6c5760f57d417df 100644

--- a/components/precache/core/precache_fetcher.cc

+++ b/components/precache/core/precache_fetcher.cc

@@ -6,6 +6,7 @@

#include <algorithm>

#include <limits>

+#include <set>

#include <utility>

#include <vector>

@@ -53,11 +54,14 @@ const int kNoTracking =

net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |

net::LOAD_DO_NOT_SEND_AUTH_DATA;

-namespace {

// The maximum number of URLFetcher requests that can be on flight in parallel.

bengr 2016/10/18 18:57:20 nit: on -> in

twifkak 2016/10/18 19:52:27 Done.

+// Note that OnManifestFetchComplete and OnResourceFetchComplete perform

+// remove_if operations which are O(kMaxParallelFetches). Those should be

+// optimized before increasing this value significantly.

const int kMaxParallelFetches = 10;

+namespace {

// The maximum for the Precache.Fetch.ResponseBytes.* histograms. We set this to

// a number we expect to be in the 99th percentile for the histogram, give or

// take.

@@ -183,25 +187,26 @@ std::string GetResourceURLBase64Hash(const std::vector<GURL>& urls) {

// hosts in |hosts_to_fetch|, is added to |hosts_info|.

std::deque<ManifestHostInfo> RetrieveManifestInfo(

const base::WeakPtr<PrecacheDatabase>& precache_database,

- std::vector<std::string> hosts_to_fetch) {

+ std::vector<std::pair<std::string, int64_t>> hosts_to_fetch) {

+ VLOG(9) << "RetrieveManifestInfo";

std::deque<ManifestHostInfo> hosts_info;

if (!precache_database)

return hosts_info;

for (const auto& host : hosts_to_fetch) {

- auto referrer_host_info = precache_database->GetReferrerHost(host);

+ auto referrer_host_info = precache_database->GetReferrerHost(host.first);

if (referrer_host_info.id != PrecacheReferrerHostEntry::kInvalidId) {

std::vector<GURL> used_urls, unused_urls;

precache_database->GetURLListForReferrerHost(referrer_host_info.id,

&used_urls, &unused_urls);

hosts_info.push_back(

- ManifestHostInfo(referrer_host_info.manifest_id, host,

- GetResourceURLBase64Hash(used_urls),

+ ManifestHostInfo(referrer_host_info.manifest_id, host.first,

+ host.second, GetResourceURLBase64Hash(used_urls),

GetResourceURLBase64Hash(unused_urls)));

} else {

hosts_info.push_back(

- ManifestHostInfo(PrecacheReferrerHostEntry::kInvalidId, host,

- std::string(), std::string()));

+ ManifestHostInfo(PrecacheReferrerHostEntry::kInvalidId, host.first,

+ host.second, std::string(), std::string()));

}

return hosts_info;

@@ -209,6 +214,7 @@ std::deque<ManifestHostInfo> RetrieveManifestInfo(

PrecacheQuota RetrieveQuotaInfo(

const base::WeakPtr<PrecacheDatabase>& precache_database) {

+ VLOG(9) << "RetrieveQuotaInfo";

PrecacheQuota quota;

if (precache_database) {

quota = precache_database->GetQuota();

@@ -363,30 +369,24 @@ void PrecacheFetcher::RecordCompletionStatistics(

base::TimeDelta::FromSeconds(1),

base::TimeDelta::FromHours(4), 50);

- // Number of manifests for which we have downloaded all resources.

- int manifests_completed =

- unfinished_work.num_manifest_urls() - remaining_manifest_urls_to_fetch;

+ int num_total_resources = unfinished_work.num_resource_urls();

+ int percent_completed =

+ num_total_resources == 0

bengr 2016/10/18 18:57:20 I wonder if we should be reporting num_total_resou

twifkak 2016/10/18 19:52:27 I'm amenable to that. My intuition is that this is

bengr 2016/10/18 21:50:31 sgtm

twifkak 2016/10/18 22:55:47 Done.

+ ? 0

+ : (100 * (static_cast<double>(num_total_resources -

+ remaining_resource_urls_to_fetch) /

+ num_total_resources));

- // If there are resource URLs left to fetch, the last manifest is not yet

- // completed.

- if (remaining_resource_urls_to_fetch > 0)

- --manifests_completed;

- DCHECK_GE(manifests_completed, 0);

- int percent_completed = unfinished_work.num_manifest_urls() == 0

- ? 0

- : (static_cast<double>(manifests_completed) /

- unfinished_work.num_manifest_urls() * 100);

+ VLOG(6) << "Percent completed: " << percent_completed;

UMA_HISTOGRAM_PERCENTAGE("Precache.Fetch.PercentCompleted",

percent_completed);

- UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total",

- unfinished_work.total_bytes(),

- 1, kMaxResponseBytes, 100);

+ UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total",

+ unfinished_work.total_bytes(), 1,

+ kMaxResponseBytes, 100);

UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Network",

- unfinished_work.network_bytes(),

- 1, kMaxResponseBytes,

- 100);

+ unfinished_work.network_bytes(), 1,

+ kMaxResponseBytes, 100);

}

// static

@@ -399,6 +399,7 @@ PrecacheFetcher::PrecacheFetcher(

net::URLRequestContextGetter* request_context,

const GURL& config_url,

const std::string& manifest_url_prefix,

+ bool global_ranking,

std::unique_ptr<PrecacheUnfinishedWork> unfinished_work,

uint32_t experiment_id,

const base::WeakPtr<PrecacheDatabase>& precache_database,

@@ -407,6 +408,7 @@ PrecacheFetcher::PrecacheFetcher(

: request_context_(request_context),

config_url_(config_url),

manifest_url_prefix_(manifest_url_prefix),

+ global_ranking_(global_ranking),

precache_database_(precache_database),

db_task_runner_(std::move(db_task_runner)),

precache_delegate_(precache_delegate),

@@ -426,8 +428,10 @@ PrecacheFetcher::PrecacheFetcher(

// keeping track of the current resource index.

for (const auto& resource : unfinished_work->resource()) {

if (resource.has_url() && resource.has_top_host_name()) {

+ // Weight doesn't matter, as the resources have already been sorted by

+ // this point.

resources_to_fetch_.emplace_back(GURL(resource.url()),

- resource.top_host_name());

+ resource.top_host_name(), 0);

}

unfinished_work_ = std::move(unfinished_work);

@@ -446,28 +450,24 @@ std::unique_ptr<PrecacheUnfinishedWork> PrecacheFetcher::CancelPrecaching() {

// If config fetch is incomplete, |top_hosts_to_fetch_| will be empty and

// top hosts should be left as is in |unfinished_work_|.

unfinished_work_->clear_top_host();

- for (const auto& top_host : top_hosts_to_fetch_) {

+ for (const auto& top_host : top_hosts_fetching_)

+ unfinished_work_->add_top_host()->set_hostname(top_host.hostname);

+ for (const auto& top_host : top_hosts_to_fetch_)

unfinished_work_->add_top_host()->set_hostname(top_host.hostname);

- }

+ }

+ for (const auto& resource : resources_fetching_) {

+ auto new_resource = unfinished_work_->add_resource();

+ new_resource->set_url(resource.url.spec());

+ new_resource->set_top_host_name(resource.referrer);

}

for (const auto& resource : resources_to_fetch_) {

auto new_resource = unfinished_work_->add_resource();

- new_resource->set_url(resource.first.spec());

- new_resource->set_top_host_name(resource.second);

- }

- for (const auto& it : pool_.elements()) {

- const Fetcher* fetcher = it.first;

- GURL config_url =

- config_url_.is_empty() ? GetDefaultConfigURL() : config_url_;

- if (fetcher->is_resource_request()) {

- auto resource = unfinished_work_->add_resource();

- resource->set_url(fetcher->url().spec());

- resource->set_top_host_name(fetcher->referrer());

- } else if (fetcher->url() != config_url) {

- unfinished_work_->add_top_host()->set_hostname(fetcher->referrer());

- }

+ new_resource->set_url(resource.url.spec());

+ new_resource->set_top_host_name(resource.referrer);

}

+ top_hosts_fetching_.clear();

top_hosts_to_fetch_.clear();

+ resources_fetching_.clear();

resources_to_fetch_.clear();

pool_.DeleteAll();

return std::move(unfinished_work_);

@@ -498,40 +498,43 @@ void PrecacheFetcher::Start() {

void PrecacheFetcher::StartNextResourceFetch() {

DCHECK(unfinished_work_->has_config_settings());

while (!resources_to_fetch_.empty() && pool_.IsAvailable()) {

- const auto& resource = resources_to_fetch_.front();

+ ResourceInfo& resource = resources_to_fetch_.front();

const size_t max_bytes = std::min(

quota_.remaining(),

std::min(unfinished_work_->config_settings().max_bytes_per_resource(),

unfinished_work_->config_settings().max_bytes_total() -

unfinished_work_->total_bytes()));

- VLOG(3) << "Fetching " << resource.first << " " << resource.second;

+ VLOG(3) << "Fetching " << resource.url << " " << resource.referrer;

pool_.Add(base::MakeUnique<Fetcher>(

- request_context_.get(), resource.first, resource.second,

+ request_context_.get(), resource.url, resource.referrer,

base::Bind(&PrecacheFetcher::OnResourceFetchComplete, AsWeakPtr()),

true /* is_resource_request */, max_bytes));

+ resources_fetching_.push_back(std::move(resource));

resources_to_fetch_.pop_front();

}

-void PrecacheFetcher::StartNextManifestFetch() {

- if (top_hosts_to_fetch_.empty() || !pool_.IsAvailable())

- return;

- // We only fetch one manifest at a time to keep the size of

- // resources_to_fetch_ as small as possible.

- VLOG(3) << "Fetching " << top_hosts_to_fetch_.front().manifest_url;

- pool_.Add(base::MakeUnique<Fetcher>(

- request_context_.get(), top_hosts_to_fetch_.front().manifest_url,

- top_hosts_to_fetch_.front().hostname,

- base::Bind(&PrecacheFetcher::OnManifestFetchComplete, AsWeakPtr()),

- false /* is_resource_request */, std::numeric_limits<int32_t>::max()));

- top_hosts_to_fetch_.pop_front();

+void PrecacheFetcher::StartNextManifestFetches() {

+ // We fetch as many manifests at a time as possible, as we need all resource

+ // URLs in memory in order to rank them.

+ while (!top_hosts_to_fetch_.empty() && pool_.IsAvailable()) {

+ ManifestHostInfo& top_host = top_hosts_to_fetch_.front();

+ VLOG(3) << "Fetching " << top_host.manifest_url;

bengr 2016/10/18 18:57:20 Please remove all VLOG statements.

twifkak 2016/10/18 19:52:27 Done.

+ pool_.Add(base::MakeUnique<Fetcher>(

+ request_context_.get(), top_host.manifest_url, top_host.hostname,

+ base::Bind(&PrecacheFetcher::OnManifestFetchComplete, AsWeakPtr(),

+ top_host.visits),

+ false /* is_resource_request */, std::numeric_limits<int32_t>::max()));

+ top_hosts_fetching_.push_back(std::move(top_host));

+ top_hosts_to_fetch_.pop_front();

+ }

}

void PrecacheFetcher::NotifyDone(

size_t remaining_manifest_urls_to_fetch,

size_t remaining_resource_urls_to_fetch) {

+ VLOG(9) << "NotifyDone";

RecordCompletionStatistics(*unfinished_work_,

remaining_manifest_urls_to_fetch,

remaining_resource_urls_to_fetch);

@@ -539,29 +542,21 @@ void PrecacheFetcher::NotifyDone(

}

void PrecacheFetcher::StartNextFetch() {

+ VLOG(9) << "StartNextFetch";

DCHECK(unfinished_work_->has_config_settings());

// If over the precache total size cap or daily quota, then stop prefetching.

if ((unfinished_work_->total_bytes() >

unfinished_work_->config_settings().max_bytes_total()) ||

quota_.remaining() == 0) {

- size_t pending_manifests_in_pool = 0;

- size_t pending_resources_in_pool = 0;

- for (const auto& element_pair : pool_.elements()) {

- const Fetcher* fetcher = element_pair.first;

- if (fetcher->is_resource_request())

- pending_resources_in_pool++;

- else if (fetcher->url() != config_url_)

- pending_manifests_in_pool++;

- }

pool_.DeleteAll();

- NotifyDone(top_hosts_to_fetch_.size() + pending_manifests_in_pool,

- resources_to_fetch_.size() + pending_resources_in_pool);

+ NotifyDone(top_hosts_to_fetch_.size() + top_hosts_fetching_.size(),

+ resources_to_fetch_.size() + resources_fetching_.size());

return;

}

StartNextResourceFetch();

- StartNextManifestFetch();

+ StartNextManifestFetches();

if (top_hosts_to_fetch_.empty() && resources_to_fetch_.empty() &&

pool_.IsEmpty()) {

// There are no more URLs to fetch, so end the precache cycle.

@@ -572,6 +567,7 @@ void PrecacheFetcher::StartNextFetch() {

}

void PrecacheFetcher::OnConfigFetchComplete(const Fetcher& source) {

+ VLOG(9) << "OnConfigFetchComplete";

UpdateStats(source.response_bytes(), source.network_response_bytes());

if (source.network_url_fetcher() == nullptr) {

pool_.DeleteAll(); // Cancel any other ongoing request.

@@ -589,9 +585,7 @@ void PrecacheFetcher::OnConfigFetchComplete(const Fetcher& source) {

void PrecacheFetcher::DetermineManifests() {

DCHECK(unfinished_work_->has_config_settings());

- std::vector<std::string> top_hosts_to_fetch;

- std::unique_ptr<std::deque<ManifestHostInfo>> top_hosts_info(

- new std::deque<ManifestHostInfo>);

+ std::vector<std::pair<std::string, int64_t>> top_hosts_to_fetch;

// Keep track of manifest URLs that are being fetched, in order to elide

// duplicates.

std::set<base::StringPiece> seen_top_hosts;

@@ -602,7 +596,7 @@ void PrecacheFetcher::DetermineManifests() {

if (rank > unfinished_work_->config_settings().top_sites_count())

break;

if (seen_top_hosts.insert(host.hostname()).second)

- top_hosts_to_fetch.push_back(host.hostname());

+ top_hosts_to_fetch.emplace_back(host.hostname(), host.visits());

}

// Attempt to fetch manifests for starting hosts up to the maximum top sites

@@ -613,12 +607,15 @@ void PrecacheFetcher::DetermineManifests() {

if (resources_to_fetch_.empty()) {

for (const std::string& host :

unfinished_work_->config_settings().forced_site()) {

+ // We add a forced site with visits == 0, which means its resources will

+ // be downloaded last. TODO(twifkak): Consider removing support for

+ // forced_site.

if (seen_top_hosts.insert(host).second)

- top_hosts_to_fetch.push_back(host);

+ top_hosts_to_fetch.emplace_back(host, 0);

}

- // We only fetch one manifest at a time to keep the size of

- // resources_to_fetch_ as small as possible.

+ // We retrieve manifest usage and quota info from the local database before

+ // fetching the manifests.

PostTaskAndReplyWithResult(

db_task_runner_.get(), FROM_HERE,

base::Bind(&RetrieveManifestInfo, precache_database_,

@@ -628,6 +625,7 @@ void PrecacheFetcher::DetermineManifests() {

void PrecacheFetcher::OnManifestInfoRetrieved(

std::deque<ManifestHostInfo> manifests_info) {

+ VLOG(9) << "OnManifestInfoRetrieved";

const std::string prefix = manifest_url_prefix_.empty()

? GetDefaultManifestURLPrefix()

: manifest_url_prefix_;

@@ -636,7 +634,7 @@ void PrecacheFetcher::OnManifestInfoRetrieved(

// is invalid.

top_hosts_to_fetch_.clear();

unfinished_work_->set_num_manifest_urls(manifests_info.size());

- NotifyDone(manifests_info.size(), resources_to_fetch_.size());

+ NotifyDone(manifests_info.size(), resources_to_rank_.size());

return;

}

@@ -666,6 +664,7 @@ void PrecacheFetcher::OnManifestInfoRetrieved(

}

void PrecacheFetcher::OnQuotaInfoRetrieved(const PrecacheQuota& quota) {

+ VLOG(9) << "OnQuotaInfoRetrieved";

quota_ = quota;

base::Time time_now = base::Time::Now();

if (IsQuotaTimeExpired(quota_, time_now)) {

@@ -683,10 +682,12 @@ void PrecacheFetcher::OnQuotaInfoRetrieved(const PrecacheQuota& quota) {

ManifestHostInfo::ManifestHostInfo(int64_t manifest_id,

const std::string& hostname,

+ int64_t visits,

const std::string& used_url_hash,

const std::string& unused_url_hash)

: manifest_id(manifest_id),

hostname(hostname),

+ visits(visits),

used_url_hash(used_url_hash),

unused_url_hash(unused_url_hash) {}

@@ -696,7 +697,20 @@ ManifestHostInfo::ManifestHostInfo(ManifestHostInfo&&) = default;

ManifestHostInfo& ManifestHostInfo::operator=(ManifestHostInfo&&) = default;

-void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {

+ResourceInfo::ResourceInfo(const GURL& url,

+ const std::string& referrer,

+ double weight)

+ : url(url), referrer(referrer), weight(weight) {}

+ResourceInfo::~ResourceInfo() {}

+ResourceInfo::ResourceInfo(ResourceInfo&&) = default;

+ResourceInfo& ResourceInfo::operator=(ResourceInfo&&) = default;

+void PrecacheFetcher::OnManifestFetchComplete(int64_t host_visits,

+ const Fetcher& source) {

+ VLOG(9) << "OnManifestFetchComplete " << source.referrer();

DCHECK(unfinished_work_->has_config_settings());

UpdateStats(source.response_bytes(), source.network_response_bytes());

if (source.network_url_fetcher() == nullptr) {

@@ -705,9 +719,11 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {

PrecacheManifest manifest;

if (ParseProtoFromFetchResponse(*source.network_url_fetcher(), &manifest)) {

- const int32_t len =

- std::min(manifest.resource_size(),

- unfinished_work_->config_settings().top_resources_count());

+ int32_t len = manifest.resource_size();

+ if (!global_ranking_) {

+ len = std::min(

+ len, unfinished_work_->config_settings().top_resources_count());

+ }

const uint64_t resource_bitset =

GetResourceBitset(manifest, experiment_id_);

for (int i = 0; i < len; ++i) {

@@ -715,7 +731,10 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {

manifest.resource(i).has_url()) {

GURL url(manifest.resource(i).url());

if (url.is_valid()) {

- resources_to_fetch_.emplace_back(url, source.referrer());

+ VLOG(9) << "Adding resource " << url.spec();

+ double weight = manifest.resource(i).weight_ratio() * host_visits;

+ if (weight >= unfinished_work_->config_settings().min_weight())

+ resources_to_rank_.emplace_back(url, source.referrer(), weight);

}

@@ -726,7 +745,36 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {

}

+ top_hosts_fetching_.remove_if([&source](const ManifestHostInfo& top_host) {

+ return top_host.manifest_url == source.url();

+ });

pool_.Delete(source);

+ if (top_hosts_to_fetch_.empty() && top_hosts_fetching_.empty()) {

+ VLOG(9) << "Ranking resources.";

+ // Done fetching manifests. Now sort resources_to_rank_ into

bengr 2016/10/18 18:57:20 Not a requirement, but this seems like it's gettin

twifkak 2016/10/18 19:52:27 Okay, I extracted the inside of this if statement

+ // resources_to_fetch_, by descending weight. When StartNextFetch runs, it

+ // will begin fetching resources.

+ resources_to_fetch_ = std::move(resources_to_rank_);

+ if (global_ranking_) {

+ std::stable_sort(

+ resources_to_fetch_.begin(), resources_to_fetch_.end(),

+ [](const ResourceInfo& first, const ResourceInfo& second) {

+ return first.weight > second.weight;

+ });

+ }

+ // Truncate to size |total_resources_count|.

+ const size_t num_resources = std::min(

+ resources_to_fetch_.size(),

+ static_cast<size_t>(

+ unfinished_work_->config_settings().total_resources_count()));

+ resources_to_fetch_.erase(resources_to_fetch_.begin() + num_resources,

+ resources_to_fetch_.end());

+ // Save denominator for PercentCompleted UMA.

+ unfinished_work_->set_num_resource_urls(resources_to_fetch_.size());

+ }

StartNextFetch();

}

@@ -739,6 +787,10 @@ void PrecacheFetcher::OnResourceFetchComplete(const Fetcher& source) {

source.url(), source.referrer(), base::Time::Now(),

source.was_cached(), source.response_bytes()));

+ resources_fetching_.remove_if([&source](const ResourceInfo& resource) {

+ return resource.url == source.url();

+ });

pool_.Delete(source);

// The resource has already been put in the cache during the fetch process, so

« no previous file with comments | « components/precache/core/precache_fetcher.h ('k') | components/precache/core/precache_fetcher_unittest.cc » ('j') | no next file with comments »