Index: chrome/browser/safe_browsing/safe_browsing_database.cc
diff --git a/chrome/browser/safe_browsing/safe_browsing_database.cc b/chrome/browser/safe_browsing/safe_browsing_database.cc
index 2b5b2e256bcdf199691453784246ac5a14ccc20c..ece3138a042b65ef909612658d517a063a20ed70 100644
--- a/chrome/browser/safe_browsing/safe_browsing_database.cc
+++ b/chrome/browser/safe_browsing/safe_browsing_database.cc
@@ -66,9 +66,6 @@ const base::FilePath::CharType kIPBlacklistDBFile[] =
 // this.
 const base::FilePath::CharType kBrowseDBFile[] = FILE_PATH_LITERAL(" Bloom");
-// The maximum staleness for a cached entry.
-const int kMaxStalenessMinutes = 45;
-
 // Maximum number of entries we allow in any of the whitelists.
 // If a whitelist on disk contains more entries than this, all lookups to
 // the whitelist will be considered a match.
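[The failsafe this comment describes, as a self-contained sketch with stand-in types; the real code keeps each whitelist as a sorted vector of SBFullHash paired with an all-match flag, so an oversized (possibly corrupt) list fails open rather than silently under-matching:]

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Stand-in for the real whitelist representation: (sorted hashes, all-match).
typedef std::pair<std::vector<std::string>, bool> Whitelist;

void LoadWhitelist(const std::vector<std::string>& entries,
                   size_t max_size,  // stand-in for the kMax*Whitelist cap
                   Whitelist* whitelist) {
  if (entries.size() > max_size) {
    // Oversized list: fail open, i.e. every subsequent lookup matches.
    whitelist->first.clear();
    whitelist->second = true;
    return;
  }
  whitelist->first = entries;
  std::sort(whitelist->first.begin(), whitelist->first.end());
  whitelist->second = false;
}

bool ContainsWhitelistedHash(const Whitelist& whitelist,
                             const std::string& hash) {
  return whitelist.second ||
         std::binary_search(whitelist.first.begin(), whitelist.first.end(),
                            hash);
}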
@@ -185,51 +182,6 @@ bool MatchAddPrefixes(SafeBrowsingStore* store,
   return found_match;
 }
-// Find the entries in |full_hashes| with prefix in |prefix_hits|, and
-// add them to |full_hits| if not expired. "Not expired" is when
-// either |last_update| was recent enough, or the item has been
-// received recently enough. Expired items are not deleted because a
-// future update may make them acceptable again.
-//
-// For efficiency reasons the code walks |prefix_hits| and
-// |full_hashes| in parallel, so they must be sorted by prefix.
-void GetCachedFullHashesForBrowse(const std::vector<SBPrefix>& prefix_hits,
-                                  const std::vector<SBAddFullHash>& full_hashes,
-                                  std::vector<SBFullHashResult>* full_hits,
-                                  base::Time last_update) {
-  const base::Time expire_time =
-      base::Time::Now() - base::TimeDelta::FromMinutes(kMaxStalenessMinutes);
-
-  std::vector<SBPrefix>::const_iterator piter = prefix_hits.begin();
-  std::vector<SBAddFullHash>::const_iterator hiter = full_hashes.begin();
-
-  while (piter != prefix_hits.end() && hiter != full_hashes.end()) {
-    if (*piter < hiter->full_hash.prefix) {
-      ++piter;
-    } else if (hiter->full_hash.prefix < *piter) {
-      ++hiter;
-    } else {
-      if (expire_time < last_update ||
-          expire_time.ToTimeT() < hiter->received) {
-        SBFullHashResult result;
-        const int list_bit = GetListIdBit(hiter->chunk_id);
-        DCHECK(list_bit == safe_browsing_util::MALWARE ||
-               list_bit == safe_browsing_util::PHISH);
-        const safe_browsing_util::ListType list_id =
-            static_cast<safe_browsing_util::ListType>(list_bit);
-        if (!safe_browsing_util::GetListName(list_id, &result.list_name))
-          continue;
-        result.add_chunk_id = DecodeChunkId(hiter->chunk_id);
-        result.hash = hiter->full_hash;
-        full_hits->push_back(result);
-      }
-
-      // Only increment |hiter|, |piter| might have multiple hits.
-      ++hiter;
-    }
-  }
-}
-
 // This function generates a chunk range string for |chunks|. It
 // outputs one chunk range string per list and writes it to the
 // |list_ranges| vector. We expect |list_ranges| to already be of the
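[The hunk cuts this comment off, but the chunk-range encoding it refers to is the Safe Browsing wire format: sorted chunk ids folded into comma-separated singletons and dash-separated runs, e.g. {1,2,3,5,7,8} becomes "1-3,5,7-8". A stand-alone sketch of that encoding, not the helper this file actually uses:]

#include <sstream>
#include <string>
#include <vector>

// Illustrative only: fold sorted, deduped chunk ids into a range string.
std::string ChunksToRangeString(const std::vector<int>& sorted_chunks) {
  std::ostringstream result;
  size_t i = 0;
  while (i < sorted_chunks.size()) {
    // Extend j across a run of consecutive ids starting at i.
    size_t j = i;
    while (j + 1 < sorted_chunks.size() &&
           sorted_chunks[j + 1] == sorted_chunks[j] + 1) {
      ++j;
    }
    if (i > 0)
      result << ",";
    result << sorted_chunks[i];
    if (j > i)
      result << "-" << sorted_chunks[j];
    i = j + 1;
  }
  return result.str();
}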
@@ -321,11 +273,6 @@ int64 GetFileSizeOrZero(const base::FilePath& file_path) {
   return size_64;
 }
-// Used to order whitelist storage in memory.
-bool SBFullHashLess(const SBFullHash& a, const SBFullHash& b) {
-  return memcmp(a.full_hash, b.full_hash, sizeof(a.full_hash)) < 0;
-}
-
 }  // namespace
 // The default SafeBrowsingDatabaseFactory.
@@ -531,7 +478,7 @@ void SafeBrowsingDatabaseNew::Init(const base::FilePath& filename_base) {
   // contention on the lock...
   base::AutoLock locked(lookup_lock_);
   full_browse_hashes_.clear();
-  pending_browse_hashes_.clear();
+  browse_gethash_cache_.clear();
   LoadPrefixSet();
 }
@@ -655,8 +602,7 @@ bool SafeBrowsingDatabaseNew::ResetDatabase() {
   {
     base::AutoLock locked(lookup_lock_);
     full_browse_hashes_.clear();
-    pending_browse_hashes_.clear();
-    prefix_miss_cache_.clear();
+    browse_gethash_cache_.clear();
     browse_prefix_set_.reset();
     side_effect_free_whitelist_prefix_set_.reset();
     ip_blacklist_.clear();
@@ -667,23 +613,28 @@
   return true;
 }
-// TODO(lzheng): Remove matching_list, it is not used anywhere.
 bool SafeBrowsingDatabaseNew::ContainsBrowseUrl(
     const GURL& url,
-    std::string* matching_list,
     std::vector<SBPrefix>* prefix_hits,
-    std::vector<SBFullHashResult>* full_hits,
-    base::Time last_update) {
+    std::vector<SBFullHashResult>* cache_hits) {
   // Clear the results first.
-  matching_list->clear();
   prefix_hits->clear();
-  full_hits->clear();
+  cache_hits->clear();
   std::vector<SBFullHash> full_hashes;
   BrowseFullHashesToCheck(url, false, &full_hashes);
   if (full_hashes.empty())
     return false;
+  std::sort(full_hashes.begin(), full_hashes.end(), SBFullHashLess);
+
+  return ContainsBrowseUrlHashes(full_hashes, prefix_hits, cache_hits);
+}
+
+bool SafeBrowsingDatabaseNew::ContainsBrowseUrlHashes(
+    const std::vector<SBFullHash>& full_hashes,
+    std::vector<SBPrefix>* prefix_hits,
+    std::vector<SBFullHashResult>* cache_hits) {
   // This function is called on the I/O thread, prevent changes to
   // filter and caches.
   base::AutoLock locked(lookup_lock_);
@@ -694,30 +645,81 @@
   if (!browse_prefix_set_.get())
     return false;
-  size_t miss_count = 0;
+  const base::Time now = base::Time::Now();
+
   for (size_t i = 0; i < full_hashes.size(); ++i) {
     const SBPrefix prefix = full_hashes[i].prefix;
     if (browse_prefix_set_->Exists(prefix)) {
-      prefix_hits->push_back(prefix);
-      if (prefix_miss_cache_.count(prefix) > 0)
-        ++miss_count;
-    }
-  }
+      // Stupid workaround since std::equal_range requires you to have the
+      // search key be the same type as the elements in the collection.
Scott Hess - ex-Googler  2014/04/01 22:08:36
My impression is that this is because while you ca
mattm  2014/04/03 01:38:12
Yeah. I had tried to hack by defining both orderin
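[For reference, the "both orderings" approach mattm mentions trying, as a stand-alone sketch with stand-in types, not code from this CL. The reason it was abandoned is truncated above; one known pitfall is that debug STL modes self-verify sort order by comparing elements with elements, which a key-vs-element comparator does not support:]

#include <algorithm>
#include <utility>
#include <vector>

typedef unsigned int Prefix;            // stand-in for SBPrefix
struct AddFullHash { Prefix prefix; };  // stand-in for SBAddFullHash

// Comparator with both argument orders, so std::equal_range can take a bare
// Prefix as the search key instead of a dummy AddFullHash.
struct PrefixOrder {
  bool operator()(const AddFullHash& a, Prefix b) const { return a.prefix < b; }
  bool operator()(Prefix a, const AddFullHash& b) const { return a < b.prefix; }
};

std::pair<std::vector<AddFullHash>::const_iterator,
          std::vector<AddFullHash>::const_iterator>
MatchesForPrefix(const std::vector<AddFullHash>& sorted_by_prefix,
                 Prefix key) {
  return std::equal_range(sorted_by_prefix.begin(), sorted_by_prefix.end(),
                          key, PrefixOrder());
}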
+      SBAddFullHash search_value;
+      search_value.full_hash.prefix = prefix;
+
+      std::pair<std::vector<SBAddFullHash>::const_iterator,
+                std::vector<SBAddFullHash>::const_iterator>
+          db_fullhash_prefix_matches =
+              std::equal_range(full_browse_hashes_.begin(),
+                               full_browse_hashes_.end(),
+                               search_value,
+                               SBAddFullHashPrefixLess);
+      bool match = false;
+      // If the database contains any fullhashes with the same prefix, then we
+      // only count it as a prefix hit if there is an exact fullhash match.
+      if (db_fullhash_prefix_matches.first !=
+          db_fullhash_prefix_matches.second) {
+        // Could do binary_search here, but unlikely to be enough matches to
+        // matter.
+        for (std::vector<SBAddFullHash>::const_iterator hiter =
+                 db_fullhash_prefix_matches.first;
+             hiter != db_fullhash_prefix_matches.second;
+             ++hiter) {
+          if (SBFullHashEqual(hiter->full_hash, full_hashes[i])) {
+            match = true;
+            break;
+          }
+        }
+      } else {
+        // If db has only a prefix hit, that's ok too.
+        match = true;
+      }
-  // If all the prefixes are cached as 'misses', don't issue a GetHash.
-  if (miss_count == prefix_hits->size())
-    return false;
+      // If we have a db match, check if we have a valid cached result for this
+      // prefix.
Scott Hess - ex-Googler  2014/04/01 22:08:36
I think maybe this is backwards. If there is a va
mattm  2014/04/03 01:38:12
Ah, good point. Could even check the cache before
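[The reordering discussed here, sketched stand-alone with stand-in types (not what this CL landed): consult the gethash cache first, so a valid cached entry settles the lookup before any database fullhash scan runs:]

#include <ctime>
#include <map>
#include <vector>

struct CachedResult {
  time_t expire_after;
  std::vector<int> full_hash_ids;  // stand-in for cached SBFullHashResults
};
typedef std::map<unsigned int, CachedResult> GetHashCache;

// Returns true if the cache decides the outcome for |prefix|: either a
// cached hit (ids appended to |cache_hits|) or a cached miss (empty entry).
// Expired entries are erased and leave the decision to the database.
bool CacheDecides(GetHashCache* cache, unsigned int prefix, time_t now,
                  std::vector<int>* cache_hits) {
  GetHashCache::iterator it = cache->find(prefix);
  if (it == cache->end())
    return false;
  if (now > it->second.expire_after) {
    cache->erase(it);
    return false;
  }
  cache_hits->insert(cache_hits->end(),
                     it->second.full_hash_ids.begin(),
                     it->second.full_hash_ids.end());
  return true;
}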
+      if (match) {
+        std::map<SBPrefix, SBCachedFullHashResult>::iterator citer =
+            browse_gethash_cache_.find(prefix);
+        if (citer != browse_gethash_cache_.end()) {
+          if (now > citer->second.expire_after) {
+            // If the cached entry is expired, remove it and ignore it.
+            browse_gethash_cache_.erase(citer);
+          } else {
+            for (std::vector<SBFullHashResult>::const_iterator fiter =
+                     citer->second.full_hashes.begin();
+                 fiter != citer->second.full_hashes.end();
+                 ++fiter) {
+              if (SBFullHashEqual(full_hashes[i], fiter->hash)) {
+                cache_hits->push_back(*fiter);
+              }
+            }
+            // If the prefix was in the cache, don't add the prefix to
+            // prefix_hits. The result will be in cache_hits (if the fullhash
+            // matched), or not (if it was a cached miss).
+            match = false;
+          }
+        }
+      }
-  // Find the matching full-hash results. |full_browse_hashes_| are from the
-  // database, |pending_browse_hashes_| are from GetHash requests between
-  // updates.
-  std::sort(prefix_hits->begin(), prefix_hits->end());
+      // If there was a prefix match and no cached result, add to prefix_hits.
+      if (match) {
+        // Only add a given prefix once. Since full_hashes is sorted, we only
+        // need to check the last entry of prefix_hits.
+        if (prefix_hits->empty() || prefix_hits->back() != prefix)
+          prefix_hits->push_back(prefix);
+      }
+    }
+  }
-  GetCachedFullHashesForBrowse(*prefix_hits, full_browse_hashes_,
-                               full_hits, last_update);
-  GetCachedFullHashesForBrowse(*prefix_hits, pending_browse_hashes_,
-                               full_hits, last_update);
-  return true;
+  return !prefix_hits->empty() || !cache_hits->empty();
 }
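[For context, how a caller would consume the new out-params under this contract; a sketch of a hypothetical call site (|database_| assumed), not code from this CL:]

std::vector<SBPrefix> prefix_hits;
std::vector<SBFullHashResult> cache_hits;
if (database_->ContainsBrowseUrl(url, &prefix_hits, &cache_hits)) {
  // |cache_hits| were served from the gethash cache and are final results.
  // |prefix_hits| are sorted, deduped prefixes that still require a GetHash
  // network request to resolve to full hashes.
}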
 bool SafeBrowsingDatabaseNew::ContainsDownloadUrl(
@@ -876,7 +878,6 @@ void SafeBrowsingDatabaseNew::InsertAdd(int chunk_id, SBPrefix host,
     }
   } else {
     // Prefixes and hashes.
-    const base::Time receive_time = base::Time::Now();
     for (int i = 0; i < count; ++i) {
       const SBFullHash full_hash = entry->FullHashAt(i);
       const SBPrefix prefix = full_hash.prefix;
@@ -885,7 +886,7 @@
       store->WriteAddPrefix(encoded_chunk_id, prefix);
       STATS_COUNTER("SB.PrefixAddFull", 1);
-      store->WriteAddHash(encoded_chunk_id, receive_time, full_hash);
+      store->WriteAddHash(encoded_chunk_id, full_hash);
     }
   }
 }
@@ -1053,37 +1054,28 @@ void SafeBrowsingDatabaseNew::DeleteChunks(
 void SafeBrowsingDatabaseNew::CacheHashResults(
     const std::vector<SBPrefix>& prefixes,
-    const std::vector<SBFullHashResult>& full_hits) {
+    const std::vector<SBFullHashResult>& full_hits,
+    const base::TimeDelta& cache_lifetime) {
+
+  const base::Time expire_after = base::Time::Now() + cache_lifetime;
+
   // This is called on the I/O thread, lock against updates.
   base::AutoLock locked(lookup_lock_);
-  if (full_hits.empty()) {
-    prefix_miss_cache_.insert(prefixes.begin(), prefixes.end());
-    return;
+  // Create or reset all cached results for these prefixes.
+  for (std::vector<SBPrefix>::const_iterator i = prefixes.begin();
+       i != prefixes.end();
+       ++i) {
+    browse_gethash_cache_[*i] = SBCachedFullHashResult(expire_after);
   }
-  // TODO(shess): SBFullHashResult and SBAddFullHash are very similar.
-  // Refactor to make them identical.
-  const base::Time now = base::Time::Now();
-  const size_t orig_size = pending_browse_hashes_.size();
-  for (std::vector<SBFullHashResult>::const_iterator iter = full_hits.begin();
-       iter != full_hits.end(); ++iter) {
-    const int list_id = safe_browsing_util::GetListId(iter->list_name);
-    if (list_id == safe_browsing_util::MALWARE ||
-        list_id == safe_browsing_util::PHISH) {
-      int encoded_chunk_id = EncodeChunkId(iter->add_chunk_id, list_id);
-      SBAddFullHash add_full_hash(encoded_chunk_id, now, iter->hash);
-      pending_browse_hashes_.push_back(add_full_hash);
-    }
+  // Insert any fullhash hits. Note that there may be one, multiple, or no
+  // fullhashes for any given entry in |prefixes|.
+  for (std::vector<SBFullHashResult>::const_iterator i = full_hits.begin();
+       i != full_hits.end();
+       ++i) {
+    browse_gethash_cache_[i->hash.prefix].full_hashes.push_back(*i);
   }
-
-  // Sort new entries then merge with the previously-sorted entries.
-  std::vector<SBAddFullHash>::iterator
-      orig_end = pending_browse_hashes_.begin() + orig_size;
-  std::sort(orig_end, pending_browse_hashes_.end(), SBAddFullHashPrefixLess);
-  std::inplace_merge(pending_browse_hashes_.begin(),
-                     orig_end, pending_browse_hashes_.end(),
-                     SBAddFullHashPrefixLess);
 }
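[Illustration of the new cache semantics; |some_prefix| and the 45-minute lifetime are hypothetical example values, the real lifetime comes from the GetHash response:]

std::vector<SBPrefix> prefixes(1, some_prefix);
std::vector<SBFullHashResult> full_hits;  // Empty => cache a miss.
database_->CacheHashResults(prefixes, full_hits,
                            base::TimeDelta::FromMinutes(45));
// Every queried prefix gets an entry up front, so until it expires,
// ContainsBrowseUrl() reports this prefix neither in |prefix_hits| nor in
// |cache_hits|: no new GetHash request is issued for a cached miss.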
 bool SafeBrowsingDatabaseNew::UpdateStarted(
@@ -1141,6 +1133,9 @@ bool SafeBrowsingDatabaseNew::UpdateStarted(
                                  safe_browsing_util::kMalwareList,
                                  safe_browsing_util::kPhishingList,
                                  lists);
+  // Cached fullhash results must be cleared on every database update (whether
+  // successful or not).
+  browse_gethash_cache_.clear();
Scott Hess - ex-Googler  2014/04/01 22:08:36
I think this should be cleared when the update is
mattm  2014/04/03 01:38:12
The spec says "Clients must clear cached full-leng
 // NOTE(shess): |download_store_| used to contain kBinHashList, which has been
 // deprecated. Code to delete the list from the store shows ~15k hits/day as
@@ -1275,15 +1270,11 @@ void SafeBrowsingDatabaseNew::UpdateWhitelistStore(
   if (!store)
     return;
-  // For the whitelists, we don't cache and save full hashes since all
-  // hashes are already full.
-  std::vector<SBAddFullHash> empty_add_hashes;
-
   // Note: |builder| will not be empty. The current data store implementation
   // stores all full-length hashes as both full and prefix hashes.
   safe_browsing::PrefixSetBuilder builder;
   std::vector<SBAddFullHash> full_hashes;
-  if (!store->FinishUpdate(empty_add_hashes, &builder, &full_hashes)) {
+  if (!store->FinishUpdate(&builder, &full_hashes)) {
     RecordFailure(FAILURE_WHITELIST_DATABASE_UPDATE_FINISH);
     WhitelistEverything(whitelist);
     return;
@@ -1300,17 +1291,12 @@ int64 SafeBrowsingDatabaseNew::UpdateHashPrefixStore(
     const base::FilePath& store_filename,
     SafeBrowsingStore* store,
     FailureType failure_type) {
-  // We don't cache and save full hashes.
-  std::vector<SBAddFullHash> empty_add_hashes;
-
   // These results are not used after this call. Simply ignore the
   // returned value after FinishUpdate(...).
   safe_browsing::PrefixSetBuilder builder;
   std::vector<SBAddFullHash> add_full_hashes_result;
-  if (!store->FinishUpdate(empty_add_hashes,
-                           &builder,
-                           &add_full_hashes_result)) {
+  if (!store->FinishUpdate(&builder, &add_full_hashes_result)) {
     RecordFailure(failure_type);
   }
@@ -1322,16 +1308,6 @@ int64 SafeBrowsingDatabaseNew::UpdateHashPrefixStore(
 }
 void SafeBrowsingDatabaseNew::UpdateBrowseStore() {
-  // Copy out the pending add hashes. Copy rather than swapping in
-  // case |ContainsBrowseURL()| is called before the new filter is complete.
-  std::vector<SBAddFullHash> pending_add_hashes;
-  {
-    base::AutoLock locked(lookup_lock_);
-    pending_add_hashes.insert(pending_add_hashes.end(),
-                              pending_browse_hashes_.begin(),
-                              pending_browse_hashes_.end());
-  }
-
   // Measure the amount of IO during the filter build.
   base::IoCounters io_before, io_after;
   base::ProcessHandle handle = base::Process::Current().handle();
@@ -1353,8 +1329,7 @@ void SafeBrowsingDatabaseNew::UpdateBrowseStore() {
   safe_browsing::PrefixSetBuilder builder;
   std::vector<SBAddFullHash> add_full_hashes;
-  if (!browse_store_->FinishUpdate(pending_add_hashes,
-                                   &builder, &add_full_hashes)) {
+  if (!browse_store_->FinishUpdate(&builder, &add_full_hashes)) {
     RecordFailure(FAILURE_BROWSE_DATABASE_UPDATE_FINISH);
     return;
   }
@@ -1368,14 +1343,6 @@ void SafeBrowsingDatabaseNew::UpdateBrowseStore() {
   {
     base::AutoLock locked(lookup_lock_);
     full_browse_hashes_.swap(add_full_hashes);
-
-    // TODO(shess): If |CacheHashResults()| is posted between the
-    // earlier lock and this clear, those pending hashes will be lost.
-    // It could be fixed by only removing hashes which were collected
-    // at the earlier point. I believe that is fail-safe as-is (the
-    // hash will be fetched again).
-    pending_browse_hashes_.clear();
-    prefix_miss_cache_.clear();
     browse_prefix_set_.swap(prefix_set);
   }
@@ -1417,14 +1384,11 @@ void SafeBrowsingDatabaseNew::UpdateBrowseStore() {
 }
 void SafeBrowsingDatabaseNew::UpdateSideEffectFreeWhitelistStore() {
-  std::vector<SBAddFullHash> empty_add_hashes;
   safe_browsing::PrefixSetBuilder builder;
   std::vector<SBAddFullHash> add_full_hashes_result;
   if (!side_effect_free_whitelist_store_->FinishUpdate(
-          empty_add_hashes,
-          &builder,
-          &add_full_hashes_result)) {
+          &builder, &add_full_hashes_result)) {
     RecordFailure(FAILURE_SIDE_EFFECT_FREE_WHITELIST_UPDATE_FINISH);
     return;
   }
@@ -1465,16 +1429,11 @@ void SafeBrowsingDatabaseNew::UpdateSideEffectFreeWhitelistStore() {
 }
 void SafeBrowsingDatabaseNew::UpdateIpBlacklistStore() {
-  // For the IP blacklist, we don't cache and save full hashes since all
-  // hashes are already full.
-  std::vector<SBAddFullHash> empty_add_hashes;
-
   // Note: prefixes will not be empty. The current data store implementation
   // stores all full-length hashes as both full and prefix hashes.
   safe_browsing::PrefixSetBuilder builder;
   std::vector<SBAddFullHash> full_hashes;
-  if (!ip_blacklist_store_->FinishUpdate(empty_add_hashes,
-                                         &builder, &full_hashes)) {
+  if (!ip_blacklist_store_->FinishUpdate(&builder, &full_hashes)) {
     RecordFailure(FAILURE_IP_BLACKLIST_UPDATE_FINISH);
     LoadIpBlacklist(std::vector<SBAddFullHash>());  // Clear the list.
     return;