Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/base64.h" | 5 #include "base/base64.h" |
| 6 #include "base/bind.h" | 6 #include "base/bind.h" |
| 7 #include "base/files/file_util.h" | 7 #include "base/files/file_util.h" |
| 8 #include "base/metrics/histogram_macros.h" | 8 #include "base/metrics/histogram_macros.h" |
| 9 #include "base/metrics/sparse_histogram.h" | 9 #include "base/metrics/sparse_histogram.h" |
| 10 #include "base/strings/stringprintf.h" | 10 #include "base/strings/stringprintf.h" |
| 11 #include "components/safe_browsing_db/v4_store.h" | 11 #include "components/safe_browsing_db/v4_store.h" |
| 12 #include "components/safe_browsing_db/v4_store.pb.h" | 12 #include "components/safe_browsing_db/v4_store.pb.h" |
| 13 | 13 |
| 14 namespace safe_browsing { | 14 namespace safe_browsing { |
| 15 | 15 |
| 16 namespace { | 16 namespace { |
| 17 const uint32_t kFileMagic = 0x600D71FE; | 17 const uint32_t kFileMagic = 0x600D71FE; |
| 18 | 18 |
| 19 const uint32_t kFileVersion = 9; | 19 const uint32_t kFileVersion = 9; |
| 20 | 20 |
| 21 // The minimum expected size of a hash-prefix. | |
|
Nathan Parker
2016/07/12 20:35:05
...size in bytes
vakh (use Gerrit instead)
2016/07/12 21:57:02
Done.
| |
| 22 const uint32_t kMinHashPrefixLength = 4; | |
| 23 | |
| 24 // The maximum expected size of a hash-prefix. This represents a full SHA256 | |
| 25 // hash. | |
| 26 const uint32_t kMaxHashPrefixLength = 32; | |
| 27 | |
| 21 void RecordStoreReadResult(StoreReadResult result) { | 28 void RecordStoreReadResult(StoreReadResult result) { |
| 22 UMA_HISTOGRAM_ENUMERATION("SafeBrowsing.V4StoreReadResult", result, | 29 UMA_HISTOGRAM_ENUMERATION("SafeBrowsing.V4StoreReadResult", result, |
| 23 STORE_READ_RESULT_MAX); | 30 STORE_READ_RESULT_MAX); |
| 24 } | 31 } |
| 25 | 32 |
| 26 void RecordStoreWriteResult(StoreWriteResult result) { | 33 void RecordStoreWriteResult(StoreWriteResult result) { |
| 27 UMA_HISTOGRAM_ENUMERATION("SafeBrowsing.V4StoreWriteResult", result, | 34 UMA_HISTOGRAM_ENUMERATION("SafeBrowsing.V4StoreWriteResult", result, |
| 28 STORE_WRITE_RESULT_MAX); | 35 STORE_WRITE_RESULT_MAX); |
| 29 } | 36 } |
| 30 | 37 |
| 38 void RecordMergeResult(MergeResult result) { | |
| 39 UMA_HISTOGRAM_ENUMERATION("SafeBrowsing.V4MergeResult", result, | |
| 40 MERGE_RESULT_MAX); | |
| 41 } | |
| 42 | |
| 31 // Returns the name of the temporary file used to buffer data for | 43 // Returns the name of the temporary file used to buffer data for |
| 32 // |filename|. Exported for unit tests. | 44 // |filename|. Exported for unit tests. |
| 33 const base::FilePath TemporaryFileForFilename(const base::FilePath& filename) { | 45 const base::FilePath TemporaryFileForFilename(const base::FilePath& filename) { |
| 34 return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new")); | 46 return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new")); |
| 35 } | 47 } |
| 36 | 48 |
| 37 } // namespace | 49 } // namespace |
| 38 | 50 |
| 39 std::ostream& operator<<(std::ostream& os, const V4Store& store) { | 51 std::ostream& operator<<(std::ostream& os, const V4Store& store) { |
| 40 os << store.DebugString(); | 52 os << store.DebugString(); |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 76 state_ = ""; | 88 state_ = ""; |
| 77 return true; | 89 return true; |
| 78 } | 90 } |
| 79 | 91 |
| 80 void V4Store::ApplyUpdate( | 92 void V4Store::ApplyUpdate( |
| 81 std::unique_ptr<ListUpdateResponse> response, | 93 std::unique_ptr<ListUpdateResponse> response, |
| 82 const scoped_refptr<base::SingleThreadTaskRunner>& callback_task_runner, | 94 const scoped_refptr<base::SingleThreadTaskRunner>& callback_task_runner, |
| 83 UpdatedStoreReadyCallback callback) { | 95 UpdatedStoreReadyCallback callback) { |
| 84 std::unique_ptr<V4Store> new_store( | 96 std::unique_ptr<V4Store> new_store( |
| 85 new V4Store(this->task_runner_, this->store_path_)); | 97 new V4Store(this->task_runner_, this->store_path_)); |
| 86 | |
| 87 new_store->state_ = response->new_client_state(); | 98 new_store->state_ = response->new_client_state(); |
| 88 // TODO(vakh): | 99 // TODO(vakh): |
| 89 // 1. Merge the old store and the new update in new_store. | 100 // 1. Merge the old store and the new update in new_store. |
| 90 // 2. Create a ListUpdateResponse containing RICE encoded hash-prefixes and | 101 // 2. Create a ListUpdateResponse containing RICE encoded hash-prefixes and |
| 91 // response_type as FULL_UPDATE, and write that to disk. | 102 // response_type as FULL_UPDATE, and write that to disk. |
| 92 // 3. Remove this if condition after completing 1. and 2. | 103 // 3. Remove this if condition after completing 1. and 2. |
| 93 if (response->has_response_type() && | 104 if (response->response_type() == ListUpdateResponse::PARTIAL_UPDATE) { |
| 94 response->response_type() == ListUpdateResponse::FULL_UPDATE) { | 105 for (const auto& removal : response->removals()) { |
| 106 // TODO(vakh): Allow other compression types. | |
| 107 // See: https://bugs.chromium.org/p/chromium/issues/detail?id=624567 | |
| 108 DCHECK_EQ(RAW, removal.compression_type()); | |
| 109 } | |
| 110 | |
| 111 HashPrefixMap additions_map; | |
| 112 UpdateHashPrefixMapFromAdditions(response->additions(), &additions_map); | |
| 113 | |
| 114 new_store->MergeUpdate(hash_prefix_map_, additions_map); | |
| 115 | |
| 116 // TODO(vakh): Generate the updated ListUpdateResponse to write to disk. | |
| 117 } else if (response->response_type() == ListUpdateResponse::FULL_UPDATE) { | |
| 95 StoreWriteResult result = new_store->WriteToDisk(std::move(response)); | 118 StoreWriteResult result = new_store->WriteToDisk(std::move(response)); |
| 96 RecordStoreWriteResult(result); | 119 RecordStoreWriteResult(result); |
| 120 } else { | |
| 121 NOTREACHED() << "Unexpected response type: " << response->response_type(); | |
| 97 } | 122 } |
| 98 | 123 |
| 99 // new_store is done updating, pass it to the callback. | 124 // new_store is done updating, pass it to the callback. |
| 100 callback_task_runner->PostTask( | 125 callback_task_runner->PostTask( |
| 101 FROM_HERE, base::Bind(callback, base::Passed(&new_store))); | 126 FROM_HERE, base::Bind(callback, base::Passed(&new_store))); |
| 102 } | 127 } |
| 103 | 128 |
| 129 // static | |
| 130 void V4Store::UpdateHashPrefixMapFromAdditions( | |
| 131 const ::google::protobuf::RepeatedPtrField<ThreatEntrySet>& additions, | |
| 132 HashPrefixMap* additions_map) { | |
| 133 for (const auto& addition : additions) { | |
| 134 // TODO(vakh): Allow other compression types. | |
| 135 // See: https://bugs.chromium.org/p/chromium/issues/detail?id=624567 | |
| 136 DCHECK_EQ(RAW, addition.compression_type()); | |
| 137 | |
| 138 DCHECK(addition.has_raw_hashes()); | |
| 139 DCHECK(addition.raw_hashes().has_raw_hashes()); | |
| 140 | |
| 141 PrefixSize prefix_size = addition.raw_hashes().prefix_size(); | |
| 142 RecordMergeResult(AddUnlumpedHashes( | |
| 143 prefix_size, addition.raw_hashes().raw_hashes(), additions_map)); | |
| 144 } | |
| 145 } | |
| 146 | |
| 147 // static | |
| 148 MergeResult V4Store::AddUnlumpedHashes(PrefixSize prefix_size, | |
| 149 const std::string& lumped_hashes, | |
| 150 HashPrefixMap* additions_map) { | |
| 151 if (prefix_size < kMinHashPrefixLength) { | |
| 152 NOTREACHED(); | |
| 153 return PREFIX_SIZE_TOO_SMALL_FAILURE; | |
|
Nathan Parker
2016/07/12 20:35:06
You don't need any of the else's here.
vakh (use Gerrit instead)
2016/07/12 21:57:02
Done.
| |
| 154 } else if (prefix_size > kMaxHashPrefixLength) { | |
| 155 NOTREACHED(); | |
| 156 return PREFIX_SIZE_TOO_LARGE_FAILURE; | |
| 157 } else { | |
| 158 if (lumped_hashes.size() % prefix_size != 0) { | |
| 159 return ADDITIONS_SIZE_UNEXPECTED_FAILURE; | |
| 160 } else { | |
| 161 (*additions_map)[prefix_size] = lumped_hashes; | |
|
Nathan Parker
2016/07/12 20:35:06
To consider, or add a todo: This copies the string
Nathan Parker
2016/07/12 22:47:12
(Missed this comment) Though, I'm not sure how to
vakh (use Gerrit instead)
2016/07/13 00:29:17
Done.
| |
| 162 } | |
| 163 } | |
| 164 return MERGE_SUCCESS; | |
| 165 } | |
| 166 | |
| 167 // static | |
| 168 bool V4Store::GetNextSmallestPrefixSize(const HashPrefixMap& hash_prefix_map, | |
| 169 const CounterMap& counter_map, | |
| 170 PrefixSize* smallest_prefix_size) { | |
| 171 HashPrefix smallest_prefix; | |
|
Nathan Parker
2016/07/12 20:35:06
You could use a StringPiece here and within the lo
vakh (use Gerrit instead)
2016/07/12 21:57:02
That would break the consistency of hash prefixes
Nathan Parker
2016/07/12 22:47:12
They are small, but there is a dynamic allocation
vakh (use Gerrit instead)
2016/07/13 00:29:17
Done.
| |
| 172 for (const auto& counter_pair : counter_map) { | |
| 173 PrefixSize prefix_size = counter_pair.first; | |
| 174 size_t index = counter_pair.second; | |
| 175 size_t sized_index = prefix_size * index; | |
| 176 | |
| 177 const HashPrefixes& hash_prefixes = hash_prefix_map.at(prefix_size); | |
| 178 if (sized_index < hash_prefixes.size()) { | |
| 179 HashPrefix this_prefix = hash_prefixes.substr(sized_index, prefix_size); | |
| 180 if (smallest_prefix.empty() || smallest_prefix > this_prefix) { | |
| 181 smallest_prefix = this_prefix; | |
| 182 *smallest_prefix_size = prefix_size; | |
| 183 } | |
| 184 } | |
|
Nathan Parker
2016/07/12 20:35:06
else dcheck
vakh (use Gerrit instead)
2016/07/12 21:57:02
That's a valid case. For instance, when one map ha
| |
| 185 } | |
| 186 return !smallest_prefix.empty(); | |
| 187 } | |
| 188 | |
| 189 // static | |
| 190 void V4Store::InitializeCounterMap(const HashPrefixMap& hash_prefix_map, | |
| 191 CounterMap* counter_map) { | |
| 192 for (const auto& map_pair : hash_prefix_map) { | |
| 193 (*counter_map)[map_pair.first] = 0; | |
| 194 } | |
| 195 } | |
| 196 | |
| 197 // static | |
| 198 HashPrefix V4Store::GetNextUnmergedPrefixForSize( | |
| 199 PrefixSize prefix_size, | |
| 200 const HashPrefixMap& hash_prefix_map, | |
| 201 const CounterMap& counter_map) { | |
| 202 HashPrefixes hash_prefixes = hash_prefix_map.at(prefix_size); | |
|
Nathan Parker
2016/07/12 20:35:06
This copies the whole string. You could use const
vakh (use Gerrit instead)
2016/07/12 21:57:03
Done.
| |
| 203 size_t index_within_list = counter_map.at(prefix_size); | |
| 204 size_t sized_index = prefix_size * index_within_list; | |
| 205 return hash_prefixes.substr(sized_index, prefix_size); | |
| 206 } | |
| 207 | |
| 208 // static | |
| 209 void V4Store::ReserveSpaceInPrefixMap(const HashPrefixMap& other_prefixes_map, | |
| 210 HashPrefixMap* prefix_map_to_update) { | |
| 211 for (const auto& pair : other_prefixes_map) { | |
|
Nathan Parker
2016/07/12 20:35:06
Add a comment about how this gets close to ideal,
vakh (use Gerrit instead)
2016/07/12 21:57:02
Updated the comment in header file (consistent wit
| |
| 212 PrefixSize prefix_size = pair.first; | |
| 213 size_t prefix_length_to_add = pair.second.length(); | |
| 214 | |
| 215 HashPrefixes existing_prefixes = (*prefix_map_to_update)[prefix_size]; | |
|
Nathan Parker
2016/07/12 20:35:06
const HashPrefixes&. Otherwise it makes a copy.
vakh (use Gerrit instead)
2016/07/12 21:57:02
Done.
| |
| 216 size_t existing_capacity = existing_prefixes.capacity(); | |
| 217 | |
| 218 (*prefix_map_to_update)[prefix_size].reserve(existing_capacity + | |
| 219 prefix_length_to_add); | |
| 220 } | |
| 221 } | |
| 222 | |
| 223 void V4Store::MergeUpdate(const HashPrefixMap& old_prefixes_map, | |
| 224 const HashPrefixMap& additions_map) { | |
| 225 ReserveSpaceInPrefixMap(old_prefixes_map, &hash_prefix_map_); | |
|
Nathan Parker
2016/07/12 20:35:06
Is the assumption that hash_prefix_map_ is empty h
vakh (use Gerrit instead)
2016/07/12 21:57:02
Good map. However, for code reuse purposes, I thin
| |
| 226 ReserveSpaceInPrefixMap(additions_map, &hash_prefix_map_); | |
| 227 | |
| 228 PrefixSize next_size_for_old; | |
| 229 CounterMap old_counter_map; | |
| 230 InitializeCounterMap(old_prefixes_map, &old_counter_map); | |
| 231 bool found_in_old = GetNextSmallestPrefixSize( | |
| 232 old_prefixes_map, old_counter_map, &next_size_for_old); | |
| 233 | |
| 234 PrefixSize next_size_for_additions; | |
| 235 CounterMap additions_counter_map; | |
| 236 InitializeCounterMap(additions_map, &additions_counter_map); | |
| 237 bool found_in_additions = GetNextSmallestPrefixSize( | |
| 238 additions_map, additions_counter_map, &next_size_for_additions); | |
| 239 | |
| 240 // Classical merge sort. | |
| 241 // The two constructs to merge are maps: old_prefixes_map, additions_map. | |
| 242 | |
| 243 // At least one of the maps still has elements that need to be merged into the | |
| 244 // new store. | |
| 245 while (found_in_old || found_in_additions) { | |
| 246 // Both maps still have elements that need to be merged into the new store. | |
| 247 if (found_in_old && found_in_additions) { | |
| 248 // Get the lexicographically smallest hash prefix from the old store. | |
| 249 HashPrefix next_smallest_prefix_old = GetNextUnmergedPrefixForSize( | |
| 250 next_size_for_old, old_prefixes_map, old_counter_map); | |
| 251 // Get the lexicographically smallest hash prefix from the additions in the | |
| 252 // latest update from the server. | |
| 253 HashPrefix next_smallest_prefix_additions = GetNextUnmergedPrefixForSize( | |
| 254 next_size_for_additions, additions_map, additions_counter_map); | |
| 255 | |
| 256 // If the smallest unmerged hash prefix in old store is smaller than that | |
| 257 // in the update, add that to the new store. | |
| 258 if (next_smallest_prefix_old < next_smallest_prefix_additions) { | |
| 259 hash_prefix_map_[next_size_for_old] += next_smallest_prefix_old; | |
| 260 | |
| 261 // Update the counter map, which means that we have merged one hash | |
| 262 // prefix of size |next_size_for_old| from the old store. | |
| 263 old_counter_map[next_size_for_old]++; | |
| 264 | |
| 265 // Find the next smallest unmerged element in the old store's map. | |
| 266 found_in_old = GetNextSmallestPrefixSize( | |
| 267 old_prefixes_map, old_counter_map, &next_size_for_old); | |
| 268 } else { | |
| 269 // This means that the smallest unmerged hash prefix in the update was | |
| 270 // smaller than that in the old store, so add that hash prefix (from the | |
| 271 // update) into the new store. | |
| 272 hash_prefix_map_[next_size_for_additions] += | |
| 273 next_smallest_prefix_additions; | |
| 274 | |
| 275 // Update the counter map, which means that we have merged one hash | |
| 276 // prefix of size |next_size_for_additions| from the update. | |
| 277 additions_counter_map[next_size_for_additions]++; | |
| 278 | |
| 279 // Find the next smallest unmerged element in the additions map. | |
| 280 found_in_additions = GetNextSmallestPrefixSize( | |
| 281 additions_map, additions_counter_map, &next_size_for_additions); | |
| 282 } | |
| 283 } else { | |
| 284 // At least one of the maps has become empty. | |
| 285 if (!found_in_old) { | |
| 286 // The map for the old store has been completely merged. Now only merge | |
| 287 // the hash prefixes from the additions map. | |
| 288 HashPrefix next_smallest_prefix_additions = | |
| 289 GetNextUnmergedPrefixForSize(next_size_for_additions, additions_map, | |
| 290 additions_counter_map); | |
| 291 hash_prefix_map_[next_size_for_additions] += | |
| 292 next_smallest_prefix_additions; | |
| 293 additions_counter_map[next_size_for_additions]++; | |
| 294 found_in_additions = GetNextSmallestPrefixSize( | |
| 295 additions_map, additions_counter_map, &next_size_for_additions); | |
| 296 } else { | |
| 297 // The additions map has been completely merged. Now only merge | |
| 298 // the hash prefixes from the old store map. | |
| 299 HashPrefix next_smallest_prefix_old = GetNextUnmergedPrefixForSize( | |
| 300 next_size_for_old, old_prefixes_map, old_counter_map); | |
| 301 hash_prefix_map_[next_size_for_old] += next_smallest_prefix_old; | |
| 302 old_counter_map[next_size_for_old]++; | |
| 303 found_in_old = GetNextSmallestPrefixSize( | |
| 304 old_prefixes_map, old_counter_map, &next_size_for_old); | |
| 305 } | |
| 306 } | |
| 307 } | |
| 308 } | |
| 309 | |
| 104 StoreReadResult V4Store::ReadFromDisk() { | 310 StoreReadResult V4Store::ReadFromDisk() { |
| 105 DCHECK(task_runner_->RunsTasksOnCurrentThread()); | 311 DCHECK(task_runner_->RunsTasksOnCurrentThread()); |
| 106 | 312 |
| 107 std::string contents; | 313 std::string contents; |
| 108 bool read_success = base::ReadFileToString(store_path_, &contents); | 314 bool read_success = base::ReadFileToString(store_path_, &contents); |
| 109 if (!read_success) { | 315 if (!read_success) { |
| 110 return FILE_UNREADABLE_FAILURE; | 316 return FILE_UNREADABLE_FAILURE; |
| 111 } | 317 } |
| 112 | 318 |
| 113 if (contents.empty()) { | 319 if (contents.empty()) { |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 174 | 380 |
| 175 if (!base::Move(new_filename, store_path_)) { | 381 if (!base::Move(new_filename, store_path_)) { |
| 176 DVLOG(1) << "store_path_: " << store_path_.value(); | 382 DVLOG(1) << "store_path_: " << store_path_.value(); |
| 177 return UNABLE_TO_RENAME_FAILURE; | 383 return UNABLE_TO_RENAME_FAILURE; |
| 178 } | 384 } |
| 179 | 385 |
| 180 return WRITE_SUCCESS; | 386 return WRITE_SUCCESS; |
| 181 } | 387 } |
| 182 | 388 |
| 183 } // namespace safe_browsing | 389 } // namespace safe_browsing |
| OLD | NEW |