Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(594)

Side by Side Diff: components/safe_browsing_db/prefix_set.cc

Issue 1551433002: Switch to standard integer types in components/, part 3 of 4. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: more Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/safe_browsing_db/prefix_set.h" 5 #include "components/safe_browsing_db/prefix_set.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 8
9 #include "base/files/file_util.h" 9 #include "base/files/file_util.h"
10 #include "base/files/scoped_file.h" 10 #include "base/files/scoped_file.h"
11 #include "base/logging.h" 11 #include "base/logging.h"
12 #include "base/md5.h" 12 #include "base/md5.h"
13 #include "base/metrics/histogram.h" 13 #include "base/metrics/histogram.h"
14 #include "base/metrics/sparse_histogram.h" 14 #include "base/metrics/sparse_histogram.h"
15 15
16 namespace safe_browsing { 16 namespace safe_browsing {
17 17
18 namespace { 18 namespace {
19 19
20 // |kMagic| should be reasonably unique, and not match itself across 20 // |kMagic| should be reasonably unique, and not match itself across
21 // endianness changes. I generated this value with: 21 // endianness changes. I generated this value with:
22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
23 static uint32 kMagic = 0x864088dd; 23 static uint32_t kMagic = 0x864088dd;
24 24
25 // Version history: 25 // Version history:
26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05
29 29
30 // Version 2 layout is identical to version 1. The sort order of |index_| 30 // Version 2 layout is identical to version 1. The sort order of |index_|
31 // changed from |int32| to |uint32| to match the change of |SBPrefix|. 31 // changed from |int32_t| to |uint32_t| to match the change of |SBPrefix|.
32 // Version 3 adds storage for full hashes. 32 // Version 3 adds storage for full hashes.
33 static uint32 kVersion = 3; 33 static uint32_t kVersion = 3;
34 static uint32 kDeprecatedVersion = 2; // And lower. 34 static uint32_t kDeprecatedVersion = 2; // And lower.
35 35
36 typedef struct { 36 typedef struct {
37 uint32 magic; 37 uint32_t magic;
38 uint32 version; 38 uint32_t version;
39 uint32 index_size; 39 uint32_t index_size;
40 uint32 deltas_size; 40 uint32_t deltas_size;
41 uint32 full_hashes_size; 41 uint32_t full_hashes_size;
42 } FileHeader; 42 } FileHeader;
43 43
44 // Common std::vector<> implementations add capacity by multiplying from the 44 // Common std::vector<> implementations add capacity by multiplying from the
45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in 45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
46 // amortized constant time. This is not necessary for insert() at end(), but 46 // amortized constant time. This is not necessary for insert() at end(), but
47 // AFAICT it seems true for some implementations. SBPrefix values should 47 // AFAICT it seems true for some implementations. SBPrefix values should
48 // uniformly cover the 32-bit space, so the final size can be estimated given a 48 // uniformly cover the 32-bit space, so the final size can be estimated given a
49 // subset of the input. 49 // subset of the input.
50 // 50 //
51 // |kEstimateThreshold| is when estimates start converging. Results are strong 51 // |kEstimateThreshold| is when estimates start converging. Results are strong
52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of 52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
53 // resizing capacity from >50% to 100%. 53 // resizing capacity from >50% to 100%.
54 // 54 //
55 // TODO(shess): I'm sure there is math in the world to describe good settings 55 // TODO(shess): I'm sure there is math in the world to describe good settings
56 // for estimating the size of a uniformly-distributed set of integers from a 56 // for estimating the size of a uniformly-distributed set of integers from a
57 // sorted subset. I do not have such math in me, so I assumed that my current 57 // sorted subset. I do not have such math in me, so I assumed that my current
58 // organic database of prefixes was scale-free, and wrote a script to see how 58 // organic database of prefixes was scale-free, and wrote a script to see how
59 // often given slop values would always suffice for given strides. At 1<<30, 59 // often given slop values would always suffice for given strides. At 1<<30,
60 // .5% slop was sufficient to cover all cases (though the code below uses 1%). 60 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
61 // 61 //
62 // TODO(shess): A smaller threshold uses less transient space in reallocation. 62 // TODO(shess): A smaller threshold uses less transient space in reallocation.
63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost 63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
64 // is that a smaller threshold needs more slop (locked down for the long term). 64 // is that a smaller threshold needs more slop (locked down for the long term).
65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. 65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
66 const SBPrefix kEstimateThreshold = 1 << 30; 66 const SBPrefix kEstimateThreshold = 1 << 30;
67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { 67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
68 // estimated_count / current_count == estimated_max / current_prefix 68 // estimated_count / current_count == estimated_max / current_prefix
69 // For large input sets, estimated_max of 2^32 is close enough. 69 // For large input sets, estimated_max of 2^32 is close enough.
70 const size_t estimated_prefix_count = static_cast<size_t>( 70 const size_t estimated_prefix_count = static_cast<size_t>(
71 (static_cast<uint64>(current_count) << 32) / current_prefix); 71 (static_cast<uint64_t>(current_count) << 32) / current_prefix);
72 72
73 // The estimate has an error bar, if the final total is below the estimate, no 73 // The estimate has an error bar, if the final total is below the estimate, no
74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add 74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add
75 // some slop to make sure all cases are covered. 75 // some slop to make sure all cases are covered.
76 return estimated_prefix_count + estimated_prefix_count / 100; 76 return estimated_prefix_count + estimated_prefix_count / 100;
77 } 77 }
78 78
79 } // namespace 79 } // namespace
80 80
81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. 81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
82 // static 82 // static
83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { 83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
84 return a.first < b.first; 84 return a.first < b.first;
85 } 85 }
86 86
87 PrefixSet::PrefixSet() { 87 PrefixSet::PrefixSet() {
88 } 88 }
89 89
90 PrefixSet::PrefixSet(IndexVector* index, 90 PrefixSet::PrefixSet(IndexVector* index,
91 std::vector<uint16>* deltas, 91 std::vector<uint16_t>* deltas,
92 std::vector<SBFullHash>* full_hashes) { 92 std::vector<SBFullHash>* full_hashes) {
93 DCHECK(index && deltas && full_hashes); 93 DCHECK(index && deltas && full_hashes);
94 index_.swap(*index); 94 index_.swap(*index);
95 deltas_.swap(*deltas); 95 deltas_.swap(*deltas);
96 full_hashes_.swap(*full_hashes); 96 full_hashes_.swap(*full_hashes);
97 } 97 }
98 98
99 PrefixSet::~PrefixSet() {} 99 PrefixSet::~PrefixSet() {}
100 100
101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { 101 bool PrefixSet::PrefixExists(SBPrefix prefix) const {
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { 152 for (size_t di = index_[ii].second; di < deltas_end; ++di) {
153 current += deltas_[di]; 153 current += deltas_[di];
154 prefixes->push_back(current); 154 prefixes->push_back(current);
155 } 155 }
156 } 156 }
157 } 157 }
158 158
159 // static 159 // static
160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( 160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile(
161 const base::FilePath& filter_name) { 161 const base::FilePath& filter_name) {
162 int64 size_64; 162 int64_t size_64;
163 if (!base::GetFileSize(filter_name, &size_64)) 163 if (!base::GetFileSize(filter_name, &size_64))
164 return nullptr; 164 return nullptr;
165 using base::MD5Digest; 165 using base::MD5Digest;
166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) 166 if (size_64 < static_cast<int64_t>(sizeof(FileHeader) + sizeof(MD5Digest)))
167 return nullptr; 167 return nullptr;
168 168
169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); 169 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
170 if (!file.get()) 170 if (!file.get())
171 return nullptr; 171 return nullptr;
172 172
173 FileHeader header; 173 FileHeader header;
174 size_t read = fread(&header, sizeof(header), 1, file.get()); 174 size_t read = fread(&header, sizeof(header), 1, file.get());
175 if (read != 1) 175 if (read != 1)
176 return nullptr; 176 return nullptr;
(...skipping 12 matching lines...) Expand all
189 189
190 if (header.version <= kDeprecatedVersion) { 190 if (header.version <= kDeprecatedVersion) {
191 return nullptr; 191 return nullptr;
192 } else if (header.version != kVersion) { 192 } else if (header.version != kVersion) {
193 return nullptr; 193 return nullptr;
194 } 194 }
195 195
196 IndexVector index; 196 IndexVector index;
197 const size_t index_bytes = sizeof(index[0]) * header.index_size; 197 const size_t index_bytes = sizeof(index[0]) * header.index_size;
198 198
199 std::vector<uint16> deltas; 199 std::vector<uint16_t> deltas;
200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; 200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
201 201
202 std::vector<SBFullHash> full_hashes; 202 std::vector<SBFullHash> full_hashes;
203 const size_t full_hashes_bytes = 203 const size_t full_hashes_bytes =
204 sizeof(full_hashes[0]) * header.full_hashes_size; 204 sizeof(full_hashes[0]) * header.full_hashes_size;
205 205
206 // Check for bogus sizes before allocating any space. 206 // Check for bogus sizes before allocating any space.
207 const size_t expected_bytes = sizeof(header) + 207 const size_t expected_bytes = sizeof(header) +
208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); 208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
209 if (static_cast<int64>(expected_bytes) != size_64) 209 if (static_cast<int64_t>(expected_bytes) != size_64)
210 return nullptr; 210 return nullptr;
211 211
212 // Read the index vector. Herb Sutter indicates that vectors are 212 // Read the index vector. Herb Sutter indicates that vectors are
213 // guaranteed to be contiuguous, so reading to where element 0 lives 213 // guaranteed to be contiuguous, so reading to where element 0 lives
214 // is valid. 214 // is valid.
215 if (header.index_size) { 215 if (header.index_size) {
216 index.resize(header.index_size); 216 index.resize(header.index_size);
217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); 217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
218 if (read != index.size()) 218 if (read != index.size())
219 return nullptr; 219 return nullptr;
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
259 259
260 // Steals vector contents using swap(). 260 // Steals vector contents using swap().
261 return make_scoped_ptr( 261 return make_scoped_ptr(
262 new PrefixSet(&index, &deltas, &full_hashes)); 262 new PrefixSet(&index, &deltas, &full_hashes));
263 } 263 }
264 264
265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { 265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
266 FileHeader header; 266 FileHeader header;
267 header.magic = kMagic; 267 header.magic = kMagic;
268 header.version = kVersion; 268 header.version = kVersion;
269 header.index_size = static_cast<uint32>(index_.size()); 269 header.index_size = static_cast<uint32_t>(index_.size());
270 header.deltas_size = static_cast<uint32>(deltas_.size()); 270 header.deltas_size = static_cast<uint32_t>(deltas_.size());
271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size()); 271 header.full_hashes_size = static_cast<uint32_t>(full_hashes_.size());
272 272
273 // Sanity check that the 32-bit values never mess things up. 273 // Sanity check that the 32-bit values never mess things up.
274 if (static_cast<size_t>(header.index_size) != index_.size() || 274 if (static_cast<size_t>(header.index_size) != index_.size() ||
275 static_cast<size_t>(header.deltas_size) != deltas_.size() || 275 static_cast<size_t>(header.deltas_size) != deltas_.size() ||
276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { 276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
277 NOTREACHED(); 277 NOTREACHED();
278 return false; 278 return false;
279 } 279 }
280 280
281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); 281 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
338 if (written != 1) 338 if (written != 1)
339 return false; 339 return false;
340 340
341 // TODO(shess): Can this code check that the close was successful? 341 // TODO(shess): Can this code check that the close was successful?
342 file.reset(); 342 file.reset();
343 343
344 return true; 344 return true;
345 } 345 }
346 346
347 void PrefixSet::AddRun(SBPrefix index_prefix, 347 void PrefixSet::AddRun(SBPrefix index_prefix,
348 const uint16* run_begin, const uint16* run_end) { 348 const uint16_t* run_begin,
349 const uint16_t* run_end) {
349 // Preempt organic capacity decisions for |delta_| once strong estimates can 350 // Preempt organic capacity decisions for |delta_| once strong estimates can
350 // be made. 351 // be made.
351 if (index_prefix > kEstimateThreshold && 352 if (index_prefix > kEstimateThreshold &&
352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { 353 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); 354 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
354 } 355 }
355 356
356 index_.push_back( 357 index_.push_back(
357 std::make_pair(index_prefix, static_cast<uint32>(deltas_.size()))); 358 std::make_pair(index_prefix, static_cast<uint32_t>(deltas_.size())));
358 deltas_.insert(deltas_.end(), run_begin, run_end); 359 deltas_.insert(deltas_.end(), run_begin, run_end);
359 } 360 }
360 361
361 PrefixSetBuilder::PrefixSetBuilder() 362 PrefixSetBuilder::PrefixSetBuilder()
362 : prefix_set_(new PrefixSet()) { 363 : prefix_set_(new PrefixSet()) {
363 } 364 }
364 365
365 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) 366 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
366 : prefix_set_(new PrefixSet()) { 367 : prefix_set_(new PrefixSet()) {
367 for (size_t i = 0; i < prefixes.size(); ++i) { 368 for (size_t i = 0; i < prefixes.size(); ++i) {
(...skipping 25 matching lines...) Expand all
393 } 394 }
394 395
395 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { 396 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
396 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); 397 return GetPrefixSet(std::vector<SBFullHash>()).Pass();
397 } 398 }
398 399
399 void PrefixSetBuilder::EmitRun() { 400 void PrefixSetBuilder::EmitRun() {
400 DCHECK(prefix_set_.get()); 401 DCHECK(prefix_set_.get());
401 402
402 SBPrefix prev_prefix = buffer_[0]; 403 SBPrefix prev_prefix = buffer_[0];
403 uint16 run[PrefixSet::kMaxRun]; 404 uint16_t run[PrefixSet::kMaxRun];
404 size_t run_pos = 0; 405 size_t run_pos = 0;
405 406
406 size_t i; 407 size_t i;
407 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { 408 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
408 // Calculate the delta. |unsigned| is mandatory, because the 409 // Calculate the delta. |unsigned| is mandatory, because the
409 // sorted_prefixes could be more than INT_MAX apart. 410 // sorted_prefixes could be more than INT_MAX apart.
410 DCHECK_GT(buffer_[i], prev_prefix); 411 DCHECK_GT(buffer_[i], prev_prefix);
411 const unsigned delta = buffer_[i] - prev_prefix; 412 const unsigned delta = buffer_[i] - prev_prefix;
412 const uint16 delta16 = static_cast<uint16>(delta); 413 const uint16_t delta16 = static_cast<uint16_t>(delta);
413 414
414 // Break the run if the delta doesn't fit. 415 // Break the run if the delta doesn't fit.
415 if (delta != static_cast<unsigned>(delta16)) 416 if (delta != static_cast<unsigned>(delta16))
416 break; 417 break;
417 418
418 // Continue the run of deltas. 419 // Continue the run of deltas.
419 run[run_pos++] = delta16; 420 run[run_pos++] = delta16;
420 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); 421 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
421 422
422 prev_prefix = buffer_[i]; 423 prev_prefix = buffer_[i];
(...skipping 17 matching lines...) Expand all
440 } 441 }
441 buffer_.push_back(prefix); 442 buffer_.push_back(prefix);
442 443
443 // Flush buffer when a run can be constructed. +1 for the index item, and +1 444 // Flush buffer when a run can be constructed. +1 for the index item, and +1
444 // to leave at least one item in the buffer for dropping duplicates. 445 // to leave at least one item in the buffer for dropping duplicates.
445 if (buffer_.size() > PrefixSet::kMaxRun + 2) 446 if (buffer_.size() > PrefixSet::kMaxRun + 2)
446 EmitRun(); 447 EmitRun();
447 } 448 }
448 449
449 } // namespace safe_browsing 450 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « components/safe_browsing_db/prefix_set.h ('k') | components/safe_browsing_db/prefix_set_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698