OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/safe_browsing_db/prefix_set.h" | 5 #include "components/safe_browsing_db/prefix_set.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 | 8 |
9 #include "base/files/file_util.h" | 9 #include "base/files/file_util.h" |
10 #include "base/files/scoped_file.h" | 10 #include "base/files/scoped_file.h" |
11 #include "base/logging.h" | 11 #include "base/logging.h" |
12 #include "base/md5.h" | 12 #include "base/md5.h" |
13 #include "base/metrics/histogram.h" | 13 #include "base/metrics/histogram.h" |
14 #include "base/metrics/sparse_histogram.h" | 14 #include "base/metrics/sparse_histogram.h" |
15 | 15 |
16 namespace safe_browsing { | 16 namespace safe_browsing { |
17 | 17 |
18 namespace { | 18 namespace { |
19 | 19 |
20 // |kMagic| should be reasonably unique, and not match itself across | 20 // |kMagic| should be reasonably unique, and not match itself across |
21 // endianness changes. I generated this value with: | 21 // endianness changes. I generated this value with: |
22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 | 22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 |
23 static uint32 kMagic = 0x864088dd; | 23 static uint32_t kMagic = 0x864088dd; |
24 | 24 |
25 // Version history: | 25 // Version history: |
26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 | 26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 |
27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 | 27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 |
28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 | 28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 |
29 | 29 |
30 // Version 2 layout is identical to version 1. The sort order of |index_| | 30 // Version 2 layout is identical to version 1. The sort order of |index_| |
31 // changed from |int32| to |uint32| to match the change of |SBPrefix|. | 31 // changed from |int32_t| to |uint32_t| to match the change of |SBPrefix|. |
32 // Version 3 adds storage for full hashes. | 32 // Version 3 adds storage for full hashes. |
33 static uint32 kVersion = 3; | 33 static uint32_t kVersion = 3; |
34 static uint32 kDeprecatedVersion = 2; // And lower. | 34 static uint32_t kDeprecatedVersion = 2; // And lower. |
35 | 35 |
36 typedef struct { | 36 typedef struct { |
37 uint32 magic; | 37 uint32_t magic; |
38 uint32 version; | 38 uint32_t version; |
39 uint32 index_size; | 39 uint32_t index_size; |
40 uint32 deltas_size; | 40 uint32_t deltas_size; |
41 uint32 full_hashes_size; | 41 uint32_t full_hashes_size; |
42 } FileHeader; | 42 } FileHeader; |
43 | 43 |
44 // Common std::vector<> implementations add capacity by multiplying from the | 44 // Common std::vector<> implementations add capacity by multiplying from the |
45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in | 45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in |
46 // amortized constant time. This is not necessary for insert() at end(), but | 46 // amortized constant time. This is not necessary for insert() at end(), but |
47 // AFAICT it seems true for some implementations. SBPrefix values should | 47 // AFAICT it seems true for some implementations. SBPrefix values should |
48 // uniformly cover the 32-bit space, so the final size can be estimated given a | 48 // uniformly cover the 32-bit space, so the final size can be estimated given a |
49 // subset of the input. | 49 // subset of the input. |
50 // | 50 // |
51 // |kEstimateThreshold| is when estimates start converging. Results are strong | 51 // |kEstimateThreshold| is when estimates start converging. Results are strong |
52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of | 52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of |
53 // resizing capacity from >50% to 100%. | 53 // resizing capacity from >50% to 100%. |
54 // | 54 // |
55 // TODO(shess): I'm sure there is math in the world to describe good settings | 55 // TODO(shess): I'm sure there is math in the world to describe good settings |
56 // for estimating the size of a uniformly-distributed set of integers from a | 56 // for estimating the size of a uniformly-distributed set of integers from a |
57 // sorted subset. I do not have such math in me, so I assumed that my current | 57 // sorted subset. I do not have such math in me, so I assumed that my current |
58 // organic database of prefixes was scale-free, and wrote a script to see how | 58 // organic database of prefixes was scale-free, and wrote a script to see how |
59 // often given slop values would always suffice for given strides. At 1<<30, | 59 // often given slop values would always suffice for given strides. At 1<<30, |
60 // .5% slop was sufficient to cover all cases (though the code below uses 1%). | 60 // .5% slop was sufficient to cover all cases (though the code below uses 1%). |
61 // | 61 // |
62 // TODO(shess): A smaller threshold uses less transient space in reallocation. | 62 // TODO(shess): A smaller threshold uses less transient space in reallocation. |
63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost | 63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost |
64 // is that a smaller threshold needs more slop (locked down for the long term). | 64 // is that a smaller threshold needs more slop (locked down for the long term). |
65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. | 65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. |
66 const SBPrefix kEstimateThreshold = 1 << 30; | 66 const SBPrefix kEstimateThreshold = 1 << 30; |
67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { | 67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { |
68 // estimated_count / current_count == estimated_max / current_prefix | 68 // estimated_count / current_count == estimated_max / current_prefix |
69 // For large input sets, estimated_max of 2^32 is close enough. | 69 // For large input sets, estimated_max of 2^32 is close enough. |
70 const size_t estimated_prefix_count = static_cast<size_t>( | 70 const size_t estimated_prefix_count = static_cast<size_t>( |
71 (static_cast<uint64>(current_count) << 32) / current_prefix); | 71 (static_cast<uint64_t>(current_count) << 32) / current_prefix); |
72 | 72 |
73 // The estimate has an error bar, if the final total is below the estimate, no | 73 // The estimate has an error bar, if the final total is below the estimate, no |
74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add | 74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add |
75 // some slop to make sure all cases are covered. | 75 // some slop to make sure all cases are covered. |
76 return estimated_prefix_count + estimated_prefix_count / 100; | 76 return estimated_prefix_count + estimated_prefix_count / 100; |
77 } | 77 } |
78 | 78 |
79 } // namespace | 79 } // namespace |
80 | 80 |
81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. | 81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. |
82 // static | 82 // static |
83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { | 83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { |
84 return a.first < b.first; | 84 return a.first < b.first; |
85 } | 85 } |
86 | 86 |
87 PrefixSet::PrefixSet() { | 87 PrefixSet::PrefixSet() { |
88 } | 88 } |
89 | 89 |
90 PrefixSet::PrefixSet(IndexVector* index, | 90 PrefixSet::PrefixSet(IndexVector* index, |
91 std::vector<uint16>* deltas, | 91 std::vector<uint16_t>* deltas, |
92 std::vector<SBFullHash>* full_hashes) { | 92 std::vector<SBFullHash>* full_hashes) { |
93 DCHECK(index && deltas && full_hashes); | 93 DCHECK(index && deltas && full_hashes); |
94 index_.swap(*index); | 94 index_.swap(*index); |
95 deltas_.swap(*deltas); | 95 deltas_.swap(*deltas); |
96 full_hashes_.swap(*full_hashes); | 96 full_hashes_.swap(*full_hashes); |
97 } | 97 } |
98 | 98 |
99 PrefixSet::~PrefixSet() {} | 99 PrefixSet::~PrefixSet() {} |
100 | 100 |
101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { | 101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { | 152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { |
153 current += deltas_[di]; | 153 current += deltas_[di]; |
154 prefixes->push_back(current); | 154 prefixes->push_back(current); |
155 } | 155 } |
156 } | 156 } |
157 } | 157 } |
158 | 158 |
159 // static | 159 // static |
160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( | 160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( |
161 const base::FilePath& filter_name) { | 161 const base::FilePath& filter_name) { |
162 int64 size_64; | 162 int64_t size_64; |
163 if (!base::GetFileSize(filter_name, &size_64)) | 163 if (!base::GetFileSize(filter_name, &size_64)) |
164 return nullptr; | 164 return nullptr; |
165 using base::MD5Digest; | 165 using base::MD5Digest; |
166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) | 166 if (size_64 < static_cast<int64_t>(sizeof(FileHeader) + sizeof(MD5Digest))) |
167 return nullptr; | 167 return nullptr; |
168 | 168 |
169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); | 169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); |
170 if (!file.get()) | 170 if (!file.get()) |
171 return nullptr; | 171 return nullptr; |
172 | 172 |
173 FileHeader header; | 173 FileHeader header; |
174 size_t read = fread(&header, sizeof(header), 1, file.get()); | 174 size_t read = fread(&header, sizeof(header), 1, file.get()); |
175 if (read != 1) | 175 if (read != 1) |
176 return nullptr; | 176 return nullptr; |
(...skipping 12 matching lines...) Expand all Loading... |
189 | 189 |
190 if (header.version <= kDeprecatedVersion) { | 190 if (header.version <= kDeprecatedVersion) { |
191 return nullptr; | 191 return nullptr; |
192 } else if (header.version != kVersion) { | 192 } else if (header.version != kVersion) { |
193 return nullptr; | 193 return nullptr; |
194 } | 194 } |
195 | 195 |
196 IndexVector index; | 196 IndexVector index; |
197 const size_t index_bytes = sizeof(index[0]) * header.index_size; | 197 const size_t index_bytes = sizeof(index[0]) * header.index_size; |
198 | 198 |
199 std::vector<uint16> deltas; | 199 std::vector<uint16_t> deltas; |
200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; | 200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; |
201 | 201 |
202 std::vector<SBFullHash> full_hashes; | 202 std::vector<SBFullHash> full_hashes; |
203 const size_t full_hashes_bytes = | 203 const size_t full_hashes_bytes = |
204 sizeof(full_hashes[0]) * header.full_hashes_size; | 204 sizeof(full_hashes[0]) * header.full_hashes_size; |
205 | 205 |
206 // Check for bogus sizes before allocating any space. | 206 // Check for bogus sizes before allocating any space. |
207 const size_t expected_bytes = sizeof(header) + | 207 const size_t expected_bytes = sizeof(header) + |
208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); | 208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); |
209 if (static_cast<int64>(expected_bytes) != size_64) | 209 if (static_cast<int64_t>(expected_bytes) != size_64) |
210 return nullptr; | 210 return nullptr; |
211 | 211 |
212 // Read the index vector. Herb Sutter indicates that vectors are | 212 // Read the index vector. Herb Sutter indicates that vectors are |
213 // guaranteed to be contiuguous, so reading to where element 0 lives | 213 // guaranteed to be contiuguous, so reading to where element 0 lives |
214 // is valid. | 214 // is valid. |
215 if (header.index_size) { | 215 if (header.index_size) { |
216 index.resize(header.index_size); | 216 index.resize(header.index_size); |
217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); | 217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); |
218 if (read != index.size()) | 218 if (read != index.size()) |
219 return nullptr; | 219 return nullptr; |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
259 | 259 |
260 // Steals vector contents using swap(). | 260 // Steals vector contents using swap(). |
261 return make_scoped_ptr( | 261 return make_scoped_ptr( |
262 new PrefixSet(&index, &deltas, &full_hashes)); | 262 new PrefixSet(&index, &deltas, &full_hashes)); |
263 } | 263 } |
264 | 264 |
265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { | 265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { |
266 FileHeader header; | 266 FileHeader header; |
267 header.magic = kMagic; | 267 header.magic = kMagic; |
268 header.version = kVersion; | 268 header.version = kVersion; |
269 header.index_size = static_cast<uint32>(index_.size()); | 269 header.index_size = static_cast<uint32_t>(index_.size()); |
270 header.deltas_size = static_cast<uint32>(deltas_.size()); | 270 header.deltas_size = static_cast<uint32_t>(deltas_.size()); |
271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size()); | 271 header.full_hashes_size = static_cast<uint32_t>(full_hashes_.size()); |
272 | 272 |
273 // Sanity check that the 32-bit values never mess things up. | 273 // Sanity check that the 32-bit values never mess things up. |
274 if (static_cast<size_t>(header.index_size) != index_.size() || | 274 if (static_cast<size_t>(header.index_size) != index_.size() || |
275 static_cast<size_t>(header.deltas_size) != deltas_.size() || | 275 static_cast<size_t>(header.deltas_size) != deltas_.size() || |
276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { | 276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { |
277 NOTREACHED(); | 277 NOTREACHED(); |
278 return false; | 278 return false; |
279 } | 279 } |
280 | 280 |
281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); | 281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
338 if (written != 1) | 338 if (written != 1) |
339 return false; | 339 return false; |
340 | 340 |
341 // TODO(shess): Can this code check that the close was successful? | 341 // TODO(shess): Can this code check that the close was successful? |
342 file.reset(); | 342 file.reset(); |
343 | 343 |
344 return true; | 344 return true; |
345 } | 345 } |
346 | 346 |
347 void PrefixSet::AddRun(SBPrefix index_prefix, | 347 void PrefixSet::AddRun(SBPrefix index_prefix, |
348 const uint16* run_begin, const uint16* run_end) { | 348 const uint16_t* run_begin, |
| 349 const uint16_t* run_end) { |
349 // Preempt organic capacity decisions for |delta_| once strong estimates can | 350 // Preempt organic capacity decisions for |delta_| once strong estimates can |
350 // be made. | 351 // be made. |
351 if (index_prefix > kEstimateThreshold && | 352 if (index_prefix > kEstimateThreshold && |
352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { | 353 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { |
353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); | 354 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); |
354 } | 355 } |
355 | 356 |
356 index_.push_back( | 357 index_.push_back( |
357 std::make_pair(index_prefix, static_cast<uint32>(deltas_.size()))); | 358 std::make_pair(index_prefix, static_cast<uint32_t>(deltas_.size()))); |
358 deltas_.insert(deltas_.end(), run_begin, run_end); | 359 deltas_.insert(deltas_.end(), run_begin, run_end); |
359 } | 360 } |
360 | 361 |
361 PrefixSetBuilder::PrefixSetBuilder() | 362 PrefixSetBuilder::PrefixSetBuilder() |
362 : prefix_set_(new PrefixSet()) { | 363 : prefix_set_(new PrefixSet()) { |
363 } | 364 } |
364 | 365 |
365 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) | 366 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) |
366 : prefix_set_(new PrefixSet()) { | 367 : prefix_set_(new PrefixSet()) { |
367 for (size_t i = 0; i < prefixes.size(); ++i) { | 368 for (size_t i = 0; i < prefixes.size(); ++i) { |
(...skipping 25 matching lines...) Expand all Loading... |
393 } | 394 } |
394 | 395 |
395 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { | 396 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { |
396 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); | 397 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); |
397 } | 398 } |
398 | 399 |
399 void PrefixSetBuilder::EmitRun() { | 400 void PrefixSetBuilder::EmitRun() { |
400 DCHECK(prefix_set_.get()); | 401 DCHECK(prefix_set_.get()); |
401 | 402 |
402 SBPrefix prev_prefix = buffer_[0]; | 403 SBPrefix prev_prefix = buffer_[0]; |
403 uint16 run[PrefixSet::kMaxRun]; | 404 uint16_t run[PrefixSet::kMaxRun]; |
404 size_t run_pos = 0; | 405 size_t run_pos = 0; |
405 | 406 |
406 size_t i; | 407 size_t i; |
407 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { | 408 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { |
408 // Calculate the delta. |unsigned| is mandatory, because the | 409 // Calculate the delta. |unsigned| is mandatory, because the |
409 // sorted_prefixes could be more than INT_MAX apart. | 410 // sorted_prefixes could be more than INT_MAX apart. |
410 DCHECK_GT(buffer_[i], prev_prefix); | 411 DCHECK_GT(buffer_[i], prev_prefix); |
411 const unsigned delta = buffer_[i] - prev_prefix; | 412 const unsigned delta = buffer_[i] - prev_prefix; |
412 const uint16 delta16 = static_cast<uint16>(delta); | 413 const uint16_t delta16 = static_cast<uint16_t>(delta); |
413 | 414 |
414 // Break the run if the delta doesn't fit. | 415 // Break the run if the delta doesn't fit. |
415 if (delta != static_cast<unsigned>(delta16)) | 416 if (delta != static_cast<unsigned>(delta16)) |
416 break; | 417 break; |
417 | 418 |
418 // Continue the run of deltas. | 419 // Continue the run of deltas. |
419 run[run_pos++] = delta16; | 420 run[run_pos++] = delta16; |
420 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); | 421 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); |
421 | 422 |
422 prev_prefix = buffer_[i]; | 423 prev_prefix = buffer_[i]; |
(...skipping 17 matching lines...) Expand all Loading... |
440 } | 441 } |
441 buffer_.push_back(prefix); | 442 buffer_.push_back(prefix); |
442 | 443 |
443 // Flush buffer when a run can be constructed. +1 for the index item, and +1 | 444 // Flush buffer when a run can be constructed. +1 for the index item, and +1 |
444 // to leave at least one item in the buffer for dropping duplicates. | 445 // to leave at least one item in the buffer for dropping duplicates. |
445 if (buffer_.size() > PrefixSet::kMaxRun + 2) | 446 if (buffer_.size() > PrefixSet::kMaxRun + 2) |
446 EmitRun(); | 447 EmitRun(); |
447 } | 448 } |
448 | 449 |
449 } // namespace safe_browsing | 450 } // namespace safe_browsing |
OLD | NEW |