| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/safe_browsing_db/prefix_set.h" | 5 #include "components/safe_browsing_db/prefix_set.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 | 8 |
| 9 #include "base/files/file_util.h" | 9 #include "base/files/file_util.h" |
| 10 #include "base/files/scoped_file.h" | 10 #include "base/files/scoped_file.h" |
| 11 #include "base/logging.h" | 11 #include "base/logging.h" |
| 12 #include "base/md5.h" | 12 #include "base/md5.h" |
| 13 #include "base/metrics/histogram.h" | 13 #include "base/metrics/histogram.h" |
| 14 #include "base/metrics/sparse_histogram.h" | 14 #include "base/metrics/sparse_histogram.h" |
| 15 | 15 |
| 16 namespace safe_browsing { | 16 namespace safe_browsing { |
| 17 | 17 |
| 18 namespace { | 18 namespace { |
| 19 | 19 |
| 20 // |kMagic| should be reasonably unique, and not match itself across | 20 // |kMagic| should be reasonably unique, and not match itself across |
| 21 // endianness changes. I generated this value with: | 21 // endianness changes. I generated this value with: |
| 22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 | 22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 |
| 23 static uint32 kMagic = 0x864088dd; | 23 static uint32_t kMagic = 0x864088dd; |
| 24 | 24 |
| 25 // Version history: | 25 // Version history: |
| 26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 | 26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 |
| 27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 | 27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 |
| 28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 | 28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 |
| 29 | 29 |
| 30 // Version 2 layout is identical to version 1. The sort order of |index_| | 30 // Version 2 layout is identical to version 1. The sort order of |index_| |
| 31 // changed from |int32| to |uint32| to match the change of |SBPrefix|. | 31 // changed from |int32_t| to |uint32_t| to match the change of |SBPrefix|. |
| 32 // Version 3 adds storage for full hashes. | 32 // Version 3 adds storage for full hashes. |
| 33 static uint32 kVersion = 3; | 33 static uint32_t kVersion = 3; |
| 34 static uint32 kDeprecatedVersion = 2; // And lower. | 34 static uint32_t kDeprecatedVersion = 2; // And lower. |
| 35 | 35 |
| 36 typedef struct { | 36 typedef struct { |
| 37 uint32 magic; | 37 uint32_t magic; |
| 38 uint32 version; | 38 uint32_t version; |
| 39 uint32 index_size; | 39 uint32_t index_size; |
| 40 uint32 deltas_size; | 40 uint32_t deltas_size; |
| 41 uint32 full_hashes_size; | 41 uint32_t full_hashes_size; |
| 42 } FileHeader; | 42 } FileHeader; |
| 43 | 43 |
| 44 // Common std::vector<> implementations add capacity by multiplying from the | 44 // Common std::vector<> implementations add capacity by multiplying from the |
| 45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in | 45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in |
| 46 // amortized constant time. This is not necessary for insert() at end(), but | 46 // amortized constant time. This is not necessary for insert() at end(), but |
| 47 // AFAICT it seems true for some implementations. SBPrefix values should | 47 // AFAICT it seems true for some implementations. SBPrefix values should |
| 48 // uniformly cover the 32-bit space, so the final size can be estimated given a | 48 // uniformly cover the 32-bit space, so the final size can be estimated given a |
| 49 // subset of the input. | 49 // subset of the input. |
| 50 // | 50 // |
| 51 // |kEstimateThreshold| is when estimates start converging. Results are strong | 51 // |kEstimateThreshold| is when estimates start converging. Results are strong |
| 52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of | 52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of |
| 53 // resizing capacity from >50% to 100%. | 53 // resizing capacity from >50% to 100%. |
| 54 // | 54 // |
| 55 // TODO(shess): I'm sure there is math in the world to describe good settings | 55 // TODO(shess): I'm sure there is math in the world to describe good settings |
| 56 // for estimating the size of a uniformly-distributed set of integers from a | 56 // for estimating the size of a uniformly-distributed set of integers from a |
| 57 // sorted subset. I do not have such math in me, so I assumed that my current | 57 // sorted subset. I do not have such math in me, so I assumed that my current |
| 58 // organic database of prefixes was scale-free, and wrote a script to see how | 58 // organic database of prefixes was scale-free, and wrote a script to see how |
| 59 // often given slop values would always suffice for given strides. At 1<<30, | 59 // often given slop values would always suffice for given strides. At 1<<30, |
| 60 // .5% slop was sufficient to cover all cases (though the code below uses 1%). | 60 // .5% slop was sufficient to cover all cases (though the code below uses 1%). |
| 61 // | 61 // |
| 62 // TODO(shess): A smaller threshold uses less transient space in reallocation. | 62 // TODO(shess): A smaller threshold uses less transient space in reallocation. |
| 63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost | 63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost |
| 64 // is that a smaller threshold needs more slop (locked down for the long term). | 64 // is that a smaller threshold needs more slop (locked down for the long term). |
| 65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. | 65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. |
| 66 const SBPrefix kEstimateThreshold = 1 << 30; | 66 const SBPrefix kEstimateThreshold = 1 << 30; |
| 67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { | 67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { |
| 68 // estimated_count / current_count == estimated_max / current_prefix | 68 // estimated_count / current_count == estimated_max / current_prefix |
| 69 // For large input sets, estimated_max of 2^32 is close enough. | 69 // For large input sets, estimated_max of 2^32 is close enough. |
| 70 const size_t estimated_prefix_count = static_cast<size_t>( | 70 const size_t estimated_prefix_count = static_cast<size_t>( |
| 71 (static_cast<uint64>(current_count) << 32) / current_prefix); | 71 (static_cast<uint64_t>(current_count) << 32) / current_prefix); |
| 72 | 72 |
| 73 // The estimate has an error bar, if the final total is below the estimate, no | 73 // The estimate has an error bar, if the final total is below the estimate, no |
| 74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add | 74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add |
| 75 // some slop to make sure all cases are covered. | 75 // some slop to make sure all cases are covered. |
| 76 return estimated_prefix_count + estimated_prefix_count / 100; | 76 return estimated_prefix_count + estimated_prefix_count / 100; |
| 77 } | 77 } |
| 78 | 78 |
| 79 } // namespace | 79 } // namespace |
| 80 | 80 |
| 81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. | 81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. |
| 82 // static | 82 // static |
| 83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { | 83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { |
| 84 return a.first < b.first; | 84 return a.first < b.first; |
| 85 } | 85 } |
| 86 | 86 |
| 87 PrefixSet::PrefixSet() { | 87 PrefixSet::PrefixSet() { |
| 88 } | 88 } |
| 89 | 89 |
| 90 PrefixSet::PrefixSet(IndexVector* index, | 90 PrefixSet::PrefixSet(IndexVector* index, |
| 91 std::vector<uint16>* deltas, | 91 std::vector<uint16_t>* deltas, |
| 92 std::vector<SBFullHash>* full_hashes) { | 92 std::vector<SBFullHash>* full_hashes) { |
| 93 DCHECK(index && deltas && full_hashes); | 93 DCHECK(index && deltas && full_hashes); |
| 94 index_.swap(*index); | 94 index_.swap(*index); |
| 95 deltas_.swap(*deltas); | 95 deltas_.swap(*deltas); |
| 96 full_hashes_.swap(*full_hashes); | 96 full_hashes_.swap(*full_hashes); |
| 97 } | 97 } |
| 98 | 98 |
| 99 PrefixSet::~PrefixSet() {} | 99 PrefixSet::~PrefixSet() {} |
| 100 | 100 |
| 101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { | 101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { | 152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { |
| 153 current += deltas_[di]; | 153 current += deltas_[di]; |
| 154 prefixes->push_back(current); | 154 prefixes->push_back(current); |
| 155 } | 155 } |
| 156 } | 156 } |
| 157 } | 157 } |
| 158 | 158 |
| 159 // static | 159 // static |
| 160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( | 160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( |
| 161 const base::FilePath& filter_name) { | 161 const base::FilePath& filter_name) { |
| 162 int64 size_64; | 162 int64_t size_64; |
| 163 if (!base::GetFileSize(filter_name, &size_64)) | 163 if (!base::GetFileSize(filter_name, &size_64)) |
| 164 return nullptr; | 164 return nullptr; |
| 165 using base::MD5Digest; | 165 using base::MD5Digest; |
| 166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) | 166 if (size_64 < static_cast<int64_t>(sizeof(FileHeader) + sizeof(MD5Digest))) |
| 167 return nullptr; | 167 return nullptr; |
| 168 | 168 |
| 169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); | 169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); |
| 170 if (!file.get()) | 170 if (!file.get()) |
| 171 return nullptr; | 171 return nullptr; |
| 172 | 172 |
| 173 FileHeader header; | 173 FileHeader header; |
| 174 size_t read = fread(&header, sizeof(header), 1, file.get()); | 174 size_t read = fread(&header, sizeof(header), 1, file.get()); |
| 175 if (read != 1) | 175 if (read != 1) |
| 176 return nullptr; | 176 return nullptr; |
| (...skipping 12 matching lines...) Expand all Loading... |
| 189 | 189 |
| 190 if (header.version <= kDeprecatedVersion) { | 190 if (header.version <= kDeprecatedVersion) { |
| 191 return nullptr; | 191 return nullptr; |
| 192 } else if (header.version != kVersion) { | 192 } else if (header.version != kVersion) { |
| 193 return nullptr; | 193 return nullptr; |
| 194 } | 194 } |
| 195 | 195 |
| 196 IndexVector index; | 196 IndexVector index; |
| 197 const size_t index_bytes = sizeof(index[0]) * header.index_size; | 197 const size_t index_bytes = sizeof(index[0]) * header.index_size; |
| 198 | 198 |
| 199 std::vector<uint16> deltas; | 199 std::vector<uint16_t> deltas; |
| 200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; | 200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; |
| 201 | 201 |
| 202 std::vector<SBFullHash> full_hashes; | 202 std::vector<SBFullHash> full_hashes; |
| 203 const size_t full_hashes_bytes = | 203 const size_t full_hashes_bytes = |
| 204 sizeof(full_hashes[0]) * header.full_hashes_size; | 204 sizeof(full_hashes[0]) * header.full_hashes_size; |
| 205 | 205 |
| 206 // Check for bogus sizes before allocating any space. | 206 // Check for bogus sizes before allocating any space. |
| 207 const size_t expected_bytes = sizeof(header) + | 207 const size_t expected_bytes = sizeof(header) + |
| 208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); | 208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); |
| 209 if (static_cast<int64>(expected_bytes) != size_64) | 209 if (static_cast<int64_t>(expected_bytes) != size_64) |
| 210 return nullptr; | 210 return nullptr; |
| 211 | 211 |
| 212 // Read the index vector. Herb Sutter indicates that vectors are | 212 // Read the index vector. Herb Sutter indicates that vectors are |
| 213 // guaranteed to be contiuguous, so reading to where element 0 lives | 213 // guaranteed to be contiuguous, so reading to where element 0 lives |
| 214 // is valid. | 214 // is valid. |
| 215 if (header.index_size) { | 215 if (header.index_size) { |
| 216 index.resize(header.index_size); | 216 index.resize(header.index_size); |
| 217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); | 217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); |
| 218 if (read != index.size()) | 218 if (read != index.size()) |
| 219 return nullptr; | 219 return nullptr; |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 259 | 259 |
| 260 // Steals vector contents using swap(). | 260 // Steals vector contents using swap(). |
| 261 return make_scoped_ptr( | 261 return make_scoped_ptr( |
| 262 new PrefixSet(&index, &deltas, &full_hashes)); | 262 new PrefixSet(&index, &deltas, &full_hashes)); |
| 263 } | 263 } |
| 264 | 264 |
| 265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { | 265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { |
| 266 FileHeader header; | 266 FileHeader header; |
| 267 header.magic = kMagic; | 267 header.magic = kMagic; |
| 268 header.version = kVersion; | 268 header.version = kVersion; |
| 269 header.index_size = static_cast<uint32>(index_.size()); | 269 header.index_size = static_cast<uint32_t>(index_.size()); |
| 270 header.deltas_size = static_cast<uint32>(deltas_.size()); | 270 header.deltas_size = static_cast<uint32_t>(deltas_.size()); |
| 271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size()); | 271 header.full_hashes_size = static_cast<uint32_t>(full_hashes_.size()); |
| 272 | 272 |
| 273 // Sanity check that the 32-bit values never mess things up. | 273 // Sanity check that the 32-bit values never mess things up. |
| 274 if (static_cast<size_t>(header.index_size) != index_.size() || | 274 if (static_cast<size_t>(header.index_size) != index_.size() || |
| 275 static_cast<size_t>(header.deltas_size) != deltas_.size() || | 275 static_cast<size_t>(header.deltas_size) != deltas_.size() || |
| 276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { | 276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { |
| 277 NOTREACHED(); | 277 NOTREACHED(); |
| 278 return false; | 278 return false; |
| 279 } | 279 } |
| 280 | 280 |
| 281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); | 281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 338 if (written != 1) | 338 if (written != 1) |
| 339 return false; | 339 return false; |
| 340 | 340 |
| 341 // TODO(shess): Can this code check that the close was successful? | 341 // TODO(shess): Can this code check that the close was successful? |
| 342 file.reset(); | 342 file.reset(); |
| 343 | 343 |
| 344 return true; | 344 return true; |
| 345 } | 345 } |
| 346 | 346 |
| 347 void PrefixSet::AddRun(SBPrefix index_prefix, | 347 void PrefixSet::AddRun(SBPrefix index_prefix, |
| 348 const uint16* run_begin, const uint16* run_end) { | 348 const uint16_t* run_begin, |
| 349 const uint16_t* run_end) { |
| 349 // Preempt organic capacity decisions for |delta_| once strong estimates can | 350 // Preempt organic capacity decisions for |delta_| once strong estimates can |
| 350 // be made. | 351 // be made. |
| 351 if (index_prefix > kEstimateThreshold && | 352 if (index_prefix > kEstimateThreshold && |
| 352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { | 353 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { |
| 353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); | 354 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); |
| 354 } | 355 } |
| 355 | 356 |
| 356 index_.push_back( | 357 index_.push_back( |
| 357 std::make_pair(index_prefix, static_cast<uint32>(deltas_.size()))); | 358 std::make_pair(index_prefix, static_cast<uint32_t>(deltas_.size()))); |
| 358 deltas_.insert(deltas_.end(), run_begin, run_end); | 359 deltas_.insert(deltas_.end(), run_begin, run_end); |
| 359 } | 360 } |
| 360 | 361 |
| 361 PrefixSetBuilder::PrefixSetBuilder() | 362 PrefixSetBuilder::PrefixSetBuilder() |
| 362 : prefix_set_(new PrefixSet()) { | 363 : prefix_set_(new PrefixSet()) { |
| 363 } | 364 } |
| 364 | 365 |
| 365 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) | 366 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) |
| 366 : prefix_set_(new PrefixSet()) { | 367 : prefix_set_(new PrefixSet()) { |
| 367 for (size_t i = 0; i < prefixes.size(); ++i) { | 368 for (size_t i = 0; i < prefixes.size(); ++i) { |
| (...skipping 25 matching lines...) Expand all Loading... |
| 393 } | 394 } |
| 394 | 395 |
| 395 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { | 396 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { |
| 396 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); | 397 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); |
| 397 } | 398 } |
| 398 | 399 |
| 399 void PrefixSetBuilder::EmitRun() { | 400 void PrefixSetBuilder::EmitRun() { |
| 400 DCHECK(prefix_set_.get()); | 401 DCHECK(prefix_set_.get()); |
| 401 | 402 |
| 402 SBPrefix prev_prefix = buffer_[0]; | 403 SBPrefix prev_prefix = buffer_[0]; |
| 403 uint16 run[PrefixSet::kMaxRun]; | 404 uint16_t run[PrefixSet::kMaxRun]; |
| 404 size_t run_pos = 0; | 405 size_t run_pos = 0; |
| 405 | 406 |
| 406 size_t i; | 407 size_t i; |
| 407 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { | 408 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { |
| 408 // Calculate the delta. |unsigned| is mandatory, because the | 409 // Calculate the delta. |unsigned| is mandatory, because the |
| 409 // sorted_prefixes could be more than INT_MAX apart. | 410 // sorted_prefixes could be more than INT_MAX apart. |
| 410 DCHECK_GT(buffer_[i], prev_prefix); | 411 DCHECK_GT(buffer_[i], prev_prefix); |
| 411 const unsigned delta = buffer_[i] - prev_prefix; | 412 const unsigned delta = buffer_[i] - prev_prefix; |
| 412 const uint16 delta16 = static_cast<uint16>(delta); | 413 const uint16_t delta16 = static_cast<uint16_t>(delta); |
| 413 | 414 |
| 414 // Break the run if the delta doesn't fit. | 415 // Break the run if the delta doesn't fit. |
| 415 if (delta != static_cast<unsigned>(delta16)) | 416 if (delta != static_cast<unsigned>(delta16)) |
| 416 break; | 417 break; |
| 417 | 418 |
| 418 // Continue the run of deltas. | 419 // Continue the run of deltas. |
| 419 run[run_pos++] = delta16; | 420 run[run_pos++] = delta16; |
| 420 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); | 421 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); |
| 421 | 422 |
| 422 prev_prefix = buffer_[i]; | 423 prev_prefix = buffer_[i]; |
| (...skipping 17 matching lines...) Expand all Loading... |
| 440 } | 441 } |
| 441 buffer_.push_back(prefix); | 442 buffer_.push_back(prefix); |
| 442 | 443 |
| 443 // Flush buffer when a run can be constructed. +1 for the index item, and +1 | 444 // Flush buffer when a run can be constructed. +1 for the index item, and +1 |
| 444 // to leave at least one item in the buffer for dropping duplicates. | 445 // to leave at least one item in the buffer for dropping duplicates. |
| 445 if (buffer_.size() > PrefixSet::kMaxRun + 2) | 446 if (buffer_.size() > PrefixSet::kMaxRun + 2) |
| 446 EmitRun(); | 447 EmitRun(); |
| 447 } | 448 } |
| 448 | 449 |
| 449 } // namespace safe_browsing | 450 } // namespace safe_browsing |
| OLD | NEW |