OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/browser/safe_browsing/prefix_set.h" |
| 6 |
| 7 #include <algorithm> |
| 8 |
| 9 #include "base/files/file_util.h" |
| 10 #include "base/files/scoped_file.h" |
| 11 #include "base/logging.h" |
| 12 #include "base/md5.h" |
| 13 #include "base/metrics/histogram.h" |
| 14 #include "base/metrics/sparse_histogram.h" |
| 15 |
| 16 namespace { |
| 17 |
| 18 // |kMagic| should be reasonably unique, and not match itself across |
| 19 // endianness changes. I generated this value with: |
| 20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 |
| 21 static uint32 kMagic = 0x864088dd; |
| 22 |
| 23 // Version history: |
| 24 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 |
| 25 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 |
| 26 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 |
| 27 |
| 28 // Version 2 layout is identical to version 1. The sort order of |index_| |
| 29 // changed from |int32| to |uint32| to match the change of |SBPrefix|. |
| 30 // Version 3 adds storage for full hashes. |
| 31 static uint32 kVersion = 3; |
| 32 static uint32 kDeprecatedVersion = 2; // And lower. |
| 33 |
| 34 typedef struct { |
| 35 uint32 magic; |
| 36 uint32 version; |
| 37 uint32 index_size; |
| 38 uint32 deltas_size; |
| 39 uint32 full_hashes_size; |
| 40 } FileHeader; |
| 41 |
| 42 // Common std::vector<> implementations add capacity by multiplying from the |
| 43 // current size (usually either by 2 or 1.5) to satisfy push_back() running in |
| 44 // amortized constant time. This is not necessary for insert() at end(), but |
| 45 // AFAICT it seems true for some implementations. SBPrefix values should |
| 46 // uniformly cover the 32-bit space, so the final size can be estimated given a |
| 47 // subset of the input. |
| 48 // |
| 49 // |kEstimateThreshold| is when estimates start converging. Results are strong |
| 50 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of |
| 51 // resizing capacity from >50% to 100%. |
| 52 // |
| 53 // TODO(shess): I'm sure there is math in the world to describe good settings |
| 54 // for estimating the size of a uniformly-distributed set of integers from a |
| 55 // sorted subset. I do not have such math in me, so I assumed that my current |
| 56 // organic database of prefixes was scale-free, and wrote a script to see how |
| 57 // often given slop values would always suffice for given strides. At 1<<30, |
| 58 // .5% slop was sufficient to cover all cases (though the code below uses 1%). |
| 59 // |
| 60 // TODO(shess): A smaller threshold uses less transient space in reallocation. |
| 61 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost |
| 62 // is that a smaller threshold needs more slop (locked down for the long term). |
| 63 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. |
| 64 const SBPrefix kEstimateThreshold = 1 << 30; |
| 65 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { |
| 66 // estimated_count / current_count == estimated_max / current_prefix |
| 67 // For large input sets, estimated_max of 2^32 is close enough. |
| 68 const size_t estimated_prefix_count = static_cast<size_t>( |
| 69 (static_cast<uint64>(current_count) << 32) / current_prefix); |
| 70 |
| 71 // The estimate has an error bar, if the final total is below the estimate, no |
| 72 // harm, but if it is above a capacity resize will happen at nearly 100%. Add |
| 73 // some slop to make sure all cases are covered. |
| 74 return estimated_prefix_count + estimated_prefix_count / 100; |
| 75 } |
| 76 |
| 77 } // namespace |
| 78 |
| 79 namespace safe_browsing { |
| 80 |
| 81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. |
| 82 // static |
| 83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { |
| 84 return a.first < b.first; |
| 85 } |
| 86 |
| 87 PrefixSet::PrefixSet() { |
| 88 } |
| 89 |
| 90 PrefixSet::PrefixSet(IndexVector* index, |
| 91 std::vector<uint16>* deltas, |
| 92 std::vector<SBFullHash>* full_hashes) { |
| 93 DCHECK(index && deltas && full_hashes); |
| 94 index_.swap(*index); |
| 95 deltas_.swap(*deltas); |
| 96 full_hashes_.swap(*full_hashes); |
| 97 } |
| 98 |
| 99 PrefixSet::~PrefixSet() {} |
| 100 |
| 101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { |
| 102 if (index_.empty()) |
| 103 return false; |
| 104 |
| 105 // Find the first position after |prefix| in |index_|. |
| 106 IndexVector::const_iterator iter = |
| 107 std::upper_bound(index_.begin(), index_.end(), |
| 108 IndexPair(prefix, 0), PrefixLess); |
| 109 |
| 110 // |prefix| comes before anything that's in the set. |
| 111 if (iter == index_.begin()) |
| 112 return false; |
| 113 |
| 114 // Capture the upper bound of our target entry's deltas. |
| 115 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second); |
| 116 |
| 117 // Back up to the entry our target is in. |
| 118 --iter; |
| 119 |
| 120 // All prefixes in |index_| are in the set. |
| 121 SBPrefix current = iter->first; |
| 122 if (current == prefix) |
| 123 return true; |
| 124 |
| 125 // Scan forward accumulating deltas while a match is possible. |
| 126 for (size_t di = iter->second; di < bound && current < prefix; ++di) { |
| 127 current += deltas_[di]; |
| 128 } |
| 129 |
| 130 return current == prefix; |
| 131 } |
| 132 |
| 133 bool PrefixSet::Exists(const SBFullHash& hash) const { |
| 134 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(), |
| 135 hash, SBFullHashLess)) { |
| 136 return true; |
| 137 } |
| 138 return PrefixExists(hash.prefix); |
| 139 } |
| 140 |
| 141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const { |
| 142 prefixes->reserve(index_.size() + deltas_.size()); |
| 143 |
| 144 for (size_t ii = 0; ii < index_.size(); ++ii) { |
| 145 // The deltas for this |index_| entry run to the next index entry, |
| 146 // or the end of the deltas. |
| 147 const size_t deltas_end = |
| 148 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size(); |
| 149 |
| 150 SBPrefix current = index_[ii].first; |
| 151 prefixes->push_back(current); |
| 152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { |
| 153 current += deltas_[di]; |
| 154 prefixes->push_back(current); |
| 155 } |
| 156 } |
| 157 } |
| 158 |
| 159 // static |
| 160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( |
| 161 const base::FilePath& filter_name) { |
| 162 int64 size_64; |
| 163 if (!base::GetFileSize(filter_name, &size_64)) |
| 164 return nullptr; |
| 165 using base::MD5Digest; |
| 166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) |
| 167 return nullptr; |
| 168 |
| 169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); |
| 170 if (!file.get()) |
| 171 return nullptr; |
| 172 |
| 173 FileHeader header; |
| 174 size_t read = fread(&header, sizeof(header), 1, file.get()); |
| 175 if (read != 1) |
| 176 return nullptr; |
| 177 |
| 178 // The file looks valid, start building the digest. |
| 179 base::MD5Context context; |
| 180 base::MD5Init(&context); |
| 181 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header), |
| 182 sizeof(header))); |
| 183 |
| 184 if (header.magic != kMagic) |
| 185 return nullptr; |
| 186 |
| 187 // Track version read to inform removal of support for older versions. |
| 188 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version); |
| 189 |
| 190 if (header.version <= kDeprecatedVersion) { |
| 191 return nullptr; |
| 192 } else if (header.version != kVersion) { |
| 193 return nullptr; |
| 194 } |
| 195 |
| 196 IndexVector index; |
| 197 const size_t index_bytes = sizeof(index[0]) * header.index_size; |
| 198 |
| 199 std::vector<uint16> deltas; |
| 200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; |
| 201 |
| 202 std::vector<SBFullHash> full_hashes; |
| 203 const size_t full_hashes_bytes = |
| 204 sizeof(full_hashes[0]) * header.full_hashes_size; |
| 205 |
| 206 // Check for bogus sizes before allocating any space. |
| 207 const size_t expected_bytes = sizeof(header) + |
| 208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); |
| 209 if (static_cast<int64>(expected_bytes) != size_64) |
| 210 return nullptr; |
| 211 |
| 212 // Read the index vector. Herb Sutter indicates that vectors are |
| 213 // guaranteed to be contiuguous, so reading to where element 0 lives |
| 214 // is valid. |
| 215 if (header.index_size) { |
| 216 index.resize(header.index_size); |
| 217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); |
| 218 if (read != index.size()) |
| 219 return nullptr; |
| 220 base::MD5Update(&context, |
| 221 base::StringPiece(reinterpret_cast<char*>(&(index[0])), |
| 222 index_bytes)); |
| 223 } |
| 224 |
| 225 // Read vector of deltas. |
| 226 if (header.deltas_size) { |
| 227 deltas.resize(header.deltas_size); |
| 228 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get()); |
| 229 if (read != deltas.size()) |
| 230 return nullptr; |
| 231 base::MD5Update(&context, |
| 232 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])), |
| 233 deltas_bytes)); |
| 234 } |
| 235 |
| 236 // Read vector of full hashes. |
| 237 if (header.full_hashes_size) { |
| 238 full_hashes.resize(header.full_hashes_size); |
| 239 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(), |
| 240 file.get()); |
| 241 if (read != full_hashes.size()) |
| 242 return nullptr; |
| 243 base::MD5Update(&context, |
| 244 base::StringPiece( |
| 245 reinterpret_cast<char*>(&(full_hashes[0])), |
| 246 full_hashes_bytes)); |
| 247 } |
| 248 |
| 249 base::MD5Digest calculated_digest; |
| 250 base::MD5Final(&calculated_digest, &context); |
| 251 |
| 252 base::MD5Digest file_digest; |
| 253 read = fread(&file_digest, sizeof(file_digest), 1, file.get()); |
| 254 if (read != 1) |
| 255 return nullptr; |
| 256 |
| 257 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) |
| 258 return nullptr; |
| 259 |
| 260 // Steals vector contents using swap(). |
| 261 return make_scoped_ptr( |
| 262 new PrefixSet(&index, &deltas, &full_hashes)); |
| 263 } |
| 264 |
| 265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { |
| 266 FileHeader header; |
| 267 header.magic = kMagic; |
| 268 header.version = kVersion; |
| 269 header.index_size = static_cast<uint32>(index_.size()); |
| 270 header.deltas_size = static_cast<uint32>(deltas_.size()); |
| 271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size()); |
| 272 |
| 273 // Sanity check that the 32-bit values never mess things up. |
| 274 if (static_cast<size_t>(header.index_size) != index_.size() || |
| 275 static_cast<size_t>(header.deltas_size) != deltas_.size() || |
| 276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { |
| 277 NOTREACHED(); |
| 278 return false; |
| 279 } |
| 280 |
| 281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); |
| 282 if (!file.get()) |
| 283 return false; |
| 284 |
| 285 base::MD5Context context; |
| 286 base::MD5Init(&context); |
| 287 |
| 288 // TODO(shess): The I/O code in safe_browsing_store_file.cc would |
| 289 // sure be useful about now. |
| 290 size_t written = fwrite(&header, sizeof(header), 1, file.get()); |
| 291 if (written != 1) |
| 292 return false; |
| 293 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header), |
| 294 sizeof(header))); |
| 295 |
| 296 // As for reads, the standard guarantees the ability to access the |
| 297 // contents of the vector by a pointer to an element. |
| 298 if (index_.size()) { |
| 299 const size_t index_bytes = sizeof(index_[0]) * index_.size(); |
| 300 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), |
| 301 file.get()); |
| 302 if (written != index_.size()) |
| 303 return false; |
| 304 base::MD5Update(&context, |
| 305 base::StringPiece( |
| 306 reinterpret_cast<const char*>(&(index_[0])), |
| 307 index_bytes)); |
| 308 } |
| 309 |
| 310 if (deltas_.size()) { |
| 311 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size(); |
| 312 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(), |
| 313 file.get()); |
| 314 if (written != deltas_.size()) |
| 315 return false; |
| 316 base::MD5Update(&context, |
| 317 base::StringPiece( |
| 318 reinterpret_cast<const char*>(&(deltas_[0])), |
| 319 deltas_bytes)); |
| 320 } |
| 321 |
| 322 if (full_hashes_.size()) { |
| 323 const size_t elt_size = sizeof(full_hashes_[0]); |
| 324 const size_t elts = full_hashes_.size(); |
| 325 const size_t full_hashes_bytes = elt_size * elts; |
| 326 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get()); |
| 327 if (written != elts) |
| 328 return false; |
| 329 base::MD5Update(&context, |
| 330 base::StringPiece( |
| 331 reinterpret_cast<const char*>(&(full_hashes_[0])), |
| 332 full_hashes_bytes)); |
| 333 } |
| 334 |
| 335 base::MD5Digest digest; |
| 336 base::MD5Final(&digest, &context); |
| 337 written = fwrite(&digest, sizeof(digest), 1, file.get()); |
| 338 if (written != 1) |
| 339 return false; |
| 340 |
| 341 // TODO(shess): Can this code check that the close was successful? |
| 342 file.reset(); |
| 343 |
| 344 return true; |
| 345 } |
| 346 |
| 347 void PrefixSet::AddRun(SBPrefix index_prefix, |
| 348 const uint16* run_begin, const uint16* run_end) { |
| 349 // Preempt organic capacity decisions for |delta_| once strong estimates can |
| 350 // be made. |
| 351 if (index_prefix > kEstimateThreshold && |
| 352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { |
| 353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); |
| 354 } |
| 355 |
| 356 index_.push_back(std::make_pair(index_prefix, deltas_.size())); |
| 357 deltas_.insert(deltas_.end(), run_begin, run_end); |
| 358 } |
| 359 |
| 360 PrefixSetBuilder::PrefixSetBuilder() |
| 361 : prefix_set_(new PrefixSet()) { |
| 362 } |
| 363 |
| 364 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) |
| 365 : prefix_set_(new PrefixSet()) { |
| 366 for (size_t i = 0; i < prefixes.size(); ++i) { |
| 367 AddPrefix(prefixes[i]); |
| 368 } |
| 369 } |
| 370 |
| 371 PrefixSetBuilder::~PrefixSetBuilder() { |
| 372 } |
| 373 |
| 374 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet( |
| 375 const std::vector<SBFullHash>& hashes) { |
| 376 DCHECK(prefix_set_.get()); |
| 377 |
| 378 // Flush runs until buffered data is gone. |
| 379 while (!buffer_.empty()) { |
| 380 EmitRun(); |
| 381 } |
| 382 |
| 383 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but |
| 384 // they're almost free. |
| 385 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_); |
| 386 |
| 387 prefix_set_->full_hashes_ = hashes; |
| 388 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(), |
| 389 SBFullHashLess); |
| 390 |
| 391 return prefix_set_.Pass(); |
| 392 } |
| 393 |
| 394 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { |
| 395 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); |
| 396 } |
| 397 |
| 398 void PrefixSetBuilder::EmitRun() { |
| 399 DCHECK(prefix_set_.get()); |
| 400 |
| 401 SBPrefix prev_prefix = buffer_[0]; |
| 402 uint16 run[PrefixSet::kMaxRun]; |
| 403 size_t run_pos = 0; |
| 404 |
| 405 size_t i; |
| 406 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { |
| 407 // Calculate the delta. |unsigned| is mandatory, because the |
| 408 // sorted_prefixes could be more than INT_MAX apart. |
| 409 DCHECK_GT(buffer_[i], prev_prefix); |
| 410 const unsigned delta = buffer_[i] - prev_prefix; |
| 411 const uint16 delta16 = static_cast<uint16>(delta); |
| 412 |
| 413 // Break the run if the delta doesn't fit. |
| 414 if (delta != static_cast<unsigned>(delta16)) |
| 415 break; |
| 416 |
| 417 // Continue the run of deltas. |
| 418 run[run_pos++] = delta16; |
| 419 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); |
| 420 |
| 421 prev_prefix = buffer_[i]; |
| 422 } |
| 423 prefix_set_->AddRun(buffer_[0], run, run + run_pos); |
| 424 buffer_.erase(buffer_.begin(), buffer_.begin() + i); |
| 425 } |
| 426 |
| 427 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) { |
| 428 DCHECK(prefix_set_.get()); |
| 429 |
| 430 if (buffer_.empty()) { |
| 431 DCHECK(prefix_set_->index_.empty()); |
| 432 DCHECK(prefix_set_->deltas_.empty()); |
| 433 } else { |
| 434 // Drop duplicates. |
| 435 if (buffer_.back() == prefix) |
| 436 return; |
| 437 |
| 438 DCHECK_LT(buffer_.back(), prefix); |
| 439 } |
| 440 buffer_.push_back(prefix); |
| 441 |
| 442 // Flush buffer when a run can be constructed. +1 for the index item, and +1 |
| 443 // to leave at least one item in the buffer for dropping duplicates. |
| 444 if (buffer_.size() > PrefixSet::kMaxRun + 2) |
| 445 EmitRun(); |
| 446 } |
| 447 |
| 448 } // namespace safe_browsing |
OLD | NEW |