OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/safe_browsing/prefix_set.h" | |
6 | |
7 #include <algorithm> | |
8 | |
9 #include "base/files/file_util.h" | |
10 #include "base/files/scoped_file.h" | |
11 #include "base/logging.h" | |
12 #include "base/md5.h" | |
13 #include "base/metrics/histogram.h" | |
14 #include "base/metrics/sparse_histogram.h" | |
15 | |
16 namespace { | |
17 | |
18 // |kMagic| should be reasonably unique, and not match itself across | |
19 // endianness changes. I generated this value with: | |
20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9 | |
21 static uint32 kMagic = 0x864088dd; | |
22 | |
23 // Version history: | |
24 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10 | |
25 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27 | |
26 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05 | |
27 | |
28 // Version 2 layout is identical to version 1. The sort order of |index_| | |
29 // changed from |int32| to |uint32| to match the change of |SBPrefix|. | |
30 // Version 3 adds storage for full hashes. | |
31 static uint32 kVersion = 3; | |
32 static uint32 kDeprecatedVersion = 2; // And lower. | |
33 | |
34 typedef struct { | |
35 uint32 magic; | |
36 uint32 version; | |
37 uint32 index_size; | |
38 uint32 deltas_size; | |
39 uint32 full_hashes_size; | |
40 } FileHeader; | |
41 | |
42 // Common std::vector<> implementations add capacity by multiplying from the | |
43 // current size (usually either by 2 or 1.5) to satisfy push_back() running in | |
44 // amortized constant time. This is not necessary for insert() at end(), but | |
45 // AFAICT it seems true for some implementations. SBPrefix values should | |
46 // uniformly cover the 32-bit space, so the final size can be estimated given a | |
47 // subset of the input. | |
48 // | |
49 // |kEstimateThreshold| is when estimates start converging. Results are strong | |
50 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of | |
51 // resizing capacity from >50% to 100%. | |
52 // | |
53 // TODO(shess): I'm sure there is math in the world to describe good settings | |
54 // for estimating the size of a uniformly-distributed set of integers from a | |
55 // sorted subset. I do not have such math in me, so I assumed that my current | |
56 // organic database of prefixes was scale-free, and wrote a script to see how | |
57 // often given slop values would always suffice for given strides. At 1<<30, | |
58 // .5% slop was sufficient to cover all cases (though the code below uses 1%). | |
59 // | |
60 // TODO(shess): A smaller threshold uses less transient space in reallocation. | |
61 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost | |
62 // is that a smaller threshold needs more slop (locked down for the long term). | |
63 // 1<<29 worked well with 1%, 1<<27 worked well with 2%. | |
64 const SBPrefix kEstimateThreshold = 1 << 30; | |
65 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) { | |
66 // estimated_count / current_count == estimated_max / current_prefix | |
67 // For large input sets, estimated_max of 2^32 is close enough. | |
68 const size_t estimated_prefix_count = static_cast<size_t>( | |
69 (static_cast<uint64>(current_count) << 32) / current_prefix); | |
70 | |
71 // The estimate has an error bar, if the final total is below the estimate, no | |
72 // harm, but if it is above a capacity resize will happen at nearly 100%. Add | |
73 // some slop to make sure all cases are covered. | |
74 return estimated_prefix_count + estimated_prefix_count / 100; | |
75 } | |
76 | |
77 } // namespace | |
78 | |
79 namespace safe_browsing { | |
80 | |
81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. | |
82 // static | |
83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) { | |
84 return a.first < b.first; | |
85 } | |
86 | |
87 PrefixSet::PrefixSet() { | |
88 } | |
89 | |
90 PrefixSet::PrefixSet(IndexVector* index, | |
91 std::vector<uint16>* deltas, | |
92 std::vector<SBFullHash>* full_hashes) { | |
93 DCHECK(index && deltas && full_hashes); | |
94 index_.swap(*index); | |
95 deltas_.swap(*deltas); | |
96 full_hashes_.swap(*full_hashes); | |
97 } | |
98 | |
99 PrefixSet::~PrefixSet() {} | |
100 | |
101 bool PrefixSet::PrefixExists(SBPrefix prefix) const { | |
102 if (index_.empty()) | |
103 return false; | |
104 | |
105 // Find the first position after |prefix| in |index_|. | |
106 IndexVector::const_iterator iter = | |
107 std::upper_bound(index_.begin(), index_.end(), | |
108 IndexPair(prefix, 0), PrefixLess); | |
109 | |
110 // |prefix| comes before anything that's in the set. | |
111 if (iter == index_.begin()) | |
112 return false; | |
113 | |
114 // Capture the upper bound of our target entry's deltas. | |
115 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second); | |
116 | |
117 // Back up to the entry our target is in. | |
118 --iter; | |
119 | |
120 // All prefixes in |index_| are in the set. | |
121 SBPrefix current = iter->first; | |
122 if (current == prefix) | |
123 return true; | |
124 | |
125 // Scan forward accumulating deltas while a match is possible. | |
126 for (size_t di = iter->second; di < bound && current < prefix; ++di) { | |
127 current += deltas_[di]; | |
128 } | |
129 | |
130 return current == prefix; | |
131 } | |
132 | |
133 bool PrefixSet::Exists(const SBFullHash& hash) const { | |
134 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(), | |
135 hash, SBFullHashLess)) { | |
136 return true; | |
137 } | |
138 return PrefixExists(hash.prefix); | |
139 } | |
140 | |
141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const { | |
142 prefixes->reserve(index_.size() + deltas_.size()); | |
143 | |
144 for (size_t ii = 0; ii < index_.size(); ++ii) { | |
145 // The deltas for this |index_| entry run to the next index entry, | |
146 // or the end of the deltas. | |
147 const size_t deltas_end = | |
148 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size(); | |
149 | |
150 SBPrefix current = index_[ii].first; | |
151 prefixes->push_back(current); | |
152 for (size_t di = index_[ii].second; di < deltas_end; ++di) { | |
153 current += deltas_[di]; | |
154 prefixes->push_back(current); | |
155 } | |
156 } | |
157 } | |
158 | |
159 // static | |
160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile( | |
161 const base::FilePath& filter_name) { | |
162 int64 size_64; | |
163 if (!base::GetFileSize(filter_name, &size_64)) | |
164 return nullptr; | |
165 using base::MD5Digest; | |
166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) | |
167 return nullptr; | |
168 | |
169 base::ScopedFILE file(base::OpenFile(filter_name, "rb")); | |
170 if (!file.get()) | |
171 return nullptr; | |
172 | |
173 FileHeader header; | |
174 size_t read = fread(&header, sizeof(header), 1, file.get()); | |
175 if (read != 1) | |
176 return nullptr; | |
177 | |
178 // The file looks valid, start building the digest. | |
179 base::MD5Context context; | |
180 base::MD5Init(&context); | |
181 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header), | |
182 sizeof(header))); | |
183 | |
184 if (header.magic != kMagic) | |
185 return nullptr; | |
186 | |
187 // Track version read to inform removal of support for older versions. | |
188 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version); | |
189 | |
190 if (header.version <= kDeprecatedVersion) { | |
191 return nullptr; | |
192 } else if (header.version != kVersion) { | |
193 return nullptr; | |
194 } | |
195 | |
196 IndexVector index; | |
197 const size_t index_bytes = sizeof(index[0]) * header.index_size; | |
198 | |
199 std::vector<uint16> deltas; | |
200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size; | |
201 | |
202 std::vector<SBFullHash> full_hashes; | |
203 const size_t full_hashes_bytes = | |
204 sizeof(full_hashes[0]) * header.full_hashes_size; | |
205 | |
206 // Check for bogus sizes before allocating any space. | |
207 const size_t expected_bytes = sizeof(header) + | |
208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest); | |
209 if (static_cast<int64>(expected_bytes) != size_64) | |
210 return nullptr; | |
211 | |
212 // Read the index vector. Herb Sutter indicates that vectors are | |
213 // guaranteed to be contiuguous, so reading to where element 0 lives | |
214 // is valid. | |
215 if (header.index_size) { | |
216 index.resize(header.index_size); | |
217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get()); | |
218 if (read != index.size()) | |
219 return nullptr; | |
220 base::MD5Update(&context, | |
221 base::StringPiece(reinterpret_cast<char*>(&(index[0])), | |
222 index_bytes)); | |
223 } | |
224 | |
225 // Read vector of deltas. | |
226 if (header.deltas_size) { | |
227 deltas.resize(header.deltas_size); | |
228 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get()); | |
229 if (read != deltas.size()) | |
230 return nullptr; | |
231 base::MD5Update(&context, | |
232 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])), | |
233 deltas_bytes)); | |
234 } | |
235 | |
236 // Read vector of full hashes. | |
237 if (header.full_hashes_size) { | |
238 full_hashes.resize(header.full_hashes_size); | |
239 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(), | |
240 file.get()); | |
241 if (read != full_hashes.size()) | |
242 return nullptr; | |
243 base::MD5Update(&context, | |
244 base::StringPiece( | |
245 reinterpret_cast<char*>(&(full_hashes[0])), | |
246 full_hashes_bytes)); | |
247 } | |
248 | |
249 base::MD5Digest calculated_digest; | |
250 base::MD5Final(&calculated_digest, &context); | |
251 | |
252 base::MD5Digest file_digest; | |
253 read = fread(&file_digest, sizeof(file_digest), 1, file.get()); | |
254 if (read != 1) | |
255 return nullptr; | |
256 | |
257 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) | |
258 return nullptr; | |
259 | |
260 // Steals vector contents using swap(). | |
261 return make_scoped_ptr( | |
262 new PrefixSet(&index, &deltas, &full_hashes)); | |
263 } | |
264 | |
265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const { | |
266 FileHeader header; | |
267 header.magic = kMagic; | |
268 header.version = kVersion; | |
269 header.index_size = static_cast<uint32>(index_.size()); | |
270 header.deltas_size = static_cast<uint32>(deltas_.size()); | |
271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size()); | |
272 | |
273 // Sanity check that the 32-bit values never mess things up. | |
274 if (static_cast<size_t>(header.index_size) != index_.size() || | |
275 static_cast<size_t>(header.deltas_size) != deltas_.size() || | |
276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) { | |
277 NOTREACHED(); | |
278 return false; | |
279 } | |
280 | |
281 base::ScopedFILE file(base::OpenFile(filter_name, "wb")); | |
282 if (!file.get()) | |
283 return false; | |
284 | |
285 base::MD5Context context; | |
286 base::MD5Init(&context); | |
287 | |
288 // TODO(shess): The I/O code in safe_browsing_store_file.cc would | |
289 // sure be useful about now. | |
290 size_t written = fwrite(&header, sizeof(header), 1, file.get()); | |
291 if (written != 1) | |
292 return false; | |
293 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header), | |
294 sizeof(header))); | |
295 | |
296 // As for reads, the standard guarantees the ability to access the | |
297 // contents of the vector by a pointer to an element. | |
298 if (index_.size()) { | |
299 const size_t index_bytes = sizeof(index_[0]) * index_.size(); | |
300 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(), | |
301 file.get()); | |
302 if (written != index_.size()) | |
303 return false; | |
304 base::MD5Update(&context, | |
305 base::StringPiece( | |
306 reinterpret_cast<const char*>(&(index_[0])), | |
307 index_bytes)); | |
308 } | |
309 | |
310 if (deltas_.size()) { | |
311 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size(); | |
312 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(), | |
313 file.get()); | |
314 if (written != deltas_.size()) | |
315 return false; | |
316 base::MD5Update(&context, | |
317 base::StringPiece( | |
318 reinterpret_cast<const char*>(&(deltas_[0])), | |
319 deltas_bytes)); | |
320 } | |
321 | |
322 if (full_hashes_.size()) { | |
323 const size_t elt_size = sizeof(full_hashes_[0]); | |
324 const size_t elts = full_hashes_.size(); | |
325 const size_t full_hashes_bytes = elt_size * elts; | |
326 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get()); | |
327 if (written != elts) | |
328 return false; | |
329 base::MD5Update(&context, | |
330 base::StringPiece( | |
331 reinterpret_cast<const char*>(&(full_hashes_[0])), | |
332 full_hashes_bytes)); | |
333 } | |
334 | |
335 base::MD5Digest digest; | |
336 base::MD5Final(&digest, &context); | |
337 written = fwrite(&digest, sizeof(digest), 1, file.get()); | |
338 if (written != 1) | |
339 return false; | |
340 | |
341 // TODO(shess): Can this code check that the close was successful? | |
342 file.reset(); | |
343 | |
344 return true; | |
345 } | |
346 | |
347 void PrefixSet::AddRun(SBPrefix index_prefix, | |
348 const uint16* run_begin, const uint16* run_end) { | |
349 // Preempt organic capacity decisions for |delta_| once strong estimates can | |
350 // be made. | |
351 if (index_prefix > kEstimateThreshold && | |
352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) { | |
353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size())); | |
354 } | |
355 | |
356 index_.push_back(std::make_pair(index_prefix, deltas_.size())); | |
357 deltas_.insert(deltas_.end(), run_begin, run_end); | |
358 } | |
359 | |
360 PrefixSetBuilder::PrefixSetBuilder() | |
361 : prefix_set_(new PrefixSet()) { | |
362 } | |
363 | |
364 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes) | |
365 : prefix_set_(new PrefixSet()) { | |
366 for (size_t i = 0; i < prefixes.size(); ++i) { | |
367 AddPrefix(prefixes[i]); | |
368 } | |
369 } | |
370 | |
371 PrefixSetBuilder::~PrefixSetBuilder() { | |
372 } | |
373 | |
374 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet( | |
375 const std::vector<SBFullHash>& hashes) { | |
376 DCHECK(prefix_set_.get()); | |
377 | |
378 // Flush runs until buffered data is gone. | |
379 while (!buffer_.empty()) { | |
380 EmitRun(); | |
381 } | |
382 | |
383 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but | |
384 // they're almost free. | |
385 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_); | |
386 | |
387 prefix_set_->full_hashes_ = hashes; | |
388 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(), | |
389 SBFullHashLess); | |
390 | |
391 return prefix_set_.Pass(); | |
392 } | |
393 | |
394 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() { | |
395 return GetPrefixSet(std::vector<SBFullHash>()).Pass(); | |
396 } | |
397 | |
398 void PrefixSetBuilder::EmitRun() { | |
399 DCHECK(prefix_set_.get()); | |
400 | |
401 SBPrefix prev_prefix = buffer_[0]; | |
402 uint16 run[PrefixSet::kMaxRun]; | |
403 size_t run_pos = 0; | |
404 | |
405 size_t i; | |
406 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) { | |
407 // Calculate the delta. |unsigned| is mandatory, because the | |
408 // sorted_prefixes could be more than INT_MAX apart. | |
409 DCHECK_GT(buffer_[i], prev_prefix); | |
410 const unsigned delta = buffer_[i] - prev_prefix; | |
411 const uint16 delta16 = static_cast<uint16>(delta); | |
412 | |
413 // Break the run if the delta doesn't fit. | |
414 if (delta != static_cast<unsigned>(delta16)) | |
415 break; | |
416 | |
417 // Continue the run of deltas. | |
418 run[run_pos++] = delta16; | |
419 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta); | |
420 | |
421 prev_prefix = buffer_[i]; | |
422 } | |
423 prefix_set_->AddRun(buffer_[0], run, run + run_pos); | |
424 buffer_.erase(buffer_.begin(), buffer_.begin() + i); | |
425 } | |
426 | |
427 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) { | |
428 DCHECK(prefix_set_.get()); | |
429 | |
430 if (buffer_.empty()) { | |
431 DCHECK(prefix_set_->index_.empty()); | |
432 DCHECK(prefix_set_->deltas_.empty()); | |
433 } else { | |
434 // Drop duplicates. | |
435 if (buffer_.back() == prefix) | |
436 return; | |
437 | |
438 DCHECK_LT(buffer_.back(), prefix); | |
439 } | |
440 buffer_.push_back(prefix); | |
441 | |
442 // Flush buffer when a run can be constructed. +1 for the index item, and +1 | |
443 // to leave at least one item in the buffer for dropping duplicates. | |
444 if (buffer_.size() > PrefixSet::kMaxRun + 2) | |
445 EmitRun(); | |
446 } | |
447 | |
448 } // namespace safe_browsing | |
OLD | NEW |