chrome/browser/safe_browsing/prefix_set.cc - Issue 1420143002: Revert of Move prefix_set and parts of s_b_util into a new component safe_browsing_db.

Side by Side Diff: chrome/browser/safe_browsing/prefix_set.cc

Issue 1420143002: Revert of Move prefix_set and parts of s_b_util into a new component safe_browsing_db. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "chrome/browser/safe_browsing/prefix_set.h"

	6

	7 #include <algorithm>

	8

	9 #include "base/files/file_util.h"

	10 #include "base/files/scoped_file.h"

	11 #include "base/logging.h"

	12 #include "base/md5.h"

	13 #include "base/metrics/histogram.h"

	14 #include "base/metrics/sparse_histogram.h"

	15

	16 namespace {

	17

	18 // \|kMagic\| should be reasonably unique, and not match itself across

	19 // endianness changes. I generated this value with:

	20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc \| colrm 9

	21 static uint32 kMagic = 0x864088dd;

	22

	23 // Version history:

	24 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10

	25 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27

	26 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05

	27

	28 // Version 2 layout is identical to version 1. The sort order of \|index_\|

	29 // changed from \|int32\| to \|uint32\| to match the change of \|SBPrefix\|.

	30 // Version 3 adds storage for full hashes.

	31 static uint32 kVersion = 3;

	32 static uint32 kDeprecatedVersion = 2; // And lower.

	33

	34 typedef struct {

	35 uint32 magic;

	36 uint32 version;

	37 uint32 index_size;

	38 uint32 deltas_size;

	39 uint32 full_hashes_size;

	40 } FileHeader;

	41

	42 // Common std::vector<> implementations add capacity by multiplying from the

	43 // current size (usually either by 2 or 1.5) to satisfy push_back() running in

	44 // amortized constant time. This is not necessary for insert() at end(), but

	45 // AFAICT it seems true for some implementations. SBPrefix values should

	46 // uniformly cover the 32-bit space, so the final size can be estimated given a

	47 // subset of the input.

	48 //

	49 // \|kEstimateThreshold\| is when estimates start converging. Results are strong

	50 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of

	51 // resizing capacity from >50% to 100%.

	52 //

	53 // TODO(shess): I'm sure there is math in the world to describe good settings

	54 // for estimating the size of a uniformly-distributed set of integers from a

	55 // sorted subset. I do not have such math in me, so I assumed that my current

	56 // organic database of prefixes was scale-free, and wrote a script to see how

	57 // often given slop values would always suffice for given strides. At 1<<30,

	58 // .5% slop was sufficient to cover all cases (though the code below uses 1%).

	59 //

	60 // TODO(shess): A smaller threshold uses less transient space in reallocation.

	61 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost

	62 // is that a smaller threshold needs more slop (locked down for the long term).

	63 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.

	64 const SBPrefix kEstimateThreshold = 1 << 30;

	65 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {

	66 // estimated_count / current_count == estimated_max / current_prefix

	67 // For large input sets, estimated_max of 2^32 is close enough.

	68 const size_t estimated_prefix_count = static_cast<size_t>(

	69 (static_cast<uint64>(current_count) << 32) / current_prefix);

	70

	71 // The estimate has an error bar, if the final total is below the estimate, no

	72 // harm, but if it is above a capacity resize will happen at nearly 100%. Add

	73 // some slop to make sure all cases are covered.

	74 return estimated_prefix_count + estimated_prefix_count / 100;

	75 }

	76

	77 } // namespace

	78

	79 namespace safe_browsing {

	80

	81 // For \|std::upper_bound()\| to find a prefix w/in a vector of pairs.

	82 // static

	83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {

	84 return a.first < b.first;

	85 }

	86

	87 PrefixSet::PrefixSet() {

	88 }

	89

	90 PrefixSet::PrefixSet(IndexVector* index,

	91 std::vector<uint16>* deltas,

	92 std::vector<SBFullHash>* full_hashes) {

	93 DCHECK(index && deltas && full_hashes);

	94 index_.swap(*index);

	95 deltas_.swap(*deltas);

	96 full_hashes_.swap(*full_hashes);

	97 }

	98

	99 PrefixSet::~PrefixSet() {}

	100

	101 bool PrefixSet::PrefixExists(SBPrefix prefix) const {

	102 if (index_.empty())

	103 return false;

	104

	105 // Find the first position after \|prefix\| in \|index_\|.

	106 IndexVector::const_iterator iter =

	107 std::upper_bound(index_.begin(), index_.end(),

	108 IndexPair(prefix, 0), PrefixLess);

	109

	110 // \|prefix\| comes before anything that's in the set.

	111 if (iter == index_.begin())

	112 return false;

	113

	114 // Capture the upper bound of our target entry's deltas.

	115 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);

	116

	117 // Back up to the entry our target is in.

	118 --iter;

	119

	120 // All prefixes in \|index_\| are in the set.

	121 SBPrefix current = iter->first;

	122 if (current == prefix)

	123 return true;

	124

	125 // Scan forward accumulating deltas while a match is possible.

	126 for (size_t di = iter->second; di < bound && current < prefix; ++di) {

	127 current += deltas_[di];

	128 }

	129

	130 return current == prefix;

	131 }

	132

	133 bool PrefixSet::Exists(const SBFullHash& hash) const {

	134 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),

	135 hash, SBFullHashLess)) {

	136 return true;

	137 }

	138 return PrefixExists(hash.prefix);

	139 }

	140

	141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {

	142 prefixes->reserve(index_.size() + deltas_.size());

	143

	144 for (size_t ii = 0; ii < index_.size(); ++ii) {

	145 // The deltas for this \|index_\| entry run to the next index entry,

	146 // or the end of the deltas.

	147 const size_t deltas_end =

	148 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();

	149

	150 SBPrefix current = index_[ii].first;

	151 prefixes->push_back(current);

	152 for (size_t di = index_[ii].second; di < deltas_end; ++di) {

	153 current += deltas_[di];

	154 prefixes->push_back(current);

	155 }

	156 }

	157 }

	158

	159 // static

	160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile(

	161 const base::FilePath& filter_name) {

	162 int64 size_64;

	163 if (!base::GetFileSize(filter_name, &size_64))

	164 return nullptr;

	165 using base::MD5Digest;

	166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))

	167 return nullptr;

	168

	169 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));

	170 if (!file.get())

	171 return nullptr;

	172

	173 FileHeader header;

	174 size_t read = fread(&header, sizeof(header), 1, file.get());

	175 if (read != 1)

	176 return nullptr;

	177

	178 // The file looks valid, start building the digest.

	179 base::MD5Context context;

	180 base::MD5Init(&context);

	181 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),

	182 sizeof(header)));

	183

	184 if (header.magic != kMagic)

	185 return nullptr;

	186

	187 // Track version read to inform removal of support for older versions.

	188 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);

	189

	190 if (header.version <= kDeprecatedVersion) {

	191 return nullptr;

	192 } else if (header.version != kVersion) {

	193 return nullptr;

	194 }

	195

	196 IndexVector index;

	197 const size_t index_bytes = sizeof(index[0]) * header.index_size;

	198

	199 std::vector<uint16> deltas;

	200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;

	201

	202 std::vector<SBFullHash> full_hashes;

	203 const size_t full_hashes_bytes =

	204 sizeof(full_hashes[0]) * header.full_hashes_size;

	205

	206 // Check for bogus sizes before allocating any space.

	207 const size_t expected_bytes = sizeof(header) +

	208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);

	209 if (static_cast<int64>(expected_bytes) != size_64)

	210 return nullptr;

	211

	212 // Read the index vector. Herb Sutter indicates that vectors are

	213 // guaranteed to be contiuguous, so reading to where element 0 lives

	214 // is valid.

	215 if (header.index_size) {

	216 index.resize(header.index_size);

	217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());

	218 if (read != index.size())

	219 return nullptr;

	220 base::MD5Update(&context,

	221 base::StringPiece(reinterpret_cast<char*>(&(index[0])),

	222 index_bytes));

	223 }

	224

	225 // Read vector of deltas.

	226 if (header.deltas_size) {

	227 deltas.resize(header.deltas_size);

	228 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());

	229 if (read != deltas.size())

	230 return nullptr;

	231 base::MD5Update(&context,

	232 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),

	233 deltas_bytes));

	234 }

	235

	236 // Read vector of full hashes.

	237 if (header.full_hashes_size) {

	238 full_hashes.resize(header.full_hashes_size);

	239 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),

	240 file.get());

	241 if (read != full_hashes.size())

	242 return nullptr;

	243 base::MD5Update(&context,

	244 base::StringPiece(

	245 reinterpret_cast<char*>(&(full_hashes[0])),

	246 full_hashes_bytes));

	247 }

	248

	249 base::MD5Digest calculated_digest;

	250 base::MD5Final(&calculated_digest, &context);

	251

	252 base::MD5Digest file_digest;

	253 read = fread(&file_digest, sizeof(file_digest), 1, file.get());

	254 if (read != 1)

	255 return nullptr;

	256

	257 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))

	258 return nullptr;

	259

	260 // Steals vector contents using swap().

	261 return make_scoped_ptr(

	262 new PrefixSet(&index, &deltas, &full_hashes));

	263 }

	264

	265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {

	266 FileHeader header;

	267 header.magic = kMagic;

	268 header.version = kVersion;

	269 header.index_size = static_cast<uint32>(index_.size());

	270 header.deltas_size = static_cast<uint32>(deltas_.size());

	271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size());

	272

	273 // Sanity check that the 32-bit values never mess things up.

	274 if (static_cast<size_t>(header.index_size) != index_.size() \|\|

	275 static_cast<size_t>(header.deltas_size) != deltas_.size() \|\|

	276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {

	277 NOTREACHED();

	278 return false;

	279 }

	280

	281 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));

	282 if (!file.get())

	283 return false;

	284

	285 base::MD5Context context;

	286 base::MD5Init(&context);

	287

	288 // TODO(shess): The I/O code in safe_browsing_store_file.cc would

	289 // sure be useful about now.

	290 size_t written = fwrite(&header, sizeof(header), 1, file.get());

	291 if (written != 1)

	292 return false;

	293 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),

	294 sizeof(header)));

	295

	296 // As for reads, the standard guarantees the ability to access the

	297 // contents of the vector by a pointer to an element.

	298 if (index_.size()) {

	299 const size_t index_bytes = sizeof(index_[0]) * index_.size();

	300 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),

	301 file.get());

	302 if (written != index_.size())

	303 return false;

	304 base::MD5Update(&context,

	305 base::StringPiece(

	306 reinterpret_cast<const char*>(&(index_[0])),

	307 index_bytes));

	308 }

	309

	310 if (deltas_.size()) {

	311 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();

	312 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),

	313 file.get());

	314 if (written != deltas_.size())

	315 return false;

	316 base::MD5Update(&context,

	317 base::StringPiece(

	318 reinterpret_cast<const char*>(&(deltas_[0])),

	319 deltas_bytes));

	320 }

	321

	322 if (full_hashes_.size()) {

	323 const size_t elt_size = sizeof(full_hashes_[0]);

	324 const size_t elts = full_hashes_.size();

	325 const size_t full_hashes_bytes = elt_size * elts;

	326 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());

	327 if (written != elts)

	328 return false;

	329 base::MD5Update(&context,

	330 base::StringPiece(

	331 reinterpret_cast<const char*>(&(full_hashes_[0])),

	332 full_hashes_bytes));

	333 }

	334

	335 base::MD5Digest digest;

	336 base::MD5Final(&digest, &context);

	337 written = fwrite(&digest, sizeof(digest), 1, file.get());

	338 if (written != 1)

	339 return false;

	340

	341 // TODO(shess): Can this code check that the close was successful?

	342 file.reset();

	343

	344 return true;

	345 }

	346

	347 void PrefixSet::AddRun(SBPrefix index_prefix,

	348 const uint16* run_begin, const uint16* run_end) {

	349 // Preempt organic capacity decisions for \|delta_\| once strong estimates can

	350 // be made.

	351 if (index_prefix > kEstimateThreshold &&

	352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {

	353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));

	354 }

	355

	356 index_.push_back(std::make_pair(index_prefix, deltas_.size()));

	357 deltas_.insert(deltas_.end(), run_begin, run_end);

	358 }

	359

	360 PrefixSetBuilder::PrefixSetBuilder()

	361 : prefix_set_(new PrefixSet()) {

	362 }

	363

	364 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)

	365 : prefix_set_(new PrefixSet()) {

	366 for (size_t i = 0; i < prefixes.size(); ++i) {

	367 AddPrefix(prefixes[i]);

	368 }

	369 }

	370

	371 PrefixSetBuilder::~PrefixSetBuilder() {

	372 }

	373

	374 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet(

	375 const std::vector<SBFullHash>& hashes) {

	376 DCHECK(prefix_set_.get());

	377

	378 // Flush runs until buffered data is gone.

	379 while (!buffer_.empty()) {

	380 EmitRun();

	381 }

	382

	383 // Precisely size \|index_\| for read-only. It's 50k-60k, so minor savings, but

	384 // they're almost free.

	385 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);

	386

	387 prefix_set_->full_hashes_ = hashes;

	388 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),

	389 SBFullHashLess);

	390

	391 return prefix_set_.Pass();

	392 }

	393

	394 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {

	395 return GetPrefixSet(std::vector<SBFullHash>()).Pass();

	396 }

	397

	398 void PrefixSetBuilder::EmitRun() {

	399 DCHECK(prefix_set_.get());

	400

	401 SBPrefix prev_prefix = buffer_[0];

	402 uint16 run[PrefixSet::kMaxRun];

	403 size_t run_pos = 0;

	404

	405 size_t i;

	406 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {

	407 // Calculate the delta. \|unsigned\| is mandatory, because the

	408 // sorted_prefixes could be more than INT_MAX apart.

	409 DCHECK_GT(buffer_[i], prev_prefix);

	410 const unsigned delta = buffer_[i] - prev_prefix;

	411 const uint16 delta16 = static_cast<uint16>(delta);

	412

	413 // Break the run if the delta doesn't fit.

	414 if (delta != static_cast<unsigned>(delta16))

	415 break;

	416

	417 // Continue the run of deltas.

	418 run[run_pos++] = delta16;

	419 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);

	420

	421 prev_prefix = buffer_[i];

	422 }

	423 prefix_set_->AddRun(buffer_[0], run, run + run_pos);

	424 buffer_.erase(buffer_.begin(), buffer_.begin() + i);

	425 }

	426

	427 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {

	428 DCHECK(prefix_set_.get());

	429

	430 if (buffer_.empty()) {

	431 DCHECK(prefix_set_->index_.empty());

	432 DCHECK(prefix_set_->deltas_.empty());

	433 } else {

	434 // Drop duplicates.

	435 if (buffer_.back() == prefix)

	436 return;

	437

	438 DCHECK_LT(buffer_.back(), prefix);

	439 }

	440 buffer_.push_back(prefix);

	441

	442 // Flush buffer when a run can be constructed. +1 for the index item, and +1

	443 // to leave at least one item in the buffer for dropping duplicates.

	444 if (buffer_.size() > PrefixSet::kMaxRun + 2)

	445 EmitRun();

	446 }

	447

	448 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « chrome/browser/safe_browsing/prefix_set.h ('k') | chrome/browser/safe_browsing/prefix_set_unittest.cc » ('j') | no next file with comments »