components/safe_browsing_db/prefix_set.cc - Issue 1551433002: Switch to standard integer types in components/, part 3 of 4.

Side by Side Diff: components/safe_browsing_db/prefix_set.cc

Issue 1551433002: Switch to standard integer types in components/, part 3 of 4. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: more Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/safe_browsing_db/prefix_set.h"	5 #include "components/safe_browsing_db/prefix_set.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "base/files/file_util.h"	9 #include "base/files/file_util.h"

10 #include "base/files/scoped_file.h"	10 #include "base/files/scoped_file.h"

11 #include "base/logging.h"	11 #include "base/logging.h"

12 #include "base/md5.h"	12 #include "base/md5.h"

13 #include "base/metrics/histogram.h"	13 #include "base/metrics/histogram.h"

14 #include "base/metrics/sparse_histogram.h"	14 #include "base/metrics/sparse_histogram.h"

15	15

16 namespace safe_browsing {	16 namespace safe_browsing {

17	17

18 namespace {	18 namespace {

19	19

20 // \|kMagic\| should be reasonably unique, and not match itself across	20 // \|kMagic\| should be reasonably unique, and not match itself across

21 // endianness changes. I generated this value with:	21 // endianness changes. I generated this value with:

22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc \| colrm 9	22 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc \| colrm 9

23 static uint32 kMagic = 0x864088dd;	23 static uint32_t kMagic = 0x864088dd;

24	24

25 // Version history:	25 // Version history:

26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10	26 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10

27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27	27 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27

28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05	28 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05

29	29

30 // Version 2 layout is identical to version 1. The sort order of \|index_\|	30 // Version 2 layout is identical to version 1. The sort order of \|index_\|

31 // changed from \|int32\| to \|uint32\| to match the change of \|SBPrefix\|.	31 // changed from \|int32_t\| to \|uint32_t\| to match the change of \|SBPrefix\|.

32 // Version 3 adds storage for full hashes.	32 // Version 3 adds storage for full hashes.

33 static uint32 kVersion = 3;	33 static uint32_t kVersion = 3;

34 static uint32 kDeprecatedVersion = 2; // And lower.	34 static uint32_t kDeprecatedVersion = 2; // And lower.

35	35

36 typedef struct {	36 typedef struct {

37 uint32 magic;	37 uint32_t magic;

38 uint32 version;	38 uint32_t version;

39 uint32 index_size;	39 uint32_t index_size;

40 uint32 deltas_size;	40 uint32_t deltas_size;

41 uint32 full_hashes_size;	41 uint32_t full_hashes_size;

42 } FileHeader;	42 } FileHeader;

43	43

44 // Common std::vector<> implementations add capacity by multiplying from the	44 // Common std::vector<> implementations add capacity by multiplying from the

45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in	45 // current size (usually either by 2 or 1.5) to satisfy push_back() running in

46 // amortized constant time. This is not necessary for insert() at end(), but	46 // amortized constant time. This is not necessary for insert() at end(), but

47 // AFAICT it seems true for some implementations. SBPrefix values should	47 // AFAICT it seems true for some implementations. SBPrefix values should

48 // uniformly cover the 32-bit space, so the final size can be estimated given a	48 // uniformly cover the 32-bit space, so the final size can be estimated given a

49 // subset of the input.	49 // subset of the input.

50 //	50 //

51 // \|kEstimateThreshold\| is when estimates start converging. Results are strong	51 // \|kEstimateThreshold\| is when estimates start converging. Results are strong

52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of	52 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of

53 // resizing capacity from >50% to 100%.	53 // resizing capacity from >50% to 100%.

54 //	54 //

55 // TODO(shess): I'm sure there is math in the world to describe good settings	55 // TODO(shess): I'm sure there is math in the world to describe good settings

56 // for estimating the size of a uniformly-distributed set of integers from a	56 // for estimating the size of a uniformly-distributed set of integers from a

57 // sorted subset. I do not have such math in me, so I assumed that my current	57 // sorted subset. I do not have such math in me, so I assumed that my current

58 // organic database of prefixes was scale-free, and wrote a script to see how	58 // organic database of prefixes was scale-free, and wrote a script to see how

59 // often given slop values would always suffice for given strides. At 1<<30,	59 // often given slop values would always suffice for given strides. At 1<<30,

60 // .5% slop was sufficient to cover all cases (though the code below uses 1%).	60 // .5% slop was sufficient to cover all cases (though the code below uses 1%).

61 //	61 //

62 // TODO(shess): A smaller threshold uses less transient space in reallocation.	62 // TODO(shess): A smaller threshold uses less transient space in reallocation.

63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost	63 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost

64 // is that a smaller threshold needs more slop (locked down for the long term).	64 // is that a smaller threshold needs more slop (locked down for the long term).

65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.	65 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.

66 const SBPrefix kEstimateThreshold = 1 << 30;	66 const SBPrefix kEstimateThreshold = 1 << 30;

67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {	67 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {

68 // estimated_count / current_count == estimated_max / current_prefix	68 // estimated_count / current_count == estimated_max / current_prefix

69 // For large input sets, estimated_max of 2^32 is close enough.	69 // For large input sets, estimated_max of 2^32 is close enough.

70 const size_t estimated_prefix_count = static_cast<size_t>(	70 const size_t estimated_prefix_count = static_cast<size_t>(

71 (static_cast<uint64>(current_count) << 32) / current_prefix);	71 (static_cast<uint64_t>(current_count) << 32) / current_prefix);

72	72

73 // The estimate has an error bar, if the final total is below the estimate, no	73 // The estimate has an error bar, if the final total is below the estimate, no

74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add	74 // harm, but if it is above a capacity resize will happen at nearly 100%. Add

75 // some slop to make sure all cases are covered.	75 // some slop to make sure all cases are covered.

76 return estimated_prefix_count + estimated_prefix_count / 100;	76 return estimated_prefix_count + estimated_prefix_count / 100;

77 }	77 }

78	78

79 } // namespace	79 } // namespace

80	80

81 // For \|std::upper_bound()\| to find a prefix w/in a vector of pairs.	81 // For \|std::upper_bound()\| to find a prefix w/in a vector of pairs.

82 // static	82 // static

83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {	83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {

84 return a.first < b.first;	84 return a.first < b.first;

85 }	85 }

86	86

87 PrefixSet::PrefixSet() {	87 PrefixSet::PrefixSet() {

88 }	88 }

89	89

90 PrefixSet::PrefixSet(IndexVector* index,	90 PrefixSet::PrefixSet(IndexVector* index,

91 std::vector<uint16>* deltas,	91 std::vector<uint16_t>* deltas,

92 std::vector<SBFullHash>* full_hashes) {	92 std::vector<SBFullHash>* full_hashes) {

93 DCHECK(index && deltas && full_hashes);	93 DCHECK(index && deltas && full_hashes);

94 index_.swap(*index);	94 index_.swap(*index);

95 deltas_.swap(*deltas);	95 deltas_.swap(*deltas);

96 full_hashes_.swap(*full_hashes);	96 full_hashes_.swap(*full_hashes);

97 }	97 }

98	98

99 PrefixSet::~PrefixSet() {}	99 PrefixSet::~PrefixSet() {}

100	100

101 bool PrefixSet::PrefixExists(SBPrefix prefix) const {	101 bool PrefixSet::PrefixExists(SBPrefix prefix) const {

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
152 for (size_t di = index_[ii].second; di < deltas_end; ++di) {	152 for (size_t di = index_[ii].second; di < deltas_end; ++di) {

153 current += deltas_[di];	153 current += deltas_[di];

154 prefixes->push_back(current);	154 prefixes->push_back(current);

155 }	155 }

156 }	156 }

157 }	157 }

158	158

159 // static	159 // static

160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile(	160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile(

161 const base::FilePath& filter_name) {	161 const base::FilePath& filter_name) {

162 int64 size_64;	162 int64_t size_64;

163 if (!base::GetFileSize(filter_name, &size_64))	163 if (!base::GetFileSize(filter_name, &size_64))

164 return nullptr;	164 return nullptr;

165 using base::MD5Digest;	165 using base::MD5Digest;

166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))	166 if (size_64 < static_cast<int64_t>(sizeof(FileHeader) + sizeof(MD5Digest)))

167 return nullptr;	167 return nullptr;

168	168

169 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));	169 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));

170 if (!file.get())	170 if (!file.get())

171 return nullptr;	171 return nullptr;

172	172

173 FileHeader header;	173 FileHeader header;

174 size_t read = fread(&header, sizeof(header), 1, file.get());	174 size_t read = fread(&header, sizeof(header), 1, file.get());

175 if (read != 1)	175 if (read != 1)

176 return nullptr;	176 return nullptr;

(...skipping 12 matching lines...) Expand all Loading...
189	189

190 if (header.version <= kDeprecatedVersion) {	190 if (header.version <= kDeprecatedVersion) {

191 return nullptr;	191 return nullptr;

192 } else if (header.version != kVersion) {	192 } else if (header.version != kVersion) {

193 return nullptr;	193 return nullptr;

194 }	194 }

195	195

196 IndexVector index;	196 IndexVector index;

197 const size_t index_bytes = sizeof(index[0]) * header.index_size;	197 const size_t index_bytes = sizeof(index[0]) * header.index_size;

198	198

199 std::vector<uint16> deltas;	199 std::vector<uint16_t> deltas;

200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;	200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;

201	201

202 std::vector<SBFullHash> full_hashes;	202 std::vector<SBFullHash> full_hashes;

203 const size_t full_hashes_bytes =	203 const size_t full_hashes_bytes =

204 sizeof(full_hashes[0]) * header.full_hashes_size;	204 sizeof(full_hashes[0]) * header.full_hashes_size;

205	205

206 // Check for bogus sizes before allocating any space.	206 // Check for bogus sizes before allocating any space.

207 const size_t expected_bytes = sizeof(header) +	207 const size_t expected_bytes = sizeof(header) +

208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);	208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);

209 if (static_cast<int64>(expected_bytes) != size_64)	209 if (static_cast<int64_t>(expected_bytes) != size_64)

210 return nullptr;	210 return nullptr;

211	211

212 // Read the index vector. Herb Sutter indicates that vectors are	212 // Read the index vector. Herb Sutter indicates that vectors are

213 // guaranteed to be contiuguous, so reading to where element 0 lives	213 // guaranteed to be contiuguous, so reading to where element 0 lives

214 // is valid.	214 // is valid.

215 if (header.index_size) {	215 if (header.index_size) {

216 index.resize(header.index_size);	216 index.resize(header.index_size);

217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());	217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());

218 if (read != index.size())	218 if (read != index.size())

219 return nullptr;	219 return nullptr;

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
259	259

260 // Steals vector contents using swap().	260 // Steals vector contents using swap().

261 return make_scoped_ptr(	261 return make_scoped_ptr(

262 new PrefixSet(&index, &deltas, &full_hashes));	262 new PrefixSet(&index, &deltas, &full_hashes));

263 }	263 }

264	264

265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {	265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {

266 FileHeader header;	266 FileHeader header;

267 header.magic = kMagic;	267 header.magic = kMagic;

268 header.version = kVersion;	268 header.version = kVersion;

269 header.index_size = static_cast<uint32>(index_.size());	269 header.index_size = static_cast<uint32_t>(index_.size());

270 header.deltas_size = static_cast<uint32>(deltas_.size());	270 header.deltas_size = static_cast<uint32_t>(deltas_.size());

271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size());	271 header.full_hashes_size = static_cast<uint32_t>(full_hashes_.size());

272	272

273 // Sanity check that the 32-bit values never mess things up.	273 // Sanity check that the 32-bit values never mess things up.

274 if (static_cast<size_t>(header.index_size) != index_.size() \|\|	274 if (static_cast<size_t>(header.index_size) != index_.size() \|\|

275 static_cast<size_t>(header.deltas_size) != deltas_.size() \|\|	275 static_cast<size_t>(header.deltas_size) != deltas_.size() \|\|

276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {	276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {

277 NOTREACHED();	277 NOTREACHED();

278 return false;	278 return false;

279 }	279 }

280	280

281 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));	281 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
338 if (written != 1)	338 if (written != 1)

339 return false;	339 return false;

340	340

341 // TODO(shess): Can this code check that the close was successful?	341 // TODO(shess): Can this code check that the close was successful?

342 file.reset();	342 file.reset();

343	343

344 return true;	344 return true;

345 }	345 }

346	346

347 void PrefixSet::AddRun(SBPrefix index_prefix,	347 void PrefixSet::AddRun(SBPrefix index_prefix,

348 const uint16* run_begin, const uint16* run_end) {	348 const uint16_t* run_begin,

	349 const uint16_t* run_end) {

349 // Preempt organic capacity decisions for \|delta_\| once strong estimates can	350 // Preempt organic capacity decisions for \|delta_\| once strong estimates can

350 // be made.	351 // be made.

351 if (index_prefix > kEstimateThreshold &&	352 if (index_prefix > kEstimateThreshold &&

352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {	353 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {

353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));	354 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));

354 }	355 }

355	356

356 index_.push_back(	357 index_.push_back(

357 std::make_pair(index_prefix, static_cast<uint32>(deltas_.size())));	358 std::make_pair(index_prefix, static_cast<uint32_t>(deltas_.size())));

358 deltas_.insert(deltas_.end(), run_begin, run_end);	359 deltas_.insert(deltas_.end(), run_begin, run_end);

359 }	360 }

360	361

361 PrefixSetBuilder::PrefixSetBuilder()	362 PrefixSetBuilder::PrefixSetBuilder()

362 : prefix_set_(new PrefixSet()) {	363 : prefix_set_(new PrefixSet()) {

363 }	364 }

364	365

365 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)	366 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)

366 : prefix_set_(new PrefixSet()) {	367 : prefix_set_(new PrefixSet()) {

367 for (size_t i = 0; i < prefixes.size(); ++i) {	368 for (size_t i = 0; i < prefixes.size(); ++i) {

(...skipping 25 matching lines...) Expand all Loading...
393 }	394 }

394	395

395 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {	396 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {

396 return GetPrefixSet(std::vector<SBFullHash>()).Pass();	397 return GetPrefixSet(std::vector<SBFullHash>()).Pass();

397 }	398 }

398	399

399 void PrefixSetBuilder::EmitRun() {	400 void PrefixSetBuilder::EmitRun() {

400 DCHECK(prefix_set_.get());	401 DCHECK(prefix_set_.get());

401	402

402 SBPrefix prev_prefix = buffer_[0];	403 SBPrefix prev_prefix = buffer_[0];

403 uint16 run[PrefixSet::kMaxRun];	404 uint16_t run[PrefixSet::kMaxRun];

404 size_t run_pos = 0;	405 size_t run_pos = 0;

405	406

406 size_t i;	407 size_t i;

407 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {	408 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {

408 // Calculate the delta. \|unsigned\| is mandatory, because the	409 // Calculate the delta. \|unsigned\| is mandatory, because the

409 // sorted_prefixes could be more than INT_MAX apart.	410 // sorted_prefixes could be more than INT_MAX apart.

410 DCHECK_GT(buffer_[i], prev_prefix);	411 DCHECK_GT(buffer_[i], prev_prefix);

411 const unsigned delta = buffer_[i] - prev_prefix;	412 const unsigned delta = buffer_[i] - prev_prefix;

412 const uint16 delta16 = static_cast<uint16>(delta);	413 const uint16_t delta16 = static_cast<uint16_t>(delta);

413	414

414 // Break the run if the delta doesn't fit.	415 // Break the run if the delta doesn't fit.

415 if (delta != static_cast<unsigned>(delta16))	416 if (delta != static_cast<unsigned>(delta16))

416 break;	417 break;

417	418

418 // Continue the run of deltas.	419 // Continue the run of deltas.

419 run[run_pos++] = delta16;	420 run[run_pos++] = delta16;

420 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);	421 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);

421	422

422 prev_prefix = buffer_[i];	423 prev_prefix = buffer_[i];

(...skipping 17 matching lines...) Expand all Loading...
440 }	441 }

441 buffer_.push_back(prefix);	442 buffer_.push_back(prefix);

442	443

443 // Flush buffer when a run can be constructed. +1 for the index item, and +1	444 // Flush buffer when a run can be constructed. +1 for the index item, and +1

444 // to leave at least one item in the buffer for dropping duplicates.	445 // to leave at least one item in the buffer for dropping duplicates.

445 if (buffer_.size() > PrefixSet::kMaxRun + 2)	446 if (buffer_.size() > PrefixSet::kMaxRun + 2)

446 EmitRun();	447 EmitRun();

447 }	448 }

448	449

449 } // namespace safe_browsing	450 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « components/safe_browsing_db/prefix_set.h ('k') | components/safe_browsing_db/prefix_set_unittest.cc » ('j') | no next file with comments »