chrome/browser/safe_browsing/prefix_set.cc - Issue 1399843003: Move prefix_set and parts of s_b_util into a new component safe_browsing_db.

Side by Side Diff: chrome/browser/safe_browsing/prefix_set.cc

Issue 1399843003: Move prefix_set and parts of s_b_util into a new component safe_browsing_db. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Use group instead of a static_library for targets that have no sources. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/browser/safe_browsing/prefix_set.h"

6

7 #include <algorithm>

8

9 #include "base/files/file_util.h"

10 #include "base/files/scoped_file.h"

11 #include "base/logging.h"

12 #include "base/md5.h"

13 #include "base/metrics/histogram.h"

14 #include "base/metrics/sparse_histogram.h"

15

16 namespace {

17

18 // \|kMagic\| should be reasonably unique, and not match itself across

19 // endianness changes. I generated this value with:

20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc \| colrm 9

21 static uint32 kMagic = 0x864088dd;

22

23 // Version history:

24 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10

25 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27

26 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05

27

28 // Version 2 layout is identical to version 1. The sort order of \|index_\|

29 // changed from \|int32\| to \|uint32\| to match the change of \|SBPrefix\|.

30 // Version 3 adds storage for full hashes.

31 static uint32 kVersion = 3;

32 static uint32 kDeprecatedVersion = 2; // And lower.

33

34 typedef struct {

35 uint32 magic;

36 uint32 version;

37 uint32 index_size;

38 uint32 deltas_size;

39 uint32 full_hashes_size;

40 } FileHeader;

41

42 // Common std::vector<> implementations add capacity by multiplying from the

43 // current size (usually either by 2 or 1.5) to satisfy push_back() running in

44 // amortized constant time. This is not necessary for insert() at end(), but

45 // AFAICT it seems true for some implementations. SBPrefix values should

46 // uniformly cover the 32-bit space, so the final size can be estimated given a

47 // subset of the input.

48 //

49 // \|kEstimateThreshold\| is when estimates start converging. Results are strong

50 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of

51 // resizing capacity from >50% to 100%.

52 //

53 // TODO(shess): I'm sure there is math in the world to describe good settings

54 // for estimating the size of a uniformly-distributed set of integers from a

55 // sorted subset. I do not have such math in me, so I assumed that my current

56 // organic database of prefixes was scale-free, and wrote a script to see how

57 // often given slop values would always suffice for given strides. At 1<<30,

58 // .5% slop was sufficient to cover all cases (though the code below uses 1%).

59 //

60 // TODO(shess): A smaller threshold uses less transient space in reallocation.

61 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost

62 // is that a smaller threshold needs more slop (locked down for the long term).

63 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.

64 const SBPrefix kEstimateThreshold = 1 << 30;

65 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {

66 // estimated_count / current_count == estimated_max / current_prefix

67 // For large input sets, estimated_max of 2^32 is close enough.

68 const size_t estimated_prefix_count = static_cast<size_t>(

69 (static_cast<uint64>(current_count) << 32) / current_prefix);

70

71 // The estimate has an error bar, if the final total is below the estimate, no

72 // harm, but if it is above a capacity resize will happen at nearly 100%. Add

73 // some slop to make sure all cases are covered.

74 return estimated_prefix_count + estimated_prefix_count / 100;

75 }

76

77 } // namespace

78

79 namespace safe_browsing {

80

81 // For \|std::upper_bound()\| to find a prefix w/in a vector of pairs.

82 // static

83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {

84 return a.first < b.first;

85 }

86

87 PrefixSet::PrefixSet() {

88 }

89

90 PrefixSet::PrefixSet(IndexVector* index,

91 std::vector<uint16>* deltas,

92 std::vector<SBFullHash>* full_hashes) {

93 DCHECK(index && deltas && full_hashes);

94 index_.swap(*index);

95 deltas_.swap(*deltas);

96 full_hashes_.swap(*full_hashes);

97 }

98

99 PrefixSet::~PrefixSet() {}

100

101 bool PrefixSet::PrefixExists(SBPrefix prefix) const {

102 if (index_.empty())

103 return false;

104

105 // Find the first position after \|prefix\| in \|index_\|.

106 IndexVector::const_iterator iter =

107 std::upper_bound(index_.begin(), index_.end(),

108 IndexPair(prefix, 0), PrefixLess);

109

110 // \|prefix\| comes before anything that's in the set.

111 if (iter == index_.begin())

112 return false;

113

114 // Capture the upper bound of our target entry's deltas.

115 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);

116

117 // Back up to the entry our target is in.

118 --iter;

119

120 // All prefixes in \|index_\| are in the set.

121 SBPrefix current = iter->first;

122 if (current == prefix)

123 return true;

124

125 // Scan forward accumulating deltas while a match is possible.

126 for (size_t di = iter->second; di < bound && current < prefix; ++di) {

127 current += deltas_[di];

128 }

129

130 return current == prefix;

131 }

132

133 bool PrefixSet::Exists(const SBFullHash& hash) const {

134 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),

135 hash, SBFullHashLess)) {

136 return true;

137 }

138 return PrefixExists(hash.prefix);

139 }

140

141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {

142 prefixes->reserve(index_.size() + deltas_.size());

143

144 for (size_t ii = 0; ii < index_.size(); ++ii) {

145 // The deltas for this \|index_\| entry run to the next index entry,

146 // or the end of the deltas.

147 const size_t deltas_end =

148 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();

149

150 SBPrefix current = index_[ii].first;

151 prefixes->push_back(current);

152 for (size_t di = index_[ii].second; di < deltas_end; ++di) {

153 current += deltas_[di];

154 prefixes->push_back(current);

155 }

156 }

157 }

158

159 // static

160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile(

161 const base::FilePath& filter_name) {

162 int64 size_64;

163 if (!base::GetFileSize(filter_name, &size_64))

164 return nullptr;

165 using base::MD5Digest;

166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))

167 return nullptr;

168

169 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));

170 if (!file.get())

171 return nullptr;

172

173 FileHeader header;

174 size_t read = fread(&header, sizeof(header), 1, file.get());

175 if (read != 1)

176 return nullptr;

177

178 // The file looks valid, start building the digest.

179 base::MD5Context context;

180 base::MD5Init(&context);

181 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),

182 sizeof(header)));

183

184 if (header.magic != kMagic)

185 return nullptr;

186

187 // Track version read to inform removal of support for older versions.

188 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);

189

190 if (header.version <= kDeprecatedVersion) {

191 return nullptr;

192 } else if (header.version != kVersion) {

193 return nullptr;

194 }

195

196 IndexVector index;

197 const size_t index_bytes = sizeof(index[0]) * header.index_size;

198

199 std::vector<uint16> deltas;

200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;

201

202 std::vector<SBFullHash> full_hashes;

203 const size_t full_hashes_bytes =

204 sizeof(full_hashes[0]) * header.full_hashes_size;

205

206 // Check for bogus sizes before allocating any space.

207 const size_t expected_bytes = sizeof(header) +

208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);

209 if (static_cast<int64>(expected_bytes) != size_64)

210 return nullptr;

211

212 // Read the index vector. Herb Sutter indicates that vectors are

213 // guaranteed to be contiuguous, so reading to where element 0 lives

214 // is valid.

215 if (header.index_size) {

216 index.resize(header.index_size);

217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());

218 if (read != index.size())

219 return nullptr;

220 base::MD5Update(&context,

221 base::StringPiece(reinterpret_cast<char*>(&(index[0])),

222 index_bytes));

223 }

224

225 // Read vector of deltas.

226 if (header.deltas_size) {

227 deltas.resize(header.deltas_size);

228 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());

229 if (read != deltas.size())

230 return nullptr;

231 base::MD5Update(&context,

232 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),

233 deltas_bytes));

234 }

235

236 // Read vector of full hashes.

237 if (header.full_hashes_size) {

238 full_hashes.resize(header.full_hashes_size);

239 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),

240 file.get());

241 if (read != full_hashes.size())

242 return nullptr;

243 base::MD5Update(&context,

244 base::StringPiece(

245 reinterpret_cast<char*>(&(full_hashes[0])),

246 full_hashes_bytes));

247 }

248

249 base::MD5Digest calculated_digest;

250 base::MD5Final(&calculated_digest, &context);

251

252 base::MD5Digest file_digest;

253 read = fread(&file_digest, sizeof(file_digest), 1, file.get());

254 if (read != 1)

255 return nullptr;

256

257 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))

258 return nullptr;

259

260 // Steals vector contents using swap().

261 return make_scoped_ptr(

262 new PrefixSet(&index, &deltas, &full_hashes));

263 }

264

265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {

266 FileHeader header;

267 header.magic = kMagic;

268 header.version = kVersion;

269 header.index_size = static_cast<uint32>(index_.size());

270 header.deltas_size = static_cast<uint32>(deltas_.size());

271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size());

272

273 // Sanity check that the 32-bit values never mess things up.

274 if (static_cast<size_t>(header.index_size) != index_.size() \|\|

275 static_cast<size_t>(header.deltas_size) != deltas_.size() \|\|

276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {

277 NOTREACHED();

278 return false;

279 }

280

281 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));

282 if (!file.get())

283 return false;

284

285 base::MD5Context context;

286 base::MD5Init(&context);

287

288 // TODO(shess): The I/O code in safe_browsing_store_file.cc would

289 // sure be useful about now.

290 size_t written = fwrite(&header, sizeof(header), 1, file.get());

291 if (written != 1)

292 return false;

293 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),

294 sizeof(header)));

295

296 // As for reads, the standard guarantees the ability to access the

297 // contents of the vector by a pointer to an element.

298 if (index_.size()) {

299 const size_t index_bytes = sizeof(index_[0]) * index_.size();

300 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),

301 file.get());

302 if (written != index_.size())

303 return false;

304 base::MD5Update(&context,

305 base::StringPiece(

306 reinterpret_cast<const char*>(&(index_[0])),

307 index_bytes));

308 }

309

310 if (deltas_.size()) {

311 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();

312 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),

313 file.get());

314 if (written != deltas_.size())

315 return false;

316 base::MD5Update(&context,

317 base::StringPiece(

318 reinterpret_cast<const char*>(&(deltas_[0])),

319 deltas_bytes));

320 }

321

322 if (full_hashes_.size()) {

323 const size_t elt_size = sizeof(full_hashes_[0]);

324 const size_t elts = full_hashes_.size();

325 const size_t full_hashes_bytes = elt_size * elts;

326 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());

327 if (written != elts)

328 return false;

329 base::MD5Update(&context,

330 base::StringPiece(

331 reinterpret_cast<const char*>(&(full_hashes_[0])),

332 full_hashes_bytes));

333 }

334

335 base::MD5Digest digest;

336 base::MD5Final(&digest, &context);

337 written = fwrite(&digest, sizeof(digest), 1, file.get());

338 if (written != 1)

339 return false;

340

341 // TODO(shess): Can this code check that the close was successful?

342 file.reset();

343

344 return true;

345 }

346

347 void PrefixSet::AddRun(SBPrefix index_prefix,

348 const uint16* run_begin, const uint16* run_end) {

349 // Preempt organic capacity decisions for \|delta_\| once strong estimates can

350 // be made.

351 if (index_prefix > kEstimateThreshold &&

352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {

353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));

354 }

355

356 index_.push_back(std::make_pair(index_prefix, deltas_.size()));

357 deltas_.insert(deltas_.end(), run_begin, run_end);

358 }

359

360 PrefixSetBuilder::PrefixSetBuilder()

361 : prefix_set_(new PrefixSet()) {

362 }

363

364 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)

365 : prefix_set_(new PrefixSet()) {

366 for (size_t i = 0; i < prefixes.size(); ++i) {

367 AddPrefix(prefixes[i]);

368 }

369 }

370

371 PrefixSetBuilder::~PrefixSetBuilder() {

372 }

373

374 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet(

375 const std::vector<SBFullHash>& hashes) {

376 DCHECK(prefix_set_.get());

377

378 // Flush runs until buffered data is gone.

379 while (!buffer_.empty()) {

380 EmitRun();

381 }

382

383 // Precisely size \|index_\| for read-only. It's 50k-60k, so minor savings, but

384 // they're almost free.

385 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);

386

387 prefix_set_->full_hashes_ = hashes;

388 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),

389 SBFullHashLess);

390

391 return prefix_set_.Pass();

392 }

393

394 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {

395 return GetPrefixSet(std::vector<SBFullHash>()).Pass();

396 }

397

398 void PrefixSetBuilder::EmitRun() {

399 DCHECK(prefix_set_.get());

400

401 SBPrefix prev_prefix = buffer_[0];

402 uint16 run[PrefixSet::kMaxRun];

403 size_t run_pos = 0;

404

405 size_t i;

406 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {

407 // Calculate the delta. \|unsigned\| is mandatory, because the

408 // sorted_prefixes could be more than INT_MAX apart.

409 DCHECK_GT(buffer_[i], prev_prefix);

410 const unsigned delta = buffer_[i] - prev_prefix;

411 const uint16 delta16 = static_cast<uint16>(delta);

412

413 // Break the run if the delta doesn't fit.

414 if (delta != static_cast<unsigned>(delta16))

415 break;

416

417 // Continue the run of deltas.

418 run[run_pos++] = delta16;

419 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);

420

421 prev_prefix = buffer_[i];

422 }

423 prefix_set_->AddRun(buffer_[0], run, run + run_pos);

424 buffer_.erase(buffer_.begin(), buffer_.begin() + i);

425 }

426

427 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {

428 DCHECK(prefix_set_.get());

429

430 if (buffer_.empty()) {

431 DCHECK(prefix_set_->index_.empty());

432 DCHECK(prefix_set_->deltas_.empty());

433 } else {

434 // Drop duplicates.

435 if (buffer_.back() == prefix)

436 return;

437

438 DCHECK_LT(buffer_.back(), prefix);

439 }

440 buffer_.push_back(prefix);

441

442 // Flush buffer when a run can be constructed. +1 for the index item, and +1

443 // to leave at least one item in the buffer for dropping duplicates.

444 if (buffer_.size() > PrefixSet::kMaxRun + 2)

445 EmitRun();

446 }

447

448 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « chrome/browser/safe_browsing/prefix_set.h ('k') | chrome/browser/safe_browsing/prefix_set_unittest.cc » ('j') | no next file with comments »