chrome/browser/safe_browsing/prefix_set.cc - Issue 266793002: [safe browsing] Serialize full hashes with prefix set.

Side by Side Diff: chrome/browser/safe_browsing/prefix_set.cc

Issue 266793002: [safe browsing] Serialize full hashes with prefix set. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Add a v3 golden test. Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/safe_browsing/prefix_set.h"	5 #include "chrome/browser/safe_browsing/prefix_set.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <math.h>	8 #include <math.h>

9	9

10 #include "base/file_util.h"	10 #include "base/file_util.h"

11 #include "base/files/scoped_file.h"	11 #include "base/files/scoped_file.h"

12 #include "base/logging.h"	12 #include "base/logging.h"

13 #include "base/md5.h"	13 #include "base/md5.h"

14 #include "base/metrics/histogram.h"	14 #include "base/metrics/histogram.h"

15 #include "base/metrics/sparse_histogram.h"	15 #include "base/metrics/sparse_histogram.h"

16	16

17 namespace {	17 namespace {

18	18

19 // \|kMagic\| should be reasonably unique, and not match itself across	19 // \|kMagic\| should be reasonably unique, and not match itself across

20 // endianness changes. I generated this value with:	20 // endianness changes. I generated this value with:

21 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc \| colrm 9	21 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc \| colrm 9

22 static uint32 kMagic = 0x864088dd;	22 static uint32 kMagic = 0x864088dd;

23	23

24 // Version history:	24 // Version history:

25 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10	25 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10

26 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27	26 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27

	27 // Version 3: ????????/r?????? by shess@chromium.org on 2014-04-??

27	28

28 // Version 2 layout is identical to version 1. The sort order of \|index_\|	29 // Version 2 layout is identical to version 1. The sort order of \|index_\|

29 // changed from \|int32\| to \|uint32\| to match the change of \|SBPrefix\|.	30 // changed from \|int32\| to \|uint32\| to match the change of \|SBPrefix\|.

30 static uint32 kVersion = 0x2;	31 // Version 3 adds storage for full hashes.

	32 static uint32 kVersion = 0x3;

31	33

32 typedef struct {	34 typedef struct {

33 uint32 magic;	35 uint32 magic;

34 uint32 version;	36 uint32 version;

35 uint32 index_size;	37 uint32 index_size;

36 uint32 deltas_size;	38 uint32 deltas_size;

	39 } FileHeader_v2;

	40

	41 typedef struct {

	42 uint32 magic;

	43 uint32 version;

	44 uint32 index_size;

	45 uint32 deltas_size;

	46 uint32 full_hashes_size;

37 } FileHeader;	47 } FileHeader;

38	48

39 // Common std::vector<> implementations add capacity by multiplying from the	49 // Common std::vector<> implementations add capacity by multiplying from the

40 // current size (usually either by 2 or 1.5) to satisfy push_back() running in	50 // current size (usually either by 2 or 1.5) to satisfy push_back() running in

41 // amortized constant time. This is not necessary for insert() at end(), but	51 // amortized constant time. This is not necessary for insert() at end(), but

42 // AFAICT it seems true for some implementations. SBPrefix values should	52 // AFAICT it seems true for some implementations. SBPrefix values should

43 // uniformly cover the 32-bit space, so the final size can be estimated given a	53 // uniformly cover the 32-bit space, so the final size can be estimated given a

44 // subset of the input.	54 // subset of the input.

45 //	55 //

46 // \|kEstimateThreshold\| is when estimates start converging. Results are strong	56 // \|kEstimateThreshold\| is when estimates start converging. Results are strong

(...skipping 30 matching lines...) Expand all Loading...
77	87

78 // For \|std::upper_bound()\| to find a prefix w/in a vector of pairs.	88 // For \|std::upper_bound()\| to find a prefix w/in a vector of pairs.

79 // static	89 // static

80 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {	90 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {

81 return a.first < b.first;	91 return a.first < b.first;

82 }	92 }

83	93

84 PrefixSet::PrefixSet() {	94 PrefixSet::PrefixSet() {

85 }	95 }

86	96

87 PrefixSet::PrefixSet(IndexVector* index, std::vector<uint16>* deltas) {	97 PrefixSet::PrefixSet(IndexVector* index,

88 DCHECK(index && deltas);	98 std::vector<uint16>* deltas,

	99 std::vector<SBFullHash>* full_hashes) {

	100 DCHECK(index && deltas && full_hashes);

89 index_.swap(*index);	101 index_.swap(*index);

90 deltas_.swap(*deltas);	102 deltas_.swap(*deltas);

	103 full_hashes_.swap(*full_hashes);

91 }	104 }

92	105

93 PrefixSet::~PrefixSet() {}	106 PrefixSet::~PrefixSet() {}

94	107

95 bool PrefixSet::PrefixExists(SBPrefix prefix) const {	108 bool PrefixSet::PrefixExists(SBPrefix prefix) const {

96 if (index_.empty())	109 if (index_.empty())

97 return false;	110 return false;

98	111

99 // Find the first position after \|prefix\| in \|index_\|.	112 // Find the first position after \|prefix\| in \|index_\|.

100 IndexVector::const_iterator iter =	113 IndexVector::const_iterator iter =

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
149 }	162 }

150 }	163 }

151 }	164 }

152	165

153 // static	166 // static

154 scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {	167 scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {

155 int64 size_64;	168 int64 size_64;

156 if (!base::GetFileSize(filter_name, &size_64))	169 if (!base::GetFileSize(filter_name, &size_64))

157 return scoped_ptr<PrefixSet>();	170 return scoped_ptr<PrefixSet>();

158 using base::MD5Digest;	171 using base::MD5Digest;

159 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))	172 // TODO(shess): Revert to sizeof(FileHeader) for sanity check once v2 is

	173 // deprecated.

	174 if (size_64 < static_cast<int64>(sizeof(FileHeader_v2) + sizeof(MD5Digest)))

160 return scoped_ptr<PrefixSet>();	175 return scoped_ptr<PrefixSet>();

161	176

162 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));	177 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));

163 if (!file.get())	178 if (!file.get())

164 return scoped_ptr<PrefixSet>();	179 return scoped_ptr<PrefixSet>();

165	180

166 FileHeader header;	181 FileHeader header;

167 size_t read = fread(&header, sizeof(header), 1, file.get());	182 size_t read = fread(&header, sizeof(header), 1, file.get());

168 if (read != 1)	183 if (read != 1)

169 return scoped_ptr<PrefixSet>();	184 return scoped_ptr<PrefixSet>();

170	185

	186 // The file looks valid, start building the digest.

	187 base::MD5Context context;

	188 base::MD5Init(&context);

	189 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),

	190 sizeof(header)));

	191

171 if (header.magic != kMagic)	192 if (header.magic != kMagic)

172 return scoped_ptr<PrefixSet>();	193 return scoped_ptr<PrefixSet>();

173	194

174 // Track version read to inform removal of support for older versions.	195 // Track version read to inform removal of support for older versions.

175 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);	196 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);

176	197

177 // TODO(shess): Version 1 and 2 use the same file structure, with version 1	198 // TODO(shess): Version 1 and 2 use the same file structure, with version 1

178 // data using a signed sort. For M-35, the data is re-sorted before return.	199 // data using a signed sort. For M-35, the data is re-sorted before return.

179 // After M-35, just drop v1 support. <http://crbug.com/346405>	200 // After M-36, just drop v1 support. <http://crbug.com/346405>

180 if (header.version != kVersion && header.version != 1)	201 // TODO(shess): <http://crbug.com/368044> for removing v2 support.

	202 size_t header_size = sizeof(header);

	203 if (header.version == 2 \|\| header.version == 1) {

	204 // Rewind the file and restart building the digest with the old header

	205 // structure.

	206 FileHeader_v2 v2_header;

	207 if (0 != fseek(file.get(), 0, SEEK_SET))

	208 return scoped_ptr<PrefixSet>();

	209

	210 size_t read = fread(&v2_header, sizeof(v2_header), 1, file.get());

	211 if (read != 1)

	212 return scoped_ptr<PrefixSet>();

	213

	214 base::MD5Init(&context);

	215 base::MD5Update(&context,

	216 base::StringPiece(reinterpret_cast<char*>(&v2_header),

	217 sizeof(v2_header)));

	218

	219 // The current header is a superset of the old header, fill it in with the

	220 // information read.

	221 header.magic = v2_header.magic;

	222 header.version = v2_header.version;

	223 header.index_size = v2_header.index_size;

	224 header.deltas_size = v2_header.deltas_size;

	225 header.full_hashes_size = 0;

	226 header_size = sizeof(v2_header);

	227 } else if (header.version != kVersion) {

181 return scoped_ptr<PrefixSet>();	228 return scoped_ptr<PrefixSet>();

	229 }

182	230

183 IndexVector index;	231 IndexVector index;

184 const size_t index_bytes = sizeof(index[0]) * header.index_size;	232 const size_t index_bytes = sizeof(index[0]) * header.index_size;

185	233

186 std::vector<uint16> deltas;	234 std::vector<uint16> deltas;

187 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;	235 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;

188	236

	237 std::vector<SBFullHash> full_hashes;

	238 const size_t full_hashes_bytes =

	239 sizeof(full_hashes[0]) * header.full_hashes_size;

	240

189 // Check for bogus sizes before allocating any space.	241 // Check for bogus sizes before allocating any space.

190 const size_t expected_bytes =	242 const size_t expected_bytes = header_size +

191 sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);	243 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);

192 if (static_cast<int64>(expected_bytes) != size_64)	244 if (static_cast<int64>(expected_bytes) != size_64)

193 return scoped_ptr<PrefixSet>();	245 return scoped_ptr<PrefixSet>();

194	246

195 // The file looks valid, start building the digest.

196 base::MD5Context context;

197 base::MD5Init(&context);

198 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),

199 sizeof(header)));

200

201 // Read the index vector. Herb Sutter indicates that vectors are	247 // Read the index vector. Herb Sutter indicates that vectors are

202 // guaranteed to be contiuguous, so reading to where element 0 lives	248 // guaranteed to be contiuguous, so reading to where element 0 lives

203 // is valid.	249 // is valid.

204 if (header.index_size) {	250 if (header.index_size) {

205 index.resize(header.index_size);	251 index.resize(header.index_size);

206 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());	252 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());

207 if (read != index.size())	253 if (read != index.size())

208 return scoped_ptr<PrefixSet>();	254 return scoped_ptr<PrefixSet>();

209 base::MD5Update(&context,	255 base::MD5Update(&context,

210 base::StringPiece(reinterpret_cast<char*>(&(index[0])),	256 base::StringPiece(reinterpret_cast<char*>(&(index[0])),

211 index_bytes));	257 index_bytes));

212 }	258 }

213	259

214 // Read vector of deltas.	260 // Read vector of deltas.

215 if (header.deltas_size) {	261 if (header.deltas_size) {

216 deltas.resize(header.deltas_size);	262 deltas.resize(header.deltas_size);

217 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());	263 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());

218 if (read != deltas.size())	264 if (read != deltas.size())

219 return scoped_ptr<PrefixSet>();	265 return scoped_ptr<PrefixSet>();

220 base::MD5Update(&context,	266 base::MD5Update(&context,

221 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),	267 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),

222 deltas_bytes));	268 deltas_bytes));

223 }	269 }

224	270

	271 // Read vector of full hashes.

	272 if (header.full_hashes_size) {

	273 full_hashes.resize(header.full_hashes_size);

	274 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),

	275 file.get());

	276 if (read != full_hashes.size())

	277 return scoped_ptr<PrefixSet>();

	278 base::MD5Update(&context,

	279 base::StringPiece(

	280 reinterpret_cast<char*>(&(full_hashes[0])),

	281 full_hashes_bytes));

	282 }

	283

225 base::MD5Digest calculated_digest;	284 base::MD5Digest calculated_digest;

226 base::MD5Final(&calculated_digest, &context);	285 base::MD5Final(&calculated_digest, &context);

227	286

228 base::MD5Digest file_digest;	287 base::MD5Digest file_digest;

229 read = fread(&file_digest, sizeof(file_digest), 1, file.get());	288 read = fread(&file_digest, sizeof(file_digest), 1, file.get());

230 if (read != 1)	289 if (read != 1)

231 return scoped_ptr<PrefixSet>();	290 return scoped_ptr<PrefixSet>();

232	291

233 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))	292 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))

234 return scoped_ptr<PrefixSet>();	293 return scoped_ptr<PrefixSet>();

235	294

236 // For version 1, fetch the prefixes and re-sort.	295 // For version 1, fetch the prefixes and re-sort.

237 if (header.version == 1) {	296 if (header.version == 1) {

238 std::vector<SBPrefix> prefixes;	297 std::vector<SBPrefix> prefixes;

239 PrefixSet(&index, &deltas).GetPrefixes(&prefixes);	298 PrefixSet(&index, &deltas, &full_hashes).GetPrefixes(&prefixes);

240 std::sort(prefixes.begin(), prefixes.end());	299 std::sort(prefixes.begin(), prefixes.end());

	300

	301 // v1 cannot have full hashes, so no need to propagate a copy here.

241 return PrefixSetBuilder(prefixes).GetPrefixSetNoHashes().Pass();	302 return PrefixSetBuilder(prefixes).GetPrefixSetNoHashes().Pass();

242 }	303 }

243	304

244 // Steals contents of \|index\| and \|deltas\| via swap().	305 // Steals contents of \|index\| and \|deltas\| via swap().
	mattm 2014/05/01 22:26:51 update comment update comment Scott Hess - ex-Googler 2014/05/05 03:47:07 Done. Show quoted text On 2014/05/01 22:26:51, mattm wrote: > update comment Done.
245 return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas));	306 return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes));

246 }	307 }

247	308

248 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {	309 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {

249 FileHeader header;	310 FileHeader header;

250 header.magic = kMagic;	311 header.magic = kMagic;

251 header.version = kVersion;	312 header.version = kVersion;

252 header.index_size = static_cast<uint32>(index_.size());	313 header.index_size = static_cast<uint32>(index_.size());

253 header.deltas_size = static_cast<uint32>(deltas_.size());	314 header.deltas_size = static_cast<uint32>(deltas_.size());

	315 header.full_hashes_size = static_cast<uint32>(full_hashes_.size());

254	316

255 // Sanity check that the 32-bit values never mess things up.	317 // Sanity check that the 32-bit values never mess things up.

256 if (static_cast<size_t>(header.index_size) != index_.size() \|\|	318 if (static_cast<size_t>(header.index_size) != index_.size() \|\|

257 static_cast<size_t>(header.deltas_size) != deltas_.size()) {	319 static_cast<size_t>(header.deltas_size) != deltas_.size() \|\|

	320 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {

258 NOTREACHED();	321 NOTREACHED();

259 return false;	322 return false;

260 }	323 }

261	324

262 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));	325 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));

263 if (!file.get())	326 if (!file.get())

264 return false;	327 return false;

265	328

266 base::MD5Context context;	329 base::MD5Context context;

267 base::MD5Init(&context);	330 base::MD5Init(&context);

(...skipping 25 matching lines...) Expand all Loading...
293 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),	356 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),

294 file.get());	357 file.get());

295 if (written != deltas_.size())	358 if (written != deltas_.size())

296 return false;	359 return false;

297 base::MD5Update(&context,	360 base::MD5Update(&context,

298 base::StringPiece(	361 base::StringPiece(

299 reinterpret_cast<const char*>(&(deltas_[0])),	362 reinterpret_cast<const char*>(&(deltas_[0])),

300 deltas_bytes));	363 deltas_bytes));

301 }	364 }

302	365

	366 if (full_hashes_.size()) {

	367 const size_t elt_size = sizeof(full_hashes_[0]);

	368 const size_t elts = full_hashes_.size();

	369 const size_t full_hashes_bytes = elt_size * elts;

	370 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());

	371 if (written != elts)

	372 return false;

	373 base::MD5Update(&context,

	374 base::StringPiece(

	375 reinterpret_cast<const char*>(&(full_hashes_[0])),

	376 full_hashes_bytes));

	377 }

	378

303 base::MD5Digest digest;	379 base::MD5Digest digest;

304 base::MD5Final(&digest, &context);	380 base::MD5Final(&digest, &context);

305 written = fwrite(&digest, sizeof(digest), 1, file.get());	381 written = fwrite(&digest, sizeof(digest), 1, file.get());

306 if (written != 1)	382 if (written != 1)

307 return false;	383 return false;

308	384

309 // TODO(shess): Can this code check that the close was successful?	385 // TODO(shess): Can this code check that the close was successful?

310 file.reset();	386 file.reset();

311	387

312 return true;	388 return true;

(...skipping 94 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
407 }	483 }

408 buffer_.push_back(prefix);	484 buffer_.push_back(prefix);

409	485

410 // Flush buffer when a run can be constructed. +1 for the index item, and +1	486 // Flush buffer when a run can be constructed. +1 for the index item, and +1

411 // to leave at least one item in the buffer for dropping duplicates.	487 // to leave at least one item in the buffer for dropping duplicates.

412 if (buffer_.size() > PrefixSet::kMaxRun + 2)	488 if (buffer_.size() > PrefixSet::kMaxRun + 2)

413 EmitRun();	489 EmitRun();

414 }	490 }

415	491

416 } // namespace safe_browsing	492 } // namespace safe_browsing

OLD	NEW

« chrome/browser/safe_browsing/prefix_set.h ('K') | « chrome/browser/safe_browsing/prefix_set.h ('k') | chrome/browser/safe_browsing/prefix_set_unittest.cc » ('j') | no next file with comments »