components/safe_browsing_db/util.cc - Issue 2225113002: Reland: Move PVer4 related code from util.* to v4_protocol_manager_util.*

Side by Side Diff: components/safe_browsing_db/util.cc

Issue 2225113002: Reland: Move PVer4 related code from util.* to v4_protocol_manager_util.* (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Minor: Address nparker@'s comments Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/safe_browsing_db/util.h"	5 #include "components/safe_browsing_db/util.h"

6	6

7 #include <stddef.h>	7 #include <stddef.h>

8	8

9 #include "base/macros.h"	9 #include "base/macros.h"

10 #include "base/strings/string_util.h"

11 #include "base/trace_event/trace_event.h"	10 #include "base/trace_event/trace_event.h"

	11 #include "components/safe_browsing_db/v4_protocol_manager_util.h"

12 #include "crypto/sha2.h"	12 #include "crypto/sha2.h"

13 #include "net/base/escape.h"	13 #include "net/base/escape.h"

14 #include "url/gurl.h"	14 #include "url/gurl.h"

15 #include "url/url_util.h"

16	15

17 namespace safe_browsing {	16 namespace safe_browsing {

18	17

19 // Utility functions -----------------------------------------------------------	18 // Utility functions -----------------------------------------------------------

20	19

21 namespace {	20 namespace {

22	21

23 bool IsKnownList(const std::string& name) {	22 bool IsKnownList(const std::string& name) {

24 for (size_t i = 0; i < arraysize(kAllLists); ++i) {	23 for (size_t i = 0; i < arraysize(kAllLists); ++i) {

25 if (!strcmp(kAllLists[i], name.c_str())) {	24 if (!strcmp(kAllLists[i], name.c_str())) {

26 return true;	25 return true;

27 }	26 }

28 }	27 }

29 return false;	28 return false;

30 }	29 }

31	30

32 void GenerateHostVariantsToCheck(const std::string& host,

33 std::vector<std::string>* hosts) {

34 hosts->clear();

35

36 if (host.empty())

37 return;

38

39 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4

40 // hostnames formed by starting with the last 5 components and successively

41 // removing the leading component. The last component isn't examined alone,

42 // since it's the TLD or a subcomponent thereof.

43 //

44 // Note that we don't need to be clever about stopping at the "real" eTLD --

45 // the data on the server side has been filtered to ensure it will not

46 // blacklist a whole TLD, and it's not significantly slower on our side to

47 // just check too much.

48 //

49 // Also note that because we have a simple blacklist, not some sort of complex

50 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check

51 // these in.

52 const size_t kMaxHostsToCheck = 4;

53 bool skipped_last_component = false;

54 for (std::string::const_reverse_iterator i(host.rbegin());

55 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {

56 if (*i == '.') {

57 if (skipped_last_component)

58 hosts->push_back(std::string(i.base(), host.end()));

59 else

60 skipped_last_component = true;

61 }

62 }

63 hosts->push_back(host);

64 }

65

66 void GeneratePathVariantsToCheck(const std::string& path,

67 const std::string& query,

68 std::vector<std::string>* paths) {

69 paths->clear();

70

71 if (path.empty())

72 return;

73

74 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without

75 // the query parameters, and also up to 4 paths formed by starting at the root

76 // and adding more path components.

77 //

78 // As with the hosts above, it doesn't matter what order we check these in.

79 const size_t kMaxPathsToCheck = 4;

80 for (std::string::const_iterator i(path.begin());

81 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {

82 if (*i == '/')

83 paths->push_back(std::string(path.begin(), i + 1));

84 }

85

86 if (!paths->empty() && paths->back() != path)

87 paths->push_back(path);

88

89 if (!query.empty())

90 paths->push_back(path + "?" + query);

91 }

92

93 } // namespace	31 } // namespace

94	32

95 // ThreatMetadata ------------------------------------------------------------	33 // ThreatMetadata ------------------------------------------------------------

96 ThreatMetadata::ThreatMetadata()	34 ThreatMetadata::ThreatMetadata()

97 : threat_pattern_type(ThreatPatternType::NONE) {}	35 : threat_pattern_type(ThreatPatternType::NONE) {}

98	36

99 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;	37 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;

100	38

101 ThreatMetadata::~ThreatMetadata() {}	39 ThreatMetadata::~ThreatMetadata() {}

102	40

(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
209 SBFullHash hash_out;	147 SBFullHash hash_out;

210 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);	148 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);

211 return hash_out;	149 return hash_out;

212 }	150 }

213	151

214 std::string SBFullHashToString(const SBFullHash& hash) {	152 std::string SBFullHashToString(const SBFullHash& hash) {

215 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));	153 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));

216 return std::string(hash.full_hash, sizeof(hash.full_hash));	154 return std::string(hash.full_hash, sizeof(hash.full_hash));

217 }	155 }

218	156

219

220 std::string Unescape(const std::string& url) {

221 std::string unescaped_str(url);

222 const int kMaxLoopIterations = 1024;

223 size_t old_size = 0;

224 int loop_var = 0;

225 do {

226 old_size = unescaped_str.size();

227 unescaped_str = net::UnescapeURLComponent(

228 unescaped_str,

229 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS |

230 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |

231 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

232 } while (old_size != unescaped_str.size() &&

233 ++loop_var <= kMaxLoopIterations);

234

235 return unescaped_str;

236 }

237

238 std::string Escape(const std::string& url) {

239 std::string escaped_str;

240 // The escaped string is larger so allocate double the length to reduce the

241 // chance of the string being grown.

242 escaped_str.reserve(url.length() * 2);

243 const char* kHexString = "0123456789ABCDEF";

244 for (size_t i = 0; i < url.length(); i++) {

245 unsigned char c = static_cast<unsigned char>(url[i]);

246 if (c <= ' ' || c > '~' || c == '#' || c == '%') {

247 escaped_str += '%';

248 escaped_str += kHexString[c >> 4];

249 escaped_str += kHexString[c & 0xf];

250 } else {

251 escaped_str += c;

252 }

253 }

254

255 return escaped_str;

256 }

257

258 std::string RemoveConsecutiveChars(base::StringPiece str, const char c) {

259 std::string output;

260 // Output is at most the length of the original string.

261 output.reserve(str.size());

262

263 size_t i = 0;

264 while (i < str.size()) {

265 output.append(1, str[i++]);

266 if (str[i - 1] == c) {

267 while (i < str.size() && str[i] == c) {

268 i++;

269 }

270 }

271 }

272

273 return output;

274 }

275

276 // Canonicalizes url as per Google Safe Browsing Specification.

277 // See section 6.1 in

278 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.

279 void CanonicalizeUrl(const GURL& url,

280 std::string* canonicalized_hostname,

281 std::string* canonicalized_path,

282 std::string* canonicalized_query) {

283 DCHECK(url.is_valid());

284

285 // We only canonicalize "normal" URLs.

286 if (!url.IsStandard())

287 return;

288

289 // Following canonicalization steps are excluded since url parsing takes care

290 // of those :-

291 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.

292 // (Exclude escaped version of these chars).

293 // 2. Normalize hostname to 4 dot-separated decimal values.

294 // 3. Lowercase hostname.

295 // 4. Resolve path sequences "/../" and "/./".

296

297 // That leaves us with the following :-

298 // 1. Remove fragment in URL.

299 GURL url_without_fragment;

300 GURL::Replacements f_replacements;

301 f_replacements.ClearRef();

302 f_replacements.ClearUsername();

303 f_replacements.ClearPassword();

304 url_without_fragment = url.ReplaceComponents(f_replacements);

305

306 // 2. Do URL unescaping until no more hex encoded characters exist.

307 std::string url_unescaped_str(Unescape(url_without_fragment.spec()));

308 url::Parsed parsed;

309 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),

310 &parsed);

311

312 // 3. In hostname, remove all leading and trailing dots.

313 base::StringPiece host;

314 if (parsed.host.len > 0)

315 host.set(url_unescaped_str.data() + parsed.host.begin, parsed.host.len);

316

317 base::StringPiece host_without_end_dots =

318 base::TrimString(host, ".", base::TrimPositions::TRIM_ALL);

319

320 // 4. In hostname, replace consecutive dots with a single dot.

321 std::string host_without_consecutive_dots(RemoveConsecutiveChars(

322 host_without_end_dots, '.'));

323

324 // 5. In path, replace runs of consecutive slashes with a single slash.

325 base::StringPiece path;

326 if (parsed.path.len > 0)

327 path.set(url_unescaped_str.data() + parsed.path.begin, parsed.path.len);

328 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));

329

330 url::Replacements<char> hp_replacements;

331 hp_replacements.SetHost(

332 host_without_consecutive_dots.data(),

333 url::Component(0, host_without_consecutive_dots.length()));

334 hp_replacements.SetPath(

335 path_without_consecutive_slash.data(),

336 url::Component(0, path_without_consecutive_slash.length()));

337

338 std::string url_unescaped_with_can_hostpath;

339 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);

340 url::Parsed temp_parsed;

341 url::ReplaceComponents(url_unescaped_str.data(),

342 url_unescaped_str.length(),

343 parsed,

344 hp_replacements,

345 NULL,

346 &output,

347 &temp_parsed);

348 output.Complete();

349

350 // 6. Step needed to revert escaping done in url::ReplaceComponents.

351 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);

352

353 // 7. After performing all above steps, percent-escape all chars in url which

354 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.

355 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));

356 url::Parsed final_parsed;

357 url::ParseStandardURL(escaped_canon_url_str.data(),

358 escaped_canon_url_str.length(),

359 &final_parsed);

360

361 if (canonicalized_hostname && final_parsed.host.len > 0) {

362 *canonicalized_hostname =

363 escaped_canon_url_str.substr(final_parsed.host.begin,

364 final_parsed.host.len);

365 }

366 if (canonicalized_path && final_parsed.path.len > 0) {

367 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,

368 final_parsed.path.len);

369 }

370 if (canonicalized_query && final_parsed.query.len > 0) {

371 *canonicalized_query = escaped_canon_url_str.substr(

372 final_parsed.query.begin, final_parsed.query.len);

373 }

374 }

375

376 void UrlToFullHashes(const GURL& url,	157 void UrlToFullHashes(const GURL& url,

377 bool include_whitelist_hashes,	158 bool include_whitelist_hashes,

378 std::vector<SBFullHash>* full_hashes) {	159 std::vector<SBFullHash>* full_hashes) {

379 // Include this function in traces because it's not cheap so it should be	160 // Include this function in traces because it's not cheap so it should be

380 // called sparingly.	161 // called sparingly.

381 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),	162 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),

382 "include_whitelist_hashes", include_whitelist_hashes);	163 "include_whitelist_hashes", include_whitelist_hashes);

383 std::string canon_host;	164 std::string canon_host;

384 std::string canon_path;	165 std::string canon_path;

385 std::string canon_query;	166 std::string canon_query;

386 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);	167 V4ProtocolManagerUtil::CanonicalizeUrl(url, &canon_host, &canon_path,

	168 &canon_query);

387	169

388 std::vector<std::string> hosts;	170 std::vector<std::string> hosts;

389 if (url.HostIsIPAddress()) {	171 if (url.HostIsIPAddress()) {

390 hosts.push_back(url.host());	172 hosts.push_back(url.host());

391 } else {	173 } else {

392 GenerateHostVariantsToCheck(canon_host, &hosts);	174 V4ProtocolManagerUtil::GenerateHostVariantsToCheck(canon_host, &hosts);

393 }	175 }

394	176

395 std::vector<std::string> paths;	177 std::vector<std::string> paths;

396 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);	178 V4ProtocolManagerUtil::GeneratePathVariantsToCheck(canon_path, canon_query,

	179 &paths);

397	180

398 for (const std::string& host : hosts) {	181 for (const std::string& host : hosts) {

399 for (const std::string& path : paths) {	182 for (const std::string& path : paths) {

400 full_hashes->push_back(	183 full_hashes->push_back(

401 SBFullHashForString(host + path));	184 SBFullHashForString(host + path));

402	185

403 // We may have /foo as path-prefix in the whitelist which should	186 // We may have /foo as path-prefix in the whitelist which should

404 // also match with /foo/bar and /foo?bar. Hence, for every path	187 // also match with /foo/bar and /foo?bar. Hence, for every path

405 // that ends in '/' we also add the path without the slash.	188 // that ends in '/' we also add the path without the slash.

406 if (include_whitelist_hashes && path.size() > 1 && path.back() == '/') {	189 if (include_whitelist_hashes && path.size() > 1 && path.back() == '/') {

407 full_hashes->push_back(SBFullHashForString(	190 full_hashes->push_back(SBFullHashForString(

408 host + path.substr(0, path.size() - 1)));	191 host + path.substr(0, path.size() - 1)));

409 }	192 }

410 }	193 }

411 }	194 }

412 }	195 }

413	196

414 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {

415 std::string canon_host;

416 CanonicalizeUrl(url, &canon_host, NULL, NULL);

417 GenerateHostVariantsToCheck(canon_host, hosts);

418 }

419

420 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {

421 std::string canon_path;

422 std::string canon_query;

423 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);

424 GeneratePathVariantsToCheck(canon_path, canon_query, paths);

425 }

426

427 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {

428 std::string canon_host;

429 std::string canon_path;

430 std::string canon_query;

431 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

432

433 std::vector<std::string> hosts, paths;

434 GenerateHostVariantsToCheck(canon_host, &hosts);

435 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);

436 for (size_t h = 0; h < hosts.size(); ++h) {

437 for (size_t p = 0; p < paths.size(); ++p) {

438 urls->push_back(hosts[h] + paths[p]);

439 }

440 }

441 }

442

443 } // namespace safe_browsing	197 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « components/safe_browsing_db/util.h ('k') | components/safe_browsing_db/util_unittest.cc » ('j') | no next file with comments »