components/safe_browsing_db/util.cc - Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes

Side by Side Diff: components/safe_browsing_db/util.cc

Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/safe_browsing_db/util.h"	5 #include "components/safe_browsing_db/util.h"

6	6

7 #include <stddef.h>	7 #include <stddef.h>

8	8

9 #include "base/macros.h"	9 #include "base/macros.h"

10 #include "base/strings/string_util.h"	10 #include "base/strings/string_util.h"

11 #include "base/trace_event/trace_event.h"	11 #include "base/trace_event/trace_event.h"

12 #include "crypto/sha2.h"	12 #include "crypto/sha2.h"

13 #include "net/base/escape.h"	13 #include "net/base/escape.h"

14 #include "url/gurl.h"	14 #include "url/gurl.h"

15 #include "url/url_util.h"	15 #include "url/url_util.h"

16	16

17 namespace safe_browsing {	17 namespace safe_browsing {

18	18

19 // Utility functions -----------------------------------------------------------	19 // Utility functions -----------------------------------------------------------

20	20

21 namespace {	21 namespace {

	22

22 bool IsKnownList(const std::string& name) {	23 bool IsKnownList(const std::string& name) {

23 for (size_t i = 0; i < arraysize(kAllLists); ++i) {	24 for (size_t i = 0; i < arraysize(kAllLists); ++i) {

24 if (!strcmp(kAllLists[i], name.c_str())) {	25 if (!strcmp(kAllLists[i], name.c_str())) {

25 return true;	26 return true;

26 }	27 }

27 }	28 }

28 return false;	29 return false;

29 }	30 }

	31

	32 void GenerateHostsToCheck(const std::string& host,
	Nathan Parker 2016/05/25 17:23:35 Give these a different name, so they differ by more than signature. Maybe GenerateHostsToCheckFromHost and FromParts or some such. Joe Mason 2016/05/25 18:06:21 On 2016/05/25 17:23:35, Nathan Parker wrote: > Give these a different name, so they differ by more than signature. Maybe > GenerateHostsToCheckFromHost and FromParts or some such. Done.
	33 std::vector<std::string>* hosts) {

	34 hosts->clear();

	35

	36 if (host.empty())

	37 return;

	38

	39 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4

	40 // hostnames formed by starting with the last 5 components and successively

	41 // removing the leading component. The last component isn't examined alone,

	42 // since it's the TLD or a subcomponent thereof.

	43 //

	44 // Note that we don't need to be clever about stopping at the "real" eTLD --

	45 // the data on the server side has been filtered to ensure it will not

	46 // blacklist a whole TLD, and it's not significantly slower on our side to

	47 // just check too much.

	48 //

	49 // Also note that because we have a simple blacklist, not some sort of complex

	50 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check

	51 // these in.

	52 const size_t kMaxHostsToCheck = 4;

	53 bool skipped_last_component = false;

	54 for (std::string::const_reverse_iterator i(host.rbegin());

	55 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {

	56 if (*i == '.') {

	57 if (skipped_last_component)

	58 hosts->push_back(std::string(i.base(), host.end()));

	59 else

	60 skipped_last_component = true;

	61 }

	62 }

	63 hosts->push_back(host);

	64 }

	65

	66 void GeneratePathsToCheck(const std::string& path, const std::string& query,

	67 std::vector<std::string>* paths) {

	68 paths->clear();

	69

	70 if (path.empty())

	71 return;

	72

	73 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without

	74 // the query parameters, and also up to 4 paths formed by starting at the root

	75 // and adding more path components.

	76 //

	77 // As with the hosts above, it doesn't matter what order we check these in.

	78 const size_t kMaxPathsToCheck = 4;

	79 for (std::string::const_iterator i(path.begin());

	80 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {

	81 if (*i == '/')

	82 paths->push_back(std::string(path.begin(), i + 1));

	83 }

	84

	85 if (!paths->empty() && paths->back() != path)

	86 paths->push_back(path);

	87

	88 if (!query.empty())

	89 paths->push_back(path + "?" + query);

	90 }

	91

30 } // namespace	92 } // namespace

31	93

32 // ThreatMetadata ------------------------------------------------------------	94 // ThreatMetadata ------------------------------------------------------------

33 ThreatMetadata::ThreatMetadata()	95 ThreatMetadata::ThreatMetadata()

34 : threat_pattern_type(ThreatPatternType::NONE) {}	96 : threat_pattern_type(ThreatPatternType::NONE) {}

35	97

36 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;	98 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;

37	99

38 ThreatMetadata::~ThreatMetadata() {}	100 ThreatMetadata::~ThreatMetadata() {}

39	101

(...skipping 270 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
310 }	372 }

311 }	373 }

312	374

313 void UrlToFullHashes(const GURL& url,	375 void UrlToFullHashes(const GURL& url,

314 bool include_whitelist_hashes,	376 bool include_whitelist_hashes,

315 std::vector<SBFullHash>* full_hashes) {	377 std::vector<SBFullHash>* full_hashes) {

316 // Include this function in traces because it's not cheap so it should be	378 // Include this function in traces because it's not cheap so it should be

317 // called sparingly.	379 // called sparingly.

318 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),	380 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),

319 "include_whitelist_hashes", include_whitelist_hashes);	381 "include_whitelist_hashes", include_whitelist_hashes);

	382 std::string canon_host;

	383 std::string canon_path;

	384 std::string canon_query;

	385 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

	386

320 std::vector<std::string> hosts;	387 std::vector<std::string> hosts;

321 if (url.HostIsIPAddress()) {	388 if (url.HostIsIPAddress()) {

322 hosts.push_back(url.host());	389 hosts.push_back(url.host());

323 } else {	390 } else {

324 GenerateHostsToCheck(url, &hosts);	391 GenerateHostsToCheck(canon_host, &hosts);

325 }	392 }

326	393

327 std::vector<std::string> paths;	394 std::vector<std::string> paths;

328 GeneratePathsToCheck(url, &paths);	395 GeneratePathsToCheck(canon_path, canon_query, &paths);

329	396

330 for (const std::string& host : hosts) {	397 for (const std::string& host : hosts) {

331 for (const std::string& path : paths) {	398 for (const std::string& path : paths) {

332 full_hashes->push_back(	399 full_hashes->push_back(

333 SBFullHashForString(host + path));	400 SBFullHashForString(host + path));

334	401

335 // We may have /foo as path-prefix in the whitelist which should	402 // We may have /foo as path-prefix in the whitelist which should

336 // also match with /foo/bar and /foo?bar. Hence, for every path	403 // also match with /foo/bar and /foo?bar. Hence, for every path

337 // that ends in '/' we also add the path without the slash.	404 // that ends in '/' we also add the path without the slash.

338 if (include_whitelist_hashes && path.size() > 1 &&	405 if (include_whitelist_hashes && path.size() > 1 &&

339 path[path.size() - 1] == '/') {	406 path[path.size() - 1] == '/') {

340 full_hashes->push_back(SBFullHashForString(	407 full_hashes->push_back(SBFullHashForString(

341 host + path.substr(0, path.size() - 1)));	408 host + path.substr(0, path.size() - 1)));

342 }	409 }

343 }	410 }

344 }	411 }

345 }	412 }

346	413

347 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {	414 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {

348 hosts->clear();

349

350 std::string canon_host;	415 std::string canon_host;

351 CanonicalizeUrl(url, &canon_host, NULL, NULL);	416 CanonicalizeUrl(url, &canon_host, NULL, NULL);

352	417 GenerateHostsToCheck(canon_host, hosts);

353 const std::string host = canon_host; // const sidesteps GCC bugs below!
Joe Mason 2016/05/25 15:44:44 This note was added in commit 080438b8886070e399c326f757cd96976a7a81ec in 2008. I think it's safe to remove now, especially as the new code passes the host through a const reference param, so if the GCC bug in question still exists it probably wouldn't be triggered anymore.
354 if (host.empty())

355 return;

356

357 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4

358 // hostnames formed by starting with the last 5 components and successively

359 // removing the leading component. The last component isn't examined alone,

360 // since it's the TLD or a subcomponent thereof.

361 //

362 // Note that we don't need to be clever about stopping at the "real" eTLD --

363 // the data on the server side has been filtered to ensure it will not

364 // blacklist a whole TLD, and it's not significantly slower on our side to

365 // just check too much.

366 //

367 // Also note that because we have a simple blacklist, not some sort of complex

368 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check

369 // these in.

370 const size_t kMaxHostsToCheck = 4;

371 bool skipped_last_component = false;

372 for (std::string::const_reverse_iterator i(host.rbegin());

373 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {

374 if (*i == '.') {

375 if (skipped_last_component)

376 hosts->push_back(std::string(i.base(), host.end()));

377 else

378 skipped_last_component = true;

379 }

380 }

381 hosts->push_back(host);

382 }	418 }

383	419

384 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {	420 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {

385 paths->clear();

386

387 std::string canon_path;	421 std::string canon_path;

388 std::string canon_query;	422 std::string canon_query;

389 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);	423 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);

390	424 GeneratePathsToCheck(canon_path, canon_query, paths);

391 const std::string path = canon_path; // const sidesteps GCC bugs below!

392 const std::string query = canon_query;

393 if (path.empty())

394 return;

395

396 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without

397 // the query parameters, and also up to 4 paths formed by starting at the root

398 // and adding more path components.

399 //

400 // As with the hosts above, it doesn't matter what order we check these in.

401 const size_t kMaxPathsToCheck = 4;

402 for (std::string::const_iterator i(path.begin());

403 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {

404 if (*i == '/')

405 paths->push_back(std::string(path.begin(), i + 1));

406 }

407

408 if (!paths->empty() && paths->back() != path)

409 paths->push_back(path);

410

411 if (!query.empty())

412 paths->push_back(path + "?" + query);

413 }	425 }

414	426

415 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {	427 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {

	428 std::string canon_host;

	429 std::string canon_path;

	430 std::string canon_query;

	431 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

	432

416 std::vector<std::string> hosts, paths;	433 std::vector<std::string> hosts, paths;

417 GenerateHostsToCheck(url, &hosts);	434 GenerateHostsToCheck(canon_host, &hosts);

418 GeneratePathsToCheck(url, &paths);	435 GeneratePathsToCheck(canon_path, canon_query, &paths);

419 for (size_t h = 0; h < hosts.size(); ++h) {	436 for (size_t h = 0; h < hosts.size(); ++h) {

420 for (size_t p = 0; p < paths.size(); ++p) {	437 for (size_t p = 0; p < paths.size(); ++p) {

421 urls->push_back(hosts[h] + paths[p]);	438 urls->push_back(hosts[h] + paths[p]);

422 }	439 }

423 }	440 }

424 }	441 }

425	442

426 } // namespace safe_browsing	443 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »