components/safe_browsing_db/util.cc - Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes

Side by Side Diff: components/safe_browsing_db/util.cc

Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rename GenerateHostEtc Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/safe_browsing_db/util.h"	5 #include "components/safe_browsing_db/util.h"

6	6

7 #include <stddef.h>	7 #include <stddef.h>

8	8

9 #include "base/macros.h"	9 #include "base/macros.h"

10 #include "base/strings/string_util.h"	10 #include "base/strings/string_util.h"

11 #include "base/trace_event/trace_event.h"	11 #include "base/trace_event/trace_event.h"

12 #include "crypto/sha2.h"	12 #include "crypto/sha2.h"

13 #include "net/base/escape.h"	13 #include "net/base/escape.h"

14 #include "url/gurl.h"	14 #include "url/gurl.h"

15 #include "url/url_util.h"	15 #include "url/url_util.h"

16	16

17 namespace safe_browsing {	17 namespace safe_browsing {

18	18

19 // Utility functions -----------------------------------------------------------	19 // Utility functions -----------------------------------------------------------

20	20

21 namespace {	21 namespace {

	22

22 bool IsKnownList(const std::string& name) {	23 bool IsKnownList(const std::string& name) {

23 for (size_t i = 0; i < arraysize(kAllLists); ++i) {	24 for (size_t i = 0; i < arraysize(kAllLists); ++i) {

24 if (!strcmp(kAllLists[i], name.c_str())) {	25 if (!strcmp(kAllLists[i], name.c_str())) {

25 return true;	26 return true;

26 }	27 }

27 }	28 }

28 return false;	29 return false;

29 }	30 }

	31

	32 void GenerateHostVariantsToCheck(const std::string& host,

	33 std::vector<std::string>* hosts) {

	34 hosts->clear();

	35

	36 if (host.empty())

	37 return;

	38

	39 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4

	40 // hostnames formed by starting with the last 5 components and successively

	41 // removing the leading component. The last component isn't examined alone,

	42 // since it's the TLD or a subcomponent thereof.

	43 //

	44 // Note that we don't need to be clever about stopping at the "real" eTLD --

	45 // the data on the server side has been filtered to ensure it will not

	46 // blacklist a whole TLD, and it's not significantly slower on our side to

	47 // just check too much.

	48 //

	49 // Also note that because we have a simple blacklist, not some sort of complex

	50 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check

	51 // these in.

	52 const size_t kMaxHostsToCheck = 4;

	53 bool skipped_last_component = false;

	54 for (std::string::const_reverse_iterator i(host.rbegin());

	55 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {

	56 if (*i == '.') {

	57 if (skipped_last_component)

	58 hosts->push_back(std::string(i.base(), host.end()));

	59 else

	60 skipped_last_component = true;

	61 }

	62 }

	63 hosts->push_back(host);

	64 }

	65

	66 void GeneratePathVariantsToCheck(const std::string& path,

	67 const std::string& query,

	68 std::vector<std::string>* paths) {

	69 paths->clear();

	70

	71 if (path.empty())

	72 return;

	73

	74 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without

	75 // the query parameters, and also up to 4 paths formed by starting at the root

	76 // and adding more path components.

	77 //

	78 // As with the hosts above, it doesn't matter what order we check these in.

	79 const size_t kMaxPathsToCheck = 4;

	80 for (std::string::const_iterator i(path.begin());

	81 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {

	82 if (*i == '/')

	83 paths->push_back(std::string(path.begin(), i + 1));

	84 }

	85

	86 if (!paths->empty() && paths->back() != path)

	87 paths->push_back(path);

	88

	89 if (!query.empty())

	90 paths->push_back(path + "?" + query);

	91 }

	92

30 } // namespace	93 } // namespace

31	94

32 // ThreatMetadata ------------------------------------------------------------	95 // ThreatMetadata ------------------------------------------------------------

33 ThreatMetadata::ThreatMetadata()	96 ThreatMetadata::ThreatMetadata()

34 : threat_pattern_type(ThreatPatternType::NONE) {}	97 : threat_pattern_type(ThreatPatternType::NONE) {}

35	98

36 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;	99 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;

37	100

38 ThreatMetadata::~ThreatMetadata() {}	101 ThreatMetadata::~ThreatMetadata() {}

39	102

(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
310 }	373 }

311 }	374 }

312	375

313 void UrlToFullHashes(const GURL& url,	376 void UrlToFullHashes(const GURL& url,

314 bool include_whitelist_hashes,	377 bool include_whitelist_hashes,

315 std::vector<SBFullHash>* full_hashes) {	378 std::vector<SBFullHash>* full_hashes) {

316 // Include this function in traces because it's not cheap so it should be	379 // Include this function in traces because it's not cheap so it should be

317 // called sparingly.	380 // called sparingly.

318 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),	381 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),

319 "include_whitelist_hashes", include_whitelist_hashes);	382 "include_whitelist_hashes", include_whitelist_hashes);

	383 std::string canon_host;

	384 std::string canon_path;

	385 std::string canon_query;

	386 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

	387

320 std::vector<std::string> hosts;	388 std::vector<std::string> hosts;

321 if (url.HostIsIPAddress()) {	389 if (url.HostIsIPAddress()) {

322 hosts.push_back(url.host());	390 hosts.push_back(url.host());

323 } else {	391 } else {

324 GenerateHostsToCheck(url, &hosts);	392 GenerateHostVariantsToCheck(canon_host, &hosts);

325 }	393 }

326	394

327 std::vector<std::string> paths;	395 std::vector<std::string> paths;

328 GeneratePathsToCheck(url, &paths);	396 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);

329	397

330 for (const std::string& host : hosts) {	398 for (const std::string& host : hosts) {

331 for (const std::string& path : paths) {	399 for (const std::string& path : paths) {

332 full_hashes->push_back(	400 full_hashes->push_back(

333 SBFullHashForString(host + path));	401 SBFullHashForString(host + path));

334	402

335 // We may have /foo as path-prefix in the whitelist which should	403 // We may have /foo as path-prefix in the whitelist which should

336 // also match with /foo/bar and /foo?bar. Hence, for every path	404 // also match with /foo/bar and /foo?bar. Hence, for every path

337 // that ends in '/' we also add the path without the slash.	405 // that ends in '/' we also add the path without the slash.

338 if (include_whitelist_hashes && path.size() > 1 &&	406 if (include_whitelist_hashes && path.size() > 1 &&

339 path[path.size() - 1] == '/') {	407 path[path.size() - 1] == '/') {

340 full_hashes->push_back(SBFullHashForString(	408 full_hashes->push_back(SBFullHashForString(

341 host + path.substr(0, path.size() - 1)));	409 host + path.substr(0, path.size() - 1)));

342 }	410 }

343 }	411 }

344 }	412 }

345 }	413 }

346	414

347 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {	415 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {

348 hosts->clear();

349

350 std::string canon_host;	416 std::string canon_host;

351 CanonicalizeUrl(url, &canon_host, NULL, NULL);	417 CanonicalizeUrl(url, &canon_host, NULL, NULL);

352	418 GenerateHostVariantsToCheck(canon_host, hosts);

353 const std::string host = canon_host; // const sidesteps GCC bugs below!

354 if (host.empty())

355 return;

356

357 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4

358 // hostnames formed by starting with the last 5 components and successively

359 // removing the leading component. The last component isn't examined alone,

360 // since it's the TLD or a subcomponent thereof.

361 //

362 // Note that we don't need to be clever about stopping at the "real" eTLD --

363 // the data on the server side has been filtered to ensure it will not

364 // blacklist a whole TLD, and it's not significantly slower on our side to

365 // just check too much.

366 //

367 // Also note that because we have a simple blacklist, not some sort of complex

368 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check

369 // these in.

370 const size_t kMaxHostsToCheck = 4;

371 bool skipped_last_component = false;

372 for (std::string::const_reverse_iterator i(host.rbegin());

373 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {

374 if (*i == '.') {

375 if (skipped_last_component)

376 hosts->push_back(std::string(i.base(), host.end()));

377 else

378 skipped_last_component = true;

379 }

380 }

381 hosts->push_back(host);

382 }	419 }

383	420

384 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {	421 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {

385 paths->clear();

386

387 std::string canon_path;	422 std::string canon_path;

388 std::string canon_query;	423 std::string canon_query;

389 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);	424 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);

390	425 GeneratePathVariantsToCheck(canon_path, canon_query, paths);

391 const std::string path = canon_path; // const sidesteps GCC bugs below!

392 const std::string query = canon_query;

393 if (path.empty())

394 return;

395

396 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without

397 // the query parameters, and also up to 4 paths formed by starting at the root

398 // and adding more path components.

399 //

400 // As with the hosts above, it doesn't matter what order we check these in.

401 const size_t kMaxPathsToCheck = 4;

402 for (std::string::const_iterator i(path.begin());

403 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {

404 if (*i == '/')

405 paths->push_back(std::string(path.begin(), i + 1));

406 }

407

408 if (!paths->empty() && paths->back() != path)

409 paths->push_back(path);

410

411 if (!query.empty())

412 paths->push_back(path + "?" + query);

413 }	426 }

414	427

415 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {	428 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {

	429 std::string canon_host;

	430 std::string canon_path;

	431 std::string canon_query;

	432 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

	433

416 std::vector<std::string> hosts, paths;	434 std::vector<std::string> hosts, paths;

417 GenerateHostsToCheck(url, &hosts);	435 GenerateHostVariantsToCheck(canon_host, &hosts);

418 GeneratePathsToCheck(url, &paths);	436 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);

419 for (size_t h = 0; h < hosts.size(); ++h) {	437 for (size_t h = 0; h < hosts.size(); ++h) {

420 for (size_t p = 0; p < paths.size(); ++p) {	438 for (size_t p = 0; p < paths.size(); ++p) {

421 urls->push_back(hosts[h] + paths[p]);	439 urls->push_back(hosts[h] + paths[p]);

422 }	440 }

423 }	441 }

424 }	442 }

425	443

426 } // namespace safe_browsing	444 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »