Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(46)

Side by Side Diff: components/safe_browsing_db/util.cc

Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rename GenerateHostEtc Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright (c) 2015 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/safe_browsing_db/util.h" 5 #include "components/safe_browsing_db/util.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/macros.h" 9 #include "base/macros.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
11 #include "base/trace_event/trace_event.h" 11 #include "base/trace_event/trace_event.h"
12 #include "crypto/sha2.h" 12 #include "crypto/sha2.h"
13 #include "net/base/escape.h" 13 #include "net/base/escape.h"
14 #include "url/gurl.h" 14 #include "url/gurl.h"
15 #include "url/url_util.h" 15 #include "url/url_util.h"
16 16
17 namespace safe_browsing { 17 namespace safe_browsing {
18 18
19 // Utility functions ----------------------------------------------------------- 19 // Utility functions -----------------------------------------------------------
20 20
21 namespace { 21 namespace {
22
22 bool IsKnownList(const std::string& name) { 23 bool IsKnownList(const std::string& name) {
23 for (size_t i = 0; i < arraysize(kAllLists); ++i) { 24 for (size_t i = 0; i < arraysize(kAllLists); ++i) {
24 if (!strcmp(kAllLists[i], name.c_str())) { 25 if (!strcmp(kAllLists[i], name.c_str())) {
25 return true; 26 return true;
26 } 27 }
27 } 28 }
28 return false; 29 return false;
29 } 30 }
31
32 void GenerateHostVariantsToCheck(const std::string& host,
33 std::vector<std::string>* hosts) {
34 hosts->clear();
35
36 if (host.empty())
37 return;
38
39 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
40 // hostnames formed by starting with the last 5 components and successively
41 // removing the leading component. The last component isn't examined alone,
42 // since it's the TLD or a subcomponent thereof.
43 //
44 // Note that we don't need to be clever about stopping at the "real" eTLD --
45 // the data on the server side has been filtered to ensure it will not
46 // blacklist a whole TLD, and it's not significantly slower on our side to
47 // just check too much.
48 //
49 // Also note that because we have a simple blacklist, not some sort of complex
50 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
51 // these in.
52 const size_t kMaxHostsToCheck = 4;
53 bool skipped_last_component = false;
54 for (std::string::const_reverse_iterator i(host.rbegin());
55 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
56 if (*i == '.') {
57 if (skipped_last_component)
58 hosts->push_back(std::string(i.base(), host.end()));
59 else
60 skipped_last_component = true;
61 }
62 }
63 hosts->push_back(host);
64 }
65
66 void GeneratePathVariantsToCheck(const std::string& path,
67 const std::string& query,
68 std::vector<std::string>* paths) {
69 paths->clear();
70
71 if (path.empty())
72 return;
73
74 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
75 // the query parameters, and also up to 4 paths formed by starting at the root
76 // and adding more path components.
77 //
78 // As with the hosts above, it doesn't matter what order we check these in.
79 const size_t kMaxPathsToCheck = 4;
80 for (std::string::const_iterator i(path.begin());
81 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
82 if (*i == '/')
83 paths->push_back(std::string(path.begin(), i + 1));
84 }
85
86 if (!paths->empty() && paths->back() != path)
87 paths->push_back(path);
88
89 if (!query.empty())
90 paths->push_back(path + "?" + query);
91 }
92
30 } // namespace 93 } // namespace
31 94
32 // ThreatMetadata ------------------------------------------------------------ 95 // ThreatMetadata ------------------------------------------------------------
33 ThreatMetadata::ThreatMetadata() 96 ThreatMetadata::ThreatMetadata()
34 : threat_pattern_type(ThreatPatternType::NONE) {} 97 : threat_pattern_type(ThreatPatternType::NONE) {}
35 98
36 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default; 99 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;
37 100
38 ThreatMetadata::~ThreatMetadata() {} 101 ThreatMetadata::~ThreatMetadata() {}
39 102
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after
310 } 373 }
311 } 374 }
312 375
313 void UrlToFullHashes(const GURL& url, 376 void UrlToFullHashes(const GURL& url,
314 bool include_whitelist_hashes, 377 bool include_whitelist_hashes,
315 std::vector<SBFullHash>* full_hashes) { 378 std::vector<SBFullHash>* full_hashes) {
316 // Include this function in traces because it's not cheap so it should be 379 // Include this function in traces because it's not cheap so it should be
317 // called sparingly. 380 // called sparingly.
318 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(), 381 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),
319 "include_whitelist_hashes", include_whitelist_hashes); 382 "include_whitelist_hashes", include_whitelist_hashes);
383 std::string canon_host;
384 std::string canon_path;
385 std::string canon_query;
386 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
387
320 std::vector<std::string> hosts; 388 std::vector<std::string> hosts;
321 if (url.HostIsIPAddress()) { 389 if (url.HostIsIPAddress()) {
322 hosts.push_back(url.host()); 390 hosts.push_back(url.host());
323 } else { 391 } else {
324 GenerateHostsToCheck(url, &hosts); 392 GenerateHostVariantsToCheck(canon_host, &hosts);
325 } 393 }
326 394
327 std::vector<std::string> paths; 395 std::vector<std::string> paths;
328 GeneratePathsToCheck(url, &paths); 396 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);
329 397
330 for (const std::string& host : hosts) { 398 for (const std::string& host : hosts) {
331 for (const std::string& path : paths) { 399 for (const std::string& path : paths) {
332 full_hashes->push_back( 400 full_hashes->push_back(
333 SBFullHashForString(host + path)); 401 SBFullHashForString(host + path));
334 402
335 // We may have /foo as path-prefix in the whitelist which should 403 // We may have /foo as path-prefix in the whitelist which should
336 // also match with /foo/bar and /foo?bar. Hence, for every path 404 // also match with /foo/bar and /foo?bar. Hence, for every path
337 // that ends in '/' we also add the path without the slash. 405 // that ends in '/' we also add the path without the slash.
338 if (include_whitelist_hashes && path.size() > 1 && 406 if (include_whitelist_hashes && path.size() > 1 &&
339 path[path.size() - 1] == '/') { 407 path[path.size() - 1] == '/') {
340 full_hashes->push_back(SBFullHashForString( 408 full_hashes->push_back(SBFullHashForString(
341 host + path.substr(0, path.size() - 1))); 409 host + path.substr(0, path.size() - 1)));
342 } 410 }
343 } 411 }
344 } 412 }
345 } 413 }
346 414
347 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 415 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
348 hosts->clear();
349
350 std::string canon_host; 416 std::string canon_host;
351 CanonicalizeUrl(url, &canon_host, NULL, NULL); 417 CanonicalizeUrl(url, &canon_host, NULL, NULL);
352 418 GenerateHostVariantsToCheck(canon_host, hosts);
353 const std::string host = canon_host; // const sidesteps GCC bugs below!
354 if (host.empty())
355 return;
356
357 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
358 // hostnames formed by starting with the last 5 components and successively
359 // removing the leading component. The last component isn't examined alone,
360 // since it's the TLD or a subcomponent thereof.
361 //
362 // Note that we don't need to be clever about stopping at the "real" eTLD --
363 // the data on the server side has been filtered to ensure it will not
364 // blacklist a whole TLD, and it's not significantly slower on our side to
365 // just check too much.
366 //
367 // Also note that because we have a simple blacklist, not some sort of complex
368 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
369 // these in.
370 const size_t kMaxHostsToCheck = 4;
371 bool skipped_last_component = false;
372 for (std::string::const_reverse_iterator i(host.rbegin());
373 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
374 if (*i == '.') {
375 if (skipped_last_component)
376 hosts->push_back(std::string(i.base(), host.end()));
377 else
378 skipped_last_component = true;
379 }
380 }
381 hosts->push_back(host);
382 } 419 }
383 420
384 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 421 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
385 paths->clear();
386
387 std::string canon_path; 422 std::string canon_path;
388 std::string canon_query; 423 std::string canon_query;
389 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 424 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
390 425 GeneratePathVariantsToCheck(canon_path, canon_query, paths);
391 const std::string path = canon_path; // const sidesteps GCC bugs below!
392 const std::string query = canon_query;
393 if (path.empty())
394 return;
395
396 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
397 // the query parameters, and also up to 4 paths formed by starting at the root
398 // and adding more path components.
399 //
400 // As with the hosts above, it doesn't matter what order we check these in.
401 const size_t kMaxPathsToCheck = 4;
402 for (std::string::const_iterator i(path.begin());
403 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
404 if (*i == '/')
405 paths->push_back(std::string(path.begin(), i + 1));
406 }
407
408 if (!paths->empty() && paths->back() != path)
409 paths->push_back(path);
410
411 if (!query.empty())
412 paths->push_back(path + "?" + query);
413 } 426 }
414 427
415 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { 428 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
429 std::string canon_host;
430 std::string canon_path;
431 std::string canon_query;
432 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
433
416 std::vector<std::string> hosts, paths; 434 std::vector<std::string> hosts, paths;
417 GenerateHostsToCheck(url, &hosts); 435 GenerateHostVariantsToCheck(canon_host, &hosts);
418 GeneratePathsToCheck(url, &paths); 436 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);
419 for (size_t h = 0; h < hosts.size(); ++h) { 437 for (size_t h = 0; h < hosts.size(); ++h) {
420 for (size_t p = 0; p < paths.size(); ++p) { 438 for (size_t p = 0; p < paths.size(); ++p) {
421 urls->push_back(hosts[h] + paths[p]); 439 urls->push_back(hosts[h] + paths[p]);
422 } 440 }
423 } 441 }
424 } 442 }
425 443
426 } // namespace safe_browsing 444 } // namespace safe_browsing
OLD | NEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698