Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2015 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/safe_browsing_db/util.h" | 5 #include "components/safe_browsing_db/util.h" |
| 6 | 6 |
| 7 #include <stddef.h> | 7 #include <stddef.h> |
| 8 | 8 |
| 9 #include "base/macros.h" | 9 #include "base/macros.h" |
| 10 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
| 11 #include "base/trace_event/trace_event.h" | 11 #include "base/trace_event/trace_event.h" |
| 12 #include "crypto/sha2.h" | 12 #include "crypto/sha2.h" |
| 13 #include "net/base/escape.h" | 13 #include "net/base/escape.h" |
| 14 #include "url/gurl.h" | 14 #include "url/gurl.h" |
| 15 #include "url/url_util.h" | 15 #include "url/url_util.h" |
| 16 | 16 |
| 17 namespace safe_browsing { | 17 namespace safe_browsing { |
| 18 | 18 |
| 19 // Utility functions ----------------------------------------------------------- | 19 // Utility functions ----------------------------------------------------------- |
| 20 | 20 |
| 21 namespace { | 21 namespace { |
| 22 | |
| 22 bool IsKnownList(const std::string& name) { | 23 bool IsKnownList(const std::string& name) { |
| 23 for (size_t i = 0; i < arraysize(kAllLists); ++i) { | 24 for (size_t i = 0; i < arraysize(kAllLists); ++i) { |
| 24 if (!strcmp(kAllLists[i], name.c_str())) { | 25 if (!strcmp(kAllLists[i], name.c_str())) { |
| 25 return true; | 26 return true; |
| 26 } | 27 } |
| 27 } | 28 } |
| 28 return false; | 29 return false; |
| 29 } | 30 } |
| 31 | |
| 32 void GenerateHostsToCheck(const std::string& host, | |
|
Nathan Parker
2016/05/25 17:23:35
Give these a different name, so they differ by mor
Joe Mason
2016/05/25 18:06:21
Done.
| |
| 33 std::vector<std::string>* hosts) { | |
| 34 hosts->clear(); | |
| 35 | |
| 36 if (host.empty()) | |
| 37 return; | |
| 38 | |
| 39 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 | |
| 40 // hostnames formed by starting with the last 5 components and successively | |
| 41 // removing the leading component. The last component isn't examined alone, | |
| 42 // since it's the TLD or a subcomponent thereof. | |
| 43 // | |
| 44 // Note that we don't need to be clever about stopping at the "real" eTLD -- | |
| 45 // the data on the server side has been filtered to ensure it will not | |
| 46 // blacklist a whole TLD, and it's not significantly slower on our side to | |
| 47 // just check too much. | |
| 48 // | |
| 49 // Also note that because we have a simple blacklist, not some sort of complex | |
| 50 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check | |
| 51 // these in. | |
| 52 const size_t kMaxHostsToCheck = 4; | |
| 53 bool skipped_last_component = false; | |
| 54 for (std::string::const_reverse_iterator i(host.rbegin()); | |
| 55 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { | |
| 56 if (*i == '.') { | |
| 57 if (skipped_last_component) | |
| 58 hosts->push_back(std::string(i.base(), host.end())); | |
| 59 else | |
| 60 skipped_last_component = true; | |
| 61 } | |
| 62 } | |
| 63 hosts->push_back(host); | |
| 64 } | |
| 65 | |
| 66 void GeneratePathsToCheck(const std::string& path, const std::string& query, | |
| 67 std::vector<std::string>* paths) { | |
| 68 paths->clear(); | |
| 69 | |
| 70 if (path.empty()) | |
| 71 return; | |
| 72 | |
| 73 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without | |
| 74 // the query parameters, and also up to 4 paths formed by starting at the root | |
| 75 // and adding more path components. | |
| 76 // | |
| 77 // As with the hosts above, it doesn't matter what order we check these in. | |
| 78 const size_t kMaxPathsToCheck = 4; | |
| 79 for (std::string::const_iterator i(path.begin()); | |
| 80 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { | |
| 81 if (*i == '/') | |
| 82 paths->push_back(std::string(path.begin(), i + 1)); | |
| 83 } | |
| 84 | |
| 85 if (!paths->empty() && paths->back() != path) | |
| 86 paths->push_back(path); | |
| 87 | |
| 88 if (!query.empty()) | |
| 89 paths->push_back(path + "?" + query); | |
| 90 } | |
| 91 | |
| 30 } // namespace | 92 } // namespace |
| 31 | 93 |
| 32 // ThreatMetadata ------------------------------------------------------------ | 94 // ThreatMetadata ------------------------------------------------------------ |
| 33 ThreatMetadata::ThreatMetadata() | 95 ThreatMetadata::ThreatMetadata() |
| 34 : threat_pattern_type(ThreatPatternType::NONE) {} | 96 : threat_pattern_type(ThreatPatternType::NONE) {} |
| 35 | 97 |
| 36 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default; | 98 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default; |
| 37 | 99 |
| 38 ThreatMetadata::~ThreatMetadata() {} | 100 ThreatMetadata::~ThreatMetadata() {} |
| 39 | 101 |
| (...skipping 270 matching lines...) | (...skipping 270 matching lines...) |
| 310 } | 372 } |
| 311 } | 373 } |
| 312 | 374 |
| 313 void UrlToFullHashes(const GURL& url, | 375 void UrlToFullHashes(const GURL& url, |
| 314 bool include_whitelist_hashes, | 376 bool include_whitelist_hashes, |
| 315 std::vector<SBFullHash>* full_hashes) { | 377 std::vector<SBFullHash>* full_hashes) { |
| 316 // Include this function in traces because it's not cheap so it should be | 378 // Include this function in traces because it's not cheap so it should be |
| 317 // called sparingly. | 379 // called sparingly. |
| 318 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(), | 380 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(), |
| 319 "include_whitelist_hashes", include_whitelist_hashes); | 381 "include_whitelist_hashes", include_whitelist_hashes); |
| 382 std::string canon_host; | |
| 383 std::string canon_path; | |
| 384 std::string canon_query; | |
| 385 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query); | |
| 386 | |
| 320 std::vector<std::string> hosts; | 387 std::vector<std::string> hosts; |
| 321 if (url.HostIsIPAddress()) { | 388 if (url.HostIsIPAddress()) { |
| 322 hosts.push_back(url.host()); | 389 hosts.push_back(url.host()); |
| 323 } else { | 390 } else { |
| 324 GenerateHostsToCheck(url, &hosts); | 391 GenerateHostsToCheck(canon_host, &hosts); |
| 325 } | 392 } |
| 326 | 393 |
| 327 std::vector<std::string> paths; | 394 std::vector<std::string> paths; |
| 328 GeneratePathsToCheck(url, &paths); | 395 GeneratePathsToCheck(canon_path, canon_query, &paths); |
| 329 | 396 |
| 330 for (const std::string& host : hosts) { | 397 for (const std::string& host : hosts) { |
| 331 for (const std::string& path : paths) { | 398 for (const std::string& path : paths) { |
| 332 full_hashes->push_back( | 399 full_hashes->push_back( |
| 333 SBFullHashForString(host + path)); | 400 SBFullHashForString(host + path)); |
| 334 | 401 |
| 335 // We may have /foo as path-prefix in the whitelist which should | 402 // We may have /foo as path-prefix in the whitelist which should |
| 336 // also match with /foo/bar and /foo?bar. Hence, for every path | 403 // also match with /foo/bar and /foo?bar. Hence, for every path |
| 337 // that ends in '/' we also add the path without the slash. | 404 // that ends in '/' we also add the path without the slash. |
| 338 if (include_whitelist_hashes && path.size() > 1 && | 405 if (include_whitelist_hashes && path.size() > 1 && |
| 339 path[path.size() - 1] == '/') { | 406 path[path.size() - 1] == '/') { |
| 340 full_hashes->push_back(SBFullHashForString( | 407 full_hashes->push_back(SBFullHashForString( |
| 341 host + path.substr(0, path.size() - 1))); | 408 host + path.substr(0, path.size() - 1))); |
| 342 } | 409 } |
| 343 } | 410 } |
| 344 } | 411 } |
| 345 } | 412 } |
| 346 | 413 |
| 347 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { | 414 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { |
| 348 hosts->clear(); | |
| 349 | |
| 350 std::string canon_host; | 415 std::string canon_host; |
| 351 CanonicalizeUrl(url, &canon_host, NULL, NULL); | 416 CanonicalizeUrl(url, &canon_host, NULL, NULL); |
| 352 | 417 GenerateHostsToCheck(canon_host, hosts); |
| 353 const std::string host = canon_host; // const sidesteps GCC bugs below! | |
|
Joe Mason
2016/05/25 15:44:44
This note was added in commit 080438b8886070e399c3
| |
| 354 if (host.empty()) | |
| 355 return; | |
| 356 | |
| 357 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 | |
| 358 // hostnames formed by starting with the last 5 components and successively | |
| 359 // removing the leading component. The last component isn't examined alone, | |
| 360 // since it's the TLD or a subcomponent thereof. | |
| 361 // | |
| 362 // Note that we don't need to be clever about stopping at the "real" eTLD -- | |
| 363 // the data on the server side has been filtered to ensure it will not | |
| 364 // blacklist a whole TLD, and it's not significantly slower on our side to | |
| 365 // just check too much. | |
| 366 // | |
| 367 // Also note that because we have a simple blacklist, not some sort of complex | |
| 368 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check | |
| 369 // these in. | |
| 370 const size_t kMaxHostsToCheck = 4; | |
| 371 bool skipped_last_component = false; | |
| 372 for (std::string::const_reverse_iterator i(host.rbegin()); | |
| 373 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { | |
| 374 if (*i == '.') { | |
| 375 if (skipped_last_component) | |
| 376 hosts->push_back(std::string(i.base(), host.end())); | |
| 377 else | |
| 378 skipped_last_component = true; | |
| 379 } | |
| 380 } | |
| 381 hosts->push_back(host); | |
| 382 } | 418 } |
| 383 | 419 |
| 384 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { | 420 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { |
| 385 paths->clear(); | |
| 386 | |
| 387 std::string canon_path; | 421 std::string canon_path; |
| 388 std::string canon_query; | 422 std::string canon_query; |
| 389 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); | 423 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); |
| 390 | 424 GeneratePathsToCheck(canon_path, canon_query, paths); |
| 391 const std::string path = canon_path; // const sidesteps GCC bugs below! | |
| 392 const std::string query = canon_query; | |
| 393 if (path.empty()) | |
| 394 return; | |
| 395 | |
| 396 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without | |
| 397 // the query parameters, and also up to 4 paths formed by starting at the root | |
| 398 // and adding more path components. | |
| 399 // | |
| 400 // As with the hosts above, it doesn't matter what order we check these in. | |
| 401 const size_t kMaxPathsToCheck = 4; | |
| 402 for (std::string::const_iterator i(path.begin()); | |
| 403 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { | |
| 404 if (*i == '/') | |
| 405 paths->push_back(std::string(path.begin(), i + 1)); | |
| 406 } | |
| 407 | |
| 408 if (!paths->empty() && paths->back() != path) | |
| 409 paths->push_back(path); | |
| 410 | |
| 411 if (!query.empty()) | |
| 412 paths->push_back(path + "?" + query); | |
| 413 } | 425 } |
| 414 | 426 |
| 415 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { | 427 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { |
| 428 std::string canon_host; | |
| 429 std::string canon_path; | |
| 430 std::string canon_query; | |
| 431 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query); | |
| 432 | |
| 416 std::vector<std::string> hosts, paths; | 433 std::vector<std::string> hosts, paths; |
| 417 GenerateHostsToCheck(url, &hosts); | 434 GenerateHostsToCheck(canon_host, &hosts); |
| 418 GeneratePathsToCheck(url, &paths); | 435 GeneratePathsToCheck(canon_path, canon_query, &paths); |
| 419 for (size_t h = 0; h < hosts.size(); ++h) { | 436 for (size_t h = 0; h < hosts.size(); ++h) { |
| 420 for (size_t p = 0; p < paths.size(); ++p) { | 437 for (size_t p = 0; p < paths.size(); ++p) { |
| 421 urls->push_back(hosts[h] + paths[p]); | 438 urls->push_back(hosts[h] + paths[p]); |
| 422 } | 439 } |
| 423 } | 440 } |
| 424 } | 441 } |
| 425 | 442 |
| 426 } // namespace safe_browsing | 443 } // namespace safe_browsing |
| OLD | NEW |