Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(64)

Side by Side Diff: components/safe_browsing_db/util.cc

Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2015 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/safe_browsing_db/util.h" 5 #include "components/safe_browsing_db/util.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/macros.h" 9 #include "base/macros.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
11 #include "base/trace_event/trace_event.h" 11 #include "base/trace_event/trace_event.h"
12 #include "crypto/sha2.h" 12 #include "crypto/sha2.h"
13 #include "net/base/escape.h" 13 #include "net/base/escape.h"
14 #include "url/gurl.h" 14 #include "url/gurl.h"
15 #include "url/url_util.h" 15 #include "url/url_util.h"
16 16
17 namespace safe_browsing { 17 namespace safe_browsing {
18 18
19 // Utility functions ----------------------------------------------------------- 19 // Utility functions -----------------------------------------------------------
20 20
21 namespace { 21 namespace {
22
22 bool IsKnownList(const std::string& name) { 23 bool IsKnownList(const std::string& name) {
23 for (size_t i = 0; i < arraysize(kAllLists); ++i) { 24 for (size_t i = 0; i < arraysize(kAllLists); ++i) {
24 if (!strcmp(kAllLists[i], name.c_str())) { 25 if (!strcmp(kAllLists[i], name.c_str())) {
25 return true; 26 return true;
26 } 27 }
27 } 28 }
28 return false; 29 return false;
29 } 30 }
31
32 void GenerateHostsToCheck(const std::string& host,
Nathan Parker 2016/05/25 17:23:35 Give these a different name, so they differ by more […comment truncated in collapsed view]
Joe Mason 2016/05/25 18:06:21 Done.
33 std::vector<std::string>* hosts) {
34 hosts->clear();
35
36 if (host.empty())
37 return;
38
39 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
40 // hostnames formed by starting with the last 5 components and successively
41 // removing the leading component. The last component isn't examined alone,
42 // since it's the TLD or a subcomponent thereof.
43 //
44 // Note that we don't need to be clever about stopping at the "real" eTLD --
45 // the data on the server side has been filtered to ensure it will not
46 // blacklist a whole TLD, and it's not significantly slower on our side to
47 // just check too much.
48 //
49 // Also note that because we have a simple blacklist, not some sort of complex
50 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
51 // these in.
52 const size_t kMaxHostsToCheck = 4;
53 bool skipped_last_component = false;
54 for (std::string::const_reverse_iterator i(host.rbegin());
55 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
56 if (*i == '.') {
57 if (skipped_last_component)
58 hosts->push_back(std::string(i.base(), host.end()));
59 else
60 skipped_last_component = true;
61 }
62 }
63 hosts->push_back(host);
64 }
65
66 void GeneratePathsToCheck(const std::string& path, const std::string& query,
67 std::vector<std::string>* paths) {
68 paths->clear();
69
70 if (path.empty())
71 return;
72
73 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
74 // the query parameters, and also up to 4 paths formed by starting at the root
75 // and adding more path components.
76 //
77 // As with the hosts above, it doesn't matter what order we check these in.
78 const size_t kMaxPathsToCheck = 4;
79 for (std::string::const_iterator i(path.begin());
80 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
81 if (*i == '/')
82 paths->push_back(std::string(path.begin(), i + 1));
83 }
84
85 if (!paths->empty() && paths->back() != path)
86 paths->push_back(path);
87
88 if (!query.empty())
89 paths->push_back(path + "?" + query);
90 }
91
30 } // namespace 92 } // namespace
31 93
32 // ThreatMetadata ------------------------------------------------------------ 94 // ThreatMetadata ------------------------------------------------------------
33 ThreatMetadata::ThreatMetadata() 95 ThreatMetadata::ThreatMetadata()
34 : threat_pattern_type(ThreatPatternType::NONE) {} 96 : threat_pattern_type(ThreatPatternType::NONE) {}
35 97
36 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default; 98 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;
37 99
38 ThreatMetadata::~ThreatMetadata() {} 100 ThreatMetadata::~ThreatMetadata() {}
39 101
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after
310 } 372 }
311 } 373 }
312 374
313 void UrlToFullHashes(const GURL& url, 375 void UrlToFullHashes(const GURL& url,
314 bool include_whitelist_hashes, 376 bool include_whitelist_hashes,
315 std::vector<SBFullHash>* full_hashes) { 377 std::vector<SBFullHash>* full_hashes) {
316 // Include this function in traces because it's not cheap so it should be 378 // Include this function in traces because it's not cheap so it should be
317 // called sparingly. 379 // called sparingly.
318 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(), 380 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),
319 "include_whitelist_hashes", include_whitelist_hashes); 381 "include_whitelist_hashes", include_whitelist_hashes);
382 std::string canon_host;
383 std::string canon_path;
384 std::string canon_query;
385 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
386
320 std::vector<std::string> hosts; 387 std::vector<std::string> hosts;
321 if (url.HostIsIPAddress()) { 388 if (url.HostIsIPAddress()) {
322 hosts.push_back(url.host()); 389 hosts.push_back(url.host());
323 } else { 390 } else {
324 GenerateHostsToCheck(url, &hosts); 391 GenerateHostsToCheck(canon_host, &hosts);
325 } 392 }
326 393
327 std::vector<std::string> paths; 394 std::vector<std::string> paths;
328 GeneratePathsToCheck(url, &paths); 395 GeneratePathsToCheck(canon_path, canon_query, &paths);
329 396
330 for (const std::string& host : hosts) { 397 for (const std::string& host : hosts) {
331 for (const std::string& path : paths) { 398 for (const std::string& path : paths) {
332 full_hashes->push_back( 399 full_hashes->push_back(
333 SBFullHashForString(host + path)); 400 SBFullHashForString(host + path));
334 401
335 // We may have /foo as path-prefix in the whitelist which should 402 // We may have /foo as path-prefix in the whitelist which should
336 // also match with /foo/bar and /foo?bar. Hence, for every path 403 // also match with /foo/bar and /foo?bar. Hence, for every path
337 // that ends in '/' we also add the path without the slash. 404 // that ends in '/' we also add the path without the slash.
338 if (include_whitelist_hashes && path.size() > 1 && 405 if (include_whitelist_hashes && path.size() > 1 &&
339 path[path.size() - 1] == '/') { 406 path[path.size() - 1] == '/') {
340 full_hashes->push_back(SBFullHashForString( 407 full_hashes->push_back(SBFullHashForString(
341 host + path.substr(0, path.size() - 1))); 408 host + path.substr(0, path.size() - 1)));
342 } 409 }
343 } 410 }
344 } 411 }
345 } 412 }
346 413
347 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { 414 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
348 hosts->clear();
349
350 std::string canon_host; 415 std::string canon_host;
351 CanonicalizeUrl(url, &canon_host, NULL, NULL); 416 CanonicalizeUrl(url, &canon_host, NULL, NULL);
352 417 GenerateHostsToCheck(canon_host, hosts);
353 const std::string host = canon_host; // const sidesteps GCC bugs below!
Joe Mason 2016/05/25 15:44:44 This note was added in commit 080438b8886070e399c3… […hash and comment truncated in collapsed view]
354 if (host.empty())
355 return;
356
357 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
358 // hostnames formed by starting with the last 5 components and successively
359 // removing the leading component. The last component isn't examined alone,
360 // since it's the TLD or a subcomponent thereof.
361 //
362 // Note that we don't need to be clever about stopping at the "real" eTLD --
363 // the data on the server side has been filtered to ensure it will not
364 // blacklist a whole TLD, and it's not significantly slower on our side to
365 // just check too much.
366 //
367 // Also note that because we have a simple blacklist, not some sort of complex
368 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
369 // these in.
370 const size_t kMaxHostsToCheck = 4;
371 bool skipped_last_component = false;
372 for (std::string::const_reverse_iterator i(host.rbegin());
373 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
374 if (*i == '.') {
375 if (skipped_last_component)
376 hosts->push_back(std::string(i.base(), host.end()));
377 else
378 skipped_last_component = true;
379 }
380 }
381 hosts->push_back(host);
382 } 418 }
383 419
384 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { 420 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
385 paths->clear();
386
387 std::string canon_path; 421 std::string canon_path;
388 std::string canon_query; 422 std::string canon_query;
389 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); 423 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
390 424 GeneratePathsToCheck(canon_path, canon_query, paths);
391 const std::string path = canon_path; // const sidesteps GCC bugs below!
392 const std::string query = canon_query;
393 if (path.empty())
394 return;
395
396 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
397 // the query parameters, and also up to 4 paths formed by starting at the root
398 // and adding more path components.
399 //
400 // As with the hosts above, it doesn't matter what order we check these in.
401 const size_t kMaxPathsToCheck = 4;
402 for (std::string::const_iterator i(path.begin());
403 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
404 if (*i == '/')
405 paths->push_back(std::string(path.begin(), i + 1));
406 }
407
408 if (!paths->empty() && paths->back() != path)
409 paths->push_back(path);
410
411 if (!query.empty())
412 paths->push_back(path + "?" + query);
413 } 425 }
414 426
415 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { 427 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
428 std::string canon_host;
429 std::string canon_path;
430 std::string canon_query;
431 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
432
416 std::vector<std::string> hosts, paths; 433 std::vector<std::string> hosts, paths;
417 GenerateHostsToCheck(url, &hosts); 434 GenerateHostsToCheck(canon_host, &hosts);
418 GeneratePathsToCheck(url, &paths); 435 GeneratePathsToCheck(canon_path, canon_query, &paths);
419 for (size_t h = 0; h < hosts.size(); ++h) { 436 for (size_t h = 0; h < hosts.size(); ++h) {
420 for (size_t p = 0; p < paths.size(); ++p) { 437 for (size_t p = 0; p < paths.size(); ++p) {
421 urls->push_back(hosts[h] + paths[p]); 438 urls->push_back(hosts[h] + paths[p]);
422 } 439 }
423 } 440 }
424 } 441 }
425 442
426 } // namespace safe_browsing 443 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698