Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(75)

Unified Diff: components/safe_browsing_db/util.cc

Issue 2010713003: Only call CanonicalizeUrl once from UrlToFullHashes (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rename GenerateHostEtc Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/safe_browsing_db/util.cc
diff --git a/components/safe_browsing_db/util.cc b/components/safe_browsing_db/util.cc
index 822d1293b14bd24647f65c7139b38e71ee0eaa20..3289aaae5bd0158fe12f914a1bde7596800454d0 100644
--- a/components/safe_browsing_db/util.cc
+++ b/components/safe_browsing_db/util.cc
@@ -19,6 +19,7 @@ namespace safe_browsing {
// Utility functions -----------------------------------------------------------
namespace {
+
bool IsKnownList(const std::string& name) {
for (size_t i = 0; i < arraysize(kAllLists); ++i) {
if (!strcmp(kAllLists[i], name.c_str())) {
@@ -27,6 +28,68 @@ bool IsKnownList(const std::string& name) {
}
return false;
}
+
+void GenerateHostVariantsToCheck(const std::string& host,
+ std::vector<std::string>* hosts) {
+ hosts->clear();
+
+ if (host.empty())
+ return;
+
+ // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
+ // hostnames formed by starting with the last 5 components and successively
+ // removing the leading component. The last component isn't examined alone,
+ // since it's the TLD or a subcomponent thereof.
+ //
+ // Note that we don't need to be clever about stopping at the "real" eTLD --
+ // the data on the server side has been filtered to ensure it will not
+ // blacklist a whole TLD, and it's not significantly slower on our side to
+ // just check too much.
+ //
+ // Also note that because we have a simple blacklist, not some sort of complex
+ // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
+ // these in.
+ const size_t kMaxHostsToCheck = 4;
+ bool skipped_last_component = false;
+ for (std::string::const_reverse_iterator i(host.rbegin());
+ i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
+ if (*i == '.') {
+ if (skipped_last_component)
+ hosts->push_back(std::string(i.base(), host.end()));
+ else
+ skipped_last_component = true;
+ }
+ }
+ hosts->push_back(host);
+}
+
+void GeneratePathVariantsToCheck(const std::string& path,
+ const std::string& query,
+ std::vector<std::string>* paths) {
+ paths->clear();
+
+ if (path.empty())
+ return;
+
+ // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
+ // the query parameters, and also up to 4 paths formed by starting at the root
+ // and adding more path components.
+ //
+ // As with the hosts above, it doesn't matter what order we check these in.
+ const size_t kMaxPathsToCheck = 4;
+ for (std::string::const_iterator i(path.begin());
+ i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
+ if (*i == '/')
+ paths->push_back(std::string(path.begin(), i + 1));
+ }
+
+ if (!paths->empty() && paths->back() != path)
+ paths->push_back(path);
+
+ if (!query.empty())
+ paths->push_back(path + "?" + query);
+}
+
} // namespace
// ThreatMetadata ------------------------------------------------------------
@@ -317,15 +380,20 @@ void UrlToFullHashes(const GURL& url,
// called sparingly.
TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),
"include_whitelist_hashes", include_whitelist_hashes);
+ std::string canon_host;
+ std::string canon_path;
+ std::string canon_query;
+ CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
+
std::vector<std::string> hosts;
if (url.HostIsIPAddress()) {
hosts.push_back(url.host());
} else {
- GenerateHostsToCheck(url, &hosts);
+ GenerateHostVariantsToCheck(canon_host, &hosts);
}
std::vector<std::string> paths;
- GeneratePathsToCheck(url, &paths);
+ GeneratePathVariantsToCheck(canon_path, canon_query, &paths);
for (const std::string& host : hosts) {
for (const std::string& path : paths) {
@@ -345,77 +413,27 @@ void UrlToFullHashes(const GURL& url,
}
void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
- hosts->clear();
-
std::string canon_host;
CanonicalizeUrl(url, &canon_host, NULL, NULL);
-
- const std::string host = canon_host; // const sidesteps GCC bugs below!
- if (host.empty())
- return;
-
- // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
- // hostnames formed by starting with the last 5 components and successively
- // removing the leading component. The last component isn't examined alone,
- // since it's the TLD or a subcomponent thereof.
- //
- // Note that we don't need to be clever about stopping at the "real" eTLD --
- // the data on the server side has been filtered to ensure it will not
- // blacklist a whole TLD, and it's not significantly slower on our side to
- // just check too much.
- //
- // Also note that because we have a simple blacklist, not some sort of complex
- // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
- // these in.
- const size_t kMaxHostsToCheck = 4;
- bool skipped_last_component = false;
- for (std::string::const_reverse_iterator i(host.rbegin());
- i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
- if (*i == '.') {
- if (skipped_last_component)
- hosts->push_back(std::string(i.base(), host.end()));
- else
- skipped_last_component = true;
- }
- }
- hosts->push_back(host);
+ GenerateHostVariantsToCheck(canon_host, hosts);
}
void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
- paths->clear();
-
std::string canon_path;
std::string canon_query;
CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
-
- const std::string path = canon_path; // const sidesteps GCC bugs below!
- const std::string query = canon_query;
- if (path.empty())
- return;
-
- // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
- // the query parameters, and also up to 4 paths formed by starting at the root
- // and adding more path components.
- //
- // As with the hosts above, it doesn't matter what order we check these in.
- const size_t kMaxPathsToCheck = 4;
- for (std::string::const_iterator i(path.begin());
- i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
- if (*i == '/')
- paths->push_back(std::string(path.begin(), i + 1));
- }
-
- if (!paths->empty() && paths->back() != path)
- paths->push_back(path);
-
- if (!query.empty())
- paths->push_back(path + "?" + query);
+ GeneratePathVariantsToCheck(canon_path, canon_query, paths);
}
void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
+ std::string canon_host;
+ std::string canon_path;
+ std::string canon_query;
+ CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
+
std::vector<std::string> hosts, paths;
- GenerateHostsToCheck(url, &hosts);
- GeneratePathsToCheck(url, &paths);
+ GenerateHostVariantsToCheck(canon_host, &hosts);
+ GeneratePathVariantsToCheck(canon_path, canon_query, &paths);
for (size_t h = 0; h < hosts.size(); ++h) {
for (size_t p = 0; p < paths.size(); ++p) {
urls->push_back(hosts[h] + paths[p]);
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698