Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(87)

Side by Side Diff: components/safe_browsing_db/v4_protocol_manager_util.cc

Issue 2225113002: Reland: Move PVer4 related code from util.* to v4_protocol_manager_util.* (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Minor: Address nparker@'s comments Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/safe_browsing_db/v4_protocol_manager_util.h" 5 #include "components/safe_browsing_db/v4_protocol_manager_util.h"
6 6
7 #include "base/base64.h" 7 #include "base/base64.h"
8 #include "base/metrics/sparse_histogram.h" 8 #include "base/metrics/sparse_histogram.h"
9 #include "base/rand_util.h" 9 #include "base/rand_util.h"
10 #include "base/strings/string_util.h"
10 #include "base/strings/stringprintf.h" 11 #include "base/strings/stringprintf.h"
12 #include "crypto/sha2.h"
11 #include "net/base/escape.h" 13 #include "net/base/escape.h"
12 #include "net/http/http_request_headers.h" 14 #include "net/http/http_request_headers.h"
15 #include "url/url_util.h"
13 16
14 using base::Time; 17 using base::Time;
15 using base::TimeDelta; 18 using base::TimeDelta;
16 19
17 namespace safe_browsing { 20 namespace safe_browsing {
18 21
22 namespace {
23
24 std::string Unescape(const std::string& url) {
25 std::string unescaped_str(url);
26 const int kMaxLoopIterations = 1024;
27 size_t old_size = 0;
28 int loop_var = 0;
29 do {
30 old_size = unescaped_str.size();
31 unescaped_str = net::UnescapeURLComponent(
32 unescaped_str,
33 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS |
34 net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
35 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
36 } while (old_size != unescaped_str.size() &&
37 ++loop_var <= kMaxLoopIterations);
38
39 return unescaped_str;
40 }
41
// Percent-escapes every byte of |url| that is <= ' ' (ASCII 32), > '~', or is
// '#' or '%'. Escapes are written as '%' followed by two uppercase hex digits;
// every other byte is copied through unchanged.
std::string Escape(const std::string& url) {
  static const char kHexDigits[] = "0123456789ABCDEF";

  std::string result;
  // Escaping can only grow the string, so reserve twice the input length up
  // front to make reallocation less likely.
  result.reserve(url.size() * 2);

  for (const char raw : url) {
    const unsigned char byte = static_cast<unsigned char>(raw);
    const bool needs_escape =
        byte <= ' ' || byte > '~' || byte == '#' || byte == '%';
    if (!needs_escape) {
      result.push_back(raw);
      continue;
    }
    result.push_back('%');
    result.push_back(kHexDigits[byte >> 4]);
    result.push_back(kHexDigits[byte & 0xf]);
  }

  return result;
}
61
62 } // namespace
63
19 std::ostream& operator<<(std::ostream& os, const UpdateListIdentifier& id) { 64 std::ostream& operator<<(std::ostream& os, const UpdateListIdentifier& id) {
20 os << "{hash: " << id.hash() << "; platform_type: " << id.platform_type 65 os << "{hash: " << id.hash() << "; platform_type: " << id.platform_type
21 << "; threat_entry_type: " << id.threat_entry_type 66 << "; threat_entry_type: " << id.threat_entry_type
22 << "; threat_type: " << id.threat_type << "}"; 67 << "; threat_type: " << id.threat_type << "}";
23 return os; 68 return os;
24 } 69 }
25 70
// PLATFORM_TYPE is the platform value used when building the list identifiers
// below. The OS_* macros are mutually exclusive, so the order of the checks
// does not matter.
#if defined(OS_MACOSX)
#define PLATFORM_TYPE OSX_PLATFORM
#elif defined(OS_WIN)
#define PLATFORM_TYPE WINDOWS_PLATFORM
#elif defined(OS_LINUX)
#define PLATFORM_TYPE LINUX_PLATFORM
#else
// Ideally this file would never be compiled for any other platform, but it is
// currently being compiled on Android.
// See: https://bugs.chromium.org/p/chromium/issues/detail?id=621647
// TODO(vakh): Remove this fallback once that bug is fixed. It exists because
// leaving the platform_type empty would make the server reject the request
// with an error response, which would pollute our UMA metrics.
#define PLATFORM_TYPE LINUX_PLATFORM
#endif
85
86 const UpdateListIdentifier GetUrlMalwareId() {
87 return UpdateListIdentifier(PLATFORM_TYPE, URL, MALWARE_THREAT);
88 }
89
90 const UpdateListIdentifier GetUrlSocEngId() {
91 return UpdateListIdentifier(PLATFORM_TYPE, URL, SOCIAL_ENGINEERING_PUBLIC);
92 }
93
26 // The Safe Browsing V4 server URL prefix. 94 // The Safe Browsing V4 server URL prefix.
27 const char kSbV4UrlPrefix[] = "https://safebrowsing.googleapis.com/v4"; 95 const char kSbV4UrlPrefix[] = "https://safebrowsing.googleapis.com/v4";
28 96
29 bool UpdateListIdentifier::operator==(const UpdateListIdentifier& other) const { 97 bool UpdateListIdentifier::operator==(const UpdateListIdentifier& other) const {
30 return platform_type == other.platform_type && 98 return platform_type == other.platform_type &&
31 threat_entry_type == other.threat_entry_type && 99 threat_entry_type == other.threat_entry_type &&
32 threat_type == other.threat_type; 100 threat_type == other.threat_type;
33 } 101 }
34 102
35 bool UpdateListIdentifier::operator!=(const UpdateListIdentifier& other) const { 103 bool UpdateListIdentifier::operator!=(const UpdateListIdentifier& other) const {
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
131 } 199 }
132 200
133 // static 201 // static
134 void V4ProtocolManagerUtil::UpdateHeaders(net::HttpRequestHeaders* headers) { 202 void V4ProtocolManagerUtil::UpdateHeaders(net::HttpRequestHeaders* headers) {
135 // NOTE(vakh): The following header informs the envelope server (which sits in 203 // NOTE(vakh): The following header informs the envelope server (which sits in
136 // front of Google's stubby server) that the received GET request should be 204 // front of Google's stubby server) that the received GET request should be
137 // interpreted as a POST. 205 // interpreted as a POST.
138 headers->SetHeaderIfMissing("X-HTTP-Method-Override", "POST"); 206 headers->SetHeaderIfMissing("X-HTTP-Method-Override", "POST");
139 } 207 }
140 208
209 // static
210 void V4ProtocolManagerUtil::UrlToFullHashes(
211 const GURL& url,
212 base::hash_set<FullHash>* full_hashes) {
213 std::string canon_host, canon_path, canon_query;
214 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
215
216 std::vector<std::string> hosts;
217 if (url.HostIsIPAddress()) {
218 hosts.push_back(url.host());
219 } else {
220 GenerateHostVariantsToCheck(canon_host, &hosts);
221 }
222
223 std::vector<std::string> paths;
224 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);
225 for (const std::string& host : hosts) {
226 for (const std::string& path : paths) {
227 full_hashes->insert(crypto::SHA256HashString(host + path));
228 }
229 }
230 }
231
232 // static
233 void V4ProtocolManagerUtil::GenerateHostsToCheck(
234 const GURL& url,
235 std::vector<std::string>* hosts) {
236 std::string canon_host;
237 CanonicalizeUrl(url, &canon_host, NULL, NULL);
238 GenerateHostVariantsToCheck(canon_host, hosts);
239 }
240
241 // static
242 void V4ProtocolManagerUtil::GeneratePathsToCheck(
243 const GURL& url,
244 std::vector<std::string>* paths) {
245 std::string canon_path;
246 std::string canon_query;
247 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
248 GeneratePathVariantsToCheck(canon_path, canon_query, paths);
249 }
250
251 // static
252 void V4ProtocolManagerUtil::GeneratePatternsToCheck(
253 const GURL& url,
254 std::vector<std::string>* urls) {
255 std::string canon_host;
256 std::string canon_path;
257 std::string canon_query;
258 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);
259
260 std::vector<std::string> hosts, paths;
261 GenerateHostVariantsToCheck(canon_host, &hosts);
262 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);
263 for (size_t h = 0; h < hosts.size(); ++h) {
264 for (size_t p = 0; p < paths.size(); ++p) {
265 urls->push_back(hosts[h] + paths[p]);
266 }
267 }
268 }
269
270 // static
271 void V4ProtocolManagerUtil::CanonicalizeUrl(const GURL& url,
272 std::string* canonicalized_hostname,
273 std::string* canonicalized_path,
274 std::string* canonicalized_query) {
275 DCHECK(url.is_valid());
276
277 // We only canonicalize "normal" URLs.
278 if (!url.IsStandard())
279 return;
280
281 // Following canonicalization steps are excluded since url parsing takes care
282 // of those :-
283 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
284 // (Exclude escaped version of these chars).
285 // 2. Normalize hostname to 4 dot-seperated decimal values.
286 // 3. Lowercase hostname.
287 // 4. Resolve path sequences "/../" and "/./".
288
289 // That leaves us with the following :-
290 // 1. Remove fragment in URL.
291 GURL url_without_fragment;
292 GURL::Replacements f_replacements;
293 f_replacements.ClearRef();
294 f_replacements.ClearUsername();
295 f_replacements.ClearPassword();
296 url_without_fragment = url.ReplaceComponents(f_replacements);
297
298 // 2. Do URL unescaping until no more hex encoded characters exist.
299 std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
300 url::Parsed parsed;
301 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),
302 &parsed);
303
304 // 3. In hostname, remove all leading and trailing dots.
305 base::StringPiece host;
306 if (parsed.host.len > 0)
307 host.set(url_unescaped_str.data() + parsed.host.begin, parsed.host.len);
308
309 base::StringPiece host_without_end_dots =
310 base::TrimString(host, ".", base::TrimPositions::TRIM_ALL);
311
312 // 4. In hostname, replace consecutive dots with a single dot.
313 std::string host_without_consecutive_dots(
314 RemoveConsecutiveChars(host_without_end_dots, '.'));
315
316 // 5. In path, replace runs of consecutive slashes with a single slash.
317 base::StringPiece path;
318 if (parsed.path.len > 0)
319 path.set(url_unescaped_str.data() + parsed.path.begin, parsed.path.len);
320 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));
321
322 url::Replacements<char> hp_replacements;
323 hp_replacements.SetHost(
324 host_without_consecutive_dots.data(),
325 url::Component(0, host_without_consecutive_dots.length()));
326 hp_replacements.SetPath(
327 path_without_consecutive_slash.data(),
328 url::Component(0, path_without_consecutive_slash.length()));
329
330 std::string url_unescaped_with_can_hostpath;
331 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
332 url::Parsed temp_parsed;
333 url::ReplaceComponents(url_unescaped_str.data(), url_unescaped_str.length(),
334 parsed, hp_replacements, NULL, &output, &temp_parsed);
335 output.Complete();
336
337 // 6. Step needed to revert escaping done in url::ReplaceComponents.
338 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
339
340 // 7. After performing all above steps, percent-escape all chars in url which
341 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
342 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
343 url::Parsed final_parsed;
344 url::ParseStandardURL(escaped_canon_url_str.data(),
345 escaped_canon_url_str.length(), &final_parsed);
346
347 if (canonicalized_hostname && final_parsed.host.len > 0) {
348 *canonicalized_hostname = escaped_canon_url_str.substr(
349 final_parsed.host.begin, final_parsed.host.len);
350 }
351 if (canonicalized_path && final_parsed.path.len > 0) {
352 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
353 final_parsed.path.len);
354 }
355 if (canonicalized_query && final_parsed.query.len > 0) {
356 *canonicalized_query = escaped_canon_url_str.substr(
357 final_parsed.query.begin, final_parsed.query.len);
358 }
359 }
360
361 // static
362 std::string V4ProtocolManagerUtil::RemoveConsecutiveChars(base::StringPiece str,
363 const char c) {
364 std::string output;
365 // Output is at most the length of the original string.
366 output.reserve(str.size());
367
368 size_t i = 0;
369 while (i < str.size()) {
370 output.append(1, str[i++]);
371 if (str[i - 1] == c) {
372 while (i < str.size() && str[i] == c) {
373 i++;
374 }
375 }
376 }
377
378 return output;
379 }
380
381 // static
382 void V4ProtocolManagerUtil::GenerateHostVariantsToCheck(
383 const std::string& host,
384 std::vector<std::string>* hosts) {
385 hosts->clear();
386
387 if (host.empty())
388 return;
389
390 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
391 // hostnames formed by starting with the last 5 components and successively
392 // removing the leading component. The last component isn't examined alone,
393 // since it's the TLD or a subcomponent thereof.
394 //
395 // Note that we don't need to be clever about stopping at the "real" eTLD --
396 // the data on the server side has been filtered to ensure it will not
397 // blacklist a whole TLD, and it's not significantly slower on our side to
398 // just check too much.
399 //
400 // Also note that because we have a simple blacklist, not some sort of complex
401 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
402 // these in.
403 const size_t kMaxHostsToCheck = 4;
404 bool skipped_last_component = false;
405 for (std::string::const_reverse_iterator i(host.rbegin());
406 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
407 if (*i == '.') {
408 if (skipped_last_component)
409 hosts->push_back(std::string(i.base(), host.end()));
410 else
411 skipped_last_component = true;
412 }
413 }
414 hosts->push_back(host);
415 }
416
417 // static
418 void V4ProtocolManagerUtil::GeneratePathVariantsToCheck(
419 const std::string& path,
420 const std::string& query,
421 std::vector<std::string>* paths) {
422 paths->clear();
423
424 if (path.empty())
425 return;
426
427 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
428 // the query parameters, and also up to 4 paths formed by starting at the root
429 // and adding more path components.
430 //
431 // As with the hosts above, it doesn't matter what order we check these in.
432 const size_t kMaxPathsToCheck = 4;
433 for (std::string::const_iterator i(path.begin());
434 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
435 if (*i == '/')
436 paths->push_back(std::string(path.begin(), i + 1));
437 }
438
439 if (!paths->empty() && paths->back() != path)
440 paths->push_back(path);
441
442 if (!query.empty())
443 paths->push_back(path + "?" + query);
444 }
445
141 } // namespace safe_browsing 446 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698