components/safe_browsing_db/util.cc - Issue 2229533002: Revert of Simple: Move PVer4 related code from util.* to v4_protocol_manager_util.*

Side by Side Diff: components/safe_browsing_db/util.cc

Issue 2229533002: Revert of Simple: Move PVer4 related code from util.* to v4_protocol_manager_util.* (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/safe_browsing_db/util.h"	5 #include "components/safe_browsing_db/util.h"

6	6

7 #include <stddef.h>	7 #include <stddef.h>

8	8

9 #include "base/macros.h"	9 #include "base/macros.h"

	10 #include "base/strings/string_util.h"

10 #include "base/trace_event/trace_event.h"	11 #include "base/trace_event/trace_event.h"

11 #include "components/safe_browsing_db/v4_protocol_manager_util.h"

12 #include "crypto/sha2.h"	12 #include "crypto/sha2.h"

13 #include "net/base/escape.h"	13 #include "net/base/escape.h"

14 #include "url/gurl.h"	14 #include "url/gurl.h"

	15 #include "url/url_util.h"

15	16

16 namespace safe_browsing {	17 namespace safe_browsing {

17	18

18 // Utility functions -----------------------------------------------------------	19 // Utility functions -----------------------------------------------------------

19	20

20 namespace {	21 namespace {

21	22

22 bool IsKnownList(const std::string& name) {	23 bool IsKnownList(const std::string& name) {

23 for (size_t i = 0; i < arraysize(kAllLists); ++i) {	24 for (size_t i = 0; i < arraysize(kAllLists); ++i) {

24 if (!strcmp(kAllLists[i], name.c_str())) {	25 if (!strcmp(kAllLists[i], name.c_str())) {

25 return true;	26 return true;

26 }	27 }

27 }	28 }

28 return false;	29 return false;

29 }	30 }

30	31

	// Fills |hosts| with the host-name variants to look up for |host|: up to four
	// trailing-suffix variants, followed by |host| itself. |hosts| is always
	// cleared first and stays empty when |host| is empty.
	void GenerateHostVariantsToCheck(const std::string& host,
	                                 std::vector<std::string>* hosts) {
	  hosts->clear();
	  if (host.empty())
	    return;

	  // Per the Safe Browsing Protocol v2 spec, the candidates are the full host
	  // plus up to 4 names built from its last 5 components, dropping the leading
	  // component each time. The final component is never tried alone, since it
	  // is the TLD or a part of it.
	  //
	  // There is no attempt to stop at the "real" eTLD: the server-side data is
	  // filtered so that it cannot blacklist a whole TLD, and checking a few
	  // extra names costs next to nothing here.
	  //
	  // The list is a plain blacklist, so the order of the entries is irrelevant.
	  const size_t kMaxSuffixVariants = 4;

	  // Visit the dots from right to left. The text after the rightmost dot is
	  // the last component on its own, so that dot yields no variant.
	  bool at_rightmost_dot = true;
	  size_t dot = host.rfind('.');
	  while (dot != std::string::npos && hosts->size() < kMaxSuffixVariants) {
	    if (at_rightmost_dot)
	      at_rightmost_dot = false;
	    else
	      hosts->push_back(host.substr(dot + 1));

	    if (dot == 0)
	      break;
	    dot = host.rfind('.', dot - 1);
	  }

	  hosts->push_back(host);
	}

	65

	// Fills |paths| with the path variants to look up for |path| and |query|.
	// |paths| is always cleared first and stays empty when |path| is empty.
	void GeneratePathVariantsToCheck(const std::string& path,
	                                 const std::string& query,
	                                 std::vector<std::string>* paths) {
	  paths->clear();
	  if (path.empty())
	    return;

	  // Per the Safe Browsing Protocol v2 spec, the candidates are the exact path
	  // with and without the query, plus up to 4 prefixes that start at the root
	  // and grow by one path component at a time.
	  //
	  // As with the host variants, the order of the entries is irrelevant.
	  const size_t kMaxPrefixVariants = 4;
	  for (size_t slash = path.find('/');
	       slash != std::string::npos && paths->size() < kMaxPrefixVariants;
	       slash = path.find('/', slash + 1)) {
	    // Each prefix includes the slash that terminates it.
	    paths->push_back(path.substr(0, slash + 1));
	  }

	  // Add the exact path unless the last prefix already equals it. Nothing is
	  // added here when no prefix was produced at all.
	  const bool exact_path_missing = !paths->empty() && paths->back() != path;
	  if (exact_path_missing)
	    paths->push_back(path);

	  if (!query.empty())
	    paths->push_back(path + "?" + query);
	}

	92

31 } // namespace	93 } // namespace

32	94

33 // ThreatMetadata ------------------------------------------------------------	95 // ThreatMetadata ------------------------------------------------------------

34 ThreatMetadata::ThreatMetadata()	96 ThreatMetadata::ThreatMetadata()

35 : threat_pattern_type(ThreatPatternType::NONE) {}	97 : threat_pattern_type(ThreatPatternType::NONE) {}

36	98

37 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;	99 ThreatMetadata::ThreatMetadata(const ThreatMetadata& other) = default;

38	100

39 ThreatMetadata::~ThreatMetadata() {}	101 ThreatMetadata::~ThreatMetadata() {}

40	102

(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
147 SBFullHash hash_out;	209 SBFullHash hash_out;

148 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);	210 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);

149 return hash_out;	211 return hash_out;

150 }	212 }

151	213

152 std::string SBFullHashToString(const SBFullHash& hash) {	214 std::string SBFullHashToString(const SBFullHash& hash) {

153 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));	215 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));

154 return std::string(hash.full_hash, sizeof(hash.full_hash));	216 return std::string(hash.full_hash, sizeof(hash.full_hash));

155 }	217 }

156	218

	219

	220 std::string Unescape(const std::string& url) {

	221 std::string unescaped_str(url);

	222 const int kMaxLoopIterations = 1024;

	223 size_t old_size = 0;

	224 int loop_var = 0;

	225 do {

	226 old_size = unescaped_str.size();

	227 unescaped_str = net::UnescapeURLComponent(

	228 unescaped_str,

	229 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS \|

	230 net::UnescapeRule::SPACES \| net::UnescapeRule::PATH_SEPARATORS \|

	231 net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

	232 } while (old_size != unescaped_str.size() &&

	233 ++loop_var <= kMaxLoopIterations);

	234

	235 return unescaped_str;

	236 }

	237

	// Returns |url| with every byte that is <= ' ' or > '~', and every '#' and
	// '%', replaced by "%XX", where XX is the byte value in uppercase hex. All
	// other bytes are copied unchanged.
	std::string Escape(const std::string& url) {
	  static const char kUpperHexDigits[] = "0123456789ABCDEF";

	  std::string result;
	  // Escaping only ever lengthens the string, so reserve twice the input
	  // length up front to make regrowth less likely.
	  result.reserve(url.length() * 2);

	  for (const char raw : url) {
	    // Compare as unsigned so that bytes >= 0x80 fall above '~'.
	    const unsigned char byte = static_cast<unsigned char>(raw);
	    const bool in_printable_range = byte > ' ' && byte <= '~';
	    if (in_printable_range && byte != '#' && byte != '%') {
	      result.push_back(raw);
	      continue;
	    }

	    result.push_back('%');
	    result.push_back(kUpperHexDigits[byte >> 4]);
	    result.push_back(kUpperHexDigits[byte & 0xf]);
	  }

	  return result;
	}

	257

	258 std::string RemoveConsecutiveChars(base::StringPiece str, const char c) {

	259 std::string output;

	260 // Output is at most the length of the original string.

	261 output.reserve(str.size());

	262

	263 size_t i = 0;

	264 while (i < str.size()) {

	265 output.append(1, str[i++]);

	266 if (str[i - 1] == c) {

	267 while (i < str.size() && str[i] == c) {

	268 i++;

	269 }

	270 }

	271 }

	272

	273 return output;

	274 }

	275

	276 // Canonicalizes url as per Google Safe Browsing Specification.

	277 // See section 6.1 in

	278 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.

	279 void CanonicalizeUrl(const GURL& url,

	280 std::string* canonicalized_hostname,

	281 std::string* canonicalized_path,

	282 std::string* canonicalized_query) {

	283 DCHECK(url.is_valid());

	284

	285 // We only canonicalize "normal" URLs.

	286 if (!url.IsStandard())

	287 return;

	288

	289 // Following canonicalization steps are excluded since url parsing takes care

	290 // of those :-

	291 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.

	292 // (Exclude escaped version of these chars).

	293 // 2. Normalize hostname to 4 dot-seperated decimal values.

	294 // 3. Lowercase hostname.

	295 // 4. Resolve path sequences "/../" and "/./".

	296

	297 // That leaves us with the following :-

	298 // 1. Remove fragment in URL.

	299 GURL url_without_fragment;

	300 GURL::Replacements f_replacements;

	301 f_replacements.ClearRef();

	302 f_replacements.ClearUsername();

	303 f_replacements.ClearPassword();

	304 url_without_fragment = url.ReplaceComponents(f_replacements);

	305

	306 // 2. Do URL unescaping until no more hex encoded characters exist.

	307 std::string url_unescaped_str(Unescape(url_without_fragment.spec()));

	308 url::Parsed parsed;

	309 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),

	310 &parsed);

	311

	312 // 3. In hostname, remove all leading and trailing dots.

	313 base::StringPiece host;

	314 if (parsed.host.len > 0)

	315 host.set(url_unescaped_str.data() + parsed.host.begin, parsed.host.len);

	316

	317 base::StringPiece host_without_end_dots =

	318 base::TrimString(host, ".", base::TrimPositions::TRIM_ALL);

	319

	320 // 4. In hostname, replace consecutive dots with a single dot.

	321 std::string host_without_consecutive_dots(RemoveConsecutiveChars(

	322 host_without_end_dots, '.'));

	323

	324 // 5. In path, replace runs of consecutive slashes with a single slash.

	325 base::StringPiece path;

	326 if (parsed.path.len > 0)

	327 path.set(url_unescaped_str.data() + parsed.path.begin, parsed.path.len);

	328 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));

	329

	330 url::Replacements<char> hp_replacements;

	331 hp_replacements.SetHost(

	332 host_without_consecutive_dots.data(),

	333 url::Component(0, host_without_consecutive_dots.length()));

	334 hp_replacements.SetPath(

	335 path_without_consecutive_slash.data(),

	336 url::Component(0, path_without_consecutive_slash.length()));

	337

	338 std::string url_unescaped_with_can_hostpath;

	339 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);

	340 url::Parsed temp_parsed;

	341 url::ReplaceComponents(url_unescaped_str.data(),

	342 url_unescaped_str.length(),

	343 parsed,

	344 hp_replacements,

	345 NULL,

	346 &output,

	347 &temp_parsed);

	348 output.Complete();

	349

	350 // 6. Step needed to revert escaping done in url::ReplaceComponents.

	351 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);

	352

	353 // 7. After performing all above steps, percent-escape all chars in url which

	354 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.

	355 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));

	356 url::Parsed final_parsed;

	357 url::ParseStandardURL(escaped_canon_url_str.data(),

	358 escaped_canon_url_str.length(),

	359 &final_parsed);

	360

	361 if (canonicalized_hostname && final_parsed.host.len > 0) {

	362 *canonicalized_hostname =

	363 escaped_canon_url_str.substr(final_parsed.host.begin,

	364 final_parsed.host.len);

	365 }

	366 if (canonicalized_path && final_parsed.path.len > 0) {

	367 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,

	368 final_parsed.path.len);

	369 }

	370 if (canonicalized_query && final_parsed.query.len > 0) {

	371 *canonicalized_query = escaped_canon_url_str.substr(

	372 final_parsed.query.begin, final_parsed.query.len);

	373 }

	374 }

	375

157 void UrlToFullHashes(const GURL& url,	376 void UrlToFullHashes(const GURL& url,

158 bool include_whitelist_hashes,	377 bool include_whitelist_hashes,

159 std::vector<SBFullHash>* full_hashes) {	378 std::vector<SBFullHash>* full_hashes) {

160 // Include this function in traces because it's not cheap so it should be	379 // Include this function in traces because it's not cheap so it should be

161 // called sparingly.	380 // called sparingly.

162 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),	381 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),

163 "include_whitelist_hashes", include_whitelist_hashes);	382 "include_whitelist_hashes", include_whitelist_hashes);

164 std::string canon_host;	383 std::string canon_host;

165 std::string canon_path;	384 std::string canon_path;

166 std::string canon_query;	385 std::string canon_query;

167 V4ProtocolManagerUtil::CanonicalizeUrl(url, &canon_host, &canon_path,	386 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

168 &canon_query);

169	387

170 std::vector<std::string> hosts;	388 std::vector<std::string> hosts;

171 if (url.HostIsIPAddress()) {	389 if (url.HostIsIPAddress()) {

172 hosts.push_back(url.host());	390 hosts.push_back(url.host());

173 } else {	391 } else {

174 V4ProtocolManagerUtil::GenerateHostVariantsToCheck(canon_host, &hosts);	392 GenerateHostVariantsToCheck(canon_host, &hosts);

175 }	393 }

176	394

177 std::vector<std::string> paths;	395 std::vector<std::string> paths;

178 V4ProtocolManagerUtil::GeneratePathVariantsToCheck(canon_path, canon_query,	396 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);

179 &paths);

180	397

181 for (const std::string& host : hosts) {	398 for (const std::string& host : hosts) {

182 for (const std::string& path : paths) {	399 for (const std::string& path : paths) {

183 full_hashes->push_back(	400 full_hashes->push_back(

184 SBFullHashForString(host + path));	401 SBFullHashForString(host + path));

185	402

186 // We may have /foo as path-prefix in the whitelist which should	403 // We may have /foo as path-prefix in the whitelist which should

187 // also match with /foo/bar and /foo?bar. Hence, for every path	404 // also match with /foo/bar and /foo?bar. Hence, for every path

188 // that ends in '/' we also add the path without the slash.	405 // that ends in '/' we also add the path without the slash.

189 if (include_whitelist_hashes && path.size() > 1 && path.back() == '/') {	406 if (include_whitelist_hashes && path.size() > 1 && path.back() == '/') {

190 full_hashes->push_back(SBFullHashForString(	407 full_hashes->push_back(SBFullHashForString(

191 host + path.substr(0, path.size() - 1)));	408 host + path.substr(0, path.size() - 1)));

192 }	409 }

193 }	410 }

194 }	411 }

195 }	412 }

196	413

	414 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {

	415 std::string canon_host;

	416 CanonicalizeUrl(url, &canon_host, NULL, NULL);

	417 GenerateHostVariantsToCheck(canon_host, hosts);

	418 }

	419

	420 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {

	421 std::string canon_path;

	422 std::string canon_query;

	423 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);

	424 GeneratePathVariantsToCheck(canon_path, canon_query, paths);

	425 }

	426

	427 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {

	428 std::string canon_host;

	429 std::string canon_path;

	430 std::string canon_query;

	431 CanonicalizeUrl(url, &canon_host, &canon_path, &canon_query);

	432

	433 std::vector<std::string> hosts, paths;

	434 GenerateHostVariantsToCheck(canon_host, &hosts);

	435 GeneratePathVariantsToCheck(canon_path, canon_query, &paths);

	436 for (size_t h = 0; h < hosts.size(); ++h) {

	437 for (size_t p = 0; p < paths.size(); ++p) {

	438 urls->push_back(hosts[h] + paths[p]);

	439 }

	440 }

	441 }

	442

197 } // namespace safe_browsing	443 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « components/safe_browsing_db/util.h ('k') | components/safe_browsing_db/util_unittest.cc » ('j') | no next file with comments »