OLD | NEW |
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h" | 5 #include "chrome/browser/safe_browsing/safe_browsing_util.h" |
6 | 6 |
7 #include "base/base64.h" | 7 #include "base/base64.h" |
8 #include "base/hmac.h" | 8 #include "base/hmac.h" |
9 #include "base/sha2.h" | 9 #include "base/sha2.h" |
10 #include "base/string_util.h" | 10 #include "base/string_util.h" |
11 #include "chrome/browser/google_util.h" | 11 #include "chrome/browser/google_util.h" |
12 #include "googleurl/src/gurl.h" | 12 #include "googleurl/src/gurl.h" |
| 13 #include "googleurl/src/url_util.h" |
13 #include "net/base/escape.h" | 14 #include "net/base/escape.h" |
14 #include "unicode/locid.h" | 15 #include "unicode/locid.h" |
15 | 16 |
16 #if defined(OS_WIN) | 17 #if defined(OS_WIN) |
17 #include "chrome/installer/util/browser_distribution.h" | 18 #include "chrome/installer/util/browser_distribution.h" |
18 #endif | 19 #endif |
19 | 20 |
20 static const int kSafeBrowsingMacDigestSize = 20; | 21 static const int kSafeBrowsingMacDigestSize = 20; |
21 | 22 |
22 // Continue to this URL after submitting the phishing report form. | 23 // Continue to this URL after submitting the phishing report form. |
(...skipping 131 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
154 return MALWARE; | 155 return MALWARE; |
155 return (name == kPhishingList) ? PHISH : INVALID; | 156 return (name == kPhishingList) ? PHISH : INVALID; |
156 } | 157 } |
157 | 158 |
158 std::string GetListName(int list_id) { | 159 std::string GetListName(int list_id) { |
159 if (list_id == MALWARE) | 160 if (list_id == MALWARE) |
160 return kMalwareList; | 161 return kMalwareList; |
161 return (list_id == PHISH) ? kPhishingList : std::string(); | 162 return (list_id == PHISH) ? kPhishingList : std::string(); |
162 } | 163 } |
163 | 164 |
| 165 std::string Unescape(const std::string& url) { |
| 166 std::string unescaped_str(url); |
| 167 std::string old_unescaped_str; |
| 168 const int kMaxLoopIterations = 1024; |
| 169 int loop_var = 0; |
| 170 do { |
| 171 old_unescaped_str = unescaped_str; |
| 172 unescaped_str = UnescapeURLComponent(old_unescaped_str, |
| 173 UnescapeRule::CONTROL_CHARS | UnescapeRule::SPACES | |
| 174 UnescapeRule::URL_SPECIAL_CHARS); |
| 175 } while (unescaped_str != old_unescaped_str && ++loop_var <= |
| 176 kMaxLoopIterations); |
| 177 |
| 178 return unescaped_str; |
| 179 } |
| 180 |
// Percent-escapes every byte of |url| that is <= ASCII 32 (space), >= ASCII
// 127, '#' or '%'. Escapes are written as '%' followed by two uppercase hex
// digits; all other bytes are copied through unchanged.
std::string Escape(const std::string& url) {
  static const char kHexDigits[] = "0123456789ABCDEF";

  std::string escaped_str;
  // The result is never shorter than the input, so reserving the input length
  // removes most of the incremental reallocations done by push_back().
  escaped_str.reserve(url.length());

  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare and index correctly
    // even where plain char is signed.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexDigits[c >> 4]);
      escaped_str.push_back(kHexDigits[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }

  return escaped_str;
}
| 197 |
// Returns a copy of |str| in which every run of two or more consecutive
// occurrences of |c| is collapsed into a single |c|. All other characters are
// copied through unchanged.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.length());

  // Single pass: a |c| is dropped only when the character most recently kept
  // is also |c|. This avoids repeatedly erasing from the middle of the string,
  // which is quadratic on long runs.
  for (std::string::size_type i = 0; i < str.length(); ++i) {
    const char current = str[i];
    if (current == c && !output.empty() &&
        output[output.length() - 1] == c) {
      continue;
    }
    output.push_back(current);
  }

  return output;
}
| 209 |
| 210 // Canonicalizes url as per Google Safe Browsing Specification. |
| 211 // See section 6.1 in |
| 212 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec. |
| 213 void CanonicalizeUrl(const GURL& url, |
| 214 std::string* canonicalized_hostname, |
| 215 std::string* canonicalized_path, |
| 216 std::string* canonicalized_query) { |
| 217 // Following canonicalization steps are excluded since url parsing takes care |
| 218 // of those :- |
| 219 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url. |
| 220 // (Exclude escaped version of these chars). |
| 221 // 2. Normalize hostname to 4 dot-seperated decimal values. |
| 222 // 3. Lowercase hostname. |
| 223 // 4. Resolve path sequences "/../" and "/./". |
| 224 |
| 225 // That leaves us with the following :- |
| 226 // 1. Remove fragment in URL. |
| 227 GURL url_without_fragment; |
| 228 GURL::Replacements f_replacements; |
| 229 f_replacements.ClearRef(); |
| 230 f_replacements.ClearUsername(); |
| 231 f_replacements.ClearPassword(); |
| 232 url_without_fragment = url.ReplaceComponents(f_replacements); |
| 233 |
| 234 // 2. Do URL unescaping until no more hex encoded characters exist. |
| 235 std::string url_unescaped_str(Unescape(url_without_fragment.spec())); |
| 236 url_parse::Parsed parsed; |
| 237 url_parse::ParseStandardURL(url_unescaped_str.data(), |
| 238 url_unescaped_str.length(), &parsed); |
| 239 |
| 240 // 3. In hostname, remove all leading and trailing dots. |
| 241 const std::string host = (parsed.host.len > 0) ? url_unescaped_str.substr( |
| 242 parsed.host.begin, parsed.host.len) : ""; |
| 243 const char kCharsToTrim[] = "."; |
| 244 std::string host_without_end_dots; |
| 245 TrimString(host, kCharsToTrim, &host_without_end_dots); |
| 246 |
| 247 // 4. In hostname, replace consecutive dots with a single dot. |
| 248 std::string host_without_consecutive_dots(RemoveConsecutiveChars( |
| 249 host_without_end_dots, '.')); |
| 250 |
| 251 // 5. In path, replace runs of consecutive slashes with a single slash. |
| 252 std::string path = (parsed.path.len > 0) ? url_unescaped_str.substr( |
| 253 parsed.path.begin, parsed.path.len): ""; |
| 254 std::string path_without_consecutive_slash(RemoveConsecutiveChars( |
| 255 path, '/')); |
| 256 |
| 257 url_canon::Replacements<char> hp_replacements; |
| 258 hp_replacements.SetHost(host_without_consecutive_dots.data(), |
| 259 url_parse::Component(0, host_without_consecutive_dots.length())); |
| 260 hp_replacements.SetPath(path_without_consecutive_slash.data(), |
| 261 url_parse::Component(0, path_without_consecutive_slash.length())); |
| 262 |
| 263 std::string url_unescaped_with_can_hostpath; |
| 264 url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath); |
| 265 url_parse::Parsed temp_parsed; |
| 266 url_util::ReplaceComponents(url_unescaped_str.data(), |
| 267 url_unescaped_str.length(), parsed, |
| 268 hp_replacements, NULL, &output, &temp_parsed); |
| 269 output.Complete(); |
| 270 |
| 271 // 6. Step needed to revert escaping done in url_util::ReplaceComponents. |
| 272 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath); |
| 273 |
| 274 // 7. After performing all above steps, percent-escape all chars in url which |
| 275 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters. |
| 276 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath)); |
| 277 url_parse::Parsed final_parsed; |
| 278 url_parse::ParseStandardURL(escaped_canon_url_str.data(), |
| 279 escaped_canon_url_str.length(), &final_parsed); |
| 280 |
| 281 if (canonicalized_hostname && final_parsed.host.len > 0) { |
| 282 *canonicalized_hostname = |
| 283 escaped_canon_url_str.substr(final_parsed.host.begin, |
| 284 final_parsed.host.len); |
| 285 } |
| 286 if (canonicalized_path && final_parsed.path.len > 0) { |
| 287 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin, |
| 288 final_parsed.path.len); |
| 289 } |
| 290 if (canonicalized_query && final_parsed.query.len > 0) { |
| 291 *canonicalized_query = escaped_canon_url_str.substr( |
| 292 final_parsed.query.begin, final_parsed.query.len); |
| 293 } |
| 294 } |
| 295 |
164 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { | 296 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { |
165 hosts->clear(); | 297 hosts->clear(); |
166 const std::string host = url.host(); // const sidesteps GCC bugs below! | 298 |
| 299 std::string canon_host; |
| 300 CanonicalizeUrl(url, &canon_host, NULL, NULL); |
| 301 |
| 302 const std::string host = canon_host; // const sidesteps GCC bugs below! |
167 if (host.empty()) | 303 if (host.empty()) |
168 return; | 304 return; |
169 | 305 |
170 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 | 306 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 |
171 // hostnames formed by starting with the last 5 components and successively | 307 // hostnames formed by starting with the last 5 components and successively |
172 // removing the leading component. The last component isn't examined alone, | 308 // removing the leading component. The last component isn't examined alone, |
173 // since it's the TLD or a subcomponent thereof. | 309 // since it's the TLD or a subcomponent thereof. |
174 // | 310 // |
175 // Note that we don't need to be clever about stopping at the "real" eTLD -- | 311 // Note that we don't need to be clever about stopping at the "real" eTLD -- |
176 // the data on the server side has been filtered to ensure it will not | 312 // the data on the server side has been filtered to ensure it will not |
(...skipping 12 matching lines...) Expand all Loading... |
189 hosts->push_back(std::string(i.base(), host.end())); | 325 hosts->push_back(std::string(i.base(), host.end())); |
190 else | 326 else |
191 skipped_last_component = true; | 327 skipped_last_component = true; |
192 } | 328 } |
193 } | 329 } |
194 hosts->push_back(host); | 330 hosts->push_back(host); |
195 } | 331 } |
196 | 332 |
197 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { | 333 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { |
198 paths->clear(); | 334 paths->clear(); |
199 const std::string path = url.path(); // const sidesteps GCC bugs below! | 335 |
| 336 std::string canon_path; |
| 337 std::string canon_query; |
| 338 CanonicalizeUrl(url, NULL, &canon_path, &canon_query); |
| 339 |
| 340 const std::string path = canon_path; // const sidesteps GCC bugs below! |
| 341 const std::string query = canon_query; |
200 if (path.empty()) | 342 if (path.empty()) |
201 return; | 343 return; |
202 | 344 |
203 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without | 345 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without |
204 // the query parameters, and also up to 4 paths formed by starting at the root | 346 // the query parameters, and also up to 4 paths formed by starting at the root |
205 // and adding more path components. | 347 // and adding more path components. |
206 // | 348 // |
207 // As with the hosts above, it doesn't matter what order we check these in. | 349 // As with the hosts above, it doesn't matter what order we check these in. |
208 const size_t kMaxPathsToCheck = 4; | 350 const size_t kMaxPathsToCheck = 4; |
209 for (std::string::const_iterator i(path.begin()); | 351 for (std::string::const_iterator i(path.begin()); |
210 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { | 352 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { |
211 if (*i == '/') | 353 if (*i == '/') |
212 paths->push_back(std::string(path.begin(), i + 1)); | 354 paths->push_back(std::string(path.begin(), i + 1)); |
213 } | 355 } |
214 | 356 |
215 if (paths->back() != path) | 357 if (paths->back() != path) |
216 paths->push_back(path); | 358 paths->push_back(path); |
217 | 359 |
218 if (url.has_query()) | 360 if (!query.empty()) |
219 paths->push_back(path + "?" + url.query()); | 361 paths->push_back(path + "?" + query); |
220 } | 362 } |
221 | 363 |
222 int CompareFullHashes(const GURL& url, | 364 int CompareFullHashes(const GURL& url, |
223 const std::vector<SBFullHashResult>& full_hashes) { | 365 const std::vector<SBFullHashResult>& full_hashes) { |
224 if (full_hashes.empty()) | 366 if (full_hashes.empty()) |
225 return -1; | 367 return -1; |
226 | 368 |
227 std::vector<std::string> hosts, paths; | 369 std::vector<std::string> hosts, paths; |
228 GenerateHostsToCheck(url, &hosts); | 370 GenerateHostsToCheck(url, &hosts); |
229 GeneratePathsToCheck(url, &paths); | 371 GeneratePathsToCheck(url, &paths); |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
303 std::string client_name("googlechrome"); | 445 std::string client_name("googlechrome"); |
304 #endif | 446 #endif |
305 | 447 |
306 GURL report_url(report_page + | 448 GURL report_url(report_page + |
307 StringPrintf(kReportParams, client_name.c_str(), continue_esc.c_str(), | 449 StringPrintf(kReportParams, client_name.c_str(), continue_esc.c_str(), |
308 current_esc.c_str())); | 450 current_esc.c_str())); |
309 return google_util::AppendGoogleLocaleParam(report_url); | 451 return google_util::AppendGoogleLocaleParam(report_url); |
310 } | 452 } |
311 | 453 |
312 } // namespace safe_browsing_util | 454 } // namespace safe_browsing_util |
OLD | NEW |