chrome/browser/safe_browsing/safe_browsing_util.cc - Issue 1275002: Canonicalize the url based on Section 6.1 Safe Browsing Spec. Also fix the un...

Unified Diff: chrome/browser/safe_browsing/safe_browsing_util.cc

Issue 1275002: Canonicalize the url based on Section 6.1 Safe Browsing Spec. Also fix the un... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 10 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/browser/safe_browsing/safe_browsing_util.h ('k') | chrome/browser/safe_browsing/safe_browsing_util_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/safe_browsing/safe_browsing_util.cc

===================================================================

--- chrome/browser/safe_browsing/safe_browsing_util.cc (revision 43057)

+++ chrome/browser/safe_browsing/safe_browsing_util.cc (working copy)

@@ -1,4 +1,4 @@

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

@@ -10,6 +10,7 @@

#include "base/string_util.h"

#include "chrome/browser/google_util.h"

#include "googleurl/src/gurl.h"

+#include "googleurl/src/url_util.h"

#include "net/base/escape.h"

#include "unicode/locid.h"

@@ -161,9 +162,144 @@

return (list_id == PHISH) ? kPhishingList : std::string();

}

+std::string Unescape(const std::string& url) {

+ std::string unescaped_str(url);

+ std::string old_unescaped_str;

+ const int kMaxLoopIterations = 1024;

+ int loop_var = 0;

+ do {

+ old_unescaped_str = unescaped_str;

+ unescaped_str = UnescapeURLComponent(old_unescaped_str,

+ UnescapeRule::CONTROL_CHARS | UnescapeRule::SPACES |

+ UnescapeRule::URL_SPECIAL_CHARS);

+ } while (unescaped_str != old_unescaped_str && ++loop_var <=

+ kMaxLoopIterations);

+ return unescaped_str;

+std::string Escape(const std::string& url) {

+ std::string escaped_str;

+ const char* kHexString = "0123456789ABCDEF";

+ for (size_t i = 0; i < url.length(); i++) {

+ unsigned char c = static_cast<unsigned char>(url[i]);

+ if (c <= ' ' || c > '~' || c == '#' || c == '%') {

+ escaped_str.push_back('%');

+ escaped_str.push_back(kHexString[c >> 4]);

+ escaped_str.push_back(kHexString[c & 0xf]);

+ } else {

+ escaped_str.push_back(c);

+ }

+ return escaped_str;

+std::string RemoveConsecutiveChars(const std::string& str, const char c) {

+ std::string output(str);

+ std::string string_to_find;

+ std::string::size_type loc = 0;

+ string_to_find.append(2, c);

+ while ((loc = output.find(string_to_find, loc)) != std::string::npos) {

+ output.erase(loc, 1);

+ }

+ return output;

+// Canonicalizes url as per Google Safe Browsing Specification.

+// See section 6.1 in

+// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.

+void CanonicalizeUrl(const GURL& url,

+ std::string* canonicalized_hostname,

+ std::string* canonicalized_path,

+ std::string* canonicalized_query) {

+ // The following canonicalization steps are skipped here because URL parsing

+ // already takes care of them:

+ // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.

+ // (Exclude escaped version of these chars).

+ // 2. Normalize hostname to 4 dot-separated decimal values.

+ // 3. Lowercase hostname.

+ // 4. Resolve path sequences "/../" and "/./".

+ // That leaves us with the following :-

+ // 1. Remove the fragment, username, and password from the URL.

+ GURL url_without_fragment;

+ GURL::Replacements f_replacements;

+ f_replacements.ClearRef();

+ f_replacements.ClearUsername();

+ f_replacements.ClearPassword();

+ url_without_fragment = url.ReplaceComponents(f_replacements);

+ // 2. Do URL unescaping until no more hex encoded characters exist.

+ std::string url_unescaped_str(Unescape(url_without_fragment.spec()));

+ url_parse::Parsed parsed;

+ url_parse::ParseStandardURL(url_unescaped_str.data(),

+ url_unescaped_str.length(), &parsed);

+ // 3. In hostname, remove all leading and trailing dots.

+ const std::string host = (parsed.host.len > 0) ? url_unescaped_str.substr(

+ parsed.host.begin, parsed.host.len) : "";

+ const char kCharsToTrim[] = ".";

+ std::string host_without_end_dots;

+ TrimString(host, kCharsToTrim, &host_without_end_dots);

+ // 4. In hostname, replace consecutive dots with a single dot.

+ std::string host_without_consecutive_dots(RemoveConsecutiveChars(

+ host_without_end_dots, '.'));

+ // 5. In path, replace runs of consecutive slashes with a single slash.

+ std::string path = (parsed.path.len > 0) ? url_unescaped_str.substr(

+ parsed.path.begin, parsed.path.len): "";

+ std::string path_without_consecutive_slash(RemoveConsecutiveChars(

+ path, '/'));

+ url_canon::Replacements<char> hp_replacements;

+ hp_replacements.SetHost(host_without_consecutive_dots.data(),

+ url_parse::Component(0, host_without_consecutive_dots.length()));

+ hp_replacements.SetPath(path_without_consecutive_slash.data(),

+ url_parse::Component(0, path_without_consecutive_slash.length()));

+ std::string url_unescaped_with_can_hostpath;

+ url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);

+ url_parse::Parsed temp_parsed;

+ url_util::ReplaceComponents(url_unescaped_str.data(),

+ url_unescaped_str.length(), parsed,

+ hp_replacements, NULL, &output, &temp_parsed);

+ output.Complete();

+ // 6. Step needed to revert escaping done in url_util::ReplaceComponents.

+ url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);

+ // 7. After performing all above steps, percent-escape all chars in url which

+ // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.

+ std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));

+ url_parse::Parsed final_parsed;

+ url_parse::ParseStandardURL(escaped_canon_url_str.data(),

+ escaped_canon_url_str.length(), &final_parsed);

+ if (canonicalized_hostname && final_parsed.host.len > 0) {

+ *canonicalized_hostname =

+ escaped_canon_url_str.substr(final_parsed.host.begin,

+ final_parsed.host.len);

+ }

+ if (canonicalized_path && final_parsed.path.len > 0) {

+ *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,

+ final_parsed.path.len);

+ }

+ if (canonicalized_query && final_parsed.query.len > 0) {

+ *canonicalized_query = escaped_canon_url_str.substr(

+ final_parsed.query.begin, final_parsed.query.len);

+ }

void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {

hosts->clear();

- const std::string host = url.host(); // const sidesteps GCC bugs below!

+ std::string canon_host;

+ CanonicalizeUrl(url, &canon_host, NULL, NULL);

+ const std::string host = canon_host; // const sidesteps GCC bugs below!

if (host.empty())

return;

@@ -196,7 +332,13 @@

void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {

paths->clear();

- const std::string path = url.path(); // const sidesteps GCC bugs below!

+ std::string canon_path;

+ std::string canon_query;

+ CanonicalizeUrl(url, NULL, &canon_path, &canon_query);

+ const std::string path = canon_path; // const sidesteps GCC bugs below!

+ const std::string query = canon_query;

if (path.empty())

return;

@@ -215,8 +357,8 @@

if (paths->back() != path)

paths->push_back(path);

- if (url.has_query())

- paths->push_back(path + "?" + url.query());

+ if (!query.empty())

+ paths->push_back(path + "?" + query);

}

int CompareFullHashes(const GURL& url,

« no previous file with comments | « chrome/browser/safe_browsing/safe_browsing_util.h ('k') | chrome/browser/safe_browsing/safe_browsing_util_unittest.cc » ('j') | no next file with comments »