Index: chrome/browser/safe_browsing/browser_feature_extractor.cc |
diff --git a/chrome/browser/safe_browsing/browser_feature_extractor.cc b/chrome/browser/safe_browsing/browser_feature_extractor.cc |
index 9cf4770b28bf8fd6109a86bf07ca89be8170bebf..69415a54517a12616ebe1381d1ba30bfb2815deb 100644 |
--- a/chrome/browser/safe_browsing/browser_feature_extractor.cc |
+++ b/chrome/browser/safe_browsing/browser_feature_extractor.cc |
@@ -9,6 +9,7 @@ |
#include "base/stl_util.h" |
#include "base/stringprintf.h" |
+#include "base/string_util.h" |
#include "base/task.h" |
#include "base/time.h" |
#include "chrome/common/safe_browsing/csd.pb.h" |
@@ -27,7 +28,7 @@ |
namespace safe_browsing { |
-const int BrowserFeatureExtractor::kSuffixPrefixHashLength = 5; |
+const int BrowserFeatureExtractor::kHashPrefixLength = 5; |
BrowseInfo::BrowseInfo() {} |
@@ -453,9 +454,27 @@ void BrowserFeatureExtractor::ComputeURLHash( |
&host, &path, &query); |
DCHECK(!host.empty()) << request->url(); |
DCHECK(!path.empty()) << request->url(); |
- request->set_suffix_prefix_hash( |
- crypto::SHA256HashString(host + path).substr( |
- 0, kSuffixPrefixHashLength)); |
+ |
+ // Lowercase the URL. Note: canonicalization converts the URL to ASCII. |
+ // Percent encoded characters will not be lowercased but this is consistent |
+ // with what we're doing on the server side. |
+ StringToLowerASCII(&host); |
+ StringToLowerASCII(&path); |
+ |
+ // Remove leading 'www.' from the host. |
+ if (host.size() > 4 && host.substr(0, 4) == "www.") { |
+ host.erase(0, 4); |
+ } |
+ // Remove everything after the last '/' to broaden the pattern. |
+ if (path.size() > 1 && *(path.rbegin()) != '/') { |
+ // The pattern never ends in foo.com/test? because we stripped CGI params. |
+ // Remove everything that comes after the last '/'. |
+ size_t last_path = path.rfind("/"); |
+ path.erase(last_path + 1); |
+ } |
+ |
+ request->set_hash_prefix(crypto::SHA256HashString(host + path).substr( |
+ 0, kHashPrefixLength)); |
} |
}; // namespace safe_browsing |