Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
index 404a0b3b71677cdf9eb589cdf37686eb96ee47dc..ff3a50e82d3e0fde57cec9adae9a7dcae81e6b7a 100644 |
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
@@ -17,6 +17,7 @@ |
#include "crypto/sha2.h" |
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
#include "chrome/renderer/safe_browsing/features.h" |
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
#include "ui/base/l10n/l10n_util.h" |
#include "unicode/ubrk.h" |
@@ -40,7 +41,7 @@ const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; |
// All of the state pertaining to the current feature extraction. |
struct PhishingTermFeatureExtractor::ExtractionState { |
- // Stores up to max_words_per_ngram_ previous words separated by spaces. |
+ // Stores up to max_words_per_term_ previous words separated by spaces. |
std::string previous_words; |
// Stores the sizes of the words in previous_words. Note: the size includes |
@@ -90,12 +91,14 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
const base::hash_set<std::string>* page_term_hashes, |
- const base::hash_set<std::string>* page_word_hashes, |
+ const base::hash_set<uint32>* page_word_hashes, |
size_t max_words_per_term, |
+ uint32 murmurhash3_seed, |
FeatureExtractorClock* clock) |
: page_term_hashes_(page_term_hashes), |
page_word_hashes_(page_word_hashes), |
max_words_per_term_(max_words_per_term), |
+ murmurhash3_seed_(murmurhash3_seed), |
negative_word_cache_(kMaxNegativeWordCacheSize), |
clock_(clock), |
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
@@ -206,8 +209,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
void PhishingTermFeatureExtractor::HandleWord( |
const base::StringPiece16& word) { |
// Quickest out if we have seen this word before and know that it's not |
- // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, |
- // all of which are relatively expensive. |
+ // part of any term. This avoids the lowercasing and UTF conversion, both of |
+ // which are relatively expensive. |
if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { |
// We know we're no longer in a possible n-gram, so clear the previous word |
// state. |
@@ -217,7 +220,7 @@ void PhishingTermFeatureExtractor::HandleWord( |
} |
std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
- std::string word_hash = crypto::SHA256HashString(word_lower); |
+ uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
// Quick out if the word is not part of any term, which is the common case. |
if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
@@ -229,11 +232,11 @@ void PhishingTermFeatureExtractor::HandleWord( |
return; |
} |
- // Find all of the n-grams that we need to check and compute their hashes. |
- // We already have the hash for word_lower, so we don't compute that again. |
+ // Find all of the n-grams that we need to check and compute their SHA-256 |
+ // hashes. |
std::map<std::string /* hash */, std::string /* plaintext */> |
hashes_to_check; |
- hashes_to_check[word_hash] = word_lower; |
+ hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower; |
// Combine the new word with the previous words to find additional n-grams. |
// Note that we don't yet add the new word length to previous_word_sizes, |