chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 7866011: Switch to the new client-side phishing model that uses Murmurhash for word hashes.

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7866011: Switch to the new client-side phishing model that uses Murmurhash for word hashes. (Closed) | Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fix compile problems and add another test | Created 9 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

index 404a0b3b71677cdf9eb589cdf37686eb96ee47dc..ff3a50e82d3e0fde57cec9adae9a7dcae81e6b7a 100644

--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

@@ -17,6 +17,7 @@

#include "crypto/sha2.h"

#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

#include "chrome/renderer/safe_browsing/features.h"

+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"

#include "ui/base/l10n/l10n_util.h"

#include "unicode/ubrk.h"

@@ -40,7 +41,7 @@ const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;

// All of the state pertaining to the current feature extraction.

struct PhishingTermFeatureExtractor::ExtractionState {

- // Stores up to max_words_per_ngram_ previous words separated by spaces.

+ // Stores up to max_words_per_term_ previous words separated by spaces.

std::string previous_words;

// Stores the sizes of the words in previous_words. Note: the size includes

@@ -90,12 +91,14 @@ struct PhishingTermFeatureExtractor::ExtractionState {

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

const base::hash_set<std::string>* page_term_hashes,

- const base::hash_set<std::string>* page_word_hashes,

+ const base::hash_set<uint32>* page_word_hashes,

size_t max_words_per_term,

+ uint32 murmurhash3_seed,

FeatureExtractorClock* clock)

: page_term_hashes_(page_term_hashes),

page_word_hashes_(page_word_hashes),

max_words_per_term_(max_words_per_term),

+ murmurhash3_seed_(murmurhash3_seed),

negative_word_cache_(kMaxNegativeWordCacheSize),

clock_(clock),

ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {

@@ -206,8 +209,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {

void PhishingTermFeatureExtractor::HandleWord(

const base::StringPiece16& word) {

// Quickest out if we have seen this word before and know that it's not

- // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,

- // all of which are relatively expensive.

+ // part of any term. This avoids the lowercasing and UTF conversion, both of

+ // which are relatively expensive.

if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {

// We know we're no longer in a possible n-gram, so clear the previous word

// state.

@@ -217,7 +220,7 @@ void PhishingTermFeatureExtractor::HandleWord(

}

std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));

- std::string word_hash = crypto::SHA256HashString(word_lower);

+ uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

// Quick out if the word is not part of any term, which is the common case.

if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {

@@ -229,11 +232,11 @@ void PhishingTermFeatureExtractor::HandleWord(

return;

}

- // Find all of the n-grams that we need to check and compute their hashes.

- // We already have the hash for word_lower, so we don't compute that again.

+ // Find all of the n-grams that we need to check and compute their SHA-256

+ // hashes.

std::map<std::string /* hash */, std::string /* plaintext */>

hashes_to_check;

- hashes_to_check[word_hash] = word_lower;

+ hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

// Combine the new word with the previous words to find additional n-grams.

// Note that we don't yet add the new word length to previous_word_sizes,