| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
| index d10b575435f9d11bced1bb5a24cffcf213d50f01..4fb26c1db3f62db50a24f6d32fe65f8effcf8370 100644
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
| @@ -41,8 +41,8 @@ class PhishingTermFeatureExtractor {
|    // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
|    // terms may be multi-word n-grams, with at most |max_words_per_term| words.
|    //
| -  // |page_word_hashes| contains the hashes for all of the individual words
| -  // that make up the terms. Both sets of strings are UTF-8 encoded and
| +  // |page_word_hashes| contains the murmur3 hashes for all of the individual
| +  // words that make up the terms. Both sets of strings are UTF-8 encoded and
|    // lowercased prior to hashing. The caller owns both sets of strings, and
|    // must ensure that they are valid until the PhishingTermFeatureExtractor is
|    // destroyed.
| @@ -51,8 +51,9 @@ class PhishingTermFeatureExtractor {
|    // for testing. The caller keeps ownership of the clock.
|    PhishingTermFeatureExtractor(
|        const base::hash_set<std::string>* page_term_hashes,
| -      const base::hash_set<std::string>* page_word_hashes,
| +      const base::hash_set<uint32>* page_word_hashes,
|        size_t max_words_per_term,
| +      uint32 murmurhash3_seed,
|        FeatureExtractorClock* clock);
|    ~PhishingTermFeatureExtractor();
|
| @@ -121,15 +122,18 @@ class PhishingTermFeatureExtractor {
|    // All of the term hashes that we are looking for in the page.
|    const base::hash_set<std::string>* page_term_hashes_;
|
| -  // Hashes of all the individual words in page_term_hashes_. If
| +  // Murmur3 hashes of all the individual words in page_term_hashes_. If
|    // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
|    // would contain (hashed) "one" and "two". We do this so that we can have a
|    // quick out in the common case that the current word we are processing
|    // doesn't contain any part of one of our terms.
| -  const base::hash_set<std::string>* page_word_hashes_;
| +  const base::hash_set<uint32>* page_word_hashes_;
|
|    // The maximum number of words in an n-gram.
| -  size_t max_words_per_term_;
| +  const size_t max_words_per_term_;
| +
| +  // The seed for murmurhash3.
| +  const uint32 murmurhash3_seed_;
|
|    // This cache is used to see if we need to check the word at all, as
|    // converting to UTF8, lowercasing, and hashing are all relatively expensive
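To make the change above concrete, here is a rough, self-contained sketch of the per-word "quick out" that the header comments describe: each lowercased UTF-8 word is hashed with the seeded 32-bit MurmurHash3, and only words whose hashes appear in the caller-supplied word-hash set are worth considering as part of a term. This is not code from the patch: std::unordered_set<uint32_t> stands in for base::hash_set<uint32>, WordMightBePartOfTerm is a hypothetical helper name, and the hash routine follows the public MurmurHash3_x86_32 reference algorithm rather than Chromium's third_party copy.

#include <cstdint>
#include <cstring>
#include <string>
#include <unordered_set>

// Seeded 32-bit MurmurHash3, following Austin Appleby's public reference
// algorithm (used here as a stand-in for Chromium's third_party copy).
uint32_t Murmur3Hash32(const void* key, size_t len, uint32_t seed) {
  const uint8_t* data = static_cast<const uint8_t*>(key);
  const size_t nblocks = len / 4;
  uint32_t h1 = seed;
  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  // Body: mix each 4-byte block into the running hash.
  for (size_t i = 0; i < nblocks; ++i) {
    uint32_t k1;
    std::memcpy(&k1, data + i * 4, 4);
    k1 *= c1;
    k1 = (k1 << 15) | (k1 >> 17);  // rotate left 15
    k1 *= c2;
    h1 ^= k1;
    h1 = (h1 << 13) | (h1 >> 19);  // rotate left 13
    h1 = h1 * 5 + 0xe6546b64;
  }

  // Tail: fold in the remaining 0-3 bytes.
  const uint8_t* tail = data + nblocks * 4;
  uint32_t k1 = 0;
  switch (len & 3) {
    case 3: k1 ^= static_cast<uint32_t>(tail[2]) << 16;  // fall through
    case 2: k1 ^= static_cast<uint32_t>(tail[1]) << 8;   // fall through
    case 1: k1 ^= tail[0];
            k1 *= c1;
            k1 = (k1 << 15) | (k1 >> 17);
            k1 *= c2;
            h1 ^= k1;
  }

  // Finalization: avalanche the bits.
  h1 ^= static_cast<uint32_t>(len);
  h1 ^= h1 >> 16;
  h1 *= 0x85ebca6b;
  h1 ^= h1 >> 13;
  h1 *= 0xc2b2ae35;
  h1 ^= h1 >> 16;
  return h1;
}

// Hypothetical helper mirroring the "quick out" described in the header
// comments: a word can only be part of a term if its seeded murmur3 hash
// appears in the word-hash set supplied by the caller.
bool WordMightBePartOfTerm(
    const std::string& lowercased_utf8_word,
    const std::unordered_set<uint32_t>& page_word_hashes,
    uint32_t murmurhash3_seed) {
  const uint32_t hash = Murmur3Hash32(lowercased_utf8_word.data(),
                                      lowercased_utf8_word.size(),
                                      murmurhash3_seed);
  return page_word_hashes.count(hash) > 0;
}

A caller would populate the word-hash set by hashing every individual word of every term with the same seed it passes as murmurhash3_seed, so the extractor can skip the more expensive term lookups in the common case where a word is not part of any term.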