Chromium Code Reviews| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
| index cef13ca7df8a370168fa9b1c2c15c90d523e42c4..b07376d696f3a0fa2fe8f6dbb87a7cf08aba51d9 100644 |
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
| @@ -16,6 +16,7 @@ |
| #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| +#include <set> |
| #include <string> |
| #include "base/basictypes.h" |
| @@ -47,6 +48,11 @@ class PhishingTermFeatureExtractor { |
| // must ensure that they are valid until the PhishingTermFeatureExtractor is |
| // destroyed. |
| // |
| + // In addition to extracting page terms, we also extract a text shingling |
| + // sketch, which consists of hashes of N-gram-words (referred to as shingles) |
| + // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines |
| + // the maximum number of unique shingle hashes we extract per page. |
| + // |
| // |clock| is used for timing feature extractor operations, and may be mocked |
| // for testing. The caller keeps ownership of the clock. |
| PhishingTermFeatureExtractor( |
| @@ -54,6 +60,8 @@ class PhishingTermFeatureExtractor { |
| const base::hash_set<uint32>* page_word_hashes, |
| size_t max_words_per_term, |
| uint32 murmurhash3_seed, |
| + size_t max_shingles_per_page, |
| + size_t shingle_size, |
| FeatureExtractorClock* clock); |
| ~PhishingTermFeatureExtractor(); |
| @@ -72,6 +80,7 @@ class PhishingTermFeatureExtractor { |
| // CancelPendingExtraction() is called. |
|
noelutz
2014/05/06 21:40:18
nit: mention ownership of shingle_hashes as well?
zysxqn
2014/05/07 19:29:19
Done.
|
| void ExtractFeatures(const base::string16* page_text, |
| FeatureMap* features, |
| + std::set<uint32>* shingle_hashes, |
| const DoneCallback& done_callback); |
| // Cancels any pending feature extraction. The DoneCallback will not be run. |
| @@ -135,6 +144,12 @@ class PhishingTermFeatureExtractor { |
| // The seed for murmurhash3. |
| const uint32 murmurhash3_seed_; |
| + // The maximum number of unique shingle hashes we extract in a page. |
| + const size_t max_shingles_per_page_; |
| + |
| + // The number of words in a shingle. |
| + const size_t shingle_size_; |
| + |
| // This cache is used to see if we need to check the word at all, as |
| // converting to UTF8, lowercasing, and hashing are all relatively expensive |
| // operations. Though this is called an MRU cache, it seems to behave like |
| @@ -148,6 +163,7 @@ class PhishingTermFeatureExtractor { |
| // The output parameters from the most recent call to ExtractFeatures(). |
| const base::string16* page_text_; // The caller keeps ownership of this. |
| FeatureMap* features_; // The caller keeps ownership of this. |
| + std::set<uint32>* shingle_hashes_; |
| DoneCallback done_callback_; |
| // Stores the current state of term extraction from |page_text_|. |