Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
index cef13ca7df8a370168fa9b1c2c15c90d523e42c4..d5695d8a0811fd10c24587342eedb337dd507e13 100644 |
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
@@ -16,6 +16,7 @@ |
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
+#include <set> |
#include <string> |
#include "base/basictypes.h" |
@@ -47,13 +48,20 @@ class PhishingTermFeatureExtractor { |
// must ensure that they are valid until the PhishingTermFeatureExtractor is |
// destroyed. |
// |
+ // In addition to extracting page terms, we also extract a text shingling |
+ // sketch, which consists of hashes of word N-grams (referred to as shingles) |
+ // in the page. |shingle_size| defines N, and |max_hashes_per_page| defines |
+ // the maximum number of unique shingle hashes we extract per page. |
+ // |
// |clock| is used for timing feature extractor operations, and may be mocked |
// for testing. The caller keeps ownership of the clock. |
PhishingTermFeatureExtractor( |
const base::hash_set<std::string>* page_term_hashes, |
const base::hash_set<uint32>* page_word_hashes, |
+ size_t max_hashes_per_page, |
mattm
2014/05/06 01:00:14
Also move this in argument list next to shingle_si
mattm
2014/05/06 01:00:14
I would suggest max_shingles_per_page for this (an
zysxqn
2014/05/06 20:56:57
Done.
zysxqn
2014/05/06 20:56:57
Done.
|
size_t max_words_per_term, |
uint32 murmurhash3_seed, |
+ size_t shingle_size, |
FeatureExtractorClock* clock); |
~PhishingTermFeatureExtractor(); |
@@ -72,6 +80,7 @@ class PhishingTermFeatureExtractor { |
// CancelPendingExtraction() is called. |
void ExtractFeatures(const base::string16* page_text, |
FeatureMap* features, |
+ std::set<uint32>* shingle_hashes, |
const DoneCallback& done_callback); |
// Cancels any pending feature extraction. The DoneCallback will not be run. |
@@ -129,12 +138,18 @@ class PhishingTermFeatureExtractor { |
// doesn't contain any part of one of our terms. |
const base::hash_set<uint32>* page_word_hashes_; |
+ // The maximum number of unique shingle hashes we extract per page. |
+ const size_t max_hashes_per_page_; |
+ |
// The maximum number of words in an n-gram. |
const size_t max_words_per_term_; |
// The seed for murmurhash3. |
const uint32 murmurhash3_seed_; |
+ // The number of words in a shingle. |
+ const size_t shingle_size_; |
+ |
// This cache is used to see if we need to check the word at all, as |
// converting to UTF8, lowercasing, and hashing are all relatively expensive |
// operations. Though this is called an MRU cache, it seems to behave like |
@@ -148,6 +163,7 @@ class PhishingTermFeatureExtractor { |
// The output parameters from the most recent call to ExtractFeatures(). |
const base::string16* page_text_; // The caller keeps ownership of this. |
FeatureMap* features_; // The caller keeps ownership of this. |
+ std::set<uint32>* shingle_hashes_; |
DoneCallback done_callback_; |
// Stores the current state of term extraction from |page_text_|. |