Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..ac53d261d6ca68d7a4059e286cbc7596dde5f71f 100644 |
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
@@ -45,6 +45,14 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
// Stores up to max_words_per_term_ previous words separated by spaces. |
std::string previous_words; |
+ // Stores the current shingle after a new word is processed and added in. |
+ std::string current_shingle; |
+ |
+ // Stores the sizes of the words in current_shingle. Note: the size includes |
+ // the space after each word. In other words, the sum of all sizes in this |
+ // list is equal to the length of current_shingle. |
+ std::list<size_t> shingle_word_sizes; |
+ |
// Stores the sizes of the words in previous_words. Note: the size includes |
// the space after each word. In other words, the sum of all sizes in this |
// list is equal to the length of previous_words. |
@@ -93,13 +101,17 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
const base::hash_set<std::string>* page_term_hashes, |
const base::hash_set<uint32>* page_word_hashes, |
+ size_t max_hashes_per_page, |
size_t max_words_per_term, |
uint32 murmurhash3_seed, |
+ size_t shingle_size, |
FeatureExtractorClock* clock) |
: page_term_hashes_(page_term_hashes), |
page_word_hashes_(page_word_hashes), |
+ max_hashes_per_page_(max_hashes_per_page), |
max_words_per_term_(max_words_per_term), |
murmurhash3_seed_(murmurhash3_seed), |
+ shingle_size_(shingle_size), |
negative_word_cache_(kMaxNegativeWordCacheSize), |
clock_(clock), |
weak_factory_(this) { |
@@ -115,6 +127,7 @@ PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
void PhishingTermFeatureExtractor::ExtractFeatures( |
const base::string16* page_text, |
FeatureMap* features, |
+ std::set<uint32>* shingle_hashes, |
const DoneCallback& done_callback) { |
// The RenderView should have called CancelPendingExtraction() before |
// starting a new extraction, so DCHECK this. |
@@ -125,6 +138,7 @@ void PhishingTermFeatureExtractor::ExtractFeatures( |
page_text_ = page_text; |
features_ = features; |
+ shingle_hashes_ = shingle_hashes, |
done_callback_ = done_callback; |
state_.reset(new ExtractionState(*page_text_, clock_->Now())); |
@@ -210,6 +224,36 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
void PhishingTermFeatureExtractor::HandleWord( |
const base::StringPiece16& word) { |
+ // First, extract shingle hashes. We check the size of shingle_hashes_ first |
+ // to skip as soon as we reach |max_hashes_per_page_|. |
+ std::string word_lower; |
+ if (shingle_hashes_->size() < max_hashes_per_page_) { |
+ word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); |
+ if (state_->shingle_word_sizes.size() < shingle_size_ - 1) { |
+ // Adding |word| would not form a complete shingle, simply insert it. |
+ state_->current_shingle.append(word_lower + " "); |
+ state_->shingle_word_sizes.push_back(word_lower.size() + 1); |
+ } else if (state_->shingle_word_sizes.size() == shingle_size_- 1) { |
+ // Adding |word| would just form a complete shingle, insert it and |
+ // calculate the shingle hash. |
+ state_->current_shingle.append(word_lower + " "); |
+ state_->shingle_word_sizes.push_back(word_lower.size() + 1); |
+ shingle_hashes_->insert( |
+ MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); |
+ } else { |
+ // We need to remove the first word from current_shingle, and add |word| |
+ // at the end to form current_shingle. |
+ state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); |
+ state_->shingle_word_sizes.pop_front(); |
+ state_->current_shingle.append(word_lower + " "); |
+ state_->shingle_word_sizes.push_back(word_lower.size() + 1); |
+ shingle_hashes_->insert( |
+ MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); |
+ } |
mattm
2014/05/06 01:00:14
each of the cases here duplicates the code of the
zysxqn
2014/05/06 20:56:57
Done.
|
+ } |
+ |
+ // Next, extract page terms. |
+ // |
// Quickest out if we have seen this word before and know that it's not |
// part of any term. This avoids the lowercasing and UTF conversion, both of |
// which are relatively expensive. |
@@ -221,7 +265,10 @@ void PhishingTermFeatureExtractor::HandleWord( |
return; |
} |
- std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); |
+ // Don't recalculate. |
+ if (shingle_hashes_->size() >= max_hashes_per_page_) { |
mattm
2014/05/06 01:00:14
I'd probably be more comfortable with:
if (word_lower
zysxqn
2014/05/06 20:56:57
Done.
|
+ word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); |
+ } |
uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
// Quick out if the word is not part of any term, which is the common case. |
@@ -302,6 +349,7 @@ void PhishingTermFeatureExtractor::RunCallback(bool success) { |
void PhishingTermFeatureExtractor::Clear() { |
page_text_ = NULL; |
features_ = NULL; |
+ shingle_hashes_ = NULL; |
done_callback_.Reset(); |
state_.reset(NULL); |
negative_word_cache_.Clear(); |