Chromium Code Reviews| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..32140f6798dbaeda239ffd1a0c48b069b342166b 100644 |
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
| @@ -45,6 +45,14 @@ struct PhishingTermFeatureExtractor::ExtractionState { |
| // Stores up to max_words_per_term_ previous words separated by spaces. |
| std::string previous_words; |
| + // Stores the current shingle after a new word is processed and added in. |
| + std::string current_shingle; |
| + |
| + // Stores the sizes of the words in current_shingle. Note: the size includes |
| + // the space after each word. In other words, the sum of all sizes in this |
| + // list is equal to the length of current_shingle. |
| + std::list<size_t> shingle_word_sizes; |
| + |
| // Stores the sizes of the words in previous_words. Note: the size includes |
| // the space after each word. In other words, the sum of all sizes in this |
| // list is equal to the length of previous_words. |
| @@ -95,11 +103,15 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| const base::hash_set<uint32>* page_word_hashes, |
| size_t max_words_per_term, |
| uint32 murmurhash3_seed, |
| + size_t max_shingles_per_page, |
| + size_t shingle_size, |
| FeatureExtractorClock* clock) |
| : page_term_hashes_(page_term_hashes), |
| page_word_hashes_(page_word_hashes), |
| max_words_per_term_(max_words_per_term), |
| murmurhash3_seed_(murmurhash3_seed), |
| + max_shingles_per_page_(max_shingles_per_page), |
| + shingle_size_(shingle_size), |
| negative_word_cache_(kMaxNegativeWordCacheSize), |
| clock_(clock), |
| weak_factory_(this) { |
| @@ -115,6 +127,7 @@ PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
| void PhishingTermFeatureExtractor::ExtractFeatures( |
| const base::string16* page_text, |
| FeatureMap* features, |
| + std::set<uint32>* shingle_hashes, |
| const DoneCallback& done_callback) { |
| // The RenderView should have called CancelPendingExtraction() before |
| // starting a new extraction, so DCHECK this. |
| @@ -125,6 +138,7 @@ void PhishingTermFeatureExtractor::ExtractFeatures( |
| page_text_ = page_text; |
| features_ = features; |
 | + shingle_hashes_ = shingle_hashes; |
| done_callback_ = done_callback; |
| state_.reset(new ExtractionState(*page_text_, clock_->Now())); |
| @@ -210,6 +224,25 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { |
| void PhishingTermFeatureExtractor::HandleWord( |
| const base::StringPiece16& word) { |
| + // First, extract shingle hashes. |
| + const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); |
| + state_->current_shingle.append(word_lower + " "); |
| + state_->shingle_word_sizes.push_back(word_lower.size() + 1); |
| + if (state_->shingle_word_sizes.size() == shingle_size_) { |
| + shingle_hashes_->insert( |
| + MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); |
| + state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); |
| + state_->shingle_word_sizes.pop_front(); |
| + } |
| + // Check if the size of shingle hashes is over the limit. |
| + if (shingle_hashes_->size() > max_shingles_per_page_) { |
| + // Pop the largest one. |
| + std::set<uint32>::iterator it = shingle_hashes_->end(); |
| + shingle_hashes_->erase(--it); |
|
mattm
2014/05/09 23:28:10
does shingle_hashes_->erase(--shingle_hashes_->end()) work?
noelutz
2014/05/10 01:01:20
Or shingle_hashes_->erase(shingle_hashes_->rbegin())?
zysxqn
2014/05/12 17:43:57
Unfortunately neither works since set doesn't support those operations (erase() takes a forward iterator, not a reverse_iterator).
zysxqn
2014/05/12 17:43:57
Nope.. See the reply below.
|
| + } |
| + |
| + // Next, extract page terms. |
| + // |
| // Quickest out if we have seen this word before and know that it's not |
| // part of any term. This avoids the lowercasing and UTF conversion, both of |
| // which are relatively expensive. |
|
mattm
2014/05/09 23:28:10
Since the conversion and lowercasing is always done now (for shingle extraction), is the negative word cache still worthwhile?
noelutz
2014/05/10 01:01:20
It saves us from unnecessary hashing (line 257), no?
zysxqn
2014/05/12 17:43:57
We can still prevent unnecessary hashing on the individual words by keeping the cache lookup before computing the word hash.
zysxqn
2014/05/12 17:43:57
Acknowledged.
mattm
2014/05/12 20:08:14
Doing the lookup in negative_word_cache_ also requires hashing the word.
zysxqn
2014/05/12 21:09:34
Murmurhash3 is a relatively fast hash function, so the cache may not be saving much.
noelutz
2014/05/12 21:51:09
I'm fine with removing the cache. It looks like most of its benefit is gone now.
zysxqn
2014/05/12 22:08:57
Removed.
|
| @@ -221,7 +254,6 @@ void PhishingTermFeatureExtractor::HandleWord( |
| return; |
| } |
| - std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); |
| uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
| // Quick out if the word is not part of any term, which is the common case. |
| @@ -302,6 +334,7 @@ void PhishingTermFeatureExtractor::RunCallback(bool success) { |
| void PhishingTermFeatureExtractor::Clear() { |
| page_text_ = NULL; |
| features_ = NULL; |
| + shingle_hashes_ = NULL; |
| done_callback_.Reset(); |
| state_.reset(NULL); |
| negative_word_cache_.Clear(); |