| Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
|
| diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
|
| index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..7bddec23b06e9d43994b35fcfb2c1a198a34a13e 100644
|
| --- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
|
| +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
|
| @@ -37,14 +37,19 @@ const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;
|
| // actual phishing page.
|
| const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
|
|
|
| -// The maximum size of the negative word cache.
|
| -const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
|
| -
|
| // All of the state pertaining to the current feature extraction.
|
| struct PhishingTermFeatureExtractor::ExtractionState {
|
| // Stores up to max_words_per_term_ previous words separated by spaces.
|
| std::string previous_words;
|
|
|
| + // Stores the words of the shingle being built, each followed by a space.
|
| + std::string current_shingle;
|
| +
|
| + // Stores the sizes of the words in current_shingle. Note: each size includes
|
| + // the space after its word. In other words, the sum of all sizes in this
|
| + // list is equal to the length of current_shingle.
|
| + std::list<size_t> shingle_word_sizes;
|
| +
|
| // Stores the sizes of the words in previous_words. Note: the size includes
|
| // the space after each word. In other words, the sum of all sizes in this
|
| // list is equal to the length of previous_words.
|
| @@ -95,12 +100,15 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
|
| const base::hash_set<uint32>* page_word_hashes,
|
| size_t max_words_per_term,
|
| uint32 murmurhash3_seed,
|
| + size_t max_shingles_per_page,
|
| + size_t shingle_size,
|
| FeatureExtractorClock* clock)
|
| : page_term_hashes_(page_term_hashes),
|
| page_word_hashes_(page_word_hashes),
|
| max_words_per_term_(max_words_per_term),
|
| murmurhash3_seed_(murmurhash3_seed),
|
| - negative_word_cache_(kMaxNegativeWordCacheSize),
|
| + max_shingles_per_page_(max_shingles_per_page),
|
| + shingle_size_(shingle_size),
|
| clock_(clock),
|
| weak_factory_(this) {
|
| Clear();
|
| @@ -115,6 +123,7 @@ PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
|
| void PhishingTermFeatureExtractor::ExtractFeatures(
|
| const base::string16* page_text,
|
| FeatureMap* features,
|
| + std::set<uint32>* shingle_hashes,
|
| const DoneCallback& done_callback) {
|
| // The RenderView should have called CancelPendingExtraction() before
|
| // starting a new extraction, so DCHECK this.
|
| @@ -125,6 +134,7 @@ void PhishingTermFeatureExtractor::ExtractFeatures(
|
|
|
| page_text_ = page_text;
|
| features_ = features;
|
| + shingle_hashes_ = shingle_hashes;
|
| done_callback_ = done_callback;
|
|
|
| state_.reset(new ExtractionState(*page_text_, clock_->Now()));
|
| @@ -210,18 +220,24 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
|
|
|
| void PhishingTermFeatureExtractor::HandleWord(
|
| const base::StringPiece16& word) {
|
| - // Quickest out if we have seen this word before and know that it's not
|
| - // part of any term. This avoids the lowercasing and UTF conversion, both of
|
| - // which are relatively expensive.
|
| - if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
|
| - // We know we're no longer in a possible n-gram, so clear the previous word
|
| - // state.
|
| - state_->previous_words.clear();
|
| - state_->previous_word_sizes.clear();
|
| - return;
|
| + // First, extract shingle hashes (one per window of shingle_size_ words).
|
| + const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
|
| + state_->current_shingle.append(word_lower + " ");
|
| + state_->shingle_word_sizes.push_back(word_lower.size() + 1);
|
| + if (state_->shingle_word_sizes.size() == shingle_size_) {
|
| + shingle_hashes_->insert(
|
| + MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
|
| + state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
|
| + state_->shingle_word_sizes.pop_front();
|
| + }
|
| + // Keep at most max_shingles_per_page_ hashes in the set.
|
| + if (shingle_hashes_->size() > max_shingles_per_page_) {
|
| + // Drop the largest hash; std::set iterates in ascending order.
|
| + std::set<uint32>::iterator it = shingle_hashes_->end();
|
| + shingle_hashes_->erase(--it);
|
| }
|
|
|
| - std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
|
| + // Next, extract page terms.
|
| uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
|
|
|
| // Quick out if the word is not part of any term, which is the common case.
|
| @@ -229,8 +245,6 @@ void PhishingTermFeatureExtractor::HandleWord(
|
| // Word doesn't exist in our terms so we can clear the n-gram state.
|
| state_->previous_words.clear();
|
| state_->previous_word_sizes.clear();
|
| - // Insert into negative cache so that we don't try this again.
|
| - negative_word_cache_.Put(word, true);
|
| return;
|
| }
|
|
|
| @@ -302,9 +316,9 @@ void PhishingTermFeatureExtractor::RunCallback(bool success) {
|
| void PhishingTermFeatureExtractor::Clear() {
|
| page_text_ = NULL;
|
| features_ = NULL;
|
| + shingle_hashes_ = NULL;
|
| done_callback_.Reset();
|
| state_.reset(NULL);
|
| - negative_word_cache_.Clear();
|
| }
|
|
|
| } // namespace safe_browsing
|
|
|