chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 268673007: Extracting page shingle hashes for similarity detection.

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix a reference problem. Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('K') | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc » ('j') | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..ac53d261d6ca68d7a4059e286cbc7596dde5f71f 100644

--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

@@ -45,6 +45,14 @@ struct PhishingTermFeatureExtractor::ExtractionState {

// Stores up to max_words_per_term_ previous words separated by spaces.

std::string previous_words;

+ // Stores the current shingle after a new word is processed and added in.

+ std::string current_shingle;

+ // Stores the sizes of the words in current_shingle. Note: the size includes

+ // the space after each word. In other words, the sum of all sizes in this

+ // list is equal to the length of current_shingle.

+ std::list<size_t> shingle_word_sizes;

// Stores the sizes of the words in previous_words. Note: the size includes

// the space after each word. In other words, the sum of all sizes in this

// list is equal to the length of previous_words.

@@ -93,13 +101,17 @@ struct PhishingTermFeatureExtractor::ExtractionState {

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

const base::hash_set<std::string>* page_term_hashes,

const base::hash_set<uint32>* page_word_hashes,

+ size_t max_hashes_per_page,

size_t max_words_per_term,

uint32 murmurhash3_seed,

+ size_t shingle_size,

FeatureExtractorClock* clock)

: page_term_hashes_(page_term_hashes),

page_word_hashes_(page_word_hashes),

+ max_hashes_per_page_(max_hashes_per_page),

max_words_per_term_(max_words_per_term),

murmurhash3_seed_(murmurhash3_seed),

+ shingle_size_(shingle_size),

negative_word_cache_(kMaxNegativeWordCacheSize),

clock_(clock),

weak_factory_(this) {

@@ -115,6 +127,7 @@ PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {

void PhishingTermFeatureExtractor::ExtractFeatures(

const base::string16* page_text,

FeatureMap* features,

+ std::set<uint32>* shingle_hashes,

const DoneCallback& done_callback) {

// The RenderView should have called CancelPendingExtraction() before

// starting a new extraction, so DCHECK this.

@@ -125,6 +138,7 @@ void PhishingTermFeatureExtractor::ExtractFeatures(

page_text_ = page_text;

features_ = features;

+ shingle_hashes_ = shingle_hashes,

done_callback_ = done_callback;

state_.reset(new ExtractionState(*page_text_, clock_->Now()));

@@ -210,6 +224,36 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {

void PhishingTermFeatureExtractor::HandleWord(

const base::StringPiece16& word) {

+ // First, extract shingle hashes. We check the size of shingle_hashes_ first

+ // to skip as soon as we reach |max_hashes_per_page_|.

+ std::string word_lower;

+ if (shingle_hashes_->size() < max_hashes_per_page_) {

+ word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));

+ if (state_->shingle_word_sizes.size() < shingle_size_ - 1) {

+ // Adding |word| would not form a complete shingle, simply insert it.

+ state_->current_shingle.append(word_lower + " ");

+ state_->shingle_word_sizes.push_back(word_lower.size() + 1);

+ } else if (state_->shingle_word_sizes.size() == shingle_size_- 1) {

+ // Adding |word| would just form a complete shingle, insert it and

+ // calculate the shingle hash.

+ state_->current_shingle.append(word_lower + " ");

+ state_->shingle_word_sizes.push_back(word_lower.size() + 1);

+ shingle_hashes_->insert(

+ MurmurHash3String(state_->current_shingle, murmurhash3_seed_));

+ } else {

+ // We need to remove the first word from current_shingle, and add |word|

+ // at the end to form current_shingle.

+ state_->current_shingle.erase(0, state_->shingle_word_sizes.front());

+ state_->shingle_word_sizes.pop_front();

+ state_->current_shingle.append(word_lower + " ");

+ state_->shingle_word_sizes.push_back(word_lower.size() + 1);

+ shingle_hashes_->insert(

+ MurmurHash3String(state_->current_shingle, murmurhash3_seed_));

+ }

mattm 2014/05/06 01:00:14 each of the cases here duplicates the code of the

zysxqn 2014/05/06 20:56:57 Done.

+ }

+ // Next, extract page terms.

+ //

// Quickest out if we have seen this word before and know that it's not

// part of any term. This avoids the lowercasing and UTF conversion, both of

// which are relatively expensive.

@@ -221,7 +265,10 @@ void PhishingTermFeatureExtractor::HandleWord(

return;

}

- std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));

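+ // The shingle step above skips lowercasing once the hash limit has been
+ // reached, so perform the conversion here in that case.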

+ if (shingle_hashes_->size() >= max_hashes_per_page_) {

mattm 2014/05/06 01:00:14 I'd probably be more comfortable with: if (word_lower

zysxqn 2014/05/06 20:56:57 Done.

+ word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));

+ }

uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

// Quick out if the word is not part of any term, which is the common case.

@@ -302,6 +349,7 @@ void PhishingTermFeatureExtractor::RunCallback(bool success) {

void PhishingTermFeatureExtractor::Clear() {

page_text_ = NULL;

features_ = NULL;

+ shingle_hashes_ = NULL;

done_callback_.Reset();

state_.reset(NULL);

negative_word_cache_.Clear();