Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(7020)

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix a nit Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 89994dfd04cf4488d4f4a87689cff92bc8760bb8..32140f6798dbaeda239ffd1a0c48b069b342166b 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -45,6 +45,14 @@ struct PhishingTermFeatureExtractor::ExtractionState {
// Stores up to max_words_per_term_ previous words separated by spaces.
std::string previous_words;
+ // Stores the current shingle after a new word is processed and added in.
+ std::string current_shingle;
+
+ // Stores the sizes of the words in current_shingle. Note: the size includes
+ // the space after each word. In other words, the sum of all sizes in this
+ // list is equal to the length of current_shingle.
+ std::list<size_t> shingle_word_sizes;
+
// Stores the sizes of the words in previous_words. Note: the size includes
// the space after each word. In other words, the sum of all sizes in this
// list is equal to the length of previous_words.
@@ -95,11 +103,15 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
uint32 murmurhash3_seed,
+ size_t max_shingles_per_page,
+ size_t shingle_size,
FeatureExtractorClock* clock)
: page_term_hashes_(page_term_hashes),
page_word_hashes_(page_word_hashes),
max_words_per_term_(max_words_per_term),
murmurhash3_seed_(murmurhash3_seed),
+ max_shingles_per_page_(max_shingles_per_page),
+ shingle_size_(shingle_size),
negative_word_cache_(kMaxNegativeWordCacheSize),
clock_(clock),
weak_factory_(this) {
@@ -115,6 +127,7 @@ PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
void PhishingTermFeatureExtractor::ExtractFeatures(
const base::string16* page_text,
FeatureMap* features,
+ std::set<uint32>* shingle_hashes,
const DoneCallback& done_callback) {
// The RenderView should have called CancelPendingExtraction() before
// starting a new extraction, so DCHECK this.
@@ -125,6 +138,7 @@ void PhishingTermFeatureExtractor::ExtractFeatures(
page_text_ = page_text;
features_ = features;
+ shingle_hashes_ = shingle_hashes,
done_callback_ = done_callback;
state_.reset(new ExtractionState(*page_text_, clock_->Now()));
@@ -210,6 +224,25 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
void PhishingTermFeatureExtractor::HandleWord(
const base::StringPiece16& word) {
+ // First, extract shingle hashes.
+ const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
+ state_->current_shingle.append(word_lower + " ");
+ state_->shingle_word_sizes.push_back(word_lower.size() + 1);
+ if (state_->shingle_word_sizes.size() == shingle_size_) {
+ shingle_hashes_->insert(
+ MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
+ state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
+ state_->shingle_word_sizes.pop_front();
+ }
+ // Check if the size of shingle hashes is over the limit.
+ if (shingle_hashes_->size() > max_shingles_per_page_) {
+ // Pop the largest one.
+ std::set<uint32>::iterator it = shingle_hashes_->end();
+ shingle_hashes_->erase(--it);
mattm 2014/05/09 23:28:10 does shingle_hashes_->erase(--shingle_hashes_->end
noelutz 2014/05/10 01:01:20 Or shingle_hashes_->erase(shingle_hashes_->rbegin(
zysxqn 2014/05/12 17:43:57 Unfortunately neither works since set doesn't supp
zysxqn 2014/05/12 17:43:57 Nope.. See the reply below.
+ }
+
+ // Next, extract page terms.
+ //
// Quickest out if we have seen this word before and know that it's not
// part of any term. Note: the lowercasing and UTF conversion are already
// done above for shingle extraction, so this early return only skips the
// word hashing and term lookup below.
mattm 2014/05/09 23:28:10 Since the conversion and lowercasing is always don
noelutz 2014/05/10 01:01:20 It saves us from unnecessary hashing (line 257), n
zysxqn 2014/05/12 17:43:57 We can still prevent unnecessary hashing on the in
zysxqn 2014/05/12 17:43:57 Acknowledged.
mattm 2014/05/12 20:08:14 Doing the lookup in negative_word_cache_ also requ
zysxqn 2014/05/12 21:09:34 Murmurhash3 is a relatively fast hash function so
noelutz 2014/05/12 21:51:09 I'm fine with removing the cache. It looks like m
zysxqn 2014/05/12 22:08:57 Removed.
@@ -221,7 +254,6 @@ void PhishingTermFeatureExtractor::HandleWord(
return;
}
- std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
// Quick out if the word is not part of any term, which is the common case.
@@ -302,6 +334,7 @@ void PhishingTermFeatureExtractor::RunCallback(bool success) {
void PhishingTermFeatureExtractor::Clear() {
page_text_ = NULL;
features_ = NULL;
+ shingle_hashes_ = NULL;
done_callback_.Reset();
state_.reset(NULL);
negative_word_cache_.Clear();

Powered by Google App Engine
This is Rietveld 408576698