chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 268673007: Extracting page shingle hashes for similarity detection.

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Address 4th round comment. Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"	5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

6	6

7 #include <list>	7 #include <list>

8 #include <map>	8 #include <map>

9	9

10 #include "base/bind.h"	10 #include "base/bind.h"

(...skipping 19 matching lines...) Expand all Loading...
30	30

31 // Experimenting shows that we get a reasonable gain in performance by	31 // Experimenting shows that we get a reasonable gain in performance by

32 // increasing this up to around 10, but there's not much benefit in	32 // increasing this up to around 10, but there's not much benefit in

33 // increasing it past that.	33 // increasing it past that.

34 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;	34 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

35	35

36 // This should be longer than we expect feature extraction to take on any	36 // This should be longer than we expect feature extraction to take on any

37 // actual phishing page.	37 // actual phishing page.

38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;	38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

39	39

40 // The maximum size of the negative word cache.

41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;

42

43 // All of the state pertaining to the current feature extraction.	40 // All of the state pertaining to the current feature extraction.

44 struct PhishingTermFeatureExtractor::ExtractionState {	41 struct PhishingTermFeatureExtractor::ExtractionState {

45 // Stores up to max_words_per_term_ previous words separated by spaces.	42 // Stores up to max_words_per_term_ previous words separated by spaces.

46 std::string previous_words;	43 std::string previous_words;

47	44

	45 // Stores the current shingle after a new word is processed and added in.

	46 std::string current_shingle;

	47

	48 // Stores the sizes of the words in current_shingle. Note: the size includes

	49 // the space after each word. In other words, the sum of all sizes in this

	50 // list is equal to the length of current_shingle.

	51 std::list<size_t> shingle_word_sizes;

	52

48 // Stores the sizes of the words in previous_words. Note: the size includes	53 // Stores the sizes of the words in previous_words. Note: the size includes

49 // the space after each word. In other words, the sum of all sizes in this	54 // the space after each word. In other words, the sum of all sizes in this

50 // list is equal to the length of previous_words.	55 // list is equal to the length of previous_words.

51 std::list<size_t> previous_word_sizes;	56 std::list<size_t> previous_word_sizes;

52	57

53 // An iterator for word breaking.	58 // An iterator for word breaking.

54 UBreakIterator* iterator;	59 UBreakIterator* iterator;

55	60

56 // Our current position in the text that was passed to the ExtractionState	61 // Our current position in the text that was passed to the ExtractionState

57 // constructor, specifically, the most recent break position returned by our	62 // constructor, specifically, the most recent break position returned by our

(...skipping 30 matching lines...) Expand all Loading...
88 ubrk_close(iterator);	93 ubrk_close(iterator);

89 }	94 }

90 }	95 }

91 };	96 };

92	97

93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(	98 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

94 const base::hash_set<std::string>* page_term_hashes,	99 const base::hash_set<std::string>* page_term_hashes,

95 const base::hash_set<uint32>* page_word_hashes,	100 const base::hash_set<uint32>* page_word_hashes,

96 size_t max_words_per_term,	101 size_t max_words_per_term,

97 uint32 murmurhash3_seed,	102 uint32 murmurhash3_seed,

	103 size_t max_shingles_per_page,

	104 size_t shingle_size,

98 FeatureExtractorClock* clock)	105 FeatureExtractorClock* clock)

99 : page_term_hashes_(page_term_hashes),	106 : page_term_hashes_(page_term_hashes),

100 page_word_hashes_(page_word_hashes),	107 page_word_hashes_(page_word_hashes),

101 max_words_per_term_(max_words_per_term),	108 max_words_per_term_(max_words_per_term),

102 murmurhash3_seed_(murmurhash3_seed),	109 murmurhash3_seed_(murmurhash3_seed),

103 negative_word_cache_(kMaxNegativeWordCacheSize),	110 max_shingles_per_page_(max_shingles_per_page),

	111 shingle_size_(shingle_size),

104 clock_(clock),	112 clock_(clock),

105 weak_factory_(this) {	113 weak_factory_(this) {

106 Clear();	114 Clear();

107 }	115 }

108	116

109 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {	117 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {

110 // The RenderView should have called CancelPendingExtraction() before	118 // The RenderView should have called CancelPendingExtraction() before

111 // we are destroyed.	119 // we are destroyed.

112 CheckNoPendingExtraction();	120 CheckNoPendingExtraction();

113 }	121 }

114	122

115 void PhishingTermFeatureExtractor::ExtractFeatures(	123 void PhishingTermFeatureExtractor::ExtractFeatures(

116 const base::string16* page_text,	124 const base::string16* page_text,

117 FeatureMap* features,	125 FeatureMap* features,

	126 std::set<uint32>* shingle_hashes,

118 const DoneCallback& done_callback) {	127 const DoneCallback& done_callback) {

119 // The RenderView should have called CancelPendingExtraction() before	128 // The RenderView should have called CancelPendingExtraction() before

120 // starting a new extraction, so DCHECK this.	129 // starting a new extraction, so DCHECK this.

121 CheckNoPendingExtraction();	130 CheckNoPendingExtraction();

122 // However, in an opt build, we will go ahead and clean up the pending	131 // However, in an opt build, we will go ahead and clean up the pending

123 // extraction so that we can start in a known state.	132 // extraction so that we can start in a known state.

124 CancelPendingExtraction();	133 CancelPendingExtraction();

125	134

126 page_text_ = page_text;	135 page_text_ = page_text;

127 features_ = features;	136 features_ = features;

	137 shingle_hashes_ = shingle_hashes,

128 done_callback_ = done_callback;	138 done_callback_ = done_callback;

129	139

130 state_.reset(new ExtractionState(*page_text_, clock_->Now()));	140 state_.reset(new ExtractionState(*page_text_, clock_->Now()));

131 base::MessageLoop::current()->PostTask(	141 base::MessageLoop::current()->PostTask(

132 FROM_HERE,	142 FROM_HERE,

133 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,	143 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,

134 weak_factory_.GetWeakPtr()));	144 weak_factory_.GetWeakPtr()));

135 }	145 }

136	146

137 void PhishingTermFeatureExtractor::CancelPendingExtraction() {	147 void PhishingTermFeatureExtractor::CancelPendingExtraction() {

(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
203 return;	213 return;

204 }	214 }

205 // Otherwise, continue.	215 // Otherwise, continue.

206 }	216 }

207 }	217 }

208 RunCallback(true);	218 RunCallback(true);

209 }	219 }

210	220

211 void PhishingTermFeatureExtractor::HandleWord(	221 void PhishingTermFeatureExtractor::HandleWord(

212 const base::StringPiece16& word) {	222 const base::StringPiece16& word) {

213 // Quickest out if we have seen this word before and know that it's not	223 // First, extract shingle hashes.

214 // part of any term. This avoids the lowercasing and UTF conversion, both of	224 const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));

215 // which are relatively expensive.	225 state_->current_shingle.append(word_lower + " ");

216 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {	226 state_->shingle_word_sizes.push_back(word_lower.size() + 1);

217 // We know we're no longer in a possible n-gram, so clear the previous word	227 if (state_->shingle_word_sizes.size() == shingle_size_) {

218 // state.	228 shingle_hashes_->insert(

219 state_->previous_words.clear();	229 MurmurHash3String(state_->current_shingle, murmurhash3_seed_));

220 state_->previous_word_sizes.clear();	230 state_->current_shingle.erase(0, state_->shingle_word_sizes.front());

221 return;	231 state_->shingle_word_sizes.pop_front();

	232 }

	233 // Check if the size of shingle hashes is over the limit.

	234 if (shingle_hashes_->size() > max_shingles_per_page_) {

	235 // Pop the largest one.

	236 std::set<uint32>::iterator it = shingle_hashes_->end();

	237 shingle_hashes_->erase(--it);

222 }	238 }

223	239

224 std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));	240 // Next, extract page terms.

225 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);	241 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

226	242

227 // Quick out if the word is not part of any term, which is the common case.	243 // Quick out if the word is not part of any term, which is the common case.

228 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {	244 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {

229 // Word doesn't exist in our terms so we can clear the n-gram state.	245 // Word doesn't exist in our terms so we can clear the n-gram state.

230 state_->previous_words.clear();	246 state_->previous_words.clear();

231 state_->previous_word_sizes.clear();	247 state_->previous_word_sizes.clear();

232 // Insert into negative cache so that we don't try this again.

233 negative_word_cache_.Put(word, true);

234 return;	248 return;

235 }	249 }

236	250

237 // Find all of the n-grams that we need to check and compute their SHA-256	251 // Find all of the n-grams that we need to check and compute their SHA-256

238 // hashes.	252 // hashes.

239 std::map<std::string /* hash */, std::string /* plaintext */>	253 std::map<std::string /* hash */, std::string /* plaintext */>

240 hashes_to_check;	254 hashes_to_check;

241 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;	255 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

242	256

243 // Combine the new word with the previous words to find additional n-grams.	257 // Combine the new word with the previous words to find additional n-grams.

(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
295 clock_->Now() - state_->start_time);	309 clock_->Now() - state_->start_time);

296	310

297 DCHECK(!done_callback_.is_null());	311 DCHECK(!done_callback_.is_null());

298 done_callback_.Run(success);	312 done_callback_.Run(success);

299 Clear();	313 Clear();

300 }	314 }

301	315

302 void PhishingTermFeatureExtractor::Clear() {	316 void PhishingTermFeatureExtractor::Clear() {

303 page_text_ = NULL;	317 page_text_ = NULL;

304 features_ = NULL;	318 features_ = NULL;

	319 shingle_hashes_ = NULL;

305 done_callback_.Reset();	320 done_callback_.Reset();

306 state_.reset(NULL);	321 state_.reset(NULL);

307 negative_word_cache_.Clear();

308 }	322 }

309	323

310 } // namespace safe_browsing	324 } // namespace safe_browsing

OLD	NEW