Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(638)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Address 1st round comment Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <list> 7 #include <list>
8 #include <map> 8 #include <map>
9 9
10 #include "base/bind.h" 10 #include "base/bind.h"
(...skipping 27 matching lines...) Expand all
38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; 38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
39 39
40 // The maximum size of the negative word cache. 40 // The maximum size of the negative word cache.
41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; 41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
42 42
43 // All of the state pertaining to the current feature extraction. 43 // All of the state pertaining to the current feature extraction.
44 struct PhishingTermFeatureExtractor::ExtractionState { 44 struct PhishingTermFeatureExtractor::ExtractionState {
45 // Stores up to max_words_per_term_ previous words separated by spaces. 45 // Stores up to max_words_per_term_ previous words separated by spaces.
46 std::string previous_words; 46 std::string previous_words;
47 47
48 // Stores the current shingle after a new word is processed and added in.
49 std::string current_shingle;
50
51 // Stores the sizes of the words in current_shingle. Note: the size includes
52 // the space after each word. In other words, the sum of all sizes in this
53 // list is equal to the length of current_shingle.
54 std::list<size_t> shingle_word_sizes;
55
48 // Stores the sizes of the words in previous_words. Note: the size includes 56 // Stores the sizes of the words in previous_words. Note: the size includes
49 // the space after each word. In other words, the sum of all sizes in this 57 // the space after each word. In other words, the sum of all sizes in this
50 // list is equal to the length of previous_words. 58 // list is equal to the length of previous_words.
51 std::list<size_t> previous_word_sizes; 59 std::list<size_t> previous_word_sizes;
52 60
53 // An iterator for word breaking. 61 // An iterator for word breaking.
54 UBreakIterator* iterator; 62 UBreakIterator* iterator;
55 63
56 // Our current position in the text that was passed to the ExtractionState 64 // Our current position in the text that was passed to the ExtractionState
57 // constructor, specifically, the most recent break position returned by our 65 // constructor, specifically, the most recent break position returned by our
(...skipping 30 matching lines...) Expand all
88 ubrk_close(iterator); 96 ubrk_close(iterator);
89 } 97 }
90 } 98 }
91 }; 99 };
92 100
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 101 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
94 const base::hash_set<std::string>* page_term_hashes, 102 const base::hash_set<std::string>* page_term_hashes,
95 const base::hash_set<uint32>* page_word_hashes, 103 const base::hash_set<uint32>* page_word_hashes,
96 size_t max_words_per_term, 104 size_t max_words_per_term,
97 uint32 murmurhash3_seed, 105 uint32 murmurhash3_seed,
106 size_t max_shingles_per_page,
107 size_t shingle_size,
98 FeatureExtractorClock* clock) 108 FeatureExtractorClock* clock)
99 : page_term_hashes_(page_term_hashes), 109 : page_term_hashes_(page_term_hashes),
100 page_word_hashes_(page_word_hashes), 110 page_word_hashes_(page_word_hashes),
101 max_words_per_term_(max_words_per_term), 111 max_words_per_term_(max_words_per_term),
102 murmurhash3_seed_(murmurhash3_seed), 112 murmurhash3_seed_(murmurhash3_seed),
113 max_shingles_per_page_(max_shingles_per_page),
114 shingle_size_(shingle_size),
103 negative_word_cache_(kMaxNegativeWordCacheSize), 115 negative_word_cache_(kMaxNegativeWordCacheSize),
104 clock_(clock), 116 clock_(clock),
105 weak_factory_(this) { 117 weak_factory_(this) {
106 Clear(); 118 Clear();
107 } 119 }
108 120
109 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { 121 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
110 // The RenderView should have called CancelPendingExtraction() before 122 // The RenderView should have called CancelPendingExtraction() before
111 // we are destroyed. 123 // we are destroyed.
112 CheckNoPendingExtraction(); 124 CheckNoPendingExtraction();
113 } 125 }
114 126
115 void PhishingTermFeatureExtractor::ExtractFeatures( 127 void PhishingTermFeatureExtractor::ExtractFeatures(
116 const base::string16* page_text, 128 const base::string16* page_text,
117 FeatureMap* features, 129 FeatureMap* features,
130 std::set<uint32>* shingle_hashes,
118 const DoneCallback& done_callback) { 131 const DoneCallback& done_callback) {
119 // The RenderView should have called CancelPendingExtraction() before 132 // The RenderView should have called CancelPendingExtraction() before
120 // starting a new extraction, so DCHECK this. 133 // starting a new extraction, so DCHECK this.
121 CheckNoPendingExtraction(); 134 CheckNoPendingExtraction();
122 // However, in an opt build, we will go ahead and clean up the pending 135 // However, in an opt build, we will go ahead and clean up the pending
123 // extraction so that we can start in a known state. 136 // extraction so that we can start in a known state.
124 CancelPendingExtraction(); 137 CancelPendingExtraction();
125 138
126 page_text_ = page_text; 139 page_text_ = page_text;
127 features_ = features; 140 features_ = features;
141 shingle_hashes_ = shingle_hashes,
128 done_callback_ = done_callback; 142 done_callback_ = done_callback;
129 143
130 state_.reset(new ExtractionState(*page_text_, clock_->Now())); 144 state_.reset(new ExtractionState(*page_text_, clock_->Now()));
131 base::MessageLoop::current()->PostTask( 145 base::MessageLoop::current()->PostTask(
132 FROM_HERE, 146 FROM_HERE,
133 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, 147 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
134 weak_factory_.GetWeakPtr())); 148 weak_factory_.GetWeakPtr()));
135 } 149 }
136 150
137 void PhishingTermFeatureExtractor::CancelPendingExtraction() { 151 void PhishingTermFeatureExtractor::CancelPendingExtraction() {
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
203 return; 217 return;
204 } 218 }
205 // Otherwise, continue. 219 // Otherwise, continue.
206 } 220 }
207 } 221 }
208 RunCallback(true); 222 RunCallback(true);
209 } 223 }
210 224
211 void PhishingTermFeatureExtractor::HandleWord( 225 void PhishingTermFeatureExtractor::HandleWord(
212 const base::StringPiece16& word) { 226 const base::StringPiece16& word) {
227 // First, extract shingle hashes. We check the size of shingle_hashes_ first
228 // to skip as soon as we reach |max_shingles_per_page_|.
229 std::string word_lower;
230 if (shingle_hashes_->size() < max_shingles_per_page_) {
noelutz 2014/05/06 21:40:18 I think we want to keep the min 200 shingles so th
zysxqn 2014/05/07 19:29:19 I know on the server side experiment we use min 20
zysxqn 2014/05/09 21:34:24 Done.
mattm 2014/05/09 23:28:10 Have you tested if doing it for the whole text has
noelutz 2014/05/10 01:01:20 +1 on performance testing. Could you add a histog
231 word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
232 state_->current_shingle.append(word_lower + " ");
233 state_->shingle_word_sizes.push_back(word_lower.size() + 1);
234 if (state_->shingle_word_sizes.size() == shingle_size_) {
235 shingle_hashes_->insert(
236 MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
237 state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
238 state_->shingle_word_sizes.pop_front();
239 }
240 }
241
242 // Next, extract page terms.
243 //
213 // Quickest out if we have seen this word before and know that it's not 244 // Quickest out if we have seen this word before and know that it's not
214 // part of any term. This avoids the lowercasing and UTF conversion, both of 245 // part of any term. This avoids the lowercasing and UTF conversion, both of
215 // which are relatively expensive. 246 // which are relatively expensive.
216 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { 247 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
217 // We know we're no longer in a possible n-gram, so clear the previous word 248 // We know we're no longer in a possible n-gram, so clear the previous word
218 // state. 249 // state.
219 state_->previous_words.clear(); 250 state_->previous_words.clear();
220 state_->previous_word_sizes.clear(); 251 state_->previous_word_sizes.clear();
221 return; 252 return;
222 } 253 }
223 254
224 std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); 255 if (word_lower.empty()) {
256 word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
257 }
225 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); 258 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
226 259
227 // Quick out if the word is not part of any term, which is the common case. 260 // Quick out if the word is not part of any term, which is the common case.
228 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { 261 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
229 // Word doesn't exist in our terms so we can clear the n-gram state. 262 // Word doesn't exist in our terms so we can clear the n-gram state.
230 state_->previous_words.clear(); 263 state_->previous_words.clear();
231 state_->previous_word_sizes.clear(); 264 state_->previous_word_sizes.clear();
232 // Insert into negative cache so that we don't try this again. 265 // Insert into negative cache so that we don't try this again.
233 negative_word_cache_.Put(word, true); 266 negative_word_cache_.Put(word, true);
234 return; 267 return;
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
295 clock_->Now() - state_->start_time); 328 clock_->Now() - state_->start_time);
296 329
297 DCHECK(!done_callback_.is_null()); 330 DCHECK(!done_callback_.is_null());
298 done_callback_.Run(success); 331 done_callback_.Run(success);
299 Clear(); 332 Clear();
300 } 333 }
301 334
302 void PhishingTermFeatureExtractor::Clear() { 335 void PhishingTermFeatureExtractor::Clear() {
303 page_text_ = NULL; 336 page_text_ = NULL;
304 features_ = NULL; 337 features_ = NULL;
338 shingle_hashes_ = NULL;
305 done_callback_.Reset(); 339 done_callback_.Reset();
306 state_.reset(NULL); 340 state_.reset(NULL);
307 negative_word_cache_.Clear(); 341 negative_word_cache_.Clear();
308 } 342 }
309 343
310 } // namespace safe_browsing 344 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698