chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 7866011: Switch to the new client-side phishing model that uses Murmurhash for word hashes.

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7866011: Switch to the new client-side phishing model that uses Murmurhash for word hashes. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fix compile problems and add another test Created 9 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"	5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

6	6

7 #include <list>	7 #include <list>

8 #include <map>	8 #include <map>

9	9

10 #include "base/compiler_specific.h"	10 #include "base/compiler_specific.h"

11 #include "base/i18n/case_conversion.h"	11 #include "base/i18n/case_conversion.h"

12 #include "base/logging.h"	12 #include "base/logging.h"

13 #include "base/message_loop.h"	13 #include "base/message_loop.h"

14 #include "base/metrics/histogram.h"	14 #include "base/metrics/histogram.h"

15 #include "base/time.h"	15 #include "base/time.h"

16 #include "base/utf_string_conversions.h"	16 #include "base/utf_string_conversions.h"

17 #include "crypto/sha2.h"	17 #include "crypto/sha2.h"

18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"	18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

19 #include "chrome/renderer/safe_browsing/features.h"	19 #include "chrome/renderer/safe_browsing/features.h"

	20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"

20 #include "ui/base/l10n/l10n_util.h"	21 #include "ui/base/l10n/l10n_util.h"

21 #include "unicode/ubrk.h"	22 #include "unicode/ubrk.h"

22	23

23 namespace safe_browsing {	24 namespace safe_browsing {

24	25

25 // This time should be short enough that it doesn't noticeably disrupt the	26 // This time should be short enough that it doesn't noticeably disrupt the

26 // user's interaction with the page.	27 // user's interaction with the page.

27 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20;	28 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20;

28	29

29 // Experimenting shows that we get a reasonable gain in performance by	30 // Experimenting shows that we get a reasonable gain in performance by

30 // increasing this up to around 10, but there's not much benefit in	31 // increasing this up to around 10, but there's not much benefit in

31 // increasing it past that.	32 // increasing it past that.

32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;	33 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

33	34

34 // This should be longer than we expect feature extraction to take on any	35 // This should be longer than we expect feature extraction to take on any

35 // actual phishing page.	36 // actual phishing page.

36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;	37 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

37	38

38 // The maximum size of the negative word cache.	39 // The maximum size of the negative word cache.

39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;	40 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;

40	41

41 // All of the state pertaining to the current feature extraction.	42 // All of the state pertaining to the current feature extraction.

42 struct PhishingTermFeatureExtractor::ExtractionState {	43 struct PhishingTermFeatureExtractor::ExtractionState {

43 // Stores up to max_words_per_ngram_ previous words separated by spaces.	44 // Stores up to max_words_per_term_ previous words separated by spaces.

44 std::string previous_words;	45 std::string previous_words;

45	46

46 // Stores the sizes of the words in previous_words. Note: the size includes	47 // Stores the sizes of the words in previous_words. Note: the size includes

47 // the space after each word. In other words, the sum of all sizes in this	48 // the space after each word. In other words, the sum of all sizes in this

48 // list is equal to the length of previous_words.	49 // list is equal to the length of previous_words.

49 std::list<size_t> previous_word_sizes;	50 std::list<size_t> previous_word_sizes;

50	51

51 // An iterator for word breaking.	52 // An iterator for word breaking.

52 UBreakIterator* iterator;	53 UBreakIterator* iterator;

53	54

(...skipping 29 matching lines...) Expand all Loading...
83	84

84 ~ExtractionState() {	85 ~ExtractionState() {

85 if (iterator) {	86 if (iterator) {

86 ubrk_close(iterator);	87 ubrk_close(iterator);

87 }	88 }

88 }	89 }

89 };	90 };

90	91

91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(	92 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

92 const base::hash_set<std::string>* page_term_hashes,	93 const base::hash_set<std::string>* page_term_hashes,

93 const base::hash_set<std::string>* page_word_hashes,	94 const base::hash_set<uint32>* page_word_hashes,

94 size_t max_words_per_term,	95 size_t max_words_per_term,

	96 uint32 murmurhash3_seed,

95 FeatureExtractorClock* clock)	97 FeatureExtractorClock* clock)

96 : page_term_hashes_(page_term_hashes),	98 : page_term_hashes_(page_term_hashes),

97 page_word_hashes_(page_word_hashes),	99 page_word_hashes_(page_word_hashes),

98 max_words_per_term_(max_words_per_term),	100 max_words_per_term_(max_words_per_term),

	101 murmurhash3_seed_(murmurhash3_seed),

99 negative_word_cache_(kMaxNegativeWordCacheSize),	102 negative_word_cache_(kMaxNegativeWordCacheSize),

100 clock_(clock),	103 clock_(clock),

101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {	104 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {

102 Clear();	105 Clear();

103 }	106 }

104	107

105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {	108 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {

106 // The RenderView should have called CancelPendingExtraction() before	109 // The RenderView should have called CancelPendingExtraction() before

107 // we are destroyed.	110 // we are destroyed.

108 CheckNoPendingExtraction();	111 CheckNoPendingExtraction();

(...skipping 90 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
199 }	202 }

200 // Otherwise, continue.	203 // Otherwise, continue.

201 }	204 }

202 }	205 }

203 RunCallback(true);	206 RunCallback(true);

204 }	207 }

205	208

206 void PhishingTermFeatureExtractor::HandleWord(	209 void PhishingTermFeatureExtractor::HandleWord(

207 const base::StringPiece16& word) {	210 const base::StringPiece16& word) {

208 // Quickest out if we have seen this word before and know that it's not	211 // Quickest out if we have seen this word before and know that it's not

209 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,	212 // part of any term. This avoids the lowercasing and UTF conversion, both of

210 // all of which are relatively expensive.	213 // which are relatively expensive.

211 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {	214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {

212 // We know we're no longer in a possible n-gram, so clear the previous word	215 // We know we're no longer in a possible n-gram, so clear the previous word

213 // state.	216 // state.

214 state_->previous_words.clear();	217 state_->previous_words.clear();

215 state_->previous_word_sizes.clear();	218 state_->previous_word_sizes.clear();

216 return;	219 return;

217 }	220 }

218	221

219 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));	222 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));

220 std::string word_hash = crypto::SHA256HashString(word_lower);	223 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

221	224

222 // Quick out if the word is not part of any term, which is the common case.	225 // Quick out if the word is not part of any term, which is the common case.

223 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {	226 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {

224 // Word doesn't exist in our terms so we can clear the n-gram state.	227 // Word doesn't exist in our terms so we can clear the n-gram state.

225 state_->previous_words.clear();	228 state_->previous_words.clear();

226 state_->previous_word_sizes.clear();	229 state_->previous_word_sizes.clear();

227 // Insert into negative cache so that we don't try this again.	230 // Insert into negative cache so that we don't try this again.

228 negative_word_cache_.Put(word, true);	231 negative_word_cache_.Put(word, true);

229 return;	232 return;

230 }	233 }

231	234

232 // Find all of the n-grams that we need to check and compute their hashes.	235 // Find all of the n-grams that we need to check and compute their SHA-256

233 // We already have the hash for word_lower, so we don't compute that again.	236 // hashes.

234 std::map<std::string /* hash */, std::string /* plaintext */>	237 std::map<std::string /* hash */, std::string /* plaintext */>

235 hashes_to_check;	238 hashes_to_check;

236 hashes_to_check[word_hash] = word_lower;	239 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

237	240

238 // Combine the new word with the previous words to find additional n-grams.	241 // Combine the new word with the previous words to find additional n-grams.

239 // Note that we don't yet add the new word length to previous_word_sizes,	242 // Note that we don't yet add the new word length to previous_word_sizes,

240 // since we don't want to compute the hash for the word by itself again.	243 // since we don't want to compute the hash for the word by itself again.

241 //	244 //

242 state_->previous_words.append(word_lower);	245 state_->previous_words.append(word_lower);

243 std::string current_term = state_->previous_words;	246 std::string current_term = state_->previous_words;

244 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();	247 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();

245 it != state_->previous_word_sizes.end(); ++it) {	248 it != state_->previous_word_sizes.end(); ++it) {

246 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;	249 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
296	299

297 void PhishingTermFeatureExtractor::Clear() {	300 void PhishingTermFeatureExtractor::Clear() {

298 page_text_ = NULL;	301 page_text_ = NULL;

299 features_ = NULL;	302 features_ = NULL;

300 done_callback_.reset(NULL);	303 done_callback_.reset(NULL);

301 state_.reset(NULL);	304 state_.reset(NULL);

302 negative_word_cache_.Clear();	305 negative_word_cache_.Clear();

303 }	306 }

304	307

305 } // namespace safe_browsing	308 } // namespace safe_browsing

OLD	NEW