Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(656)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7866011: Switch to the new client-side phishing model that uses Murmurhash for word hashes. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fix compile problems and add another test Created 9 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <list> 7 #include <list>
8 #include <map> 8 #include <map>
9 9
10 #include "base/compiler_specific.h" 10 #include "base/compiler_specific.h"
11 #include "base/i18n/case_conversion.h" 11 #include "base/i18n/case_conversion.h"
12 #include "base/logging.h" 12 #include "base/logging.h"
13 #include "base/message_loop.h" 13 #include "base/message_loop.h"
14 #include "base/metrics/histogram.h" 14 #include "base/metrics/histogram.h"
15 #include "base/time.h" 15 #include "base/time.h"
16 #include "base/utf_string_conversions.h" 16 #include "base/utf_string_conversions.h"
17 #include "crypto/sha2.h" 17 #include "crypto/sha2.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h" 19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
20 #include "ui/base/l10n/l10n_util.h" 21 #include "ui/base/l10n/l10n_util.h"
21 #include "unicode/ubrk.h" 22 #include "unicode/ubrk.h"
22 23
23 namespace safe_browsing { 24 namespace safe_browsing {
24 25
25 // This time should be short enough that it doesn't noticeably disrupt the 26 // This time should be short enough that it doesn't noticeably disrupt the
26 // user's interaction with the page. 27 // user's interaction with the page.
27 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; 28 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20;
28 29
29 // Experimenting shows that we get a reasonable gain in performance by 30 // Experimenting shows that we get a reasonable gain in performance by
30 // increasing this up to around 10, but there's not much benefit in 31 // increasing this up to around 10, but there's not much benefit in
31 // increasing it past that. 32 // increasing it past that.
32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; 33 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;
33 34
34 // This should be longer than we expect feature extraction to take on any 35 // This should be longer than we expect feature extraction to take on any
35 // actual phishing page. 36 // actual phishing page.
36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; 37 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
37 38
38 // The maximum size of the negative word cache. 39 // The maximum size of the negative word cache.
39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; 40 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
40 41
41 // All of the state pertaining to the current feature extraction. 42 // All of the state pertaining to the current feature extraction.
42 struct PhishingTermFeatureExtractor::ExtractionState { 43 struct PhishingTermFeatureExtractor::ExtractionState {
43 // Stores up to max_words_per_ngram_ previous words separated by spaces. 44 // Stores up to max_words_per_term_ previous words separated by spaces.
44 std::string previous_words; 45 std::string previous_words;
45 46
46 // Stores the sizes of the words in previous_words. Note: the size includes 47 // Stores the sizes of the words in previous_words. Note: the size includes
47 // the space after each word. In other words, the sum of all sizes in this 48 // the space after each word. In other words, the sum of all sizes in this
48 // list is equal to the length of previous_words. 49 // list is equal to the length of previous_words.
49 std::list<size_t> previous_word_sizes; 50 std::list<size_t> previous_word_sizes;
50 51
51 // An iterator for word breaking. 52 // An iterator for word breaking.
52 UBreakIterator* iterator; 53 UBreakIterator* iterator;
53 54
(...skipping 29 matching lines...) Expand all
83 84
84 ~ExtractionState() { 85 ~ExtractionState() {
85 if (iterator) { 86 if (iterator) {
86 ubrk_close(iterator); 87 ubrk_close(iterator);
87 } 88 }
88 } 89 }
89 }; 90 };
90 91
91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 92 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
92 const base::hash_set<std::string>* page_term_hashes, 93 const base::hash_set<std::string>* page_term_hashes,
93 const base::hash_set<std::string>* page_word_hashes, 94 const base::hash_set<uint32>* page_word_hashes,
94 size_t max_words_per_term, 95 size_t max_words_per_term,
96 uint32 murmurhash3_seed,
95 FeatureExtractorClock* clock) 97 FeatureExtractorClock* clock)
96 : page_term_hashes_(page_term_hashes), 98 : page_term_hashes_(page_term_hashes),
97 page_word_hashes_(page_word_hashes), 99 page_word_hashes_(page_word_hashes),
98 max_words_per_term_(max_words_per_term), 100 max_words_per_term_(max_words_per_term),
101 murmurhash3_seed_(murmurhash3_seed),
99 negative_word_cache_(kMaxNegativeWordCacheSize), 102 negative_word_cache_(kMaxNegativeWordCacheSize),
100 clock_(clock), 103 clock_(clock),
101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { 104 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
102 Clear(); 105 Clear();
103 } 106 }
104 107
105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { 108 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
106 // The RenderView should have called CancelPendingExtraction() before 109 // The RenderView should have called CancelPendingExtraction() before
107 // we are destroyed. 110 // we are destroyed.
108 CheckNoPendingExtraction(); 111 CheckNoPendingExtraction();
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
199 } 202 }
200 // Otherwise, continue. 203 // Otherwise, continue.
201 } 204 }
202 } 205 }
203 RunCallback(true); 206 RunCallback(true);
204 } 207 }
205 208
206 void PhishingTermFeatureExtractor::HandleWord( 209 void PhishingTermFeatureExtractor::HandleWord(
207 const base::StringPiece16& word) { 210 const base::StringPiece16& word) {
208 // Quickest out if we have seen this word before and know that it's not 211 // Quickest out if we have seen this word before and know that it's not
209 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, 212 // part of any term. This avoids the lowercasing and UTF conversion, both of
210 // all of which are relatively expensive. 213 // which are relatively expensive.
211 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { 214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
212 // We know we're no longer in a possible n-gram, so clear the previous word 215 // We know we're no longer in a possible n-gram, so clear the previous word
213 // state. 216 // state.
214 state_->previous_words.clear(); 217 state_->previous_words.clear();
215 state_->previous_word_sizes.clear(); 218 state_->previous_word_sizes.clear();
216 return; 219 return;
217 } 220 }
218 221
219 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); 222 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
220 std::string word_hash = crypto::SHA256HashString(word_lower); 223 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
221 224
222 // Quick out if the word is not part of any term, which is the common case. 225 // Quick out if the word is not part of any term, which is the common case.
223 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { 226 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
224 // Word doesn't exist in our terms so we can clear the n-gram state. 227 // Word doesn't exist in our terms so we can clear the n-gram state.
225 state_->previous_words.clear(); 228 state_->previous_words.clear();
226 state_->previous_word_sizes.clear(); 229 state_->previous_word_sizes.clear();
227 // Insert into negative cache so that we don't try this again. 230 // Insert into negative cache so that we don't try this again.
228 negative_word_cache_.Put(word, true); 231 negative_word_cache_.Put(word, true);
229 return; 232 return;
230 } 233 }
231 234
232 // Find all of the n-grams that we need to check and compute their hashes. 235 // Find all of the n-grams that we need to check and compute their SHA-256
233 // We already have the hash for word_lower, so we don't compute that again. 236 // hashes.
234 std::map<std::string /* hash */, std::string /* plaintext */> 237 std::map<std::string /* hash */, std::string /* plaintext */>
235 hashes_to_check; 238 hashes_to_check;
236 hashes_to_check[word_hash] = word_lower; 239 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;
237 240
238 // Combine the new word with the previous words to find additional n-grams. 241 // Combine the new word with the previous words to find additional n-grams.
239 // Note that we don't yet add the new word length to previous_word_sizes, 242 // Note that we don't yet add the new word length to previous_word_sizes,
240 // since we don't want to compute the hash for the word by itself again. 243 // since we don't want to compute the hash for the word by itself again.
241 // 244 //
242 state_->previous_words.append(word_lower); 245 state_->previous_words.append(word_lower);
243 std::string current_term = state_->previous_words; 246 std::string current_term = state_->previous_words;
244 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); 247 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
245 it != state_->previous_word_sizes.end(); ++it) { 248 it != state_->previous_word_sizes.end(); ++it) {
246 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; 249 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
296 299
297 void PhishingTermFeatureExtractor::Clear() { 300 void PhishingTermFeatureExtractor::Clear() {
298 page_text_ = NULL; 301 page_text_ = NULL;
299 features_ = NULL; 302 features_ = NULL;
300 done_callback_.reset(NULL); 303 done_callback_.reset(NULL);
301 state_.reset(NULL); 304 state_.reset(NULL);
302 negative_word_cache_.Clear(); 305 negative_word_cache_.Clear();
303 } 306 }
304 307
305 } // namespace safe_browsing 308 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698