OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
11 #include "base/i18n/case_conversion.h" | 11 #include "base/i18n/case_conversion.h" |
12 #include "base/logging.h" | 12 #include "base/logging.h" |
13 #include "base/message_loop.h" | 13 #include "base/message_loop.h" |
14 #include "base/metrics/histogram.h" | 14 #include "base/metrics/histogram.h" |
15 #include "base/time.h" | 15 #include "base/time.h" |
16 #include "base/utf_string_conversions.h" | 16 #include "base/utf_string_conversions.h" |
17 #include "crypto/sha2.h" | 17 #include "crypto/sha2.h" |
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
19 #include "chrome/renderer/safe_browsing/features.h" | 19 #include "chrome/renderer/safe_browsing/features.h" |
| 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
20 #include "ui/base/l10n/l10n_util.h" | 21 #include "ui/base/l10n/l10n_util.h" |
21 #include "unicode/ubrk.h" | 22 #include "unicode/ubrk.h" |
22 | 23 |
23 namespace safe_browsing { | 24 namespace safe_browsing { |
24 | 25 |
25 // This time should be short enough that it doesn't noticeably disrupt the | 26 // This time should be short enough that it doesn't noticeably disrupt the |
26 // user's interaction with the page. | 27 // user's interaction with the page. |
// NOTE(review): presumably a per-chunk time budget in milliseconds (going by
// the "Ms" suffix); the code that reads this constant is not visible in this
// view -- confirm against the extraction loop.
27 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; | 28 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; |
28 | 29 |
29 // Experimenting shows that we get a reasonable gain in performance by | 30 // Experimenting shows that we get a reasonable gain in performance by |
30 // increasing this up to around 10, but there's not much benefit in | 31 // increasing this up to around 10, but there's not much benefit in |
31 // increasing it past that. | 32 // increasing it past that. |
// NOTE(review): the value defined here (5) is below the ~10 mentioned above.
// Presumably this is how many words are processed between clock reads; the
// loop that uses it is not visible in this view -- confirm.
32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; | 33 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; |
33 | 34 |
34 // This should be longer than we expect feature extraction to take on any | 35 // This should be longer than we expect feature extraction to take on any |
35 // actual phishing page. | 36 // actual phishing page. |
// NOTE(review): presumably a total time budget in milliseconds (going by the
// "Ms" suffix) for a single extraction; what happens when it is exceeded is
// not visible in this view -- confirm.
36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; | 37 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; |
37 | 38 |
38 // The maximum size of the negative word cache. | 39 // The maximum size of the negative word cache. |
// This value is passed to negative_word_cache_ when that member is
// constructed in the PhishingTermFeatureExtractor constructor.
39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; | 40 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; |
40 | 41 |
41 // All of the state pertaining to the current feature extraction. | 42 // All of the state pertaining to the current feature extraction. |
42 struct PhishingTermFeatureExtractor::ExtractionState { | 43 struct PhishingTermFeatureExtractor::ExtractionState { |
43 // Stores up to max_words_per_ngram_ previous words separated by spaces. | 44 // Stores up to max_words_per_term_ previous words separated by spaces. |
44 std::string previous_words; | 45 std::string previous_words; |
45 | 46 |
46 // Stores the sizes of the words in previous_words. Note: the size includes | 47 // Stores the sizes of the words in previous_words. Note: the size includes |
47 // the space after each word. In other words, the sum of all sizes in this | 48 // the space after each word. In other words, the sum of all sizes in this |
48 // list is equal to the length of previous_words. | 49 // list is equal to the length of previous_words. |
49 std::list<size_t> previous_word_sizes; | 50 std::list<size_t> previous_word_sizes; |
50 | 51 |
51 // An iterator for word breaking. | 52 // An iterator for word breaking. |
52 UBreakIterator* iterator; | 53 UBreakIterator* iterator; |
53 | 54 |
(...skipping 29 matching lines...) Expand all Loading... |
83 | 84 |
84 ~ExtractionState() { | 85 ~ExtractionState() { |
85 if (iterator) { | 86 if (iterator) { |
86 ubrk_close(iterator); | 87 ubrk_close(iterator); |
87 } | 88 } |
88 } | 89 } |
89 }; | 90 }; |
90 | 91 |
// Constructs an extractor from the supplied term/word hash sets, the maximum
// number of words per term, and a clock.
//
// Differences between the two columns of this diff:
//   - OLD: page_word_hashes is a hash_set of std::string.
//   - NEW: page_word_hashes is a hash_set of uint32, and an additional
//     murmurhash3_seed argument is stored in murmurhash3_seed_.
//
// The arguments are stored directly in the corresponding members; the hash
// sets are passed by pointer and page_word_hashes_ is later dereferenced in
// HandleWord(). NOTE(review): ownership and required lifetime of the
// page_term_hashes, page_word_hashes and clock pointers are not visible
// here -- confirm against the header.
//
// negative_word_cache_ is constructed with kMaxNegativeWordCacheSize, and
// method_factory_ is constructed with |this| (hence the
// ALLOW_THIS_IN_INITIALIZER_LIST wrapper). The body calls Clear() so that the
// per-extraction members start out reset.
91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 92 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
92 const base::hash_set<std::string>* page_term_hashes, | 93 const base::hash_set<std::string>* page_term_hashes, |
93 const base::hash_set<std::string>* page_word_hashes, | 94 const base::hash_set<uint32>* page_word_hashes, |
94 size_t max_words_per_term, | 95 size_t max_words_per_term, |
| 96 uint32 murmurhash3_seed, |
95 FeatureExtractorClock* clock) | 97 FeatureExtractorClock* clock) |
96 : page_term_hashes_(page_term_hashes), | 98 : page_term_hashes_(page_term_hashes), |
97 page_word_hashes_(page_word_hashes), | 99 page_word_hashes_(page_word_hashes), |
98 max_words_per_term_(max_words_per_term), | 100 max_words_per_term_(max_words_per_term), |
| 101 murmurhash3_seed_(murmurhash3_seed), |
99 negative_word_cache_(kMaxNegativeWordCacheSize), | 102 negative_word_cache_(kMaxNegativeWordCacheSize), |
100 clock_(clock), | 103 clock_(clock), |
101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 104 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
102 Clear(); | 105 Clear(); |
103 } | 106 } |
104 | 107 |
105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 108 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
106 // The RenderView should have called CancelPendingExtraction() before | 109 // The RenderView should have called CancelPendingExtraction() before |
107 // we are destroyed. | 110 // we are destroyed. |
108 CheckNoPendingExtraction(); | 111 CheckNoPendingExtraction(); |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
199 } | 202 } |
200 // Otherwise, continue. | 203 // Otherwise, continue. |
201 } | 204 } |
202 } | 205 } |
203 RunCallback(true); | 206 RunCallback(true); |
204 } | 207 } |
205 | 208 |
206 void PhishingTermFeatureExtractor::HandleWord( | 209 void PhishingTermFeatureExtractor::HandleWord( |
207 const base::StringPiece16& word) { | 210 const base::StringPiece16& word) { |
208 // Quickest out if we have seen this word before and know that it's not | 211 // Quickest out if we have seen this word before and know that it's not |
209 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, | 212 // part of any term. This avoids the lowercasing and UTF conversion, both of |
210 // all of which are relatively expensive. | 213 // which are relatively expensive. |
211 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { | 214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { |
212 // We know we're no longer in a possible n-gram, so clear the previous word | 215 // We know we're no longer in a possible n-gram, so clear the previous word |
213 // state. | 216 // state. |
214 state_->previous_words.clear(); | 217 state_->previous_words.clear(); |
215 state_->previous_word_sizes.clear(); | 218 state_->previous_word_sizes.clear(); |
216 return; | 219 return; |
217 } | 220 } |
218 | 221 |
219 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); | 222 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
220 std::string word_hash = crypto::SHA256HashString(word_lower); | 223 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
221 | 224 |
222 // Quick out if the word is not part of any term, which is the common case. | 225 // Quick out if the word is not part of any term, which is the common case. |
223 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 226 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
224 // Word doesn't exist in our terms so we can clear the n-gram state. | 227 // Word doesn't exist in our terms so we can clear the n-gram state. |
225 state_->previous_words.clear(); | 228 state_->previous_words.clear(); |
226 state_->previous_word_sizes.clear(); | 229 state_->previous_word_sizes.clear(); |
227 // Insert into negative cache so that we don't try this again. | 230 // Insert into negative cache so that we don't try this again. |
228 negative_word_cache_.Put(word, true); | 231 negative_word_cache_.Put(word, true); |
229 return; | 232 return; |
230 } | 233 } |
231 | 234 |
232 // Find all of the n-grams that we need to check and compute their hashes. | 235 // Find all of the n-grams that we need to check and compute their SHA-256 |
233 // We already have the hash for word_lower, so we don't compute that again. | 236 // hashes. |
234 std::map<std::string /* hash */, std::string /* plaintext */> | 237 std::map<std::string /* hash */, std::string /* plaintext */> |
235 hashes_to_check; | 238 hashes_to_check; |
236 hashes_to_check[word_hash] = word_lower; | 239 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower; |
237 | 240 |
238 // Combine the new word with the previous words to find additional n-grams. | 241 // Combine the new word with the previous words to find additional n-grams. |
239 // Note that we don't yet add the new word length to previous_word_sizes, | 242 // Note that we don't yet add the new word length to previous_word_sizes, |
240 // since we don't want to compute the hash for the word by itself again. | 243 // since we don't want to compute the hash for the word by itself again. |
241 // | 244 // |
242 state_->previous_words.append(word_lower); | 245 state_->previous_words.append(word_lower); |
243 std::string current_term = state_->previous_words; | 246 std::string current_term = state_->previous_words; |
244 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 247 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
245 it != state_->previous_word_sizes.end(); ++it) { | 248 it != state_->previous_word_sizes.end(); ++it) { |
246 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; | 249 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
296 | 299 |
// Resets the per-extraction members: page_text_ and features_ are set to
// NULL, done_callback_ and state_ are reset to NULL, and the negative word
// cache is emptied. Called from the constructor to establish the initial
// state. NOTE(review): other call sites (e.g. after an extraction finishes or
// is cancelled) are not visible in this view -- confirm.
297 void PhishingTermFeatureExtractor::Clear() { | 300 void PhishingTermFeatureExtractor::Clear() { |
298 page_text_ = NULL; | 301 page_text_ = NULL; |
299 features_ = NULL; | 302 features_ = NULL; |
300 done_callback_.reset(NULL); | 303 done_callback_.reset(NULL); |
301 state_.reset(NULL); | 304 state_.reset(NULL); |
302 negative_word_cache_.Clear(); | 305 negative_word_cache_.Clear(); |
303 } | 306 } |
304 | 307 |
305 } // namespace safe_browsing | 308 } // namespace safe_browsing |
OLD | NEW |