| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <list> | 7 #include <list> |
| 8 #include <map> | 8 #include <map> |
| 9 | 9 |
| 10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
| 11 #include "base/i18n/case_conversion.h" | 11 #include "base/i18n/case_conversion.h" |
| 12 #include "base/logging.h" | 12 #include "base/logging.h" |
| 13 #include "base/message_loop.h" | 13 #include "base/message_loop.h" |
| 14 #include "base/metrics/histogram.h" | 14 #include "base/metrics/histogram.h" |
| 15 #include "base/time.h" | 15 #include "base/time.h" |
| 16 #include "base/utf_string_conversions.h" | 16 #include "base/utf_string_conversions.h" |
| 17 #include "crypto/sha2.h" | 17 #include "crypto/sha2.h" |
| 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 19 #include "chrome/renderer/safe_browsing/features.h" | 19 #include "chrome/renderer/safe_browsing/features.h" |
| 20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" |
| 20 #include "ui/base/l10n/l10n_util.h" | 21 #include "ui/base/l10n/l10n_util.h" |
| 21 #include "unicode/ubrk.h" | 22 #include "unicode/ubrk.h" |
| 22 | 23 |
| 23 namespace safe_browsing { | 24 namespace safe_browsing { |
| 24 | 25 |
| 25 // This time should be short enough that it doesn't noticeably disrupt the | 26 // This time should be short enough that it doesn't noticeably disrupt the |
| 26 // user's interaction with the page. | 27 // user's interaction with the page. |
| 27 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; | 28 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; |
| 28 | 29 |
| 29 // Experimenting shows that we get a reasonable gain in performance by | 30 // Experimenting shows that we get a reasonable gain in performance by |
| 30 // increasing this up to around 10, but there's not much benefit in | 31 // increasing this up to around 10, but there's not much benefit in |
| 31 // increasing it past that. | 32 // increasing it past that. |
| 32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; | 33 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; |
| 33 | 34 |
| 34 // This should be longer than we expect feature extraction to take on any | 35 // This should be longer than we expect feature extraction to take on any |
| 35 // actual phishing page. | 36 // actual phishing page. |
| 36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; | 37 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; |
| 37 | 38 |
| 38 // The maximum size of the negative word cache. | 39 // The maximum size of the negative word cache. |
| 39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; | 40 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; |
| 40 | 41 |
| 41 // All of the state pertaining to the current feature extraction. | 42 // All of the state pertaining to the current feature extraction. |
| 42 struct PhishingTermFeatureExtractor::ExtractionState { | 43 struct PhishingTermFeatureExtractor::ExtractionState { |
| 43 // Stores up to max_words_per_ngram_ previous words separated by spaces. | 44 // Stores up to max_words_per_term_ previous words separated by spaces. |
| 44 std::string previous_words; | 45 std::string previous_words; |
| 45 | 46 |
| 46 // Stores the sizes of the words in previous_words. Note: the size includes | 47 // Stores the sizes of the words in previous_words. Note: the size includes |
| 47 // the space after each word. In other words, the sum of all sizes in this | 48 // the space after each word. In other words, the sum of all sizes in this |
| 48 // list is equal to the length of previous_words. | 49 // list is equal to the length of previous_words. |
| 49 std::list<size_t> previous_word_sizes; | 50 std::list<size_t> previous_word_sizes; |
| 50 | 51 |
| 51 // An iterator for word breaking. | 52 // An iterator for word breaking. |
| 52 UBreakIterator* iterator; | 53 UBreakIterator* iterator; |
| 53 | 54 |
| (...skipping 29 matching lines...) |
| 83 | 84 |
| 84 ~ExtractionState() { | 85 ~ExtractionState() { |
| 85 if (iterator) { | 86 if (iterator) { |
| 86 ubrk_close(iterator); | 87 ubrk_close(iterator); |
| 87 } | 88 } |
| 88 } | 89 } |
| 89 }; | 90 }; |
| 90 | 91 |
| 91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 92 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| 92 const base::hash_set<std::string>* page_term_hashes, | 93 const base::hash_set<std::string>* page_term_hashes, |
| 93 const base::hash_set<std::string>* page_word_hashes, | 94 const base::hash_set<uint32>* page_word_hashes, |
| 94 size_t max_words_per_term, | 95 size_t max_words_per_term, |
| 96 uint32 murmurhash3_seed, |
| 95 FeatureExtractorClock* clock) | 97 FeatureExtractorClock* clock) |
| 96 : page_term_hashes_(page_term_hashes), | 98 : page_term_hashes_(page_term_hashes), |
| 97 page_word_hashes_(page_word_hashes), | 99 page_word_hashes_(page_word_hashes), |
| 98 max_words_per_term_(max_words_per_term), | 100 max_words_per_term_(max_words_per_term), |
| 101 murmurhash3_seed_(murmurhash3_seed), |
| 99 negative_word_cache_(kMaxNegativeWordCacheSize), | 102 negative_word_cache_(kMaxNegativeWordCacheSize), |
| 100 clock_(clock), | 103 clock_(clock), |
| 101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 104 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
| 102 Clear(); | 105 Clear(); |
| 103 } | 106 } |
| 104 | 107 |
| 105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 108 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
| 106 // The RenderView should have called CancelPendingExtraction() before | 109 // The RenderView should have called CancelPendingExtraction() before |
| 107 // we are destroyed. | 110 // we are destroyed. |
| 108 CheckNoPendingExtraction(); | 111 CheckNoPendingExtraction(); |
| (...skipping 90 matching lines...) |
| 199 } | 202 } |
| 200 // Otherwise, continue. | 203 // Otherwise, continue. |
| 201 } | 204 } |
| 202 } | 205 } |
| 203 RunCallback(true); | 206 RunCallback(true); |
| 204 } | 207 } |
| 205 | 208 |
| 206 void PhishingTermFeatureExtractor::HandleWord( | 209 void PhishingTermFeatureExtractor::HandleWord( |
| 207 const base::StringPiece16& word) { | 210 const base::StringPiece16& word) { |
| 208 // Quickest out if we have seen this word before and know that it's not | 211 // Quickest out if we have seen this word before and know that it's not |
| 209 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, | 212 // part of any term. This avoids the lowercasing and UTF conversion, both of |
| 210 // all of which are relatively expensive. | 213 // which are relatively expensive. |
| 211 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { | 214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { |
| 212 // We know we're no longer in a possible n-gram, so clear the previous word | 215 // We know we're no longer in a possible n-gram, so clear the previous word |
| 213 // state. | 216 // state. |
| 214 state_->previous_words.clear(); | 217 state_->previous_words.clear(); |
| 215 state_->previous_word_sizes.clear(); | 218 state_->previous_word_sizes.clear(); |
| 216 return; | 219 return; |
| 217 } | 220 } |
| 218 | 221 |
| 219 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); | 222 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
| 220 std::string word_hash = crypto::SHA256HashString(word_lower); | 223 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
| 221 | 224 |
| 222 // Quick out if the word is not part of any term, which is the common case. | 225 // Quick out if the word is not part of any term, which is the common case. |
| 223 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 226 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 224 // Word doesn't exist in our terms so we can clear the n-gram state. | 227 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 225 state_->previous_words.clear(); | 228 state_->previous_words.clear(); |
| 226 state_->previous_word_sizes.clear(); | 229 state_->previous_word_sizes.clear(); |
| 227 // Insert into negative cache so that we don't try this again. | 230 // Insert into negative cache so that we don't try this again. |
| 228 negative_word_cache_.Put(word, true); | 231 negative_word_cache_.Put(word, true); |
| 229 return; | 232 return; |
| 230 } | 233 } |
| 231 | 234 |
| 232 // Find all of the n-grams that we need to check and compute their hashes. | 235 // Find all of the n-grams that we need to check and compute their SHA-256 |
| 233 // We already have the hash for word_lower, so we don't compute that again. | 236 // hashes. |
| 234 std::map<std::string /* hash */, std::string /* plaintext */> | 237 std::map<std::string /* hash */, std::string /* plaintext */> |
| 235 hashes_to_check; | 238 hashes_to_check; |
| 236 hashes_to_check[word_hash] = word_lower; | 239 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower; |
| 237 | 240 |
| 238 // Combine the new word with the previous words to find additional n-grams. | 241 // Combine the new word with the previous words to find additional n-grams. |
| 239 // Note that we don't yet add the new word length to previous_word_sizes, | 242 // Note that we don't yet add the new word length to previous_word_sizes, |
| 240 // since we don't want to compute the hash for the word by itself again. | 243 // since we don't want to compute the hash for the word by itself again. |
| 241 // | 244 // |
| 242 state_->previous_words.append(word_lower); | 245 state_->previous_words.append(word_lower); |
| 243 std::string current_term = state_->previous_words; | 246 std::string current_term = state_->previous_words; |
| 244 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 247 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
| 245 it != state_->previous_word_sizes.end(); ++it) { | 248 it != state_->previous_word_sizes.end(); ++it) { |
| 246 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; | 249 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
| (...skipping 49 matching lines...) |
| 296 | 299 |
| 297 void PhishingTermFeatureExtractor::Clear() { | 300 void PhishingTermFeatureExtractor::Clear() { |
| 298 page_text_ = NULL; | 301 page_text_ = NULL; |
| 299 features_ = NULL; | 302 features_ = NULL; |
| 300 done_callback_.reset(NULL); | 303 done_callback_.reset(NULL); |
| 301 state_.reset(NULL); | 304 state_.reset(NULL); |
| 302 negative_word_cache_.Clear(); | 305 negative_word_cache_.Clear(); |
| 303 } | 306 } |
| 304 | 307 |
| 305 } // namespace safe_browsing | 308 } // namespace safe_browsing |
| OLD | NEW |