| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <list> | 7 #include <list> |
| 8 #include <map> | 8 #include <map> |
| 9 | 9 |
| 10 #include "base/bind.h" | 10 #include "base/bind.h" |
| (...skipping 19 matching lines...) Expand all Loading... |
| 30 | 30 |
| 31 // Experimenting shows that we get a reasonable gain in performance by | 31 // Experimenting shows that we get a reasonable gain in performance by |
| 32 // increasing this up to around 10, but there's not much benefit in | 32 // increasing this up to around 10, but there's not much benefit in |
| 33 // increasing it past that. | 33 // increasing it past that. |
| 34 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; | 34 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5; |
| 35 | 35 |
| 36 // This should be longer than we expect feature extraction to take on any | 36 // This should be longer than we expect feature extraction to take on any |
| 37 // actual phishing page. | 37 // actual phishing page. |
| 38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; | 38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; |
| 39 | 39 |
| 40 // The maximum size of the negative word cache. | |
| 41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; | |
| 42 | |
| 43 // All of the state pertaining to the current feature extraction. | 40 // All of the state pertaining to the current feature extraction. |
| 44 struct PhishingTermFeatureExtractor::ExtractionState { | 41 struct PhishingTermFeatureExtractor::ExtractionState { |
| 45 // Stores up to max_words_per_term_ previous words separated by spaces. | 42 // Stores up to max_words_per_term_ previous words separated by spaces. |
| 46 std::string previous_words; | 43 std::string previous_words; |
| 47 | 44 |
| 45 // Stores the current shingle after a new word is processed and added in. |
| 46 std::string current_shingle; |
| 47 |
| 48 // Stores the sizes of the words in current_shingle. Note: the size includes |
| 49 // the space after each word. In other words, the sum of all sizes in this |
| 50 // list is equal to the length of current_shingle. |
| 51 std::list<size_t> shingle_word_sizes; |
| 52 |
| 48 // Stores the sizes of the words in previous_words. Note: the size includes | 53 // Stores the sizes of the words in previous_words. Note: the size includes |
| 49 // the space after each word. In other words, the sum of all sizes in this | 54 // the space after each word. In other words, the sum of all sizes in this |
| 50 // list is equal to the length of previous_words. | 55 // list is equal to the length of previous_words. |
| 51 std::list<size_t> previous_word_sizes; | 56 std::list<size_t> previous_word_sizes; |
| 52 | 57 |
| 53 // An iterator for word breaking. | 58 // An iterator for word breaking. |
| 54 UBreakIterator* iterator; | 59 UBreakIterator* iterator; |
| 55 | 60 |
| 56 // Our current position in the text that was passed to the ExtractionState | 61 // Our current position in the text that was passed to the ExtractionState |
| 57 // constructor, specifically, the most recent break position returned by our | 62 // constructor, specifically, the most recent break position returned by our |
| (...skipping 30 matching lines...) Expand all Loading... |
| 88 ubrk_close(iterator); | 93 ubrk_close(iterator); |
| 89 } | 94 } |
| 90 } | 95 } |
| 91 }; | 96 }; |
| 92 | 97 |
| 93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 98 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| 94 const base::hash_set<std::string>* page_term_hashes, | 99 const base::hash_set<std::string>* page_term_hashes, |
| 95 const base::hash_set<uint32>* page_word_hashes, | 100 const base::hash_set<uint32>* page_word_hashes, |
| 96 size_t max_words_per_term, | 101 size_t max_words_per_term, |
| 97 uint32 murmurhash3_seed, | 102 uint32 murmurhash3_seed, |
| 103 size_t max_shingles_per_page, |
| 104 size_t shingle_size, |
| 98 FeatureExtractorClock* clock) | 105 FeatureExtractorClock* clock) |
| 99 : page_term_hashes_(page_term_hashes), | 106 : page_term_hashes_(page_term_hashes), |
| 100 page_word_hashes_(page_word_hashes), | 107 page_word_hashes_(page_word_hashes), |
| 101 max_words_per_term_(max_words_per_term), | 108 max_words_per_term_(max_words_per_term), |
| 102 murmurhash3_seed_(murmurhash3_seed), | 109 murmurhash3_seed_(murmurhash3_seed), |
| 103 negative_word_cache_(kMaxNegativeWordCacheSize), | 110 max_shingles_per_page_(max_shingles_per_page), |
| 111 shingle_size_(shingle_size), |
| 104 clock_(clock), | 112 clock_(clock), |
| 105 weak_factory_(this) { | 113 weak_factory_(this) { |
| 106 Clear(); | 114 Clear(); |
| 107 } | 115 } |
| 108 | 116 |
| 109 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 117 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
| 110 // The RenderView should have called CancelPendingExtraction() before | 118 // The RenderView should have called CancelPendingExtraction() before |
| 111 // we are destroyed. | 119 // we are destroyed. |
| 112 CheckNoPendingExtraction(); | 120 CheckNoPendingExtraction(); |
| 113 } | 121 } |
| 114 | 122 |
| 115 void PhishingTermFeatureExtractor::ExtractFeatures( | 123 void PhishingTermFeatureExtractor::ExtractFeatures( |
| 116 const base::string16* page_text, | 124 const base::string16* page_text, |
| 117 FeatureMap* features, | 125 FeatureMap* features, |
| 126 std::set<uint32>* shingle_hashes, |
| 118 const DoneCallback& done_callback) { | 127 const DoneCallback& done_callback) { |
| 119 // The RenderView should have called CancelPendingExtraction() before | 128 // The RenderView should have called CancelPendingExtraction() before |
| 120 // starting a new extraction, so DCHECK this. | 129 // starting a new extraction, so DCHECK this. |
| 121 CheckNoPendingExtraction(); | 130 CheckNoPendingExtraction(); |
| 122 // However, in an opt build, we will go ahead and clean up the pending | 131 // However, in an opt build, we will go ahead and clean up the pending |
| 123 // extraction so that we can start in a known state. | 132 // extraction so that we can start in a known state. |
| 124 CancelPendingExtraction(); | 133 CancelPendingExtraction(); |
| 125 | 134 |
| 126 page_text_ = page_text; | 135 page_text_ = page_text; |
| 127 features_ = features; | 136 features_ = features; |
| 137 shingle_hashes_ = shingle_hashes; |
| 128 done_callback_ = done_callback; | 138 done_callback_ = done_callback; |
| 129 | 139 |
| 130 state_.reset(new ExtractionState(*page_text_, clock_->Now())); | 140 state_.reset(new ExtractionState(*page_text_, clock_->Now())); |
| 131 base::MessageLoop::current()->PostTask( | 141 base::MessageLoop::current()->PostTask( |
| 132 FROM_HERE, | 142 FROM_HERE, |
| 133 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, | 143 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, |
| 134 weak_factory_.GetWeakPtr())); | 144 weak_factory_.GetWeakPtr())); |
| 135 } | 145 } |
| 136 | 146 |
| 137 void PhishingTermFeatureExtractor::CancelPendingExtraction() { | 147 void PhishingTermFeatureExtractor::CancelPendingExtraction() { |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 203 return; | 213 return; |
| 204 } | 214 } |
| 205 // Otherwise, continue. | 215 // Otherwise, continue. |
| 206 } | 216 } |
| 207 } | 217 } |
| 208 RunCallback(true); | 218 RunCallback(true); |
| 209 } | 219 } |
| 210 | 220 |
| 211 void PhishingTermFeatureExtractor::HandleWord( | 221 void PhishingTermFeatureExtractor::HandleWord( |
| 212 const base::StringPiece16& word) { | 222 const base::StringPiece16& word) { |
| 213 // Quickest out if we have seen this word before and know that it's not | 223 // First, extract shingle hashes. |
| 214 // part of any term. This avoids the lowercasing and UTF conversion, both of | 224 const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); |
| 215 // which are relatively expensive. | 225 state_->current_shingle.append(word_lower + " "); |
| 216 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { | 226 state_->shingle_word_sizes.push_back(word_lower.size() + 1); |
| 217 // We know we're no longer in a possible n-gram, so clear the previous word | 227 if (state_->shingle_word_sizes.size() == shingle_size_) { |
| 218 // state. | 228 shingle_hashes_->insert( |
| 219 state_->previous_words.clear(); | 229 MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); |
| 220 state_->previous_word_sizes.clear(); | 230 state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); |
| 221 return; | 231 state_->shingle_word_sizes.pop_front(); |
| 232 } |
| 233 // Check if the size of shingle hashes is over the limit. |
| 234 if (shingle_hashes_->size() > max_shingles_per_page_) { |
| 235 // Pop the largest one. |
| 236 std::set<uint32>::iterator it = shingle_hashes_->end(); |
| 237 shingle_hashes_->erase(--it); |
| 222 } | 238 } |
| 223 | 239 |
| 224 std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); | 240 // Next, extract page terms. |
| 225 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); | 241 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
| 226 | 242 |
| 227 // Quick out if the word is not part of any term, which is the common case. | 243 // Quick out if the word is not part of any term, which is the common case. |
| 228 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 244 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 229 // Word doesn't exist in our terms so we can clear the n-gram state. | 245 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 230 state_->previous_words.clear(); | 246 state_->previous_words.clear(); |
| 231 state_->previous_word_sizes.clear(); | 247 state_->previous_word_sizes.clear(); |
| 232 // Insert into negative cache so that we don't try this again. | |
| 233 negative_word_cache_.Put(word, true); | |
| 234 return; | 248 return; |
| 235 } | 249 } |
| 236 | 250 |
| 237 // Find all of the n-grams that we need to check and compute their SHA-256 | 251 // Find all of the n-grams that we need to check and compute their SHA-256 |
| 238 // hashes. | 252 // hashes. |
| 239 std::map<std::string /* hash */, std::string /* plaintext */> | 253 std::map<std::string /* hash */, std::string /* plaintext */> |
| 240 hashes_to_check; | 254 hashes_to_check; |
| 241 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower; | 255 hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower; |
| 242 | 256 |
| 243 // Combine the new word with the previous words to find additional n-grams. | 257 // Combine the new word with the previous words to find additional n-grams. |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 295 clock_->Now() - state_->start_time); | 309 clock_->Now() - state_->start_time); |
| 296 | 310 |
| 297 DCHECK(!done_callback_.is_null()); | 311 DCHECK(!done_callback_.is_null()); |
| 298 done_callback_.Run(success); | 312 done_callback_.Run(success); |
| 299 Clear(); | 313 Clear(); |
| 300 } | 314 } |
| 301 | 315 |
| 302 void PhishingTermFeatureExtractor::Clear() { | 316 void PhishingTermFeatureExtractor::Clear() { |
| 303 page_text_ = NULL; | 317 page_text_ = NULL; |
| 304 features_ = NULL; | 318 features_ = NULL; |
| 319 shingle_hashes_ = NULL; |
| 305 done_callback_.Reset(); | 320 done_callback_.Reset(); |
| 306 state_.reset(NULL); | 321 state_.reset(NULL); |
| 307 negative_word_cache_.Clear(); | |
| 308 } | 322 } |
| 309 | 323 |
| 310 } // namespace safe_browsing | 324 } // namespace safe_browsing |
| OLD | NEW |