| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <list> | 7 #include <list> |
| 8 #include <map> | 8 #include <map> |
| 9 | 9 |
| 10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
| (...skipping 17 matching lines...) Expand all Loading... |
| 28 | 28 |
| 29 // Experimenting shows that we get a reasonable gain in performance by | 29 // Experimenting shows that we get a reasonable gain in performance by |
| 30 // increasing this up to around 10, but there's not much benefit in | 30 // increasing this up to around 10, but there's not much benefit in |
| 31 // increasing it past that. | 31 // increasing it past that. |
| 32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10; | 32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10; |
| 33 | 33 |
| 34 // This should be longer than we expect feature extraction to take on any | 34 // This should be longer than we expect feature extraction to take on any |
| 35 // actual phishing page. | 35 // actual phishing page. |
| 36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; | 36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; |
| 37 | 37 |
| 38 // The maximum size of the negative word cache. |
| 39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; |
| 40 |
| 38 // All of the state pertaining to the current feature extraction. | 41 // All of the state pertaining to the current feature extraction. |
| 39 struct PhishingTermFeatureExtractor::ExtractionState { | 42 struct PhishingTermFeatureExtractor::ExtractionState { |
| 40 // Stores up to max_words_per_ngram_ previous words separated by spaces. | 43 // Stores up to max_words_per_ngram_ previous words separated by spaces. |
| 41 std::string previous_words; | 44 std::string previous_words; |
| 42 | 45 |
| 43 // Stores the sizes of the words in previous_words. Note: the size includes | 46 // Stores the sizes of the words in previous_words. Note: the size includes |
| 44 // the space after each word. In other words, the sum of all sizes in this | 47 // the space after each word. In other words, the sum of all sizes in this |
| 45 // list is equal to the length of previous_words. | 48 // list is equal to the length of previous_words. |
| 46 std::list<size_t> previous_word_sizes; | 49 std::list<size_t> previous_word_sizes; |
| 47 | 50 |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 86 }; | 89 }; |
| 87 | 90 |
| 88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| 89 const base::hash_set<std::string>* page_term_hashes, | 92 const base::hash_set<std::string>* page_term_hashes, |
| 90 const base::hash_set<std::string>* page_word_hashes, | 93 const base::hash_set<std::string>* page_word_hashes, |
| 91 size_t max_words_per_term, | 94 size_t max_words_per_term, |
| 92 FeatureExtractorClock* clock) | 95 FeatureExtractorClock* clock) |
| 93 : page_term_hashes_(page_term_hashes), | 96 : page_term_hashes_(page_term_hashes), |
| 94 page_word_hashes_(page_word_hashes), | 97 page_word_hashes_(page_word_hashes), |
| 95 max_words_per_term_(max_words_per_term), | 98 max_words_per_term_(max_words_per_term), |
| 99 negative_word_cache_(kMaxNegativeWordCacheSize), |
| 96 clock_(clock), | 100 clock_(clock), |
| 97 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
| 98 Clear(); | 102 Clear(); |
| 99 } | 103 } |
| 100 | 104 |
| 101 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
| 102 // The RenderView should have called CancelPendingExtraction() before | 106 // The RenderView should have called CancelPendingExtraction() before |
| 103 // we are destroyed. | 107 // we are destroyed. |
| 104 CheckNoPendingExtraction(); | 108 CheckNoPendingExtraction(); |
| 105 } | 109 } |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 152 return; | 156 return; |
| 153 } | 157 } |
| 154 state_->position_initialized = true; | 158 state_->position_initialized = true; |
| 155 } | 159 } |
| 156 | 160 |
| 157 int num_words = 0; | 161 int num_words = 0; |
| 158 for (int next = ubrk_next(state_->iterator); | 162 for (int next = ubrk_next(state_->iterator); |
| 159 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { | 163 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { |
| 160 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { | 164 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { |
| 161 // next is now positioned at the end of a word. | 165 // next is now positioned at the end of a word. |
| 162 HandleWord(string16(*page_text_, state_->position, | 166 HandleWord(base::StringPiece16(page_text_->data() + state_->position, |
| 163 next - state_->position)); | 167 next - state_->position)); |
| 164 ++num_words; | 168 ++num_words; |
| 165 } | 169 } |
| 166 state_->position = next; | 170 state_->position = next; |
| 167 | 171 |
| 168 if (num_words >= kClockCheckGranularity) { | 172 if (num_words >= kClockCheckGranularity) { |
| 169 num_words = 0; | 173 num_words = 0; |
| 170 base::TimeTicks now = clock_->Now(); | 174 base::TimeTicks now = clock_->Now(); |
| 171 if (now - state_->start_time >= | 175 if (now - state_->start_time >= |
| 172 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 176 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
| 173 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 177 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
| (...skipping 15 matching lines...) Expand all Loading... |
| 189 chunk_elapsed); | 193 chunk_elapsed); |
| 190 MessageLoop::current()->PostTask( | 194 MessageLoop::current()->PostTask( |
| 191 FROM_HERE, | 195 FROM_HERE, |
| 192 method_factory_.NewRunnableMethod( | 196 method_factory_.NewRunnableMethod( |
| 193 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); | 197 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 194 return; | 198 return; |
| 195 } | 199 } |
| 196 // Otherwise, continue. | 200 // Otherwise, continue. |
| 197 } | 201 } |
| 198 } | 202 } |
| 203 // We need to clear the cache because the data that it depends on (page_text_) |
| 204 // is going away. |
| 205 negative_word_cache_.Clear(); |
| 199 RunCallback(true); | 206 RunCallback(true); |
| 200 } | 207 } |
| 201 | 208 |
| 202 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { | 209 void PhishingTermFeatureExtractor::HandleWord( |
| 210 const base::StringPiece16& word) { |
| 211 // Quickest out if we have seen this word before and know that it's not |
| 212 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, |
| 213 // all of which are relatively expensive. |
| 214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { |
| 215 return; |
| 216 } |
| 217 |
| 203 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); | 218 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
| 204 std::string word_hash = crypto::SHA256HashString(word_lower); | 219 std::string word_hash = crypto::SHA256HashString(word_lower); |
| 205 | 220 |
| 206 // Quick out if the word is not part of any term, which is the common case. | 221 // Quick out if the word is not part of any term, which is the common case. |
| 207 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 222 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 208 // Word doesn't exist in our terms so we can clear the n-gram state. | 223 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 209 state_->previous_words.clear(); | 224 state_->previous_words.clear(); |
| 210 state_->previous_word_sizes.clear(); | 225 state_->previous_word_sizes.clear(); |
| 226 // Insert into negative cache so that we don't try this again. |
| 227 negative_word_cache_.Put(word, true); |
| 211 return; | 228 return; |
| 212 } | 229 } |
| 213 | 230 |
| 214 // Find all of the n-grams that we need to check and compute their hashes. | 231 // Find all of the n-grams that we need to check and compute their hashes. |
| 215 // We already have the hash for word_lower, so we don't compute that again. | 232 // We already have the hash for word_lower, so we don't compute that again. |
| 216 std::map<std::string /* hash */, std::string /* plaintext */> | 233 std::map<std::string /* hash */, std::string /* plaintext */> |
| 217 hashes_to_check; | 234 hashes_to_check; |
| 218 hashes_to_check[word_hash] = word_lower; | 235 hashes_to_check[word_hash] = word_lower; |
| 219 | 236 |
| 220 // Combine the new word with the previous words to find additional n-grams. | 237 // Combine the new word with the previous words to find additional n-grams. |
| 221 // Note that we don't yet add the new word length to previous_word_sizes, | 238 // Note that we don't yet add the new word length to previous_word_sizes, |
| 222 // since we don't want to compute the hash for the word by itself again. | 239 // since we don't want to compute the hash for the word by itself again. |
| 223 // | 240 // |
| 224 // TODO(bryner): Use UMA stats to determine whether this is too slow. | |
| 225 // If it is, there are a couple of cases that we could optimize: | |
| 226 // - We could cache plaintext words that are not in page_word_hashes_, so | |
| 227 // that we can avoid hashing these again. | |
| 228 // - We could include positional information about words in the n-grams, | |
| 229 // rather than just a list of all of the words. For example, we could | |
| 230 // change the term format so that each word is hashed separately, or | |
| 231 // we could add extra data to the word list to indicate the position | |
| 232 // at which the word appears in an n-gram, and skip checking the word if | |
| 233 // it's not at that position. | |
| 234 state_->previous_words.append(word_lower); | 241 state_->previous_words.append(word_lower); |
| 235 std::string current_term = state_->previous_words; | 242 std::string current_term = state_->previous_words; |
| 236 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 243 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
| 237 it != state_->previous_word_sizes.end(); ++it) { | 244 it != state_->previous_word_sizes.end(); ++it) { |
| 238 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; | 245 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
| 239 current_term.erase(0, *it); | 246 current_term.erase(0, *it); |
| 240 } | 247 } |
| 241 | 248 |
| 242 // Add features for any hashes that match page_term_hashes_. | 249 // Add features for any hashes that match page_term_hashes_. |
| 243 for (std::map<std::string, std::string>::iterator it = | 250 for (std::map<std::string, std::string>::iterator it = |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 287 } | 294 } |
| 288 | 295 |
| 289 void PhishingTermFeatureExtractor::Clear() { | 296 void PhishingTermFeatureExtractor::Clear() { |
| 290 page_text_ = NULL; | 297 page_text_ = NULL; |
| 291 features_ = NULL; | 298 features_ = NULL; |
| 292 done_callback_.reset(NULL); | 299 done_callback_.reset(NULL); |
| 293 state_.reset(NULL); | 300 state_.reset(NULL); |
| 294 } | 301 } |
| 295 | 302 |
| 296 } // namespace safe_browsing | 303 } // namespace safe_browsing |
| OLD | NEW |