OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
(...skipping 75 matching lines...)
86 }; | 86 }; |
87 | 87 |
88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
89 const base::hash_set<std::string>* page_term_hashes, | 89 const base::hash_set<std::string>* page_term_hashes, |
90 const base::hash_set<std::string>* page_word_hashes, | 90 const base::hash_set<std::string>* page_word_hashes, |
91 size_t max_words_per_term, | 91 size_t max_words_per_term, |
92 FeatureExtractorClock* clock) | 92 FeatureExtractorClock* clock) |
93 : page_term_hashes_(page_term_hashes), | 93 : page_term_hashes_(page_term_hashes), |
94 page_word_hashes_(page_word_hashes), | 94 page_word_hashes_(page_word_hashes), |
95 max_words_per_term_(max_words_per_term), | 95 max_words_per_term_(max_words_per_term), |
96 negative_word_cache_(1000 /* max_size */), | |
Brian Ryner (2011/08/08 22:08:00):
Maybe make this a class constant, like the other k constants.

Garrett Casto (2011/08/08 23:19:51):
Done.
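For context, the suggestion above would presumably land as something like the following; the constant name here is hypothetical, since the final name is not visible in this diff:

    // In the anonymous namespace at the top of the file, alongside the
    // other k-constants (name is illustrative, not from the patch):
    const int kMaxNegativeWordCacheSize = 1000;

    // ...and in the constructor's initializer list:
    //   negative_word_cache_(kMaxNegativeWordCacheSize),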
96 clock_(clock), | 97 clock_(clock), |
97 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 98 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
98 Clear(); | 99 Clear(); |
99 } | 100 } |
100 | 101 |
101 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 102 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
102 // The RenderView should have called CancelPendingExtraction() before | 103 // The RenderView should have called CancelPendingExtraction() before |
103 // we are destroyed. | 104 // we are destroyed. |
104 CheckNoPendingExtraction(); | 105 CheckNoPendingExtraction(); |
105 } | 106 } |
(...skipping 46 matching lines...)
152 return; | 153 return; |
153 } | 154 } |
154 state_->position_initialized = true; | 155 state_->position_initialized = true; |
155 } | 156 } |
156 | 157 |
157 int num_words = 0; | 158 int num_words = 0; |
158 for (int next = ubrk_next(state_->iterator); | 159 for (int next = ubrk_next(state_->iterator); |
159 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { | 160 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { |
160 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { | 161 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { |
161 // next is now positioned at the end of a word. | 162 // next is now positioned at the end of a word. |
162 HandleWord(string16(*page_text_, state_->position, | 163 HandleWord(base::StringPiece16(page_text_->data() + state_->position, |
163 next - state_->position)); | 164 next - state_->position)); |
164 ++num_words; | 165 ++num_words; |
165 } | 166 } |
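The substantive change in this hunk: the OLD side copied each word into a fresh string16, while the NEW side passes a non-owning view into page_text_. A minimal sketch of the difference (variable names are illustrative):

    // OLD: allocates a new buffer and copies the word's characters out.
    string16 word_copy(*page_text_, state_->position, next - state_->position);

    // NEW: a (pointer, length) pair aliasing page_text_'s buffer; no copy,
    // but only valid while page_text_ itself stays alive.
    base::StringPiece16 word_view(page_text_->data() + state_->position,
                                  next - state_->position);

That lifetime caveat is what forces the negative_word_cache_.Clear() call discussed further down.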
166 state_->position = next; | 167 state_->position = next; |
167 | 168 |
168 if (num_words >= kClockCheckGranularity) { | 169 if (num_words >= kClockCheckGranularity) { |
169 num_words = 0; | 170 num_words = 0; |
170 base::TimeTicks now = clock_->Now(); | 171 base::TimeTicks now = clock_->Now(); |
171 if (now - state_->start_time >= | 172 if (now - state_->start_time >= |
172 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 173 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
173 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 174 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
(...skipping 15 matching lines...)
189 chunk_elapsed); | 190 chunk_elapsed); |
190 MessageLoop::current()->PostTask( | 191 MessageLoop::current()->PostTask( |
191 FROM_HERE, | 192 FROM_HERE, |
192 method_factory_.NewRunnableMethod( | 193 method_factory_.NewRunnableMethod( |
193 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); | 194 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); |
194 return; | 195 return; |
195 } | 196 } |
196 // Otherwise, continue. | 197 // Otherwise, continue. |
197 } | 198 } |
198 } | 199 } |
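The loop above is a cooperative time-slicing pattern: the clock is consulted only once every kClockCheckGranularity words, extraction is abandoned outright past a total budget, and otherwise the extractor yields by posting a continuation task. A condensed sketch of the two checks; kMaxTimePerChunkMs, chunk_start_time, and the RunCallback(false) failure path live in the elided lines and are assumptions here:

    if (now - state_->start_time >=
        base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
      RunCallback(false);  // Hard cap on total work: give up on this page.
      return;
    }
    if (now - chunk_start_time >=
        base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
      // Soft per-chunk cap: yield the renderer thread, resume in a new task.
      MessageLoop::current()->PostTask(
          FROM_HERE,
          method_factory_.NewRunnableMethod(
              &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
      return;
    }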
200 // We need to clear the cache because the data that it depends on (page_text_) | |
201 // is going away. | |
202 negative_word_cache_.Clear(); | |
Brian Ryner (2011/08/08 22:08:00):
Hm... I could imagine there being some benefit to

Garrett Casto (2011/08/08 23:19:51):
So I thought about this as well. It's rather hard
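The Clear() is required as written because the cache keys are StringPiece16 views that alias page_text_; once page_text_ is freed the keys dangle. Keeping the cache warm across pages, as floated above, would mean paying for owning keys instead. A sketch of that hypothetical variant, assuming an MRU-style cache consistent with the Get()/Put()/end() calls below:

    // Owning-key variant (not what this patch does): each inserted word is
    // copied once, so entries survive the death of page_text_.
    base::MRUCache<string16, bool> negative_word_cache_;
    ...
    negative_word_cache_.Put(string16(word.data(), word.size()), true);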
199 RunCallback(true); | 203 RunCallback(true); |
200 } | 204 } |
201 | 205 |
202 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { | 206 void PhishingTermFeatureExtractor::HandleWord( |
207 const base::StringPiece16& word) { | |
208 // Quickest out if we have seen this word before and know that it's not | |
209 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, | |
210 // all of which are relatively expensive. | |
211 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { | |
212 return; | |
213 } | |
214 | |
203 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); | 215 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
204 std::string word_hash = crypto::SHA256HashString(word_lower); | 216 std::string word_hash = crypto::SHA256HashString(word_lower); |
205 | 217 |
206 // Quick out if the word is not part of any term, which is the common case. | 218 // Quick out if the word is not part of any term, which is the common case. |
207 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 219 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
208 // Word doesn't exist in our terms so we can clear the n-gram state. | 220 // Word doesn't exist in our terms so we can clear the n-gram state. |
209 state_->previous_words.clear(); | 221 state_->previous_words.clear(); |
210 state_->previous_word_sizes.clear(); | 222 state_->previous_word_sizes.clear(); |
223 // Insert into negative cache so that we don't try this again. | |
224 negative_word_cache_.Put(word, true); | |
211 return; | 225 return; |
212 } | 226 } |
213 | 227 |
214 // Find all of the n-grams that we need to check and compute their hashes. | 228 // Find all of the n-grams that we need to check and compute their hashes. |
215 // We already have the hash for word_lower, so we don't compute that again. | 229 // We already have the hash for word_lower, so we don't compute that again. |
216 std::map<std::string /* hash */, std::string /* plaintext */> | 230 std::map<std::string /* hash */, std::string /* plaintext */> |
217 hashes_to_check; | 231 hashes_to_check; |
218 hashes_to_check[word_hash] = word_lower; | 232 hashes_to_check[word_hash] = word_lower; |
219 | 233 |
220 // Combine the new word with the previous words to find additional n-grams. | 234 // Combine the new word with the previous words to find additional n-grams. |
221 // Note that we don't yet add the new word length to previous_word_sizes, | 235 // Note that we don't yet add the new word length to previous_word_sizes, |
222 // since we don't want to compute the hash for the word by itself again. | 236 // since we don't want to compute the hash for the word by itself again. |
223 // | 237 // |
224 // TODO(bryner): Use UMA stats to determine whether this is too slow. | |
225 // If it is, there are a couple of cases that we could optimize: | |
226 // - We could cache plaintext words that are not in page_word_hashes_, so | |
227 // that we can avoid hashing these again. | |
228 // - We could include positional information about words in the n-grams, | |
229 // rather than just a list of all of the words. For example, we could | |
230 // change the term format so that each word is hashed separately, or | |
231 // we could add extra data to the word list to indicate the position | |
232 // at which the word appears in an n-gram, and skip checking the word if | |
233 // it's not at that position. | |
234 state_->previous_words.append(word_lower); | 238 state_->previous_words.append(word_lower); |
235 std::string current_term = state_->previous_words; | 239 std::string current_term = state_->previous_words; |
236 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 240 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
237 it != state_->previous_word_sizes.end(); ++it) { | 241 it != state_->previous_word_sizes.end(); ++it) { |
238 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; | 242 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
239 current_term.erase(0, *it); | 243 current_term.erase(0, *it); |
240 } | 244 } |
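A worked example of the sliding window above, assuming (per the lines elided after this hunk) that previous_words holds the space-joined lower-cased words and previous_word_sizes holds each word's length plus its trailing space:

    // Handling the word "online" with max_words_per_term_ >= 3 and state:
    //   previous_words      == "cheap drugs "
    //   previous_word_sizes == { 6, 6 }       // "cheap ", "drugs "
    // After previous_words.append("online"):
    //   iteration 1: hash "cheap drugs online", then erase(0, 6)
    //   iteration 2: hash "drugs online",       then erase(0, 6)
    //   loop ends with current_term == "online", whose hash was already
    //   added to hashes_to_check as word_hash.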
241 | 245 |
242 // Add features for any hashes that match page_term_hashes_. | 246 // Add features for any hashes that match page_term_hashes_. |
243 for (std::map<std::string, std::string>::iterator it = | 247 for (std::map<std::string, std::string>::iterator it = |
(...skipping 43 matching lines...)
287 } | 291 } |
288 | 292 |
289 void PhishingTermFeatureExtractor::Clear() { | 293 void PhishingTermFeatureExtractor::Clear() { |
290 page_text_ = NULL; | 294 page_text_ = NULL; |
291 features_ = NULL; | 295 features_ = NULL; |
292 done_callback_.reset(NULL); | 296 done_callback_.reset(NULL); |
293 state_.reset(NULL); | 297 state_.reset(NULL); |
294 } | 298 } |
295 | 299 |
296 } // namespace safe_browsing | 300 } // namespace safe_browsing |