Chromium Code Reviews

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fix naming of some variables and Windows compile error. Created 9 years, 4 months ago
 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

 #include <list>
 #include <map>

 #include "base/compiler_specific.h"
(...skipping 75 matching lines...)
 };

 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
     const base::hash_set<std::string>* page_term_hashes,
     const base::hash_set<std::string>* page_word_hashes,
     size_t max_words_per_term,
     FeatureExtractorClock* clock)
     : page_term_hashes_(page_term_hashes),
       page_word_hashes_(page_word_hashes),
       max_words_per_term_(max_words_per_term),
+      negative_word_cache_(1000 /* max_size */),
    Brian Ryner 2011/08/08 22:08:00 Maybe make this a class constant, like the other k
    Garrett Casto 2011/08/08 23:19:51 Done.
       clock_(clock),
       ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
   Clear();
 }
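
The matching header is not part of this file's diff, so the declaration behind negative_word_cache_ is not visible here. As a rough, self-contained sketch of the interface the call sites in this file rely on (a size-bounded most-recently-used cache constructed with a maximum size and exposing Get/Put/Clear/end), the stand-in below illustrates the idea. The class name SimpleMRUCache and the constant kMaxNegativeWordCacheSize are hypothetical; the constant stands in for the class constant requested in the review comment above, and 1000 simply matches the literal used in this patch set.

// Sketch only; not the actual Chromium container backing negative_word_cache_.
#include <cstddef>
#include <list>
#include <map>
#include <string>
#include <utility>

const size_t kMaxNegativeWordCacheSize = 1000;  // hypothetical name

template <typename Key, typename Value>
class SimpleMRUCache {
 public:
  typedef std::list<std::pair<Key, Value> > EntryList;
  typedef typename EntryList::iterator iterator;

  explicit SimpleMRUCache(size_t max_size) : max_size_(max_size) {}

  // Returns end() on a miss; moves the entry to the front on a hit.
  iterator Get(const Key& key) {
    typename IndexMap::iterator it = index_.find(key);
    if (it == index_.end())
      return entries_.end();
    entries_.splice(entries_.begin(), entries_, it->second);
    return entries_.begin();
  }

  // Inserts or refreshes an entry, evicting the least recently used entry
  // once the cache is full.
  iterator Put(const Key& key, const Value& value) {
    typename IndexMap::iterator it = index_.find(key);
    if (it != index_.end()) {
      entries_.erase(it->second);
      index_.erase(it);
    }
    entries_.push_front(std::make_pair(key, value));
    index_[key] = entries_.begin();
    if (entries_.size() > max_size_) {
      index_.erase(entries_.back().first);
      entries_.pop_back();
    }
    return entries_.begin();
  }

  void Clear() {
    entries_.clear();
    index_.clear();
  }

  iterator end() { return entries_.end(); }

 private:
  typedef std::map<Key, iterator> IndexMap;
  size_t max_size_;
  EntryList entries_;
  IndexMap index_;
};

int main() {
  SimpleMRUCache<std::string, bool> negative_word_cache(kMaxNegativeWordCacheSize);
  negative_word_cache.Put("foo", true);
  bool seen = negative_word_cache.Get("foo") != negative_word_cache.end();
  (void)seen;
  negative_word_cache.Clear();  // mirrors the per-page Clear() added below
  return 0;
}

Bounding the cache keeps memory predictable on pages with huge vocabularies while still letting frequently repeated words hit the cheap path.
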

 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
   // The RenderView should have called CancelPendingExtraction() before
   // we are destroyed.
   CheckNoPendingExtraction();
 }
(...skipping 46 matching lines...)
       return;
     }
     state_->position_initialized = true;
   }

   int num_words = 0;
   for (int next = ubrk_next(state_->iterator);
        next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
     if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
       // next is now positioned at the end of a word.
-      HandleWord(string16(*page_text_, state_->position,
+      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
                  next - state_->position));
       ++num_words;
     }
     state_->position = next;

     if (num_words >= kClockCheckGranularity) {
       num_words = 0;
       base::TimeTicks now = clock_->Now();
       if (now - state_->start_time >=
           base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
         DLOG(ERROR) << "Feature extraction took too long, giving up";
(...skipping 15 matching lines...)
                             chunk_elapsed);
         MessageLoop::current()->PostTask(
             FROM_HERE,
             method_factory_.NewRunnableMethod(
                 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
         return;
       }
       // Otherwise, continue.
     }
   }
+  // We need to clear the cache because the data that it depends on (page_text_)
+  // is going away.
+  negative_word_cache_.Clear();
    Brian Ryner 2011/08/08 22:08:00 Hm... I could imagine there being some benefit to
    Garrett Casto 2011/08/08 23:19:51 So I thought about this as well. It's rather hard
   RunCallback(true);
 }
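
The loop above bounds how long term extraction can monopolize the renderer: it checks the clock only every kClockCheckGranularity words, gives up entirely once kMaxTotalTimeMs has elapsed, and otherwise reposts itself on the message loop when the current chunk has used its budget. A minimal sketch of that pattern in standard C++, with a plain deque of callbacks standing in for MessageLoop and placeholder granularity/budget values rather than the real constants (which are defined elsewhere in this file and not shown in the diff):

// Sketch only: the chunked, deadline-checked extraction pattern used above,
// reduced to standard C++.
#include <chrono>
#include <deque>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

namespace {

const int kWordsPerClockCheck = 10;                   // placeholder value
const std::chrono::milliseconds kMaxTotalTime(500);   // placeholder value
const std::chrono::milliseconds kMaxTimePerChunk(10); // placeholder value

struct ExtractionState {
  std::vector<std::string> words;  // stand-in for the ICU break iterator
  size_t position = 0;
  std::chrono::steady_clock::time_point start_time;
};

void ExtractChunk(ExtractionState* state,
                  std::deque<std::function<void()> >* task_queue) {
  const auto chunk_start = std::chrono::steady_clock::now();
  int words_since_check = 0;
  while (state->position < state->words.size()) {
    // ... per-word feature work (HandleWord) would happen here ...
    ++state->position;
    if (++words_since_check >= kWordsPerClockCheck) {
      words_since_check = 0;
      const auto now = std::chrono::steady_clock::now();
      if (now - state->start_time >= kMaxTotalTime) {
        std::cerr << "Feature extraction took too long, giving up\n";
        return;  // abandon the page entirely
      }
      if (now - chunk_start >= kMaxTimePerChunk) {
        // Yield to other renderer work: requeue ourselves and resume later
        // from the saved position, as PostTask does above.
        task_queue->push_back([state, task_queue] {
          ExtractChunk(state, task_queue);
        });
        return;
      }
    }
  }
  // Finished every word: per-page cleanup (e.g. clearing caches that point
  // into the page text) happens here before reporting success.
}

}  // namespace

int main() {
  std::deque<std::function<void()> > task_queue;
  ExtractionState state;
  state.words.assign(100000, "word");
  state.start_time = std::chrono::steady_clock::now();
  task_queue.push_back([&] { ExtractChunk(&state, &task_queue); });
  while (!task_queue.empty()) {  // stand-in for the message loop
    std::function<void()> task = task_queue.front();
    task_queue.pop_front();
    task();
  }
  return 0;
}
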

-void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
+void PhishingTermFeatureExtractor::HandleWord(
+    const base::StringPiece16& word) {
+  // Quickest out if we have seen this word before and know that it's not
+  // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
+  // all of which are relatively expensive.
+  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
+    return;
+  }
+
   std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
   std::string word_hash = crypto::SHA256HashString(word_lower);

   // Quick out if the word is not part of any term, which is the common case.
   if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
     // Word doesn't exist in our terms so we can clear the n-gram state.
     state_->previous_words.clear();
     state_->previous_word_sizes.clear();
+    // Insert into negative cache so that we don't try this again.
+    negative_word_cache_.Put(word, true);
     return;
   }

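
A sketch of the fast path the new code above adds to HandleWord, in standard C++: std::string_view stands in for base::StringPiece16, an unordered_map for the negative word cache, and a trivial StandInHash for crypto::SHA256HashString. The point is the ordering of the checks (cache lookup before the expensive lowercasing and hashing) and the fact that the cache keys are views into the page text, which is why ExtractFeaturesWithTimeout clears the cache above before that text goes away.

// Sketch only: the negative-cache fast path, not the production types.
#include <cctype>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>

namespace {

std::string Lowercase(std::string_view word) {
  std::string out(word);
  for (char& c : out)
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  return out;
}

// Not SHA-256; just enough to keep the sketch self-contained.
std::string StandInHash(const std::string& s) {
  return std::to_string(std::hash<std::string>()(s));
}

struct TermState {
  std::string previous_words;  // previous_word_sizes omitted here; see the
                               // n-gram sketch further down
};

class WordChecker {
 public:
  explicit WordChecker(std::unordered_set<std::string> page_word_hashes)
      : page_word_hashes_(std::move(page_word_hashes)) {}

  // `word` is a view into the page text, so negative_word_cache_ holds
  // pointers into that text and must be cleared before the text is freed.
  void HandleWord(std::string_view word, TermState* state) {
    // Cheapest check first: a word already known not to be part of any term
    // skips the lowercasing and hashing below.
    if (negative_word_cache_.count(word))
      return;

    const std::string word_lower = Lowercase(word);
    const std::string word_hash = StandInHash(word_lower);
    if (!page_word_hashes_.count(word_hash)) {
      state->previous_words.clear();             // reset the n-gram window
      negative_word_cache_.emplace(word, true);  // remember the miss
      return;
    }
    // ... n-gram handling (see the sketch after the loop below) ...
  }

  // Must run whenever the backing page text goes away, mirroring the
  // negative_word_cache_.Clear() call added above.
  void ClearPerPageState() { negative_word_cache_.clear(); }

 private:
  std::unordered_set<std::string> page_word_hashes_;
  std::unordered_map<std::string_view, bool> negative_word_cache_;
};

}  // namespace

int main() {
  std::string page_text = "Please Verify Your Account";
  WordChecker checker({StandInHash("verify"), StandInHash("your"),
                       StandInHash("account")});
  TermState state;
  checker.HandleWord(std::string_view(page_text).substr(7, 6), &state);  // "Verify"
  checker.ClearPerPageState();  // before page_text is destroyed
  return 0;
}
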
   // Find all of the n-grams that we need to check and compute their hashes.
   // We already have the hash for word_lower, so we don't compute that again.
   std::map<std::string /* hash */, std::string /* plaintext */>
       hashes_to_check;
   hashes_to_check[word_hash] = word_lower;

   // Combine the new word with the previous words to find additional n-grams.
   // Note that we don't yet add the new word length to previous_word_sizes,
   // since we don't want to compute the hash for the word by itself again.
   //
-  // TODO(bryner): Use UMA stats to determine whether this is too slow.
-  // If it is, there are a couple of cases that we could optimize:
-  // - We could cache plaintext words that are not in page_word_hashes_, so
-  //   that we can avoid hashing these again.
-  // - We could include positional information about words in the n-grams,
-  //   rather than just a list of all of the words. For example, we could
-  //   change the term format so that each word is hashed separately, or
-  //   we could add extra data to the word list to indicate the position
-  //   at which the word appears in an n-gram, and skip checking the word if
-  //   it's not at that position.
   state_->previous_words.append(word_lower);
   std::string current_term = state_->previous_words;
   for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
        it != state_->previous_word_sizes.end(); ++it) {
     hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
     current_term.erase(0, *it);
   }
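
The loop above builds every candidate term that ends at the current word without re-joining strings: previous_words holds the recent words concatenated, and erasing *it characters drops the oldest word from the front on each iteration. A more verbose but self-contained sketch of the same idea, with an explicit word window and space separators (both simplifications of mine, not the term format used by the classifier):

// Sketch only: the candidate-term generation performed by the loop above.
#include <deque>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> CandidateTerms(const std::deque<std::string>& previous_words,
                                        const std::string& new_word) {
  std::vector<std::string> terms;
  // Longest candidate first, then drop the oldest word each iteration,
  // mirroring current_term.erase(0, *it) above.
  for (size_t start = 0; start < previous_words.size(); ++start) {
    std::string term;
    for (size_t i = start; i < previous_words.size(); ++i)
      term += previous_words[i] + " ";
    term += new_word;
    terms.push_back(term);
  }
  return terms;  // the new word by itself was already hashed by the caller
}

int main() {
  // With max_words_per_term == 3, at most the last two words are remembered.
  std::deque<std::string> previous_words = {"verify", "your"};
  for (const std::string& term : CandidateTerms(previous_words, "account"))
    std::cout << term << "\n";
  // Prints:
  //   verify your account
  //   your account
  // Each candidate (and "account" alone) would then be SHA-256 hashed and
  // looked up in page_term_hashes_.
  return 0;
}
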

   // Add features for any hashes that match page_term_hashes_.
   for (std::map<std::string, std::string>::iterator it =
(...skipping 43 matching lines...)
 }

 void PhishingTermFeatureExtractor::Clear() {
   page_text_ = NULL;
   features_ = NULL;
   done_callback_.reset(NULL);
   state_.reset(NULL);
 }

 }  // namespace safe_browsing