Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(578)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address Brian's comments Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <list> 7 #include <list>
8 #include <map> 8 #include <map>
9 9
10 #include "base/compiler_specific.h" 10 #include "base/compiler_specific.h"
(...skipping 17 matching lines...) Expand all
28 28
29 // Experimenting shows that we get a reasonable gain in performance by 29 // Experimenting shows that we get a reasonable gain in performance by
30 // increasing this up to around 10, but there's not much benefit in 30 // increasing this up to around 10, but there's not much benefit in
31 // increasing it past that. 31 // increasing it past that.
32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10; 32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;
33 33
34 // This should be longer than we expect feature extraction to take on any 34 // This should be longer than we expect feature extraction to take on any
35 // actual phishing page. 35 // actual phishing page.
36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; 36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
37 37
38 // The maximum size of the negative word cache.
39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
40
38 // All of the state pertaining to the current feature extraction. 41 // All of the state pertaining to the current feature extraction.
39 struct PhishingTermFeatureExtractor::ExtractionState { 42 struct PhishingTermFeatureExtractor::ExtractionState {
40 // Stores up to max_words_per_ngram_ previous words separated by spaces. 43 // Stores up to max_words_per_ngram_ previous words separated by spaces.
41 std::string previous_words; 44 std::string previous_words;
42 45
43 // Stores the sizes of the words in previous_words. Note: the size includes 46 // Stores the sizes of the words in previous_words. Note: the size includes
44 // the space after each word. In other words, the sum of all sizes in this 47 // the space after each word. In other words, the sum of all sizes in this
45 // list is equal to the length of previous_words. 48 // list is equal to the length of previous_words.
46 std::list<size_t> previous_word_sizes; 49 std::list<size_t> previous_word_sizes;
47 50
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
86 }; 89 };
87 90
88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
89 const base::hash_set<std::string>* page_term_hashes, 92 const base::hash_set<std::string>* page_term_hashes,
90 const base::hash_set<std::string>* page_word_hashes, 93 const base::hash_set<std::string>* page_word_hashes,
91 size_t max_words_per_term, 94 size_t max_words_per_term,
92 FeatureExtractorClock* clock) 95 FeatureExtractorClock* clock)
93 : page_term_hashes_(page_term_hashes), 96 : page_term_hashes_(page_term_hashes),
94 page_word_hashes_(page_word_hashes), 97 page_word_hashes_(page_word_hashes),
95 max_words_per_term_(max_words_per_term), 98 max_words_per_term_(max_words_per_term),
99 negative_word_cache_(kMaxNegativeWordCacheSize),
96 clock_(clock), 100 clock_(clock),
97 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { 101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
98 Clear(); 102 Clear();
99 } 103 }
100 104
101 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { 105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
102 // The RenderView should have called CancelPendingExtraction() before 106 // The RenderView should have called CancelPendingExtraction() before
103 // we are destroyed. 107 // we are destroyed.
104 CheckNoPendingExtraction(); 108 CheckNoPendingExtraction();
105 } 109 }
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
152 return; 156 return;
153 } 157 }
154 state_->position_initialized = true; 158 state_->position_initialized = true;
155 } 159 }
156 160
157 int num_words = 0; 161 int num_words = 0;
158 for (int next = ubrk_next(state_->iterator); 162 for (int next = ubrk_next(state_->iterator);
159 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { 163 next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
160 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { 164 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
161 // next is now positioned at the end of a word. 165 // next is now positioned at the end of a word.
162 HandleWord(string16(*page_text_, state_->position, 166 HandleWord(base::StringPiece16(page_text_->data() + state_->position,
163 next - state_->position)); 167 next - state_->position));
164 ++num_words; 168 ++num_words;
165 } 169 }
166 state_->position = next; 170 state_->position = next;
167 171
168 if (num_words >= kClockCheckGranularity) { 172 if (num_words >= kClockCheckGranularity) {
169 num_words = 0; 173 num_words = 0;
170 base::TimeTicks now = clock_->Now(); 174 base::TimeTicks now = clock_->Now();
171 if (now - state_->start_time >= 175 if (now - state_->start_time >=
172 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { 176 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
173 DLOG(ERROR) << "Feature extraction took too long, giving up"; 177 DLOG(ERROR) << "Feature extraction took too long, giving up";
(...skipping 15 matching lines...) Expand all
189 chunk_elapsed); 193 chunk_elapsed);
190 MessageLoop::current()->PostTask( 194 MessageLoop::current()->PostTask(
191 FROM_HERE, 195 FROM_HERE,
192 method_factory_.NewRunnableMethod( 196 method_factory_.NewRunnableMethod(
193 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); 197 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
194 return; 198 return;
195 } 199 }
196 // Otherwise, continue. 200 // Otherwise, continue.
197 } 201 }
198 } 202 }
203 // We need to clear the cache because the data that it depends on (page_text_)
204 // is going away.
205 negative_word_cache_.Clear();
199 RunCallback(true); 206 RunCallback(true);
200 } 207 }
201 208
202 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { 209 void PhishingTermFeatureExtractor::HandleWord(
210 const base::StringPiece16& word) {
211 // Quickest out if we have seen this word before and know that it's not
212 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
213 // all of which are relatively expensive.
214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
215 return;
216 }
217
203 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); 218 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
204 std::string word_hash = crypto::SHA256HashString(word_lower); 219 std::string word_hash = crypto::SHA256HashString(word_lower);
205 220
206 // Quick out if the word is not part of any term, which is the common case. 221 // Quick out if the word is not part of any term, which is the common case.
207 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { 222 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
208 // Word doesn't exist in our terms so we can clear the n-gram state. 223 // Word doesn't exist in our terms so we can clear the n-gram state.
209 state_->previous_words.clear(); 224 state_->previous_words.clear();
210 state_->previous_word_sizes.clear(); 225 state_->previous_word_sizes.clear();
226 // Insert into negative cache so that we don't try this again.
227 negative_word_cache_.Put(word, true);
211 return; 228 return;
212 } 229 }
213 230
214 // Find all of the n-grams that we need to check and compute their hashes. 231 // Find all of the n-grams that we need to check and compute their hashes.
215 // We already have the hash for word_lower, so we don't compute that again. 232 // We already have the hash for word_lower, so we don't compute that again.
216 std::map<std::string /* hash */, std::string /* plaintext */> 233 std::map<std::string /* hash */, std::string /* plaintext */>
217 hashes_to_check; 234 hashes_to_check;
218 hashes_to_check[word_hash] = word_lower; 235 hashes_to_check[word_hash] = word_lower;
219 236
220 // Combine the new word with the previous words to find additional n-grams. 237 // Combine the new word with the previous words to find additional n-grams.
221 // Note that we don't yet add the new word length to previous_word_sizes, 238 // Note that we don't yet add the new word length to previous_word_sizes,
222 // since we don't want to compute the hash for the word by itself again. 239 // since we don't want to compute the hash for the word by itself again.
223 // 240 //
224 // TODO(bryner): Use UMA stats to determine whether this is too slow.
225 // If it is, there are a couple of cases that we could optimize:
226 // - We could cache plaintext words that are not in page_word_hashes_, so
227 // that we can avoid hashing these again.
228 // - We could include positional information about words in the n-grams,
229 // rather than just a list of all of the words. For example, we could
230 // change the term format so that each word is hashed separately, or
231 // we could add extra data to the word list to indicate the position
232 // at which the word appears in an n-gram, and skip checking the word if
233 // it's not at that position.
234 state_->previous_words.append(word_lower); 241 state_->previous_words.append(word_lower);
235 std::string current_term = state_->previous_words; 242 std::string current_term = state_->previous_words;
236 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); 243 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
237 it != state_->previous_word_sizes.end(); ++it) { 244 it != state_->previous_word_sizes.end(); ++it) {
238 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; 245 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
239 current_term.erase(0, *it); 246 current_term.erase(0, *it);
240 } 247 }
241 248
242 // Add features for any hashes that match page_term_hashes_. 249 // Add features for any hashes that match page_term_hashes_.
243 for (std::map<std::string, std::string>::iterator it = 250 for (std::map<std::string, std::string>::iterator it =
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
287 } 294 }
288 295
289 void PhishingTermFeatureExtractor::Clear() { 296 void PhishingTermFeatureExtractor::Clear() {
290 page_text_ = NULL; 297 page_text_ = NULL;
291 features_ = NULL; 298 features_ = NULL;
292 done_callback_.reset(NULL); 299 done_callback_.reset(NULL);
293 state_.reset(NULL); 300 state_.reset(NULL);
294 } 301 }
295 302
296 } // namespace safe_browsing 303 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698