chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc - Issue 7549003: Optimize phishing page term feature extraction.

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Address Brian's comments Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"	5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

6	6

7 #include <list>	7 #include <list>

8 #include <map>	8 #include <map>

9	9

10 #include "base/compiler_specific.h"	10 #include "base/compiler_specific.h"

(...skipping 17 matching lines...) Expand all Loading...
28	28

29 // Experimenting shows that we get a reasonable gain in performance by	29 // Experimenting shows that we get a reasonable gain in performance by

30 // increasing this up to around 10, but there's not much benefit in	30 // increasing this up to around 10, but there's not much benefit in

31 // increasing it past that.	31 // increasing it past that.

32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;	32 const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

33	33

34 // This should be longer than we expect feature extraction to take on any	34 // This should be longer than we expect feature extraction to take on any

35 // actual phishing page.	35 // actual phishing page.

36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;	36 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

37	37

	38 // The maximum size of the negative word cache.

	39 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;

	40

38 // All of the state pertaining to the current feature extraction.	41 // All of the state pertaining to the current feature extraction.

39 struct PhishingTermFeatureExtractor::ExtractionState {	42 struct PhishingTermFeatureExtractor::ExtractionState {

40 // Stores up to max_words_per_ngram_ previous words separated by spaces.	43 // Stores up to max_words_per_ngram_ previous words separated by spaces.

41 std::string previous_words;	44 std::string previous_words;

42	45

43 // Stores the sizes of the words in previous_words. Note: the size includes	46 // Stores the sizes of the words in previous_words. Note: the size includes

44 // the space after each word. In other words, the sum of all sizes in this	47 // the space after each word. In other words, the sum of all sizes in this

45 // list is equal to the length of previous_words.	48 // list is equal to the length of previous_words.

46 std::list<size_t> previous_word_sizes;	49 std::list<size_t> previous_word_sizes;

47	50

(...skipping 38 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
86 };	89 };

87	90

88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(	91 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(

89 const base::hash_set<std::string>* page_term_hashes,	92 const base::hash_set<std::string>* page_term_hashes,

90 const base::hash_set<std::string>* page_word_hashes,	93 const base::hash_set<std::string>* page_word_hashes,

91 size_t max_words_per_term,	94 size_t max_words_per_term,

92 FeatureExtractorClock* clock)	95 FeatureExtractorClock* clock)

93 : page_term_hashes_(page_term_hashes),	96 : page_term_hashes_(page_term_hashes),

94 page_word_hashes_(page_word_hashes),	97 page_word_hashes_(page_word_hashes),

95 max_words_per_term_(max_words_per_term),	98 max_words_per_term_(max_words_per_term),

	99 negative_word_cache_(kMaxNegativeWordCacheSize),

96 clock_(clock),	100 clock_(clock),

97 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {	101 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {

98 Clear();	102 Clear();

99 }	103 }

100	104

101 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {	105 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {

102 // The RenderView should have called CancelPendingExtraction() before	106 // The RenderView should have called CancelPendingExtraction() before

103 // we are destroyed.	107 // we are destroyed.

104 CheckNoPendingExtraction();	108 CheckNoPendingExtraction();

105 }	109 }

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
152 return;	156 return;

153 }	157 }

154 state_->position_initialized = true;	158 state_->position_initialized = true;

155 }	159 }

156	160

157 int num_words = 0;	161 int num_words = 0;

158 for (int next = ubrk_next(state_->iterator);	162 for (int next = ubrk_next(state_->iterator);

159 next != UBRK_DONE; next = ubrk_next(state_->iterator)) {	163 next != UBRK_DONE; next = ubrk_next(state_->iterator)) {

160 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {	164 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {

161 // next is now positioned at the end of a word.	165 // next is now positioned at the end of a word.

162 HandleWord(string16(*page_text_, state_->position,	166 HandleWord(base::StringPiece16(page_text_->data() + state_->position,

163 next - state_->position));	167 next - state_->position));

164 ++num_words;	168 ++num_words;

165 }	169 }

166 state_->position = next;	170 state_->position = next;

167	171

168 if (num_words >= kClockCheckGranularity) {	172 if (num_words >= kClockCheckGranularity) {

169 num_words = 0;	173 num_words = 0;

170 base::TimeTicks now = clock_->Now();	174 base::TimeTicks now = clock_->Now();

171 if (now - state_->start_time >=	175 if (now - state_->start_time >=

172 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {	176 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {

173 DLOG(ERROR) << "Feature extraction took too long, giving up";	177 DLOG(ERROR) << "Feature extraction took too long, giving up";

(...skipping 15 matching lines...) Expand all Loading...
189 chunk_elapsed);	193 chunk_elapsed);

190 MessageLoop::current()->PostTask(	194 MessageLoop::current()->PostTask(

191 FROM_HERE,	195 FROM_HERE,

192 method_factory_.NewRunnableMethod(	196 method_factory_.NewRunnableMethod(

193 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));	197 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));

194 return;	198 return;

195 }	199 }

196 // Otherwise, continue.	200 // Otherwise, continue.

197 }	201 }

198 }	202 }

	203 // We need to clear the cache because the data that it depends on (page_text_)

	204 // is going away.

	205 negative_word_cache_.Clear();

199 RunCallback(true);	206 RunCallback(true);

200 }	207 }

201	208

202 void PhishingTermFeatureExtractor::HandleWord(const string16& word) {	209 void PhishingTermFeatureExtractor::HandleWord(

	210 const base::StringPiece16& word) {

	211 // Quickest out if we have seen this word before and know that it's not

	212 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,

	213 // all of which are relatively expensive.

	214 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {

	215 return;

	216 }

	217

203 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));	218 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));

204 std::string word_hash = crypto::SHA256HashString(word_lower);	219 std::string word_hash = crypto::SHA256HashString(word_lower);

205	220

206 // Quick out if the word is not part of any term, which is the common case.	221 // Quick out if the word is not part of any term, which is the common case.

207 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {	222 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {

208 // Word doesn't exist in our terms so we can clear the n-gram state.	223 // Word doesn't exist in our terms so we can clear the n-gram state.

209 state_->previous_words.clear();	224 state_->previous_words.clear();

210 state_->previous_word_sizes.clear();	225 state_->previous_word_sizes.clear();

	226 // Insert into negative cache so that we don't try this again.

	227 negative_word_cache_.Put(word, true);

211 return;	228 return;

212 }	229 }

213	230

214 // Find all of the n-grams that we need to check and compute their hashes.	231 // Find all of the n-grams that we need to check and compute their hashes.

215 // We already have the hash for word_lower, so we don't compute that again.	232 // We already have the hash for word_lower, so we don't compute that again.

216 std::map<std::string /* hash /, std::string / plaintext */>	233 std::map<std::string /* hash /, std::string / plaintext */>

217 hashes_to_check;	234 hashes_to_check;

218 hashes_to_check[word_hash] = word_lower;	235 hashes_to_check[word_hash] = word_lower;

219	236

220 // Combine the new word with the previous words to find additional n-grams.	237 // Combine the new word with the previous words to find additional n-grams.

221 // Note that we don't yet add the new word length to previous_word_sizes,	238 // Note that we don't yet add the new word length to previous_word_sizes,

222 // since we don't want to compute the hash for the word by itself again.	239 // since we don't want to compute the hash for the word by itself again.

223 //	240 //

224 // TODO(bryner): Use UMA stats to determine whether this is too slow.

225 // If it is, there are a couple of cases that we could optimize:

226 // - We could cache plaintext words that are not in page_word_hashes_, so

227 // that we can avoid hashing these again.

228 // - We could include positional information about words in the n-grams,

229 // rather than just a list of all of the words. For example, we could

230 // change the term format so that each word is hashed separately, or

231 // we could add extra data to the word list to indicate the position

232 // at which the word appears in an n-gram, and skip checking the word if

233 // it's not at that position.

234 state_->previous_words.append(word_lower);	241 state_->previous_words.append(word_lower);

235 std::string current_term = state_->previous_words;	242 std::string current_term = state_->previous_words;

236 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();	243 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();

237 it != state_->previous_word_sizes.end(); ++it) {	244 it != state_->previous_word_sizes.end(); ++it) {

238 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;	245 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;

239 current_term.erase(0, *it);	246 current_term.erase(0, *it);

240 }	247 }

241	248

242 // Add features for any hashes that match page_term_hashes_.	249 // Add features for any hashes that match page_term_hashes_.

243 for (std::map<std::string, std::string>::iterator it =	250 for (std::map<std::string, std::string>::iterator it =

(...skipping 43 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
287 }	294 }

288	295

289 void PhishingTermFeatureExtractor::Clear() {	296 void PhishingTermFeatureExtractor::Clear() {

290 page_text_ = NULL;	297 page_text_ = NULL;

291 features_ = NULL;	298 features_ = NULL;

292 done_callback_.reset(NULL);	299 done_callback_.reset(NULL);

293 state_.reset(NULL);	300 state_.reset(NULL);

294 }	301 }

295	302

296 } // namespace safe_browsing	303 } // namespace safe_browsing

OLD	NEW

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_term_feature_extractor.h ('k') | no next file » | no next file with comments »