OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
(...skipping 75 matching lines...)
86 }; | 86 }; |
87 | 87 |
88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 88 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
89 const base::hash_set<std::string>* page_term_hashes, | 89 const base::hash_set<std::string>* page_term_hashes, |
90 const base::hash_set<std::string>* page_word_hashes, | 90 const base::hash_set<std::string>* page_word_hashes, |
91 size_t max_words_per_term, | 91 size_t max_words_per_term, |
92 FeatureExtractorClock* clock) | 92 FeatureExtractorClock* clock) |
93 : page_term_hashes_(page_term_hashes), | 93 : page_term_hashes_(page_term_hashes), |
94 page_word_hashes_(page_word_hashes), | 94 page_word_hashes_(page_word_hashes), |
95 max_words_per_term_(max_words_per_term), | 95 max_words_per_term_(max_words_per_term), |
96 negative_word_cache_(1000 /* max_size */), | |
Brian Ryner (2011/08/08 22:08:00):
Maybe make this a class constant, like the other k constants.

Garrett Casto (2011/08/08 23:19:51):
Done.
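For context, the suggestion above would presumably land as something like the following; the constant name here is hypothetical, since the final name is not visible in this diff:

    // In the anonymous namespace at the top of the file, alongside the
    // other k-constants (name is illustrative, not from the patch):
    const int kMaxNegativeWordCacheSize = 1000;

    // ...and in the constructor's initializer list:
    //   negative_word_cache_(kMaxNegativeWordCacheSize),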
96 clock_(clock), | 97 clock_(clock), |
97 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 98 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
98 Clear(); | 99 Clear(); |
99 } | 100 } |
100 | 101 |
101 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 102 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
102 // The RenderView should have called CancelPendingExtraction() before | 103 // The RenderView should have called CancelPendingExtraction() before |
103 // we are destroyed. | 104 // we are destroyed. |
104 CheckNoPendingExtraction(); | 105 CheckNoPendingExtraction(); |
105 } | 106 } |
(...skipping 46 matching lines...)
152 return; | 153 return; |
153 } | 154 } |
154 state_->position_initialized = true; | 155 state_->position_initialized = true; |
155 } | 156 } |
156 | 157 |
157 int num_words = 0; | 158 int num_words = 0; |
158 for (int next = ubrk_next(state_->iterator); | 159 for (int next = ubrk_next(state_->iterator); |
159 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { | 160 next != UBRK_DONE; next = ubrk_next(state_->iterator)) { |
160 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { | 161 if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { |
161 // next is now positioned at the end of a word. | 162 // next is now positioned at the end of a word. |
162 HandleWord(string16(*page_text_, state_->position, | 163 HandleWord(base::StringPiece16(page_text_->data() + state_->position, |
163 next - state_->position)); | 164 next - state_->position)); |
164 ++num_words; | 165 ++num_words; |
165 } | 166 } |
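The substantive change in this hunk: the OLD side copied each word into a fresh string16, while the NEW side passes a non-owning view into page_text_. A minimal sketch of the difference (variable names are illustrative):

    // OLD: allocates a new buffer and copies the word's characters out.
    string16 word_copy(*page_text_, state_->position, next - state_->position);

    // NEW: a (pointer, length) pair aliasing page_text_'s buffer; no copy,
    // but only valid while page_text_ itself stays alive.
    base::StringPiece16 word_view(page_text_->data() + state_->position,
                                  next - state_->position);

That lifetime caveat is what forces the negative_word_cache_.Clear() call discussed further down.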
166 state_->position = next; | 167 state_->position = next; |
167 | 168 |
168 if (num_words >= kClockCheckGranularity) { | 169 if (num_words >= kClockCheckGranularity) { |
169 num_words = 0; | 170 num_words = 0; |
170 base::TimeTicks now = clock_->Now(); | 171 base::TimeTicks now = clock_->Now(); |
171 if (now - state_->start_time >= | 172 if (now - state_->start_time >= |
172 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 173 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
173 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 174 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
(...skipping 15 matching lines...)
189 chunk_elapsed); | 190 chunk_elapsed); |
190 MessageLoop::current()->PostTask( | 191 MessageLoop::current()->PostTask( |
191 FROM_HERE, | 192 FROM_HERE, |
192 method_factory_.NewRunnableMethod( | 193 method_factory_.NewRunnableMethod( |
193 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); | 194 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); |
194 return; | 195 return; |
195 } | 196 } |
196 // Otherwise, continue. | 197 // Otherwise, continue. |
197 } | 198 } |
198 } | 199 } |
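The loop above is a cooperative time-slicing pattern: the clock is consulted only once every kClockCheckGranularity words, extraction is abandoned outright past a total budget, and otherwise the extractor yields by posting a continuation task. A condensed sketch of the two checks; kMaxTimePerChunkMs, chunk_start_time, and the RunCallback(false) failure path live in the elided lines and are assumptions here:

    if (now - state_->start_time >=
        base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
      RunCallback(false);  // Hard cap on total work: give up on this page.
      return;
    }
    if (now - chunk_start_time >=
        base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
      // Soft per-chunk cap: yield the renderer thread, resume in a new task.
      MessageLoop::current()->PostTask(
          FROM_HERE,
          method_factory_.NewRunnableMethod(
              &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
      return;
    }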
200 // We need to clear the cache because the data that it depends on (page_text_) | |
201 // is going away. | |
202 negative_word_cache_.Clear(); | |
Brian Ryner (2011/08/08 22:08:00):
Hm... I could imagine there being some benefit to

Garrett Casto (2011/08/08 23:19:51):
So I thought about this as well. It's rather hard
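The Clear() is required as written because the cache keys are StringPiece16 views that alias page_text_; once page_text_ is freed the keys dangle. Keeping the cache warm across pages, as floated above, would mean paying for owning keys instead. A sketch of that hypothetical variant, assuming an MRU-style cache consistent with the Get()/Put()/end() calls below:

    // Owning-key variant (not what this patch does): each inserted word is
    // copied once, so entries survive the death of page_text_.
    base::MRUCache<string16, bool> negative_word_cache_;
    ...
    negative_word_cache_.Put(string16(word.data(), word.size()), true);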
199 RunCallback(true); | 203 RunCallback(true); |
200 } | 204 } |
201 | 205 |
202 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { | 206 void PhishingTermFeatureExtractor::HandleWord( |
207 const base::StringPiece16& word) { | |
208 // Quickest out if we have seen this word before and know that it's not | |
209 // part of any term. This avoids the SHA256, lowercasing, and UTF conversion, | |
210 // all of which are relatively expensive. | |
211 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { | |
212 return; | |
213 } | |
214 | |
203 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); | 215 std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word)); |
204 std::string word_hash = crypto::SHA256HashString(word_lower); | 216 std::string word_hash = crypto::SHA256HashString(word_lower); |
205 | 217 |
206 // Quick out if the word is not part of any term, which is the common case. | 218 // Quick out if the word is not part of any term, which is the common case. |
207 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 219 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
208 // Word doesn't exist in our terms so we can clear the n-gram state. | 220 // Word doesn't exist in our terms so we can clear the n-gram state. |
209 state_->previous_words.clear(); | 221 state_->previous_words.clear(); |
210 state_->previous_word_sizes.clear(); | 222 state_->previous_word_sizes.clear(); |
223 // Insert into negative cache so that we don't try this again. | |
224 negative_word_cache_.Put(word, true); | |
211 return; | 225 return; |
212 } | 226 } |
213 | 227 |
214 // Find all of the n-grams that we need to check and compute their hashes. | 228 // Find all of the n-grams that we need to check and compute their hashes. |
215 // We already have the hash for word_lower, so we don't compute that again. | 229 // We already have the hash for word_lower, so we don't compute that again. |
216 std::map<std::string /* hash */, std::string /* plaintext */> | 230 std::map<std::string /* hash */, std::string /* plaintext */> |
217 hashes_to_check; | 231 hashes_to_check; |
218 hashes_to_check[word_hash] = word_lower; | 232 hashes_to_check[word_hash] = word_lower; |
219 | 233 |
220 // Combine the new word with the previous words to find additional n-grams. | 234 // Combine the new word with the previous words to find additional n-grams. |
221 // Note that we don't yet add the new word length to previous_word_sizes, | 235 // Note that we don't yet add the new word length to previous_word_sizes, |
222 // since we don't want to compute the hash for the word by itself again. | 236 // since we don't want to compute the hash for the word by itself again. |
223 // | 237 // |
224 // TODO(bryner): Use UMA stats to determine whether this is too slow. | |
225 // If it is, there are a couple of cases that we could optimize: | |
226 // - We could cache plaintext words that are not in page_word_hashes_, so | |
227 // that we can avoid hashing these again. | |
228 // - We could include positional information about words in the n-grams, | |
229 // rather than just a list of all of the words. For example, we could | |
230 // change the term format so that each word is hashed separately, or | |
231 // we could add extra data to the word list to indicate the position | |
232 // at which the word appears in an n-gram, and skip checking the word if | |
233 // it's not at that position. | |
234 state_->previous_words.append(word_lower); | 238 state_->previous_words.append(word_lower); |
235 std::string current_term = state_->previous_words; | 239 std::string current_term = state_->previous_words; |
236 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 240 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
237 it != state_->previous_word_sizes.end(); ++it) { | 241 it != state_->previous_word_sizes.end(); ++it) { |
238 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; | 242 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
239 current_term.erase(0, *it); | 243 current_term.erase(0, *it); |
240 } | 244 } |
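A worked example of the sliding window above, assuming (per the lines elided after this hunk) that previous_words holds the space-joined lower-cased words and previous_word_sizes holds each word's length plus its trailing space:

    // Handling the word "online" with max_words_per_term_ >= 3 and state:
    //   previous_words      == "cheap drugs "
    //   previous_word_sizes == { 6, 6 }       // "cheap ", "drugs "
    // After previous_words.append("online"):
    //   iteration 1: hash "cheap drugs online", then erase(0, 6)
    //   iteration 2: hash "drugs online",       then erase(0, 6)
    //   loop ends with current_term == "online", whose hash was already
    //   added to hashes_to_check as word_hash.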
241 | 245 |
242 // Add features for any hashes that match page_term_hashes_. | 246 // Add features for any hashes that match page_term_hashes_. |
243 for (std::map<std::string, std::string>::iterator it = | 247 for (std::map<std::string, std::string>::iterator it = |
(...skipping 43 matching lines...)
287 } | 291 } |
288 | 292 |
289 void PhishingTermFeatureExtractor::Clear() { | 293 void PhishingTermFeatureExtractor::Clear() { |
290 page_text_ = NULL; | 294 page_text_ = NULL; |
291 features_ = NULL; | 295 features_ = NULL; |
292 done_callback_.reset(NULL); | 296 done_callback_.reset(NULL); |
293 state_.reset(NULL); | 297 state_.reset(NULL); |
294 } | 298 } |
295 | 299 |
296 } // namespace safe_browsing | 300 } // namespace safe_browsing |