OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/bind.h" | 10 #include "base/bind.h" |
(...skipping 27 matching lines...) | |
38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; | 38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; |
39 | 39 |
40 // The maximum size of the negative word cache. | 40 // The maximum size of the negative word cache. |
41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; | 41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; |
42 | 42 |
43 // All of the state pertaining to the current feature extraction. | 43 // All of the state pertaining to the current feature extraction. |
44 struct PhishingTermFeatureExtractor::ExtractionState { | 44 struct PhishingTermFeatureExtractor::ExtractionState { |
45 // Stores up to max_words_per_term_ previous words separated by spaces. | 45 // Stores up to max_words_per_term_ previous words separated by spaces. |
46 std::string previous_words; | 46 std::string previous_words; |
47 | 47 |
48 // Stores the current shingle after a new word is processed and added in. | |
49 std::string current_shingle; | |
50 | |
51 // Stores the sizes of the words in current_shingle. Note: the size includes | |
52 // the space after each word. In other words, the sum of all sizes in this | |
53 // list is equal to the length of current_shingle. | |
54 std::list<size_t> shingle_word_sizes; | |
55 | |
48 // Stores the sizes of the words in previous_words. Note: the size includes | 56 // Stores the sizes of the words in previous_words. Note: the size includes |
49 // the space after each word. In other words, the sum of all sizes in this | 57 // the space after each word. In other words, the sum of all sizes in this |
50 // list is equal to the length of previous_words. | 58 // list is equal to the length of previous_words. |
51 std::list<size_t> previous_word_sizes; | 59 std::list<size_t> previous_word_sizes; |
52 | 60 |
53 // An iterator for word breaking. | 61 // An iterator for word breaking. |
54 UBreakIterator* iterator; | 62 UBreakIterator* iterator; |
55 | 63 |
56 // Our current position in the text that was passed to the ExtractionState | 64 // Our current position in the text that was passed to the ExtractionState |
57 // constructor, specifically, the most recent break position returned by our | 65 // constructor, specifically, the most recent break position returned by our |
(...skipping 28 matching lines...) | |
86 ~ExtractionState() { | 94 ~ExtractionState() { |
87 if (iterator) { | 95 if (iterator) { |
88 ubrk_close(iterator); | 96 ubrk_close(iterator); |
89 } | 97 } |
90 } | 98 } |
91 }; | 99 }; |
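A note on how the paired window fields are meant to be used: previous_words (and, with this patch, current_shingle) holds a space-separated run of recent words, while the matching size list records each word's length plus its trailing space, so the front entry is exactly the number of characters to erase when the window must shrink. A minimal sketch of that invariant follows; the function name and parameters are illustrative, not anything in this file.

```cpp
// Sketch: keep at most |max_words| words in |window|; |word_sizes| holds
// each word's length plus its trailing space, so the invariant is
// sum(word_sizes) == window.size().
#include <list>
#include <string>

void AppendToWindow(const std::string& word_lower,
                    size_t max_words,
                    std::string* window,
                    std::list<size_t>* word_sizes) {
  window->append(word_lower + " ");
  word_sizes->push_back(word_lower.size() + 1);
  if (word_sizes->size() > max_words) {
    // Drop the oldest word (including its trailing space) from the front.
    window->erase(0, word_sizes->front());
    word_sizes->pop_front();
  }
}
```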
92 | 100 |
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 101 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
94 const base::hash_set<std::string>* page_term_hashes, | 102 const base::hash_set<std::string>* page_term_hashes, |
95 const base::hash_set<uint32>* page_word_hashes, | 103 const base::hash_set<uint32>* page_word_hashes, |
104 size_t max_hashes_per_page, | |
96 size_t max_words_per_term, | 105 size_t max_words_per_term, |
97 uint32 murmurhash3_seed, | 106 uint32 murmurhash3_seed, |
107 size_t shingle_size, | |
98 FeatureExtractorClock* clock) | 108 FeatureExtractorClock* clock) |
99 : page_term_hashes_(page_term_hashes), | 109 : page_term_hashes_(page_term_hashes), |
100 page_word_hashes_(page_word_hashes), | 110 page_word_hashes_(page_word_hashes), |
111 max_hashes_per_page_(max_hashes_per_page), | |
101 max_words_per_term_(max_words_per_term), | 112 max_words_per_term_(max_words_per_term), |
102 murmurhash3_seed_(murmurhash3_seed), | 113 murmurhash3_seed_(murmurhash3_seed), |
114 shingle_size_(shingle_size), | |
103 negative_word_cache_(kMaxNegativeWordCacheSize), | 115 negative_word_cache_(kMaxNegativeWordCacheSize), |
104 clock_(clock), | 116 clock_(clock), |
105 weak_factory_(this) { | 117 weak_factory_(this) { |
106 Clear(); | 118 Clear(); |
107 } | 119 } |
108 | 120 |
109 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 121 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
110 // The RenderView should have called CancelPendingExtraction() before | 122 // The RenderView should have called CancelPendingExtraction() before |
111 // we are destroyed. | 123 // we are destroyed. |
112 CheckNoPendingExtraction(); | 124 CheckNoPendingExtraction(); |
113 } | 125 } |
114 | 126 |
115 void PhishingTermFeatureExtractor::ExtractFeatures( | 127 void PhishingTermFeatureExtractor::ExtractFeatures( |
116 const base::string16* page_text, | 128 const base::string16* page_text, |
117 FeatureMap* features, | 129 FeatureMap* features, |
130 std::set<uint32>* shingle_hashes, | |
118 const DoneCallback& done_callback) { | 131 const DoneCallback& done_callback) { |
119 // The RenderView should have called CancelPendingExtraction() before | 132 // The RenderView should have called CancelPendingExtraction() before |
120 // starting a new extraction, so DCHECK this. | 133 // starting a new extraction, so DCHECK this. |
121 CheckNoPendingExtraction(); | 134 CheckNoPendingExtraction(); |
122 // However, in an opt build, we will go ahead and clean up the pending | 135 // However, in an opt build, we will go ahead and clean up the pending |
123 // extraction so that we can start in a known state. | 136 // extraction so that we can start in a known state. |
124 CancelPendingExtraction(); | 137 CancelPendingExtraction(); |
125 | 138 |
126 page_text_ = page_text; | 139 page_text_ = page_text; |
127 features_ = features; | 140 features_ = features; |
141 shingle_hashes_ = shingle_hashes; | |
128 done_callback_ = done_callback; | 142 done_callback_ = done_callback; |
129 | 143 |
130 state_.reset(new ExtractionState(*page_text_, clock_->Now())); | 144 state_.reset(new ExtractionState(*page_text_, clock_->Now())); |
131 base::MessageLoop::current()->PostTask( | 145 base::MessageLoop::current()->PostTask( |
132 FROM_HERE, | 146 FROM_HERE, |
133 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, | 147 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, |
134 weak_factory_.GetWeakPtr())); | 148 weak_factory_.GetWeakPtr())); |
135 } | 149 } |
136 | 150 |
137 void PhishingTermFeatureExtractor::CancelPendingExtraction() { | 151 void PhishingTermFeatureExtractor::CancelPendingExtraction() { |
(...skipping 65 matching lines...) | |
203 return; | 217 return; |
204 } | 218 } |
205 // Otherwise, continue. | 219 // Otherwise, continue. |
206 } | 220 } |
207 } | 221 } |
208 RunCallback(true); | 222 RunCallback(true); |
209 } | 223 } |
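Most of ExtractFeaturesWithTimeout is elided above, but the visible tail (an early return inside the loop versus falling through to RunCallback(true)), the kMaxTotalTimeMs budget, and the PostTask in ExtractFeatures suggest the usual time-sliced extraction shape. A generic sketch of that pattern follows; HasMoreWords, HandleNextWord, and kChunkBudgetMs are hypothetical stand-ins, not names from this file.

```cpp
// Sketch only: the real loop body is elided in the diff above. Names
// marked "hypothetical" are illustrative, not part of this file.
void ExtractChunkWithTimeout() {
  const int kChunkBudgetMs = 10;  // hypothetical per-chunk budget
  base::TimeTicks chunk_start = clock_->Now();
  while (HasMoreWords()) {  // hypothetical helper
    HandleNextWord();       // hypothetical helper
    base::TimeTicks now = clock_->Now();
    if (now - state_->start_time >
        base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
      RunCallback(false);  // Overall budget exhausted: report failure.
      return;
    }
    if (now - chunk_start > base::TimeDelta::FromMilliseconds(kChunkBudgetMs)) {
      // Yield the thread and continue in a later task, as ExtractFeatures
      // does above.
      base::MessageLoop::current()->PostTask(
          FROM_HERE,
          base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                     weak_factory_.GetWeakPtr()));
      return;
    }
  }
  RunCallback(true);  // All words processed within budget.
}
```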
210 | 224 |
211 void PhishingTermFeatureExtractor::HandleWord( | 225 void PhishingTermFeatureExtractor::HandleWord( |
212 const base::StringPiece16& word) { | 226 const base::StringPiece16& word) { |
227 // First, extract shingle hashes. We check the size of shingle_hashes_ first | |
228 // so we can stop the shingle work once we reach |max_hashes_per_page_|. | |
229 std::string word_lower; | |
230 if (shingle_hashes_->size() < max_hashes_per_page_) { | |
231 word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); | |
232 if (state_->shingle_word_sizes.size() < shingle_size_ - 1) { | |
233 // Adding |word| would not yet form a complete shingle; simply insert it. | |
234 state_->current_shingle.append(word_lower + " "); | |
235 state_->shingle_word_sizes.push_back(word_lower.size() + 1); | |
236 } else if (state_->shingle_word_sizes.size() == shingle_size_ - 1) { | |
237 // Adding |word| completes a shingle; insert it and calculate the | |
238 // shingle hash. | |
239 state_->current_shingle.append(word_lower + " "); | |
240 state_->shingle_word_sizes.push_back(word_lower.size() + 1); | |
241 shingle_hashes_->insert( | |
242 MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); | |
243 } else { | |
244 // Remove the first word from current_shingle and append |word| at the | |
245 // end to form the next shingle. | |
246 state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); | |
247 state_->shingle_word_sizes.pop_front(); | |
248 state_->current_shingle.append(word_lower + " "); | |
249 state_->shingle_word_sizes.push_back(word_lower.size() + 1); | |
250 shingle_hashes_->insert( | |
251 MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); | |
252 } | |
mattm 2014/05/06 01:00:14: each of the cases here duplicates the code of the
zysxqn 2014/05/06 20:56:57: Done.
| |
253 } | |
254 | |
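mattm's comment above notes that the three branches duplicate the append/push_back bookkeeping. One way the body of the `if (shingle_hashes_->size() < max_hashes_per_page_)` block could be consolidated, offered only as a sketch (the actual follow-up patchset is not shown here):

```cpp
// Sketch: single path for the shingle window. Append unconditionally,
// trim the front once the window is over-full, and hash only when the
// window holds exactly |shingle_size_| words.
state_->current_shingle.append(word_lower + " ");
state_->shingle_word_sizes.push_back(word_lower.size() + 1);
if (state_->shingle_word_sizes.size() > shingle_size_) {
  state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
  state_->shingle_word_sizes.pop_front();
}
if (state_->shingle_word_sizes.size() == shingle_size_) {
  shingle_hashes_->insert(
      MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
}
```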
255 // Next, extract page terms. | |
256 // | |
213 // Quickest out if we have seen this word before and know that it's not | 257 // Quickest out if we have seen this word before and know that it's not |
214 // part of any term. This avoids the lowercasing and UTF conversion, both of | 258 // part of any term. This avoids the lowercasing and UTF conversion, both of |
215 // which are relatively expensive. | 259 // which are relatively expensive. |
216 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { | 260 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { |
217 // We know we're no longer in a possible n-gram, so clear the previous word | 261 // We know we're no longer in a possible n-gram, so clear the previous word |
218 // state. | 262 // state. |
219 state_->previous_words.clear(); | 263 state_->previous_words.clear(); |
220 state_->previous_word_sizes.clear(); | 264 state_->previous_word_sizes.clear(); |
221 return; | 265 return; |
222 } | 266 } |
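For readers unfamiliar with the negative cache: it memoizes words already known not to appear in any term, so the relatively expensive ToLower/UTF-8 conversion and hashing are skipped on repeat sightings. The real negative_word_cache_ is an MRU-style cache capped at kMaxNegativeWordCacheSize; the stand-in below uses a plain set with crude wholesale eviction just to illustrate the fast path.

```cpp
// Illustrative stand-in for the negative word cache (not the class used
// by this file). Words known to match no term are remembered so the
// costly lowercasing/hashing can be skipped the next time they appear.
#include <string>
#include <unordered_set>

class SimpleNegativeWordCache {
 public:
  explicit SimpleNegativeWordCache(size_t max_size) : max_size_(max_size) {}

  bool Contains(const std::u16string& word) const {
    return words_.count(word) > 0;
  }

  void Insert(const std::u16string& word) {
    if (words_.size() >= max_size_)
      words_.clear();  // Crude eviction; the real cache evicts old entries.
    words_.insert(word);
  }

 private:
  const size_t max_size_;
  std::unordered_set<std::u16string> words_;
};
```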
223 | 267 |
224 std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); | 268 // Only compute |word_lower| here if the shingle block above skipped it. |
269 if (shingle_hashes_->size() >= max_hashes_per_page_) { | |
mattm 2014/05/06 01:00:14: I'd probably be more comfortable with: if (word_lower
zysxqn 2014/05/06 20:56:57: Done.
| |
270 word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); | |
271 } | |
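mattm's truncated suggestion above ("if (word_lower") presumably keys the recomputation off |word_lower| itself: if the shingle block skipped the conversion, the string is still empty. The "Done." reply indicates a later patchset changed this; the snippet below is a guess at the shape, not the actual follow-up code.

```cpp
// Sketch of the suggested check: recompute only when the shingle branch
// above did not already fill in |word_lower|.
if (word_lower.empty()) {
  word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
}
```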
225 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); | 272 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
226 | 273 |
227 // Quick out if the word is not part of any term, which is the common case. | 274 // Quick out if the word is not part of any term, which is the common case. |
228 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 275 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
229 // Word doesn't exist in our terms so we can clear the n-gram state. | 276 // Word doesn't exist in our terms so we can clear the n-gram state. |
230 state_->previous_words.clear(); | 277 state_->previous_words.clear(); |
231 state_->previous_word_sizes.clear(); | 278 state_->previous_word_sizes.clear(); |
232 // Insert into negative cache so that we don't try this again. | 279 // Insert into negative cache so that we don't try this again. |
233 negative_word_cache_.Put(word, true); | 280 negative_word_cache_.Put(word, true); |
234 return; | 281 return; |
(...skipping 60 matching lines...) | |
295 clock_->Now() - state_->start_time); | 342 clock_->Now() - state_->start_time); |
296 | 343 |
297 DCHECK(!done_callback_.is_null()); | 344 DCHECK(!done_callback_.is_null()); |
298 done_callback_.Run(success); | 345 done_callback_.Run(success); |
299 Clear(); | 346 Clear(); |
300 } | 347 } |
301 | 348 |
302 void PhishingTermFeatureExtractor::Clear() { | 349 void PhishingTermFeatureExtractor::Clear() { |
303 page_text_ = NULL; | 350 page_text_ = NULL; |
304 features_ = NULL; | 351 features_ = NULL; |
352 shingle_hashes_ = NULL; | |
305 done_callback_.Reset(); | 353 done_callback_.Reset(); |
306 state_.reset(NULL); | 354 state_.reset(NULL); |
307 negative_word_cache_.Clear(); | 355 negative_word_cache_.Clear(); |
308 } | 356 } |
309 | 357 |
310 } // namespace safe_browsing | 358 } // namespace safe_browsing |