Chromium Code Reviews

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix a reference problem. Created 6 years, 7 months ago
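For context: this change collects hashes of every run of shingle_size_ consecutive lowercased words (a "shingle") from the page text into a capped std::set, so the classifier can later compare pages by shingle overlap. Below is a minimal standalone sketch of that sliding-window idea, with std::hash standing in for Chromium's MurmurHash3String and the function name ComputeShingleHashes invented for illustration; it is not part of this CL.

    // Standalone sketch (not this CL's code): hash every window of
    // |shingle_size| consecutive words. std::hash stands in for
    // MurmurHash3String(shingle, seed), and case folding is omitted.
    #include <cstddef>
    #include <deque>
    #include <functional>
    #include <set>
    #include <sstream>
    #include <string>

    std::set<size_t> ComputeShingleHashes(const std::string& page_text,
                                          size_t shingle_size,
                                          size_t max_hashes) {
      std::set<size_t> hashes;
      std::deque<std::string> window;  // the last |shingle_size| words
      std::istringstream words(page_text);
      std::string word;
      while (words >> word && hashes.size() < max_hashes) {
        window.push_back(word);
        if (window.size() > shingle_size)
          window.pop_front();
        if (window.size() == shingle_size) {
          std::string shingle;
          for (const std::string& w : window)
            shingle += w + " ";  // the extractor below also keeps a trailing space
          hashes.insert(std::hash<std::string>()(shingle));
        }
      }
      return hashes;
    }

Two near-duplicate pages share most of their shingle hashes, which is what the similarity detection compares. The extractor in the diff does the same thing incrementally inside HandleWord(), capped at max_hashes_per_page_, and reuses the same MurmurHash3 seed as the existing term features.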
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6 6
7 #include <list> 7 #include <list>
8 #include <map> 8 #include <map>
9 9
10 #include "base/bind.h" 10 #include "base/bind.h"
(...skipping 27 matching lines...)
38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; 38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
39 39
40 // The maximum size of the negative word cache. 40 // The maximum size of the negative word cache.
41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000; 41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
42 42
43 // All of the state pertaining to the current feature extraction. 43 // All of the state pertaining to the current feature extraction.
44 struct PhishingTermFeatureExtractor::ExtractionState { 44 struct PhishingTermFeatureExtractor::ExtractionState {
45 // Stores up to max_words_per_term_ previous words separated by spaces. 45 // Stores up to max_words_per_term_ previous words separated by spaces.
46 std::string previous_words; 46 std::string previous_words;
47 47
48 // Stores the current shingle after a new word is processed and added in.
49 std::string current_shingle;
50
51 // Stores the sizes of the words in current_shingle. Note: the size includes
52 // the space after each word. In other words, the sum of all sizes in this
53 // list is equal to the length of current_shingle.
54 std::list<size_t> shingle_word_sizes;
55
48 // Stores the sizes of the words in previous_words. Note: the size includes 56 // Stores the sizes of the words in previous_words. Note: the size includes
49 // the space after each word. In other words, the sum of all sizes in this 57 // the space after each word. In other words, the sum of all sizes in this
50 // list is equal to the length of previous_words. 58 // list is equal to the length of previous_words.
51 std::list<size_t> previous_word_sizes; 59 std::list<size_t> previous_word_sizes;
52 60
53 // An iterator for word breaking. 61 // An iterator for word breaking.
54 UBreakIterator* iterator; 62 UBreakIterator* iterator;
55 63
56 // Our current position in the text that was passed to the ExtractionState 64 // Our current position in the text that was passed to the ExtractionState
57 // constructor, specifically, the most recent break position returned by our 65
(...skipping 28 matching lines...)
86 ~ExtractionState() { 94 ~ExtractionState() {
87 if (iterator) { 95 if (iterator) {
88 ubrk_close(iterator); 96 ubrk_close(iterator);
89 } 97 }
90 } 98 }
91 }; 99 };
92 100
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( 101 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
94 const base::hash_set<std::string>* page_term_hashes, 102 const base::hash_set<std::string>* page_term_hashes,
95 const base::hash_set<uint32>* page_word_hashes, 103 const base::hash_set<uint32>* page_word_hashes,
104 size_t max_hashes_per_page,
96 size_t max_words_per_term, 105 size_t max_words_per_term,
97 uint32 murmurhash3_seed, 106 uint32 murmurhash3_seed,
107 size_t shingle_size,
98 FeatureExtractorClock* clock) 108 FeatureExtractorClock* clock)
99 : page_term_hashes_(page_term_hashes), 109 : page_term_hashes_(page_term_hashes),
100 page_word_hashes_(page_word_hashes), 110 page_word_hashes_(page_word_hashes),
111 max_hashes_per_page_(max_hashes_per_page),
101 max_words_per_term_(max_words_per_term), 112 max_words_per_term_(max_words_per_term),
102 murmurhash3_seed_(murmurhash3_seed), 113 murmurhash3_seed_(murmurhash3_seed),
114 shingle_size_(shingle_size),
103 negative_word_cache_(kMaxNegativeWordCacheSize), 115 negative_word_cache_(kMaxNegativeWordCacheSize),
104 clock_(clock), 116 clock_(clock),
105 weak_factory_(this) { 117 weak_factory_(this) {
106 Clear(); 118 Clear();
107 } 119 }
108 120
109 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { 121 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
110 // The RenderView should have called CancelPendingExtraction() before 122 // The RenderView should have called CancelPendingExtraction() before
111 // we are destroyed. 123 // we are destroyed.
112 CheckNoPendingExtraction(); 124 CheckNoPendingExtraction();
113 } 125 }
114 126
115 void PhishingTermFeatureExtractor::ExtractFeatures( 127 void PhishingTermFeatureExtractor::ExtractFeatures(
116 const base::string16* page_text, 128 const base::string16* page_text,
117 FeatureMap* features, 129 FeatureMap* features,
130 std::set<uint32>* shingle_hashes,
118 const DoneCallback& done_callback) { 131 const DoneCallback& done_callback) {
119 // The RenderView should have called CancelPendingExtraction() before 132 // The RenderView should have called CancelPendingExtraction() before
120 // starting a new extraction, so DCHECK this. 133 // starting a new extraction, so DCHECK this.
121 CheckNoPendingExtraction(); 134 CheckNoPendingExtraction();
122 // However, in an opt build, we will go ahead and clean up the pending 135 // However, in an opt build, we will go ahead and clean up the pending
123 // extraction so that we can start in a known state. 136 // extraction so that we can start in a known state.
124 CancelPendingExtraction(); 137 CancelPendingExtraction();
125 138
126 page_text_ = page_text; 139 page_text_ = page_text;
127 features_ = features; 140 features_ = features;
141 shingle_hashes_ = shingle_hashes;
128 done_callback_ = done_callback; 142 done_callback_ = done_callback;
129 143
130 state_.reset(new ExtractionState(*page_text_, clock_->Now())); 144 state_.reset(new ExtractionState(*page_text_, clock_->Now()));
131 base::MessageLoop::current()->PostTask( 145 base::MessageLoop::current()->PostTask(
132 FROM_HERE, 146 FROM_HERE,
133 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, 147 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
134 weak_factory_.GetWeakPtr())); 148 weak_factory_.GetWeakPtr()));
135 } 149 }
136 150
137 void PhishingTermFeatureExtractor::CancelPendingExtraction() { 151 void PhishingTermFeatureExtractor::CancelPendingExtraction() {
(...skipping 65 matching lines...)
203 return; 217 return;
204 } 218 }
205 // Otherwise, continue. 219 // Otherwise, continue.
206 } 220 }
207 } 221 }
208 RunCallback(true); 222 RunCallback(true);
209 } 223 }
210 224
211 void PhishingTermFeatureExtractor::HandleWord( 225 void PhishingTermFeatureExtractor::HandleWord(
212 const base::StringPiece16& word) { 226 const base::StringPiece16& word) {
227 // First, extract shingle hashes. We check the size of shingle_hashes_ first
228 // to skip as soon as we reach |max_hashes_per_page_|.
229 std::string word_lower;
230 if (shingle_hashes_->size() < max_hashes_per_page_) {
231 word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
232 if (state_->shingle_word_sizes.size() < shingle_size_ - 1) {
233 // Adding |word| would not form a complete shingle, simply insert it.
234 state_->current_shingle.append(word_lower + " ");
235 state_->shingle_word_sizes.push_back(word_lower.size() + 1);
236 } else if (state_->shingle_word_sizes.size() == shingle_size_ - 1) {
237 // Adding |word| would just form a complete shingle, insert it and
238 // calculate the shingle hash.
239 state_->current_shingle.append(word_lower + " ");
240 state_->shingle_word_sizes.push_back(word_lower.size() + 1);
241 shingle_hashes_->insert(
242 MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
243 } else {
244 // We need to remove the first word from current_shingle, and add |word|
245 // at the end to form current_shingle.
246 state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
247 state_->shingle_word_sizes.pop_front();
248 state_->current_shingle.append(word_lower + " ");
249 state_->shingle_word_sizes.push_back(word_lower.size() + 1);
250 shingle_hashes_->insert(
251 MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
252 }
mattm 2014/05/06 01:00:14 each of the cases here duplicates the code of the
zysxqn 2014/05/06 20:56:57 Done.
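One way the duplicated append logic could be folded together, roughly along the lines the reviewer is asking for (a sketch only, using the same ExtractionState fields as above; the actual follow-up patch set may differ):

    // Sketch: append unconditionally, trim the window, then hash once full.
    state_->current_shingle.append(word_lower + " ");
    state_->shingle_word_sizes.push_back(word_lower.size() + 1);
    if (state_->shingle_word_sizes.size() > shingle_size_) {
      // Drop the oldest word (and its trailing space) from the window.
      state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
      state_->shingle_word_sizes.pop_front();
    }
    if (state_->shingle_word_sizes.size() == shingle_size_) {
      shingle_hashes_->insert(
          MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    }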
253 }
254
255 // Next, extract page terms.
256 //
213 // Quickest out if we have seen this word before and know that it's not 257 // Quickest out if we have seen this word before and know that it's not
214 // part of any term. This avoids the lowercasing and UTF conversion, both of 258 // part of any term. This avoids the lowercasing and UTF conversion, both of
215 // which are relatively expensive. 259 // which are relatively expensive.
216 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) { 260 if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
217 // We know we're no longer in a possible n-gram, so clear the previous word 261 // We know we're no longer in a possible n-gram, so clear the previous word
218 // state. 262 // state.
219 state_->previous_words.clear(); 263 state_->previous_words.clear();
220 state_->previous_word_sizes.clear(); 264 state_->previous_word_sizes.clear();
221 return; 265 return;
222 } 266 }
223 267
224 std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word)); 268 // Don't recalculate.
269 if (shingle_hashes_->size() >= max_hashes_per_page_) {
mattm 2014/05/06 01:00:14 I'd probably be more comfortable with: if (word_lower
zysxqn 2014/05/06 20:56:57 Done.
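The reviewer's comment is cut off above; a plausible reading is to key the recomputation off word_lower itself rather than re-checking the shingle-cap condition, roughly as follows (a guess, not necessarily the committed change):

    // Recompute the lowercased word only if the shingle branch above did not
    // already do so. (If |word| itself lowercases to an empty string, the
    // redundant call is harmless.)
    if (word_lower.empty())
      word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));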
270 word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
271 }
225 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); 272 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
226 273
227 // Quick out if the word is not part of any term, which is the common case. 274 // Quick out if the word is not part of any term, which is the common case.
228 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { 275 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
229 // Word doesn't exist in our terms so we can clear the n-gram state. 276 // Word doesn't exist in our terms so we can clear the n-gram state.
230 state_->previous_words.clear(); 277 state_->previous_words.clear();
231 state_->previous_word_sizes.clear(); 278 state_->previous_word_sizes.clear();
232 // Insert into negative cache so that we don't try this again. 279 // Insert into negative cache so that we don't try this again.
233 negative_word_cache_.Put(word, true); 280 negative_word_cache_.Put(word, true);
234 return; 281 return;
(...skipping 60 matching lines...)
295 clock_->Now() - state_->start_time); 342 clock_->Now() - state_->start_time);
296 343
297 DCHECK(!done_callback_.is_null()); 344 DCHECK(!done_callback_.is_null());
298 done_callback_.Run(success); 345 done_callback_.Run(success);
299 Clear(); 346 Clear();
300 } 347 }
301 348
302 void PhishingTermFeatureExtractor::Clear() { 349 void PhishingTermFeatureExtractor::Clear() {
303 page_text_ = NULL; 350 page_text_ = NULL;
304 features_ = NULL; 351 features_ = NULL;
352 shingle_hashes_ = NULL;
305 done_callback_.Reset(); 353 done_callback_.Reset();
306 state_.reset(NULL); 354 state_.reset(NULL);
307 negative_word_cache_.Clear(); 355 negative_word_cache_.Clear();
308 } 356 }
309 357
310 } // namespace safe_browsing 358 } // namespace safe_browsing
