| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <list> | 7 #include <list> |
| 8 #include <map> | 8 #include <map> |
| 9 | 9 |
| 10 #include "base/bind.h" | 10 #include "base/bind.h" |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 77 if (i->Init()) { | 77 if (i->Init()) { |
| 78 iterator = i.Pass(); | 78 iterator = i.Pass(); |
| 79 } else { | 79 } else { |
| 80 DLOG(ERROR) << "failed to open iterator"; | 80 DLOG(ERROR) << "failed to open iterator"; |
| 81 } | 81 } |
| 82 } | 82 } |
| 83 }; | 83 }; |
| 84 | 84 |
| 85 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( | 85 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( |
| 86 const base::hash_set<std::string>* page_term_hashes, | 86 const base::hash_set<std::string>* page_term_hashes, |
| 87 const base::hash_set<uint32>* page_word_hashes, | 87 const base::hash_set<uint32_t>* page_word_hashes, |
| 88 size_t max_words_per_term, | 88 size_t max_words_per_term, |
| 89 uint32 murmurhash3_seed, | 89 uint32_t murmurhash3_seed, |
| 90 size_t max_shingles_per_page, | 90 size_t max_shingles_per_page, |
| 91 size_t shingle_size, | 91 size_t shingle_size, |
| 92 FeatureExtractorClock* clock) | 92 FeatureExtractorClock* clock) |
| 93 : page_term_hashes_(page_term_hashes), | 93 : page_term_hashes_(page_term_hashes), |
| 94 page_word_hashes_(page_word_hashes), | 94 page_word_hashes_(page_word_hashes), |
| 95 max_words_per_term_(max_words_per_term), | 95 max_words_per_term_(max_words_per_term), |
| 96 murmurhash3_seed_(murmurhash3_seed), | 96 murmurhash3_seed_(murmurhash3_seed), |
| 97 max_shingles_per_page_(max_shingles_per_page), | 97 max_shingles_per_page_(max_shingles_per_page), |
| 98 shingle_size_(shingle_size), | 98 shingle_size_(shingle_size), |
| 99 clock_(clock), | 99 clock_(clock), |
| 100 weak_factory_(this) { | 100 weak_factory_(this) { |
| 101 Clear(); | 101 Clear(); |
| 102 } | 102 } |
| 103 | 103 |
| 104 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { | 104 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { |
| 105 // The RenderView should have called CancelPendingExtraction() before | 105 // The RenderView should have called CancelPendingExtraction() before |
| 106 // we are destroyed. | 106 // we are destroyed. |
| 107 CheckNoPendingExtraction(); | 107 CheckNoPendingExtraction(); |
| 108 } | 108 } |
| 109 | 109 |
| 110 void PhishingTermFeatureExtractor::ExtractFeatures( | 110 void PhishingTermFeatureExtractor::ExtractFeatures( |
| 111 const base::string16* page_text, | 111 const base::string16* page_text, |
| 112 FeatureMap* features, | 112 FeatureMap* features, |
| 113 std::set<uint32>* shingle_hashes, | 113 std::set<uint32_t>* shingle_hashes, |
| 114 const DoneCallback& done_callback) { | 114 const DoneCallback& done_callback) { |
| 115 // The RenderView should have called CancelPendingExtraction() before | 115 // The RenderView should have called CancelPendingExtraction() before |
| 116 // starting a new extraction, so DCHECK this. | 116 // starting a new extraction, so DCHECK this. |
| 117 CheckNoPendingExtraction(); | 117 CheckNoPendingExtraction(); |
| 118 // However, in an opt build, we will go ahead and clean up the pending | 118 // However, in an opt build, we will go ahead and clean up the pending |
| 119 // extraction so that we can start in a known state. | 119 // extraction so that we can start in a known state. |
| 120 CancelPendingExtraction(); | 120 CancelPendingExtraction(); |
| 121 | 121 |
| 122 page_text_ = page_text; | 122 page_text_ = page_text; |
| 123 features_ = features; | 123 features_ = features; |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 201 state_->shingle_word_sizes.push_back(word_lower.size() + 1); | 201 state_->shingle_word_sizes.push_back(word_lower.size() + 1); |
| 202 if (state_->shingle_word_sizes.size() == shingle_size_) { | 202 if (state_->shingle_word_sizes.size() == shingle_size_) { |
| 203 shingle_hashes_->insert( | 203 shingle_hashes_->insert( |
| 204 MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); | 204 MurmurHash3String(state_->current_shingle, murmurhash3_seed_)); |
| 205 state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); | 205 state_->current_shingle.erase(0, state_->shingle_word_sizes.front()); |
| 206 state_->shingle_word_sizes.pop_front(); | 206 state_->shingle_word_sizes.pop_front(); |
| 207 } | 207 } |
| 208 // Check if the size of shingle hashes is over the limit. | 208 // Check if the size of shingle hashes is over the limit. |
| 209 if (shingle_hashes_->size() > max_shingles_per_page_) { | 209 if (shingle_hashes_->size() > max_shingles_per_page_) { |
| 210 // Pop the largest one. | 210 // Pop the largest one. |
| 211 std::set<uint32>::iterator it = shingle_hashes_->end(); | 211 std::set<uint32_t>::iterator it = shingle_hashes_->end(); |
| 212 shingle_hashes_->erase(--it); | 212 shingle_hashes_->erase(--it); |
| 213 } | 213 } |
| 214 | 214 |
| 215 // Next, extract page terms. | 215 // Next, extract page terms. |
| 216 uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); | 216 uint32_t word_hash = MurmurHash3String(word_lower, murmurhash3_seed_); |
| 217 | 217 |
| 218 // Quick out if the word is not part of any term, which is the common case. | 218 // Quick out if the word is not part of any term, which is the common case. |
| 219 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 219 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 220 // Word doesn't exist in our terms so we can clear the n-gram state. | 220 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 221 state_->previous_words.clear(); | 221 state_->previous_words.clear(); |
| 222 state_->previous_word_sizes.clear(); | 222 state_->previous_word_sizes.clear(); |
| 223 return; | 223 return; |
| 224 } | 224 } |
| 225 | 225 |
| 226 // Find all of the n-grams that we need to check and compute their SHA-256 | 226 // Find all of the n-grams that we need to check and compute their SHA-256 |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 290 | 290 |
| 291 void PhishingTermFeatureExtractor::Clear() { | 291 void PhishingTermFeatureExtractor::Clear() { |
| 292 page_text_ = NULL; | 292 page_text_ = NULL; |
| 293 features_ = NULL; | 293 features_ = NULL; |
| 294 shingle_hashes_ = NULL; | 294 shingle_hashes_ = NULL; |
| 295 done_callback_.Reset(); | 295 done_callback_.Reset(); |
| 296 state_.reset(NULL); | 296 state_.reset(NULL); |
| 297 } | 297 } |
| 298 | 298 |
| 299 } // namespace safe_browsing | 299 } // namespace safe_browsing |
| OLD | NEW |