OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
6 | 6 |
7 #include <list> | 7 #include <list> |
8 #include <map> | 8 #include <map> |
9 | 9 |
10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
11 #include "base/logging.h" | 11 #include "base/logging.h" |
12 #include "base/message_loop.h" | 12 #include "base/message_loop.h" |
13 #include "base/sha2.h" | |
14 #include "base/metrics/histogram.h" | 13 #include "base/metrics/histogram.h" |
15 #include "base/time.h" | 14 #include "base/time.h" |
16 #include "base/utf_string_conversions.h" | 15 #include "base/utf_string_conversions.h" |
| 16 #include "crypto/sha2.h" |
17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
18 #include "chrome/renderer/safe_browsing/features.h" | 18 #include "chrome/renderer/safe_browsing/features.h" |
19 #include "ui/base/l10n/l10n_util.h" | 19 #include "ui/base/l10n/l10n_util.h" |
20 #include "unicode/ubrk.h" | 20 #include "unicode/ubrk.h" |
21 | 21 |
22 namespace safe_browsing { | 22 namespace safe_browsing { |
23 | 23 |
24 // This time should be short enough that it doesn't noticeably disrupt the | 24 // This time should be short enough that it doesn't noticeably disrupt the |
25 // user's interaction with the page. | 25 // user's interaction with the page. |
26 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; | 26 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; |
(...skipping 166 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
193 return; | 193 return; |
194 } | 194 } |
195 // Otherwise, continue. | 195 // Otherwise, continue. |
196 } | 196 } |
197 } | 197 } |
198 RunCallback(true); | 198 RunCallback(true); |
199 } | 199 } |
200 | 200 |
201 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { | 201 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
202 std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); | 202 std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); |
203 std::string word_hash = base::SHA256HashString(word_lower); | 203 std::string word_hash = crypto::SHA256HashString(word_lower); |
204 | 204 |
205 // Quick out if the word is not part of any term, which is the common case. | 205 // Quick out if the word is not part of any term, which is the common case. |
206 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 206 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
207 // Word doesn't exist in our terms so we can clear the n-gram state. | 207 // Word doesn't exist in our terms so we can clear the n-gram state. |
208 state_->previous_words.clear(); | 208 state_->previous_words.clear(); |
209 state_->previous_word_sizes.clear(); | 209 state_->previous_word_sizes.clear(); |
210 return; | 210 return; |
211 } | 211 } |
212 | 212 |
213 // Find all of the n-grams that we need to check and compute their hashes. | 213 // Find all of the n-grams that we need to check and compute their hashes. |
(...skipping 13 matching lines...) Expand all Loading... |
227 // - We could include positional information about words in the n-grams, | 227 // - We could include positional information about words in the n-grams, |
228 // rather than just a list of all of the words. For example, we could | 228 // rather than just a list of all of the words. For example, we could |
229 // change the term format so that each word is hashed separately, or | 229 // change the term format so that each word is hashed separately, or |
230 // we could add extra data to the word list to indicate the position | 230 // we could add extra data to the word list to indicate the position |
231 // at which the word appears in an n-gram, and skip checking the word if | 231 // at which the word appears in an n-gram, and skip checking the word if |
232 // it's not at that position. | 232 // it's not at that position. |
233 state_->previous_words.append(word_lower); | 233 state_->previous_words.append(word_lower); |
234 std::string current_term = state_->previous_words; | 234 std::string current_term = state_->previous_words; |
235 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 235 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
236 it != state_->previous_word_sizes.end(); ++it) { | 236 it != state_->previous_word_sizes.end(); ++it) { |
237 hashes_to_check[base::SHA256HashString(current_term)] = current_term; | 237 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
238 current_term.erase(0, *it); | 238 current_term.erase(0, *it); |
239 } | 239 } |
240 | 240 |
241 // Add features for any hashes that match page_term_hashes_. | 241 // Add features for any hashes that match page_term_hashes_. |
242 for (std::map<std::string, std::string>::iterator it = | 242 for (std::map<std::string, std::string>::iterator it = |
243 hashes_to_check.begin(); | 243 hashes_to_check.begin(); |
244 it != hashes_to_check.end(); ++it) { | 244 it != hashes_to_check.end(); ++it) { |
245 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { | 245 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { |
246 features_->AddBooleanFeature(features::kPageTerm + it->second); | 246 features_->AddBooleanFeature(features::kPageTerm + it->second); |
247 } | 247 } |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
286 } | 286 } |
287 | 287 |
288 void PhishingTermFeatureExtractor::Clear() { | 288 void PhishingTermFeatureExtractor::Clear() { |
289 page_text_ = NULL; | 289 page_text_ = NULL; |
290 features_ = NULL; | 290 features_ = NULL; |
291 done_callback_.reset(NULL); | 291 done_callback_.reset(NULL); |
292 state_.reset(NULL); | 292 state_.reset(NULL); |
293 } | 293 } |
294 | 294 |
295 } // namespace safe_browsing | 295 } // namespace safe_browsing |
OLD | NEW |