| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 | 6 |
| 7 #include <list> | 7 #include <list> |
| 8 #include <map> | 8 #include <map> |
| 9 | 9 |
| 10 #include "base/compiler_specific.h" | 10 #include "base/compiler_specific.h" |
| 11 #include "base/logging.h" | 11 #include "base/logging.h" |
| 12 #include "base/message_loop.h" | 12 #include "base/message_loop.h" |
| 13 #include "base/sha2.h" | |
| 14 #include "base/metrics/histogram.h" | 13 #include "base/metrics/histogram.h" |
| 15 #include "base/time.h" | 14 #include "base/time.h" |
| 16 #include "base/utf_string_conversions.h" | 15 #include "base/utf_string_conversions.h" |
| 16 #include "crypto/sha2.h" |
| 17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 18 #include "chrome/renderer/safe_browsing/features.h" | 18 #include "chrome/renderer/safe_browsing/features.h" |
| 19 #include "ui/base/l10n/l10n_util.h" | 19 #include "ui/base/l10n/l10n_util.h" |
| 20 #include "unicode/ubrk.h" | 20 #include "unicode/ubrk.h" |
| 21 | 21 |
| 22 namespace safe_browsing { | 22 namespace safe_browsing { |
| 23 | 23 |
| 24 // This time should be short enough that it doesn't noticeably disrupt the | 24 // This time should be short enough that it doesn't noticeably disrupt the |
| 25 // user's interaction with the page. | 25 // user's interaction with the page. |
| 26 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; | 26 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 20; |
| (...skipping 166 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 193 return; | 193 return; |
| 194 } | 194 } |
| 195 // Otherwise, continue. | 195 // Otherwise, continue. |
| 196 } | 196 } |
| 197 } | 197 } |
| 198 RunCallback(true); | 198 RunCallback(true); |
| 199 } | 199 } |
| 200 | 200 |
| 201 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { | 201 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
| 202 std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); | 202 std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); |
| 203 std::string word_hash = base::SHA256HashString(word_lower); | 203 std::string word_hash = crypto::SHA256HashString(word_lower); |
| 204 | 204 |
| 205 // Quick out if the word is not part of any term, which is the common case. | 205 // Quick out if the word is not part of any term, which is the common case. |
| 206 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { | 206 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 207 // Word doesn't exist in our terms so we can clear the n-gram state. | 207 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 208 state_->previous_words.clear(); | 208 state_->previous_words.clear(); |
| 209 state_->previous_word_sizes.clear(); | 209 state_->previous_word_sizes.clear(); |
| 210 return; | 210 return; |
| 211 } | 211 } |
| 212 | 212 |
| 213 // Find all of the n-grams that we need to check and compute their hashes. | 213 // Find all of the n-grams that we need to check and compute their hashes. |
| (...skipping 13 matching lines...) Expand all Loading... |
| 227 // - We could include positional information about words in the n-grams, | 227 // - We could include positional information about words in the n-grams, |
| 228 // rather than just a list of all of the words. For example, we could | 228 // rather than just a list of all of the words. For example, we could |
| 229 // change the term format so that each word is hashed separately, or | 229 // change the term format so that each word is hashed separately, or |
| 230 // we could add extra data to the word list to indicate the position | 230 // we could add extra data to the word list to indicate the position |
| 231 // at which the word appears in an n-gram, and skip checking the word if | 231 // at which the word appears in an n-gram, and skip checking the word if |
| 232 // it's not at that position. | 232 // it's not at that position. |
| 233 state_->previous_words.append(word_lower); | 233 state_->previous_words.append(word_lower); |
| 234 std::string current_term = state_->previous_words; | 234 std::string current_term = state_->previous_words; |
| 235 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); | 235 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
| 236 it != state_->previous_word_sizes.end(); ++it) { | 236 it != state_->previous_word_sizes.end(); ++it) { |
| 237 hashes_to_check[base::SHA256HashString(current_term)] = current_term; | 237 hashes_to_check[crypto::SHA256HashString(current_term)] = current_term; |
| 238 current_term.erase(0, *it); | 238 current_term.erase(0, *it); |
| 239 } | 239 } |
| 240 | 240 |
| 241 // Add features for any hashes that match page_term_hashes_. | 241 // Add features for any hashes that match page_term_hashes_. |
| 242 for (std::map<std::string, std::string>::iterator it = | 242 for (std::map<std::string, std::string>::iterator it = |
| 243 hashes_to_check.begin(); | 243 hashes_to_check.begin(); |
| 244 it != hashes_to_check.end(); ++it) { | 244 it != hashes_to_check.end(); ++it) { |
| 245 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { | 245 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { |
| 246 features_->AddBooleanFeature(features::kPageTerm + it->second); | 246 features_->AddBooleanFeature(features::kPageTerm + it->second); |
| 247 } | 247 } |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 286 } | 286 } |
| 287 | 287 |
| 288 void PhishingTermFeatureExtractor::Clear() { | 288 void PhishingTermFeatureExtractor::Clear() { |
| 289 page_text_ = NULL; | 289 page_text_ = NULL; |
| 290 features_ = NULL; | 290 features_ = NULL; |
| 291 done_callback_.reset(NULL); | 291 done_callback_.reset(NULL); |
| 292 state_.reset(NULL); | 292 state_.reset(NULL); |
| 293 } | 293 } |
| 294 | 294 |
| 295 } // namespace safe_browsing | 295 } // namespace safe_browsing |
| OLD | NEW |