Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..0ec0dbcbeee9602adb9f16a5a711eef514a7ef60 |
--- /dev/null |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
@@ -0,0 +1,295 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
+ |
+#include <list> |
+#include <map> |
+ |
+#include "app/l10n_util.h" |
+#include "base/compiler_specific.h" |
+#include "base/histogram.h" |
+#include "base/logging.h" |
+#include "base/message_loop.h" |
+#include "base/sha2.h" |
+#include "base/time.h" |
+#include "base/utf_string_conversions.h" |
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
+#include "chrome/renderer/safe_browsing/features.h" |
+#include "unicode/ubrk.h" |
+ |
+namespace safe_browsing { |
+ |
// Budget for a single chunk of feature extraction before we yield back to
// the message loop.  This time should be short enough that it doesn't
// noticeably disrupt the user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50;

// Number of words to process between clock checks.  Experimenting shows that
// we get a reasonable gain in performance by increasing this up to around 10,
// but there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

// Hard cap on total extraction time across all chunks; extraction fails once
// this is exceeded.  This should be longer than we expect feature extraction
// to take on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
+ |
// All of the state pertaining to the current feature extraction.  Owns the
// ICU break iterator (closed in the destructor).
//
// NOTE(review): the implicitly-generated copy constructor/assignment would
// double-close |iterator|; this is safe today only because the object is
// held exclusively in a scoped_ptr (state_) and never copied.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_ngram_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.  NULL if ubrk_open() failed.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by
  // our iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // |text| must outlive this object: ubrk_open() retains a pointer to the
  // caller's UTF-16 buffer rather than copying it.
  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      // Leave the iterator NULL; ExtractFeaturesWithTimeout() treats this as
      // a failed extraction.
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
+ |
// Constructs an extractor that matches words/terms against the given hash
// sets.  The hash sets and clock are borrowed, not owned; they must outlive
// this object.  max_words_per_term caps the length of the n-grams checked.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<std::string>* page_word_hashes,
    size_t max_words_per_term,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
  // Start from a known-empty extraction state.
  Clear();
}
+ |
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}
+ |
// Begins asynchronous feature extraction over |page_text|.  |page_text| and
// |features| are borrowed and must stay alive until |done_callback| runs or
// CancelPendingExtraction() is called; ownership of |done_callback| is
// taken.  The actual work happens in chunks posted to the message loop.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const string16* page_text,
    FeatureMap* features,
    DoneCallback* done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_.reset(done_callback);

  // ExtractionState keeps a pointer into *page_text_, so the assignment
  // above must happen before this.
  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  MessageLoop::current()->PostTask(
      FROM_HERE,
      method_factory_.NewRunnableMethod(
          &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
}
+ |
// Stops any in-progress extraction without running the done callback.
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  method_factory_.RevokeAll();
  Clear();
}
+ |
// Runs one chunk of feature extraction on the message loop.  Each call
// either finishes extraction (invoking the done callback via RunCallback),
// gives up on total timeout, or re-posts itself to continue in a new chunk
// once kMaxTimePerChunkMs has elapsed.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  // Words handled since the last clock check.  We only consult the clock
  // every kClockCheckGranularity words to keep timing overhead low.
  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word, so the range
      // [position, next) is an actual word (not punctuation or whitespace).
      HandleWord(string16(*page_text_, state_->position,
                          next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      // Total-time check first: it spans all chunks (including queue time),
      // measured from when ExtractFeatures() created the state.
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        MessageLoop::current()->PostTask(
            FROM_HERE,
            method_factory_.NewRunnableMethod(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Ran out of text without hitting a limit: extraction succeeded.
  RunCallback(true);
}
+ |
// Processes one word of page text: checks the word and every n-gram ending
// in it (up to max_words_per_term_ words) against the term hashes, adding a
// feature for each match, and updates the sliding window of previous words.
//
// Invariant: previous_word_sizes holds at most max_words_per_term_ - 1
// entries, and the sum of its entries equals previous_words.size(), so the
// longest term checked (all previous words + the current one) has exactly
// max_words_per_term_ words.
void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
  std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word));
  std::string word_hash = base::SHA256HashString(word_lower);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their hashes.
  // We already have the hash for word_lower, so we don't compute that again.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[word_hash] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  // TODO(bryner): Use UMA stats to determine whether this is too slow.
  // If it is, there are a couple of cases that we could optimize:
  // - We could cache plaintext words that are not in page_word_hashes_, so
  //   that we can avoid hashing these again.
  // - We could include positional information about words in the n-grams,
  //   rather than just a list of all of the words. For example, we could
  //   change the term format so that each word is hashed separately, or
  //   we could add extra data to the word list to indicate the position
  //   at which the word appears in an n-gram, and skip checking the word if
  //   it's not at that position.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  // Each pass hashes the current suffix of previous_words + the new word,
  // then drops the oldest word from the front, yielding successively
  // shorter n-grams that all end in the new word.
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[base::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words: dropping the oldest word here keeps
  // previous_word_sizes at no more than max_words_per_term_ - 1 entries, so
  // the next call's longest term is exactly max_words_per_term_ words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
+ |
+void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { |
+ DCHECK(!done_callback_.get()); |
+ DCHECK(!state_.get()); |
+ if (done_callback_.get() || state_.get()) { |
+ LOG(ERROR) << "Extraction in progress, missing call to " |
+ << "CancelPendingExtraction"; |
+ } |
+} |
+ |
+void PhishingTermFeatureExtractor::RunCallback(bool success) { |
+ // Record some timing stats that we can use to evaluate feature extraction |
+ // performance. These include both successful and failed extractions. |
+ DCHECK(state_.get()); |
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations", |
+ state_->num_iterations); |
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime", |
+ clock_->Now() - state_->start_time); |
+ |
+ DCHECK(done_callback_.get()); |
+ done_callback_->Run(success); |
+ Clear(); |
+} |
+ |
+void PhishingTermFeatureExtractor::Clear() { |
+ page_text_ = NULL; |
+ features_ = NULL; |
+ done_callback_.reset(NULL); |
+ state_.reset(NULL); |
+} |
+ |
+} // namespace safe_browsing |