Chromium Code Reviews

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 3214002: Add a term feature extractor for client-side phishing detection. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Add an extra comment/TODO about performance. Created 10 years, 3 months ago
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ec0dbcbeee9602adb9f16a5a711eef514a7ef60
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -0,0 +1,295 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
+
+#include <list>
+#include <map>
+
+#include "app/l10n_util.h"
+#include "base/compiler_specific.h"
+#include "base/histogram.h"
+#include "base/logging.h"
+#include "base/message_loop.h"
+#include "base/sha2.h"
+#include "base/time.h"
+#include "base/utf_string_conversions.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "unicode/ubrk.h"
+
+namespace safe_browsing {
+
+// This time should be short enough that it doesn't noticeably disrupt the
+// user's interaction with the page.
+const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50;
+
+// Experimentation shows a reasonable performance gain from increasing this
+// up to around 10, but little additional benefit beyond that.
+const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;
+
+// This should be longer than we expect feature extraction to take on any
+// actual phishing page.
+const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
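+
+// Taken together, these constants define the extraction schedule: the clock
+// is checked once every kClockCheckGranularity words, extraction yields the
+// thread after kMaxTimePerChunkMs of work in the current chunk, and gives up
+// entirely once kMaxTotalTimeMs has elapsed overall.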
+
+// All of the state pertaining to the current feature extraction.
+struct PhishingTermFeatureExtractor::ExtractionState {
+ // Stores up to max_words_per_term_ - 1 previous words, separated by
+ // spaces.
+ std::string previous_words;
+
+ // Stores the sizes of the words in previous_words. Note: the size includes
+ // the space after each word. In other words, the sum of all sizes in this
+ // list is equal to the length of previous_words.
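+ //
+ // For example, after handling the words "dear" and "valued",
+ // previous_words is "dear valued " and previous_word_sizes is {5, 7};
+ // the sizes sum to 12, the length of previous_words.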
+ std::list<size_t> previous_word_sizes;
+
+ // An iterator for word breaking.
+ UBreakIterator* iterator;
+
+ // Our current position in the text that was passed to the ExtractionState
+ // constructor; specifically, the most recent break position returned by
+ // our iterator.
+ int position;
+
+ // True if position has been initialized.
+ bool position_initialized;
+
+ // The time at which we started feature extraction for the current page.
+ base::TimeTicks start_time;
+
+ // The number of iterations we've done for the current extraction.
+ int num_iterations;
+
+ ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
+ : position(-1),
+ position_initialized(false),
+ start_time(start_time_ticks),
+ num_iterations(0) {
+ UErrorCode status = U_ZERO_ERROR;
+ // TODO(bryner): We should pass in the language for the document.
+ iterator = ubrk_open(UBRK_WORD, NULL,
+ text.data(), text.size(),
+ &status);
+ if (U_FAILURE(status)) {
+ DLOG(ERROR) << "ubrk_open failed: " << status;
+ iterator = NULL;
+ }
+ }
+
+ ~ExtractionState() {
+ if (iterator) {
+ ubrk_close(iterator);
+ }
+ }
+};
+
+PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
+ const base::hash_set<std::string>* page_term_hashes,
+ const base::hash_set<std::string>* page_word_hashes,
+ size_t max_words_per_term,
+ FeatureExtractorClock* clock)
+ : page_term_hashes_(page_term_hashes),
+ page_word_hashes_(page_word_hashes),
+ max_words_per_term_(max_words_per_term),
+ clock_(clock),
+ ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
+ Clear();
+}
+
+PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
+ // The RenderView should have called CancelPendingExtraction() before
+ // we are destroyed.
+ CheckNoPendingExtraction();
+}
+
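+// A hypothetical usage sketch (caller names are illustrative; assumes the
+// DoneCallback is created with NewCallback):
+//
+//   extractor.ExtractFeatures(&page_text, &features,
+//                             NewCallback(handler, &Handler::OnDone));
+//
+// The caller must call CancelPendingExtraction() before destroying the
+// extractor or starting another extraction.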
+void PhishingTermFeatureExtractor::ExtractFeatures(
+ const string16* page_text,
+ FeatureMap* features,
+ DoneCallback* done_callback) {
+ // The RenderView should have called CancelPendingExtraction() before
+ // starting a new extraction, so DCHECK this.
+ CheckNoPendingExtraction();
+ // However, in an opt build (where DCHECKs are compiled out) we go ahead
+ // and clean up any pending extraction so that we start in a known state.
+ CancelPendingExtraction();
+
+ page_text_ = page_text;
+ features_ = features;
+ done_callback_.reset(done_callback);
+
+ state_.reset(new ExtractionState(*page_text_, clock_->Now()));
+ MessageLoop::current()->PostTask(
+ FROM_HERE,
+ method_factory_.NewRunnableMethod(
+ &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
+}
+
+void PhishingTermFeatureExtractor::CancelPendingExtraction() {
+ // Cancel any pending callbacks, and clear our state.
+ method_factory_.RevokeAll();
+ Clear();
+}
+
+void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
+ DCHECK(state_.get());
+ ++state_->num_iterations;
+ base::TimeTicks current_chunk_start_time = clock_->Now();
+
+ if (!state_->iterator) {
+ // We failed to initialize the break iterator, so stop now.
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
+ RunCallback(false);
+ return;
+ }
+
+ if (!state_->position_initialized) {
+ state_->position = ubrk_first(state_->iterator);
+ if (state_->position == UBRK_DONE) {
+ // No words present, so we're done.
+ RunCallback(true);
+ return;
+ }
+ state_->position_initialized = true;
+ }
+
+ int num_words = 0;
+ for (int next = ubrk_next(state_->iterator);
+ next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
+ if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
+ // next is now positioned at the end of a word: a status other than
+ // UBRK_WORD_NONE means the text in [position, next) is an actual word
+ // rather than whitespace or punctuation.
+ HandleWord(string16(*page_text_, state_->position,
+ next - state_->position));
+ ++num_words;
+ }
+ state_->position = next;
+
+ if (num_words >= kClockCheckGranularity) {
+ num_words = 0;
+ base::TimeTicks now = clock_->Now();
+ if (now - state_->start_time >=
+ base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
+ DLOG(ERROR) << "Feature extraction took too long, giving up";
+ // We expect this to happen infrequently, so record when it does.
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
+ RunCallback(false);
+ return;
+ }
+ base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
+ if (chunk_elapsed >=
+ base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
+ // The time limit for the current chunk is up, so post a task to
+ // continue extraction.
+ //
+ // Record how much time we actually spent on the chunk. If this is
+ // much higher than kMaxTimePerChunkMs, we may need to adjust the
+ // clock granularity.
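+ //
+ // The break iterator and state_->position are kept in state_, so the
+ // posted task resumes exactly where this chunk stopped.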
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
+ chunk_elapsed);
+ MessageLoop::current()->PostTask(
+ FROM_HERE,
+ method_factory_.NewRunnableMethod(
+ &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
+ return;
+ }
+ // Otherwise, continue.
+ }
+ }
+ RunCallback(true);
+}
+
+void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
+ std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word));
+ std::string word_hash = base::SHA256HashString(word_lower);
+
+ // Quick out if the word is not part of any term, which is the common case.
+ if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
+ // The word is not part of any term, so no term spanning it can match;
+ // clear the accumulated n-gram state.
+ state_->previous_words.clear();
+ state_->previous_word_sizes.clear();
+ return;
+ }
+
+ // Find all of the n-grams that we need to check and compute their hashes.
+ // We already have the hash for word_lower, so we don't compute that again.
+ std::map<std::string /* hash */, std::string /* plaintext */>
+ hashes_to_check;
+ hashes_to_check[word_hash] = word_lower;
+
+ // Combine the new word with the previous words to find additional n-grams.
+ // Note that we don't add the new word's length to previous_word_sizes yet,
+ // since the loop below would then hash the word by itself a second time.
+ //
+ // TODO(bryner): Use UMA stats to determine whether this is too slow.
+ // If it is, there are a couple of cases that we could optimize:
+ // - We could cache plaintext words that are not in page_word_hashes_, so
+ // that we can avoid hashing these again.
+ // - We could include positional information about words in the n-grams,
+ // rather than just a list of all of the words. For example, we could
+ // change the term format so that each word is hashed separately, or
+ // we could add extra data to the word list to indicate the position
+ // at which the word appears in an n-gram, and skip checking the word if
+ // it's not at that position.
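+ //
+ // For example, if previous_words is "good day " (previous_word_sizes
+ // {5, 4}) and the new word is "sir", the loop below hashes "good day sir"
+ // and then "day sir"; "sir" by itself was already added above.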
+ state_->previous_words.append(word_lower);
+ std::string current_term = state_->previous_words;
+ for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
+ it != state_->previous_word_sizes.end(); ++it) {
+ hashes_to_check[base::SHA256HashString(current_term)] = current_term;
+ current_term.erase(0, *it);
+ }
+
+ // Add features for any hashes that match page_term_hashes_.
+ for (std::map<std::string, std::string>::iterator it =
+ hashes_to_check.begin();
+ it != hashes_to_check.end(); ++it) {
+ if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
+ features_->AddBooleanFeature(features::kPageTerm + it->second);
+ }
+ }
+
+ // Now that we have handled the current word, we have to add a space at the
+ // end of it, and add the new word's size (including the space) to
+ // previous_word_sizes. Note: it's possible that the document language
+ // doesn't use ASCII spaces to separate words. That's fine, though; we just
+ // need to be consistent with how the model is generated.
+ state_->previous_words.append(" ");
+ state_->previous_word_sizes.push_back(word_lower.size() + 1);
+
+ // Cap the number of previous words.
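+ // After this check, previous_word_sizes holds at most
+ // max_words_per_term_ - 1 entries, enough to form a term of
+ // max_words_per_term_ words once the next word arrives.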
+ if (state_->previous_word_sizes.size() >= max_words_per_term_) {
+ state_->previous_words.erase(0, state_->previous_word_sizes.front());
+ state_->previous_word_sizes.pop_front();
+ }
+}
+
+void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
+ DCHECK(!done_callback_.get());
+ DCHECK(!state_.get());
+ if (done_callback_.get() || state_.get()) {
+ LOG(ERROR) << "Extraction in progress, missing call to "
+ << "CancelPendingExtraction";
+ }
+}
+
+void PhishingTermFeatureExtractor::RunCallback(bool success) {
+ // Record some timing stats that we can use to evaluate feature extraction
+ // performance. These include both successful and failed extractions.
+ DCHECK(state_.get());
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
+ state_->num_iterations);
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
+ clock_->Now() - state_->start_time);
+
+ DCHECK(done_callback_.get());
+ done_callback_->Run(success);
+ Clear();
+}
+
+void PhishingTermFeatureExtractor::Clear() {
+ page_text_ = NULL;
+ features_ = NULL;
+ done_callback_.reset(NULL);
+ state_.reset(NULL);
+}
+
+} // namespace safe_browsing
