Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..0ec0dbcbeee9602adb9f16a5a711eef514a7ef60 |
--- /dev/null |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc |
@@ -0,0 +1,295 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
+ |
+#include <list> |
+#include <map> |
+ |
+#include "app/l10n_util.h" |
+#include "base/compiler_specific.h" |
+#include "base/histogram.h" |
+#include "base/logging.h" |
+#include "base/message_loop.h" |
+#include "base/sha2.h" |
+#include "base/time.h" |
+#include "base/utf_string_conversions.h" |
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
+#include "chrome/renderer/safe_browsing/features.h" |
+#include "unicode/ubrk.h" |
+ |
+namespace safe_browsing { |
+ |
// Budget for a single chunk of feature extraction before we yield back to
// the message loop.  This time should be short enough that it doesn't
// noticeably disrupt the user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50;

// Number of words to process between clock checks.  Experimenting shows that
// we get a reasonable gain in performance by increasing this up to around 10,
// but there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

// Hard cap on total extraction time across all chunks; extraction fails once
// this is exceeded.  This should be longer than we expect feature extraction
// to take on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
+ |
// All of the state pertaining to the current feature extraction.  Owns the
// ICU break iterator (closed in the destructor).
//
// NOTE(review): the implicitly-generated copy constructor/assignment would
// double-close |iterator|; this is safe today only because the object is
// held exclusively in a scoped_ptr (state_) and never copied.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_ngram_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.  NULL if ubrk_open() failed.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by
  // our iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // |text| must outlive this object: ubrk_open() retains a pointer to the
  // caller's UTF-16 buffer rather than copying it.
  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      // Leave the iterator NULL; ExtractFeaturesWithTimeout() treats this as
      // a failed extraction.
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
+ |
// Constructs an extractor that matches words/terms against the given hash
// sets.  The hash sets and clock are borrowed, not owned; they must outlive
// this object.  max_words_per_term caps the length of the n-grams checked.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<std::string>* page_word_hashes,
    size_t max_words_per_term,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
  // Start from a known-empty extraction state.
  Clear();
}
+ |
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}
+ |
// Begins asynchronous feature extraction over |page_text|.  |page_text| and
// |features| are borrowed and must stay alive until |done_callback| runs or
// CancelPendingExtraction() is called; ownership of |done_callback| is
// taken.  The actual work happens in chunks posted to the message loop.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const string16* page_text,
    FeatureMap* features,
    DoneCallback* done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_.reset(done_callback);

  // ExtractionState keeps a pointer into *page_text_, so the assignment
  // above must happen before this.
  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  MessageLoop::current()->PostTask(
      FROM_HERE,
      method_factory_.NewRunnableMethod(
          &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
}
+ |
// Stops any in-progress extraction without running the done callback.
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  method_factory_.RevokeAll();
  Clear();
}
+ |
// Runs one chunk of feature extraction on the message loop.  Each call
// either finishes extraction (invoking the done callback via RunCallback),
// gives up on total timeout, or re-posts itself to continue in a new chunk
// once kMaxTimePerChunkMs has elapsed.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  // Words handled since the last clock check.  We only consult the clock
  // every kClockCheckGranularity words to keep timing overhead low.
  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word, so the range
      // [position, next) is an actual word (not punctuation or whitespace).
      HandleWord(string16(*page_text_, state_->position,
                          next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      // Total-time check first: it spans all chunks (including queue time),
      // measured from when ExtractFeatures() created the state.
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        MessageLoop::current()->PostTask(
            FROM_HERE,
            method_factory_.NewRunnableMethod(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Ran out of text without hitting a limit: extraction succeeded.
  RunCallback(true);
}
+ |
// Processes one word of page text: checks the word and every n-gram ending
// in it (up to max_words_per_term_ words) against the term hashes, adding a
// feature for each match, and updates the sliding window of previous words.
//
// Invariant: previous_word_sizes holds at most max_words_per_term_ - 1
// entries, and the sum of its entries equals previous_words.size(), so the
// longest term checked (all previous words + the current one) has exactly
// max_words_per_term_ words.
void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
  std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word));
  std::string word_hash = base::SHA256HashString(word_lower);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their hashes.
  // We already have the hash for word_lower, so we don't compute that again.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[word_hash] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  // TODO(bryner): Use UMA stats to determine whether this is too slow.
  // If it is, there are a couple of cases that we could optimize:
  // - We could cache plaintext words that are not in page_word_hashes_, so
  //   that we can avoid hashing these again.
  // - We could include positional information about words in the n-grams,
  //   rather than just a list of all of the words. For example, we could
  //   change the term format so that each word is hashed separately, or
  //   we could add extra data to the word list to indicate the position
  //   at which the word appears in an n-gram, and skip checking the word if
  //   it's not at that position.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  // Each pass hashes the current suffix of previous_words + the new word,
  // then drops the oldest word from the front, yielding successively
  // shorter n-grams that all end in the new word.
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[base::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words: dropping the oldest word here keeps
  // previous_word_sizes at no more than max_words_per_term_ - 1 entries, so
  // the next call's longest term is exactly max_words_per_term_ words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
+ |
+void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { |
+ DCHECK(!done_callback_.get()); |
+ DCHECK(!state_.get()); |
+ if (done_callback_.get() || state_.get()) { |
+ LOG(ERROR) << "Extraction in progress, missing call to " |
+ << "CancelPendingExtraction"; |
+ } |
+} |
+ |
+void PhishingTermFeatureExtractor::RunCallback(bool success) { |
+ // Record some timing stats that we can use to evaluate feature extraction |
+ // performance. These include both successful and failed extractions. |
+ DCHECK(state_.get()); |
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations", |
+ state_->num_iterations); |
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime", |
+ clock_->Now() - state_->start_time); |
+ |
+ DCHECK(done_callback_.get()); |
+ done_callback_->Run(success); |
+ Clear(); |
+} |
+ |
+void PhishingTermFeatureExtractor::Clear() { |
+ page_text_ = NULL; |
+ features_ = NULL; |
+ done_callback_.reset(NULL); |
+ state_.reset(NULL); |
+} |
+ |
+} // namespace safe_browsing |