OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 |
| 7 #include <list> |
| 8 #include <map> |
| 9 |
| 10 #include "app/l10n_util.h" |
| 11 #include "base/compiler_specific.h" |
| 12 #include "base/histogram.h" |
| 13 #include "base/logging.h" |
| 14 #include "base/message_loop.h" |
| 15 #include "base/sha2.h" |
| 16 #include "base/time.h" |
| 17 #include "base/utf_string_conversions.h" |
| 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 19 #include "chrome/renderer/safe_browsing/features.h" |
| 20 #include "unicode/ubrk.h" |
| 21 |
| 22 namespace safe_browsing { |
| 23 |
// Per-chunk time budget: feature extraction yields the message loop after
// this long, so that it doesn't noticeably disrupt the user's interaction
// with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50;

// Number of words to process between clock checks.  Experimenting shows that
// we get a reasonable gain in performance by increasing this up to around 10,
// but there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

// Overall deadline across all chunks; extraction is abandoned (with a UMA
// sample) once this is exceeded.  This should be longer than we expect
// feature extraction to take on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
| 36 |
| 37 // All of the state pertaining to the current feature extraction. |
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores the most recently handled words, each terminated by a space.
  // HandleWord() caps this at max_words_per_term_ - 1 words between calls,
  // so the current word can extend them into terms of up to
  // max_words_per_term_ words.
  std::string previous_words;

  // Stores the sizes of the words in previous_words.  Note: the size includes
  // the space after each word.  In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.  NULL if ubrk_open() failed; callers must
  // check before using it.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // Sets up a word-break iterator over |text|.  On ICU failure, iterator is
  // left NULL (logged in debug builds).
  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
| 86 |
// NOTE(review): the hash sets and clock are stored as raw pointers and never
// freed here — presumably caller-owned and must outlive this object; confirm
// against the caller.  Clear() puts all per-extraction state into a known
// empty state.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<std::string>* page_word_hashes,
    size_t max_words_per_term,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
  Clear();
}
| 99 |
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.  CheckNoPendingExtraction() DCHECKs (and logs an error
  // in release builds) if that did not happen.
  CheckNoPendingExtraction();
}
| 105 |
| 106 void PhishingTermFeatureExtractor::ExtractFeatures( |
| 107 const string16* page_text, |
| 108 FeatureMap* features, |
| 109 DoneCallback* done_callback) { |
| 110 // The RenderView should have called CancelPendingExtraction() before |
| 111 // starting a new extraction, so DCHECK this. |
| 112 CheckNoPendingExtraction(); |
| 113 // However, in an opt build, we will go ahead and clean up the pending |
| 114 // extraction so that we can start in a known state. |
| 115 CancelPendingExtraction(); |
| 116 |
| 117 page_text_ = page_text; |
| 118 features_ = features; |
| 119 done_callback_.reset(done_callback); |
| 120 |
| 121 state_.reset(new ExtractionState(*page_text_, clock_->Now())); |
| 122 MessageLoop::current()->PostTask( |
| 123 FROM_HERE, |
| 124 method_factory_.NewRunnableMethod( |
| 125 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 126 } |
| 127 |
// Revokes any queued ExtractFeaturesWithTimeout() tasks and resets all
// per-extraction state so a new extraction can start cleanly.  The pending
// DoneCallback is destroyed without being run.
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  method_factory_.RevokeAll();
  Clear();
}
| 133 |
// Performs one chunk of feature extraction.  Either finishes the page (via
// RunCallback()), gives up because the total-time budget is exhausted, or
// posts another task to continue after yielding the message loop.  The clock
// is only consulted every kClockCheckGranularity words to keep its overhead
// down.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    // First chunk for this page: position the iterator at the start.
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  // Words processed since the last clock check.
  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word, and state_->position is
      // at its start.
      HandleWord(string16(*page_text_, state_->position,
                          next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        MessageLoop::current()->PostTask(
            FROM_HERE,
            method_factory_.NewRunnableMethod(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Iterator exhausted: every word on the page has been handled.
  RunCallback(true);
}
| 200 |
| 201 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
| 202 std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); |
| 203 std::string word_hash = base::SHA256HashString(word_lower); |
| 204 |
| 205 // Quick out if the word is not part of any term, which is the common case. |
| 206 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 207 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 208 state_->previous_words.clear(); |
| 209 state_->previous_word_sizes.clear(); |
| 210 return; |
| 211 } |
| 212 |
| 213 // Find all of the n-grams that we need to check and compute their hashes. |
| 214 // We already have the hash for word_lower, so we don't compute that again. |
| 215 std::map<std::string /* hash */, std::string /* plaintext */> |
| 216 hashes_to_check; |
| 217 hashes_to_check[word_hash] = word_lower; |
| 218 |
| 219 // Combine the new word with the previous words to find additional n-grams. |
| 220 // Note that we don't yet add the new word length to previous_word_sizes, |
| 221 // since we don't want to compute the hash for the word by itself again. |
| 222 // |
| 223 // TODO(bryner): Use UMA stats to determine whether this is too slow. |
| 224 // If it is, there are a couple of cases that we could optimize: |
| 225 // - We could cache plaintext words that are not in page_word_hashes_, so |
| 226 // that we can avoid hashing these again. |
| 227 // - We could include positional information about words in the n-grams, |
| 228 // rather than just a list of all of the words. For example, we could |
| 229 // change the term format so that each word is hashed separately, or |
| 230 // we could add extra data to the word list to indicate the position |
| 231 // at which the word appears in an n-gram, and skip checking the word if |
| 232 // it's not at that position. |
| 233 state_->previous_words.append(word_lower); |
| 234 std::string current_term = state_->previous_words; |
| 235 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
| 236 it != state_->previous_word_sizes.end(); ++it) { |
| 237 hashes_to_check[base::SHA256HashString(current_term)] = current_term; |
| 238 current_term.erase(0, *it); |
| 239 } |
| 240 |
| 241 // Add features for any hashes that match page_term_hashes_. |
| 242 for (std::map<std::string, std::string>::iterator it = |
| 243 hashes_to_check.begin(); |
| 244 it != hashes_to_check.end(); ++it) { |
| 245 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { |
| 246 features_->AddBooleanFeature(features::kPageTerm + it->second); |
| 247 } |
| 248 } |
| 249 |
| 250 // Now that we have handled the current word, we have to add a space at the |
| 251 // end of it, and add the new word's size (including the space) to |
| 252 // previous_word_sizes. Note: it's possible that the document language |
| 253 // doesn't use ASCII spaces to separate words. That's fine though, we just |
| 254 // need to be consistent with how the model is generated. |
| 255 state_->previous_words.append(" "); |
| 256 state_->previous_word_sizes.push_back(word_lower.size() + 1); |
| 257 |
| 258 // Cap the number of previous words. |
| 259 if (state_->previous_word_sizes.size() >= max_words_per_term_) { |
| 260 state_->previous_words.erase(0, state_->previous_word_sizes.front()); |
| 261 state_->previous_word_sizes.pop_front(); |
| 262 } |
| 263 } |
| 264 |
| 265 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { |
| 266 DCHECK(!done_callback_.get()); |
| 267 DCHECK(!state_.get()); |
| 268 if (done_callback_.get() || state_.get()) { |
| 269 LOG(ERROR) << "Extraction in progress, missing call to " |
| 270 << "CancelPendingExtraction"; |
| 271 } |
| 272 } |
| 273 |
| 274 void PhishingTermFeatureExtractor::RunCallback(bool success) { |
| 275 // Record some timing stats that we can use to evaluate feature extraction |
| 276 // performance. These include both successful and failed extractions. |
| 277 DCHECK(state_.get()); |
| 278 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations", |
| 279 state_->num_iterations); |
| 280 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime", |
| 281 clock_->Now() - state_->start_time); |
| 282 |
| 283 DCHECK(done_callback_.get()); |
| 284 done_callback_->Run(success); |
| 285 Clear(); |
| 286 } |
| 287 |
| 288 void PhishingTermFeatureExtractor::Clear() { |
| 289 page_text_ = NULL; |
| 290 features_ = NULL; |
| 291 done_callback_.reset(NULL); |
| 292 state_.reset(NULL); |
| 293 } |
| 294 |
| 295 } // namespace safe_browsing |
OLD | NEW |