Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(82)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 3214002: Add a term feature extractor for client-side phishing detection. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Add an extra comment/TODO about performance. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7 #include <list>
8 #include <map>
9
10 #include "app/l10n_util.h"
11 #include "base/compiler_specific.h"
12 #include "base/histogram.h"
13 #include "base/logging.h"
14 #include "base/message_loop.h"
15 #include "base/sha2.h"
16 #include "base/time.h"
17 #include "base/utf_string_conversions.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "unicode/ubrk.h"
21
namespace safe_browsing {

// Time budget for a single chunk of feature extraction, after which we post a
// task to continue later. This time should be short enough that it doesn't
// noticeably disrupt the user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50;

// Number of words to process between clock checks in the extraction loop.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

// Hard cap on total extraction time across all chunks; extraction is aborted
// once this is exceeded. This should be longer than we expect feature
// extraction to take on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
36
// All of the state pertaining to the current feature extraction. Owned via
// scoped_ptr by the extractor, and destroyed (closing the ICU iterator) when
// extraction completes or is cancelled.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking, or NULL if ubrk_open() failed. Callers
  // must check for NULL before use.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // Opens a word-break iterator over |text|. Note that ubrk_open() aliases
  // the text buffer rather than copying it, so |text| must outlive this
  // object.
  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
86
// NOTE(review): |page_term_hashes|, |page_word_hashes|, and |clock| are
// stored as raw pointers and never deleted in this file; presumably the
// caller retains ownership and keeps them alive for the extractor's
// lifetime — confirm against the header. |max_words_per_term| bounds the
// number of words combined into a single term (n-gram) in HandleWord().
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<std::string>* page_word_hashes,
    size_t max_words_per_term,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
  // Start from a known-empty extraction state.
  Clear();
}
99
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed. This DCHECKs (and logs in opt builds) if not.
  CheckNoPendingExtraction();
}
105
// Begins asynchronous feature extraction over |page_text|, which must remain
// valid until extraction finishes or is cancelled (the ExtractionState holds
// a word-break iterator over it). Takes ownership of |done_callback|, which
// is run with true on success or false on failure/timeout. The work itself
// happens in ExtractFeaturesWithTimeout(), posted to the current message
// loop so extraction can proceed in time-bounded chunks.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const string16* page_text,
    FeatureMap* features,
    DoneCallback* done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_.reset(done_callback);

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  MessageLoop::current()->PostTask(
      FROM_HERE,
      method_factory_.NewRunnableMethod(
          &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
}
127
// Aborts any in-flight extraction. Revokes pending runnable methods so no
// further chunks execute, then resets all extraction state. The done
// callback is destroyed without being run.
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  method_factory_.RevokeAll();
  Clear();
}
133
// Runs one chunk of feature extraction. Resumes from state_->position, feeds
// each word to HandleWord(), and checks the clock every
// kClockCheckGranularity words: if kMaxTotalTimeMs has elapsed overall we
// give up, and if kMaxTimePerChunkMs has elapsed for this chunk we post a
// task to continue later. Invokes RunCallback() exactly once when extraction
// terminates (success, failure, or timeout).
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    // Only breaks with a word rule status delimit actual words; other breaks
    // (whitespace, punctuation) are skipped but still advance position.
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word.
      HandleWord(string16(*page_text_, state_->position,
                          next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        MessageLoop::current()->PostTask(
            FROM_HERE,
            method_factory_.NewRunnableMethod(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
        return;
      }
      // Otherwise, continue.
    }
  }
  RunCallback(true);
}
200
// Processes one word from the page text. The word is lowercased and SHA-256
// hashed; if the hash isn't in page_word_hashes_ the n-gram window is reset,
// since no term can contain a non-matching word. Otherwise, every candidate
// term ending at this word (the word alone, plus each suffix of the previous
// words combined with it) is hashed and, when found in page_term_hashes_,
// added to features_ as a kPageTerm boolean feature. Finally the word is
// appended to the sliding window, which is capped at max_words_per_term_ - 1
// previous words.
void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
  std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word));
  std::string word_hash = base::SHA256HashString(word_lower);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their hashes.
  // We already have the hash for word_lower, so we don't compute that again.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[word_hash] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  // TODO(bryner): Use UMA stats to determine whether this is too slow.
  // If it is, there are a couple of cases that we could optimize:
  // - We could cache plaintext words that are not in page_word_hashes_, so
  //   that we can avoid hashing these again.
  // - We could include positional information about words in the n-grams,
  //   rather than just a list of all of the words. For example, we could
  //   change the term format so that each word is hashed separately, or
  //   we could add extra data to the word list to indicate the position
  //   at which the word appears in an n-gram, and skip checking the word if
  //   it's not at that position.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  // Each iteration drops the oldest remaining word from the front, so we
  // hash every suffix of previous_words that ends with the current word.
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[base::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
264
265 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
266 DCHECK(!done_callback_.get());
267 DCHECK(!state_.get());
268 if (done_callback_.get() || state_.get()) {
269 LOG(ERROR) << "Extraction in progress, missing call to "
270 << "CancelPendingExtraction";
271 }
272 }
273
// Finishes an extraction: records timing histograms, runs the done callback
// with |success|, then clears all extraction state (which also destroys the
// callback and state objects).
void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance. These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(done_callback_.get());
  done_callback_->Run(success);
  // Clear() must come after Run(): it resets done_callback_ and state_.
  Clear();
}
287
288 void PhishingTermFeatureExtractor::Clear() {
289 page_text_ = NULL;
290 features_ = NULL;
291 done_callback_.reset(NULL);
292 state_.reset(NULL);
293 }
294
295 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698