OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" |
| 6 |
| 7 #include <list> |
| 8 #include <map> |
| 9 |
| 10 #include "app/l10n_util.h" |
| 11 #include "base/compiler_specific.h" |
| 12 #include "base/histogram.h" |
| 13 #include "base/logging.h" |
| 14 #include "base/message_loop.h" |
| 15 #include "base/sha2.h" |
| 16 #include "base/time.h" |
| 17 #include "base/utf_string_conversions.h" |
| 18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 19 #include "chrome/renderer/safe_browsing/features.h" |
| 20 #include "unicode/ubrk.h" |
| 21 |
| 22 namespace safe_browsing { |
| 23 |
// Per-chunk time budget: feature extraction yields the message loop after
// this long, so that it doesn't noticeably disrupt the user's interaction
// with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50;

// Number of words to process between clock checks.  Experimenting shows that
// we get a reasonable gain in performance by increasing this up to around 10,
// but there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;

// Overall deadline across all chunks; extraction is abandoned (with a UMA
// sample) once this is exceeded.  This should be longer than we expect
// feature extraction to take on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
| 36 |
| 37 // All of the state pertaining to the current feature extraction. |
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores the most recently handled words, each terminated by a space.
  // HandleWord() caps this at max_words_per_term_ - 1 words between calls,
  // so the current word can extend them into terms of up to
  // max_words_per_term_ words.
  std::string previous_words;

  // Stores the sizes of the words in previous_words.  Note: the size includes
  // the space after each word.  In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.  NULL if ubrk_open() failed; callers must
  // check before using it.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // Sets up a word-break iterator over |text|.  On ICU failure, iterator is
  // left NULL (logged in debug builds).
  ExtractionState(const string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
| 86 |
// NOTE(review): the hash sets and clock are stored as raw pointers and never
// freed here — presumably caller-owned and must outlive this object; confirm
// against the caller.  Clear() puts all per-extraction state into a known
// empty state.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<std::string>* page_word_hashes,
    size_t max_words_per_term,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
  Clear();
}
| 99 |
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.  CheckNoPendingExtraction() DCHECKs (and logs an error
  // in release builds) if that did not happen.
  CheckNoPendingExtraction();
}
| 105 |
| 106 void PhishingTermFeatureExtractor::ExtractFeatures( |
| 107 const string16* page_text, |
| 108 FeatureMap* features, |
| 109 DoneCallback* done_callback) { |
| 110 // The RenderView should have called CancelPendingExtraction() before |
| 111 // starting a new extraction, so DCHECK this. |
| 112 CheckNoPendingExtraction(); |
| 113 // However, in an opt build, we will go ahead and clean up the pending |
| 114 // extraction so that we can start in a known state. |
| 115 CancelPendingExtraction(); |
| 116 |
| 117 page_text_ = page_text; |
| 118 features_ = features; |
| 119 done_callback_.reset(done_callback); |
| 120 |
| 121 state_.reset(new ExtractionState(*page_text_, clock_->Now())); |
| 122 MessageLoop::current()->PostTask( |
| 123 FROM_HERE, |
| 124 method_factory_.NewRunnableMethod( |
| 125 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 126 } |
| 127 |
// Revokes any queued ExtractFeaturesWithTimeout() tasks and resets all
// per-extraction state so a new extraction can start cleanly.  The pending
// DoneCallback is destroyed without being run.
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  method_factory_.RevokeAll();
  Clear();
}
| 133 |
// Performs one chunk of feature extraction.  Either finishes the page (via
// RunCallback()), gives up because the total-time budget is exhausted, or
// posts another task to continue after yielding the message loop.  The clock
// is only consulted every kClockCheckGranularity words to keep its overhead
// down.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    // First chunk for this page: position the iterator at the start.
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  // Words processed since the last clock check.
  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word, and state_->position is
      // at its start.
      HandleWord(string16(*page_text_, state_->position,
                          next - state_->position));
      ++num_words;
    }
    state_->position = next;

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        MessageLoop::current()->PostTask(
            FROM_HERE,
            method_factory_.NewRunnableMethod(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Iterator exhausted: every word on the page has been handled.
  RunCallback(true);
}
| 200 |
| 201 void PhishingTermFeatureExtractor::HandleWord(const string16& word) { |
| 202 std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); |
| 203 std::string word_hash = base::SHA256HashString(word_lower); |
| 204 |
| 205 // Quick out if the word is not part of any term, which is the common case. |
| 206 if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { |
| 207 // Word doesn't exist in our terms so we can clear the n-gram state. |
| 208 state_->previous_words.clear(); |
| 209 state_->previous_word_sizes.clear(); |
| 210 return; |
| 211 } |
| 212 |
| 213 // Find all of the n-grams that we need to check and compute their hashes. |
| 214 // We already have the hash for word_lower, so we don't compute that again. |
| 215 std::map<std::string /* hash */, std::string /* plaintext */> |
| 216 hashes_to_check; |
| 217 hashes_to_check[word_hash] = word_lower; |
| 218 |
| 219 // Combine the new word with the previous words to find additional n-grams. |
| 220 // Note that we don't yet add the new word length to previous_word_sizes, |
| 221 // since we don't want to compute the hash for the word by itself again. |
| 222 // |
| 223 // TODO(bryner): Use UMA stats to determine whether this is too slow. |
| 224 // If it is, there are a couple of cases that we could optimize: |
| 225 // - We could cache plaintext words that are not in page_word_hashes_, so |
| 226 // that we can avoid hashing these again. |
| 227 // - We could include positional information about words in the n-grams, |
| 228 // rather than just a list of all of the words. For example, we could |
| 229 // change the term format so that each word is hashed separately, or |
| 230 // we could add extra data to the word list to indicate the position |
| 231 // at which the word appears in an n-gram, and skip checking the word if |
| 232 // it's not at that position. |
| 233 state_->previous_words.append(word_lower); |
| 234 std::string current_term = state_->previous_words; |
| 235 for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); |
| 236 it != state_->previous_word_sizes.end(); ++it) { |
| 237 hashes_to_check[base::SHA256HashString(current_term)] = current_term; |
| 238 current_term.erase(0, *it); |
| 239 } |
| 240 |
| 241 // Add features for any hashes that match page_term_hashes_. |
| 242 for (std::map<std::string, std::string>::iterator it = |
| 243 hashes_to_check.begin(); |
| 244 it != hashes_to_check.end(); ++it) { |
| 245 if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { |
| 246 features_->AddBooleanFeature(features::kPageTerm + it->second); |
| 247 } |
| 248 } |
| 249 |
| 250 // Now that we have handled the current word, we have to add a space at the |
| 251 // end of it, and add the new word's size (including the space) to |
| 252 // previous_word_sizes. Note: it's possible that the document language |
| 253 // doesn't use ASCII spaces to separate words. That's fine though, we just |
| 254 // need to be consistent with how the model is generated. |
| 255 state_->previous_words.append(" "); |
| 256 state_->previous_word_sizes.push_back(word_lower.size() + 1); |
| 257 |
| 258 // Cap the number of previous words. |
| 259 if (state_->previous_word_sizes.size() >= max_words_per_term_) { |
| 260 state_->previous_words.erase(0, state_->previous_word_sizes.front()); |
| 261 state_->previous_word_sizes.pop_front(); |
| 262 } |
| 263 } |
| 264 |
| 265 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { |
| 266 DCHECK(!done_callback_.get()); |
| 267 DCHECK(!state_.get()); |
| 268 if (done_callback_.get() || state_.get()) { |
| 269 LOG(ERROR) << "Extraction in progress, missing call to " |
| 270 << "CancelPendingExtraction"; |
| 271 } |
| 272 } |
| 273 |
| 274 void PhishingTermFeatureExtractor::RunCallback(bool success) { |
| 275 // Record some timing stats that we can use to evaluate feature extraction |
| 276 // performance. These include both successful and failed extractions. |
| 277 DCHECK(state_.get()); |
| 278 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations", |
| 279 state_->num_iterations); |
| 280 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime", |
| 281 clock_->Now() - state_->start_time); |
| 282 |
| 283 DCHECK(done_callback_.get()); |
| 284 done_callback_->Run(success); |
| 285 Clear(); |
| 286 } |
| 287 |
| 288 void PhishingTermFeatureExtractor::Clear() { |
| 289 page_text_ = NULL; |
| 290 features_ = NULL; |
| 291 done_callback_.reset(NULL); |
| 292 state_.reset(NULL); |
| 293 } |
| 294 |
| 295 } // namespace safe_browsing |
OLD | NEW |