| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | |
| 6 | |
| 7 #include <list> | |
| 8 #include <map> | |
| 9 #include <memory> | |
| 10 #include <utility> | |
| 11 | |
| 12 #include "base/bind.h" | |
| 13 #include "base/compiler_specific.h" | |
| 14 #include "base/i18n/break_iterator.h" | |
| 15 #include "base/i18n/case_conversion.h" | |
| 16 #include "base/location.h" | |
| 17 #include "base/logging.h" | |
| 18 #include "base/metrics/histogram_macros.h" | |
| 19 #include "base/single_thread_task_runner.h" | |
| 20 #include "base/strings/utf_string_conversions.h" | |
| 21 #include "base/threading/thread_task_runner_handle.h" | |
| 22 #include "base/time/time.h" | |
| 23 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | |
| 24 #include "chrome/renderer/safe_browsing/features.h" | |
| 25 #include "chrome/renderer/safe_browsing/murmurhash3_util.h" | |
| 26 #include "crypto/sha2.h" | |
| 27 | |
| 28 namespace safe_browsing { | |
| 29 | |
// Maximum wall-clock time (ms) one chunk of extraction may run before
// yielding back to the message loop.  This time should be short enough that
// it doesn't noticeably disrupt the user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Number of words processed between clock checks in the extraction loop.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// Hard cap on total extraction time (ms); extraction is abandoned (and the
// done callback run with failure) once this is exceeded.  This should be
// longer than we expect feature extraction to take on any actual phishing
// page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
| 42 | |
| 43 // All of the state pertaining to the current feature extraction. | |
| 44 struct PhishingTermFeatureExtractor::ExtractionState { | |
| 45 // Stores up to max_words_per_term_ previous words separated by spaces. | |
| 46 std::string previous_words; | |
| 47 | |
| 48 // Stores the current shingle after a new word is processed and added in. | |
| 49 std::string current_shingle; | |
| 50 | |
| 51 // Stores the sizes of the words in current_shingle. Note: the size includes | |
| 52 // the space after each word. In other words, the sum of all sizes in this | |
| 53 // list is equal to the length of current_shingle. | |
| 54 std::list<size_t> shingle_word_sizes; | |
| 55 | |
| 56 // Stores the sizes of the words in previous_words. Note: the size includes | |
| 57 // the space after each word. In other words, the sum of all sizes in this | |
| 58 // list is equal to the length of previous_words. | |
| 59 std::list<size_t> previous_word_sizes; | |
| 60 | |
| 61 // An iterator for word breaking. | |
| 62 std::unique_ptr<base::i18n::BreakIterator> iterator; | |
| 63 | |
| 64 // The time at which we started feature extraction for the current page. | |
| 65 base::TimeTicks start_time; | |
| 66 | |
| 67 // The number of iterations we've done for the current extraction. | |
| 68 int num_iterations; | |
| 69 | |
| 70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks) | |
| 71 : start_time(start_time_ticks), | |
| 72 num_iterations(0) { | |
| 73 std::unique_ptr<base::i18n::BreakIterator> i(new base::i18n::BreakIterator( | |
| 74 text, base::i18n::BreakIterator::BREAK_WORD)); | |
| 75 | |
| 76 if (i->Init()) { | |
| 77 iterator = std::move(i); | |
| 78 } else { | |
| 79 DLOG(ERROR) << "failed to open iterator"; | |
| 80 } | |
| 81 } | |
| 82 }; | |
| 83 | |
// All pointer parameters are borrowed, not owned; callers must keep them
// alive for the lifetime of this extractor.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32_t>* page_word_hashes,
    size_t max_words_per_term,
    uint32_t murmurhash3_seed,
    size_t max_shingles_per_page,
    size_t shingle_size,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      max_shingles_per_page_(max_shingles_per_page),
      shingle_size_(shingle_size),
      clock_(clock),
      weak_factory_(this) {
  // Start from a known-empty state (no pending extraction).
  Clear();
}
| 102 | |
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.  This only DCHECKs/logs; it does not cancel.
  CheckNoPendingExtraction();
}
| 108 | |
| 109 void PhishingTermFeatureExtractor::ExtractFeatures( | |
| 110 const base::string16* page_text, | |
| 111 FeatureMap* features, | |
| 112 std::set<uint32_t>* shingle_hashes, | |
| 113 const DoneCallback& done_callback) { | |
| 114 // The RenderView should have called CancelPendingExtraction() before | |
| 115 // starting a new extraction, so DCHECK this. | |
| 116 CheckNoPendingExtraction(); | |
| 117 // However, in an opt build, we will go ahead and clean up the pending | |
| 118 // extraction so that we can start in a known state. | |
| 119 CancelPendingExtraction(); | |
| 120 | |
| 121 page_text_ = page_text; | |
| 122 features_ = features; | |
| 123 shingle_hashes_ = shingle_hashes, | |
| 124 done_callback_ = done_callback; | |
| 125 | |
| 126 state_.reset(new ExtractionState(*page_text_, clock_->Now())); | |
| 127 base::ThreadTaskRunnerHandle::Get()->PostTask( | |
| 128 FROM_HERE, | |
| 129 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout, | |
| 130 weak_factory_.GetWeakPtr())); | |
| 131 } | |
| 132 | |
| 133 void PhishingTermFeatureExtractor::CancelPendingExtraction() { | |
| 134 // Cancel any pending callbacks, and clear our state. | |
| 135 weak_factory_.InvalidateWeakPtrs(); | |
| 136 Clear(); | |
| 137 } | |
| 138 | |
// Runs one time-sliced chunk of feature extraction.  Walks the break
// iterator word by word, checking the clock every kClockCheckGranularity
// words; re-posts itself when a chunk exceeds kMaxTimePerChunkMs, and fails
// the whole extraction once kMaxTotalTimeMs has elapsed.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator.get()) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  // Count of words seen since the last clock check, not a running total.
  int num_words = 0;
  while (state_->iterator->Advance()) {
    if (state_->iterator->IsWord()) {
      const size_t start = state_->iterator->prev();
      const size_t length = state_->iterator->pos() - start;
      // Hand the word to the shingle/term machinery as a view into the
      // page text; no copy is made here.
      HandleWord(base::StringPiece16(page_text_->data() + start, length));
      ++num_words;
    }

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      // Total-time budget: give up entirely if the page is taking too long.
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::ThreadTaskRunnerHandle::Get()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Iterator exhausted: the whole page was processed successfully.
  RunCallback(true);
}
| 194 | |
// Processes a single word from the page: updates the sliding shingle window
// and extracts any matching page terms (n-grams ending at this word).
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // First, extract shingle hashes.  The shingle is a sliding window of
  // shingle_size_ lowercased words, each followed by a space.
  const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  state_->current_shingle.append(word_lower + " ");
  state_->shingle_word_sizes.push_back(word_lower.size() + 1);
  if (state_->shingle_word_sizes.size() == shingle_size_) {
    shingle_hashes_->insert(
        MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    // Slide the window: drop the oldest word (and its trailing space).
    state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    state_->shingle_word_sizes.pop_front();
  }
  // Check if the size of shingle hashes is over the limit.
  if (shingle_hashes_->size() > max_shingles_per_page_) {
    // Pop the largest one.  std::set iterates in ascending order, so the
    // last element is the maximum hash value.
    std::set<uint32_t>::iterator it = shingle_hashes_->end();
    shingle_hashes_->erase(--it);
  }

  // Next, extract page terms.
  uint32_t word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.  Each candidate is a suffix of previous_words ending with the
  // current word.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    // Strip the oldest word to get the next-shorter suffix.
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes.  Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words.  That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.  The >= comparison after the push
  // above keeps at most max_words_per_term_ - 1 words of history, so the
  // next word can form terms of at most max_words_per_term_ words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
| 266 | |
| 267 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { | |
| 268 DCHECK(done_callback_.is_null()); | |
| 269 DCHECK(!state_.get()); | |
| 270 if (!done_callback_.is_null() || state_.get()) { | |
| 271 LOG(ERROR) << "Extraction in progress, missing call to " | |
| 272 << "CancelPendingExtraction"; | |
| 273 } | |
| 274 } | |
| 275 | |
// Reports timing metrics, invokes the done callback with |success|, and
// resets all extraction state.  Must only be called while an extraction is
// in progress (state_ and done_callback_ set).
void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance.  These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  // Clear after running the callback; the callback may start a new
  // extraction, which expects a clean slate.
  Clear();
}
| 289 | |
| 290 void PhishingTermFeatureExtractor::Clear() { | |
| 291 page_text_ = NULL; | |
| 292 features_ = NULL; | |
| 293 shingle_hashes_ = NULL; | |
| 294 done_callback_.Reset(); | |
| 295 state_.reset(NULL); | |
| 296 } | |
| 297 | |
| 298 } // namespace safe_browsing | |
| OLD | NEW |