Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7 #include <list>
8 #include <map>
9 #include <memory>
10 #include <utility>
11
12 #include "base/bind.h"
13 #include "base/compiler_specific.h"
14 #include "base/i18n/break_iterator.h"
15 #include "base/i18n/case_conversion.h"
16 #include "base/location.h"
17 #include "base/logging.h"
18 #include "base/metrics/histogram_macros.h"
19 #include "base/single_thread_task_runner.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/threading/thread_task_runner_handle.h"
22 #include "base/time/time.h"
23 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
24 #include "chrome/renderer/safe_browsing/features.h"
25 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
26 #include "crypto/sha2.h"
27
28 namespace safe_browsing {
29
// Maximum wall-clock time (in milliseconds) one extraction chunk may run
// before yielding the thread (see ExtractFeaturesWithTimeout).
// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Number of words to process between clock checks.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// Hard cap (in milliseconds) on total extraction time for a page; extraction
// is abandoned once it is exceeded.
// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
42
43 // All of the state pertaining to the current feature extraction.
44 struct PhishingTermFeatureExtractor::ExtractionState {
45 // Stores up to max_words_per_term_ previous words separated by spaces.
46 std::string previous_words;
47
48 // Stores the current shingle after a new word is processed and added in.
49 std::string current_shingle;
50
51 // Stores the sizes of the words in current_shingle. Note: the size includes
52 // the space after each word. In other words, the sum of all sizes in this
53 // list is equal to the length of current_shingle.
54 std::list<size_t> shingle_word_sizes;
55
56 // Stores the sizes of the words in previous_words. Note: the size includes
57 // the space after each word. In other words, the sum of all sizes in this
58 // list is equal to the length of previous_words.
59 std::list<size_t> previous_word_sizes;
60
61 // An iterator for word breaking.
62 std::unique_ptr<base::i18n::BreakIterator> iterator;
63
64 // The time at which we started feature extraction for the current page.
65 base::TimeTicks start_time;
66
67 // The number of iterations we've done for the current extraction.
68 int num_iterations;
69
70 ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
71 : start_time(start_time_ticks),
72 num_iterations(0) {
73 std::unique_ptr<base::i18n::BreakIterator> i(new base::i18n::BreakIterator(
74 text, base::i18n::BreakIterator::BREAK_WORD));
75
76 if (i->Init()) {
77 iterator = std::move(i);
78 } else {
79 DLOG(ERROR) << "failed to open iterator";
80 }
81 }
82 };
83
// Constructs an extractor over the caller-supplied term/word hash sets.
// NOTE(review): all pointer parameters are stored unowned; they presumably
// must outlive this object — confirm with callers.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32_t>* page_word_hashes,
    size_t max_words_per_term,
    uint32_t murmurhash3_seed,
    size_t max_shingles_per_page,
    size_t shingle_size,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      max_shingles_per_page_(max_shingles_per_page),
      shingle_size_(shingle_size),
      clock_(clock),
      weak_factory_(this) {
  // Start from a known-empty state (nulls out the per-extraction members).
  Clear();
}
102
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.  This only diagnoses the violation; destroying
  // weak_factory_ invalidates any outstanding weak pointers regardless.
  CheckNoPendingExtraction();
}
108
109 void PhishingTermFeatureExtractor::ExtractFeatures(
110 const base::string16* page_text,
111 FeatureMap* features,
112 std::set<uint32_t>* shingle_hashes,
113 const DoneCallback& done_callback) {
114 // The RenderView should have called CancelPendingExtraction() before
115 // starting a new extraction, so DCHECK this.
116 CheckNoPendingExtraction();
117 // However, in an opt build, we will go ahead and clean up the pending
118 // extraction so that we can start in a known state.
119 CancelPendingExtraction();
120
121 page_text_ = page_text;
122 features_ = features;
123 shingle_hashes_ = shingle_hashes,
124 done_callback_ = done_callback;
125
126 state_.reset(new ExtractionState(*page_text_, clock_->Now()));
127 base::ThreadTaskRunnerHandle::Get()->PostTask(
128 FROM_HERE,
129 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
130 weak_factory_.GetWeakPtr()));
131 }
132
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.  Invalidating the
  // weak pointers ensures any already-posted ExtractFeaturesWithTimeout task
  // becomes a no-op instead of touching the cleared state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
138
// Processes words from the page in bounded chunks so a very large page cannot
// monopolize the thread.  Every kClockCheckGranularity words the clock is
// consulted: if total time exceeds kMaxTotalTimeMs we abandon extraction; if
// the current chunk exceeds kMaxTimePerChunkMs we post a task to resume and
// return.  Reaching the end of the iterator reports success.
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator.get()) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  int num_words = 0;
  while (state_->iterator->Advance()) {
    if (state_->iterator->IsWord()) {
      // prev()/pos() bracket the word just advanced over in |page_text_|.
      const size_t start = state_->iterator->prev();
      const size_t length = state_->iterator->pos() - start;
      HandleWord(base::StringPiece16(page_text_->data() + start, length));
      ++num_words;
    }

    // Only check the clock every kClockCheckGranularity words, since
    // clock_->Now() is comparatively expensive.
    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::ThreadTaskRunnerHandle::Get()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue processing words within this chunk.
    }
  }
  // Walked the entire page without hitting a limit; report success.
  RunCallback(true);
}
194
// Incorporates one word into both feature families:
//  (1) shingle hashes — a sliding window of shingle_size_ consecutive
//      lowercased words is murmur-hashed; only the max_shingles_per_page_
//      smallest hashes are retained (the largest is evicted when over limit);
//  (2) page terms — SHA-256 hashes of every n-gram (up to max_words_per_term_
//      words) ending at this word are checked against page_term_hashes_.
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // First, extract shingle hashes.
  const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  state_->current_shingle.append(word_lower + " ");
  state_->shingle_word_sizes.push_back(word_lower.size() + 1);
  if (state_->shingle_word_sizes.size() == shingle_size_) {
    shingle_hashes_->insert(
        MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    // Slide the window: drop the oldest word (its recorded size includes the
    // trailing space).
    state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    state_->shingle_word_sizes.pop_front();
  }
  // Check if the size of shingle hashes is over the limit.
  if (shingle_hashes_->size() > max_shingles_per_page_) {
    // Pop the largest one.  std::set is ordered, so that is the last element.
    std::set<uint32_t>::iterator it = shingle_hashes_->end();
    shingle_hashes_->erase(--it);
  }

  // Next, extract page terms.
  uint32_t word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  //
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  // Each iteration strips the oldest remaining word off the front, yielding
  // successively shorter n-grams that all end at the current word.
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes.  Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words.  That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
266
267 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
268 DCHECK(done_callback_.is_null());
269 DCHECK(!state_.get());
270 if (!done_callback_.is_null() || state_.get()) {
271 LOG(ERROR) << "Extraction in progress, missing call to "
272 << "CancelPendingExtraction";
273 }
274 }
275
276 void PhishingTermFeatureExtractor::RunCallback(bool success) {
277 // Record some timing stats that we can use to evaluate feature extraction
278 // performance. These include both successful and failed extractions.
279 DCHECK(state_.get());
280 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
281 state_->num_iterations);
282 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
283 clock_->Now() - state_->start_time);
284
285 DCHECK(!done_callback_.is_null());
286 done_callback_.Run(success);
287 Clear();
288 }
289
// Resets all per-extraction state; safe to call when nothing is pending.
void PhishingTermFeatureExtractor::Clear() {
  // nullptr instead of NULL, per modern C++/Chromium style (file already
  // uses C++11 features).
  page_text_ = nullptr;
  features_ = nullptr;
  shingle_hashes_ = nullptr;
  done_callback_.Reset();
  state_.reset();
}
297
298 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698