Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6
7 #include "base/bind.h"
8 #include "base/compiler_specific.h"
9 #include "base/containers/hash_tables.h"
10 #include "base/location.h"
11 #include "base/logging.h"
12 #include "base/metrics/histogram_macros.h"
13 #include "base/single_thread_task_runner.h"
14 #include "base/strings/string_util.h"
15 #include "base/threading/thread_task_runner_handle.h"
16 #include "base/time/time.h"
17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "content/public/renderer/render_view.h"
20 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
21 #include "third_party/WebKit/public/platform/WebString.h"
22 #include "third_party/WebKit/public/web/WebElement.h"
23 #include "third_party/WebKit/public/web/WebElementCollection.h"
24 #include "third_party/WebKit/public/web/WebLocalFrame.h"
25 #include "third_party/WebKit/public/web/WebView.h"
26
27 namespace safe_browsing {
28
29 // This time should be short enough that it doesn't noticeably disrupt the
30 // user's interaction with the page.
31 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
32
33 // Experimenting shows that we get a reasonable gain in performance by
34 // increasing this up to around 10, but there's not much benefit in
35 // increasing it past that.
36 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
37
38 // This should be longer than we expect feature extraction to take on any
39 // actual phishing page.
40 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
41
42 // Intermediate state used for computing features. See features.h for
43 // descriptions of the DOM features that are computed.
44 struct PhishingDOMFeatureExtractor::PageFeatureState {
45 // Link related features
46 int external_links;
47 base::hash_set<std::string> external_domains;
48 int secure_links;
49 int total_links;
50
51 // Form related features
52 int num_forms;
53 int num_text_inputs;
54 int num_pswd_inputs;
55 int num_radio_inputs;
56 int num_check_inputs;
57 int action_other_domain;
58 int total_actions;
59 base::hash_set<std::string> page_action_urls;
60
61 // Image related features
62 int img_other_domain;
63 int total_imgs;
64
65 // How many script tags
66 int num_script_tags;
67
68 // The time at which we started feature extraction for the current page.
69 base::TimeTicks start_time;
70
71 // The number of iterations we've done for the current extraction.
72 int num_iterations;
73
74 explicit PageFeatureState(base::TimeTicks start_time_ticks)
75 : external_links(0),
76 secure_links(0),
77 total_links(0),
78 num_forms(0),
79 num_text_inputs(0),
80 num_pswd_inputs(0),
81 num_radio_inputs(0),
82 num_check_inputs(0),
83 action_other_domain(0),
84 total_actions(0),
85 img_other_domain(0),
86 total_imgs(0),
87 num_script_tags(0),
88 start_time(start_time_ticks),
89 num_iterations(0) {}
90
91 ~PageFeatureState() {}
92 };
93
94 // Per-frame state
95 struct PhishingDOMFeatureExtractor::FrameData {
96 // This is our reference to document.all, which is an iterator over all
97 // of the elements in the document. It keeps track of our current position.
98 blink::WebElementCollection elements;
99 // The domain of the document URL, stored here so that we don't need to
100 // recompute it every time it's needed.
101 std::string domain;
102 };
103
104 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
105 FeatureExtractorClock* clock)
106 : clock_(clock), weak_factory_(this) {
107 Clear();
108 }
109
110 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
111 // The RenderView should have called CancelPendingExtraction() before
112 // we are destroyed.
113 CheckNoPendingExtraction();
114 }
115
116 void PhishingDOMFeatureExtractor::ExtractFeatures(
117 blink::WebDocument document,
118 FeatureMap* features,
119 const DoneCallback& done_callback) {
120 // The RenderView should have called CancelPendingExtraction() before
121 // starting a new extraction, so DCHECK this.
122 CheckNoPendingExtraction();
123 // However, in an opt build, we will go ahead and clean up the pending
124 // extraction so that we can start in a known state.
125 CancelPendingExtraction();
126
127 features_ = features;
128 done_callback_ = done_callback;
129
130 page_feature_state_.reset(new PageFeatureState(clock_->Now()));
131 cur_document_ = document;
132
133 base::ThreadTaskRunnerHandle::Get()->PostTask(
134 FROM_HERE,
135 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
136 weak_factory_.GetWeakPtr()));
137 }
138
139 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
140 // Cancel any pending callbacks, and clear our state.
141 weak_factory_.InvalidateWeakPtrs();
142 Clear();
143 }
144
145 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
146 DCHECK(page_feature_state_.get());
147 ++page_feature_state_->num_iterations;
148 base::TimeTicks current_chunk_start_time = clock_->Now();
149
150 if (cur_document_.isNull()) {
151 // This will only happen if we weren't able to get the document for the
152 // main frame. We'll treat this as an extraction failure.
153 RunCallback(false);
154 return;
155 }
156
157 int num_elements = 0;
158 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
159 blink::WebElement cur_element;
160 if (cur_frame_data_.get()) {
161 // We're resuming traversal of a frame, so just advance to the next
162 // element.
163 cur_element = cur_frame_data_->elements.nextItem();
164 // When we resume the traversal, the first call to nextItem() potentially
165 // has to walk through the document again from the beginning, if it was
166 // modified between our chunks of work. Log how long this takes, so we
167 // can tell if it's too slow.
168 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
169 clock_->Now() - current_chunk_start_time);
170 } else {
171 // We just moved to a new frame, so update our frame state
172 // and advance to the first element.
173 ResetFrameData();
174 cur_element = cur_frame_data_->elements.firstItem();
175 }
176
177 for (; !cur_element.isNull();
178 cur_element = cur_frame_data_->elements.nextItem()) {
179 if (cur_element.hasHTMLTagName("a")) {
180 HandleLink(cur_element);
181 } else if (cur_element.hasHTMLTagName("form")) {
182 HandleForm(cur_element);
183 } else if (cur_element.hasHTMLTagName("img")) {
184 HandleImage(cur_element);
185 } else if (cur_element.hasHTMLTagName("input")) {
186 HandleInput(cur_element);
187 } else if (cur_element.hasHTMLTagName("script")) {
188 HandleScript(cur_element);
189 }
190
191 if (++num_elements >= kClockCheckGranularity) {
192 num_elements = 0;
193 base::TimeTicks now = clock_->Now();
194 if (now - page_feature_state_->start_time >=
195 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
196 DLOG(ERROR) << "Feature extraction took too long, giving up";
197 // We expect this to happen infrequently, so record when it does.
198 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
199 RunCallback(false);
200 return;
201 }
202 base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
203 if (chunk_elapsed >=
204 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
205 // The time limit for the current chunk is up, so post a task to
206 // continue extraction.
207 //
208 // Record how much time we actually spent on the chunk. If this is
209 // much higher than kMaxTimePerChunkMs, we may need to adjust the
210 // clock granularity.
211 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
212 chunk_elapsed);
213 base::ThreadTaskRunnerHandle::Get()->PostTask(
214 FROM_HERE,
215 base::Bind(
216 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
217 weak_factory_.GetWeakPtr()));
218 return;
219 }
220 // Otherwise, continue.
221 }
222 }
223
224 // We're done with this frame, recalculate the FrameData when we
225 // advance to the next frame.
226 cur_frame_data_.reset();
227 }
228
229 InsertFeatures();
230 RunCallback(true);
231 }
232
233 void PhishingDOMFeatureExtractor::HandleLink(
234 const blink::WebElement& element) {
235 // Count the number of times we link to a different host.
236 if (!element.hasAttribute("href")) {
237 DVLOG(1) << "Skipping anchor tag with no href";
238 return;
239 }
240
241 // Retrieve the link and resolve the link in case it's relative.
242 blink::WebURL full_url = CompleteURL(element, element.getAttribute("href"));
243
244 std::string domain;
245 bool is_external = IsExternalDomain(full_url, &domain);
246 if (domain.empty()) {
247 DVLOG(1) << "Could not extract domain from link: " << full_url;
248 return;
249 }
250
251 if (is_external) {
252 ++page_feature_state_->external_links;
253
254 // Record each unique domain that we link to.
255 page_feature_state_->external_domains.insert(domain);
256 }
257
258 // Check how many are https links.
259 if (GURL(full_url).SchemeIs("https")) {
260 ++page_feature_state_->secure_links;
261 }
262
263 ++page_feature_state_->total_links;
264 }
265
266 void PhishingDOMFeatureExtractor::HandleForm(
267 const blink::WebElement& element) {
268 // Increment the number of forms on this page.
269 ++page_feature_state_->num_forms;
270
271 // Record whether the action points to a different domain.
272 if (!element.hasAttribute("action")) {
273 return;
274 }
275
276 blink::WebURL full_url = CompleteURL(element, element.getAttribute("action"));
277
278 page_feature_state_->page_action_urls.insert(full_url.string().utf8());
279
280 std::string domain;
281 bool is_external = IsExternalDomain(full_url, &domain);
282 if (domain.empty()) {
283 DVLOG(1) << "Could not extract domain from form action: " << full_url;
284 return;
285 }
286
287 if (is_external) {
288 ++page_feature_state_->action_other_domain;
289 }
290 ++page_feature_state_->total_actions;
291 }
292
293 void PhishingDOMFeatureExtractor::HandleImage(
294 const blink::WebElement& element) {
295 if (!element.hasAttribute("src")) {
296 DVLOG(1) << "Skipping img tag with no src";
297 }
298
299 // Record whether the image points to a different domain.
300 blink::WebURL full_url = CompleteURL(element, element.getAttribute("src"));
301 std::string domain;
302 bool is_external = IsExternalDomain(full_url, &domain);
303 if (domain.empty()) {
304 DVLOG(1) << "Could not extract domain from image src: " << full_url;
305 return;
306 }
307
308 if (is_external) {
309 ++page_feature_state_->img_other_domain;
310 }
311 ++page_feature_state_->total_imgs;
312 }
313
314 void PhishingDOMFeatureExtractor::HandleInput(
315 const blink::WebElement& element) {
316 // The HTML spec says that if the type is unspecified, it defaults to text.
317 // In addition, any unrecognized type will be treated as a text input.
318 //
319 // Note that we use the attribute value rather than
320 // WebFormControlElement::formControlType() for consistency with the
321 // way the phishing classification model is created.
322 std::string type = base::ToLowerASCII(element.getAttribute("type").utf8());
323 if (type == "password") {
324 ++page_feature_state_->num_pswd_inputs;
325 } else if (type == "radio") {
326 ++page_feature_state_->num_radio_inputs;
327 } else if (type == "checkbox") {
328 ++page_feature_state_->num_check_inputs;
329 } else if (type != "submit" && type != "reset" && type != "file" &&
330 type != "hidden" && type != "image" && type != "button") {
331 // Note that there are a number of new input types in HTML5 that are not
332 // handled above. For now, we will consider these as text inputs since
333 // they could be used to capture user input.
334 ++page_feature_state_->num_text_inputs;
335 }
336 }
337
338 void PhishingDOMFeatureExtractor::HandleScript(
339 const blink::WebElement& element) {
340 ++page_feature_state_->num_script_tags;
341 }
342
343 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
344 DCHECK(done_callback_.is_null());
345 DCHECK(!cur_frame_data_.get());
346 DCHECK(cur_document_.isNull());
347 if (!done_callback_.is_null() || cur_frame_data_.get() ||
348 !cur_document_.isNull()) {
349 LOG(ERROR) << "Extraction in progress, missing call to "
350 << "CancelPendingExtraction";
351 }
352 }
353
354 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
355 // Record some timing stats that we can use to evaluate feature extraction
356 // performance. These include both successful and failed extractions.
357 DCHECK(page_feature_state_.get());
358 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
359 page_feature_state_->num_iterations);
360 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
361 clock_->Now() - page_feature_state_->start_time);
362
363 DCHECK(!done_callback_.is_null());
364 done_callback_.Run(success);
365 Clear();
366 }
367
368 void PhishingDOMFeatureExtractor::Clear() {
369 features_ = NULL;
370 done_callback_.Reset();
371 cur_frame_data_.reset(NULL);
372 cur_document_.reset();
373 }
374
375 void PhishingDOMFeatureExtractor::ResetFrameData() {
376 DCHECK(!cur_document_.isNull());
377 DCHECK(!cur_frame_data_.get());
378
379 cur_frame_data_.reset(new FrameData());
380 cur_frame_data_->elements = cur_document_.all();
381 cur_frame_data_->domain =
382 net::registry_controlled_domains::GetDomainAndRegistry(
383 cur_document_.url(),
384 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
385 }
386
387 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
388 DCHECK(!cur_document_.isNull());
389 blink::WebFrame* frame = cur_document_.frame();
390 // Advance to the next frame that contains a document, with no wrapping.
391 if (frame) {
392 for (frame = frame->traverseNext(); frame; frame = frame->traverseNext()) {
393 if (!frame->document().isNull()) {
394 return frame->document();
395 }
396 }
397 } else {
398 // Keep track of how often frame traversal got "stuck" due to the
399 // current subdocument getting removed from the frame tree.
400 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
401 }
402 return blink::WebDocument();
403 }
404
405 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
406 std::string* domain) const {
407 DCHECK(domain);
408 DCHECK(cur_frame_data_.get());
409
410 if (cur_frame_data_->domain.empty()) {
411 return false;
412 }
413
414 // TODO(bryner): Ensure that the url encoding is consistent with the features
415 // in the model.
416 if (url.HostIsIPAddress()) {
417 domain->assign(url.host());
418 } else {
419 domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
420 url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
421 }
422
423 return !domain->empty() && *domain != cur_frame_data_->domain;
424 }
425
426 blink::WebURL PhishingDOMFeatureExtractor::CompleteURL(
427 const blink::WebElement& element,
428 const blink::WebString& partial_url) {
429 return element.document().completeURL(partial_url);
430 }
431
432 void PhishingDOMFeatureExtractor::InsertFeatures() {
433 DCHECK(page_feature_state_.get());
434
435 if (page_feature_state_->total_links > 0) {
436 // Add a feature for the fraction of times the page links to an external
437 // domain vs. an internal domain.
438 double link_freq = static_cast<double>(
439 page_feature_state_->external_links) /
440 page_feature_state_->total_links;
441 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
442
443 // Add a feature for each unique domain that we're linking to
444 for (const auto& domain : page_feature_state_->external_domains) {
445 features_->AddBooleanFeature(features::kPageLinkDomain + domain);
446 }
447
448 // Fraction of links that use https.
449 double secure_freq = static_cast<double>(
450 page_feature_state_->secure_links) / page_feature_state_->total_links;
451 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
452 }
453
454 // Record whether forms appear and whether various form elements appear.
455 if (page_feature_state_->num_forms > 0) {
456 features_->AddBooleanFeature(features::kPageHasForms);
457 }
458 if (page_feature_state_->num_text_inputs > 0) {
459 features_->AddBooleanFeature(features::kPageHasTextInputs);
460 }
461 if (page_feature_state_->num_pswd_inputs > 0) {
462 features_->AddBooleanFeature(features::kPageHasPswdInputs);
463 }
464 if (page_feature_state_->num_radio_inputs > 0) {
465 features_->AddBooleanFeature(features::kPageHasRadioInputs);
466 }
467 if (page_feature_state_->num_check_inputs > 0) {
468 features_->AddBooleanFeature(features::kPageHasCheckInputs);
469 }
470
471 // Record fraction of form actions that point to a different domain.
472 if (page_feature_state_->total_actions > 0) {
473 double action_freq = static_cast<double>(
474 page_feature_state_->action_other_domain) /
475 page_feature_state_->total_actions;
476 features_->AddRealFeature(features::kPageActionOtherDomainFreq,
477 action_freq);
478 }
479
480 // Add a feature for each unique external action url.
481 for (const auto& url : page_feature_state_->page_action_urls) {
482 features_->AddBooleanFeature(features::kPageActionURL + url);
483 }
484
485 // Record how many image src attributes point to a different domain.
486 if (page_feature_state_->total_imgs > 0) {
487 double img_freq = static_cast<double>(
488 page_feature_state_->img_other_domain) /
489 page_feature_state_->total_imgs;
490 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
491 }
492
493 // Record number of script tags (discretized for numerical stability.)
494 if (page_feature_state_->num_script_tags > 1) {
495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
496 if (page_feature_state_->num_script_tags > 6) {
497 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
498 }
499 }
500 }
501
502 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698