OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
6 | 6 |
7 #include "base/compiler_specific.h" | 7 #include "base/compiler_specific.h" |
8 #include "base/hash_tables.h" | 8 #include "base/hash_tables.h" |
9 #include "base/logging.h" | 9 #include "base/logging.h" |
10 #include "base/message_loop.h" | 10 #include "base/message_loop.h" |
11 #include "base/metrics/histogram.h" | 11 #include "base/metrics/histogram.h" |
12 #include "base/string_util.h" | 12 #include "base/string_util.h" |
13 #include "base/time.h" | 13 #include "base/time.h" |
14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
15 #include "chrome/renderer/safe_browsing/features.h" | 15 #include "chrome/renderer/safe_browsing/features.h" |
16 #include "content/renderer/render_view.h" | 16 #include "content/renderer/render_view.h" |
17 #include "net/base/registry_controlled_domain.h" | 17 #include "net/base/registry_controlled_domain.h" |
18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h" | |
19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" | 18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" |
20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" | 19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" |
21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h" | 20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h" |
22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" | 21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" |
23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" | 22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" |
24 | 23 |
25 namespace safe_browsing { | 24 namespace safe_browsing { |
26 | 25 |
27 // This time should be short enough that it doesn't noticeably disrupt the | 26 // This time should be short enough that it doesn't noticeably disrupt the |
28 // user's interaction with the page. | 27 // user's interaction with the page. |
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
120 // starting a new extraction, so DCHECK this. | 119 // starting a new extraction, so DCHECK this. |
121 CheckNoPendingExtraction(); | 120 CheckNoPendingExtraction(); |
122 // However, in an opt build, we will go ahead and clean up the pending | 121 // However, in an opt build, we will go ahead and clean up the pending |
123 // extraction so that we can start in a known state. | 122 // extraction so that we can start in a known state. |
124 CancelPendingExtraction(); | 123 CancelPendingExtraction(); |
125 | 124 |
126 features_ = features; | 125 features_ = features; |
127 done_callback_.reset(done_callback); | 126 done_callback_.reset(done_callback); |
128 | 127 |
129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); | 128 page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
| 129 WebKit::WebView* web_view = render_view_->webview(); |
| 130 if (web_view && web_view->mainFrame()) { |
| 131 cur_document_ = web_view->mainFrame()->document(); |
| 132 } |
| 133 |
130 MessageLoop::current()->PostTask( | 134 MessageLoop::current()->PostTask( |
131 FROM_HERE, | 135 FROM_HERE, |
132 method_factory_.NewRunnableMethod( | 136 method_factory_.NewRunnableMethod( |
133 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); | 137 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
134 } | 138 } |
135 | 139 |
// Abandons whatever extraction is currently in flight. It first calls
// method_factory_.RevokeAll() and then Clear(), which resets the
// per-extraction members. Both columns of this diff are identical here.
136 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { | 140 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
137 // Cancel any pending callbacks, and clear our state. | 141 // Cancel any pending callbacks, and clear our state. |
// RevokeAll() runs before Clear(); presumably this keeps an already-posted
// ExtractFeaturesWithTimeout task from running against cleared state --
// TODO confirm against the method factory's documentation.
138 method_factory_.RevokeAll(); | 142 method_factory_.RevokeAll(); |
139 Clear(); | 143 Clear(); |
140 } | 144 } |
141 | 145 |
142 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { | 146 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
143 DCHECK(page_feature_state_.get()); | 147 DCHECK(page_feature_state_.get()); |
144 ++page_feature_state_->num_iterations; | 148 ++page_feature_state_->num_iterations; |
145 base::TimeTicks current_chunk_start_time = clock_->Now(); | 149 base::TimeTicks current_chunk_start_time = clock_->Now(); |
146 | 150 |
147 if (!cur_frame_) { | 151 if (cur_document_.isNull()) { |
148 WebKit::WebView* web_view = render_view_->webview(); | 152 // This will only happen if we weren't able to get the document for the |
149 if (!web_view) { | 153 // main frame. We'll treat this as an extraction failure. |
150 RunCallback(false); // The WebView is going away. | 154 RunCallback(false); |
151 return; | 155 return; |
152 } | |
153 cur_frame_ = web_view->mainFrame(); | |
154 } | 156 } |
155 | 157 |
156 int num_elements = 0; | 158 int num_elements = 0; |
157 for (; cur_frame_; | 159 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { |
158 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { | |
159 WebKit::WebNode cur_node; | 160 WebKit::WebNode cur_node; |
160 if (cur_frame_data_.get()) { | 161 if (cur_frame_data_.get()) { |
161 // We're resuming traversal of a frame, so just advance to the next node. | 162 // We're resuming traversal of a frame, so just advance to the next node. |
162 cur_node = cur_frame_data_->elements.nextItem(); | 163 cur_node = cur_frame_data_->elements.nextItem(); |
163 // When we resume the traversal, the first call to nextItem() potentially | 164 // When we resume the traversal, the first call to nextItem() potentially |
164 // has to walk through the document again from the beginning, if it was | 165 // has to walk through the document again from the beginning, if it was |
165 // modified between our chunks of work. Log how long this takes, so we | 166 // modified between our chunks of work. Log how long this takes, so we |
166 // can tell if it's too slow. | 167 // can tell if it's too slow. |
167 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | 168 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
168 clock_->Now() - current_chunk_start_time); | 169 clock_->Now() - current_chunk_start_time); |
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
344 } | 345 } |
345 | 346 |
// Per-element handler for script elements: increments
// page_feature_state_->num_script_tags by one. The |element| argument is not
// read; only the running count matters. Unchanged between OLD and NEW.
346 void PhishingDOMFeatureExtractor::HandleScript( | 347 void PhishingDOMFeatureExtractor::HandleScript( |
347 const WebKit::WebElement& element) { | 348 const WebKit::WebElement& element) { |
348 ++page_feature_state_->num_script_tags; | 349 ++page_feature_state_->num_script_tags; |
349 } | 350 } |
350 | 351 |
// Sanity check that no extraction is in progress. Three pieces of state must
// all be unset: the completion callback, the per-frame traversal data, and the
// traversal cursor. The cursor is the raw cur_frame_ pointer in the OLD column
// and the cur_document_ handle (tested with isNull()) in the NEW column.
// This function only reports the problem; it does not clean anything up.
351 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { | 352 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
352 DCHECK(!done_callback_.get()); | 353 DCHECK(!done_callback_.get()); |
353 DCHECK(!cur_frame_data_.get()); | 354 DCHECK(!cur_frame_data_.get()); |
354 DCHECK(!cur_frame_); | 355 DCHECK(cur_document_.isNull()); |
// The same three conditions are re-tested so the violation is also logged;
// presumably this covers builds where DCHECK is compiled out -- TODO confirm.
355 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { | 356 if (done_callback_.get() || cur_frame_data_.get() || |
| 357 !cur_document_.isNull()) { |
356 LOG(ERROR) << "Extraction in progress, missing call to " | 358 LOG(ERROR) << "Extraction in progress, missing call to " |
357 << "CancelPendingExtraction"; | 359 << "CancelPendingExtraction"; |
358 } | 360 } |
359 } | 361 } |
360 | 362 |
// Finishes an extraction: records the iteration-count and total-time
// histograms, invokes done_callback_ with |success|, and then resets the
// extractor through Clear(). Requires (DCHECKed) that both
// page_feature_state_ and done_callback_ are set. Unchanged between OLD and
// NEW.
361 void PhishingDOMFeatureExtractor::RunCallback(bool success) { | 363 void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
362 // Record some timing stats that we can use to evaluate feature extraction | 364 // Record some timing stats that we can use to evaluate feature extraction |
363 // performance. These include both successful and failed extractions. | 365 // performance. These include both successful and failed extractions. |
364 DCHECK(page_feature_state_.get()); | 366 DCHECK(page_feature_state_.get()); |
365 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", | 367 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", |
366 page_feature_state_->num_iterations); | 368 page_feature_state_->num_iterations); |
// Elapsed time is measured from page_feature_state_->start_time to now;
// presumably start_time is the clock_->Now() value handed to PageFeatureState
// when the extraction was started -- TODO confirm in its constructor.
367 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", | 369 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", |
368 clock_->Now() - page_feature_state_->start_time); | 370 clock_->Now() - page_feature_state_->start_time); |
369 | 371 |
370 DCHECK(done_callback_.get()); | 372 DCHECK(done_callback_.get()); |
// The callback must run before Clear(), because Clear() resets done_callback_.
371 done_callback_->Run(success); | 373 done_callback_->Run(success); |
372 Clear(); | 374 Clear(); |
373 } | 375 } |
374 | 376 |
// Resets the per-extraction members to their idle values: the features
// pointer, the completion callback, the per-frame traversal data, and the
// traversal cursor. The cursor is cur_frame_ set to NULL in the OLD column and
// cur_document_.reset() in the NEW column.
// Note that page_feature_state_ is not touched by this function.
375 void PhishingDOMFeatureExtractor::Clear() { | 377 void PhishingDOMFeatureExtractor::Clear() { |
376 features_ = NULL; | 378 features_ = NULL; |
377 done_callback_.reset(NULL); | 379 done_callback_.reset(NULL); |
378 cur_frame_data_.reset(NULL); | 380 cur_frame_data_.reset(NULL); |
379 cur_frame_ = NULL; | 381 cur_document_.reset(); |
380 } | 382 } |
381 | 383 |
// Creates a fresh FrameData in cur_frame_data_ for the frame or document that
// is about to be traversed. It stores the document's all() element collection
// and the registry-controlled domain of the document's URL.
// Preconditions (DCHECKed): a traversal cursor exists and no FrameData is
// currently held.
// OLD column: the document is fetched from cur_frame_, and the function
// returns false when that document is null.
// NEW column: cur_document_ is used directly, so the null check is gone and
// the only return visible here is true.
382 bool PhishingDOMFeatureExtractor::ResetFrameData() { | 384 bool PhishingDOMFeatureExtractor::ResetFrameData() { |
383 DCHECK(cur_frame_); | 385 DCHECK(!cur_document_.isNull()); |
384 DCHECK(!cur_frame_data_.get()); | 386 DCHECK(!cur_frame_data_.get()); |
385 | 387 |
386 WebKit::WebDocument doc = cur_frame_->document(); | |
387 if (doc.isNull()) { | |
388 return false; | |
389 } | |
390 cur_frame_data_.reset(new FrameData()); | 388 cur_frame_data_.reset(new FrameData()); |
391 cur_frame_data_->elements = doc.all(); | 389 cur_frame_data_->elements = cur_document_.all(); |
// The stored domain is what IsExternalDomain() later checks for emptiness.
392 cur_frame_data_->domain = | 390 cur_frame_data_->domain = |
393 net::RegistryControlledDomainService::GetDomainAndRegistry( | 391 net::RegistryControlledDomainService::GetDomainAndRegistry( |
394 cur_frame_->document().url()); | 392 cur_document_.url()); |
395 return true; | 393 return true; |
396 } | 394 } |
397 | 395 |
// (Present only in the NEW column.) Returns the document to visit after
// cur_document_. Starting from cur_document_.frame(), it steps through
// traverseNext(false) and returns the document of the first frame whose
// document is non-null. It returns a default-constructed WebDocument when
// the traversal runs out of frames or when cur_document_ has no frame.
// Requires (DCHECKed) that cur_document_ is non-null on entry.
| 396 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { |
| 397 DCHECK(!cur_document_.isNull()); |
| 398 WebKit::WebFrame* frame = cur_document_.frame(); |
| 399 // Advance to the next frame that contains a document, with no wrapping. |
| 400 if (frame) { |
// The assignment inside the condition is intentional (hence the doubled
// parentheses): the loop ends when traverseNext() yields a null frame.
| 401 while ((frame = frame->traverseNext(false))) { |
| 402 if (!frame->document().isNull()) { |
| 403 return frame->document(); |
| 404 } |
| 405 } |
| 406 } else { |
| 407 // Keep track of how often frame traversal got "stuck" due to the |
| 408 // current subdocument getting removed from the frame tree. |
| 409 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); |
| 410 } |
// Falling through to here ends the caller's document loop, since a
// default-constructed WebDocument is what is handed back.
| 411 return WebKit::WebDocument(); |
| 412 } |
| 413 |
398 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, | 414 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, |
399 std::string* domain) const { | 415 std::string* domain) const { |
400 DCHECK(domain); | 416 DCHECK(domain); |
401 DCHECK(cur_frame_data_.get()); | 417 DCHECK(cur_frame_data_.get()); |
402 | 418 |
403 if (cur_frame_data_->domain.empty()) { | 419 if (cur_frame_data_->domain.empty()) { |
404 return false; | 420 return false; |
405 } | 421 } |
406 | 422 |
407 // TODO(bryner): Ensure that the url encoding is consistent with the features | 423 // TODO(bryner): Ensure that the url encoding is consistent with the features |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
477 // Record number of script tags (discretized for numerical stability.) | 493 // Record number of script tags (discretized for numerical stability.) |
478 if (page_feature_state_->num_script_tags > 1) { | 494 if (page_feature_state_->num_script_tags > 1) { |
479 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
480 if (page_feature_state_->num_script_tags > 6) { | 496 if (page_feature_state_->num_script_tags > 6) { |
481 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 497 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
482 } | 498 } |
483 } | 499 } |
484 } | 500 } |
485 | 501 |
486 } // namespace safe_browsing | 502 } // namespace safe_browsing |
OLD | NEW |