| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
| 6 | 6 |
| 7 #include "base/compiler_specific.h" | 7 #include "base/compiler_specific.h" |
| 8 #include "base/hash_tables.h" | 8 #include "base/hash_tables.h" |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/message_loop.h" | 10 #include "base/message_loop.h" |
| 11 #include "base/metrics/histogram.h" | 11 #include "base/metrics/histogram.h" |
| 12 #include "base/string_util.h" | 12 #include "base/string_util.h" |
| 13 #include "base/time.h" | 13 #include "base/time.h" |
| 14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 15 #include "chrome/renderer/safe_browsing/features.h" | 15 #include "chrome/renderer/safe_browsing/features.h" |
| 16 #include "content/renderer/render_view.h" | 16 #include "content/renderer/render_view.h" |
| 17 #include "net/base/registry_controlled_domain.h" | 17 #include "net/base/registry_controlled_domain.h" |
| 18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h" | |
| 19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" | 18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" |
| 20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" | 19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" |
| 21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h" | 20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h" |
| 22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" | 21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" |
| 23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" | 22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" |
| 24 | 23 |
| 25 namespace safe_browsing { | 24 namespace safe_browsing { |
| 26 | 25 |
| 27 // This time should be short enough that it doesn't noticeably disrupt the | 26 // This time should be short enough that it doesn't noticeably disrupt the |
| 28 // user's interaction with the page. | 27 // user's interaction with the page. |
| (...skipping 91 matching lines...) |
| 120 // starting a new extraction, so DCHECK this. | 119 // starting a new extraction, so DCHECK this. |
| 121 CheckNoPendingExtraction(); | 120 CheckNoPendingExtraction(); |
| 122 // However, in an opt build, we will go ahead and clean up the pending | 121 // However, in an opt build, we will go ahead and clean up the pending |
| 123 // extraction so that we can start in a known state. | 122 // extraction so that we can start in a known state. |
| 124 CancelPendingExtraction(); | 123 CancelPendingExtraction(); |
| 125 | 124 |
| 126 features_ = features; | 125 features_ = features; |
| 127 done_callback_.reset(done_callback); | 126 done_callback_.reset(done_callback); |
| 128 | 127 |
| 129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); | 128 page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
| 129 WebKit::WebView* web_view = render_view_->webview(); |
| 130 if (web_view && web_view->mainFrame()) { |
| 131 cur_document_ = web_view->mainFrame()->document(); |
| 132 } |
| 133 |
| 130 MessageLoop::current()->PostTask( | 134 MessageLoop::current()->PostTask( |
| 131 FROM_HERE, | 135 FROM_HERE, |
| 132 method_factory_.NewRunnableMethod( | 136 method_factory_.NewRunnableMethod( |
| 133 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); | 137 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 134 } | 138 } |
| 135 | 139 |
| 136 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { | 140 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
| 137 // Cancel any pending callbacks, and clear our state. | 141 // Cancel any pending callbacks, and clear our state. |
| 138 method_factory_.RevokeAll(); | 142 method_factory_.RevokeAll(); |
| 139 Clear(); | 143 Clear(); |
| 140 } | 144 } |
| 141 | 145 |
| 142 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { | 146 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
| 143 DCHECK(page_feature_state_.get()); | 147 DCHECK(page_feature_state_.get()); |
| 144 ++page_feature_state_->num_iterations; | 148 ++page_feature_state_->num_iterations; |
| 145 base::TimeTicks current_chunk_start_time = clock_->Now(); | 149 base::TimeTicks current_chunk_start_time = clock_->Now(); |
| 146 | 150 |
| 147 if (!cur_frame_) { | 151 if (cur_document_.isNull()) { |
| 148 WebKit::WebView* web_view = render_view_->webview(); | 152 // This will only happen if we weren't able to get the document for the |
| 149 if (!web_view) { | 153 // main frame. We'll treat this as an extraction failure. |
| 150 RunCallback(false); // The WebView is going away. | 154 RunCallback(false); |
| 151 return; | 155 return; |
| 152 } | |
| 153 cur_frame_ = web_view->mainFrame(); | |
| 154 } | 156 } |
| 155 | 157 |
| 156 int num_elements = 0; | 158 int num_elements = 0; |
| 157 for (; cur_frame_; | 159 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { |
| 158 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { | |
| 159 WebKit::WebNode cur_node; | 160 WebKit::WebNode cur_node; |
| 160 if (cur_frame_data_.get()) { | 161 if (cur_frame_data_.get()) { |
| 161 // We're resuming traversal of a frame, so just advance to the next node. | 162 // We're resuming traversal of a frame, so just advance to the next node. |
| 162 cur_node = cur_frame_data_->elements.nextItem(); | 163 cur_node = cur_frame_data_->elements.nextItem(); |
| 163 // When we resume the traversal, the first call to nextItem() potentially | 164 // When we resume the traversal, the first call to nextItem() potentially |
| 164 // has to walk through the document again from the beginning, if it was | 165 // has to walk through the document again from the beginning, if it was |
| 165 // modified between our chunks of work. Log how long this takes, so we | 166 // modified between our chunks of work. Log how long this takes, so we |
| 166 // can tell if it's too slow. | 167 // can tell if it's too slow. |
| 167 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | 168 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
| 168 clock_->Now() - current_chunk_start_time); | 169 clock_->Now() - current_chunk_start_time); |
| (...skipping 175 matching lines...) |
| 344 } | 345 } |
| 345 | 346 |
| 346 void PhishingDOMFeatureExtractor::HandleScript( | 347 void PhishingDOMFeatureExtractor::HandleScript( |
| 347 const WebKit::WebElement& element) { | 348 const WebKit::WebElement& element) { |
| 348 ++page_feature_state_->num_script_tags; | 349 ++page_feature_state_->num_script_tags; |
| 349 } | 350 } |
| 350 | 351 |
| 351 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { | 352 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
| 352 DCHECK(!done_callback_.get()); | 353 DCHECK(!done_callback_.get()); |
| 353 DCHECK(!cur_frame_data_.get()); | 354 DCHECK(!cur_frame_data_.get()); |
| 354 DCHECK(!cur_frame_); | 355 DCHECK(cur_document_.isNull()); |
| 355 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { | 356 if (done_callback_.get() || cur_frame_data_.get() || |
| 357 !cur_document_.isNull()) { |
| 356 LOG(ERROR) << "Extraction in progress, missing call to " | 358 LOG(ERROR) << "Extraction in progress, missing call to " |
| 357 << "CancelPendingExtraction"; | 359 << "CancelPendingExtraction"; |
| 358 } | 360 } |
| 359 } | 361 } |
| 360 | 362 |
| 361 void PhishingDOMFeatureExtractor::RunCallback(bool success) { | 363 void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
| 362 // Record some timing stats that we can use to evaluate feature extraction | 364 // Record some timing stats that we can use to evaluate feature extraction |
| 363 // performance. These include both successful and failed extractions. | 365 // performance. These include both successful and failed extractions. |
| 364 DCHECK(page_feature_state_.get()); | 366 DCHECK(page_feature_state_.get()); |
| 365 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", | 367 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", |
| 366 page_feature_state_->num_iterations); | 368 page_feature_state_->num_iterations); |
| 367 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", | 369 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", |
| 368 clock_->Now() - page_feature_state_->start_time); | 370 clock_->Now() - page_feature_state_->start_time); |
| 369 | 371 |
| 370 DCHECK(done_callback_.get()); | 372 DCHECK(done_callback_.get()); |
| 371 done_callback_->Run(success); | 373 done_callback_->Run(success); |
| 372 Clear(); | 374 Clear(); |
| 373 } | 375 } |
| 374 | 376 |
| 375 void PhishingDOMFeatureExtractor::Clear() { | 377 void PhishingDOMFeatureExtractor::Clear() { |
| 376 features_ = NULL; | 378 features_ = NULL; |
| 377 done_callback_.reset(NULL); | 379 done_callback_.reset(NULL); |
| 378 cur_frame_data_.reset(NULL); | 380 cur_frame_data_.reset(NULL); |
| 379 cur_frame_ = NULL; | 381 cur_document_.reset(); |
| 380 } | 382 } |
| 381 | 383 |
| 382 bool PhishingDOMFeatureExtractor::ResetFrameData() { | 384 bool PhishingDOMFeatureExtractor::ResetFrameData() { |
| 383 DCHECK(cur_frame_); | 385 DCHECK(!cur_document_.isNull()); |
| 384 DCHECK(!cur_frame_data_.get()); | 386 DCHECK(!cur_frame_data_.get()); |
| 385 | 387 |
| 386 WebKit::WebDocument doc = cur_frame_->document(); | |
| 387 if (doc.isNull()) { | |
| 388 return false; | |
| 389 } | |
| 390 cur_frame_data_.reset(new FrameData()); | 388 cur_frame_data_.reset(new FrameData()); |
| 391 cur_frame_data_->elements = doc.all(); | 389 cur_frame_data_->elements = cur_document_.all(); |
| 392 cur_frame_data_->domain = | 390 cur_frame_data_->domain = |
| 393 net::RegistryControlledDomainService::GetDomainAndRegistry( | 391 net::RegistryControlledDomainService::GetDomainAndRegistry( |
| 394 cur_frame_->document().url()); | 392 cur_document_.url()); |
| 395 return true; | 393 return true; |
| 396 } | 394 } |
| 397 | 395 |
| 396 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { |
| 397 DCHECK(!cur_document_.isNull()); |
| 398 WebKit::WebFrame* frame = cur_document_.frame(); |
| 399 // Advance to the next frame that contains a document, with no wrapping. |
| 400 if (frame) { |
| 401 while ((frame = frame->traverseNext(false))) { |
| 402 if (!frame->document().isNull()) { |
| 403 return frame->document(); |
| 404 } |
| 405 } |
| 406 } else { |
| 407 // Keep track of how often frame traversal got "stuck" due to the |
| 408 // current subdocument getting removed from the frame tree. |
| 409 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); |
| 410 } |
| 411 return WebKit::WebDocument(); |
| 412 } |
| 413 |
| 398 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, | 414 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, |
| 399 std::string* domain) const { | 415 std::string* domain) const { |
| 400 DCHECK(domain); | 416 DCHECK(domain); |
| 401 DCHECK(cur_frame_data_.get()); | 417 DCHECK(cur_frame_data_.get()); |
| 402 | 418 |
| 403 if (cur_frame_data_->domain.empty()) { | 419 if (cur_frame_data_->domain.empty()) { |
| 404 return false; | 420 return false; |
| 405 } | 421 } |
| 406 | 422 |
| 407 // TODO(bryner): Ensure that the url encoding is consistent with the features | 423 // TODO(bryner): Ensure that the url encoding is consistent with the features |
| (...skipping 69 matching lines...) |
| 477 // Record number of script tags (discretized for numerical stability.) | 493 // Record number of script tags (discretized for numerical stability.) |
| 478 if (page_feature_state_->num_script_tags > 1) { | 494 if (page_feature_state_->num_script_tags > 1) { |
| 479 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
| 480 if (page_feature_state_->num_script_tags > 6) { | 496 if (page_feature_state_->num_script_tags > 6) { |
| 481 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 497 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
| 482 } | 498 } |
| 483 } | 499 } |
| 484 } | 500 } |
| 485 | 501 |
| 486 } // namespace safe_browsing | 502 } // namespace safe_browsing |
| OLD | NEW |