| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
| 6 | 6 |
| 7 #include "base/compiler_specific.h" | 7 #include "base/compiler_specific.h" |
| 8 #include "base/hash_tables.h" | 8 #include "base/hash_tables.h" |
| 9 #include "base/histogram.h" | 9 #include "base/histogram.h" |
| 10 #include "base/logging.h" | 10 #include "base/logging.h" |
| 11 #include "base/message_loop.h" |
| 12 #include "base/time.h" |
| 11 #include "chrome/renderer/render_view.h" | 13 #include "chrome/renderer/render_view.h" |
| 14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 12 #include "chrome/renderer/safe_browsing/features.h" | 15 #include "chrome/renderer/safe_browsing/features.h" |
| 13 #include "net/base/registry_controlled_domain.h" | 16 #include "net/base/registry_controlled_domain.h" |
| 14 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" | 17 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" |
| 15 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h" | 18 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h" |
| 16 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" | 19 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" |
| 17 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" | 20 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" |
| 18 #include "third_party/WebKit/WebKit/chromium/public/WebString.h" | 21 #include "third_party/WebKit/WebKit/chromium/public/WebString.h" |
| 19 #include "third_party/WebKit/WebKit/chromium/public/WebView.h" | 22 #include "third_party/WebKit/WebKit/chromium/public/WebView.h" |
| 20 | 23 |
| 21 namespace safe_browsing { | 24 namespace safe_browsing { |
| 22 | 25 |
| 26 // This time should be short enough that it doesn't noticeably disrupt the |
| 27 // user's interaction with the page. |
| 28 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50; |
| 29 |
| 30 // Experimenting shows that we get a reasonable gain in performance by |
| 31 // increasing this up to around 10, but there's not much benefit in |
| 32 // increasing it past that. |
| 33 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; |
| 34 |
| 35 // This should be longer than we expect feature extraction to take on any |
| 36 // actual phishing page. |
| 37 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; |
| 38 |
| 23 // Intermediate state used for computing features. See features.h for | 39 // Intermediate state used for computing features. See features.h for |
| 24 // descriptions of the DOM features that are computed. | 40 // descriptions of the DOM features that are computed. |
| 25 struct PhishingDOMFeatureExtractor::PageFeatureState { | 41 struct PhishingDOMFeatureExtractor::PageFeatureState { |
| 26 // Link related features | 42 // Link related features |
| 27 int external_links; | 43 int external_links; |
| 28 base::hash_set<std::string> external_domains; | 44 base::hash_set<std::string> external_domains; |
| 29 int secure_links; | 45 int secure_links; |
| 30 int total_links; | 46 int total_links; |
| 31 | 47 |
| 32 // Form related features | 48 // Form related features |
| 33 int num_forms; | 49 int num_forms; |
| 34 int num_text_inputs; | 50 int num_text_inputs; |
| 35 int num_pswd_inputs; | 51 int num_pswd_inputs; |
| 36 int num_radio_inputs; | 52 int num_radio_inputs; |
| 37 int num_check_inputs; | 53 int num_check_inputs; |
| 38 int action_other_domain; | 54 int action_other_domain; |
| 39 int total_actions; | 55 int total_actions; |
| 40 | 56 |
| 41 // Image related features | 57 // Image related features |
| 42 int img_other_domain; | 58 int img_other_domain; |
| 43 int total_imgs; | 59 int total_imgs; |
| 44 | 60 |
| 45 // How many script tags | 61 // How many script tags |
| 46 int num_script_tags; | 62 int num_script_tags; |
| 47 | 63 |
| 48 PageFeatureState() | 64 // The time at which we started feature extraction for the current page. |
| 65 base::TimeTicks start_time; |
| 66 |
| 67 // The number of iterations we've done for the current extraction. |
| 68 int num_iterations; |
| 69 |
| 70 explicit PageFeatureState(base::TimeTicks start_time_ticks) |
| 49 : external_links(0), | 71 : external_links(0), |
| 50 secure_links(0), | 72 secure_links(0), |
| 51 total_links(0), | 73 total_links(0), |
| 52 num_forms(0), | 74 num_forms(0), |
| 53 num_text_inputs(0), | 75 num_text_inputs(0), |
| 54 num_pswd_inputs(0), | 76 num_pswd_inputs(0), |
| 55 num_radio_inputs(0), | 77 num_radio_inputs(0), |
| 56 num_check_inputs(0), | 78 num_check_inputs(0), |
| 57 action_other_domain(0), | 79 action_other_domain(0), |
| 58 total_actions(0), | 80 total_actions(0), |
| 59 img_other_domain(0), | 81 img_other_domain(0), |
| 60 total_imgs(0), | 82 total_imgs(0), |
| 61 num_script_tags(0) {} | 83 num_script_tags(0), |
| 84 start_time(start_time_ticks), |
| 85 num_iterations(0) {} |
| 62 | 86 |
| 63 ~PageFeatureState() {} | 87 ~PageFeatureState() {} |
| 64 }; | 88 }; |
| 65 | 89 |
| 66 // Per-frame state | 90 // Per-frame state |
| 67 struct PhishingDOMFeatureExtractor::FrameData { | 91 struct PhishingDOMFeatureExtractor::FrameData { |
| 68 // This is our reference to document.all, which is an iterator over all | 92 // This is our reference to document.all, which is an iterator over all |
| 69 // of the elements in the document. It keeps track of our current position. | 93 // of the elements in the document. It keeps track of our current position. |
| 70 WebKit::WebNodeCollection elements; | 94 WebKit::WebNodeCollection elements; |
| 71 // The domain of the document URL, stored here so that we don't need to | 95 // The domain of the document URL, stored here so that we don't need to |
| 72 // recompute it every time it's needed. | 96 // recompute it every time it's needed. |
| 73 std::string domain; | 97 std::string domain; |
| 74 }; | 98 }; |
| 75 | 99 |
| 76 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | 100 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
| 77 RenderView* render_view) | 101 RenderView* render_view, |
| 102 FeatureExtractorClock* clock) |
| 78 : render_view_(render_view), | 103 : render_view_(render_view), |
| 104 clock_(clock), |
| 79 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 105 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
| 80 Clear(); | 106 Clear(); |
| 81 } | 107 } |
| 82 | 108 |
| 83 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { | 109 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { |
| 84 // The RenderView should have called CancelPendingExtraction() before | 110 // The RenderView should have called CancelPendingExtraction() before |
| 85 // we are destroyed. | 111 // we are destroyed. |
| 86 CheckNoPendingExtraction(); | 112 CheckNoPendingExtraction(); |
| 87 } | 113 } |
| 88 | 114 |
| 89 void PhishingDOMFeatureExtractor::ExtractFeatures( | 115 void PhishingDOMFeatureExtractor::ExtractFeatures( |
| 90 FeatureMap* features, | 116 FeatureMap* features, |
| 91 DoneCallback* done_callback) { | 117 DoneCallback* done_callback) { |
| 92 // The RenderView should have called CancelPendingExtraction() before | 118 // The RenderView should have called CancelPendingExtraction() before |
| 93 // starting a new extraction, so DCHECK this. | 119 // starting a new extraction, so DCHECK this. |
| 94 CheckNoPendingExtraction(); | 120 CheckNoPendingExtraction(); |
| 95 // However, in an opt build, we will go ahead and clean up the pending | 121 // However, in an opt build, we will go ahead and clean up the pending |
| 96 // extraction so that we can start in a known state. | 122 // extraction so that we can start in a known state. |
| 97 CancelPendingExtraction(); | 123 CancelPendingExtraction(); |
| 98 | 124 |
| 99 features_ = features; | 125 features_ = features; |
| 100 done_callback_.reset(done_callback); | 126 done_callback_.reset(done_callback); |
| 127 |
| 128 page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
| 101 MessageLoop::current()->PostTask( | 129 MessageLoop::current()->PostTask( |
| 102 FROM_HERE, | 130 FROM_HERE, |
| 103 method_factory_.NewRunnableMethod( | 131 method_factory_.NewRunnableMethod( |
| 104 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); | 132 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 105 } | 133 } |
| 106 | 134 |
| 107 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { | 135 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
| 108 // Cancel any pending callbacks, and clear our state. | 136 // Cancel any pending callbacks, and clear our state. |
| 109 method_factory_.RevokeAll(); | 137 method_factory_.RevokeAll(); |
| 110 Clear(); | 138 Clear(); |
| 111 } | 139 } |
| 112 | 140 |
| 113 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { | 141 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
| 142 DCHECK(page_feature_state_.get()); |
| 143 ++page_feature_state_->num_iterations; |
| 144 base::TimeTicks current_chunk_start_time = clock_->Now(); |
| 145 |
| 114 if (!cur_frame_) { | 146 if (!cur_frame_) { |
| 115 WebKit::WebView* web_view = render_view_->webview(); | 147 WebKit::WebView* web_view = render_view_->webview(); |
| 116 if (!web_view) { | 148 if (!web_view) { |
| 117 // When the WebView is going away, the render view should have called | 149 // When the WebView is going away, the render view should have called |
| 118 // CancelPendingExtraction() which should have stopped any pending work, | 150 // CancelPendingExtraction() which should have stopped any pending work, |
| 119 // so this case should not happen. | 151 // so this case should not happen. |
| 120 NOTREACHED(); | 152 NOTREACHED(); |
| 121 RunCallback(false); | 153 RunCallback(false); |
| 122 return; | 154 return; |
| 123 } | 155 } |
| 124 cur_frame_ = web_view->mainFrame(); | 156 cur_frame_ = web_view->mainFrame(); |
| 125 page_feature_state_.reset(new PageFeatureState); | |
| 126 } | 157 } |
| 127 | 158 |
| 159 int num_elements = 0; |
| 128 for (; cur_frame_; | 160 for (; cur_frame_; |
| 129 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { | 161 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { |
| 130 WebKit::WebNode cur_node; | 162 WebKit::WebNode cur_node; |
| 131 if (cur_frame_data_.get()) { | 163 if (cur_frame_data_.get()) { |
| 132 // We're resuming traversal of a frame, so just advance to the next node. | 164 // We're resuming traversal of a frame, so just advance to the next node. |
| 133 cur_node = cur_frame_data_->elements.nextItem(); | 165 cur_node = cur_frame_data_->elements.nextItem(); |
| 166 // When we resume the traversal, the first call to nextItem() potentially |
| 167 // has to walk through the document again from the beginning, if it was |
| 168 // modified between our chunks of work. Log how long this takes, so we |
| 169 // can tell if it's too slow. |
| 170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
| 171 clock_->Now() - current_chunk_start_time); |
| 134 } else { | 172 } else { |
| 135 // We just moved to a new frame, so update our frame state | 173 // We just moved to a new frame, so update our frame state |
| 136 // and advance to the first element. | 174 // and advance to the first element. |
| 137 if (!ResetFrameData()) { | 175 if (!ResetFrameData()) { |
| 138 // Nothing in this frame, move on to the next one. | 176 // Nothing in this frame, move on to the next one. |
| 139 LOG(WARNING) << "No content in frame, skipping"; | 177 LOG(WARNING) << "No content in frame, skipping"; |
| 140 continue; | 178 continue; |
| 141 } | 179 } |
| 142 cur_node = cur_frame_data_->elements.firstItem(); | 180 cur_node = cur_frame_data_->elements.firstItem(); |
| 143 } | 181 } |
| 144 | 182 |
| 145 for (; !cur_node.isNull(); | 183 for (; !cur_node.isNull(); |
| 146 cur_node = cur_frame_data_->elements.nextItem()) { | 184 cur_node = cur_frame_data_->elements.nextItem()) { |
| 147 if (!cur_node.isElementNode()) { | 185 if (!cur_node.isElementNode()) { |
| 148 continue; | 186 continue; |
| 149 } | 187 } |
| 150 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); | 188 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); |
| 151 if (element.hasTagName("a")) { | 189 if (element.hasTagName("a")) { |
| 152 HandleLink(element); | 190 HandleLink(element); |
| 153 } else if (element.hasTagName("form")) { | 191 } else if (element.hasTagName("form")) { |
| 154 HandleForm(element); | 192 HandleForm(element); |
| 155 } else if (element.hasTagName("img")) { | 193 } else if (element.hasTagName("img")) { |
| 156 HandleImage(element); | 194 HandleImage(element); |
| 157 } else if (element.hasTagName("input")) { | 195 } else if (element.hasTagName("input")) { |
| 158 HandleInput(element); | 196 HandleInput(element); |
| 159 } else if (element.hasTagName("script")) { | 197 } else if (element.hasTagName("script")) { |
| 160 HandleScript(element); | 198 HandleScript(element); |
| 161 } | 199 } |
| 162 | 200 |
| 163 // TODO(bryner): stop if too much time has elapsed, and add histograms | 201 if (++num_elements >= kClockCheckGranularity) { |
| 164 // for the time spent processing. | 202 num_elements = 0; |
| 203 base::TimeTicks now = clock_->Now(); |
| 204 if (now - page_feature_state_->start_time >= |
| 205 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
| 206 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
| 207 // We expect this to happen infrequently, so record when it does. |
| 208 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); |
| 209 RunCallback(false); |
| 210 return; |
| 211 } |
| 212 base::TimeDelta chunk_elapsed = now - current_chunk_start_time; |
| 213 if (chunk_elapsed >= |
| 214 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { |
| 215 // The time limit for the current chunk is up, so post a task to |
| 216 // continue extraction. |
| 217 // |
| 218 // Record how much time we actually spent on the chunk. If this is |
| 219 // much higher than kMaxTimePerChunkMs, we may need to adjust the |
| 220 // clock granularity. |
| 221 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", |
| 222 chunk_elapsed); |
| 223 MessageLoop::current()->PostTask( |
| 224 FROM_HERE, |
| 225 method_factory_.NewRunnableMethod( |
| 226 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 227 return; |
| 228 } |
| 229 // Otherwise, continue. |
| 230 } |
| 165 } | 231 } |
| 166 | 232 |
| 167 // We're done with this frame, recalculate the FrameData when we | 233 // We're done with this frame, recalculate the FrameData when we |
| 168 // advance to the next frame. | 234 // advance to the next frame. |
| 169 cur_frame_data_.reset(); | 235 cur_frame_data_.reset(); |
| 170 } | 236 } |
| 171 | 237 |
| 172 InsertFeatures(); | 238 InsertFeatures(); |
| 173 RunCallback(true); | 239 RunCallback(true); |
| 174 } | 240 } |
| (...skipping 114 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 289 DCHECK(!done_callback_.get()); | 355 DCHECK(!done_callback_.get()); |
| 290 DCHECK(!cur_frame_data_.get()); | 356 DCHECK(!cur_frame_data_.get()); |
| 291 DCHECK(!cur_frame_); | 357 DCHECK(!cur_frame_); |
| 292 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { | 358 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { |
| 293 LOG(ERROR) << "Extraction in progress, missing call to " | 359 LOG(ERROR) << "Extraction in progress, missing call to " |
| 294 << "CancelPendingExtraction"; | 360 << "CancelPendingExtraction"; |
| 295 } | 361 } |
| 296 } | 362 } |
| 297 | 363 |
| 298 void PhishingDOMFeatureExtractor::RunCallback(bool success) { | 364 void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
| 365 // Record some timing stats that we can use to evaluate feature extraction |
| 366 // performance. These include both successful and failed extractions. |
| 367 DCHECK(page_feature_state_.get()); |
| 368 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", |
| 369 page_feature_state_->num_iterations); |
| 370 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", |
| 371 clock_->Now() - page_feature_state_->start_time); |
| 372 |
| 299 DCHECK(done_callback_.get()); | 373 DCHECK(done_callback_.get()); |
| 300 done_callback_->Run(success); | 374 done_callback_->Run(success); |
| 301 Clear(); | 375 Clear(); |
| 302 } | 376 } |
| 303 | 377 |
| 304 void PhishingDOMFeatureExtractor::Clear() { | 378 void PhishingDOMFeatureExtractor::Clear() { |
| 305 features_ = NULL; | 379 features_ = NULL; |
| 306 done_callback_.reset(NULL); | 380 done_callback_.reset(NULL); |
| 307 cur_frame_data_.reset(NULL); | 381 cur_frame_data_.reset(NULL); |
| 308 cur_frame_ = NULL; | 382 cur_frame_ = NULL; |
| (...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 407 // Record number of script tags (discretized for numerical stability.) | 481 // Record number of script tags (discretized for numerical stability.) |
| 408 if (page_feature_state_->num_script_tags > 1) { | 482 if (page_feature_state_->num_script_tags > 1) { |
| 409 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 483 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
| 410 if (page_feature_state_->num_script_tags > 6) { | 484 if (page_feature_state_->num_script_tags > 6) { |
| 411 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 485 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
| 412 } | 486 } |
| 413 } | 487 } |
| 414 } | 488 } |
| 415 | 489 |
| 416 } // namespace safe_browsing | 490 } // namespace safe_browsing |
| OLD | NEW |