OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
6 | 6 |
7 #include "base/compiler_specific.h" | 7 #include "base/compiler_specific.h" |
8 #include "base/hash_tables.h" | 8 #include "base/hash_tables.h" |
9 #include "base/histogram.h" | 9 #include "base/histogram.h" |
10 #include "base/logging.h" | 10 #include "base/logging.h" |
| 11 #include "base/message_loop.h" |
| 12 #include "base/time.h" |
11 #include "chrome/renderer/render_view.h" | 13 #include "chrome/renderer/render_view.h" |
| 14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
12 #include "chrome/renderer/safe_browsing/features.h" | 15 #include "chrome/renderer/safe_browsing/features.h" |
13 #include "net/base/registry_controlled_domain.h" | 16 #include "net/base/registry_controlled_domain.h" |
14 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" | 17 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" |
15 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h" | 18 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h" |
16 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" | 19 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" |
17 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" | 20 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" |
18 #include "third_party/WebKit/WebKit/chromium/public/WebString.h" | 21 #include "third_party/WebKit/WebKit/chromium/public/WebString.h" |
19 #include "third_party/WebKit/WebKit/chromium/public/WebView.h" | 22 #include "third_party/WebKit/WebKit/chromium/public/WebView.h" |
20 | 23 |
21 namespace safe_browsing { | 24 namespace safe_browsing { |
22 | 25 |
// NOTE(review): these three constants appear only on the NEW side of the diff.
// The two *Ms values are converted with base::TimeDelta::FromMilliseconds() in
// ExtractFeaturesWithTimeout(); kClockCheckGranularity is the number of
// handled elements between successive clock_->Now() checks in that function.
| 26 // This time should be short enough that it doesn't noticeably disrupt the |
| 27 // user's interaction with the page. |
| 28 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50; |
| 29 |
| 30 // Experimenting shows that we get a reasonable gain in performance by |
| 31 // increasing this up to around 10, but there's not much benefit in |
| 32 // increasing it past that. |
| 33 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; |
| 34 |
| 35 // This should be longer than we expect feature extraction to take on any |
| 36 // actual phishing page. |
| 37 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; |
| 38 |
// NOTE(review): PageFeatureState is the per-extraction accumulator. The NEW
// side adds two bookkeeping members (start_time, num_iterations) and changes
// the constructor from a default constructor to an explicit one that takes the
// extraction start time. The counters are presumably updated by the Handle*()
// helpers, which are outside this excerpt -- confirm there.
23 // Intermediate state used for computing features. See features.h for | 39 // Intermediate state used for computing features. See features.h for |
24 // descriptions of the DOM features that are computed. | 40 // descriptions of the DOM features that are computed. |
25 struct PhishingDOMFeatureExtractor::PageFeatureState { | 41 struct PhishingDOMFeatureExtractor::PageFeatureState { |
26 // Link related features | 42 // Link related features |
27 int external_links; | 43 int external_links; |
28 base::hash_set<std::string> external_domains; | 44 base::hash_set<std::string> external_domains; |
29 int secure_links; | 45 int secure_links; |
30 int total_links; | 46 int total_links; |
31 | 47 |
32 // Form related features | 48 // Form related features |
33 int num_forms; | 49 int num_forms; |
34 int num_text_inputs; | 50 int num_text_inputs; |
35 int num_pswd_inputs; | 51 int num_pswd_inputs; |
36 int num_radio_inputs; | 52 int num_radio_inputs; |
37 int num_check_inputs; | 53 int num_check_inputs; |
38 int action_other_domain; | 54 int action_other_domain; |
39 int total_actions; | 55 int total_actions; |
40 | 56 |
41 // Image related features | 57 // Image related features |
42 int img_other_domain; | 58 int img_other_domain; |
43 int total_imgs; | 59 int total_imgs; |
44 | 60 |
45 // How many script tags | 61 // How many script tags |
46 int num_script_tags; | 62 int num_script_tags; |
47 | 63 |
// NOTE(review): start_time is read by the total-time limit check in
// ExtractFeaturesWithTimeout() and by the DOMFeatureTotalTime histogram in
// RunCallback().
48 PageFeatureState() | 64 // The time at which we started feature extraction for the current page. |
| 65 base::TimeTicks start_time; |
| 66 |
// NOTE(review): num_iterations is incremented once per call to
// ExtractFeaturesWithTimeout(), i.e. once per chunk of work, and is reported
// through the DOMFeatureIterations histogram in RunCallback().
| 67 // The number of iterations we've done for the current extraction. |
| 68 int num_iterations; |
| 69 |
// NOTE(review): every integer counter starts at zero; only start_time is
// supplied by the caller. external_domains is default-constructed (empty).
| 70 explicit PageFeatureState(base::TimeTicks start_time_ticks) |
49 : external_links(0), | 71 : external_links(0), |
50 secure_links(0), | 72 secure_links(0), |
51 total_links(0), | 73 total_links(0), |
52 num_forms(0), | 74 num_forms(0), |
53 num_text_inputs(0), | 75 num_text_inputs(0), |
54 num_pswd_inputs(0), | 76 num_pswd_inputs(0), |
55 num_radio_inputs(0), | 77 num_radio_inputs(0), |
56 num_check_inputs(0), | 78 num_check_inputs(0), |
57 action_other_domain(0), | 79 action_other_domain(0), |
58 total_actions(0), | 80 total_actions(0), |
59 img_other_domain(0), | 81 img_other_domain(0), |
60 total_imgs(0), | 82 total_imgs(0), |
61 num_script_tags(0) {} | 83 num_script_tags(0), |
| 84 start_time(start_time_ticks), |
| 85 num_iterations(0) {} |
62 | 86 |
63 ~PageFeatureState() {} | 87 ~PageFeatureState() {} |
64 }; | 88 }; |
65 | 89 |
// NOTE(review): FrameData is unchanged by this diff. In
// ExtractFeaturesWithTimeout() a non-null cur_frame_data_ is the signal that a
// frame's traversal is being resumed, and cur_frame_data_ is reset once a
// frame has been fully walked.
66 // Per-frame state | 90 // Per-frame state |
67 struct PhishingDOMFeatureExtractor::FrameData { | 91 struct PhishingDOMFeatureExtractor::FrameData { |
68 // This is our reference to document.all, which is an iterator over all | 92 // This is our reference to document.all, which is an iterator over all |
69 // of the elements in the document. It keeps track of our current position. | 93 // of the elements in the document. It keeps track of our current position. |
70 WebKit::WebNodeCollection elements; | 94 WebKit::WebNodeCollection elements; |
71 // The domain of the document URL, stored here so that we don't need to | 95 // The domain of the document URL, stored here so that we don't need to |
72 // recompute it every time it's needed. | 96 // recompute it every time it's needed. |
73 std::string domain; | 97 std::string domain; |
74 }; | 98 }; |
75 | 99 |
// NOTE(review): the NEW side adds a FeatureExtractorClock* parameter that is
// stored in clock_ and used for every clock_->Now() call in this file. Whether
// this object owns |clock| is not visible here -- confirm in the header.
// Clear() is called so the extractor starts with no pending extraction state.
76 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | 100 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
77 RenderView* render_view) | 101 RenderView* render_view, |
| 102 FeatureExtractorClock* clock) |
78 : render_view_(render_view), | 103 : render_view_(render_view), |
| 104 clock_(clock), |
79 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { | 105 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
80 Clear(); | 106 Clear(); |
81 } | 107 } |
82 | 108 |
// NOTE(review): unchanged by this diff. The destructor only verifies (via
// CheckNoPendingExtraction(), which DCHECKs and logs) that no extraction is
// still in flight; it does not cancel one itself.
83 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { | 109 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { |
84 // The RenderView should have called CancelPendingExtraction() before | 110 // The RenderView should have called CancelPendingExtraction() before |
85 // we are destroyed. | 111 // we are destroyed. |
86 CheckNoPendingExtraction(); | 112 CheckNoPendingExtraction(); |
87 } | 113 } |
88 | 114 |
// NOTE(review): entry point for an extraction. It records the output
// FeatureMap and the completion callback, then posts
// ExtractFeaturesWithTimeout() to the current MessageLoop rather than doing any
// DOM work synchronously. done_callback_ is set with reset(), so this object
// presumably takes ownership of |done_callback| -- confirm in the header.
89 void PhishingDOMFeatureExtractor::ExtractFeatures( | 115 void PhishingDOMFeatureExtractor::ExtractFeatures( |
90 FeatureMap* features, | 116 FeatureMap* features, |
91 DoneCallback* done_callback) { | 117 DoneCallback* done_callback) { |
92 // The RenderView should have called CancelPendingExtraction() before | 118 // The RenderView should have called CancelPendingExtraction() before |
93 // starting a new extraction, so DCHECK this. | 119 // starting a new extraction, so DCHECK this. |
94 CheckNoPendingExtraction(); | 120 CheckNoPendingExtraction(); |
95 // However, in an opt build, we will go ahead and clean up the pending | 121 // However, in an opt build, we will go ahead and clean up the pending |
96 // extraction so that we can start in a known state. | 122 // extraction so that we can start in a known state. |
97 CancelPendingExtraction(); | 123 CancelPendingExtraction(); |
98 | 124 |
99 features_ = features; | 125 features_ = features; |
100 done_callback_.reset(done_callback); | 126 done_callback_.reset(done_callback); |
| 127 |
// NOTE(review): NEW side only. The PageFeatureState is now created here, not
// lazily inside ExtractFeaturesWithTimeout(), so start_time is taken before
// the task is posted and the measured total time includes the wait in the
// message loop queue.
| 128 page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
101 MessageLoop::current()->PostTask( | 129 MessageLoop::current()->PostTask( |
102 FROM_HERE, | 130 FROM_HERE, |
103 method_factory_.NewRunnableMethod( | 131 method_factory_.NewRunnableMethod( |
104 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); | 132 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
105 } | 133 } |
106 | 134 |
// NOTE(review): unchanged by this diff. RevokeAll() is called on the same
// method_factory_ that creates the posted ExtractFeaturesWithTimeout() tasks,
// and Clear() then drops the stored extraction state. Note that the done
// callback is not run on this path.
107 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { | 135 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
108 // Cancel any pending callbacks, and clear our state. | 136 // Cancel any pending callbacks, and clear our state. |
109 method_factory_.RevokeAll(); | 137 method_factory_.RevokeAll(); |
110 Clear(); | 138 Clear(); |
111 } | 139 } |
112 | 140 |
// NOTE(review): performs one "chunk" of DOM feature extraction. Each call walks
// frames and their elements from where the previous chunk stopped and ends in
// one of three ways:
//   1. all frames are exhausted -> InsertFeatures() then RunCallback(true);
//   2. total elapsed time since start_time reaches kMaxTotalTimeMs ->
//      RunCallback(false), without calling InsertFeatures();
//   3. this chunk's elapsed time reaches kMaxTimePerChunkMs -> the function
//      re-posts itself and returns, leaving the traversal state in place.
// The OLD side ran the whole traversal in a single call with no time limits.
113 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { | 141 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
| 142 DCHECK(page_feature_state_.get()); |
| 143 ++page_feature_state_->num_iterations; |
| 144 base::TimeTicks current_chunk_start_time = clock_->Now(); |
| 145 |
// NOTE(review): cur_frame_ is set to NULL by Clear(), so this branch runs on
// the first chunk of an extraction; a chunk that stopped mid-frame leaves
// cur_frame_ non-null and skips it.
114 if (!cur_frame_) { | 146 if (!cur_frame_) { |
115 WebKit::WebView* web_view = render_view_->webview(); | 147 WebKit::WebView* web_view = render_view_->webview(); |
116 if (!web_view) { | 148 if (!web_view) { |
117 // When the WebView is going away, the render view should have called | 149 // When the WebView is going away, the render view should have called |
118 // CancelPendingExtraction() which should have stopped any pending work, | 150 // CancelPendingExtraction() which should have stopped any pending work, |
119 // so this case should not happen. | 151 // so this case should not happen. |
120 NOTREACHED(); | 152 NOTREACHED(); |
121 RunCallback(false); | 153 RunCallback(false); |
122 return; | 154 return; |
123 } | 155 } |
124 cur_frame_ = web_view->mainFrame(); | 156 cur_frame_ = web_view->mainFrame(); |
125 page_feature_state_.reset(new PageFeatureState); | |
126 } | 157 } |
127 | 158 |
// NOTE(review): num_elements counts elements handled since the last clock
// check. It is a local, so it starts again from 0 in every chunk.
| 159 int num_elements = 0; |
128 for (; cur_frame_; | 160 for (; cur_frame_; |
129 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { | 161 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { |
130 WebKit::WebNode cur_node; | 162 WebKit::WebNode cur_node; |
131 if (cur_frame_data_.get()) { | 163 if (cur_frame_data_.get()) { |
132 // We're resuming traversal of a frame, so just advance to the next node. | 164 // We're resuming traversal of a frame, so just advance to the next node. |
133 cur_node = cur_frame_data_->elements.nextItem(); | 165 cur_node = cur_frame_data_->elements.nextItem(); |
| 166 // When we resume the traversal, the first call to nextItem() potentially |
| 167 // has to walk through the document again from the beginning, if it was |
| 168 // modified between our chunks of work. Log how long this takes, so we |
| 169 // can tell if it's too slow. |
| 170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
| 171 clock_->Now() - current_chunk_start_time); |
134 } else { | 172 } else { |
135 // We just moved to a new frame, so update our frame state | 173 // We just moved to a new frame, so update our frame state |
136 // and advance to the first element. | 174 // and advance to the first element. |
137 if (!ResetFrameData()) { | 175 if (!ResetFrameData()) { |
138 // Nothing in this frame, move on to the next one. | 176 // Nothing in this frame, move on to the next one. |
139 LOG(WARNING) << "No content in frame, skipping"; | 177 LOG(WARNING) << "No content in frame, skipping"; |
140 continue; | 178 continue; |
141 } | 179 } |
142 cur_node = cur_frame_data_->elements.firstItem(); | 180 cur_node = cur_frame_data_->elements.firstItem(); |
143 } | 181 } |
144 | 182 |
145 for (; !cur_node.isNull(); | 183 for (; !cur_node.isNull(); |
146 cur_node = cur_frame_data_->elements.nextItem()) { | 184 cur_node = cur_frame_data_->elements.nextItem()) { |
// NOTE(review): this `continue` also bypasses the clock check below, so
// non-element nodes are not counted toward kClockCheckGranularity.
147 if (!cur_node.isElementNode()) { | 185 if (!cur_node.isElementNode()) { |
148 continue; | 186 continue; |
149 } | 187 } |
150 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); | 188 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); |
151 if (element.hasTagName("a")) { | 189 if (element.hasTagName("a")) { |
152 HandleLink(element); | 190 HandleLink(element); |
153 } else if (element.hasTagName("form")) { | 191 } else if (element.hasTagName("form")) { |
154 HandleForm(element); | 192 HandleForm(element); |
155 } else if (element.hasTagName("img")) { | 193 } else if (element.hasTagName("img")) { |
156 HandleImage(element); | 194 HandleImage(element); |
157 } else if (element.hasTagName("input")) { | 195 } else if (element.hasTagName("input")) { |
158 HandleInput(element); | 196 HandleInput(element); |
159 } else if (element.hasTagName("script")) { | 197 } else if (element.hasTagName("script")) { |
160 HandleScript(element); | 198 HandleScript(element); |
161 } | 199 } |
162 | 200 |
// NOTE(review): the NEW side replaces the OLD-side TODO with the real time
// checks. The clock is read only once every kClockCheckGranularity elements.
// The total-time limit is tested before the per-chunk limit, so an
// extraction that is over both limits fails instead of being re-posted.
163 // TODO(bryner): stop if too much time has elapsed, and add histograms | 201 if (++num_elements >= kClockCheckGranularity) { |
164 // for the time spent processing. | 202 num_elements = 0; |
| 203 base::TimeTicks now = clock_->Now(); |
| 204 if (now - page_feature_state_->start_time >= |
| 205 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
| 206 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
| 207 // We expect this to happen infrequently, so record when it does. |
| 208 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); |
| 209 RunCallback(false); |
| 210 return; |
| 211 } |
| 212 base::TimeDelta chunk_elapsed = now - current_chunk_start_time; |
| 213 if (chunk_elapsed >= |
| 214 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { |
| 215 // The time limit for the current chunk is up, so post a task to |
| 216 // continue extraction. |
| 217 // |
| 218 // Record how much time we actually spent on the chunk. If this is |
| 219 // much higher than kMaxTimePerChunkMs, we may need to adjust the |
| 220 // clock granularity. |
| 221 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", |
| 222 chunk_elapsed); |
| 223 MessageLoop::current()->PostTask( |
| 224 FROM_HERE, |
| 225 method_factory_.NewRunnableMethod( |
| 226 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
// NOTE(review): cur_frame_ and cur_frame_data_ are deliberately left as they
// are on this return, so the re-posted task takes the "resuming" branch above
// and continues with nextItem() on the same frame.
| 227 return; |
| 228 } |
| 229 // Otherwise, continue. |
| 230 } |
165 } | 231 } |
166 | 232 |
167 // We're done with this frame, recalculate the FrameData when we | 233 // We're done with this frame, recalculate the FrameData when we |
168 // advance to the next frame. | 234 // advance to the next frame. |
169 cur_frame_data_.reset(); | 235 cur_frame_data_.reset(); |
170 } | 236 } |
171 | 237 |
172 InsertFeatures(); | 238 InsertFeatures(); |
173 RunCallback(true); | 239 RunCallback(true); |
174 } | 240 } |
(...skipping 114 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
289 DCHECK(!done_callback_.get()); | 355 DCHECK(!done_callback_.get()); |
290 DCHECK(!cur_frame_data_.get()); | 356 DCHECK(!cur_frame_data_.get()); |
291 DCHECK(!cur_frame_); | 357 DCHECK(!cur_frame_); |
292 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { | 358 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { |
293 LOG(ERROR) << "Extraction in progress, missing call to " | 359 LOG(ERROR) << "Extraction in progress, missing call to " |
294 << "CancelPendingExtraction"; | 360 << "CancelPendingExtraction"; |
295 } | 361 } |
296 } | 362 } |
297 | 363 |
// NOTE(review): completes an extraction. It reports |success| to the stored
// done callback and then calls Clear() to reset the extractor. The NEW side
// first records two histograms from page_feature_state_ (chunk count and total
// elapsed time); they are recorded on every path that reaches this function,
// whether |success| is true or false.
298 void PhishingDOMFeatureExtractor::RunCallback(bool success) { | 364 void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
| 365 // Record some timing stats that we can use to evaluate feature extraction |
| 366 // performance. These include both successful and failed extractions. |
| 367 DCHECK(page_feature_state_.get()); |
| 368 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", |
| 369 page_feature_state_->num_iterations); |
| 370 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", |
| 371 clock_->Now() - page_feature_state_->start_time); |
| 372 |
299 DCHECK(done_callback_.get()); | 373 DCHECK(done_callback_.get()); |
300 done_callback_->Run(success); | 374 done_callback_->Run(success); |
301 Clear(); | 375 Clear(); |
302 } | 376 } |
303 | 377 |
304 void PhishingDOMFeatureExtractor::Clear() { | 378 void PhishingDOMFeatureExtractor::Clear() { |
305 features_ = NULL; | 379 features_ = NULL; |
306 done_callback_.reset(NULL); | 380 done_callback_.reset(NULL); |
307 cur_frame_data_.reset(NULL); | 381 cur_frame_data_.reset(NULL); |
308 cur_frame_ = NULL; | 382 cur_frame_ = NULL; |
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
407 // Record number of script tags (discretized for numerical stability.) | 481 // Record number of script tags (discretized for numerical stability.) |
408 if (page_feature_state_->num_script_tags > 1) { | 482 if (page_feature_state_->num_script_tags > 1) { |
409 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 483 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
410 if (page_feature_state_->num_script_tags > 6) { | 484 if (page_feature_state_->num_script_tags > 6) { |
411 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 485 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
412 } | 486 } |
413 } | 487 } |
414 } | 488 } |
415 | 489 |
416 } // namespace safe_browsing | 490 } // namespace safe_browsing |
OLD | NEW |