Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(278)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 3130039: Limit the time spent on a single iteration of PhishingDOMFeatureExtractor. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: tiny comment fix Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6 6
7 #include "base/compiler_specific.h" 7 #include "base/compiler_specific.h"
8 #include "base/hash_tables.h" 8 #include "base/hash_tables.h"
9 #include "base/histogram.h" 9 #include "base/histogram.h"
10 #include "base/logging.h" 10 #include "base/logging.h"
11 #include "base/message_loop.h"
12 #include "base/time.h"
11 #include "chrome/renderer/render_view.h" 13 #include "chrome/renderer/render_view.h"
14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
12 #include "chrome/renderer/safe_browsing/features.h" 15 #include "chrome/renderer/safe_browsing/features.h"
13 #include "net/base/registry_controlled_domain.h" 16 #include "net/base/registry_controlled_domain.h"
14 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" 17 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
15 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h" 18 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h"
16 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" 19 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
17 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" 20 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h"
18 #include "third_party/WebKit/WebKit/chromium/public/WebString.h" 21 #include "third_party/WebKit/WebKit/chromium/public/WebString.h"
19 #include "third_party/WebKit/WebKit/chromium/public/WebView.h" 22 #include "third_party/WebKit/WebKit/chromium/public/WebView.h"
20 23
21 namespace safe_browsing { 24 namespace safe_browsing {
22 25
26 // This time should be short enough that it doesn't noticeably disrupt the
27 // user's interaction with the page.
28 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50;
29
30 // Experimenting shows that we get a reasonable gain in performance by
31 // increasing this up to around 10, but there's not much benefit in
32 // increasing it past that.
33 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
34
35 // This should be longer than we expect feature extraction to take on any
36 // actual phishing page.
37 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
38
23 // Intermediate state used for computing features. See features.h for 39 // Intermediate state used for computing features. See features.h for
24 // descriptions of the DOM features that are computed. 40 // descriptions of the DOM features that are computed.
25 struct PhishingDOMFeatureExtractor::PageFeatureState { 41 struct PhishingDOMFeatureExtractor::PageFeatureState {
26 // Link related features 42 // Link related features
27 int external_links; 43 int external_links;
28 base::hash_set<std::string> external_domains; 44 base::hash_set<std::string> external_domains;
29 int secure_links; 45 int secure_links;
30 int total_links; 46 int total_links;
31 47
32 // Form related features 48 // Form related features
33 int num_forms; 49 int num_forms;
34 int num_text_inputs; 50 int num_text_inputs;
35 int num_pswd_inputs; 51 int num_pswd_inputs;
36 int num_radio_inputs; 52 int num_radio_inputs;
37 int num_check_inputs; 53 int num_check_inputs;
38 int action_other_domain; 54 int action_other_domain;
39 int total_actions; 55 int total_actions;
40 56
41 // Image related features 57 // Image related features
42 int img_other_domain; 58 int img_other_domain;
43 int total_imgs; 59 int total_imgs;
44 60
45 // How many script tags 61 // How many script tags
46 int num_script_tags; 62 int num_script_tags;
47 63
48 PageFeatureState() 64 // The time at which we started feature extraction for the current page.
65 base::TimeTicks start_time;
66
67 // The number of iterations we've done for the current extraction.
68 int num_iterations;
69
70 explicit PageFeatureState(base::TimeTicks start_time_ticks)
49 : external_links(0), 71 : external_links(0),
50 secure_links(0), 72 secure_links(0),
51 total_links(0), 73 total_links(0),
52 num_forms(0), 74 num_forms(0),
53 num_text_inputs(0), 75 num_text_inputs(0),
54 num_pswd_inputs(0), 76 num_pswd_inputs(0),
55 num_radio_inputs(0), 77 num_radio_inputs(0),
56 num_check_inputs(0), 78 num_check_inputs(0),
57 action_other_domain(0), 79 action_other_domain(0),
58 total_actions(0), 80 total_actions(0),
59 img_other_domain(0), 81 img_other_domain(0),
60 total_imgs(0), 82 total_imgs(0),
61 num_script_tags(0) {} 83 num_script_tags(0),
84 start_time(start_time_ticks),
85 num_iterations(0) {}
62 86
63 ~PageFeatureState() {} 87 ~PageFeatureState() {}
64 }; 88 };
65 89
66 // Per-frame state 90 // Per-frame state
67 struct PhishingDOMFeatureExtractor::FrameData { 91 struct PhishingDOMFeatureExtractor::FrameData {
68 // This is our reference to document.all, which is an iterator over all 92 // This is our reference to document.all, which is an iterator over all
69 // of the elements in the document. It keeps track of our current position. 93 // of the elements in the document. It keeps track of our current position.
70 WebKit::WebNodeCollection elements; 94 WebKit::WebNodeCollection elements;
71 // The domain of the document URL, stored here so that we don't need to 95 // The domain of the document URL, stored here so that we don't need to
72 // recompute it every time it's needed. 96 // recompute it every time it's needed.
73 std::string domain; 97 std::string domain;
74 }; 98 };
75 99
76 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( 100 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
77 RenderView* render_view) 101 RenderView* render_view,
102 FeatureExtractorClock* clock)
78 : render_view_(render_view), 103 : render_view_(render_view),
104 clock_(clock),
79 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { 105 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
80 Clear(); 106 Clear();
81 } 107 }
82 108
83 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { 109 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
84 // The RenderView should have called CancelPendingExtraction() before 110 // The RenderView should have called CancelPendingExtraction() before
85 // we are destroyed. 111 // we are destroyed.
86 CheckNoPendingExtraction(); 112 CheckNoPendingExtraction();
87 } 113 }
88 114
89 void PhishingDOMFeatureExtractor::ExtractFeatures( 115 void PhishingDOMFeatureExtractor::ExtractFeatures(
90 FeatureMap* features, 116 FeatureMap* features,
91 DoneCallback* done_callback) { 117 DoneCallback* done_callback) {
92 // The RenderView should have called CancelPendingExtraction() before 118 // The RenderView should have called CancelPendingExtraction() before
93 // starting a new extraction, so DCHECK this. 119 // starting a new extraction, so DCHECK this.
94 CheckNoPendingExtraction(); 120 CheckNoPendingExtraction();
95 // However, in an opt build, we will go ahead and clean up the pending 121 // However, in an opt build, we will go ahead and clean up the pending
96 // extraction so that we can start in a known state. 122 // extraction so that we can start in a known state.
97 CancelPendingExtraction(); 123 CancelPendingExtraction();
98 124
99 features_ = features; 125 features_ = features;
100 done_callback_.reset(done_callback); 126 done_callback_.reset(done_callback);
127
128 page_feature_state_.reset(new PageFeatureState(clock_->Now()));
101 MessageLoop::current()->PostTask( 129 MessageLoop::current()->PostTask(
102 FROM_HERE, 130 FROM_HERE,
103 method_factory_.NewRunnableMethod( 131 method_factory_.NewRunnableMethod(
104 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); 132 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
105 } 133 }
106 134
107 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { 135 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
108 // Cancel any pending callbacks, and clear our state. 136 // Cancel any pending callbacks, and clear our state.
109 method_factory_.RevokeAll(); 137 method_factory_.RevokeAll();
110 Clear(); 138 Clear();
111 } 139 }
112 140
113 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { 141 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
142 DCHECK(page_feature_state_.get());
143 ++page_feature_state_->num_iterations;
144 base::TimeTicks current_chunk_start_time = clock_->Now();
145
114 if (!cur_frame_) { 146 if (!cur_frame_) {
115 WebKit::WebView* web_view = render_view_->webview(); 147 WebKit::WebView* web_view = render_view_->webview();
116 if (!web_view) { 148 if (!web_view) {
117 // When the WebView is going away, the render view should have called 149 // When the WebView is going away, the render view should have called
118 // CancelPendingExtraction() which should have stopped any pending work, 150 // CancelPendingExtraction() which should have stopped any pending work,
119 // so this case should not happen. 151 // so this case should not happen.
120 NOTREACHED(); 152 NOTREACHED();
121 RunCallback(false); 153 RunCallback(false);
122 return; 154 return;
123 } 155 }
124 cur_frame_ = web_view->mainFrame(); 156 cur_frame_ = web_view->mainFrame();
125 page_feature_state_.reset(new PageFeatureState);
126 } 157 }
127 158
159 int num_elements = 0;
128 for (; cur_frame_; 160 for (; cur_frame_;
129 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { 161 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
130 WebKit::WebNode cur_node; 162 WebKit::WebNode cur_node;
131 if (cur_frame_data_.get()) { 163 if (cur_frame_data_.get()) {
132 // We're resuming traversal of a frame, so just advance to the next node. 164 // We're resuming traversal of a frame, so just advance to the next node.
133 cur_node = cur_frame_data_->elements.nextItem(); 165 cur_node = cur_frame_data_->elements.nextItem();
166 // When we resume the traversal, the first call to nextItem() potentially
167 // has to walk through the document again from the beginning, if it was
168 // modified between our chunks of work. Log how long this takes, so we
169 // can tell if it's too slow.
170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
171 clock_->Now() - current_chunk_start_time);
134 } else { 172 } else {
135 // We just moved to a new frame, so update our frame state 173 // We just moved to a new frame, so update our frame state
136 // and advance to the first element. 174 // and advance to the first element.
137 if (!ResetFrameData()) { 175 if (!ResetFrameData()) {
138 // Nothing in this frame, move on to the next one. 176 // Nothing in this frame, move on to the next one.
139 LOG(WARNING) << "No content in frame, skipping"; 177 LOG(WARNING) << "No content in frame, skipping";
140 continue; 178 continue;
141 } 179 }
142 cur_node = cur_frame_data_->elements.firstItem(); 180 cur_node = cur_frame_data_->elements.firstItem();
143 } 181 }
144 182
145 for (; !cur_node.isNull(); 183 for (; !cur_node.isNull();
146 cur_node = cur_frame_data_->elements.nextItem()) { 184 cur_node = cur_frame_data_->elements.nextItem()) {
147 if (!cur_node.isElementNode()) { 185 if (!cur_node.isElementNode()) {
148 continue; 186 continue;
149 } 187 }
150 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); 188 WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
151 if (element.hasTagName("a")) { 189 if (element.hasTagName("a")) {
152 HandleLink(element); 190 HandleLink(element);
153 } else if (element.hasTagName("form")) { 191 } else if (element.hasTagName("form")) {
154 HandleForm(element); 192 HandleForm(element);
155 } else if (element.hasTagName("img")) { 193 } else if (element.hasTagName("img")) {
156 HandleImage(element); 194 HandleImage(element);
157 } else if (element.hasTagName("input")) { 195 } else if (element.hasTagName("input")) {
158 HandleInput(element); 196 HandleInput(element);
159 } else if (element.hasTagName("script")) { 197 } else if (element.hasTagName("script")) {
160 HandleScript(element); 198 HandleScript(element);
161 } 199 }
162 200
163 // TODO(bryner): stop if too much time has elapsed, and add histograms 201 if (++num_elements >= kClockCheckGranularity) {
164 // for the time spent processing. 202 num_elements = 0;
203 base::TimeTicks now = clock_->Now();
204 if (now - page_feature_state_->start_time >=
205 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
206 DLOG(ERROR) << "Feature extraction took too long, giving up";
207 // We expect this to happen infrequently, so record when it does.
208 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
209 RunCallback(false);
210 return;
211 }
212 base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
213 if (chunk_elapsed >=
214 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
215 // The time limit for the current chunk is up, so post a task to
216 // continue extraction.
217 //
218 // Record how much time we actually spent on the chunk. If this is
219 // much higher than kMaxTimePerChunkMs, we may need to adjust the
220 // clock granularity.
221 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
222 chunk_elapsed);
223 MessageLoop::current()->PostTask(
224 FROM_HERE,
225 method_factory_.NewRunnableMethod(
226 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
227 return;
228 }
229 // Otherwise, continue.
230 }
165 } 231 }
166 232
167 // We're done with this frame, recalculate the FrameData when we 233 // We're done with this frame, recalculate the FrameData when we
168 // advance to the next frame. 234 // advance to the next frame.
169 cur_frame_data_.reset(); 235 cur_frame_data_.reset();
170 } 236 }
171 237
172 InsertFeatures(); 238 InsertFeatures();
173 RunCallback(true); 239 RunCallback(true);
174 } 240 }
(...skipping 114 matching lines...) Expand 10 before | Expand all | Expand 10 after
289 DCHECK(!done_callback_.get()); 355 DCHECK(!done_callback_.get());
290 DCHECK(!cur_frame_data_.get()); 356 DCHECK(!cur_frame_data_.get());
291 DCHECK(!cur_frame_); 357 DCHECK(!cur_frame_);
292 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { 358 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) {
293 LOG(ERROR) << "Extraction in progress, missing call to " 359 LOG(ERROR) << "Extraction in progress, missing call to "
294 << "CancelPendingExtraction"; 360 << "CancelPendingExtraction";
295 } 361 }
296 } 362 }
297 363
298 void PhishingDOMFeatureExtractor::RunCallback(bool success) { 364 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
365 // Record some timing stats that we can use to evaluate feature extraction
366 // performance. These include both successful and failed extractions.
367 DCHECK(page_feature_state_.get());
368 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
369 page_feature_state_->num_iterations);
370 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
371 clock_->Now() - page_feature_state_->start_time);
372
299 DCHECK(done_callback_.get()); 373 DCHECK(done_callback_.get());
300 done_callback_->Run(success); 374 done_callback_->Run(success);
301 Clear(); 375 Clear();
302 } 376 }
303 377
304 void PhishingDOMFeatureExtractor::Clear() { 378 void PhishingDOMFeatureExtractor::Clear() {
305 features_ = NULL; 379 features_ = NULL;
306 done_callback_.reset(NULL); 380 done_callback_.reset(NULL);
307 cur_frame_data_.reset(NULL); 381 cur_frame_data_.reset(NULL);
308 cur_frame_ = NULL; 382 cur_frame_ = NULL;
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after
407 // Record number of script tags (discretized for numerical stability.) 481 // Record number of script tags (discretized for numerical stability.)
408 if (page_feature_state_->num_script_tags > 1) { 482 if (page_feature_state_->num_script_tags > 1) {
409 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 483 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
410 if (page_feature_state_->num_script_tags > 6) { 484 if (page_feature_state_->num_script_tags > 6) {
411 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 485 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
412 } 486 }
413 } 487 }
414 } 488 }
415 489
416 } // namespace safe_browsing 490 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698