OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
6 | 6 |
7 #include "base/bind.h" | 7 #include "base/bind.h" |
8 #include "base/compiler_specific.h" | 8 #include "base/compiler_specific.h" |
9 #include "base/containers/hash_tables.h" | 9 #include "base/containers/hash_tables.h" |
10 #include "base/logging.h" | 10 #include "base/logging.h" |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
85 start_time(start_time_ticks), | 85 start_time(start_time_ticks), |
86 num_iterations(0) {} | 86 num_iterations(0) {} |
87 | 87 |
88 ~PageFeatureState() {} | 88 ~PageFeatureState() {} |
89 }; | 89 }; |
90 | 90 |
91 // Per-frame state | 91 // Per-frame state |
92 struct PhishingDOMFeatureExtractor::FrameData { | 92 struct PhishingDOMFeatureExtractor::FrameData { |
93 // This is our reference to document.all, which is an iterator over all | 93 // This is our reference to document.all, which is an iterator over all |
94 // of the elements in the document. It keeps track of our current position. | 94 // of the elements in the document. It keeps track of our current position. |
95 WebKit::WebNodeCollection elements; | 95 blink::WebNodeCollection elements; |
96 // The domain of the document URL, stored here so that we don't need to | 96 // The domain of the document URL, stored here so that we don't need to |
97 // recompute it every time it's needed. | 97 // recompute it every time it's needed. |
98 std::string domain; | 98 std::string domain; |
99 }; | 99 }; |
100 | 100 |
101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
102 content::RenderView* render_view, | 102 content::RenderView* render_view, |
103 FeatureExtractorClock* clock) | 103 FeatureExtractorClock* clock) |
104 : render_view_(render_view), | 104 : render_view_(render_view), |
105 clock_(clock), | 105 clock_(clock), |
(...skipping 14 matching lines...) Expand all Loading... |
120 // starting a new extraction, so DCHECK this. | 120 // starting a new extraction, so DCHECK this. |
121 CheckNoPendingExtraction(); | 121 CheckNoPendingExtraction(); |
122 // However, in an opt build, we will go ahead and clean up the pending | 122 // However, in an opt build, we will go ahead and clean up the pending |
123 // extraction so that we can start in a known state. | 123 // extraction so that we can start in a known state. |
124 CancelPendingExtraction(); | 124 CancelPendingExtraction(); |
125 | 125 |
126 features_ = features; | 126 features_ = features; |
127 done_callback_ = done_callback; | 127 done_callback_ = done_callback; |
128 | 128 |
129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); | 129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
130 WebKit::WebView* web_view = render_view_->GetWebView(); | 130 blink::WebView* web_view = render_view_->GetWebView(); |
131 if (web_view && web_view->mainFrame()) { | 131 if (web_view && web_view->mainFrame()) { |
132 cur_document_ = web_view->mainFrame()->document(); | 132 cur_document_ = web_view->mainFrame()->document(); |
133 } | 133 } |
134 | 134 |
135 base::MessageLoop::current()->PostTask( | 135 base::MessageLoop::current()->PostTask( |
136 FROM_HERE, | 136 FROM_HERE, |
137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, | 137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, |
138 weak_factory_.GetWeakPtr())); | 138 weak_factory_.GetWeakPtr())); |
139 } | 139 } |
140 | 140 |
(...skipping 10 matching lines...) Expand all Loading... |
151 | 151 |
152 if (cur_document_.isNull()) { | 152 if (cur_document_.isNull()) { |
153 // This will only happen if we weren't able to get the document for the | 153 // This will only happen if we weren't able to get the document for the |
154 // main frame. We'll treat this as an extraction failure. | 154 // main frame. We'll treat this as an extraction failure. |
155 RunCallback(false); | 155 RunCallback(false); |
156 return; | 156 return; |
157 } | 157 } |
158 | 158 |
159 int num_elements = 0; | 159 int num_elements = 0; |
160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { | 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { |
161 WebKit::WebNode cur_node; | 161 blink::WebNode cur_node; |
162 if (cur_frame_data_.get()) { | 162 if (cur_frame_data_.get()) { |
163 // We're resuming traversal of a frame, so just advance to the next node. | 163 // We're resuming traversal of a frame, so just advance to the next node. |
164 cur_node = cur_frame_data_->elements.nextItem(); | 164 cur_node = cur_frame_data_->elements.nextItem(); |
165 // When we resume the traversal, the first call to nextItem() potentially | 165 // When we resume the traversal, the first call to nextItem() potentially |
166 // has to walk through the document again from the beginning, if it was | 166 // has to walk through the document again from the beginning, if it was |
167 // modified between our chunks of work. Log how long this takes, so we | 167 // modified between our chunks of work. Log how long this takes, so we |
168 // can tell if it's too slow. | 168 // can tell if it's too slow. |
169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | 169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
170 clock_->Now() - current_chunk_start_time); | 170 clock_->Now() - current_chunk_start_time); |
171 } else { | 171 } else { |
172 // We just moved to a new frame, so update our frame state | 172 // We just moved to a new frame, so update our frame state |
173 // and advance to the first element. | 173 // and advance to the first element. |
174 ResetFrameData(); | 174 ResetFrameData(); |
175 cur_node = cur_frame_data_->elements.firstItem(); | 175 cur_node = cur_frame_data_->elements.firstItem(); |
176 } | 176 } |
177 | 177 |
178 for (; !cur_node.isNull(); | 178 for (; !cur_node.isNull(); |
179 cur_node = cur_frame_data_->elements.nextItem()) { | 179 cur_node = cur_frame_data_->elements.nextItem()) { |
180 if (!cur_node.isElementNode()) { | 180 if (!cur_node.isElementNode()) { |
181 continue; | 181 continue; |
182 } | 182 } |
183 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); | 183 blink::WebElement element = cur_node.to<blink::WebElement>(); |
184 if (element.hasTagName("a")) { | 184 if (element.hasTagName("a")) { |
185 HandleLink(element); | 185 HandleLink(element); |
186 } else if (element.hasTagName("form")) { | 186 } else if (element.hasTagName("form")) { |
187 HandleForm(element); | 187 HandleForm(element); |
188 } else if (element.hasTagName("img")) { | 188 } else if (element.hasTagName("img")) { |
189 HandleImage(element); | 189 HandleImage(element); |
190 } else if (element.hasTagName("input")) { | 190 } else if (element.hasTagName("input")) { |
191 HandleInput(element); | 191 HandleInput(element); |
192 } else if (element.hasTagName("script")) { | 192 } else if (element.hasTagName("script")) { |
193 HandleScript(element); | 193 HandleScript(element); |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
229 // We're done with this frame, recalculate the FrameData when we | 229 // We're done with this frame, recalculate the FrameData when we |
230 // advance to the next frame. | 230 // advance to the next frame. |
231 cur_frame_data_.reset(); | 231 cur_frame_data_.reset(); |
232 } | 232 } |
233 | 233 |
234 InsertFeatures(); | 234 InsertFeatures(); |
235 RunCallback(true); | 235 RunCallback(true); |
236 } | 236 } |
237 | 237 |
238 void PhishingDOMFeatureExtractor::HandleLink( | 238 void PhishingDOMFeatureExtractor::HandleLink( |
239 const WebKit::WebElement& element) { | 239 const blink::WebElement& element) { |
240 // Count the number of times we link to a different host. | 240 // Count the number of times we link to a different host. |
241 if (!element.hasAttribute("href")) { | 241 if (!element.hasAttribute("href")) { |
242 DVLOG(1) << "Skipping anchor tag with no href"; | 242 DVLOG(1) << "Skipping anchor tag with no href"; |
243 return; | 243 return; |
244 } | 244 } |
245 | 245 |
246 // Retrieve the link and resolve the link in case it's relative. | 246 // Retrieve the link and resolve the link in case it's relative. |
247 WebKit::WebURL full_url = element.document().completeURL( | 247 blink::WebURL full_url = element.document().completeURL( |
248 element.getAttribute("href")); | 248 element.getAttribute("href")); |
249 | 249 |
250 std::string domain; | 250 std::string domain; |
251 bool is_external = IsExternalDomain(full_url, &domain); | 251 bool is_external = IsExternalDomain(full_url, &domain); |
252 if (domain.empty()) { | 252 if (domain.empty()) { |
253 DVLOG(1) << "Could not extract domain from link: " << full_url; | 253 DVLOG(1) << "Could not extract domain from link: " << full_url; |
254 return; | 254 return; |
255 } | 255 } |
256 | 256 |
257 if (is_external) { | 257 if (is_external) { |
258 ++page_feature_state_->external_links; | 258 ++page_feature_state_->external_links; |
259 | 259 |
260 // Record each unique domain that we link to. | 260 // Record each unique domain that we link to. |
261 page_feature_state_->external_domains.insert(domain); | 261 page_feature_state_->external_domains.insert(domain); |
262 } | 262 } |
263 | 263 |
264 // Check how many are https links. | 264 // Check how many are https links. |
265 if (GURL(full_url).SchemeIs("https")) { | 265 if (GURL(full_url).SchemeIs("https")) { |
266 ++page_feature_state_->secure_links; | 266 ++page_feature_state_->secure_links; |
267 } | 267 } |
268 | 268 |
269 ++page_feature_state_->total_links; | 269 ++page_feature_state_->total_links; |
270 } | 270 } |
271 | 271 |
272 void PhishingDOMFeatureExtractor::HandleForm( | 272 void PhishingDOMFeatureExtractor::HandleForm( |
273 const WebKit::WebElement& element) { | 273 const blink::WebElement& element) { |
274 // Increment the number of forms on this page. | 274 // Increment the number of forms on this page. |
275 ++page_feature_state_->num_forms; | 275 ++page_feature_state_->num_forms; |
276 | 276 |
277 // Record whether the action points to a different domain. | 277 // Record whether the action points to a different domain. |
278 if (!element.hasAttribute("action")) { | 278 if (!element.hasAttribute("action")) { |
279 return; | 279 return; |
280 } | 280 } |
281 | 281 |
282 WebKit::WebURL full_url = element.document().completeURL( | 282 blink::WebURL full_url = element.document().completeURL( |
283 element.getAttribute("action")); | 283 element.getAttribute("action")); |
284 | 284 |
285 std::string domain; | 285 std::string domain; |
286 bool is_external = IsExternalDomain(full_url, &domain); | 286 bool is_external = IsExternalDomain(full_url, &domain); |
287 if (domain.empty()) { | 287 if (domain.empty()) { |
288 DVLOG(1) << "Could not extract domain from form action: " << full_url; | 288 DVLOG(1) << "Could not extract domain from form action: " << full_url; |
289 return; | 289 return; |
290 } | 290 } |
291 | 291 |
292 if (is_external) { | 292 if (is_external) { |
293 ++page_feature_state_->action_other_domain; | 293 ++page_feature_state_->action_other_domain; |
294 } | 294 } |
295 ++page_feature_state_->total_actions; | 295 ++page_feature_state_->total_actions; |
296 } | 296 } |
297 | 297 |
298 void PhishingDOMFeatureExtractor::HandleImage( | 298 void PhishingDOMFeatureExtractor::HandleImage( |
299 const WebKit::WebElement& element) { | 299 const blink::WebElement& element) { |
300 if (!element.hasAttribute("src")) { | 300 if (!element.hasAttribute("src")) { |
301 DVLOG(1) << "Skipping img tag with no src"; | 301 DVLOG(1) << "Skipping img tag with no src"; |
302 } | 302 } |
303 | 303 |
304 // Record whether the image points to a different domain. | 304 // Record whether the image points to a different domain. |
305 WebKit::WebURL full_url = element.document().completeURL( | 305 blink::WebURL full_url = element.document().completeURL( |
306 element.getAttribute("src")); | 306 element.getAttribute("src")); |
307 std::string domain; | 307 std::string domain; |
308 bool is_external = IsExternalDomain(full_url, &domain); | 308 bool is_external = IsExternalDomain(full_url, &domain); |
309 if (domain.empty()) { | 309 if (domain.empty()) { |
310 DVLOG(1) << "Could not extract domain from image src: " << full_url; | 310 DVLOG(1) << "Could not extract domain from image src: " << full_url; |
311 return; | 311 return; |
312 } | 312 } |
313 | 313 |
314 if (is_external) { | 314 if (is_external) { |
315 ++page_feature_state_->img_other_domain; | 315 ++page_feature_state_->img_other_domain; |
316 } | 316 } |
317 ++page_feature_state_->total_imgs; | 317 ++page_feature_state_->total_imgs; |
318 } | 318 } |
319 | 319 |
320 void PhishingDOMFeatureExtractor::HandleInput( | 320 void PhishingDOMFeatureExtractor::HandleInput( |
321 const WebKit::WebElement& element) { | 321 const blink::WebElement& element) { |
322 // The HTML spec says that if the type is unspecified, it defaults to text. | 322 // The HTML spec says that if the type is unspecified, it defaults to text. |
323 // In addition, any unrecognized type will be treated as a text input. | 323 // In addition, any unrecognized type will be treated as a text input. |
324 // | 324 // |
325 // Note that we use the attribute value rather than | 325 // Note that we use the attribute value rather than |
326 // WebFormControlElement::formControlType() for consistency with the | 326 // WebFormControlElement::formControlType() for consistency with the |
327 // way the phishing classification model is created. | 327 // way the phishing classification model is created. |
328 std::string type = element.getAttribute("type").utf8(); | 328 std::string type = element.getAttribute("type").utf8(); |
329 StringToLowerASCII(&type); | 329 StringToLowerASCII(&type); |
330 if (type == "password") { | 330 if (type == "password") { |
331 ++page_feature_state_->num_pswd_inputs; | 331 ++page_feature_state_->num_pswd_inputs; |
332 } else if (type == "radio") { | 332 } else if (type == "radio") { |
333 ++page_feature_state_->num_radio_inputs; | 333 ++page_feature_state_->num_radio_inputs; |
334 } else if (type == "checkbox") { | 334 } else if (type == "checkbox") { |
335 ++page_feature_state_->num_check_inputs; | 335 ++page_feature_state_->num_check_inputs; |
336 } else if (type != "submit" && type != "reset" && type != "file" && | 336 } else if (type != "submit" && type != "reset" && type != "file" && |
337 type != "hidden" && type != "image" && type != "button") { | 337 type != "hidden" && type != "image" && type != "button") { |
338 // Note that there are a number of new input types in HTML5 that are not | 338 // Note that there are a number of new input types in HTML5 that are not |
339 // handled above. For now, we will consider these as text inputs since | 339 // handled above. For now, we will consider these as text inputs since |
340 // they could be used to capture user input. | 340 // they could be used to capture user input. |
341 ++page_feature_state_->num_text_inputs; | 341 ++page_feature_state_->num_text_inputs; |
342 } | 342 } |
343 } | 343 } |
344 | 344 |
345 void PhishingDOMFeatureExtractor::HandleScript( | 345 void PhishingDOMFeatureExtractor::HandleScript( |
346 const WebKit::WebElement& element) { | 346 const blink::WebElement& element) { |
347 ++page_feature_state_->num_script_tags; | 347 ++page_feature_state_->num_script_tags; |
348 } | 348 } |
349 | 349 |
350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { | 350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
351 DCHECK(done_callback_.is_null()); | 351 DCHECK(done_callback_.is_null()); |
352 DCHECK(!cur_frame_data_.get()); | 352 DCHECK(!cur_frame_data_.get()); |
353 DCHECK(cur_document_.isNull()); | 353 DCHECK(cur_document_.isNull()); |
354 if (!done_callback_.is_null() || cur_frame_data_.get() || | 354 if (!done_callback_.is_null() || cur_frame_data_.get() || |
355 !cur_document_.isNull()) { | 355 !cur_document_.isNull()) { |
356 LOG(ERROR) << "Extraction in progress, missing call to " | 356 LOG(ERROR) << "Extraction in progress, missing call to " |
(...skipping 27 matching lines...) Expand all Loading... |
384 DCHECK(!cur_frame_data_.get()); | 384 DCHECK(!cur_frame_data_.get()); |
385 | 385 |
386 cur_frame_data_.reset(new FrameData()); | 386 cur_frame_data_.reset(new FrameData()); |
387 cur_frame_data_->elements = cur_document_.all(); | 387 cur_frame_data_->elements = cur_document_.all(); |
388 cur_frame_data_->domain = | 388 cur_frame_data_->domain = |
389 net::registry_controlled_domains::GetDomainAndRegistry( | 389 net::registry_controlled_domains::GetDomainAndRegistry( |
390 cur_document_.url(), | 390 cur_document_.url(), |
391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); | 391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); |
392 } | 392 } |
393 | 393 |
394 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { | 394 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { |
395 DCHECK(!cur_document_.isNull()); | 395 DCHECK(!cur_document_.isNull()); |
396 WebKit::WebFrame* frame = cur_document_.frame(); | 396 blink::WebFrame* frame = cur_document_.frame(); |
397 // Advance to the next frame that contains a document, with no wrapping. | 397 // Advance to the next frame that contains a document, with no wrapping. |
398 if (frame) { | 398 if (frame) { |
399 while ((frame = frame->traverseNext(false))) { | 399 while ((frame = frame->traverseNext(false))) { |
400 if (!frame->document().isNull()) { | 400 if (!frame->document().isNull()) { |
401 return frame->document(); | 401 return frame->document(); |
402 } | 402 } |
403 } | 403 } |
404 } else { | 404 } else { |
405 // Keep track of how often frame traversal got "stuck" due to the | 405 // Keep track of how often frame traversal got "stuck" due to the |
406 // current subdocument getting removed from the frame tree. | 406 // current subdocument getting removed from the frame tree. |
407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); | 407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); |
408 } | 408 } |
409 return WebKit::WebDocument(); | 409 return blink::WebDocument(); |
410 } | 410 } |
411 | 411 |
412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, | 412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, |
413 std::string* domain) const { | 413 std::string* domain) const { |
414 DCHECK(domain); | 414 DCHECK(domain); |
415 DCHECK(cur_frame_data_.get()); | 415 DCHECK(cur_frame_data_.get()); |
416 | 416 |
417 if (cur_frame_data_->domain.empty()) { | 417 if (cur_frame_data_->domain.empty()) { |
418 return false; | 418 return false; |
419 } | 419 } |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
491 // Record number of script tags (discretized for numerical stability.) | 491 // Record number of script tags (discretized for numerical stability.) |
492 if (page_feature_state_->num_script_tags > 1) { | 492 if (page_feature_state_->num_script_tags > 1) { |
493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
494 if (page_feature_state_->num_script_tags > 6) { | 494 if (page_feature_state_->num_script_tags > 6) { |
495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
496 } | 496 } |
497 } | 497 } |
498 } | 498 } |
499 | 499 |
500 } // namespace safe_browsing | 500 } // namespace safe_browsing |
OLD | NEW |