| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
| 6 | 6 |
| 7 #include "base/bind.h" | 7 #include "base/bind.h" |
| 8 #include "base/compiler_specific.h" | 8 #include "base/compiler_specific.h" |
| 9 #include "base/containers/hash_tables.h" | 9 #include "base/containers/hash_tables.h" |
| 10 #include "base/logging.h" | 10 #include "base/logging.h" |
| (...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 85 start_time(start_time_ticks), | 85 start_time(start_time_ticks), |
| 86 num_iterations(0) {} | 86 num_iterations(0) {} |
| 87 | 87 |
| 88 ~PageFeatureState() {} | 88 ~PageFeatureState() {} |
| 89 }; | 89 }; |
| 90 | 90 |
// Per-frame state.
//
// Holds what the extractor needs to remember about the document that is
// currently being traversed.  An instance is created when traversal moves to
// a new document and is discarded once that document has been fully
// processed, so a non-null FrameData means "resume this document".
struct PhishingDOMFeatureExtractor::FrameData {
  // This is our reference to document.all, which is an iterator over all
  // of the elements in the document.  It keeps track of our current position,
  // which lets extraction continue with nextItem() after being interrupted.
  blink::WebNodeCollection elements;
  // The domain of the document URL, stored here so that we don't need to
  // recompute it every time it's needed.  May be empty if no domain could be
  // derived from the document URL.
  std::string domain;
};
| 100 | 100 |
| 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
| 102 content::RenderView* render_view, | 102 content::RenderView* render_view, |
| 103 FeatureExtractorClock* clock) | 103 FeatureExtractorClock* clock) |
| 104 : render_view_(render_view), | 104 : render_view_(render_view), |
| 105 clock_(clock), | 105 clock_(clock), |
| (...skipping 14 matching lines...) Expand all Loading... |
| 120 // starting a new extraction, so DCHECK this. | 120 // starting a new extraction, so DCHECK this. |
| 121 CheckNoPendingExtraction(); | 121 CheckNoPendingExtraction(); |
| 122 // However, in an opt build, we will go ahead and clean up the pending | 122 // However, in an opt build, we will go ahead and clean up the pending |
| 123 // extraction so that we can start in a known state. | 123 // extraction so that we can start in a known state. |
| 124 CancelPendingExtraction(); | 124 CancelPendingExtraction(); |
| 125 | 125 |
| 126 features_ = features; | 126 features_ = features; |
| 127 done_callback_ = done_callback; | 127 done_callback_ = done_callback; |
| 128 | 128 |
| 129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); | 129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
| 130 WebKit::WebView* web_view = render_view_->GetWebView(); | 130 blink::WebView* web_view = render_view_->GetWebView(); |
| 131 if (web_view && web_view->mainFrame()) { | 131 if (web_view && web_view->mainFrame()) { |
| 132 cur_document_ = web_view->mainFrame()->document(); | 132 cur_document_ = web_view->mainFrame()->document(); |
| 133 } | 133 } |
| 134 | 134 |
| 135 base::MessageLoop::current()->PostTask( | 135 base::MessageLoop::current()->PostTask( |
| 136 FROM_HERE, | 136 FROM_HERE, |
| 137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, | 137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, |
| 138 weak_factory_.GetWeakPtr())); | 138 weak_factory_.GetWeakPtr())); |
| 139 } | 139 } |
| 140 | 140 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 151 | 151 |
| 152 if (cur_document_.isNull()) { | 152 if (cur_document_.isNull()) { |
| 153 // This will only happen if we weren't able to get the document for the | 153 // This will only happen if we weren't able to get the document for the |
| 154 // main frame. We'll treat this as an extraction failure. | 154 // main frame. We'll treat this as an extraction failure. |
| 155 RunCallback(false); | 155 RunCallback(false); |
| 156 return; | 156 return; |
| 157 } | 157 } |
| 158 | 158 |
| 159 int num_elements = 0; | 159 int num_elements = 0; |
| 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { | 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { |
| 161 WebKit::WebNode cur_node; | 161 blink::WebNode cur_node; |
| 162 if (cur_frame_data_.get()) { | 162 if (cur_frame_data_.get()) { |
| 163 // We're resuming traversal of a frame, so just advance to the next node. | 163 // We're resuming traversal of a frame, so just advance to the next node. |
| 164 cur_node = cur_frame_data_->elements.nextItem(); | 164 cur_node = cur_frame_data_->elements.nextItem(); |
| 165 // When we resume the traversal, the first call to nextItem() potentially | 165 // When we resume the traversal, the first call to nextItem() potentially |
| 166 // has to walk through the document again from the beginning, if it was | 166 // has to walk through the document again from the beginning, if it was |
| 167 // modified between our chunks of work. Log how long this takes, so we | 167 // modified between our chunks of work. Log how long this takes, so we |
| 168 // can tell if it's too slow. | 168 // can tell if it's too slow. |
| 169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | 169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
| 170 clock_->Now() - current_chunk_start_time); | 170 clock_->Now() - current_chunk_start_time); |
| 171 } else { | 171 } else { |
| 172 // We just moved to a new frame, so update our frame state | 172 // We just moved to a new frame, so update our frame state |
| 173 // and advance to the first element. | 173 // and advance to the first element. |
| 174 ResetFrameData(); | 174 ResetFrameData(); |
| 175 cur_node = cur_frame_data_->elements.firstItem(); | 175 cur_node = cur_frame_data_->elements.firstItem(); |
| 176 } | 176 } |
| 177 | 177 |
| 178 for (; !cur_node.isNull(); | 178 for (; !cur_node.isNull(); |
| 179 cur_node = cur_frame_data_->elements.nextItem()) { | 179 cur_node = cur_frame_data_->elements.nextItem()) { |
| 180 if (!cur_node.isElementNode()) { | 180 if (!cur_node.isElementNode()) { |
| 181 continue; | 181 continue; |
| 182 } | 182 } |
| 183 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); | 183 blink::WebElement element = cur_node.to<blink::WebElement>(); |
| 184 if (element.hasTagName("a")) { | 184 if (element.hasTagName("a")) { |
| 185 HandleLink(element); | 185 HandleLink(element); |
| 186 } else if (element.hasTagName("form")) { | 186 } else if (element.hasTagName("form")) { |
| 187 HandleForm(element); | 187 HandleForm(element); |
| 188 } else if (element.hasTagName("img")) { | 188 } else if (element.hasTagName("img")) { |
| 189 HandleImage(element); | 189 HandleImage(element); |
| 190 } else if (element.hasTagName("input")) { | 190 } else if (element.hasTagName("input")) { |
| 191 HandleInput(element); | 191 HandleInput(element); |
| 192 } else if (element.hasTagName("script")) { | 192 } else if (element.hasTagName("script")) { |
| 193 HandleScript(element); | 193 HandleScript(element); |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 229 // We're done with this frame, recalculate the FrameData when we | 229 // We're done with this frame, recalculate the FrameData when we |
| 230 // advance to the next frame. | 230 // advance to the next frame. |
| 231 cur_frame_data_.reset(); | 231 cur_frame_data_.reset(); |
| 232 } | 232 } |
| 233 | 233 |
| 234 InsertFeatures(); | 234 InsertFeatures(); |
| 235 RunCallback(true); | 235 RunCallback(true); |
| 236 } | 236 } |
| 237 | 237 |
| 238 void PhishingDOMFeatureExtractor::HandleLink( | 238 void PhishingDOMFeatureExtractor::HandleLink( |
| 239 const WebKit::WebElement& element) { | 239 const blink::WebElement& element) { |
| 240 // Count the number of times we link to a different host. | 240 // Count the number of times we link to a different host. |
| 241 if (!element.hasAttribute("href")) { | 241 if (!element.hasAttribute("href")) { |
| 242 DVLOG(1) << "Skipping anchor tag with no href"; | 242 DVLOG(1) << "Skipping anchor tag with no href"; |
| 243 return; | 243 return; |
| 244 } | 244 } |
| 245 | 245 |
| 246 // Retrieve the link and resolve the link in case it's relative. | 246 // Retrieve the link and resolve the link in case it's relative. |
| 247 WebKit::WebURL full_url = element.document().completeURL( | 247 blink::WebURL full_url = element.document().completeURL( |
| 248 element.getAttribute("href")); | 248 element.getAttribute("href")); |
| 249 | 249 |
| 250 std::string domain; | 250 std::string domain; |
| 251 bool is_external = IsExternalDomain(full_url, &domain); | 251 bool is_external = IsExternalDomain(full_url, &domain); |
| 252 if (domain.empty()) { | 252 if (domain.empty()) { |
| 253 DVLOG(1) << "Could not extract domain from link: " << full_url; | 253 DVLOG(1) << "Could not extract domain from link: " << full_url; |
| 254 return; | 254 return; |
| 255 } | 255 } |
| 256 | 256 |
| 257 if (is_external) { | 257 if (is_external) { |
| 258 ++page_feature_state_->external_links; | 258 ++page_feature_state_->external_links; |
| 259 | 259 |
| 260 // Record each unique domain that we link to. | 260 // Record each unique domain that we link to. |
| 261 page_feature_state_->external_domains.insert(domain); | 261 page_feature_state_->external_domains.insert(domain); |
| 262 } | 262 } |
| 263 | 263 |
| 264 // Check how many are https links. | 264 // Check how many are https links. |
| 265 if (GURL(full_url).SchemeIs("https")) { | 265 if (GURL(full_url).SchemeIs("https")) { |
| 266 ++page_feature_state_->secure_links; | 266 ++page_feature_state_->secure_links; |
| 267 } | 267 } |
| 268 | 268 |
| 269 ++page_feature_state_->total_links; | 269 ++page_feature_state_->total_links; |
| 270 } | 270 } |
| 271 | 271 |
| 272 void PhishingDOMFeatureExtractor::HandleForm( | 272 void PhishingDOMFeatureExtractor::HandleForm( |
| 273 const WebKit::WebElement& element) { | 273 const blink::WebElement& element) { |
| 274 // Increment the number of forms on this page. | 274 // Increment the number of forms on this page. |
| 275 ++page_feature_state_->num_forms; | 275 ++page_feature_state_->num_forms; |
| 276 | 276 |
| 277 // Record whether the action points to a different domain. | 277 // Record whether the action points to a different domain. |
| 278 if (!element.hasAttribute("action")) { | 278 if (!element.hasAttribute("action")) { |
| 279 return; | 279 return; |
| 280 } | 280 } |
| 281 | 281 |
| 282 WebKit::WebURL full_url = element.document().completeURL( | 282 blink::WebURL full_url = element.document().completeURL( |
| 283 element.getAttribute("action")); | 283 element.getAttribute("action")); |
| 284 | 284 |
| 285 std::string domain; | 285 std::string domain; |
| 286 bool is_external = IsExternalDomain(full_url, &domain); | 286 bool is_external = IsExternalDomain(full_url, &domain); |
| 287 if (domain.empty()) { | 287 if (domain.empty()) { |
| 288 DVLOG(1) << "Could not extract domain from form action: " << full_url; | 288 DVLOG(1) << "Could not extract domain from form action: " << full_url; |
| 289 return; | 289 return; |
| 290 } | 290 } |
| 291 | 291 |
| 292 if (is_external) { | 292 if (is_external) { |
| 293 ++page_feature_state_->action_other_domain; | 293 ++page_feature_state_->action_other_domain; |
| 294 } | 294 } |
| 295 ++page_feature_state_->total_actions; | 295 ++page_feature_state_->total_actions; |
| 296 } | 296 } |
| 297 | 297 |
| 298 void PhishingDOMFeatureExtractor::HandleImage( | 298 void PhishingDOMFeatureExtractor::HandleImage( |
| 299 const WebKit::WebElement& element) { | 299 const blink::WebElement& element) { |
| 300 if (!element.hasAttribute("src")) { | 300 if (!element.hasAttribute("src")) { |
| 301 DVLOG(1) << "Skipping img tag with no src"; | 301 DVLOG(1) << "Skipping img tag with no src"; |
| 302 } | 302 } |
| 303 | 303 |
| 304 // Record whether the image points to a different domain. | 304 // Record whether the image points to a different domain. |
| 305 WebKit::WebURL full_url = element.document().completeURL( | 305 blink::WebURL full_url = element.document().completeURL( |
| 306 element.getAttribute("src")); | 306 element.getAttribute("src")); |
| 307 std::string domain; | 307 std::string domain; |
| 308 bool is_external = IsExternalDomain(full_url, &domain); | 308 bool is_external = IsExternalDomain(full_url, &domain); |
| 309 if (domain.empty()) { | 309 if (domain.empty()) { |
| 310 DVLOG(1) << "Could not extract domain from image src: " << full_url; | 310 DVLOG(1) << "Could not extract domain from image src: " << full_url; |
| 311 return; | 311 return; |
| 312 } | 312 } |
| 313 | 313 |
| 314 if (is_external) { | 314 if (is_external) { |
| 315 ++page_feature_state_->img_other_domain; | 315 ++page_feature_state_->img_other_domain; |
| 316 } | 316 } |
| 317 ++page_feature_state_->total_imgs; | 317 ++page_feature_state_->total_imgs; |
| 318 } | 318 } |
| 319 | 319 |
| 320 void PhishingDOMFeatureExtractor::HandleInput( | 320 void PhishingDOMFeatureExtractor::HandleInput( |
| 321 const WebKit::WebElement& element) { | 321 const blink::WebElement& element) { |
| 322 // The HTML spec says that if the type is unspecified, it defaults to text. | 322 // The HTML spec says that if the type is unspecified, it defaults to text. |
| 323 // In addition, any unrecognized type will be treated as a text input. | 323 // In addition, any unrecognized type will be treated as a text input. |
| 324 // | 324 // |
| 325 // Note that we use the attribute value rather than | 325 // Note that we use the attribute value rather than |
| 326 // WebFormControlElement::formControlType() for consistency with the | 326 // WebFormControlElement::formControlType() for consistency with the |
| 327 // way the phishing classification model is created. | 327 // way the phishing classification model is created. |
| 328 std::string type = element.getAttribute("type").utf8(); | 328 std::string type = element.getAttribute("type").utf8(); |
| 329 StringToLowerASCII(&type); | 329 StringToLowerASCII(&type); |
| 330 if (type == "password") { | 330 if (type == "password") { |
| 331 ++page_feature_state_->num_pswd_inputs; | 331 ++page_feature_state_->num_pswd_inputs; |
| 332 } else if (type == "radio") { | 332 } else if (type == "radio") { |
| 333 ++page_feature_state_->num_radio_inputs; | 333 ++page_feature_state_->num_radio_inputs; |
| 334 } else if (type == "checkbox") { | 334 } else if (type == "checkbox") { |
| 335 ++page_feature_state_->num_check_inputs; | 335 ++page_feature_state_->num_check_inputs; |
| 336 } else if (type != "submit" && type != "reset" && type != "file" && | 336 } else if (type != "submit" && type != "reset" && type != "file" && |
| 337 type != "hidden" && type != "image" && type != "button") { | 337 type != "hidden" && type != "image" && type != "button") { |
| 338 // Note that there are a number of new input types in HTML5 that are not | 338 // Note that there are a number of new input types in HTML5 that are not |
| 339 // handled above. For now, we will consider these as text inputs since | 339 // handled above. For now, we will consider these as text inputs since |
| 340 // they could be used to capture user input. | 340 // they could be used to capture user input. |
| 341 ++page_feature_state_->num_text_inputs; | 341 ++page_feature_state_->num_text_inputs; |
| 342 } | 342 } |
| 343 } | 343 } |
| 344 | 344 |
| 345 void PhishingDOMFeatureExtractor::HandleScript( | 345 void PhishingDOMFeatureExtractor::HandleScript( |
| 346 const WebKit::WebElement& element) { | 346 const blink::WebElement& element) { |
| 347 ++page_feature_state_->num_script_tags; | 347 ++page_feature_state_->num_script_tags; |
| 348 } | 348 } |
| 349 | 349 |
| 350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { | 350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
| 351 DCHECK(done_callback_.is_null()); | 351 DCHECK(done_callback_.is_null()); |
| 352 DCHECK(!cur_frame_data_.get()); | 352 DCHECK(!cur_frame_data_.get()); |
| 353 DCHECK(cur_document_.isNull()); | 353 DCHECK(cur_document_.isNull()); |
| 354 if (!done_callback_.is_null() || cur_frame_data_.get() || | 354 if (!done_callback_.is_null() || cur_frame_data_.get() || |
| 355 !cur_document_.isNull()) { | 355 !cur_document_.isNull()) { |
| 356 LOG(ERROR) << "Extraction in progress, missing call to " | 356 LOG(ERROR) << "Extraction in progress, missing call to " |
| (...skipping 27 matching lines...) Expand all Loading... |
| 384 DCHECK(!cur_frame_data_.get()); | 384 DCHECK(!cur_frame_data_.get()); |
| 385 | 385 |
| 386 cur_frame_data_.reset(new FrameData()); | 386 cur_frame_data_.reset(new FrameData()); |
| 387 cur_frame_data_->elements = cur_document_.all(); | 387 cur_frame_data_->elements = cur_document_.all(); |
| 388 cur_frame_data_->domain = | 388 cur_frame_data_->domain = |
| 389 net::registry_controlled_domains::GetDomainAndRegistry( | 389 net::registry_controlled_domains::GetDomainAndRegistry( |
| 390 cur_document_.url(), | 390 cur_document_.url(), |
| 391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); | 391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); |
| 392 } | 392 } |
| 393 | 393 |
| 394 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { | 394 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { |
| 395 DCHECK(!cur_document_.isNull()); | 395 DCHECK(!cur_document_.isNull()); |
| 396 WebKit::WebFrame* frame = cur_document_.frame(); | 396 blink::WebFrame* frame = cur_document_.frame(); |
| 397 // Advance to the next frame that contains a document, with no wrapping. | 397 // Advance to the next frame that contains a document, with no wrapping. |
| 398 if (frame) { | 398 if (frame) { |
| 399 while ((frame = frame->traverseNext(false))) { | 399 while ((frame = frame->traverseNext(false))) { |
| 400 if (!frame->document().isNull()) { | 400 if (!frame->document().isNull()) { |
| 401 return frame->document(); | 401 return frame->document(); |
| 402 } | 402 } |
| 403 } | 403 } |
| 404 } else { | 404 } else { |
| 405 // Keep track of how often frame traversal got "stuck" due to the | 405 // Keep track of how often frame traversal got "stuck" due to the |
| 406 // current subdocument getting removed from the frame tree. | 406 // current subdocument getting removed from the frame tree. |
| 407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); | 407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); |
| 408 } | 408 } |
| 409 return WebKit::WebDocument(); | 409 return blink::WebDocument(); |
| 410 } | 410 } |
| 411 | 411 |
| 412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, | 412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, |
| 413 std::string* domain) const { | 413 std::string* domain) const { |
| 414 DCHECK(domain); | 414 DCHECK(domain); |
| 415 DCHECK(cur_frame_data_.get()); | 415 DCHECK(cur_frame_data_.get()); |
| 416 | 416 |
| 417 if (cur_frame_data_->domain.empty()) { | 417 if (cur_frame_data_->domain.empty()) { |
| 418 return false; | 418 return false; |
| 419 } | 419 } |
| (...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 491 // Record number of script tags (discretized for numerical stability.) | 491 // Record number of script tags (discretized for numerical stability.) |
| 492 if (page_feature_state_->num_script_tags > 1) { | 492 if (page_feature_state_->num_script_tags > 1) { |
| 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
| 494 if (page_feature_state_->num_script_tags > 6) { | 494 if (page_feature_state_->num_script_tags > 6) { |
| 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
| 496 } | 496 } |
| 497 } | 497 } |
| 498 } | 498 } |
| 499 | 499 |
| 500 } // namespace safe_browsing | 500 } // namespace safe_browsing |
| OLD | NEW |