| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | |
| 6 | |
| 7 #include "base/bind.h" | |
| 8 #include "base/compiler_specific.h" | |
| 9 #include "base/containers/hash_tables.h" | |
| 10 #include "base/location.h" | |
| 11 #include "base/logging.h" | |
| 12 #include "base/metrics/histogram_macros.h" | |
| 13 #include "base/single_thread_task_runner.h" | |
| 14 #include "base/strings/string_util.h" | |
| 15 #include "base/threading/thread_task_runner_handle.h" | |
| 16 #include "base/time/time.h" | |
| 17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | |
| 18 #include "chrome/renderer/safe_browsing/features.h" | |
| 19 #include "content/public/renderer/render_view.h" | |
| 20 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" | |
| 21 #include "third_party/WebKit/public/platform/WebString.h" | |
| 22 #include "third_party/WebKit/public/web/WebElement.h" | |
| 23 #include "third_party/WebKit/public/web/WebElementCollection.h" | |
| 24 #include "third_party/WebKit/public/web/WebLocalFrame.h" | |
| 25 #include "third_party/WebKit/public/web/WebView.h" | |
| 26 | |
| 27 namespace safe_browsing { | |
| 28 | |
| 29 // This time should be short enough that it doesn't noticeably disrupt the | |
| 30 // user's interaction with the page. | |
| 31 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; | |
| 32 | |
| 33 // Experimenting shows that we get a reasonable gain in performance by | |
| 34 // increasing this up to around 10, but there's not much benefit in | |
| 35 // increasing it past that. | |
| 36 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; | |
| 37 | |
| 38 // This should be longer than we expect feature extraction to take on any | |
| 39 // actual phishing page. | |
| 40 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; | |
| 41 | |
| 42 // Intermediate state used for computing features. See features.h for | |
| 43 // descriptions of the DOM features that are computed. | |
| 44 struct PhishingDOMFeatureExtractor::PageFeatureState { | |
| 45 // Link related features | |
| 46 int external_links; | |
| 47 base::hash_set<std::string> external_domains; | |
| 48 int secure_links; | |
| 49 int total_links; | |
| 50 | |
| 51 // Form related features | |
| 52 int num_forms; | |
| 53 int num_text_inputs; | |
| 54 int num_pswd_inputs; | |
| 55 int num_radio_inputs; | |
| 56 int num_check_inputs; | |
| 57 int action_other_domain; | |
| 58 int total_actions; | |
| 59 base::hash_set<std::string> page_action_urls; | |
| 60 | |
| 61 // Image related features | |
| 62 int img_other_domain; | |
| 63 int total_imgs; | |
| 64 | |
| 65 // How many script tags | |
| 66 int num_script_tags; | |
| 67 | |
| 68 // The time at which we started feature extraction for the current page. | |
| 69 base::TimeTicks start_time; | |
| 70 | |
| 71 // The number of iterations we've done for the current extraction. | |
| 72 int num_iterations; | |
| 73 | |
| 74 explicit PageFeatureState(base::TimeTicks start_time_ticks) | |
| 75 : external_links(0), | |
| 76 secure_links(0), | |
| 77 total_links(0), | |
| 78 num_forms(0), | |
| 79 num_text_inputs(0), | |
| 80 num_pswd_inputs(0), | |
| 81 num_radio_inputs(0), | |
| 82 num_check_inputs(0), | |
| 83 action_other_domain(0), | |
| 84 total_actions(0), | |
| 85 img_other_domain(0), | |
| 86 total_imgs(0), | |
| 87 num_script_tags(0), | |
| 88 start_time(start_time_ticks), | |
| 89 num_iterations(0) {} | |
| 90 | |
| 91 ~PageFeatureState() {} | |
| 92 }; | |
| 93 | |
| 94 // Per-frame state | |
| 95 struct PhishingDOMFeatureExtractor::FrameData { | |
| 96 // This is our reference to document.all, which is an iterator over all | |
| 97 // of the elements in the document. It keeps track of our current position. | |
| 98 blink::WebElementCollection elements; | |
| 99 // The domain of the document URL, stored here so that we don't need to | |
| 100 // recompute it every time it's needed. | |
| 101 std::string domain; | |
| 102 }; | |
| 103 | |
| 104 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | |
| 105 FeatureExtractorClock* clock) | |
| 106 : clock_(clock), weak_factory_(this) { | |
| 107 Clear(); | |
| 108 } | |
| 109 | |
| 110 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { | |
| 111 // The RenderView should have called CancelPendingExtraction() before | |
| 112 // we are destroyed. | |
| 113 CheckNoPendingExtraction(); | |
| 114 } | |
| 115 | |
| 116 void PhishingDOMFeatureExtractor::ExtractFeatures( | |
| 117 blink::WebDocument document, | |
| 118 FeatureMap* features, | |
| 119 const DoneCallback& done_callback) { | |
| 120 // The RenderView should have called CancelPendingExtraction() before | |
| 121 // starting a new extraction, so DCHECK this. | |
| 122 CheckNoPendingExtraction(); | |
| 123 // However, in an opt build, we will go ahead and clean up the pending | |
| 124 // extraction so that we can start in a known state. | |
| 125 CancelPendingExtraction(); | |
| 126 | |
| 127 features_ = features; | |
| 128 done_callback_ = done_callback; | |
| 129 | |
| 130 page_feature_state_.reset(new PageFeatureState(clock_->Now())); | |
| 131 cur_document_ = document; | |
| 132 | |
| 133 base::ThreadTaskRunnerHandle::Get()->PostTask( | |
| 134 FROM_HERE, | |
| 135 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, | |
| 136 weak_factory_.GetWeakPtr())); | |
| 137 } | |
| 138 | |
| 139 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { | |
| 140 // Cancel any pending callbacks, and clear our state. | |
| 141 weak_factory_.InvalidateWeakPtrs(); | |
| 142 Clear(); | |
| 143 } | |
| 144 | |
| 145 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { | |
| 146 DCHECK(page_feature_state_.get()); | |
| 147 ++page_feature_state_->num_iterations; | |
| 148 base::TimeTicks current_chunk_start_time = clock_->Now(); | |
| 149 | |
| 150 if (cur_document_.isNull()) { | |
| 151 // This will only happen if we weren't able to get the document for the | |
| 152 // main frame. We'll treat this as an extraction failure. | |
| 153 RunCallback(false); | |
| 154 return; | |
| 155 } | |
| 156 | |
| 157 int num_elements = 0; | |
| 158 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { | |
| 159 blink::WebElement cur_element; | |
| 160 if (cur_frame_data_.get()) { | |
| 161 // We're resuming traversal of a frame, so just advance to the next | |
| 162 // element. | |
| 163 cur_element = cur_frame_data_->elements.nextItem(); | |
| 164 // When we resume the traversal, the first call to nextItem() potentially | |
| 165 // has to walk through the document again from the beginning, if it was | |
| 166 // modified between our chunks of work. Log how long this takes, so we | |
| 167 // can tell if it's too slow. | |
| 168 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | |
| 169 clock_->Now() - current_chunk_start_time); | |
| 170 } else { | |
| 171 // We just moved to a new frame, so update our frame state | |
| 172 // and advance to the first element. | |
| 173 ResetFrameData(); | |
| 174 cur_element = cur_frame_data_->elements.firstItem(); | |
| 175 } | |
| 176 | |
| 177 for (; !cur_element.isNull(); | |
| 178 cur_element = cur_frame_data_->elements.nextItem()) { | |
| 179 if (cur_element.hasHTMLTagName("a")) { | |
| 180 HandleLink(cur_element); | |
| 181 } else if (cur_element.hasHTMLTagName("form")) { | |
| 182 HandleForm(cur_element); | |
| 183 } else if (cur_element.hasHTMLTagName("img")) { | |
| 184 HandleImage(cur_element); | |
| 185 } else if (cur_element.hasHTMLTagName("input")) { | |
| 186 HandleInput(cur_element); | |
| 187 } else if (cur_element.hasHTMLTagName("script")) { | |
| 188 HandleScript(cur_element); | |
| 189 } | |
| 190 | |
| 191 if (++num_elements >= kClockCheckGranularity) { | |
| 192 num_elements = 0; | |
| 193 base::TimeTicks now = clock_->Now(); | |
| 194 if (now - page_feature_state_->start_time >= | |
| 195 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | |
| 196 DLOG(ERROR) << "Feature extraction took too long, giving up"; | |
| 197 // We expect this to happen infrequently, so record when it does. | |
| 198 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); | |
| 199 RunCallback(false); | |
| 200 return; | |
| 201 } | |
| 202 base::TimeDelta chunk_elapsed = now - current_chunk_start_time; | |
| 203 if (chunk_elapsed >= | |
| 204 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { | |
| 205 // The time limit for the current chunk is up, so post a task to | |
| 206 // continue extraction. | |
| 207 // | |
| 208 // Record how much time we actually spent on the chunk. If this is | |
| 209 // much higher than kMaxTimePerChunkMs, we may need to adjust the | |
| 210 // clock granularity. | |
| 211 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", | |
| 212 chunk_elapsed); | |
| 213 base::ThreadTaskRunnerHandle::Get()->PostTask( | |
| 214 FROM_HERE, | |
| 215 base::Bind( | |
| 216 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, | |
| 217 weak_factory_.GetWeakPtr())); | |
| 218 return; | |
| 219 } | |
| 220 // Otherwise, continue. | |
| 221 } | |
| 222 } | |
| 223 | |
| 224 // We're done with this frame, recalculate the FrameData when we | |
| 225 // advance to the next frame. | |
| 226 cur_frame_data_.reset(); | |
| 227 } | |
| 228 | |
| 229 InsertFeatures(); | |
| 230 RunCallback(true); | |
| 231 } | |
| 232 | |
| 233 void PhishingDOMFeatureExtractor::HandleLink( | |
| 234 const blink::WebElement& element) { | |
| 235 // Count the number of times we link to a different host. | |
| 236 if (!element.hasAttribute("href")) { | |
| 237 DVLOG(1) << "Skipping anchor tag with no href"; | |
| 238 return; | |
| 239 } | |
| 240 | |
| 241 // Retrieve the link and resolve the link in case it's relative. | |
| 242 blink::WebURL full_url = CompleteURL(element, element.getAttribute("href")); | |
| 243 | |
| 244 std::string domain; | |
| 245 bool is_external = IsExternalDomain(full_url, &domain); | |
| 246 if (domain.empty()) { | |
| 247 DVLOG(1) << "Could not extract domain from link: " << full_url; | |
| 248 return; | |
| 249 } | |
| 250 | |
| 251 if (is_external) { | |
| 252 ++page_feature_state_->external_links; | |
| 253 | |
| 254 // Record each unique domain that we link to. | |
| 255 page_feature_state_->external_domains.insert(domain); | |
| 256 } | |
| 257 | |
| 258 // Check how many are https links. | |
| 259 if (GURL(full_url).SchemeIs("https")) { | |
| 260 ++page_feature_state_->secure_links; | |
| 261 } | |
| 262 | |
| 263 ++page_feature_state_->total_links; | |
| 264 } | |
| 265 | |
| 266 void PhishingDOMFeatureExtractor::HandleForm( | |
| 267 const blink::WebElement& element) { | |
| 268 // Increment the number of forms on this page. | |
| 269 ++page_feature_state_->num_forms; | |
| 270 | |
| 271 // Record whether the action points to a different domain. | |
| 272 if (!element.hasAttribute("action")) { | |
| 273 return; | |
| 274 } | |
| 275 | |
| 276 blink::WebURL full_url = CompleteURL(element, element.getAttribute("action")); | |
| 277 | |
| 278 page_feature_state_->page_action_urls.insert(full_url.string().utf8()); | |
| 279 | |
| 280 std::string domain; | |
| 281 bool is_external = IsExternalDomain(full_url, &domain); | |
| 282 if (domain.empty()) { | |
| 283 DVLOG(1) << "Could not extract domain from form action: " << full_url; | |
| 284 return; | |
| 285 } | |
| 286 | |
| 287 if (is_external) { | |
| 288 ++page_feature_state_->action_other_domain; | |
| 289 } | |
| 290 ++page_feature_state_->total_actions; | |
| 291 } | |
| 292 | |
| 293 void PhishingDOMFeatureExtractor::HandleImage( | |
| 294 const blink::WebElement& element) { | |
| 295 if (!element.hasAttribute("src")) { | |
| 296 DVLOG(1) << "Skipping img tag with no src"; | |
| 297 } | |
| 298 | |
| 299 // Record whether the image points to a different domain. | |
| 300 blink::WebURL full_url = CompleteURL(element, element.getAttribute("src")); | |
| 301 std::string domain; | |
| 302 bool is_external = IsExternalDomain(full_url, &domain); | |
| 303 if (domain.empty()) { | |
| 304 DVLOG(1) << "Could not extract domain from image src: " << full_url; | |
| 305 return; | |
| 306 } | |
| 307 | |
| 308 if (is_external) { | |
| 309 ++page_feature_state_->img_other_domain; | |
| 310 } | |
| 311 ++page_feature_state_->total_imgs; | |
| 312 } | |
| 313 | |
| 314 void PhishingDOMFeatureExtractor::HandleInput( | |
| 315 const blink::WebElement& element) { | |
| 316 // The HTML spec says that if the type is unspecified, it defaults to text. | |
| 317 // In addition, any unrecognized type will be treated as a text input. | |
| 318 // | |
| 319 // Note that we use the attribute value rather than | |
| 320 // WebFormControlElement::formControlType() for consistency with the | |
| 321 // way the phishing classification model is created. | |
| 322 std::string type = base::ToLowerASCII(element.getAttribute("type").utf8()); | |
| 323 if (type == "password") { | |
| 324 ++page_feature_state_->num_pswd_inputs; | |
| 325 } else if (type == "radio") { | |
| 326 ++page_feature_state_->num_radio_inputs; | |
| 327 } else if (type == "checkbox") { | |
| 328 ++page_feature_state_->num_check_inputs; | |
| 329 } else if (type != "submit" && type != "reset" && type != "file" && | |
| 330 type != "hidden" && type != "image" && type != "button") { | |
| 331 // Note that there are a number of new input types in HTML5 that are not | |
| 332 // handled above. For now, we will consider these as text inputs since | |
| 333 // they could be used to capture user input. | |
| 334 ++page_feature_state_->num_text_inputs; | |
| 335 } | |
| 336 } | |
| 337 | |
| 338 void PhishingDOMFeatureExtractor::HandleScript( | |
| 339 const blink::WebElement& element) { | |
| 340 ++page_feature_state_->num_script_tags; | |
| 341 } | |
| 342 | |
| 343 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { | |
| 344 DCHECK(done_callback_.is_null()); | |
| 345 DCHECK(!cur_frame_data_.get()); | |
| 346 DCHECK(cur_document_.isNull()); | |
| 347 if (!done_callback_.is_null() || cur_frame_data_.get() || | |
| 348 !cur_document_.isNull()) { | |
| 349 LOG(ERROR) << "Extraction in progress, missing call to " | |
| 350 << "CancelPendingExtraction"; | |
| 351 } | |
| 352 } | |
| 353 | |
| 354 void PhishingDOMFeatureExtractor::RunCallback(bool success) { | |
| 355 // Record some timing stats that we can use to evaluate feature extraction | |
| 356 // performance. These include both successful and failed extractions. | |
| 357 DCHECK(page_feature_state_.get()); | |
| 358 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", | |
| 359 page_feature_state_->num_iterations); | |
| 360 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", | |
| 361 clock_->Now() - page_feature_state_->start_time); | |
| 362 | |
| 363 DCHECK(!done_callback_.is_null()); | |
| 364 done_callback_.Run(success); | |
| 365 Clear(); | |
| 366 } | |
| 367 | |
| 368 void PhishingDOMFeatureExtractor::Clear() { | |
| 369 features_ = NULL; | |
| 370 done_callback_.Reset(); | |
| 371 cur_frame_data_.reset(NULL); | |
| 372 cur_document_.reset(); | |
| 373 } | |
| 374 | |
| 375 void PhishingDOMFeatureExtractor::ResetFrameData() { | |
| 376 DCHECK(!cur_document_.isNull()); | |
| 377 DCHECK(!cur_frame_data_.get()); | |
| 378 | |
| 379 cur_frame_data_.reset(new FrameData()); | |
| 380 cur_frame_data_->elements = cur_document_.all(); | |
| 381 cur_frame_data_->domain = | |
| 382 net::registry_controlled_domains::GetDomainAndRegistry( | |
| 383 cur_document_.url(), | |
| 384 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); | |
| 385 } | |
| 386 | |
| 387 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { | |
| 388 DCHECK(!cur_document_.isNull()); | |
| 389 blink::WebFrame* frame = cur_document_.frame(); | |
| 390 // Advance to the next frame that contains a document, with no wrapping. | |
| 391 if (frame) { | |
| 392 for (frame = frame->traverseNext(); frame; frame = frame->traverseNext()) { | |
| 393 if (!frame->document().isNull()) { | |
| 394 return frame->document(); | |
| 395 } | |
| 396 } | |
| 397 } else { | |
| 398 // Keep track of how often frame traversal got "stuck" due to the | |
| 399 // current subdocument getting removed from the frame tree. | |
| 400 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); | |
| 401 } | |
| 402 return blink::WebDocument(); | |
| 403 } | |
| 404 | |
| 405 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, | |
| 406 std::string* domain) const { | |
| 407 DCHECK(domain); | |
| 408 DCHECK(cur_frame_data_.get()); | |
| 409 | |
| 410 if (cur_frame_data_->domain.empty()) { | |
| 411 return false; | |
| 412 } | |
| 413 | |
| 414 // TODO(bryner): Ensure that the url encoding is consistent with the features | |
| 415 // in the model. | |
| 416 if (url.HostIsIPAddress()) { | |
| 417 domain->assign(url.host()); | |
| 418 } else { | |
| 419 domain->assign(net::registry_controlled_domains::GetDomainAndRegistry( | |
| 420 url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES)); | |
| 421 } | |
| 422 | |
| 423 return !domain->empty() && *domain != cur_frame_data_->domain; | |
| 424 } | |
| 425 | |
| 426 blink::WebURL PhishingDOMFeatureExtractor::CompleteURL( | |
| 427 const blink::WebElement& element, | |
| 428 const blink::WebString& partial_url) { | |
| 429 return element.document().completeURL(partial_url); | |
| 430 } | |
| 431 | |
| 432 void PhishingDOMFeatureExtractor::InsertFeatures() { | |
| 433 DCHECK(page_feature_state_.get()); | |
| 434 | |
| 435 if (page_feature_state_->total_links > 0) { | |
| 436 // Add a feature for the fraction of times the page links to an external | |
| 437 // domain vs. an internal domain. | |
| 438 double link_freq = static_cast<double>( | |
| 439 page_feature_state_->external_links) / | |
| 440 page_feature_state_->total_links; | |
| 441 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); | |
| 442 | |
| 443 // Add a feature for each unique domain that we're linking to | |
| 444 for (const auto& domain : page_feature_state_->external_domains) { | |
| 445 features_->AddBooleanFeature(features::kPageLinkDomain + domain); | |
| 446 } | |
| 447 | |
| 448 // Fraction of links that use https. | |
| 449 double secure_freq = static_cast<double>( | |
| 450 page_feature_state_->secure_links) / page_feature_state_->total_links; | |
| 451 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); | |
| 452 } | |
| 453 | |
| 454 // Record whether forms appear and whether various form elements appear. | |
| 455 if (page_feature_state_->num_forms > 0) { | |
| 456 features_->AddBooleanFeature(features::kPageHasForms); | |
| 457 } | |
| 458 if (page_feature_state_->num_text_inputs > 0) { | |
| 459 features_->AddBooleanFeature(features::kPageHasTextInputs); | |
| 460 } | |
| 461 if (page_feature_state_->num_pswd_inputs > 0) { | |
| 462 features_->AddBooleanFeature(features::kPageHasPswdInputs); | |
| 463 } | |
| 464 if (page_feature_state_->num_radio_inputs > 0) { | |
| 465 features_->AddBooleanFeature(features::kPageHasRadioInputs); | |
| 466 } | |
| 467 if (page_feature_state_->num_check_inputs > 0) { | |
| 468 features_->AddBooleanFeature(features::kPageHasCheckInputs); | |
| 469 } | |
| 470 | |
| 471 // Record fraction of form actions that point to a different domain. | |
| 472 if (page_feature_state_->total_actions > 0) { | |
| 473 double action_freq = static_cast<double>( | |
| 474 page_feature_state_->action_other_domain) / | |
| 475 page_feature_state_->total_actions; | |
| 476 features_->AddRealFeature(features::kPageActionOtherDomainFreq, | |
| 477 action_freq); | |
| 478 } | |
| 479 | |
| 480 // Add a feature for each unique external action url. | |
| 481 for (const auto& url : page_feature_state_->page_action_urls) { | |
| 482 features_->AddBooleanFeature(features::kPageActionURL + url); | |
| 483 } | |
| 484 | |
| 485 // Record how many image src attributes point to a different domain. | |
| 486 if (page_feature_state_->total_imgs > 0) { | |
| 487 double img_freq = static_cast<double>( | |
| 488 page_feature_state_->img_other_domain) / | |
| 489 page_feature_state_->total_imgs; | |
| 490 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); | |
| 491 } | |
| 492 | |
| 493 // Record number of script tags (discretized for numerical stability.) | |
| 494 if (page_feature_state_->num_script_tags > 1) { | |
| 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | |
| 496 if (page_feature_state_->num_script_tags > 6) { | |
| 497 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | |
| 498 } | |
| 499 } | |
| 500 } | |
| 501 | |
| 502 } // namespace safe_browsing | |
| OLD | NEW |