OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
| 6 |
| 7 #include "base/compiler_specific.h" |
| 8 #include "base/hash_tables.h" |
| 9 #include "base/histogram.h" |
| 10 #include "base/logging.h" |
| 11 #include "chrome/renderer/render_view.h" |
| 12 #include "chrome/renderer/safe_browsing/features.h" |
| 13 #include "net/base/registry_controlled_domain.h" |
| 14 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" |
| 15 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h" |
| 16 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" |
| 17 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" |
| 18 #include "third_party/WebKit/WebKit/chromium/public/WebString.h" |
| 19 #include "third_party/WebKit/WebKit/chromium/public/WebView.h" |
| 20 |
| 21 namespace safe_browsing { |
| 22 |
| 23 // Intermediate state used for computing features. See features.h for |
| 24 // descriptions of the DOM features that are computed. |
| 25 struct PhishingDOMFeatureExtractor::PageFeatureState { |
| 26 // Link related features |
| 27 int external_links; |
| 28 base::hash_set<std::string> external_domains; |
| 29 int secure_links; |
| 30 int total_links; |
| 31 |
| 32 // Form related features |
| 33 int num_forms; |
| 34 int num_text_inputs; |
| 35 int num_pswd_inputs; |
| 36 int num_radio_inputs; |
| 37 int num_check_inputs; |
| 38 int action_other_domain; |
| 39 int total_actions; |
| 40 |
| 41 // Image related features |
| 42 int img_other_domain; |
| 43 int total_imgs; |
| 44 |
| 45 // How many script tags |
| 46 int num_script_tags; |
| 47 |
| 48 PageFeatureState() |
| 49 : external_links(0), |
| 50 secure_links(0), |
| 51 total_links(0), |
| 52 num_forms(0), |
| 53 num_text_inputs(0), |
| 54 num_pswd_inputs(0), |
| 55 num_radio_inputs(0), |
| 56 num_check_inputs(0), |
| 57 action_other_domain(0), |
| 58 total_actions(0), |
| 59 img_other_domain(0), |
| 60 total_imgs(0), |
| 61 num_script_tags(0) {} |
| 62 |
| 63 ~PageFeatureState() {} |
| 64 }; |
| 65 |
| 66 // Per-frame state |
| 67 struct PhishingDOMFeatureExtractor::FrameData { |
| 68 // This is our reference to document.all, which is an iterator over all |
| 69 // of the elements in the document. It keeps track of our current position. |
| 70 WebKit::WebNodeCollection elements; |
| 71 // The domain of the document URL, stored here so that we don't need to |
| 72 // recompute it every time it's needed. |
| 73 std::string domain; |
| 74 }; |
| 75 |
| 76 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
| 77 RenderView* render_view) |
| 78 : render_view_(render_view), |
| 79 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
| 80 Clear(); |
| 81 } |
| 82 |
| 83 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { |
| 84 // The RenderView should have called CancelPendingExtraction() before |
| 85 // we are destroyed. |
| 86 CheckNoPendingExtraction(); |
| 87 } |
| 88 |
| 89 void PhishingDOMFeatureExtractor::ExtractFeatures( |
| 90 FeatureMap* features, |
| 91 DoneCallback* done_callback) { |
| 92 // The RenderView should have called CancelPendingExtraction() before |
| 93 // starting a new extraction, so DCHECK this. |
| 94 CheckNoPendingExtraction(); |
| 95 // However, in an opt build, we will go ahead and clean up the pending |
| 96 // extraction so that we can start in a known state. |
| 97 CancelPendingExtraction(); |
| 98 |
| 99 features_ = features; |
| 100 done_callback_.reset(done_callback); |
| 101 MessageLoop::current()->PostTask( |
| 102 FROM_HERE, |
| 103 method_factory_.NewRunnableMethod( |
| 104 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
| 105 } |
| 106 |
| 107 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
| 108 // Cancel any pending callbacks, and clear our state. |
| 109 method_factory_.RevokeAll(); |
| 110 Clear(); |
| 111 } |
| 112 |
| 113 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
| 114 if (!cur_frame_) { |
| 115 WebKit::WebView* web_view = render_view_->webview(); |
| 116 if (!web_view) { |
| 117 // When the WebView is going away, the render view should have called |
| 118 // CancelPendingExtraction() which should have stopped any pending work, |
| 119 // so this case should not happen. |
| 120 NOTREACHED(); |
| 121 RunCallback(false); |
| 122 return; |
| 123 } |
| 124 cur_frame_ = web_view->mainFrame(); |
| 125 page_feature_state_.reset(new PageFeatureState); |
| 126 } |
| 127 |
| 128 for (; cur_frame_; |
| 129 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { |
| 130 WebKit::WebNode cur_node; |
| 131 if (cur_frame_data_.get()) { |
| 132 // We're resuming traversal of a frame, so just advance to the next node. |
| 133 cur_node = cur_frame_data_->elements.nextItem(); |
| 134 } else { |
| 135 // We just moved to a new frame, so update our frame state |
| 136 // and advance to the first element. |
| 137 if (!ResetFrameData()) { |
| 138 // Nothing in this frame, move on to the next one. |
| 139 LOG(WARNING) << "No content in frame, skipping"; |
| 140 continue; |
| 141 } |
| 142 cur_node = cur_frame_data_->elements.firstItem(); |
| 143 } |
| 144 |
| 145 for (; !cur_node.isNull(); |
| 146 cur_node = cur_frame_data_->elements.nextItem()) { |
| 147 if (!cur_node.isElementNode()) { |
| 148 continue; |
| 149 } |
| 150 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); |
| 151 if (element.hasTagName("a")) { |
| 152 HandleLink(element); |
| 153 } else if (element.hasTagName("form")) { |
| 154 HandleForm(element); |
| 155 } else if (element.hasTagName("img")) { |
| 156 HandleImage(element); |
| 157 } else if (element.hasTagName("input")) { |
| 158 HandleInput(element); |
| 159 } else if (element.hasTagName("script")) { |
| 160 HandleScript(element); |
| 161 } |
| 162 |
| 163 // TODO(bryner): stop if too much time has elapsed, and add histograms |
| 164 // for the time spent processing. |
| 165 } |
| 166 |
| 167 // We're done with this frame, recalculate the FrameData when we |
| 168 // advance to the next frame. |
| 169 cur_frame_data_.reset(); |
| 170 } |
| 171 |
| 172 InsertFeatures(); |
| 173 RunCallback(true); |
| 174 } |
| 175 |
| 176 void PhishingDOMFeatureExtractor::HandleLink( |
| 177 const WebKit::WebElement& element) { |
| 178 // Count the number of times we link to a different host. |
| 179 if (!element.hasAttribute("href")) { |
| 180 DLOG(INFO) << "Skipping anchor tag with no href"; |
| 181 return; |
| 182 } |
| 183 |
| 184 // Retrieve the link and resolve the link in case it's relative. |
| 185 WebKit::WebURL full_url = element.document().completeURL( |
| 186 element.getAttribute("href")); |
| 187 |
| 188 std::string domain; |
| 189 bool is_external = IsExternalDomain(full_url, &domain); |
| 190 if (domain.empty()) { |
| 191 LOG(ERROR) << "Could not extract domain from link: " << full_url; |
| 192 return; |
| 193 } |
| 194 |
| 195 if (is_external) { |
| 196 ++page_feature_state_->external_links; |
| 197 |
| 198 // Record each unique domain that we link to. |
| 199 page_feature_state_->external_domains.insert(domain); |
| 200 } |
| 201 |
| 202 // Check how many are https links. |
| 203 if (GURL(full_url).SchemeIs("https")) { |
| 204 ++page_feature_state_->secure_links; |
| 205 } |
| 206 |
| 207 ++page_feature_state_->total_links; |
| 208 } |
| 209 |
| 210 void PhishingDOMFeatureExtractor::HandleForm( |
| 211 const WebKit::WebElement& element) { |
| 212 // Increment the number of forms on this page. |
| 213 ++page_feature_state_->num_forms; |
| 214 |
| 215 // Record whether the action points to a different domain. |
| 216 if (!element.hasAttribute("action")) { |
| 217 return; |
| 218 } |
| 219 |
| 220 WebKit::WebURL full_url = element.document().completeURL( |
| 221 element.getAttribute("action")); |
| 222 |
| 223 std::string domain; |
| 224 bool is_external = IsExternalDomain(full_url, &domain); |
| 225 if (domain.empty()) { |
| 226 LOG(ERROR) << "Could not extract domain from form action: " << full_url; |
| 227 return; |
| 228 } |
| 229 |
| 230 if (is_external) { |
| 231 ++page_feature_state_->action_other_domain; |
| 232 } |
| 233 ++page_feature_state_->total_actions; |
| 234 } |
| 235 |
| 236 void PhishingDOMFeatureExtractor::HandleImage( |
| 237 const WebKit::WebElement& element) { |
| 238 if (!element.hasAttribute("src")) { |
| 239 DLOG(INFO) << "Skipping img tag with no src"; |
| 240 } |
| 241 |
| 242 // Record whether the image points to a different domain. |
| 243 WebKit::WebURL full_url = element.document().completeURL( |
| 244 element.getAttribute("src")); |
| 245 std::string domain; |
| 246 bool is_external = IsExternalDomain(full_url, &domain); |
| 247 if (domain.empty()) { |
| 248 LOG(ERROR) << "Could not extract domain from image src: " << full_url; |
| 249 return; |
| 250 } |
| 251 |
| 252 if (is_external) { |
| 253 ++page_feature_state_->img_other_domain; |
| 254 } |
| 255 ++page_feature_state_->total_imgs; |
| 256 } |
| 257 |
| 258 void PhishingDOMFeatureExtractor::HandleInput( |
| 259 const WebKit::WebElement& element) { |
| 260 // The HTML spec says that if the type is unspecified, it defaults to text. |
| 261 // In addition, any unrecognized type will be treated as a text input. |
| 262 // |
| 263 // Note that we use the attribute value rather than |
| 264 // WebFormControlElement::formControlType() for consistency with the |
| 265 // way the phishing classification model is created. |
| 266 std::string type = element.getAttribute("type").utf8(); |
| 267 StringToLowerASCII(&type); |
| 268 if (type == "password") { |
| 269 ++page_feature_state_->num_pswd_inputs; |
| 270 } else if (type == "radio") { |
| 271 ++page_feature_state_->num_radio_inputs; |
| 272 } else if (type == "checkbox") { |
| 273 ++page_feature_state_->num_check_inputs; |
| 274 } else if (type != "submit" && type != "reset" && type != "file" && |
| 275 type != "hidden" && type != "image" && type != "button") { |
| 276 // Note that there are a number of new input types in HTML5 that are not |
| 277 // handled above. For now, we will consider these as text inputs since |
| 278 // they could be used to capture user input. |
| 279 ++page_feature_state_->num_text_inputs; |
| 280 } |
| 281 } |
| 282 |
| 283 void PhishingDOMFeatureExtractor::HandleScript( |
| 284 const WebKit::WebElement& element) { |
| 285 ++page_feature_state_->num_script_tags; |
| 286 } |
| 287 |
| 288 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
| 289 DCHECK(!done_callback_.get()); |
| 290 DCHECK(!cur_frame_data_.get()); |
| 291 DCHECK(!cur_frame_); |
| 292 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { |
| 293 LOG(ERROR) << "Extraction in progress, missing call to " |
| 294 << "CancelPendingExtraction"; |
| 295 } |
| 296 } |
| 297 |
| 298 void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
| 299 DCHECK(done_callback_.get()); |
| 300 done_callback_->Run(success); |
| 301 Clear(); |
| 302 } |
| 303 |
| 304 void PhishingDOMFeatureExtractor::Clear() { |
| 305 features_ = NULL; |
| 306 done_callback_.reset(NULL); |
| 307 cur_frame_data_.reset(NULL); |
| 308 cur_frame_ = NULL; |
| 309 } |
| 310 |
| 311 bool PhishingDOMFeatureExtractor::ResetFrameData() { |
| 312 DCHECK(cur_frame_); |
| 313 DCHECK(!cur_frame_data_.get()); |
| 314 |
| 315 WebKit::WebDocument doc = cur_frame_->document(); |
| 316 if (doc.isNull()) { |
| 317 return false; |
| 318 } |
| 319 cur_frame_data_.reset(new FrameData()); |
| 320 cur_frame_data_->elements = doc.all(); |
| 321 cur_frame_data_->domain = |
| 322 net::RegistryControlledDomainService::GetDomainAndRegistry( |
| 323 cur_frame_->url()); |
| 324 return true; |
| 325 } |
| 326 |
| 327 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, |
| 328 std::string* domain) const { |
| 329 DCHECK(domain); |
| 330 DCHECK(cur_frame_data_.get()); |
| 331 |
| 332 if (cur_frame_data_->domain.empty()) { |
| 333 return false; |
| 334 } |
| 335 |
| 336 // TODO(bryner): Ensure that the url encoding is consistent with the features |
| 337 // in the model. |
| 338 if (url.HostIsIPAddress()) { |
| 339 domain->assign(url.host()); |
| 340 } else { |
| 341 domain->assign(net::RegistryControlledDomainService::GetDomainAndRegistry( |
| 342 url)); |
| 343 } |
| 344 |
| 345 return !domain->empty() && *domain != cur_frame_data_->domain; |
| 346 } |
| 347 |
| 348 void PhishingDOMFeatureExtractor::InsertFeatures() { |
| 349 DCHECK(page_feature_state_.get()); |
| 350 features_->Clear(); |
| 351 |
| 352 if (page_feature_state_->total_links > 0) { |
| 353 // Add a feature for the fraction of times the page links to an external |
| 354 // domain vs. an internal domain. |
| 355 double link_freq = static_cast<double>( |
| 356 page_feature_state_->external_links) / |
| 357 page_feature_state_->total_links; |
| 358 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); |
| 359 |
| 360 // Add a feature for each unique domain that we're linking to |
| 361 for (base::hash_set<std::string>::iterator it = |
| 362 page_feature_state_->external_domains.begin(); |
| 363 it != page_feature_state_->external_domains.end(); ++it) { |
| 364 features_->AddBooleanFeature(features::kPageLinkDomain + *it); |
| 365 } |
| 366 |
| 367 // Fraction of links that use https. |
| 368 double secure_freq = static_cast<double>( |
| 369 page_feature_state_->secure_links) / page_feature_state_->total_links; |
| 370 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); |
| 371 } |
| 372 |
| 373 // Record whether forms appear and whether various form elements appear. |
| 374 if (page_feature_state_->num_forms > 0) { |
| 375 features_->AddBooleanFeature(features::kPageHasForms); |
| 376 } |
| 377 if (page_feature_state_->num_text_inputs > 0) { |
| 378 features_->AddBooleanFeature(features::kPageHasTextInputs); |
| 379 } |
| 380 if (page_feature_state_->num_pswd_inputs > 0) { |
| 381 features_->AddBooleanFeature(features::kPageHasPswdInputs); |
| 382 } |
| 383 if (page_feature_state_->num_radio_inputs > 0) { |
| 384 features_->AddBooleanFeature(features::kPageHasRadioInputs); |
| 385 } |
| 386 if (page_feature_state_->num_check_inputs > 0) { |
| 387 features_->AddBooleanFeature(features::kPageHasCheckInputs); |
| 388 } |
| 389 |
| 390 // Record fraction of form actions that point to a different domain. |
| 391 if (page_feature_state_->total_actions > 0) { |
| 392 double action_freq = static_cast<double>( |
| 393 page_feature_state_->action_other_domain) / |
| 394 page_feature_state_->total_actions; |
| 395 features_->AddRealFeature(features::kPageActionOtherDomainFreq, |
| 396 action_freq); |
| 397 } |
| 398 |
| 399 // Record how many image src attributes point to a different domain. |
| 400 if (page_feature_state_->total_imgs > 0) { |
| 401 double img_freq = static_cast<double>( |
| 402 page_feature_state_->img_other_domain) / |
| 403 page_feature_state_->total_imgs; |
| 404 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); |
| 405 } |
| 406 |
| 407 // Record number of script tags (discretized for numerical stability.) |
| 408 if (page_feature_state_->num_script_tags > 1) { |
| 409 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
| 410 if (page_feature_state_->num_script_tags > 6) { |
| 411 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
| 412 } |
| 413 } |
| 414 } |
| 415 |
| 416 } // namespace safe_browsing |
OLD | NEW |