Index: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..c8f4bd0d3c6d881a0fa7851a840c149ee32d57dc |
--- /dev/null |
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
@@ -0,0 +1,416 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
+ |
+#include "base/compiler_specific.h" |
+#include "base/hash_tables.h" |
+#include "base/histogram.h" |
+#include "base/logging.h" |
+#include "chrome/renderer/render_view.h" |
+#include "chrome/renderer/safe_browsing/features.h" |
+#include "net/base/registry_controlled_domain.h" |
+#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" |
+#include "third_party/WebKit/WebKit/chromium/public/WebElement.h" |
+#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" |
+#include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" |
+#include "third_party/WebKit/WebKit/chromium/public/WebString.h" |
+#include "third_party/WebKit/WebKit/chromium/public/WebView.h" |
+ |
+namespace safe_browsing { |
+ |
+// Running tallies collected while the DOM is walked. InsertFeatures() turns |
+// them into the DOM features that are described in features.h. |
+struct PhishingDOMFeatureExtractor::PageFeatureState { |
+  // Every counter starts at zero; the domain set starts out empty. |
+  PageFeatureState() |
+      : external_links(0), |
+        secure_links(0), |
+        total_links(0), |
+        num_forms(0), |
+        num_text_inputs(0), |
+        num_pswd_inputs(0), |
+        num_radio_inputs(0), |
+        num_check_inputs(0), |
+        action_other_domain(0), |
+        total_actions(0), |
+        img_other_domain(0), |
+        total_imgs(0), |
+        num_script_tags(0) {} |
+ |
+  ~PageFeatureState() {} |
+ |
+  // ---- Links ---- |
+  // Links whose domain differs from the domain of the containing frame. |
+  int external_links; |
+  // The distinct domains seen among those external links. |
+  base::hash_set<std::string> external_domains; |
+  // Links whose resolved URL uses the https scheme. |
+  int secure_links; |
+  // All links for which a domain could be determined. |
+  int total_links; |
+ |
+  // ---- Forms and their inputs ---- |
+  int num_forms; |
+  int num_text_inputs; |
+  int num_pswd_inputs; |
+  int num_radio_inputs; |
+  int num_check_inputs; |
+  // Form actions that point at a different domain than the frame. |
+  int action_other_domain; |
+  // All form actions for which a domain could be determined. |
+  int total_actions; |
+ |
+  // ---- Images ---- |
+  // Image sources that point at a different domain than the frame. |
+  int img_other_domain; |
+  // All images for which a domain could be determined. |
+  int total_imgs; |
+ |
+  // ---- Scripts ---- |
+  // Number of <script> elements encountered. |
+  int num_script_tags; |
+}; |
+ |
+// Per-frame state. An instance is created by ResetFrameData() when the |
+// traversal enters a frame, and it is dropped again once every element of |
+// that frame has been visited. |
+struct PhishingDOMFeatureExtractor::FrameData { |
+ // This is our reference to document.all, which is an iterator over all |
+ // of the elements in the document. It keeps track of our current position. |
+ WebKit::WebNodeCollection elements; |
+ // The domain of the document URL, stored here so that we don't need to |
+ // recompute it every time it's needed. This can be empty, in which case |
+ // IsExternalDomain() bails out early and reports nothing as external. |
+ std::string domain; |
+}; |
+ |
+// Creates an idle extractor for |render_view|. The pointer is stored as-is |
+// and is later used to look up the WebView whose frames are traversed. |
+PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
+ RenderView* render_view) |
+ : render_view_(render_view), |
+ ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
+ // Put all per-extraction members into their "no extraction" state. |
+ Clear(); |
+} |
+ |
+// Verifies that no extraction is still in flight at destruction time. |
+PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { |
+ // The RenderView should have called CancelPendingExtraction() before |
+ // we are destroyed. Note that this call only DCHECKs and logs; it does |
+ // not itself cancel anything. |
+ CheckNoPendingExtraction(); |
+} |
+ |
+// Begins extracting DOM features for the page shown in the render view. |
+// |
+// |features| is remembered as the destination; it is cleared and filled in |
+// by InsertFeatures() when the walk completes and is never deleted here. |
+// |done_callback| is handed to done_callback_.reset() and is run with a |
+// success flag when extraction ends. No DOM work happens inside this call: |
+// the walk is posted as a task to the current MessageLoop. |
+void PhishingDOMFeatureExtractor::ExtractFeatures( |
+ FeatureMap* features, |
+ DoneCallback* done_callback) { |
+ // The RenderView should have called CancelPendingExtraction() before |
+ // starting a new extraction, so DCHECK this. |
+ CheckNoPendingExtraction(); |
+ // However, in an opt build, we will go ahead and clean up the pending |
+ // extraction so that we can start in a known state. |
+ CancelPendingExtraction(); |
+ |
+ // Must come after the cancel above, which resets both of these members. |
+ features_ = features; |
+ done_callback_.reset(done_callback); |
+ MessageLoop::current()->PostTask( |
+ FROM_HERE, |
+ method_factory_.NewRunnableMethod( |
+ &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
+} |
+ |
+// Abandons any extraction that is in progress. The done callback is |
+// released by Clear() without being run. |
+void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
+ // Cancel any pending callbacks, and clear our state. |
+ method_factory_.RevokeAll(); |
+ Clear(); |
+} |
+ |
+// Walks every element of every frame of the page, dispatches the tags of |
+// interest to the Handle*() methods, then publishes the results and runs |
+// the done callback. |
+// |
+// The position is kept in cur_frame_ / cur_frame_data_ so that a later call |
+// could pick up where an earlier one stopped. Despite the name, no time |
+// limit is enforced yet (see the TODO below), so today one call always runs |
+// the traversal to completion. |
+void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
+ if (!cur_frame_) { |
+ // First call for this extraction: start from the main frame with a |
+ // fresh set of page-wide counters. |
+ WebKit::WebView* web_view = render_view_->webview(); |
+ if (!web_view) { |
+ // When the WebView is going away, the render view should have called |
+ // CancelPendingExtraction() which should have stopped any pending work, |
+ // so this case should not happen. |
+ NOTREACHED(); |
+ RunCallback(false); |
+ return; |
+ } |
+ cur_frame_ = web_view->mainFrame(); |
+ page_feature_state_.reset(new PageFeatureState); |
+ } |
+ |
+ // Outer loop: one iteration per frame. The loop ends when traverseNext() |
+ // yields a null frame. |
+ for (; cur_frame_; |
+ cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { |
+ WebKit::WebNode cur_node; |
+ if (cur_frame_data_.get()) { |
+ // We're resuming traversal of a frame, so just advance to the next node. |
+ cur_node = cur_frame_data_->elements.nextItem(); |
+ } else { |
+ // We just moved to a new frame, so update our frame state |
+ // and advance to the first element. |
+ if (!ResetFrameData()) { |
+ // Nothing in this frame, move on to the next one. |
+ LOG(WARNING) << "No content in frame, skipping"; |
+ continue; |
+ } |
+ cur_node = cur_frame_data_->elements.firstItem(); |
+ } |
+ |
+ // Inner loop: one iteration per node of the current frame's collection. |
+ for (; !cur_node.isNull(); |
+ cur_node = cur_frame_data_->elements.nextItem()) { |
+ if (!cur_node.isElementNode()) { |
+ continue; |
+ } |
+ // Only these five tags contribute to the features; every other element |
+ // is ignored. |
+ WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); |
+ if (element.hasTagName("a")) { |
+ HandleLink(element); |
+ } else if (element.hasTagName("form")) { |
+ HandleForm(element); |
+ } else if (element.hasTagName("img")) { |
+ HandleImage(element); |
+ } else if (element.hasTagName("input")) { |
+ HandleInput(element); |
+ } else if (element.hasTagName("script")) { |
+ HandleScript(element); |
+ } |
+ |
+ // TODO(bryner): stop if too much time has elapsed, and add histograms |
+ // for the time spent processing. |
+ } |
+ |
+ // We're done with this frame, recalculate the FrameData when we |
+ // advance to the next frame. |
+ cur_frame_data_.reset(); |
+ } |
+ |
+ // All frames have been visited: write the features out and report success. |
+ // RunCallback() also clears the per-extraction state. |
+ InsertFeatures(); |
+ RunCallback(true); |
+} |
+ |
+// Folds one <a> element into the link statistics: the total link count, the |
+// https link count, and the external link count together with the set of |
+// external domains. Anchors that have no href, or whose href yields no |
+// domain, do not affect any counter. |
+void PhishingDOMFeatureExtractor::HandleLink( |
+    const WebKit::WebElement& element) { |
+  if (!element.hasAttribute("href")) { |
+    DLOG(INFO) << "Skipping anchor tag with no href"; |
+    return; |
+  } |
+ |
+  // Make the href absolute with respect to the element's own document, so |
+  // that relative links are handled as well. |
+  WebKit::WebURL link_url = element.document().completeURL( |
+      element.getAttribute("href")); |
+ |
+  std::string link_domain; |
+  const bool links_offsite = IsExternalDomain(link_url, &link_domain); |
+  if (link_domain.empty()) { |
+    LOG(ERROR) << "Could not extract domain from link: " << link_url; |
+    return; |
+  } |
+ |
+  // From here on the link counts towards the totals. |
+  ++page_feature_state_->total_links; |
+ |
+  // Tally links that use https. |
+  if (GURL(link_url).SchemeIs("https")) { |
+    ++page_feature_state_->secure_links; |
+  } |
+ |
+  // Tally links that leave the frame's domain, remembering each distinct |
+  // destination domain. |
+  if (links_offsite) { |
+    ++page_feature_state_->external_links; |
+    page_feature_state_->external_domains.insert(link_domain); |
+  } |
+} |
+ |
+// Folds one <form> element into the form statistics. Every form bumps the |
+// form count. Forms with an action attribute whose domain can be determined |
+// additionally bump the action total and, when that domain differs from the |
+// frame's, the other-domain action count. |
+void PhishingDOMFeatureExtractor::HandleForm( |
+    const WebKit::WebElement& element) { |
+  ++page_feature_state_->num_forms; |
+ |
+  // Without an action attribute there is no target to classify. |
+  if (!element.hasAttribute("action")) { |
+    return; |
+  } |
+ |
+  // Make the action absolute with respect to the element's own document. |
+  WebKit::WebURL action_url = element.document().completeURL( |
+      element.getAttribute("action")); |
+ |
+  std::string action_domain; |
+  const bool posts_offsite = IsExternalDomain(action_url, &action_domain); |
+  if (action_domain.empty()) { |
+    LOG(ERROR) << "Could not extract domain from form action: " << action_url; |
+    return; |
+  } |
+ |
+  ++page_feature_state_->total_actions; |
+  if (posts_offsite) { |
+    ++page_feature_state_->action_other_domain; |
+  } |
+} |
+ |
+// Folds one <img> element into the image statistics: the image total and, |
+// when the image's domain differs from the frame's, the other-domain image |
+// count. Images that have no src attribute, or whose src yields no domain, |
+// do not affect any counter. |
+void PhishingDOMFeatureExtractor::HandleImage( |
+    const WebKit::WebElement& element) { |
+  if (!element.hasAttribute("src")) { |
+    DLOG(INFO) << "Skipping img tag with no src"; |
+    // Actually skip the element, as the message says and as HandleLink() |
+    // does for anchors without an href. Falling through would classify a |
+    // URL built from a missing attribute and distort the image totals. |
+    return; |
+  } |
+ |
+  // Record whether the image points to a different domain. The src is made |
+  // absolute with respect to the element's own document first. |
+  WebKit::WebURL full_url = element.document().completeURL( |
+      element.getAttribute("src")); |
+  std::string domain; |
+  bool is_external = IsExternalDomain(full_url, &domain); |
+  if (domain.empty()) { |
+    LOG(ERROR) << "Could not extract domain from image src: " << full_url; |
+    return; |
+  } |
+ |
+  if (is_external) { |
+    ++page_feature_state_->img_other_domain; |
+  } |
+  ++page_feature_state_->total_imgs; |
+} |
+ |
+// Classifies one <input> element by its type attribute and bumps the |
+// matching counter (password, radio, checkbox or text). |
+// |
+// The raw attribute value is used rather than |
+// WebFormControlElement::formControlType() so that the result is consistent |
+// with the way the phishing classification model is created. |
+void PhishingDOMFeatureExtractor::HandleInput( |
+    const WebKit::WebElement& element) { |
+  std::string input_type = element.getAttribute("type").utf8(); |
+  StringToLowerASCII(&input_type); |
+ |
+  if (input_type == "password") { |
+    ++page_feature_state_->num_pswd_inputs; |
+    return; |
+  } |
+  if (input_type == "radio") { |
+    ++page_feature_state_->num_radio_inputs; |
+    return; |
+  } |
+  if (input_type == "checkbox") { |
+    ++page_feature_state_->num_check_inputs; |
+    return; |
+  } |
+ |
+  // These types are known not to be free-form text fields, so they are not |
+  // counted at all. |
+  const bool is_non_text_control = |
+      input_type == "submit" || input_type == "reset" || |
+      input_type == "file" || input_type == "hidden" || |
+      input_type == "image" || input_type == "button"; |
+  if (is_non_text_control) { |
+    return; |
+  } |
+ |
+  // Everything else counts as a text input. Per the HTML spec, a missing or |
+  // unrecognized type behaves like text, and the newer HTML5 input types are |
+  // treated the same way for now since they can also capture user input. |
+  ++page_feature_state_->num_text_inputs; |
+} |
+ |
+// Counts one <script> element. |element| itself is not inspected; only the |
+// number of script tags feeds into the features. |
+void PhishingDOMFeatureExtractor::HandleScript( |
+ const WebKit::WebElement& element) { |
+ ++page_feature_state_->num_script_tags; |
+} |
+ |
+// Asserts that the extractor is idle. Where DCHECK is active, any leftover |
+// extraction state trips an assertion; in addition, an error is logged |
+// whenever such state is found. Nothing is cancelled or cleaned up here. |
+void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
+  DCHECK(!done_callback_.get()); |
+  DCHECK(!cur_frame_data_.get()); |
+  DCHECK(!cur_frame_); |
+ |
+  const bool extraction_pending = |
+      done_callback_.get() || cur_frame_data_.get() || cur_frame_; |
+  if (!extraction_pending) { |
+    return; |
+  } |
+  LOG(ERROR) << "Extraction in progress, missing call to " |
+             << "CancelPendingExtraction"; |
+} |
+ |
+// Reports the outcome of the extraction to the caller of ExtractFeatures() |
+// and then returns the extractor to its idle state. |
+void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
+ DCHECK(done_callback_.get()); |
+ done_callback_->Run(success); |
+ // Clear() resets done_callback_, so the callback is run at most once per |
+ // extraction. |
+ Clear(); |
+} |
+ |
+// Resets the per-extraction members to their idle values. |
+// page_feature_state_ is not touched here; it is replaced when the next |
+// extraction starts walking the main frame. |
+void PhishingDOMFeatureExtractor::Clear() { |
+ features_ = NULL; |
+ done_callback_.reset(NULL); |
+ cur_frame_data_.reset(NULL); |
+ cur_frame_ = NULL; |
+} |
+ |
+// Prepares cur_frame_data_ for the frame that cur_frame_ points at: captures |
+// the frame's element collection and the domain of the frame's URL. |
+// Returns false, leaving cur_frame_data_ unset, when the frame has no |
+// document; returns true otherwise. |
+bool PhishingDOMFeatureExtractor::ResetFrameData() { |
+  DCHECK(cur_frame_); |
+  DCHECK(!cur_frame_data_.get()); |
+ |
+  WebKit::WebDocument frame_document = cur_frame_->document(); |
+  const bool has_document = !frame_document.isNull(); |
+  if (has_document) { |
+    cur_frame_data_.reset(new FrameData()); |
+    cur_frame_data_->elements = frame_document.all(); |
+    cur_frame_data_->domain = |
+        net::RegistryControlledDomainService::GetDomainAndRegistry( |
+            cur_frame_->url()); |
+  } |
+  return has_document; |
+} |
+ |
+// Decides whether |url| belongs to a different domain than the frame that |
+// is currently being traversed. |
+// |
+// On return |*domain| holds the host of |url| if that host is an IP address, |
+// or otherwise its registry-controlled domain. |*domain| is left untouched |
+// when the current frame has no domain of its own; in that case the result |
+// is always false. The result is also false when no domain could be derived |
+// from |url|. |
+bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, |
+                                                   std::string* domain) const { |
+  DCHECK(domain); |
+  DCHECK(cur_frame_data_.get()); |
+ |
+  const std::string& frame_domain = cur_frame_data_->domain; |
+  if (frame_domain.empty()) { |
+    return false; |
+  } |
+ |
+  // TODO(bryner): Ensure that the url encoding is consistent with the features |
+  // in the model. |
+  *domain = url.HostIsIPAddress() |
+      ? url.host() |
+      : net::RegistryControlledDomainService::GetDomainAndRegistry(url); |
+ |
+  if (domain->empty()) { |
+    return false; |
+  } |
+  return *domain != frame_domain; |
+} |
+ |
+// Converts the tallies in page_feature_state_ into entries of the caller's |
+// feature map. The map is emptied first. Frequency features are only added |
+// when their denominator is non-zero, and boolean features are only added |
+// when they hold. |
+void PhishingDOMFeatureExtractor::InsertFeatures() { |
+  DCHECK(page_feature_state_.get()); |
+  features_->Clear(); |
+ |
+  const PageFeatureState& state = *page_feature_state_; |
+ |
+  // ---- Link features ---- |
+  if (state.total_links > 0) { |
+    const double total_links = static_cast<double>(state.total_links); |
+ |
+    // Fraction of links that lead to an external domain. |
+    features_->AddRealFeature(features::kPageExternalLinksFreq, |
+                              state.external_links / total_links); |
+ |
+    // One boolean feature per distinct external domain that is linked to. |
+    typedef base::hash_set<std::string>::const_iterator DomainIterator; |
+    for (DomainIterator domain = state.external_domains.begin(); |
+         domain != state.external_domains.end(); ++domain) { |
+      features_->AddBooleanFeature(features::kPageLinkDomain + *domain); |
+    } |
+ |
+    // Fraction of links that use https. |
+    features_->AddRealFeature(features::kPageSecureLinksFreq, |
+                              state.secure_links / total_links); |
+  } |
+ |
+  // ---- Presence of forms and of the various kinds of form inputs ---- |
+  if (state.num_forms > 0) { |
+    features_->AddBooleanFeature(features::kPageHasForms); |
+  } |
+  if (state.num_text_inputs > 0) { |
+    features_->AddBooleanFeature(features::kPageHasTextInputs); |
+  } |
+  if (state.num_pswd_inputs > 0) { |
+    features_->AddBooleanFeature(features::kPageHasPswdInputs); |
+  } |
+  if (state.num_radio_inputs > 0) { |
+    features_->AddBooleanFeature(features::kPageHasRadioInputs); |
+  } |
+  if (state.num_check_inputs > 0) { |
+    features_->AddBooleanFeature(features::kPageHasCheckInputs); |
+  } |
+ |
+  // ---- Fraction of form actions that point to a different domain ---- |
+  if (state.total_actions > 0) { |
+    const double total_actions = static_cast<double>(state.total_actions); |
+    features_->AddRealFeature(features::kPageActionOtherDomainFreq, |
+                              state.action_other_domain / total_actions); |
+  } |
+ |
+  // ---- Fraction of image sources that point to a different domain ---- |
+  if (state.total_imgs > 0) { |
+    const double total_imgs = static_cast<double>(state.total_imgs); |
+    features_->AddRealFeature(features::kPageImgOtherDomainFreq, |
+                              state.img_other_domain / total_imgs); |
+  } |
+ |
+  // ---- Script tag count, discretized for numerical stability ---- |
+  const int script_tags = state.num_script_tags; |
+  if (script_tags > 1) { |
+    features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
+  } |
+  if (script_tags > 6) { |
+    features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
+  } |
+} |
+ |
+} // namespace safe_browsing |