Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(671)

Unified Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 2878046: Add an extractor for DOM features to be used for client side phishing detection. (Closed)
Patch Set: address marria's comments Created 10 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8f4bd0d3c6d881a0fa7851a840c149ee32d57dc
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
@@ -0,0 +1,416 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
+
+#include "base/compiler_specific.h"
+#include "base/hash_tables.h"
+#include "base/histogram.h"
+#include "base/logging.h"
+#include "chrome/renderer/render_view.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "net/base/registry_controlled_domain.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebElement.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebString.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebView.h"
+
+namespace safe_browsing {
+
+// Running tallies accumulated while the extractor visits the elements of
+// each frame. InsertFeatures() turns these counters into the DOM features
+// that are described in features.h.
+struct PhishingDOMFeatureExtractor::PageFeatureState {
+  // ---- Links (<a> elements with an href) ----
+  // Links whose domain differs from the domain of the containing frame.
+  int external_links;
+  // The distinct domains seen among the external links.
+  base::hash_set<std::string> external_domains;
+  // Links whose resolved URL uses the https scheme.
+  int secure_links;
+  // All links for which a domain could be determined.
+  int total_links;
+
+  // ---- Forms and inputs ----
+  // Number of <form> elements seen.
+  int num_forms;
+  // <input> elements counted as text, password, radio and checkbox inputs.
+  int num_text_inputs;
+  int num_pswd_inputs;
+  int num_radio_inputs;
+  int num_check_inputs;
+  // Form actions that point to a different domain than the frame.
+  int action_other_domain;
+  // All form actions for which a domain could be determined.
+  int total_actions;
+
+  // ---- Images ----
+  // Images whose source is on a different domain than the frame.
+  int img_other_domain;
+  // All images for which a domain could be determined.
+  int total_imgs;
+
+  // ---- Scripts ----
+  // Number of <script> elements seen.
+  int num_script_tags;
+
+  // Every counter starts at zero; the domain set starts out empty.
+  PageFeatureState()
+      : external_links(0),
+        secure_links(0),
+        total_links(0),
+        num_forms(0),
+        num_text_inputs(0),
+        num_pswd_inputs(0),
+        num_radio_inputs(0),
+        num_check_inputs(0),
+        action_other_domain(0),
+        total_actions(0),
+        img_other_domain(0),
+        total_imgs(0),
+        num_script_tags(0) {}
+
+  ~PageFeatureState() {}
+};
+
+// State that is only valid while a single frame is being traversed.
+struct PhishingDOMFeatureExtractor::FrameData {
+  // The frame's document.all collection. It doubles as the traversal cursor:
+  // firstItem()/nextItem() are called on it to step through the elements, so
+  // it remembers how far into the document we have gotten.
+  WebKit::WebNodeCollection elements;
+
+  // Domain of the frame's URL, computed once when traversal of the frame
+  // starts so that each element does not need to recompute it.
+  std::string domain;
+};
+
+// Stores the RenderView pointer and binds the task factory to this object,
+// then puts all extraction state into its idle (cleared) configuration.
+PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
+    RenderView* render_view)
+    : render_view_(render_view),
+      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
+  Clear();
+}
+
+PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
+  // By the time we are destroyed the RenderView is expected to have called
+  // CancelPendingExtraction(); verify that no extraction is still in flight.
+  CheckNoPendingExtraction();
+}
+
+// Begins an asynchronous extraction. |features| receives the results and
+// |done_callback| is stored (via reset()) and run once extraction finishes.
+// The actual work is posted to the current message loop rather than being
+// done inline.
+void PhishingDOMFeatureExtractor::ExtractFeatures(
+    FeatureMap* features,
+    DoneCallback* done_callback) {
+  // Callers are expected to have cancelled any earlier extraction. This is
+  // DCHECKed (and logged) here ...
+  CheckNoPendingExtraction();
+  // ... but in an opt build we still tear down whatever was pending, so that
+  // the new extraction always starts from a known state.
+  CancelPendingExtraction();
+
+  // Record where to put the results and whom to notify.
+  features_ = features;
+  done_callback_.reset(done_callback);
+
+  // Kick off the traversal from the message loop.
+  MessageLoop::current()->PostTask(
+      FROM_HERE,
+      method_factory_.NewRunnableMethod(
+          &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
+}
+
+// Abandons any extraction that is in progress without running its callback.
+void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
+  // Revoke tasks that were posted but have not run yet ...
+  method_factory_.RevokeAll();
+  // ... and drop all per-extraction state.
+  Clear();
+}
+
+// Performs the DOM traversal for the current extraction. Visits every frame
+// reachable from the WebView's main frame via traverseNext(), dispatches each
+// <a>, <form>, <img>, <input> and <script> element to its Handle*() method,
+// then writes the features and runs the done callback with |true|.
+//
+// The traversal position is kept in |cur_frame_| and |cur_frame_data_| so the
+// function is written to be resumable, but as the TODO below notes there is
+// not yet any time-based early exit: a call currently runs to completion.
+void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
+ // A null |cur_frame_| means this is the start of a new extraction: find the
+ // main frame and create fresh page-level counters.
+ if (!cur_frame_) {
+ WebKit::WebView* web_view = render_view_->webview();
+ if (!web_view) {
+ // When the WebView is going away, the render view should have called
+ // CancelPendingExtraction() which should have stopped any pending work,
+ // so this case should not happen.
+ NOTREACHED();
+ RunCallback(false);
+ return;
+ }
+ cur_frame_ = web_view->mainFrame();
+ page_feature_state_.reset(new PageFeatureState);
+ }
+
+ // Outer loop: one iteration per frame. The loop ends when traverseNext()
+ // yields a null frame.
+ for (; cur_frame_;
+ cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
+ WebKit::WebNode cur_node;
+ if (cur_frame_data_.get()) {
+ // We're resuming traversal of a frame, so just advance to the next node.
+ cur_node = cur_frame_data_->elements.nextItem();
+ } else {
+ // We just moved to a new frame, so update our frame state
+ // and advance to the first element.
+ if (!ResetFrameData()) {
+ // Nothing in this frame, move on to the next one.
+ LOG(WARNING) << "No content in frame, skipping";
+ continue;
+ }
+ cur_node = cur_frame_data_->elements.firstItem();
+ }
+
+ // Inner loop: one iteration per item of the frame's document.all
+ // collection, ending when the collection returns a null node.
+ for (; !cur_node.isNull();
+ cur_node = cur_frame_data_->elements.nextItem()) {
+ if (!cur_node.isElementNode()) {
+ continue;
+ }
+ // Dispatch on tag name; elements with any other tag are ignored.
+ WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
+ if (element.hasTagName("a")) {
+ HandleLink(element);
+ } else if (element.hasTagName("form")) {
+ HandleForm(element);
+ } else if (element.hasTagName("img")) {
+ HandleImage(element);
+ } else if (element.hasTagName("input")) {
+ HandleInput(element);
+ } else if (element.hasTagName("script")) {
+ HandleScript(element);
+ }
+
+ // TODO(bryner): stop if too much time has elapsed, and add histograms
+ // for the time spent processing.
+ }
+
+ // We're done with this frame, recalculate the FrameData when we
+ // advance to the next frame.
+ cur_frame_data_.reset();
+ }
+
+ // All frames have been visited: convert the counters into features and
+ // report success. RunCallback() also clears the extraction state.
+ InsertFeatures();
+ RunCallback(true);
+}
+
+// Tallies one <a> element. Anchors without an href, and anchors whose domain
+// cannot be determined, are ignored. Every other anchor counts toward
+// total_links, and additionally toward external_links / external_domains
+// and secure_links when applicable.
+void PhishingDOMFeatureExtractor::HandleLink(
+    const WebKit::WebElement& element) {
+  if (!element.hasAttribute("href")) {
+    DLOG(INFO) << "Skipping anchor tag with no href";
+    return;
+  }
+
+  // The href may be relative, so resolve it against the owning document.
+  WebKit::WebURL resolved_url = element.document().completeURL(
+      element.getAttribute("href"));
+
+  std::string link_domain;
+  const bool points_elsewhere = IsExternalDomain(resolved_url, &link_domain);
+  if (link_domain.empty()) {
+    LOG(ERROR) << "Could not extract domain from link: " << resolved_url;
+    return;
+  }
+
+  PageFeatureState* state = page_feature_state_.get();
+  if (points_elsewhere) {
+    // The link goes to a different domain; remember both the fact and the
+    // (de-duplicated) domain itself.
+    ++state->external_links;
+    state->external_domains.insert(link_domain);
+  }
+
+  // Keep track of how many of the links are https.
+  if (GURL(resolved_url).SchemeIs("https")) {
+    ++state->secure_links;
+  }
+
+  ++state->total_links;
+}
+
+// Tallies one <form> element. Every form counts toward num_forms. Forms that
+// have an action whose domain can be determined also count toward
+// total_actions, and toward action_other_domain when that domain differs
+// from the frame's.
+void PhishingDOMFeatureExtractor::HandleForm(
+    const WebKit::WebElement& element) {
+  PageFeatureState* state = page_feature_state_.get();
+
+  // The form is counted whether or not it has an action.
+  ++state->num_forms;
+
+  // Without an action there is no target domain to compare.
+  if (!element.hasAttribute("action")) {
+    return;
+  }
+
+  // Resolve the (possibly relative) action against the owning document.
+  WebKit::WebURL action_url = element.document().completeURL(
+      element.getAttribute("action"));
+
+  std::string action_domain;
+  const bool points_elsewhere = IsExternalDomain(action_url, &action_domain);
+  if (action_domain.empty()) {
+    LOG(ERROR) << "Could not extract domain from form action: " << action_url;
+    return;
+  }
+
+  if (points_elsewhere) {
+    ++state->action_other_domain;
+  }
+  ++state->total_actions;
+}
+
+// Tallies one <img> element. Images without a src attribute, and images whose
+// domain cannot be determined, are ignored. Every other image counts toward
+// total_imgs, and toward img_other_domain when its source is on a different
+// domain than the containing frame.
+void PhishingDOMFeatureExtractor::HandleImage(
+    const WebKit::WebElement& element) {
+  if (!element.hasAttribute("src")) {
+    DLOG(INFO) << "Skipping img tag with no src";
+    // Actually skip the element, as the log message says and as HandleLink()
+    // does for anchors without an href. Falling through would hand the value
+    // of a missing attribute to completeURL() and could count an image that
+    // has no source at all.
+    return;
+  }
+
+  // Record whether the image points to a different domain. The src may be
+  // relative, so resolve it against the owning document first.
+  WebKit::WebURL full_url = element.document().completeURL(
+      element.getAttribute("src"));
+  std::string domain;
+  bool is_external = IsExternalDomain(full_url, &domain);
+  if (domain.empty()) {
+    LOG(ERROR) << "Could not extract domain from image src: " << full_url;
+    return;
+  }
+
+  if (is_external) {
+    ++page_feature_state_->img_other_domain;
+  }
+  ++page_feature_state_->total_imgs;
+}
+
+// Tallies one <input> element according to its (case-insensitive) type
+// attribute: password, radio and checkbox each have their own counter;
+// submit, reset, file, hidden, image and button are not counted; anything
+// else, including a missing type, is counted as a text input.
+void PhishingDOMFeatureExtractor::HandleInput(
+    const WebKit::WebElement& element) {
+  // We deliberately look at the raw attribute value instead of
+  // WebFormControlElement::formControlType(), to stay consistent with the way
+  // the phishing classification model is created.
+  std::string input_type = element.getAttribute("type").utf8();
+  StringToLowerASCII(&input_type);
+
+  PageFeatureState* state = page_feature_state_.get();
+  if (input_type == "password") {
+    ++state->num_pswd_inputs;
+    return;
+  }
+  if (input_type == "radio") {
+    ++state->num_radio_inputs;
+    return;
+  }
+  if (input_type == "checkbox") {
+    ++state->num_check_inputs;
+    return;
+  }
+
+  // Input types that cannot be used to type in free-form data.
+  const bool is_non_text_control =
+      input_type == "submit" || input_type == "reset" ||
+      input_type == "file" || input_type == "hidden" ||
+      input_type == "image" || input_type == "button";
+  if (is_non_text_control) {
+    return;
+  }
+
+  // Per the HTML spec an unspecified type defaults to text, and unrecognized
+  // types are treated as text as well. That includes the newer HTML5 input
+  // types, which for now are considered text inputs since they could be used
+  // to capture user input.
+  ++state->num_text_inputs;
+}
+
+// Tallies one <script> element. Only the number of script tags matters, so
+// |element| itself is not inspected.
+void PhishingDOMFeatureExtractor::HandleScript(
+    const WebKit::WebElement& element) {
+  PageFeatureState* state = page_feature_state_.get();
+  ++state->num_script_tags;
+}
+
+// Verifies that the extractor is idle: no stored callback, no per-frame data
+// and no current frame. Violations trip a DCHECK and are logged as an error.
+void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
+  DCHECK(!done_callback_.get());
+  DCHECK(!cur_frame_data_.get());
+  DCHECK(!cur_frame_);
+
+  // Any leftover piece of state means an extraction was never cancelled.
+  const bool extraction_pending =
+      done_callback_.get() || cur_frame_data_.get() || cur_frame_;
+  if (!extraction_pending) {
+    return;
+  }
+  LOG(ERROR) << "Extraction in progress, missing call to "
+             << "CancelPendingExtraction";
+}
+
+// Reports the outcome of the extraction to the stored done callback, then
+// returns the extractor to its idle state.
+void PhishingDOMFeatureExtractor::RunCallback(bool success) {
+  // A callback must have been supplied through ExtractFeatures().
+  DCHECK(done_callback_.get());
+  done_callback_->Run(success);
+
+  // The extraction is over; drop the callback and the traversal state.
+  Clear();
+}
+
+// Resets the per-extraction members to their idle values: no output map, no
+// callback, no per-frame data and no current frame.
+void PhishingDOMFeatureExtractor::Clear() {
+  // Non-owning raw pointers are simply nulled out.
+  features_ = NULL;
+  cur_frame_ = NULL;
+
+  // Release the stored callback and the per-frame traversal state.
+  done_callback_.reset(NULL);
+  cur_frame_data_.reset(NULL);
+}
+
+// Prepares |cur_frame_data_| for traversing |cur_frame_|. Returns false, and
+// leaves |cur_frame_data_| unset, when the frame has no document; returns
+// true once the element collection and the frame's domain have been stored.
+bool PhishingDOMFeatureExtractor::ResetFrameData() {
+  DCHECK(cur_frame_);
+  DCHECK(!cur_frame_data_.get());
+
+  WebKit::WebDocument document = cur_frame_->document();
+  if (document.isNull()) {
+    return false;
+  }
+
+  cur_frame_data_.reset(new FrameData());
+  FrameData* frame_data = cur_frame_data_.get();
+
+  // document.all serves as the cursor over the frame's elements.
+  frame_data->elements = document.all();
+
+  // Cache the frame's domain for the external-domain comparisons.
+  frame_data->domain =
+      net::RegistryControlledDomainService::GetDomainAndRegistry(
+          cur_frame_->url());
+  return true;
+}
+
+// Determines the domain of |url| and whether it differs from the domain of
+// the frame currently being traversed.
+//
+// On return |*domain| holds the host (for IP-address URLs) or the
+// domain-and-registry of |url|. It is left untouched when the frame's own
+// domain is empty, in which case the function returns false. Otherwise the
+// result is true exactly when |*domain| is non-empty and not equal to the
+// frame's domain.
+bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
+                                                   std::string* domain) const {
+  DCHECK(domain);
+  DCHECK(cur_frame_data_.get());
+
+  const std::string& frame_domain = cur_frame_data_->domain;
+  if (frame_domain.empty()) {
+    // There is nothing to compare against.
+    return false;
+  }
+
+  // TODO(bryner): Ensure that the url encoding is consistent with the features
+  // in the model.
+  domain->assign(
+      url.HostIsIPAddress()
+          ? url.host()
+          : net::RegistryControlledDomainService::GetDomainAndRegistry(url));
+
+  if (domain->empty()) {
+    return false;
+  }
+  return *domain != frame_domain;
+}
+
+// Converts the counters in |page_feature_state_| into features and stores
+// them in |features_|, discarding whatever the map held before. Ratio
+// features are only emitted when their denominator is non-zero; boolean
+// features are only emitted when their condition holds.
+void PhishingDOMFeatureExtractor::InsertFeatures() {
+  DCHECK(page_feature_state_.get());
+  const PageFeatureState& state = *page_feature_state_;
+  features_->Clear();
+
+  if (state.total_links > 0) {
+    // Fraction of links that go to an external domain rather than an
+    // internal one.
+    const double external_fraction =
+        static_cast<double>(state.external_links) / state.total_links;
+    features_->AddRealFeature(features::kPageExternalLinksFreq,
+                              external_fraction);
+
+    // One boolean feature per distinct domain that the page links to.
+    typedef base::hash_set<std::string>::const_iterator DomainIterator;
+    for (DomainIterator domain = state.external_domains.begin();
+         domain != state.external_domains.end(); ++domain) {
+      features_->AddBooleanFeature(features::kPageLinkDomain + *domain);
+    }
+
+    // Fraction of links that are https.
+    const double secure_fraction =
+        static_cast<double>(state.secure_links) / state.total_links;
+    features_->AddRealFeature(features::kPageSecureLinksFreq,
+                              secure_fraction);
+  }
+
+  // Presence of forms and of the various kinds of form inputs.
+  if (state.num_forms > 0) {
+    features_->AddBooleanFeature(features::kPageHasForms);
+  }
+  if (state.num_text_inputs > 0) {
+    features_->AddBooleanFeature(features::kPageHasTextInputs);
+  }
+  if (state.num_pswd_inputs > 0) {
+    features_->AddBooleanFeature(features::kPageHasPswdInputs);
+  }
+  if (state.num_radio_inputs > 0) {
+    features_->AddBooleanFeature(features::kPageHasRadioInputs);
+  }
+  if (state.num_check_inputs > 0) {
+    features_->AddBooleanFeature(features::kPageHasCheckInputs);
+  }
+
+  // Fraction of form actions that target a different domain.
+  if (state.total_actions > 0) {
+    const double action_fraction =
+        static_cast<double>(state.action_other_domain) / state.total_actions;
+    features_->AddRealFeature(features::kPageActionOtherDomainFreq,
+                              action_fraction);
+  }
+
+  // Fraction of image sources that are on a different domain.
+  if (state.total_imgs > 0) {
+    const double image_fraction =
+        static_cast<double>(state.img_other_domain) / state.total_imgs;
+    features_->AddRealFeature(features::kPageImgOtherDomainFreq,
+                              image_fraction);
+  }
+
+  // Number of script tags (discretized for numerical stability.)
+  if (state.num_script_tags > 1) {
+    features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+    if (state.num_script_tags > 6) {
+      features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
+    }
+  }
+}
+
+} // namespace safe_browsing

Powered by Google App Engine
This is Rietveld 408576698