Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(8328)

Unified Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
deleted file mode 100644
index c6cd00149d45d10d7a5f6f18b48c7fd3635e2106..0000000000000000000000000000000000000000
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
+++ /dev/null
@@ -1,502 +0,0 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
-
-#include "base/bind.h"
-#include "base/compiler_specific.h"
-#include "base/containers/hash_tables.h"
-#include "base/location.h"
-#include "base/logging.h"
-#include "base/metrics/histogram_macros.h"
-#include "base/single_thread_task_runner.h"
-#include "base/strings/string_util.h"
-#include "base/threading/thread_task_runner_handle.h"
-#include "base/time/time.h"
-#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
-#include "chrome/renderer/safe_browsing/features.h"
-#include "content/public/renderer/render_view.h"
-#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
-#include "third_party/WebKit/public/platform/WebString.h"
-#include "third_party/WebKit/public/web/WebElement.h"
-#include "third_party/WebKit/public/web/WebElementCollection.h"
-#include "third_party/WebKit/public/web/WebLocalFrame.h"
-#include "third_party/WebKit/public/web/WebView.h"
-
-namespace safe_browsing {
-
-// This time should be short enough that it doesn't noticeably disrupt the
-// user's interaction with the page.
-const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
-
-// Experimenting shows that we get a reasonable gain in performance by
-// increasing this up to around 10, but there's not much benefit in
-// increasing it past that.
-const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
-
-// This should be longer than we expect feature extraction to take on any
-// actual phishing page.
-const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
-
-// Intermediate state used for computing features. See features.h for
-// descriptions of the DOM features that are computed.
-struct PhishingDOMFeatureExtractor::PageFeatureState {
- // Link related features
- int external_links;
- base::hash_set<std::string> external_domains;
- int secure_links;
- int total_links;
-
- // Form related features
- int num_forms;
- int num_text_inputs;
- int num_pswd_inputs;
- int num_radio_inputs;
- int num_check_inputs;
- int action_other_domain;
- int total_actions;
- base::hash_set<std::string> page_action_urls;
-
- // Image related features
- int img_other_domain;
- int total_imgs;
-
- // How many script tags
- int num_script_tags;
-
- // The time at which we started feature extraction for the current page.
- base::TimeTicks start_time;
-
- // The number of iterations we've done for the current extraction.
- int num_iterations;
-
- explicit PageFeatureState(base::TimeTicks start_time_ticks)
- : external_links(0),
- secure_links(0),
- total_links(0),
- num_forms(0),
- num_text_inputs(0),
- num_pswd_inputs(0),
- num_radio_inputs(0),
- num_check_inputs(0),
- action_other_domain(0),
- total_actions(0),
- img_other_domain(0),
- total_imgs(0),
- num_script_tags(0),
- start_time(start_time_ticks),
- num_iterations(0) {}
-
- ~PageFeatureState() {}
-};
-
-// Per-frame state
-struct PhishingDOMFeatureExtractor::FrameData {
- // This is our reference to document.all, which is an iterator over all
- // of the elements in the document. It keeps track of our current position.
- blink::WebElementCollection elements;
- // The domain of the document URL, stored here so that we don't need to
- // recompute it every time it's needed.
- std::string domain;
-};
-
-PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
- FeatureExtractorClock* clock)
- : clock_(clock), weak_factory_(this) {
- Clear();
-}
-
-PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
- // The RenderView should have called CancelPendingExtraction() before
- // we are destroyed.
- CheckNoPendingExtraction();
-}
-
-void PhishingDOMFeatureExtractor::ExtractFeatures(
- blink::WebDocument document,
- FeatureMap* features,
- const DoneCallback& done_callback) {
- // The RenderView should have called CancelPendingExtraction() before
- // starting a new extraction, so DCHECK this.
- CheckNoPendingExtraction();
- // However, in an opt build, we will go ahead and clean up the pending
- // extraction so that we can start in a known state.
- CancelPendingExtraction();
-
- features_ = features;
- done_callback_ = done_callback;
-
- page_feature_state_.reset(new PageFeatureState(clock_->Now()));
- cur_document_ = document;
-
- base::ThreadTaskRunnerHandle::Get()->PostTask(
- FROM_HERE,
- base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
- weak_factory_.GetWeakPtr()));
-}
-
-void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
- // Cancel any pending callbacks, and clear our state.
- weak_factory_.InvalidateWeakPtrs();
- Clear();
-}
-
-void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
- DCHECK(page_feature_state_.get());
- ++page_feature_state_->num_iterations;
- base::TimeTicks current_chunk_start_time = clock_->Now();
-
- if (cur_document_.isNull()) {
- // This will only happen if we weren't able to get the document for the
- // main frame. We'll treat this as an extraction failure.
- RunCallback(false);
- return;
- }
-
- int num_elements = 0;
- for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
- blink::WebElement cur_element;
- if (cur_frame_data_.get()) {
- // We're resuming traversal of a frame, so just advance to the next
- // element.
- cur_element = cur_frame_data_->elements.nextItem();
- // When we resume the traversal, the first call to nextItem() potentially
- // has to walk through the document again from the beginning, if it was
- // modified between our chunks of work. Log how long this takes, so we
- // can tell if it's too slow.
- UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
- clock_->Now() - current_chunk_start_time);
- } else {
- // We just moved to a new frame, so update our frame state
- // and advance to the first element.
- ResetFrameData();
- cur_element = cur_frame_data_->elements.firstItem();
- }
-
- for (; !cur_element.isNull();
- cur_element = cur_frame_data_->elements.nextItem()) {
- if (cur_element.hasHTMLTagName("a")) {
- HandleLink(cur_element);
- } else if (cur_element.hasHTMLTagName("form")) {
- HandleForm(cur_element);
- } else if (cur_element.hasHTMLTagName("img")) {
- HandleImage(cur_element);
- } else if (cur_element.hasHTMLTagName("input")) {
- HandleInput(cur_element);
- } else if (cur_element.hasHTMLTagName("script")) {
- HandleScript(cur_element);
- }
-
- if (++num_elements >= kClockCheckGranularity) {
- num_elements = 0;
- base::TimeTicks now = clock_->Now();
- if (now - page_feature_state_->start_time >=
- base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
- DLOG(ERROR) << "Feature extraction took too long, giving up";
- // We expect this to happen infrequently, so record when it does.
- UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
- RunCallback(false);
- return;
- }
- base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
- if (chunk_elapsed >=
- base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
- // The time limit for the current chunk is up, so post a task to
- // continue extraction.
- //
- // Record how much time we actually spent on the chunk. If this is
- // much higher than kMaxTimePerChunkMs, we may need to adjust the
- // clock granularity.
- UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
- chunk_elapsed);
- base::ThreadTaskRunnerHandle::Get()->PostTask(
- FROM_HERE,
- base::Bind(
- &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
- weak_factory_.GetWeakPtr()));
- return;
- }
- // Otherwise, continue.
- }
- }
-
- // We're done with this frame, recalculate the FrameData when we
- // advance to the next frame.
- cur_frame_data_.reset();
- }
-
- InsertFeatures();
- RunCallback(true);
-}
-
-void PhishingDOMFeatureExtractor::HandleLink(
- const blink::WebElement& element) {
- // Count the number of times we link to a different host.
- if (!element.hasAttribute("href")) {
- DVLOG(1) << "Skipping anchor tag with no href";
- return;
- }
-
- // Retrieve the link and resolve the link in case it's relative.
- blink::WebURL full_url = CompleteURL(element, element.getAttribute("href"));
-
- std::string domain;
- bool is_external = IsExternalDomain(full_url, &domain);
- if (domain.empty()) {
- DVLOG(1) << "Could not extract domain from link: " << full_url;
- return;
- }
-
- if (is_external) {
- ++page_feature_state_->external_links;
-
- // Record each unique domain that we link to.
- page_feature_state_->external_domains.insert(domain);
- }
-
- // Check how many are https links.
- if (GURL(full_url).SchemeIs("https")) {
- ++page_feature_state_->secure_links;
- }
-
- ++page_feature_state_->total_links;
-}
-
-void PhishingDOMFeatureExtractor::HandleForm(
- const blink::WebElement& element) {
- // Increment the number of forms on this page.
- ++page_feature_state_->num_forms;
-
- // Record whether the action points to a different domain.
- if (!element.hasAttribute("action")) {
- return;
- }
-
- blink::WebURL full_url = CompleteURL(element, element.getAttribute("action"));
-
- page_feature_state_->page_action_urls.insert(full_url.string().utf8());
-
- std::string domain;
- bool is_external = IsExternalDomain(full_url, &domain);
- if (domain.empty()) {
- DVLOG(1) << "Could not extract domain from form action: " << full_url;
- return;
- }
-
- if (is_external) {
- ++page_feature_state_->action_other_domain;
- }
- ++page_feature_state_->total_actions;
-}
-
-void PhishingDOMFeatureExtractor::HandleImage(
- const blink::WebElement& element) {
- if (!element.hasAttribute("src")) {
- DVLOG(1) << "Skipping img tag with no src";
- }
-
- // Record whether the image points to a different domain.
- blink::WebURL full_url = CompleteURL(element, element.getAttribute("src"));
- std::string domain;
- bool is_external = IsExternalDomain(full_url, &domain);
- if (domain.empty()) {
- DVLOG(1) << "Could not extract domain from image src: " << full_url;
- return;
- }
-
- if (is_external) {
- ++page_feature_state_->img_other_domain;
- }
- ++page_feature_state_->total_imgs;
-}
-
-void PhishingDOMFeatureExtractor::HandleInput(
- const blink::WebElement& element) {
- // The HTML spec says that if the type is unspecified, it defaults to text.
- // In addition, any unrecognized type will be treated as a text input.
- //
- // Note that we use the attribute value rather than
- // WebFormControlElement::formControlType() for consistency with the
- // way the phishing classification model is created.
- std::string type = base::ToLowerASCII(element.getAttribute("type").utf8());
- if (type == "password") {
- ++page_feature_state_->num_pswd_inputs;
- } else if (type == "radio") {
- ++page_feature_state_->num_radio_inputs;
- } else if (type == "checkbox") {
- ++page_feature_state_->num_check_inputs;
- } else if (type != "submit" && type != "reset" && type != "file" &&
- type != "hidden" && type != "image" && type != "button") {
- // Note that there are a number of new input types in HTML5 that are not
- // handled above. For now, we will consider these as text inputs since
- // they could be used to capture user input.
- ++page_feature_state_->num_text_inputs;
- }
-}
-
-void PhishingDOMFeatureExtractor::HandleScript(
- const blink::WebElement& element) {
- ++page_feature_state_->num_script_tags;
-}
-
-void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
- DCHECK(done_callback_.is_null());
- DCHECK(!cur_frame_data_.get());
- DCHECK(cur_document_.isNull());
- if (!done_callback_.is_null() || cur_frame_data_.get() ||
- !cur_document_.isNull()) {
- LOG(ERROR) << "Extraction in progress, missing call to "
- << "CancelPendingExtraction";
- }
-}
-
-void PhishingDOMFeatureExtractor::RunCallback(bool success) {
- // Record some timing stats that we can use to evaluate feature extraction
- // performance. These include both successful and failed extractions.
- DCHECK(page_feature_state_.get());
- UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
- page_feature_state_->num_iterations);
- UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
- clock_->Now() - page_feature_state_->start_time);
-
- DCHECK(!done_callback_.is_null());
- done_callback_.Run(success);
- Clear();
-}
-
-void PhishingDOMFeatureExtractor::Clear() {
- features_ = NULL;
- done_callback_.Reset();
- cur_frame_data_.reset(NULL);
- cur_document_.reset();
-}
-
-void PhishingDOMFeatureExtractor::ResetFrameData() {
- DCHECK(!cur_document_.isNull());
- DCHECK(!cur_frame_data_.get());
-
- cur_frame_data_.reset(new FrameData());
- cur_frame_data_->elements = cur_document_.all();
- cur_frame_data_->domain =
- net::registry_controlled_domains::GetDomainAndRegistry(
- cur_document_.url(),
- net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
-}
-
-blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
- DCHECK(!cur_document_.isNull());
- blink::WebFrame* frame = cur_document_.frame();
- // Advance to the next frame that contains a document, with no wrapping.
- if (frame) {
- for (frame = frame->traverseNext(); frame; frame = frame->traverseNext()) {
- if (!frame->document().isNull()) {
- return frame->document();
- }
- }
- } else {
- // Keep track of how often frame traversal got "stuck" due to the
- // current subdocument getting removed from the frame tree.
- UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
- }
- return blink::WebDocument();
-}
-
-bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
- std::string* domain) const {
- DCHECK(domain);
- DCHECK(cur_frame_data_.get());
-
- if (cur_frame_data_->domain.empty()) {
- return false;
- }
-
- // TODO(bryner): Ensure that the url encoding is consistent with the features
- // in the model.
- if (url.HostIsIPAddress()) {
- domain->assign(url.host());
- } else {
- domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
- url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
- }
-
- return !domain->empty() && *domain != cur_frame_data_->domain;
-}
-
-blink::WebURL PhishingDOMFeatureExtractor::CompleteURL(
- const blink::WebElement& element,
- const blink::WebString& partial_url) {
- return element.document().completeURL(partial_url);
-}
-
-void PhishingDOMFeatureExtractor::InsertFeatures() {
- DCHECK(page_feature_state_.get());
-
- if (page_feature_state_->total_links > 0) {
- // Add a feature for the fraction of times the page links to an external
- // domain vs. an internal domain.
- double link_freq = static_cast<double>(
- page_feature_state_->external_links) /
- page_feature_state_->total_links;
- features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
-
- // Add a feature for each unique domain that we're linking to
- for (const auto& domain : page_feature_state_->external_domains) {
- features_->AddBooleanFeature(features::kPageLinkDomain + domain);
- }
-
- // Fraction of links that use https.
- double secure_freq = static_cast<double>(
- page_feature_state_->secure_links) / page_feature_state_->total_links;
- features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
- }
-
- // Record whether forms appear and whether various form elements appear.
- if (page_feature_state_->num_forms > 0) {
- features_->AddBooleanFeature(features::kPageHasForms);
- }
- if (page_feature_state_->num_text_inputs > 0) {
- features_->AddBooleanFeature(features::kPageHasTextInputs);
- }
- if (page_feature_state_->num_pswd_inputs > 0) {
- features_->AddBooleanFeature(features::kPageHasPswdInputs);
- }
- if (page_feature_state_->num_radio_inputs > 0) {
- features_->AddBooleanFeature(features::kPageHasRadioInputs);
- }
- if (page_feature_state_->num_check_inputs > 0) {
- features_->AddBooleanFeature(features::kPageHasCheckInputs);
- }
-
- // Record fraction of form actions that point to a different domain.
- if (page_feature_state_->total_actions > 0) {
- double action_freq = static_cast<double>(
- page_feature_state_->action_other_domain) /
- page_feature_state_->total_actions;
- features_->AddRealFeature(features::kPageActionOtherDomainFreq,
- action_freq);
- }
-
- // Add a feature for each unique external action url.
- for (const auto& url : page_feature_state_->page_action_urls) {
- features_->AddBooleanFeature(features::kPageActionURL + url);
- }
-
- // Record how many image src attributes point to a different domain.
- if (page_feature_state_->total_imgs > 0) {
- double img_freq = static_cast<double>(
- page_feature_state_->img_other_domain) /
- page_feature_state_->total_imgs;
- features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
- }
-
- // Record number of script tags (discretized for numerical stability.)
- if (page_feature_state_->num_script_tags > 1) {
- features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
- if (page_feature_state_->num_script_tags > 6) {
- features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
- }
- }
-}
-
-} // namespace safe_browsing

Powered by Google App Engine
This is Rietveld 408576698