| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/renderer/safe_browsing/phishing_classifier.h" | |
| 6 | |
| 7 #include <string> | |
| 8 | |
| 9 #include "base/bind.h" | |
| 10 #include "base/callback.h" | |
| 11 #include "base/compiler_specific.h" | |
| 12 #include "base/location.h" | |
| 13 #include "base/logging.h" | |
| 14 #include "base/metrics/histogram_macros.h" | |
| 15 #include "base/single_thread_task_runner.h" | |
| 16 #include "base/strings/string_util.h" | |
| 17 #include "base/threading/thread_task_runner_handle.h" | |
| 18 #include "chrome/common/safe_browsing/csd.pb.h" | |
| 19 #include "chrome/common/url_constants.h" | |
| 20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | |
| 21 #include "chrome/renderer/safe_browsing/features.h" | |
| 22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | |
| 23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" | |
| 24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" | |
| 25 #include "chrome/renderer/safe_browsing/scorer.h" | |
| 26 #include "content/public/renderer/render_frame.h" | |
| 27 #include "crypto/sha2.h" | |
| 28 #include "third_party/WebKit/public/platform/WebURL.h" | |
| 29 #include "third_party/WebKit/public/platform/WebURLRequest.h" | |
| 30 #include "third_party/WebKit/public/web/WebDataSource.h" | |
| 31 #include "third_party/WebKit/public/web/WebDocument.h" | |
| 32 #include "third_party/WebKit/public/web/WebLocalFrame.h" | |
| 33 #include "third_party/WebKit/public/web/WebView.h" | |
| 34 #include "url/gurl.h" | |
| 35 | |
| 36 namespace safe_browsing { | |
| 37 | |
| 38 const float PhishingClassifier::kInvalidScore = -1.0; | |
| 39 const float PhishingClassifier::kPhishyThreshold = 0.5; | |
| 40 | |
| 41 namespace { | |
// Result of the "should this page be classified?" checks. The values are
// recorded in a UMA histogram, so existing entries must not be reordered or
// renumbered.
enum SkipClassificationReason {
  // Every check passed and feature extraction was started.
  CLASSIFICATION_PROCEED = 0,
  // The document URL does not use the http: scheme.
  SKIP_HTTPS = 1,
  // The page's request method is something other than GET.
  SKIP_NONE_GET = 2,
  // Boundary value handed to the histogram macro; not a reason itself.
  SKIP_REASON_MAX
};
| 49 | |
| 50 void RecordReasonForSkippingClassificationToUMA( | |
| 51 SkipClassificationReason reason) { | |
| 52 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.SkipClassificationReason", | |
| 53 reason, | |
| 54 SKIP_REASON_MAX); | |
| 55 } | |
| 56 | |
| 57 } // namespace | |
| 58 | |
| 59 PhishingClassifier::PhishingClassifier(content::RenderFrame* render_frame, | |
| 60 FeatureExtractorClock* clock) | |
| 61 : render_frame_(render_frame), | |
| 62 scorer_(NULL), | |
| 63 clock_(clock), | |
| 64 weak_factory_(this) { | |
| 65 Clear(); | |
| 66 } | |
| 67 | |
| 68 PhishingClassifier::~PhishingClassifier() { | |
| 69 // The RenderView should have called CancelPendingClassification() before | |
| 70 // we are destroyed. | |
| 71 CheckNoPendingClassification(); | |
| 72 } | |
| 73 | |
| 74 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) { | |
| 75 CheckNoPendingClassification(); | |
| 76 scorer_ = scorer; | |
| 77 if (scorer_) { | |
| 78 url_extractor_.reset(new PhishingUrlFeatureExtractor); | |
| 79 dom_extractor_.reset(new PhishingDOMFeatureExtractor(clock_.get())); | |
| 80 term_extractor_.reset(new PhishingTermFeatureExtractor( | |
| 81 &scorer_->page_terms(), | |
| 82 &scorer_->page_words(), | |
| 83 scorer_->max_words_per_term(), | |
| 84 scorer_->murmurhash3_seed(), | |
| 85 scorer_->max_shingles_per_page(), | |
| 86 scorer_->shingle_size(), | |
| 87 clock_.get())); | |
| 88 } else { | |
| 89 // We're disabling client-side phishing detection, so tear down all | |
| 90 // of the relevant objects. | |
| 91 url_extractor_.reset(); | |
| 92 dom_extractor_.reset(); | |
| 93 term_extractor_.reset(); | |
| 94 } | |
| 95 } | |
| 96 | |
| 97 bool PhishingClassifier::is_ready() const { | |
| 98 return scorer_ != NULL; | |
| 99 } | |
| 100 | |
| 101 void PhishingClassifier::BeginClassification( | |
| 102 const base::string16* page_text, | |
| 103 const DoneCallback& done_callback) { | |
| 104 DCHECK(is_ready()); | |
| 105 | |
| 106 // The RenderView should have called CancelPendingClassification() before | |
| 107 // starting a new classification, so DCHECK this. | |
| 108 CheckNoPendingClassification(); | |
| 109 // However, in an opt build, we will go ahead and clean up the pending | |
| 110 // classification so that we can start in a known state. | |
| 111 CancelPendingClassification(); | |
| 112 | |
| 113 page_text_ = page_text; | |
| 114 done_callback_ = done_callback; | |
| 115 | |
| 116 // For consistency, we always want to invoke the DoneCallback | |
| 117 // asynchronously, rather than directly from this method. To ensure that | |
| 118 // this is the case, post a task to begin feature extraction on the next | |
| 119 // iteration of the message loop. | |
| 120 base::ThreadTaskRunnerHandle::Get()->PostTask( | |
| 121 FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction, | |
| 122 weak_factory_.GetWeakPtr())); | |
| 123 } | |
| 124 | |
| 125 void PhishingClassifier::BeginFeatureExtraction() { | |
| 126 blink::WebLocalFrame* frame = render_frame_->GetWebFrame(); | |
| 127 | |
| 128 // Check whether the URL is one that we should classify. | |
| 129 // Currently, we only classify http: URLs that are GET requests. | |
| 130 GURL url(frame->document().url()); | |
| 131 if (!url.SchemeIs(url::kHttpScheme)) { | |
| 132 RecordReasonForSkippingClassificationToUMA(SKIP_HTTPS); | |
| 133 RunFailureCallback(); | |
| 134 return; | |
| 135 } | |
| 136 | |
| 137 blink::WebDataSource* ds = frame->dataSource(); | |
| 138 if (!ds || ds->getRequest().httpMethod().ascii() != "GET") { | |
| 139 if (ds) | |
| 140 RecordReasonForSkippingClassificationToUMA(SKIP_NONE_GET); | |
| 141 RunFailureCallback(); | |
| 142 return; | |
| 143 } | |
| 144 | |
| 145 RecordReasonForSkippingClassificationToUMA(CLASSIFICATION_PROCEED); | |
| 146 features_.reset(new FeatureMap); | |
| 147 if (!url_extractor_->ExtractFeatures(url, features_.get())) { | |
| 148 RunFailureCallback(); | |
| 149 return; | |
| 150 } | |
| 151 | |
| 152 // DOM feature extraction can take awhile, so it runs asynchronously | |
| 153 // in several chunks of work and invokes the callback when finished. | |
| 154 dom_extractor_->ExtractFeatures( | |
| 155 frame->document(), features_.get(), | |
| 156 base::Bind(&PhishingClassifier::DOMExtractionFinished, | |
| 157 base::Unretained(this))); | |
| 158 } | |
| 159 | |
| 160 void PhishingClassifier::CancelPendingClassification() { | |
| 161 // Note that cancelling the feature extractors is simply a no-op if they | |
| 162 // were not running. | |
| 163 DCHECK(is_ready()); | |
| 164 dom_extractor_->CancelPendingExtraction(); | |
| 165 term_extractor_->CancelPendingExtraction(); | |
| 166 weak_factory_.InvalidateWeakPtrs(); | |
| 167 Clear(); | |
| 168 } | |
| 169 | |
| 170 void PhishingClassifier::DOMExtractionFinished(bool success) { | |
| 171 shingle_hashes_.reset(new std::set<uint32_t>); | |
| 172 if (success) { | |
| 173 // Term feature extraction can take awhile, so it runs asynchronously | |
| 174 // in several chunks of work and invokes the callback when finished. | |
| 175 term_extractor_->ExtractFeatures( | |
| 176 page_text_, | |
| 177 features_.get(), | |
| 178 shingle_hashes_.get(), | |
| 179 base::Bind(&PhishingClassifier::TermExtractionFinished, | |
| 180 base::Unretained(this))); | |
| 181 } else { | |
| 182 RunFailureCallback(); | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 void PhishingClassifier::TermExtractionFinished(bool success) { | |
| 187 if (success) { | |
| 188 blink::WebLocalFrame* main_frame = render_frame_->GetWebFrame(); | |
| 189 | |
| 190 // Hash all of the features so that they match the model, then compute | |
| 191 // the score. | |
| 192 FeatureMap hashed_features; | |
| 193 ClientPhishingRequest verdict; | |
| 194 verdict.set_model_version(scorer_->model_version()); | |
| 195 verdict.set_url(main_frame->document().url().string().utf8()); | |
| 196 for (base::hash_map<std::string, double>::const_iterator it = | |
| 197 features_->features().begin(); | |
| 198 it != features_->features().end(); ++it) { | |
| 199 DVLOG(2) << "Feature: " << it->first << " = " << it->second; | |
| 200 bool result = hashed_features.AddRealFeature( | |
| 201 crypto::SHA256HashString(it->first), it->second); | |
| 202 DCHECK(result); | |
| 203 ClientPhishingRequest::Feature* feature = verdict.add_feature_map(); | |
| 204 feature->set_name(it->first); | |
| 205 feature->set_value(it->second); | |
| 206 } | |
| 207 for (std::set<uint32_t>::const_iterator it = shingle_hashes_->begin(); | |
| 208 it != shingle_hashes_->end(); ++it) { | |
| 209 verdict.add_shingle_hashes(*it); | |
| 210 } | |
| 211 float score = static_cast<float>(scorer_->ComputeScore(hashed_features)); | |
| 212 verdict.set_client_score(score); | |
| 213 verdict.set_is_phishing(score >= kPhishyThreshold); | |
| 214 RunCallback(verdict); | |
| 215 } else { | |
| 216 RunFailureCallback(); | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 void PhishingClassifier::CheckNoPendingClassification() { | |
| 221 DCHECK(done_callback_.is_null()); | |
| 222 DCHECK(!page_text_); | |
| 223 if (!done_callback_.is_null() || page_text_) { | |
| 224 LOG(ERROR) << "Classification in progress, missing call to " | |
| 225 << "CancelPendingClassification"; | |
| 226 } | |
| 227 } | |
| 228 | |
| 229 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) { | |
| 230 done_callback_.Run(verdict); | |
| 231 Clear(); | |
| 232 } | |
| 233 | |
| 234 void PhishingClassifier::RunFailureCallback() { | |
| 235 ClientPhishingRequest verdict; | |
| 236 // In this case we're not guaranteed to have a valid URL. Just set it | |
| 237 // to the empty string to make sure we have a valid protocol buffer. | |
| 238 verdict.set_url(""); | |
| 239 verdict.set_client_score(kInvalidScore); | |
| 240 verdict.set_is_phishing(false); | |
| 241 RunCallback(verdict); | |
| 242 } | |
| 243 | |
| 244 void PhishingClassifier::Clear() { | |
| 245 page_text_ = NULL; | |
| 246 done_callback_.Reset(); | |
| 247 features_.reset(NULL); | |
| 248 shingle_hashes_.reset(NULL); | |
| 249 } | |
| 250 | |
| 251 } // namespace safe_browsing | |
| OLD | NEW |