Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(254)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_classifier.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
6
7 #include <string>
8
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/compiler_specific.h"
12 #include "base/location.h"
13 #include "base/logging.h"
14 #include "base/metrics/histogram_macros.h"
15 #include "base/single_thread_task_runner.h"
16 #include "base/strings/string_util.h"
17 #include "base/threading/thread_task_runner_handle.h"
18 #include "chrome/common/safe_browsing/csd.pb.h"
19 #include "chrome/common/url_constants.h"
20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
21 #include "chrome/renderer/safe_browsing/features.h"
22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
25 #include "chrome/renderer/safe_browsing/scorer.h"
26 #include "content/public/renderer/render_frame.h"
27 #include "crypto/sha2.h"
28 #include "third_party/WebKit/public/platform/WebURL.h"
29 #include "third_party/WebKit/public/platform/WebURLRequest.h"
30 #include "third_party/WebKit/public/web/WebDataSource.h"
31 #include "third_party/WebKit/public/web/WebDocument.h"
32 #include "third_party/WebKit/public/web/WebLocalFrame.h"
33 #include "third_party/WebKit/public/web/WebView.h"
34 #include "url/gurl.h"
35
36 namespace safe_browsing {
37
38 const float PhishingClassifier::kInvalidScore = -1.0;
39 const float PhishingClassifier::kPhishyThreshold = 0.5;
40
41 namespace {
// Outcome of the "should this page be classified?" check, reported to UMA.
// The numeric values are recorded in a histogram, so existing entries must
// never be reordered or renumbered; append new ones before SKIP_REASON_MAX.
enum SkipClassificationReason {
  // Classification went ahead.
  CLASSIFICATION_PROCEED = 0,
  // The document URL did not use the http scheme.
  SKIP_HTTPS = 1,
  // The request that loaded the document was not a GET.
  SKIP_NONE_GET = 2,
  // Histogram boundary; not a real reason.
  SKIP_REASON_MAX
};
49
50 void RecordReasonForSkippingClassificationToUMA(
51 SkipClassificationReason reason) {
52 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.SkipClassificationReason",
53 reason,
54 SKIP_REASON_MAX);
55 }
56
57 } // namespace
58
59 PhishingClassifier::PhishingClassifier(content::RenderFrame* render_frame,
60 FeatureExtractorClock* clock)
61 : render_frame_(render_frame),
62 scorer_(NULL),
63 clock_(clock),
64 weak_factory_(this) {
65 Clear();
66 }
67
68 PhishingClassifier::~PhishingClassifier() {
69 // The RenderView should have called CancelPendingClassification() before
70 // we are destroyed.
71 CheckNoPendingClassification();
72 }
73
74 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
75 CheckNoPendingClassification();
76 scorer_ = scorer;
77 if (scorer_) {
78 url_extractor_.reset(new PhishingUrlFeatureExtractor);
79 dom_extractor_.reset(new PhishingDOMFeatureExtractor(clock_.get()));
80 term_extractor_.reset(new PhishingTermFeatureExtractor(
81 &scorer_->page_terms(),
82 &scorer_->page_words(),
83 scorer_->max_words_per_term(),
84 scorer_->murmurhash3_seed(),
85 scorer_->max_shingles_per_page(),
86 scorer_->shingle_size(),
87 clock_.get()));
88 } else {
89 // We're disabling client-side phishing detection, so tear down all
90 // of the relevant objects.
91 url_extractor_.reset();
92 dom_extractor_.reset();
93 term_extractor_.reset();
94 }
95 }
96
97 bool PhishingClassifier::is_ready() const {
98 return scorer_ != NULL;
99 }
100
101 void PhishingClassifier::BeginClassification(
102 const base::string16* page_text,
103 const DoneCallback& done_callback) {
104 DCHECK(is_ready());
105
106 // The RenderView should have called CancelPendingClassification() before
107 // starting a new classification, so DCHECK this.
108 CheckNoPendingClassification();
109 // However, in an opt build, we will go ahead and clean up the pending
110 // classification so that we can start in a known state.
111 CancelPendingClassification();
112
113 page_text_ = page_text;
114 done_callback_ = done_callback;
115
116 // For consistency, we always want to invoke the DoneCallback
117 // asynchronously, rather than directly from this method. To ensure that
118 // this is the case, post a task to begin feature extraction on the next
119 // iteration of the message loop.
120 base::ThreadTaskRunnerHandle::Get()->PostTask(
121 FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction,
122 weak_factory_.GetWeakPtr()));
123 }
124
125 void PhishingClassifier::BeginFeatureExtraction() {
126 blink::WebLocalFrame* frame = render_frame_->GetWebFrame();
127
128 // Check whether the URL is one that we should classify.
129 // Currently, we only classify http: URLs that are GET requests.
130 GURL url(frame->document().url());
131 if (!url.SchemeIs(url::kHttpScheme)) {
132 RecordReasonForSkippingClassificationToUMA(SKIP_HTTPS);
133 RunFailureCallback();
134 return;
135 }
136
137 blink::WebDataSource* ds = frame->dataSource();
138 if (!ds || ds->getRequest().httpMethod().ascii() != "GET") {
139 if (ds)
140 RecordReasonForSkippingClassificationToUMA(SKIP_NONE_GET);
141 RunFailureCallback();
142 return;
143 }
144
145 RecordReasonForSkippingClassificationToUMA(CLASSIFICATION_PROCEED);
146 features_.reset(new FeatureMap);
147 if (!url_extractor_->ExtractFeatures(url, features_.get())) {
148 RunFailureCallback();
149 return;
150 }
151
152 // DOM feature extraction can take awhile, so it runs asynchronously
153 // in several chunks of work and invokes the callback when finished.
154 dom_extractor_->ExtractFeatures(
155 frame->document(), features_.get(),
156 base::Bind(&PhishingClassifier::DOMExtractionFinished,
157 base::Unretained(this)));
158 }
159
160 void PhishingClassifier::CancelPendingClassification() {
161 // Note that cancelling the feature extractors is simply a no-op if they
162 // were not running.
163 DCHECK(is_ready());
164 dom_extractor_->CancelPendingExtraction();
165 term_extractor_->CancelPendingExtraction();
166 weak_factory_.InvalidateWeakPtrs();
167 Clear();
168 }
169
170 void PhishingClassifier::DOMExtractionFinished(bool success) {
171 shingle_hashes_.reset(new std::set<uint32_t>);
172 if (success) {
173 // Term feature extraction can take awhile, so it runs asynchronously
174 // in several chunks of work and invokes the callback when finished.
175 term_extractor_->ExtractFeatures(
176 page_text_,
177 features_.get(),
178 shingle_hashes_.get(),
179 base::Bind(&PhishingClassifier::TermExtractionFinished,
180 base::Unretained(this)));
181 } else {
182 RunFailureCallback();
183 }
184 }
185
186 void PhishingClassifier::TermExtractionFinished(bool success) {
187 if (success) {
188 blink::WebLocalFrame* main_frame = render_frame_->GetWebFrame();
189
190 // Hash all of the features so that they match the model, then compute
191 // the score.
192 FeatureMap hashed_features;
193 ClientPhishingRequest verdict;
194 verdict.set_model_version(scorer_->model_version());
195 verdict.set_url(main_frame->document().url().string().utf8());
196 for (base::hash_map<std::string, double>::const_iterator it =
197 features_->features().begin();
198 it != features_->features().end(); ++it) {
199 DVLOG(2) << "Feature: " << it->first << " = " << it->second;
200 bool result = hashed_features.AddRealFeature(
201 crypto::SHA256HashString(it->first), it->second);
202 DCHECK(result);
203 ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
204 feature->set_name(it->first);
205 feature->set_value(it->second);
206 }
207 for (std::set<uint32_t>::const_iterator it = shingle_hashes_->begin();
208 it != shingle_hashes_->end(); ++it) {
209 verdict.add_shingle_hashes(*it);
210 }
211 float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
212 verdict.set_client_score(score);
213 verdict.set_is_phishing(score >= kPhishyThreshold);
214 RunCallback(verdict);
215 } else {
216 RunFailureCallback();
217 }
218 }
219
220 void PhishingClassifier::CheckNoPendingClassification() {
221 DCHECK(done_callback_.is_null());
222 DCHECK(!page_text_);
223 if (!done_callback_.is_null() || page_text_) {
224 LOG(ERROR) << "Classification in progress, missing call to "
225 << "CancelPendingClassification";
226 }
227 }
228
229 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
230 done_callback_.Run(verdict);
231 Clear();
232 }
233
234 void PhishingClassifier::RunFailureCallback() {
235 ClientPhishingRequest verdict;
236 // In this case we're not guaranteed to have a valid URL. Just set it
237 // to the empty string to make sure we have a valid protocol buffer.
238 verdict.set_url("");
239 verdict.set_client_score(kInvalidScore);
240 verdict.set_is_phishing(false);
241 RunCallback(verdict);
242 }
243
244 void PhishingClassifier::Clear() {
245 page_text_ = NULL;
246 done_callback_.Reset();
247 features_.reset(NULL);
248 shingle_hashes_.reset(NULL);
249 }
250
251 } // namespace safe_browsing
OLDNEW
« no previous file with comments | « chrome/renderer/safe_browsing/phishing_classifier.h ('k') | chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698