chrome/renderer/safe_browsing/phishing_classifier.cc - Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.

Side by Side Diff: chrome/renderer/safe_browsing/phishing_classifier.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"

6

7 #include <string>

8

9 #include "base/bind.h"

10 #include "base/callback.h"

11 #include "base/compiler_specific.h"

12 #include "base/location.h"

13 #include "base/logging.h"

14 #include "base/metrics/histogram_macros.h"

15 #include "base/single_thread_task_runner.h"

16 #include "base/strings/string_util.h"

17 #include "base/threading/thread_task_runner_handle.h"

18 #include "chrome/common/safe_browsing/csd.pb.h"

19 #include "chrome/common/url_constants.h"

20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"

21 #include "chrome/renderer/safe_browsing/features.h"

22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"

23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"

25 #include "chrome/renderer/safe_browsing/scorer.h"

26 #include "content/public/renderer/render_frame.h"

27 #include "crypto/sha2.h"

28 #include "third_party/WebKit/public/platform/WebURL.h"

29 #include "third_party/WebKit/public/platform/WebURLRequest.h"

30 #include "third_party/WebKit/public/web/WebDataSource.h"

31 #include "third_party/WebKit/public/web/WebDocument.h"

32 #include "third_party/WebKit/public/web/WebLocalFrame.h"

33 #include "third_party/WebKit/public/web/WebView.h"

34 #include "url/gurl.h"

35

36 namespace safe_browsing {

37

38 const float PhishingClassifier::kInvalidScore = -1.0;

39 const float PhishingClassifier::kPhishyThreshold = 0.5;

40

41 namespace {

42 // Used for UMA, do not reorder.

43 enum SkipClassificationReason {

44 CLASSIFICATION_PROCEED = 0,

45 SKIP_HTTPS = 1,

46 SKIP_NONE_GET = 2,

47 SKIP_REASON_MAX

48 };

49

50 void RecordReasonForSkippingClassificationToUMA(

51 SkipClassificationReason reason) {

52 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.SkipClassificationReason",

53 reason,

54 SKIP_REASON_MAX);

55 }

56

57 } // namespace

58

59 PhishingClassifier::PhishingClassifier(content::RenderFrame* render_frame,

60 FeatureExtractorClock* clock)

61 : render_frame_(render_frame),

62 scorer_(NULL),

63 clock_(clock),

64 weak_factory_(this) {

65 Clear();

66 }

67

68 PhishingClassifier::~PhishingClassifier() {

69 // The RenderView should have called CancelPendingClassification() before

70 // we are destroyed.

71 CheckNoPendingClassification();

72 }

73

74 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {

75 CheckNoPendingClassification();

76 scorer_ = scorer;

77 if (scorer_) {

78 url_extractor_.reset(new PhishingUrlFeatureExtractor);

79 dom_extractor_.reset(new PhishingDOMFeatureExtractor(clock_.get()));

80 term_extractor_.reset(new PhishingTermFeatureExtractor(

81 &scorer_->page_terms(),

82 &scorer_->page_words(),

83 scorer_->max_words_per_term(),

84 scorer_->murmurhash3_seed(),

85 scorer_->max_shingles_per_page(),

86 scorer_->shingle_size(),

87 clock_.get()));

88 } else {

89 // We're disabling client-side phishing detection, so tear down all

90 // of the relevant objects.

91 url_extractor_.reset();

92 dom_extractor_.reset();

93 term_extractor_.reset();

94 }

95 }

96

97 bool PhishingClassifier::is_ready() const {

98 return scorer_ != NULL;

99 }

100

101 void PhishingClassifier::BeginClassification(

102 const base::string16* page_text,

103 const DoneCallback& done_callback) {

104 DCHECK(is_ready());

105

106 // The RenderView should have called CancelPendingClassification() before

107 // starting a new classification, so DCHECK this.

108 CheckNoPendingClassification();

109 // However, in an opt build, we will go ahead and clean up the pending

110 // classification so that we can start in a known state.

111 CancelPendingClassification();

112

113 page_text_ = page_text;

114 done_callback_ = done_callback;

115

116 // For consistency, we always want to invoke the DoneCallback

117 // asynchronously, rather than directly from this method. To ensure that

118 // this is the case, post a task to begin feature extraction on the next

119 // iteration of the message loop.

120 base::ThreadTaskRunnerHandle::Get()->PostTask(

121 FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction,

122 weak_factory_.GetWeakPtr()));

123 }

124

125 void PhishingClassifier::BeginFeatureExtraction() {

126 blink::WebLocalFrame* frame = render_frame_->GetWebFrame();

127

128 // Check whether the URL is one that we should classify.

129 // Currently, we only classify http: URLs that are GET requests.

130 GURL url(frame->document().url());

131 if (!url.SchemeIs(url::kHttpScheme)) {

132 RecordReasonForSkippingClassificationToUMA(SKIP_HTTPS);

133 RunFailureCallback();

134 return;

135 }

136

137 blink::WebDataSource* ds = frame->dataSource();

138 if (!ds \|\| ds->getRequest().httpMethod().ascii() != "GET") {

139 if (ds)

140 RecordReasonForSkippingClassificationToUMA(SKIP_NONE_GET);

141 RunFailureCallback();

142 return;

143 }

144

145 RecordReasonForSkippingClassificationToUMA(CLASSIFICATION_PROCEED);

146 features_.reset(new FeatureMap);

147 if (!url_extractor_->ExtractFeatures(url, features_.get())) {

148 RunFailureCallback();

149 return;

150 }

151

152 // DOM feature extraction can take awhile, so it runs asynchronously

153 // in several chunks of work and invokes the callback when finished.

154 dom_extractor_->ExtractFeatures(

155 frame->document(), features_.get(),

156 base::Bind(&PhishingClassifier::DOMExtractionFinished,

157 base::Unretained(this)));

158 }

159

160 void PhishingClassifier::CancelPendingClassification() {

161 // Note that cancelling the feature extractors is simply a no-op if they

162 // were not running.

163 DCHECK(is_ready());

164 dom_extractor_->CancelPendingExtraction();

165 term_extractor_->CancelPendingExtraction();

166 weak_factory_.InvalidateWeakPtrs();

167 Clear();

168 }

169

170 void PhishingClassifier::DOMExtractionFinished(bool success) {

171 shingle_hashes_.reset(new std::set<uint32_t>);

172 if (success) {

173 // Term feature extraction can take awhile, so it runs asynchronously

174 // in several chunks of work and invokes the callback when finished.

175 term_extractor_->ExtractFeatures(

176 page_text_,

177 features_.get(),

178 shingle_hashes_.get(),

179 base::Bind(&PhishingClassifier::TermExtractionFinished,

180 base::Unretained(this)));

181 } else {

182 RunFailureCallback();

183 }

184 }

185

186 void PhishingClassifier::TermExtractionFinished(bool success) {

187 if (success) {

188 blink::WebLocalFrame* main_frame = render_frame_->GetWebFrame();

189

190 // Hash all of the features so that they match the model, then compute

191 // the score.

192 FeatureMap hashed_features;

193 ClientPhishingRequest verdict;

194 verdict.set_model_version(scorer_->model_version());

195 verdict.set_url(main_frame->document().url().string().utf8());

196 for (base::hash_map<std::string, double>::const_iterator it =

197 features_->features().begin();

198 it != features_->features().end(); ++it) {

199 DVLOG(2) << "Feature: " << it->first << " = " << it->second;

200 bool result = hashed_features.AddRealFeature(

201 crypto::SHA256HashString(it->first), it->second);

202 DCHECK(result);

203 ClientPhishingRequest::Feature* feature = verdict.add_feature_map();

204 feature->set_name(it->first);

205 feature->set_value(it->second);

206 }

207 for (std::set<uint32_t>::const_iterator it = shingle_hashes_->begin();

208 it != shingle_hashes_->end(); ++it) {

209 verdict.add_shingle_hashes(*it);

210 }

211 float score = static_cast<float>(scorer_->ComputeScore(hashed_features));

212 verdict.set_client_score(score);

213 verdict.set_is_phishing(score >= kPhishyThreshold);

214 RunCallback(verdict);

215 } else {

216 RunFailureCallback();

217 }

218 }

219

220 void PhishingClassifier::CheckNoPendingClassification() {

221 DCHECK(done_callback_.is_null());

222 DCHECK(!page_text_);

223 if (!done_callback_.is_null() \|\| page_text_) {

224 LOG(ERROR) << "Classification in progress, missing call to "

225 << "CancelPendingClassification";

226 }

227 }

228

229 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {

230 done_callback_.Run(verdict);

231 Clear();

232 }

233

234 void PhishingClassifier::RunFailureCallback() {

235 ClientPhishingRequest verdict;

236 // In this case we're not guaranteed to have a valid URL. Just set it

237 // to the empty string to make sure we have a valid protocol buffer.

238 verdict.set_url("");

239 verdict.set_client_score(kInvalidScore);

240 verdict.set_is_phishing(false);

241 RunCallback(verdict);

242 }

243

244 void PhishingClassifier::Clear() {

245 page_text_ = NULL;

246 done_callback_.Reset();

247 features_.reset(NULL);

248 shingle_hashes_.reset(NULL);

249 }

250

251 } // namespace safe_browsing

OLD	NEW