Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_classifier.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix a nit Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_classifier.h" 5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
6 6
7 #include <string> 7 #include <string>
8 8
9 #include "base/bind.h" 9 #include "base/bind.h"
10 #include "base/callback.h" 10 #include "base/callback.h"
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
56 scorer_ = scorer; 56 scorer_ = scorer;
57 if (scorer_) { 57 if (scorer_) {
58 url_extractor_.reset(new PhishingUrlFeatureExtractor); 58 url_extractor_.reset(new PhishingUrlFeatureExtractor);
59 dom_extractor_.reset( 59 dom_extractor_.reset(
60 new PhishingDOMFeatureExtractor(render_view_, clock_.get())); 60 new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
61 term_extractor_.reset(new PhishingTermFeatureExtractor( 61 term_extractor_.reset(new PhishingTermFeatureExtractor(
62 &scorer_->page_terms(), 62 &scorer_->page_terms(),
63 &scorer_->page_words(), 63 &scorer_->page_words(),
64 scorer_->max_words_per_term(), 64 scorer_->max_words_per_term(),
65 scorer_->murmurhash3_seed(), 65 scorer_->murmurhash3_seed(),
66 scorer_->max_shingles_per_page(),
67 scorer_->shingle_size(),
66 clock_.get())); 68 clock_.get()));
67 } else { 69 } else {
68 // We're disabling client-side phishing detection, so tear down all 70 // We're disabling client-side phishing detection, so tear down all
69 // of the relevant objects. 71 // of the relevant objects.
70 url_extractor_.reset(); 72 url_extractor_.reset();
71 dom_extractor_.reset(); 73 dom_extractor_.reset();
72 term_extractor_.reset(); 74 term_extractor_.reset();
73 } 75 }
74 } 76 }
75 77
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
147 // Note that cancelling the feature extractors is simply a no-op if they 149 // Note that cancelling the feature extractors is simply a no-op if they
148 // were not running. 150 // were not running.
149 DCHECK(is_ready()); 151 DCHECK(is_ready());
150 dom_extractor_->CancelPendingExtraction(); 152 dom_extractor_->CancelPendingExtraction();
151 term_extractor_->CancelPendingExtraction(); 153 term_extractor_->CancelPendingExtraction();
152 weak_factory_.InvalidateWeakPtrs(); 154 weak_factory_.InvalidateWeakPtrs();
153 Clear(); 155 Clear();
154 } 156 }
155 157
156 void PhishingClassifier::DOMExtractionFinished(bool success) { 158 void PhishingClassifier::DOMExtractionFinished(bool success) {
159 shingle_hashes_.reset(new std::set<uint32>);
157 if (success) { 160 if (success) {
158 // Term feature extraction can take a while, so it runs asynchronously 161 // Term feature extraction can take a while, so it runs asynchronously
159 // in several chunks of work and invokes the callback when finished. 162 // in several chunks of work and invokes the callback when finished.
160 term_extractor_->ExtractFeatures( 163 term_extractor_->ExtractFeatures(
161 page_text_, 164 page_text_,
162 features_.get(), 165 features_.get(),
166 shingle_hashes_.get(),
163 base::Bind(&PhishingClassifier::TermExtractionFinished, 167 base::Bind(&PhishingClassifier::TermExtractionFinished,
164 base::Unretained(this))); 168 base::Unretained(this)));
165 } else { 169 } else {
166 RunFailureCallback(); 170 RunFailureCallback();
167 } 171 }
168 } 172 }
169 173
170 void PhishingClassifier::TermExtractionFinished(bool success) { 174 void PhishingClassifier::TermExtractionFinished(bool success) {
171 if (success) { 175 if (success) {
172 blink::WebView* web_view = render_view_->GetWebView(); 176 blink::WebView* web_view = render_view_->GetWebView();
(...skipping 17 matching lines...) Expand all
190 features_->features().begin(); 194 features_->features().begin();
191 it != features_->features().end(); ++it) { 195 it != features_->features().end(); ++it) {
192 VLOG(2) << "Feature: " << it->first << " = " << it->second; 196 VLOG(2) << "Feature: " << it->first << " = " << it->second;
193 bool result = hashed_features.AddRealFeature( 197 bool result = hashed_features.AddRealFeature(
194 crypto::SHA256HashString(it->first), it->second); 198 crypto::SHA256HashString(it->first), it->second);
195 DCHECK(result); 199 DCHECK(result);
196 ClientPhishingRequest::Feature* feature = verdict.add_feature_map(); 200 ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
197 feature->set_name(it->first); 201 feature->set_name(it->first);
198 feature->set_value(it->second); 202 feature->set_value(it->second);
199 } 203 }
204 for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
205 it != shingle_hashes_->end(); ++it) {
206 verdict.add_shingle_hashes(*it);
207 }
200 float score = static_cast<float>(scorer_->ComputeScore(hashed_features)); 208 float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
201 verdict.set_client_score(score); 209 verdict.set_client_score(score);
202 verdict.set_is_phishing(score >= kPhishyThreshold); 210 verdict.set_is_phishing(score >= kPhishyThreshold);
203 RunCallback(verdict); 211 RunCallback(verdict);
204 } else { 212 } else {
205 RunFailureCallback(); 213 RunFailureCallback();
206 } 214 }
207 } 215 }
208 216
209 void PhishingClassifier::CheckNoPendingClassification() { 217 void PhishingClassifier::CheckNoPendingClassification() {
(...skipping 19 matching lines...) Expand all
229 verdict.set_url(""); 237 verdict.set_url("");
230 verdict.set_client_score(kInvalidScore); 238 verdict.set_client_score(kInvalidScore);
231 verdict.set_is_phishing(false); 239 verdict.set_is_phishing(false);
232 RunCallback(verdict); 240 RunCallback(verdict);
233 } 241 }
234 242
235 void PhishingClassifier::Clear() { 243 void PhishingClassifier::Clear() {
236 page_text_ = NULL; 244 page_text_ = NULL;
237 done_callback_.Reset(); 245 done_callback_.Reset();
238 features_.reset(NULL); 246 features_.reset(NULL);
247 shingle_hashes_.reset(NULL);
239 } 248 }
240 249
241 } // namespace safe_browsing 250 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698