Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3923)

Unified Diff: chrome/renderer/safe_browsing/phishing_classifier.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Address 1st round comment Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_classifier.cc
diff --git a/chrome/renderer/safe_browsing/phishing_classifier.cc b/chrome/renderer/safe_browsing/phishing_classifier.cc
index f48e6c138c74e444e79c58d8edb32809d2b6daa7..8632c437825f07c2366d4159c05af1034219d83f 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier.cc
@@ -63,6 +63,8 @@ void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
&scorer_->page_words(),
scorer_->max_words_per_term(),
scorer_->murmurhash3_seed(),
+ scorer_->max_shingles_per_page(),
+ scorer_->shingle_size(),
clock_.get()));
} else {
// We're disabling client-side phishing detection, so tear down all
@@ -154,12 +156,14 @@ void PhishingClassifier::CancelPendingClassification() {
}
void PhishingClassifier::DOMExtractionFinished(bool success) {
+ shingle_hashes_.reset(new std::set<uint32>);
if (success) {
// Term feature extraction can take awhile, so it runs asynchronously
// in several chunks of work and invokes the callback when finished.
term_extractor_->ExtractFeatures(
page_text_,
features_.get(),
+ shingle_hashes_.get(),
base::Bind(&PhishingClassifier::TermExtractionFinished,
base::Unretained(this)));
} else {
@@ -197,6 +201,10 @@ void PhishingClassifier::TermExtractionFinished(bool success) {
feature->set_name(it->first);
feature->set_value(it->second);
}
+ for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
+ it != shingle_hashes_->end(); ++it) {
+ verdict.add_shingle_hashes(*it);
+ }
float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
verdict.set_client_score(score);
verdict.set_is_phishing(score >= kPhishyThreshold);
@@ -236,6 +244,7 @@ void PhishingClassifier::Clear() {
page_text_ = NULL;
done_callback_.Reset();
features_.reset(NULL);
+ shingle_hashes_.reset(NULL);
}
} // namespace safe_browsing

Powered by Google App Engine
This is Rietveld 408576698