chrome/renderer/safe_browsing/phishing_classifier.cc - Issue 268673007: Extracting page shingle hashes for similarity detection.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: chrome/renderer/safe_browsing/phishing_classifier.cc

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Address 1st round comment Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« chrome/common/safe_browsing/csd.proto ('K') | « chrome/renderer/safe_browsing/phishing_classifier.h ('k') | chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc » ('j') | chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/safe_browsing/phishing_classifier.cc

diff --git a/chrome/renderer/safe_browsing/phishing_classifier.cc b/chrome/renderer/safe_browsing/phishing_classifier.cc

index f48e6c138c74e444e79c58d8edb32809d2b6daa7..8632c437825f07c2366d4159c05af1034219d83f 100644

--- a/chrome/renderer/safe_browsing/phishing_classifier.cc

+++ b/chrome/renderer/safe_browsing/phishing_classifier.cc

@@ -63,6 +63,8 @@ void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {

&scorer_->page_words(),

scorer_->max_words_per_term(),

scorer_->murmurhash3_seed(),

+ scorer_->max_shingles_per_page(),

+ scorer_->shingle_size(),

clock_.get()));

} else {

// We're disabling client-side phishing detection, so tear down all

@@ -154,12 +156,14 @@ void PhishingClassifier::CancelPendingClassification() {

}

void PhishingClassifier::DOMExtractionFinished(bool success) {

+ shingle_hashes_.reset(new std::set<uint32>);

if (success) {

// Term feature extraction can take awhile, so it runs asynchronously

// in several chunks of work and invokes the callback when finished.

term_extractor_->ExtractFeatures(

page_text_,

features_.get(),

+ shingle_hashes_.get(),

base::Bind(&PhishingClassifier::TermExtractionFinished,

base::Unretained(this)));

} else {

@@ -197,6 +201,10 @@ void PhishingClassifier::TermExtractionFinished(bool success) {

feature->set_name(it->first);

feature->set_value(it->second);

}

+ for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();

+ it != shingle_hashes_->end(); ++it) {

+ verdict.add_shingle_hashes(*it);

+ }

float score = static_cast<float>(scorer_->ComputeScore(hashed_features));

verdict.set_client_score(score);

verdict.set_is_phishing(score >= kPhishyThreshold);

@@ -236,6 +244,7 @@ void PhishingClassifier::Clear() {

page_text_ = NULL;

done_callback_.Reset();

features_.reset(NULL);

+ shingle_hashes_.reset(NULL);

}

} // namespace safe_browsing