Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(7369)

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix a nit Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index cef13ca7df8a370168fa9b1c2c15c90d523e42c4..272108c32a167ffda2e7f14992844add8a05091e 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -16,6 +16,7 @@
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
+#include <set>
#include <string>
#include "base/basictypes.h"
@@ -47,6 +48,11 @@ class PhishingTermFeatureExtractor {
// must ensure that they are valid until the PhishingTermFeatureExtractor is
// destroyed.
//
+ // In addition to extracting page terms, we also extract a text shingling
+ // sketch, which consists of hashes of N-gram-words (referred to as shingles)
+ // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
+ // the maximum number of unique shingle hashes we extract per page.
+ //
// |clock| is used for timing feature extractor operations, and may be mocked
// for testing. The caller keeps ownership of the clock.
PhishingTermFeatureExtractor(
@@ -54,6 +60,8 @@ class PhishingTermFeatureExtractor {
const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
uint32 murmurhash3_seed,
+ size_t max_shingles_per_page,
+ size_t shingle_size,
FeatureExtractorClock* clock);
~PhishingTermFeatureExtractor();
@@ -67,11 +75,12 @@ class PhishingTermFeatureExtractor {
// |done_callback| is run on the current thread.
// PhishingTermFeatureExtractor takes ownership of the callback.
//
- // |page_text| and |features| are owned by the caller, and must not be
- // destroyed until either |done_callback| is run or
+ // |page_text|, |features|, and |shingle_hashes| are owned by the caller,
+ // and must not be destroyed until either |done_callback| is run or
// CancelPendingExtraction() is called.
void ExtractFeatures(const base::string16* page_text,
FeatureMap* features,
+ std::set<uint32>* shingle_hashes,
const DoneCallback& done_callback);
// Cancels any pending feature extraction. The DoneCallback will not be run.
@@ -135,6 +144,12 @@ class PhishingTermFeatureExtractor {
// The seed for murmurhash3.
const uint32 murmurhash3_seed_;
+ // The maximum number of unique shingle hashes we extract in a page.
+ const size_t max_shingles_per_page_;
+
+ // The number of words in a shingle.
+ const size_t shingle_size_;
+
// This cache is used to see if we need to check the word at all, as
// converting to UTF8, lowercasing, and hashing are all relatively expensive
// operations. Though this is called an MRU cache, it seems to behave like
@@ -148,6 +163,7 @@ class PhishingTermFeatureExtractor {
// The output parameters from the most recent call to ExtractFeatures().
const base::string16* page_text_; // The caller keeps ownership of this.
FeatureMap* features_; // The caller keeps ownership of this.
+ std::set<uint32>* shingle_hashes_; // The caller keeps ownership of this.
DoneCallback done_callback_;
// Stores the current state of term extraction from |page_text_|.

Powered by Google App Engine
This is Rietveld 408576698