chrome/renderer/safe_browsing/phishing_term_feature_extractor.h - Issue 268673007: Extracting page shingle hashes for similarity detection.

Unified Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix a reference problem. Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc » ('j') | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

index cef13ca7df8a370168fa9b1c2c15c90d523e42c4..d5695d8a0811fd10c24587342eedb337dd507e13 100644

--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

@@ -16,6 +16,7 @@

#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

+#include <set>

#include <string>

#include "base/basictypes.h"

@@ -47,13 +48,20 @@ class PhishingTermFeatureExtractor {

// must ensure that they are valid until the PhishingTermFeatureExtractor is

// destroyed.

+ // In addition to extracting page terms, we also extract a text shingling

+ // sketch, which consists of hashes of N-gram-words (referred to as shingles)

+ // in the page. |shingle_size| defines N, and |max_hashes_per_page| defines

+ // the maximum number of unique shingle hashes we extract per page.

+ //

// |clock| is used for timing feature extractor operations, and may be mocked

// for testing. The caller keeps ownership of the clock.

PhishingTermFeatureExtractor(

const base::hash_set<std::string>* page_term_hashes,

const base::hash_set<uint32>* page_word_hashes,

+ size_t max_hashes_per_page,

mattm 2014/05/06 01:00:14 Also move this in argument list next to shingle_si

mattm 2014/05/06 01:00:14 I would suggest max_shingles_per_page for this (an

zysxqn 2014/05/06 20:56:57 Done.

size_t max_words_per_term,

uint32 murmurhash3_seed,

+ size_t shingle_size,

FeatureExtractorClock* clock);

~PhishingTermFeatureExtractor();

@@ -72,6 +80,7 @@ class PhishingTermFeatureExtractor {

// CancelPendingExtraction() is called.

void ExtractFeatures(const base::string16* page_text,

FeatureMap* features,

+ std::set<uint32>* shingle_hashes,

const DoneCallback& done_callback);

// Cancels any pending feature extraction. The DoneCallback will not be run.

@@ -129,12 +138,18 @@ class PhishingTermFeatureExtractor {

// doesn't contain any part of one of our terms.

const base::hash_set<uint32>* page_word_hashes_;

+ // The maximum number of unique shingle hashes we extract in a page.

+ const size_t max_hashes_per_page_;

// The maximum number of words in an n-gram.

const size_t max_words_per_term_;

// The seed for murmurhash3.

const uint32 murmurhash3_seed_;

+ // The number of words in a shingle.

+ const size_t shingle_size_;

// This cache is used to see if we need to check the word at all, as

// converting to UTF8, lowercasing, and hashing are all relatively expensive

// operations. Though this is called an MRU cache, it seems to behave like

@@ -148,6 +163,7 @@ class PhishingTermFeatureExtractor {

// The output parameters from the most recent call to ExtractFeatures().

const base::string16* page_text_; // The caller keeps ownership of this.

FeatureMap* features_; // The caller keeps ownership of this.

+ std::set<uint32>* shingle_hashes_;

DoneCallback done_callback_;

// Stores the current state of term extraction from |page_text_|.