Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(289)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 268673007: Extracting page shingle hashes for similarity detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Address 4th round comment. Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // PhishingTermFeatureExtractor handles computing term features from the text 5 // PhishingTermFeatureExtractor handles computing term features from the text
6 // of a web page for the client-side phishing detection model. To do this, it 6 // of a web page for the client-side phishing detection model. To do this, it
7 // takes a list of terms that appear in the model, and scans through the page 7 // takes a list of terms that appear in the model, and scans through the page
8 // text looking for them. Any terms that appear will cause a corresponding 8 // text looking for them. Any terms that appear will cause a corresponding
9 // features::kPageTerm feature to be added to the FeatureMap. 9 // features::kPageTerm feature to be added to the FeatureMap.
10 // 10 //
11 // To make it harder for a phisher to enumerate all of the relevant terms in 11 // To make it harder for a phisher to enumerate all of the relevant terms in
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
13 // 13 //
14 // There is one PhishingTermFeatureExtractor per RenderView. 14 // There is one PhishingTermFeatureExtractor per RenderView.
15 15
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
18 18
19 #include <set>
19 #include <string> 20 #include <string>
20 21
21 #include "base/basictypes.h" 22 #include "base/basictypes.h"
22 #include "base/callback.h" 23 #include "base/callback.h"
23 #include "base/containers/hash_tables.h" 24 #include "base/containers/hash_tables.h"
24 #include "base/containers/mru_cache.h"
25 #include "base/memory/scoped_ptr.h" 25 #include "base/memory/scoped_ptr.h"
26 #include "base/memory/weak_ptr.h" 26 #include "base/memory/weak_ptr.h"
27 #include "base/strings/string16.h" 27 #include "base/strings/string16.h"
28 #include "base/strings/string_piece.h" 28 #include "base/strings/string_piece.h"
29 29
30 namespace safe_browsing { 30 namespace safe_browsing {
31 class FeatureExtractorClock; 31 class FeatureExtractorClock;
32 class FeatureMap; 32 class FeatureMap;
33 33
34 class PhishingTermFeatureExtractor { 34 class PhishingTermFeatureExtractor {
35 public: 35 public:
36 // Callback to be run when feature extraction finishes. The callback 36 // Callback to be run when feature extraction finishes. The callback
37 // argument is true if extraction was successful, false otherwise. 37 // argument is true if extraction was successful, false otherwise.
38 typedef base::Callback<void(bool)> DoneCallback; 38 typedef base::Callback<void(bool)> DoneCallback;
39 39
40 // Creates a PhishingTermFeatureExtractor which will extract features for 40 // Creates a PhishingTermFeatureExtractor which will extract features for
41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These 41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
42 // terms may be multi-word n-grams, with at most |max_words_per_term| words. 42 // terms may be multi-word n-grams, with at most |max_words_per_term| words.
43 // 43 //
44 // |page_word_hashes| contains the murmur3 hashes for all of the individual 44 // |page_word_hashes| contains the murmur3 hashes for all of the individual
45 // words that make up the terms. Both sets of strings are UTF-8 encoded and 45 // words that make up the terms. Both sets of strings are UTF-8 encoded and
46 // lowercased prior to hashing. The caller owns both sets of strings, and 46 // lowercased prior to hashing. The caller owns both sets of strings, and
47 // must ensure that they are valid until the PhishingTermFeatureExtractor is 47 // must ensure that they are valid until the PhishingTermFeatureExtractor is
48 // destroyed. 48 // destroyed.
49 // 49 //
50 // In addition to extracting page terms, we also extract a text shingling
51 // sketch, which consists of hashes of word N-grams (referred to as shingles)
52 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
53 // the maximum number of unique shingle hashes we extract per page.
54 //
50 // |clock| is used for timing feature extractor operations, and may be mocked 55 // |clock| is used for timing feature extractor operations, and may be mocked
51 // for testing. The caller keeps ownership of the clock. 56 // for testing. The caller keeps ownership of the clock.
52 PhishingTermFeatureExtractor( 57 PhishingTermFeatureExtractor(
53 const base::hash_set<std::string>* page_term_hashes, 58 const base::hash_set<std::string>* page_term_hashes,
54 const base::hash_set<uint32>* page_word_hashes, 59 const base::hash_set<uint32>* page_word_hashes,
55 size_t max_words_per_term, 60 size_t max_words_per_term,
56 uint32 murmurhash3_seed, 61 uint32 murmurhash3_seed,
62 size_t max_shingles_per_page,
63 size_t shingle_size,
57 FeatureExtractorClock* clock); 64 FeatureExtractorClock* clock);
58 ~PhishingTermFeatureExtractor(); 65 ~PhishingTermFeatureExtractor();
59 66
60 // Begins extracting features from |page_text| into the given FeatureMap. 67 // Begins extracting features from |page_text| into the given FeatureMap.
61 // |page_text| should contain the plain text of a web page, including any 68 // |page_text| should contain the plain text of a web page, including any
62 // subframes, as returned by RenderView::CaptureText(). 69 // subframes, as returned by RenderView::CaptureText().
63 // 70 //
64 // To avoid blocking the render thread for too long, the feature extractor 71 // To avoid blocking the render thread for too long, the feature extractor
65 // may run in several chunks of work, posting a task to the current 72 // may run in several chunks of work, posting a task to the current
66 // MessageLoop to continue processing. Once feature extraction is complete, 73 // MessageLoop to continue processing. Once feature extraction is complete,
67 // |done_callback| is run on the current thread. 74 // |done_callback| is run on the current thread.
68 // PhishingTermFeatureExtractor takes ownership of the callback. 75 // PhishingTermFeatureExtractor takes ownership of the callback.
69 // 76 //
70 // |page_text| and |features| are owned by the caller, and must not be 77 // |page_text|, |features|, and |shingle_hashes| are owned by the caller,
71 // destroyed until either |done_callback| is run or 78 // and must not be destroyed until either |done_callback| is run or
72 // CancelPendingExtraction() is called. 79 // CancelPendingExtraction() is called.
73 void ExtractFeatures(const base::string16* page_text, 80 void ExtractFeatures(const base::string16* page_text,
74 FeatureMap* features, 81 FeatureMap* features,
82 std::set<uint32>* shingle_hashes,
75 const DoneCallback& done_callback); 83 const DoneCallback& done_callback);
76 84
77 // Cancels any pending feature extraction. The DoneCallback will not be run. 85 // Cancels any pending feature extraction. The DoneCallback will not be run.
78 // Must be called if there is a feature extraction in progress when the page 86 // Must be called if there is a feature extraction in progress when the page
79 // is unloaded or the PhishingTermFeatureExtractor is destroyed. 87 // is unloaded or the PhishingTermFeatureExtractor is destroyed.
80 void CancelPendingExtraction(); 88 void CancelPendingExtraction();
81 89
82 private: 90 private:
83 struct ExtractionState; 91 struct ExtractionState;
84 92
85 // The maximum amount of wall time that we will spend on a single extraction 93 // The maximum amount of wall time that we will spend on a single extraction
86 // iteration before pausing to let other MessageLoop tasks run. 94 // iteration before pausing to let other MessageLoop tasks run.
87 static const int kMaxTimePerChunkMs; 95 static const int kMaxTimePerChunkMs;
88 96
89 // The number of words that we will process before checking to see whether 97 // The number of words that we will process before checking to see whether
90 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be 98 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
91 // slow, we don't do this on every word processed. 99 // slow, we don't do this on every word processed.
92 static const int kClockCheckGranularity; 100 static const int kClockCheckGranularity;
93 101
94 // The maximum total amount of time that the feature extractor will run 102 // The maximum total amount of time that the feature extractor will run
95 // before giving up on the current page. 103 // before giving up on the current page.
96 static const int kMaxTotalTimeMs; 104 static const int kMaxTotalTimeMs;
97 105
98 // The size of the cache that we use to determine if we can avoid lower
99 // casing, hashing, and UTF conversion.
100 static const int kMaxNegativeWordCacheSize;
101
102 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs 106 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
103 // until a predefined maximum amount of time has elapsed, then posts a task 107 // until a predefined maximum amount of time has elapsed, then posts a task
104 // to the current MessageLoop to continue extraction. When extraction 108 // to the current MessageLoop to continue extraction. When extraction
105 // finishes, calls RunCallback(). 109 // finishes, calls RunCallback().
106 void ExtractFeaturesWithTimeout(); 110 void ExtractFeaturesWithTimeout();
107 111
108 // Handles a single word in the page text. 112 // Handles a single word in the page text.
109 void HandleWord(const base::StringPiece16& word); 113 void HandleWord(const base::StringPiece16& word);
110 114
111 // Helper to verify that there is no pending feature extraction. Dies in 115 // Helper to verify that there is no pending feature extraction. Dies in
(...skipping 16 matching lines...) Expand all
128 // quick out in the common case that the current word we are processing 132 // quick out in the common case that the current word we are processing
129 // doesn't contain any part of one of our terms. 133 // doesn't contain any part of one of our terms.
130 const base::hash_set<uint32>* page_word_hashes_; 134 const base::hash_set<uint32>* page_word_hashes_;
131 135
132 // The maximum number of words in an n-gram. 136 // The maximum number of words in an n-gram.
133 const size_t max_words_per_term_; 137 const size_t max_words_per_term_;
134 138
135 // The seed for murmurhash3. 139 // The seed for murmurhash3.
136 const uint32 murmurhash3_seed_; 140 const uint32 murmurhash3_seed_;
137 141
138 // This cache is used to see if we need to check the word at all, as 142 // The maximum number of unique shingle hashes we extract in a page.
139 // converting to UTF8, lowercasing, and hashing are all relatively expensive 143 const size_t max_shingles_per_page_;
140 // operations. Though this is called an MRU cache, it seems to behave like 144
141 // an LRU cache (i.e. it evicts the oldest accesses first). 145 // The number of words in a shingle.
142 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; 146 const size_t shingle_size_;
143 WordCache negative_word_cache_;
144 147
145 // Non-owned pointer to our clock. 148 // Non-owned pointer to our clock.
146 FeatureExtractorClock* clock_; 149 FeatureExtractorClock* clock_;
147 150
148 // The output parameters from the most recent call to ExtractFeatures(). 151 // The output parameters from the most recent call to ExtractFeatures().
149 const base::string16* page_text_; // The caller keeps ownership of this. 152 const base::string16* page_text_; // The caller keeps ownership of this.
150 FeatureMap* features_; // The caller keeps ownership of this. 153 FeatureMap* features_; // The caller keeps ownership of this.
154 std::set<uint32>* shingle_hashes_;
151 DoneCallback done_callback_; 155 DoneCallback done_callback_;
152 156
153 // Stores the current state of term extraction from |page_text_|. 157 // Stores the current state of term extraction from |page_text_|.
154 scoped_ptr<ExtractionState> state_; 158 scoped_ptr<ExtractionState> state_;
155 159
156 // Used in scheduling ExtractFeaturesWithTimeout tasks. 160 // Used in scheduling ExtractFeaturesWithTimeout tasks.
157 // These pointers are invalidated if extraction is cancelled. 161 // These pointers are invalidated if extraction is cancelled.
158 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; 162 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
159 163
160 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); 164 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
161 }; 165 };
162 166
163 } // namespace safe_browsing 167 } // namespace safe_browsing
164 168
165 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 169 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698