Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(126)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 1548153002: Switch to standard integer types in chrome/. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // PhishingTermFeatureExtractor handles computing term features from the text 5 // PhishingTermFeatureExtractor handles computing term features from the text
6 // of a web page for the client-side phishing detection model. To do this, it 6 // of a web page for the client-side phishing detection model. To do this, it
7 // takes a list of terms that appear in the model, and scans through the page 7 // takes a list of terms that appear in the model, and scans through the page
8 // text looking for them. Any terms that appear will cause a corresponding 8 // text looking for them. Any terms that appear will cause a corresponding
9 // features::kPageTerm feature to be added to the FeatureMap. 9 // features::kPageTerm feature to be added to the FeatureMap.
10 // 10 //
11 // To make it harder for a phisher to enumerate all of the relevant terms in 11 // To make it harder for a phisher to enumerate all of the relevant terms in
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
13 // 13 //
14 // There is one PhishingTermFeatureExtractor per RenderView. 14 // There is one PhishingTermFeatureExtractor per RenderView.
15 15
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
18 18
19 #include <stddef.h>
20 #include <stdint.h>
21
19 #include <set> 22 #include <set>
20 #include <string> 23 #include <string>
21 24
22 #include "base/basictypes.h"
23 #include "base/callback.h" 25 #include "base/callback.h"
24 #include "base/containers/hash_tables.h" 26 #include "base/containers/hash_tables.h"
27 #include "base/macros.h"
25 #include "base/memory/scoped_ptr.h" 28 #include "base/memory/scoped_ptr.h"
26 #include "base/memory/weak_ptr.h" 29 #include "base/memory/weak_ptr.h"
27 #include "base/strings/string16.h" 30 #include "base/strings/string16.h"
28 #include "base/strings/string_piece.h" 31 #include "base/strings/string_piece.h"
29 32
30 namespace safe_browsing { 33 namespace safe_browsing {
31 class FeatureExtractorClock; 34 class FeatureExtractorClock;
32 class FeatureMap; 35 class FeatureMap;
33 36
34 class PhishingTermFeatureExtractor { 37 class PhishingTermFeatureExtractor {
(...skipping 14 matching lines...) Expand all
49 // 52 //
50 // In addition to extracting page terms, we will also extract a text shingling 53 // In addition to extracting page terms, we will also extract a text shingling
51 // sketch, which consists of hashes of N-gram-words (referred to as shingles) 54 // sketch, which consists of hashes of N-gram-words (referred to as shingles)
52 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines 55 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
53 // the maximum number of unique shingle hashes we extract per page. 56 // the maximum number of unique shingle hashes we extract per page.
54 // 57 //
55 // |clock| is used for timing feature extractor operations, and may be mocked 58 // |clock| is used for timing feature extractor operations, and may be mocked
56 // for testing. The caller keeps ownership of the clock. 59 // for testing. The caller keeps ownership of the clock.
57 PhishingTermFeatureExtractor( 60 PhishingTermFeatureExtractor(
58 const base::hash_set<std::string>* page_term_hashes, 61 const base::hash_set<std::string>* page_term_hashes,
59 const base::hash_set<uint32>* page_word_hashes, 62 const base::hash_set<uint32_t>* page_word_hashes,
60 size_t max_words_per_term, 63 size_t max_words_per_term,
61 uint32 murmurhash3_seed, 64 uint32_t murmurhash3_seed,
62 size_t max_shingles_per_page, 65 size_t max_shingles_per_page,
63 size_t shingle_size, 66 size_t shingle_size,
64 FeatureExtractorClock* clock); 67 FeatureExtractorClock* clock);
65 ~PhishingTermFeatureExtractor(); 68 ~PhishingTermFeatureExtractor();
66 69
67 // Begins extracting features from |page_text| into the given FeatureMap. 70 // Begins extracting features from |page_text| into the given FeatureMap.
68 // |page_text| should contain the plain text of a web page, including any 71 // |page_text| should contain the plain text of a web page, including any
69 // subframes, as returned by RenderView::CaptureText(). 72 // subframes, as returned by RenderView::CaptureText().
70 // 73 //
71 // To avoid blocking the render thread for too long, the feature extractor 74 // To avoid blocking the render thread for too long, the feature extractor
72 // may run in several chunks of work, posting a task to the current 75 // may run in several chunks of work, posting a task to the current
73 // MessageLoop to continue processing. Once feature extraction is complete, 76 // MessageLoop to continue processing. Once feature extraction is complete,
74 // |done_callback| is run on the current thread. 77 // |done_callback| is run on the current thread.
75 // PhishingTermFeatureExtractor takes ownership of the callback. 78 // PhishingTermFeatureExtractor takes ownership of the callback.
76 // 79 //
77 // |page_text|, |features|, and |shingle_hashes| are owned by the caller, 80 // |page_text|, |features|, and |shingle_hashes| are owned by the caller,
78 // and must not be destroyed until either |done_callback| is run or 81 // and must not be destroyed until either |done_callback| is run or
79 // CancelPendingExtraction() is called. 82 // CancelPendingExtraction() is called.
80 void ExtractFeatures(const base::string16* page_text, 83 void ExtractFeatures(const base::string16* page_text,
81 FeatureMap* features, 84 FeatureMap* features,
82 std::set<uint32>* shingle_hashes, 85 std::set<uint32_t>* shingle_hashes,
83 const DoneCallback& done_callback); 86 const DoneCallback& done_callback);
84 87
85 // Cancels any pending feature extraction. The DoneCallback will not be run. 88 // Cancels any pending feature extraction. The DoneCallback will not be run.
86 // Must be called if there is a feature extraction in progress when the page 89 // Must be called if there is a feature extraction in progress when the page
87 // is unloaded or the PhishingTermFeatureExtractor is destroyed. 90 // is unloaded or the PhishingTermFeatureExtractor is destroyed.
88 void CancelPendingExtraction(); 91 void CancelPendingExtraction();
89 92
90 private: 93 private:
91 struct ExtractionState; 94 struct ExtractionState;
92 95
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
124 void Clear(); 127 void Clear();
125 128
126 // All of the term hashes that we are looking for in the page. 129 // All of the term hashes that we are looking for in the page.
127 const base::hash_set<std::string>* page_term_hashes_; 130 const base::hash_set<std::string>* page_term_hashes_;
128 131
129 // Murmur3 hashes of all the individual words in page_term_hashes_. If 132 // Murmur3 hashes of all the individual words in page_term_hashes_. If
130 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ 133 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
131 // would contain (hashed) "one" and "two". We do this so that we can have a 134 // would contain (hashed) "one" and "two". We do this so that we can have a
132 // quick out in the common case that the current word we are processing 135 // quick out in the common case that the current word we are processing
133 // doesn't contain any part of one of our terms. 136 // doesn't contain any part of one of our terms.
134 const base::hash_set<uint32>* page_word_hashes_; 137 const base::hash_set<uint32_t>* page_word_hashes_;
135 138
136 // The maximum number of words in an n-gram. 139 // The maximum number of words in an n-gram.
137 const size_t max_words_per_term_; 140 const size_t max_words_per_term_;
138 141
139 // The seed for murmurhash3. 142 // The seed for murmurhash3.
140 const uint32 murmurhash3_seed_; 143 const uint32_t murmurhash3_seed_;
141 144
142 // The maximum number of unique shingle hashes we extract in a page. 145 // The maximum number of unique shingle hashes we extract in a page.
143 const size_t max_shingles_per_page_; 146 const size_t max_shingles_per_page_;
144 147
145 // The number of words in a shingle. 148 // The number of words in a shingle.
146 const size_t shingle_size_; 149 const size_t shingle_size_;
147 150
148 // Non-owned pointer to our clock. 151 // Non-owned pointer to our clock.
149 FeatureExtractorClock* clock_; 152 FeatureExtractorClock* clock_;
150 153
151 // The parameters from the most recent call to ExtractFeatures(). 154 // The parameters from the most recent call to ExtractFeatures().
152 const base::string16* page_text_; // The caller keeps ownership of this. 155 const base::string16* page_text_; // The caller keeps ownership of this.
153 FeatureMap* features_; // The caller keeps ownership of this. 156 FeatureMap* features_; // The caller keeps ownership of this.
154 std::set<uint32>* shingle_hashes_; 157 std::set<uint32_t>* shingle_hashes_;
155 DoneCallback done_callback_; 158 DoneCallback done_callback_;
156 159
157 // Stores the current state of term extraction from |page_text_|. 160 // Stores the current state of term extraction from |page_text_|.
158 scoped_ptr<ExtractionState> state_; 161 scoped_ptr<ExtractionState> state_;
159 162
160 // Used in scheduling ExtractFeaturesWithTimeout tasks. 163 // Used in scheduling ExtractFeaturesWithTimeout tasks.
161 // These pointers are invalidated if extraction is cancelled. 164 // These pointers are invalidated if extraction is cancelled.
162 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; 165 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
163 166
164 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); 167 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
165 }; 168 };
166 169
167 } // namespace safe_browsing 170 } // namespace safe_browsing
168 171
169 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 172 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698