Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(100)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // PhishingTermFeatureExtractor handles computing term features from the text
6 // of a web page for the client-side phishing detection model. To do this, it
7 // takes a list of terms that appear in the model, and scans through the page
8 // text looking for them. Any terms that appear will cause a corresponding
9 // features::kPageTerm feature to be added to the FeatureMap.
10 //
11 // To make it harder for a phisher to enumerate all of the relevant terms in
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
13 //
14 // There is one PhishingTermFeatureExtractor per RenderView.
15
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
18
19 #include <stddef.h>
20 #include <stdint.h>
21
22 #include <memory>
23 #include <set>
24 #include <string>
25
26 #include "base/callback.h"
27 #include "base/containers/hash_tables.h"
28 #include "base/macros.h"
29 #include "base/memory/weak_ptr.h"
30 #include "base/strings/string16.h"
31 #include "base/strings/string_piece.h"
32
33 namespace safe_browsing {
34 class FeatureExtractorClock;
35 class FeatureMap;
36
37 class PhishingTermFeatureExtractor {
38 public:
39 // Callback to be run when feature extraction finishes. The callback
40 // argument is true if extraction was successful, false otherwise.
41 typedef base::Callback<void(bool)> DoneCallback;
42
43 // Creates a PhishingTermFeatureExtractor which will extract features for
44 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
45 // terms may be multi-word n-grams, with at most |max_words_per_term| words.
46 //
47 // |page_word_hashes| contains the murmur3 hashes for all of the individual
48 // words that make up the terms. Both sets of strings are UTF-8 encoded and
49 // lowercased prior to hashing. The caller owns both sets of strings, and
50 // must ensure that they are valid until the PhishingTermFeatureExtractor is
51 // destroyed.
52 //
53 // In addition to extracting page terms, we will also extract text shingling
54 // sketch, which consists of hashes of N-gram-words (referred to as shingles)
55 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines
56 // the maximum number of unique shingle hashes we extracted per page.
57 //
58 // |clock| is used for timing feature extractor operations, and may be mocked
59 // for testing. The caller keeps ownership of the clock.
60 PhishingTermFeatureExtractor(
61 const base::hash_set<std::string>* page_term_hashes,
62 const base::hash_set<uint32_t>* page_word_hashes,
63 size_t max_words_per_term,
64 uint32_t murmurhash3_seed,
65 size_t max_shingles_per_page,
66 size_t shingle_size,
67 FeatureExtractorClock* clock);
68 ~PhishingTermFeatureExtractor();
69
70 // Begins extracting features from |page_text| into the given FeatureMap.
71 // |page_text| should contain the plain text of a web page, including any
72 // subframes, as returned by RenderView::CaptureText().
73 //
74 // To avoid blocking the render thread for too long, the feature extractor
75 // may run in several chunks of work, posting a task to the current
76 // MessageLoop to continue processing. Once feature extraction is complete,
77 // |done_callback| is run on the current thread.
78 // PhishingTermFeatureExtractor takes ownership of the callback.
79 //
80 // |page_text|, |features|, and |shingle_hashes| are owned by the caller,
81 // and must not be destroyed until either |done_callback| is run or
82 // CancelPendingExtraction() is called.
83 void ExtractFeatures(const base::string16* page_text,
84 FeatureMap* features,
85 std::set<uint32_t>* shingle_hashes,
86 const DoneCallback& done_callback);
87
88 // Cancels any pending feature extraction. The DoneCallback will not be run.
89 // Must be called if there is a feature extraction in progress when the page
90 // is unloaded or the PhishingTermFeatureExtractor is destroyed.
91 void CancelPendingExtraction();
92
93 private:
94 struct ExtractionState;
95
96 // The maximum amount of wall time that we will spend on a single extraction
97 // iteration before pausing to let other MessageLoop tasks run.
98 static const int kMaxTimePerChunkMs;
99
100 // The number of words that we will process before checking to see whether
101 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
102 // slow, we don't do this on every word processed.
103 static const int kClockCheckGranularity;
104
105 // The maximum total amount of time that the feature extractor will run
106 // before giving up on the current page.
107 static const int kMaxTotalTimeMs;
108
109 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
110 // until a predefined maximum amount of time has elapsed, then posts a task
111 // to the current MessageLoop to continue extraction. When extraction
112 // finishes, calls RunCallback().
113 void ExtractFeaturesWithTimeout();
114
115 // Handles a single word in the page text.
116 void HandleWord(const base::StringPiece16& word);
117
118 // Helper to verify that there is no pending feature extraction. Dies in
119 // debug builds if the state is not as expected. This is a no-op in release
120 // builds.
121 void CheckNoPendingExtraction();
122
123 // Runs |done_callback_| and then clears all internal state.
124 void RunCallback(bool success);
125
126 // Clears all internal feature extraction state.
127 void Clear();
128
129 // All of the term hashes that we are looking for in the page.
130 const base::hash_set<std::string>* page_term_hashes_;
131
132 // Murmur3 hashes of all the individual words in page_term_hashes_. If
133 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
134 // would contain (hashed) "one" and "two". We do this so that we can have a
135 // quick out in the common case that the current word we are processing
136 // doesn't contain any part of one of our terms.
137 const base::hash_set<uint32_t>* page_word_hashes_;
138
139 // The maximum number of words in an n-gram.
140 const size_t max_words_per_term_;
141
142 // The seed for murmurhash3.
143 const uint32_t murmurhash3_seed_;
144
145 // The maximum number of unique shingle hashes we extract in a page.
146 const size_t max_shingles_per_page_;
147
148 // The number of words in a shingle.
149 const size_t shingle_size_;
150
151 // Non-owned pointer to our clock.
152 FeatureExtractorClock* clock_;
153
154 // The output parameters from the most recent call to ExtractFeatures().
155 const base::string16* page_text_; // The caller keeps ownership of this.
156 FeatureMap* features_; // The caller keeps ownership of this.
157 std::set<uint32_t>* shingle_hashes_;
158 DoneCallback done_callback_;
159
160 // Stores the current state of term extraction from |page_text_|.
161 std::unique_ptr<ExtractionState> state_;
162
163 // Used in scheduling ExtractFeaturesWithTimeout tasks.
164 // These pointers are invalidated if extraction is cancelled.
165 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
166
167 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
168 };
169
170 } // namespace safe_browsing
171
172 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698