Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 3214002: Add a term feature extractor for client-side phishing detection. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Add an extra comment/TODO about performance. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // PhishingTermFeatureExtractor handles computing term features from the text
6 // of a web page for the client-side phishing detection model. To do this, it
7 // takes a list of terms that appear in the model, and scans through the page
8 // text looking for them. Any terms that appear will cause a corresponding
9 // features::kPageTerm feature to be added to the FeatureMap.
10 //
11 // To make it harder for a phisher to enumerate all of the relevant terms in
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
13 //
14 // TODO(bryner): When we compute the score, all of the features in the
15 // FeatureMap will be hashed so that they can be compared against the model.
16 // When this is implemented, add a comment about it here.
17 //
18 // There is one PhishingTermFeatureExtractor per RenderView.
19
20 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
21 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
22
23 #include <string>
24
25 #include "base/basictypes.h"
26 #include "base/callback.h"
27 #include "base/hash_tables.h"
28 #include "base/scoped_ptr.h"
29 #include "base/string16.h"
30 #include "base/task.h"
31
32 namespace safe_browsing {
33 class FeatureExtractorClock;
34 class FeatureMap;
35
36 class PhishingTermFeatureExtractor {
37 public:
38 // Callback to be run when feature extraction finishes. The callback
39 // argument is true if extraction was successful, false otherwise.
40 typedef Callback1<bool>::Type DoneCallback;
41
42 // Creates a PhishingTermFeatureExtractor which will extract features for
43 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
44 // terms may be multi-word n-grams, with at most |max_words_per_term| words.
45 //
46 // |page_word_hashes| contains the hashes for all of the individual words
47 // that make up the terms. Both sets of strings are UTF-8 encoded and
48 // lowercased prior to hashing. The caller owns both sets of strings, and
49 // must ensure that they are valid until the PhishingTermFeatureExtractor is
50 // destroyed.
51 //
52 // |clock| is used for timing feature extractor operations, and may be mocked
53 // for testing. PhishingTermFeatureExtractor takes ownership of the clock.
54 PhishingTermFeatureExtractor(
55 const base::hash_set<std::string>* page_term_hashes,
56 const base::hash_set<std::string>* page_word_hashes,
57 size_t max_words_per_term,
58 FeatureExtractorClock* clock);
59 ~PhishingTermFeatureExtractor();
60
61 // Begins extracting features from |page_text| into the given FeatureMap.
62 // |page_text| should contain the plain text of a web page, including any
63 // subframes, as returned by RenderView::CaptureText().
64 //
65 // To avoid blocking the render thread for too long, the feature extractor
66 // may run in several chunks of work, posting a task to the current
67 // MessageLoop to continue processing. Once feature extraction is complete,
68 // |done_callback| is run on the current thread.
69 // PhishingTermFeatureExtractor takes ownership of the callback.
70 //
71 // |page_text| and |features| are owned by the caller, and must not be
72 // destroyed until either |done_callback| is run or
73 // CancelPendingExtraction() is called.
74 void ExtractFeatures(const string16* page_text,
75 FeatureMap* features,
76 DoneCallback* done_callback);
77
78 // Cancels any pending feature extraction. The DoneCallback will not be run.
79 // Must be called if there is a feature extraction in progress when the page
80 // is unloaded or the PhishingTermFeatureExtractor is destroyed.
81 void CancelPendingExtraction();
82
83 private:
84 struct ExtractionState;
85
86 // The maximum amount of wall time that we will spend on a single extraction
87 // iteration before pausing to let other MessageLoop tasks run.
88 static const int kMaxTimePerChunkMs;
89
90 // The number of words that we will process before checking to see whether
91 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
92 // slow, we don't do this on every word processed.
93 static const int kClockCheckGranularity;
94
95 // The maximum total amount of time that the feature extractor will run
96 // before giving up on the current page.
97 static const int kMaxTotalTimeMs;
98
99 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
100 // until a predefined maximum amount of time has elapsed, then posts a task
101 // to the current MessageLoop to continue extraction. When extraction
102 // finishes, calls RunCallback().
103 void ExtractFeaturesWithTimeout();
104
105 // Handles a single word in the page text.
106 void HandleWord(const string16& word);
107
108 // Helper to verify that there is no pending feature extraction. Dies in
109 // debug builds if the state is not as expected. This is a no-op in release
110 // builds.
111 void CheckNoPendingExtraction();
112
113 // Runs |done_callback_| and then clears all internal state.
114 void RunCallback(bool success);
115
116 // Clears all internal feature extraction state.
117 void Clear();
118
119 // All of the term hashes that we are looking for in the page.
120 const base::hash_set<std::string>* page_term_hashes_; // Not owned (see constructor).
121
122 // Hashes of all the individual words in page_term_hashes_. If
123 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
124 // would contain (hashed) "one" and "two". We do this so that we can have a
125 // quick out in the common case that the current word we are processing
126 // doesn't contain any part of one of our terms.
127 const base::hash_set<std::string>* page_word_hashes_; // Not owned (see constructor).
128
129 // The maximum number of words in any one term (n-gram).
130 size_t max_words_per_term_;
131
132 // Owned pointer to our clock.
133 scoped_ptr<FeatureExtractorClock> clock_;
134
135 // The parameters from the most recent call to ExtractFeatures().
136 const string16* page_text_; // The caller keeps ownership of this.
137 FeatureMap* features_; // The caller keeps ownership of this.
138 scoped_ptr<DoneCallback> done_callback_; // Owned by this object.
139
140 // Stores the current state of term extraction from |page_text_|.
141 scoped_ptr<ExtractionState> state_;
142
143 // Used to create ExtractFeaturesWithTimeout tasks.
144 // These tasks are revoked if extraction is cancelled.
145 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_;
146
147 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
148 };
149
150 } // namespace safe_browsing
151
152 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698