Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(76)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

Issue 7549003: Optimize phishing page term feature extraction. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address Brian's comments Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // PhishingTermFeatureExtractor handles computing term features from the text 5 // PhishingTermFeatureExtractor handles computing term features from the text
6 // of a web page for the client-side phishing detection model. To do this, it 6 // of a web page for the client-side phishing detection model. To do this, it
7 // takes a list of terms that appear in the model, and scans through the page 7 // takes a list of terms that appear in the model, and scans through the page
8 // text looking for them. Any terms that appear will cause a corresponding 8 // text looking for them. Any terms that appear will cause a corresponding
9 // features::kPageTerm feature to be added to the FeatureMap. 9 // features::kPageTerm feature to be added to the FeatureMap.
10 // 10 //
11 // To make it harder for a phisher to enumerate all of the relevant terms in 11 // To make it harder for a phisher to enumerate all of the relevant terms in
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
13 // 13 //
14 // There is one PhishingTermFeatureExtractor per RenderView. 14 // There is one PhishingTermFeatureExtractor per RenderView.
15 15
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
18 18
19 #include <string> 19 #include <string>
20 20
21 #include "base/basictypes.h" 21 #include "base/basictypes.h"
22 #include "base/callback_old.h" 22 #include "base/callback_old.h"
23 #include "base/hash_tables.h" 23 #include "base/hash_tables.h"
24 #include "base/memory/mru_cache.h"
24 #include "base/memory/scoped_ptr.h" 25 #include "base/memory/scoped_ptr.h"
26 #include "base/string_piece.h"
25 #include "base/string16.h" 27 #include "base/string16.h"
26 #include "base/task.h" 28 #include "base/task.h"
27 29
28 namespace safe_browsing { 30 namespace safe_browsing {
29 class FeatureExtractorClock; 31 class FeatureExtractorClock;
30 class FeatureMap; 32 class FeatureMap;
31 33
32 class PhishingTermFeatureExtractor { 34 class PhishingTermFeatureExtractor {
33 public: 35 public:
34 // Callback to be run when feature extraction finishes. The callback 36 // Callback to be run when feature extraction finishes. The callback
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
85 87
86 // The number of words that we will process before checking to see whether 88 // The number of words that we will process before checking to see whether
87 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be 89 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
88 // slow, we don't do this on every word processed. 90 // slow, we don't do this on every word processed.
89 static const int kClockCheckGranularity; 91 static const int kClockCheckGranularity;
90 92
91 // The maximum total amount of time that the feature extractor will run 93 // The maximum total amount of time that the feature extractor will run
92 // before giving up on the current page. 94 // before giving up on the current page.
93 static const int kMaxTotalTimeMs; 95 static const int kMaxTotalTimeMs;
94 96
97 // The size of the cache that we use to determine if we can avoid
98 // lowercasing, hashing, and UTF conversion.
99 static const int kMaxNegativeWordCacheSize;
100
95 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs 101 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
96 // until a predefined maximum amount of time has elapsed, then posts a task 102 // until a predefined maximum amount of time has elapsed, then posts a task
97 // to the current MessageLoop to continue extraction. When extraction 103 // to the current MessageLoop to continue extraction. When extraction
98 // finishes, calls RunCallback(). 104 // finishes, calls RunCallback().
99 void ExtractFeaturesWithTimeout(); 105 void ExtractFeaturesWithTimeout();
100 106
101 // Handles a single word in the page text. 107 // Handles a single word in the page text.
102 void HandleWord(const string16& word); 108 void HandleWord(const base::StringPiece16& word);
103 109
104 // Helper to verify that there is no pending feature extraction. Dies in 110 // Helper to verify that there is no pending feature extraction. Dies in
105 // debug builds if the state is not as expected. This is a no-op in release 111 // debug builds if the state is not as expected. This is a no-op in release
106 // builds. 112 // builds.
107 void CheckNoPendingExtraction(); 113 void CheckNoPendingExtraction();
108 114
109 // Runs |done_callback_| and then clears all internal state. 115 // Runs |done_callback_| and then clears all internal state.
110 void RunCallback(bool success); 116 void RunCallback(bool success);
111 117
112 // Clears all internal feature extraction state. 118 // Clears all internal feature extraction state.
113 void Clear(); 119 void Clear();
114 120
115 // All of the term hashes that we are looking for in the page. 121 // All of the term hashes that we are looking for in the page.
116 const base::hash_set<std::string>* page_term_hashes_; 122 const base::hash_set<std::string>* page_term_hashes_;
117 123
118 // Hashes of all the individual words in page_term_hashes_. If 124 // Hashes of all the individual words in page_term_hashes_. If
119 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ 125 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
120 // would contain (hashed) "one" and "two". We do this so that we can have a 126 // would contain (hashed) "one" and "two". We do this so that we can have a
121 // quick out in the common case that the current word we are processing 127 // quick out in the common case that the current word we are processing
122 // doesn't contain any part of one of our terms. 128 // doesn't contain any part of one of our terms.
123 const base::hash_set<std::string>* page_word_hashes_; 129 const base::hash_set<std::string>* page_word_hashes_;
124 130
125 // The maximum number of words in an n-gram. 131 // The maximum number of words in an n-gram.
126 size_t max_words_per_term_; 132 size_t max_words_per_term_;
127 133
134 // This cache is used to see if we need to check the word at all, as
135 // converting to UTF8, lowercasing, and hashing are all relatively expensive
136 // operations. Though this is called an MRU cache, it behaves like an LRU
137 // cache (i.e. it evicts the least recently accessed entries first).
138 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
139 WordCache negative_word_cache_;
140
128 // Non-owned pointer to our clock. 141 // Non-owned pointer to our clock.
129 FeatureExtractorClock* clock_; 142 FeatureExtractorClock* clock_;
130 143
131 // The output parameters from the most recent call to ExtractFeatures(). 144 // The output parameters from the most recent call to ExtractFeatures().
132 const string16* page_text_; // The caller keeps ownership of this. 145 const string16* page_text_; // The caller keeps ownership of this.
133 FeatureMap* features_; // The caller keeps ownership of this. 146 FeatureMap* features_; // The caller keeps ownership of this.
134 scoped_ptr<DoneCallback> done_callback_; 147 scoped_ptr<DoneCallback> done_callback_;
135 148
136 // Stores the current state of term extraction from |page_text_|. 149 // Stores the current state of term extraction from |page_text_|.
137 scoped_ptr<ExtractionState> state_; 150 scoped_ptr<ExtractionState> state_;
138 151
139 // Used to create ExtractFeaturesWithTimeout tasks. 152 // Used to create ExtractFeaturesWithTimeout tasks.
140 // These tasks are revoked if extraction is cancelled. 153 // These tasks are revoked if extraction is cancelled.
141 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; 154 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_;
142 155
143 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); 156 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
144 }; 157 };
145 158
146 } // namespace safe_browsing 159 } // namespace safe_browsing
147 160
148 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ 161 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
OLDNEW
« no previous file with comments | « base/string_piece_unittest.cc ('k') | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698