OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
10 // | 10 // |
11 // To make it harder for a phisher to enumerate all of the relevant terms in | 11 // To make it harder for a phisher to enumerate all of the relevant terms in |
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. |
13 // | 13 // |
14 // There is one PhishingTermFeatureExtractor per RenderView. | 14 // There is one PhishingTermFeatureExtractor per RenderView. |
15 | 15 |
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
18 | 18 |
19 #include <string> | 19 #include <string> |
20 | 20 |
21 #include "base/basictypes.h" | 21 #include "base/basictypes.h" |
22 #include "base/callback_old.h" | 22 #include "base/callback_old.h" |
23 #include "base/hash_tables.h" | 23 #include "base/hash_tables.h" |
| 24 #include "base/memory/mru_cache.h" |
24 #include "base/memory/scoped_ptr.h" | 25 #include "base/memory/scoped_ptr.h" |
25 #include "base/string16.h" | 26 #include "base/string16.h" |
26 #include "base/task.h" | 27 #include "base/task.h" |
| 28 #include "base/wide_string_piece.h" |
27 | 29 |
28 namespace safe_browsing { | 30 namespace safe_browsing { |
29 class FeatureExtractorClock; | 31 class FeatureExtractorClock; |
30 class FeatureMap; | 32 class FeatureMap; |
31 | 33 |
32 class PhishingTermFeatureExtractor { | 34 class PhishingTermFeatureExtractor { |
33 public: | 35 public: |
34 // Callback to be run when feature extraction finishes. The callback | 36 // Callback to be run when feature extraction finishes. The callback |
35 // argument is true if extraction was successful, false otherwise. | 37 // argument is true if extraction was successful, false otherwise. |
36 typedef Callback1<bool>::Type DoneCallback; | 38 typedef Callback1<bool>::Type DoneCallback; |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
92 // before giving up on the current page. | 94 // before giving up on the current page. |
93 static const int kMaxTotalTimeMs; | 95 static const int kMaxTotalTimeMs; |
94 | 96 |
95 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs | 97 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
96 // until a predefined maximum amount of time has elapsed, then posts a task | 98 // until a predefined maximum amount of time has elapsed, then posts a task |
97 // to the current MessageLoop to continue extraction. When extraction | 99 // to the current MessageLoop to continue extraction. When extraction |
98 // finishes, calls RunCallback(). | 100 // finishes, calls RunCallback(). |
99 void ExtractFeaturesWithTimeout(); | 101 void ExtractFeaturesWithTimeout(); |
100 | 102 |
101 // Handles a single word in the page text. | 103 // Handles a single word in the page text. |
102 void HandleWord(const string16& word); | 104 void HandleWord(const base::WideStringPiece& word); |
103 | 105 |
104 // Helper to verify that there is no pending feature extraction. Dies in | 106 // Helper to verify that there is no pending feature extraction. Dies in |
105 // debug builds if the state is not as expected. This is a no-op in release | 107 // debug builds if the state is not as expected. This is a no-op in release |
106 // builds. | 108 // builds. |
107 void CheckNoPendingExtraction(); | 109 void CheckNoPendingExtraction(); |
108 | 110 |
109 // Runs |done_callback_| and then clears all internal state. | 111 // Runs |done_callback_| and then clears all internal state. |
110 void RunCallback(bool success); | 112 void RunCallback(bool success); |
111 | 113 |
112 // Clears all internal feature extraction state. | 114 // Clears all internal feature extraction state. |
113 void Clear(); | 115 void Clear(); |
114 | 116 |
115 // All of the term hashes that we are looking for in the page. | 117 // All of the term hashes that we are looking for in the page. |
116 const base::hash_set<std::string>* page_term_hashes_; | 118 const base::hash_set<std::string>* page_term_hashes_; |
117 | 119 |
118 // Hashes of all the individual words in page_term_hashes_. If | 120 // Hashes of all the individual words in page_term_hashes_. If |
119 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | 121 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
120 // would contain (hashed) "one" and "two". We do this so that we can have a | 122 // would contain (hashed) "one" and "two". We do this so that we can have a |
121 // quick out in the common case that the current word we are processing | 123 // quick out in the common case that the current word we are processing |
122 // doesn't contain any part of one of our terms. | 124 // doesn't contain any part of one of our terms. |
123 const base::hash_set<std::string>* page_word_hashes_; | 125 const base::hash_set<std::string>* page_word_hashes_; |
124 | 126 |
125 // The maximum number of words in an n-gram. | 127 // The maximum number of words in an n-gram. |
126 size_t max_words_per_term_; | 128 size_t max_words_per_term_; |
127 | 129 |
| 130 // This cache is used to see if we need to check the word at all, as |
| 131 // converting to UTF8, lowercasing, and hashing are all relatively expensive |
| 132 // operations. Though this is called an MRU cache, it seems to behave like |
| 133 // an LRU cache (i.e. it evicts the oldest accesses first). |
| 134 typedef base::HashingMRUCache<base::WideStringPiece, bool> WordCache; |
| 135 WordCache negative_word_cache_; |
| 136 |
128 // Non-owned pointer to our clock. | 137 // Non-owned pointer to our clock. |
129 FeatureExtractorClock* clock_; | 138 FeatureExtractorClock* clock_; |
130 | 139 |
131 // The output parameters from the most recent call to ExtractFeatures(). | 140 // The output parameters from the most recent call to ExtractFeatures(). |
132 const string16* page_text_; // The caller keeps ownership of this. | 141 const string16* page_text_; // The caller keeps ownership of this. |
133 FeatureMap* features_; // The caller keeps ownership of this. | 142 FeatureMap* features_; // The caller keeps ownership of this. |
134 scoped_ptr<DoneCallback> done_callback_; | 143 scoped_ptr<DoneCallback> done_callback_; |
135 | 144 |
136 // Stores the current state of term extraction from |page_text_|. | 145 // Stores the current state of term extraction from |page_text_|. |
137 scoped_ptr<ExtractionState> state_; | 146 scoped_ptr<ExtractionState> state_; |
138 | 147 |
139 // Used to create ExtractFeaturesWithTimeout tasks. | 148 // Used to create ExtractFeaturesWithTimeout tasks. |
140 // These tasks are revoked if extraction is cancelled. | 149 // These tasks are revoked if extraction is cancelled. |
141 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; | 150 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; |
142 | 151 |
143 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 152 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
144 }; | 153 }; |
145 | 154 |
146 } // namespace safe_browsing | 155 } // namespace safe_browsing |
147 | 156 |
148 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 157 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
OLD | NEW |