OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
10 // | 10 // |
11 // To make it harder for a phisher to enumerate all of the relevant terms in | 11 // To make it harder for a phisher to enumerate all of the relevant terms in |
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. |
13 // | 13 // |
14 // There is one PhishingTermFeatureExtractor per RenderView. | 14 // There is one PhishingTermFeatureExtractor per RenderView. |
15 | 15 |
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
18 | 18 |
19 #include <string> | 19 #include <string> |
20 | 20 |
21 #include "base/basictypes.h" | 21 #include "base/basictypes.h" |
22 #include "base/callback_old.h" | 22 #include "base/callback_old.h" |
23 #include "base/hash_tables.h" | 23 #include "base/hash_tables.h" |
| 24 #include "base/memory/mru_cache.h" |
24 #include "base/memory/scoped_ptr.h" | 25 #include "base/memory/scoped_ptr.h" |
| 26 #include "base/string_piece.h" |
25 #include "base/string16.h" | 27 #include "base/string16.h" |
26 #include "base/task.h" | 28 #include "base/task.h" |
27 | 29 |
28 namespace safe_browsing { | 30 namespace safe_browsing { |
29 class FeatureExtractorClock; | 31 class FeatureExtractorClock; |
30 class FeatureMap; | 32 class FeatureMap; |
31 | 33 |
32 class PhishingTermFeatureExtractor { | 34 class PhishingTermFeatureExtractor { |
33 public: | 35 public: |
34 // Callback to be run when feature extraction finishes. The callback | 36 // Callback to be run when feature extraction finishes. The callback |
(...skipping 50 matching lines...) |
85 | 87 |
86 // The number of words that we will process before checking to see whether | 88 // The number of words that we will process before checking to see whether |
87 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be | 89 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be |
88 // slow, we don't do this on every word processed. | 90 // slow, we don't do this on every word processed. |
89 static const int kClockCheckGranularity; | 91 static const int kClockCheckGranularity; |
90 | 92 |
91 // The maximum total amount of time that the feature extractor will run | 93 // The maximum total amount of time that the feature extractor will run |
92 // before giving up on the current page. | 94 // before giving up on the current page. |
93 static const int kMaxTotalTimeMs; | 95 static const int kMaxTotalTimeMs; |
94 | 96 |
| 97 // The maximum size of the cache that we use to determine whether we can |
| 98 // skip lowercasing, hashing, and UTF conversion for a word. |
| 99 static const int kMaxNegativeWordCacheSize; |
| 100 |
95 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs | 101 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
96 // until a predefined maximum amount of time has elapsed, then posts a task | 102 // until a predefined maximum amount of time has elapsed, then posts a task |
97 // to the current MessageLoop to continue extraction. When extraction | 103 // to the current MessageLoop to continue extraction. When extraction |
98 // finishes, calls RunCallback(). | 104 // finishes, calls RunCallback(). |
99 void ExtractFeaturesWithTimeout(); | 105 void ExtractFeaturesWithTimeout(); |
100 | 106 |
101 // Handles a single word in the page text. | 107 // Handles a single word in the page text. |
102 void HandleWord(const string16& word); | 108 void HandleWord(const base::StringPiece16& word); |
103 | 109 |
104 // Helper to verify that there is no pending feature extraction. Dies in | 110 // Helper to verify that there is no pending feature extraction. Dies in |
105 // debug builds if the state is not as expected. This is a no-op in release | 111 // debug builds if the state is not as expected. This is a no-op in release |
106 // builds. | 112 // builds. |
107 void CheckNoPendingExtraction(); | 113 void CheckNoPendingExtraction(); |
108 | 114 |
109 // Runs |done_callback_| and then clears all internal state. | 115 // Runs |done_callback_| and then clears all internal state. |
110 void RunCallback(bool success); | 116 void RunCallback(bool success); |
111 | 117 |
112 // Clears all internal feature extraction state. | 118 // Clears all internal feature extraction state. |
113 void Clear(); | 119 void Clear(); |
114 | 120 |
115 // All of the term hashes that we are looking for in the page. | 121 // All of the term hashes that we are looking for in the page. |
116 const base::hash_set<std::string>* page_term_hashes_; | 122 const base::hash_set<std::string>* page_term_hashes_; |
117 | 123 |
118 // Hashes of all the individual words in page_term_hashes_. If | 124 // Hashes of all the individual words in page_term_hashes_. If |
119 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | 125 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
120 // would contain (hashed) "one" and "two". We do this so that we can have a | 126 // would contain (hashed) "one" and "two". We do this so that we can have a |
121 // quick out in the common case that the current word we are processing | 127 // quick out in the common case that the current word we are processing |
122 // doesn't contain any part of one of our terms. | 128 // doesn't contain any part of one of our terms. |
123 const base::hash_set<std::string>* page_word_hashes_; | 129 const base::hash_set<std::string>* page_word_hashes_; |
124 | 130 |
125 // The maximum number of words in an n-gram. | 131 // The maximum number of words in an n-gram. |
126 size_t max_words_per_term_; | 132 size_t max_words_per_term_; |
127 | 133 |
| 134 // This cache is used to see if we need to check the word at all, as |
| 135 // converting to UTF8, lowercasing, and hashing are all relatively expensive |
| 136 // operations. Though this is called an MRU cache, it seems to behave like |
| 137 // an LRU cache (i.e. it evicts the least recently accessed entries first). |
| 138 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; |
| 139 WordCache negative_word_cache_; |
| 140 |
128 // Non-owned pointer to our clock. | 141 // Non-owned pointer to our clock. |
129 FeatureExtractorClock* clock_; | 142 FeatureExtractorClock* clock_; |
130 | 143 |
131 // The output parameters from the most recent call to ExtractFeatures(). | 144 // The output parameters from the most recent call to ExtractFeatures(). |
132 const string16* page_text_; // The caller keeps ownership of this. | 145 const string16* page_text_; // The caller keeps ownership of this. |
133 FeatureMap* features_; // The caller keeps ownership of this. | 146 FeatureMap* features_; // The caller keeps ownership of this. |
134 scoped_ptr<DoneCallback> done_callback_; | 147 scoped_ptr<DoneCallback> done_callback_; |
135 | 148 |
136 // Stores the current state of term extraction from |page_text_|. | 149 // Stores the current state of term extraction from |page_text_|. |
137 scoped_ptr<ExtractionState> state_; | 150 scoped_ptr<ExtractionState> state_; |
138 | 151 |
139 // Used to create ExtractFeaturesWithTimeout tasks. | 152 // Used to create ExtractFeaturesWithTimeout tasks. |
140 // These tasks are revoked if extraction is cancelled. | 153 // These tasks are revoked if extraction is cancelled. |
141 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; | 154 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; |
142 | 155 |
143 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 156 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
144 }; | 157 }; |
145 | 158 |
146 } // namespace safe_browsing | 159 } // namespace safe_browsing |
147 | 160 |
148 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 161 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
OLD | NEW |