OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
10 // | 10 // |
11 // To make it harder for a phisher to enumerate all of the relevant terms in | 11 // To make it harder for a phisher to enumerate all of the relevant terms in |
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. |
13 // | 13 // |
14 // There is one PhishingTermFeatureExtractor per RenderView. | 14 // There is one PhishingTermFeatureExtractor per RenderView. |
15 | 15 |
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
18 | 18 |
| 19 #include <set> |
19 #include <string> | 20 #include <string> |
20 | 21 |
21 #include "base/basictypes.h" | 22 #include "base/basictypes.h" |
22 #include "base/callback.h" | 23 #include "base/callback.h" |
23 #include "base/containers/hash_tables.h" | 24 #include "base/containers/hash_tables.h" |
24 #include "base/containers/mru_cache.h" | |
25 #include "base/memory/scoped_ptr.h" | 25 #include "base/memory/scoped_ptr.h" |
26 #include "base/memory/weak_ptr.h" | 26 #include "base/memory/weak_ptr.h" |
27 #include "base/strings/string16.h" | 27 #include "base/strings/string16.h" |
28 #include "base/strings/string_piece.h" | 28 #include "base/strings/string_piece.h" |
29 | 29 |
30 namespace safe_browsing { | 30 namespace safe_browsing { |
31 class FeatureExtractorClock; | 31 class FeatureExtractorClock; |
32 class FeatureMap; | 32 class FeatureMap; |
33 | 33 |
34 class PhishingTermFeatureExtractor { | 34 class PhishingTermFeatureExtractor { |
35 public: | 35 public: |
36 // Callback to be run when feature extraction finishes. The callback | 36 // Callback to be run when feature extraction finishes. The callback |
37 // argument is true if extraction was successful, false otherwise. | 37 // argument is true if extraction was successful, false otherwise. |
38 typedef base::Callback<void(bool)> DoneCallback; | 38 typedef base::Callback<void(bool)> DoneCallback; |
39 | 39 |
40 // Creates a PhishingTermFeatureExtractor which will extract features for | 40 // Creates a PhishingTermFeatureExtractor which will extract features for |
41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These | 41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These |
42 // terms may be multi-word n-grams, with at most |max_words_per_term| words. | 42 // terms may be multi-word n-grams, with at most |max_words_per_term| words. |
43 // | 43 // |
44 // |page_word_hashes| contains the murmur3 hashes for all of the individual | 44 // |page_word_hashes| contains the murmur3 hashes for all of the individual |
45 // words that make up the terms. Both sets of strings are UTF-8 encoded and | 45 // words that make up the terms. Both sets of strings are UTF-8 encoded and |
46 // lowercased prior to hashing. The caller owns both sets of strings, and | 46 // lowercased prior to hashing. The caller owns both sets of strings, and |
47 // must ensure that they are valid until the PhishingTermFeatureExtractor is | 47 // must ensure that they are valid until the PhishingTermFeatureExtractor is |
48 // destroyed. | 48 // destroyed. |
49 // | 49 // |
| 50 // In addition to extracting page terms, we also extract a text shingling |
| 51 // sketch, which consists of hashes of N-word sequences (referred to as |
| 52 // shingles) in the page. |shingle_size| defines N, and |max_shingles_per_page| |
| 53 // defines the maximum number of unique shingle hashes we extract per page. |
| 54 // |
50 // |clock| is used for timing feature extractor operations, and may be mocked | 55 // |clock| is used for timing feature extractor operations, and may be mocked |
51 // for testing. The caller keeps ownership of the clock. | 56 // for testing. The caller keeps ownership of the clock. |
52 PhishingTermFeatureExtractor( | 57 PhishingTermFeatureExtractor( |
53 const base::hash_set<std::string>* page_term_hashes, | 58 const base::hash_set<std::string>* page_term_hashes, |
54 const base::hash_set<uint32>* page_word_hashes, | 59 const base::hash_set<uint32>* page_word_hashes, |
55 size_t max_words_per_term, | 60 size_t max_words_per_term, |
56 uint32 murmurhash3_seed, | 61 uint32 murmurhash3_seed, |
| 62 size_t max_shingles_per_page, |
| 63 size_t shingle_size, |
57 FeatureExtractorClock* clock); | 64 FeatureExtractorClock* clock); |
58 ~PhishingTermFeatureExtractor(); | 65 ~PhishingTermFeatureExtractor(); |
59 | 66 |
60 // Begins extracting features from |page_text| into the given FeatureMap. | 67 // Begins extracting features from |page_text| into the given FeatureMap. |
61 // |page_text| should contain the plain text of a web page, including any | 68 // |page_text| should contain the plain text of a web page, including any |
62 // subframes, as returned by RenderView::CaptureText(). | 69 // subframes, as returned by RenderView::CaptureText(). |
63 // | 70 // |
64 // To avoid blocking the render thread for too long, the feature extractor | 71 // To avoid blocking the render thread for too long, the feature extractor |
65 // may run in several chunks of work, posting a task to the current | 72 // may run in several chunks of work, posting a task to the current |
66 // MessageLoop to continue processing. Once feature extraction is complete, | 73 // MessageLoop to continue processing. Once feature extraction is complete, |
67 // |done_callback| is run on the current thread. | 74 // |done_callback| is run on the current thread. |
68 // PhishingTermFeatureExtractor takes ownership of the callback. | 75 // PhishingTermFeatureExtractor takes ownership of the callback. |
69 // | 76 // |
70 // |page_text| and |features| are owned by the caller, and must not be | 77 // |page_text|, |features|, and |shingle_hashes| are owned by the caller, |
71 // destroyed until either |done_callback| is run or | 78 // and must not be destroyed until either |done_callback| is run or |
72 // CancelPendingExtraction() is called. | 79 // CancelPendingExtraction() is called. |
73 void ExtractFeatures(const base::string16* page_text, | 80 void ExtractFeatures(const base::string16* page_text, |
74 FeatureMap* features, | 81 FeatureMap* features, |
| 82 std::set<uint32>* shingle_hashes, |
75 const DoneCallback& done_callback); | 83 const DoneCallback& done_callback); |
76 | 84 |
77 // Cancels any pending feature extraction. The DoneCallback will not be run. | 85 // Cancels any pending feature extraction. The DoneCallback will not be run. |
78 // Must be called if there is a feature extraction in progress when the page | 86 // Must be called if there is a feature extraction in progress when the page |
79 // is unloaded or the PhishingTermFeatureExtractor is destroyed. | 87 // is unloaded or the PhishingTermFeatureExtractor is destroyed. |
80 void CancelPendingExtraction(); | 88 void CancelPendingExtraction(); |
81 | 89 |
82 private: | 90 private: |
83 struct ExtractionState; | 91 struct ExtractionState; |
84 | 92 |
85 // The maximum amount of wall time that we will spend on a single extraction | 93 // The maximum amount of wall time that we will spend on a single extraction |
86 // iteration before pausing to let other MessageLoop tasks run. | 94 // iteration before pausing to let other MessageLoop tasks run. |
87 static const int kMaxTimePerChunkMs; | 95 static const int kMaxTimePerChunkMs; |
88 | 96 |
89 // The number of words that we will process before checking to see whether | 97 // The number of words that we will process before checking to see whether |
90 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be | 98 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be |
91 // slow, we don't do this on every word processed. | 99 // slow, we don't do this on every word processed. |
92 static const int kClockCheckGranularity; | 100 static const int kClockCheckGranularity; |
93 | 101 |
94 // The maximum total amount of time that the feature extractor will run | 102 // The maximum total amount of time that the feature extractor will run |
95 // before giving up on the current page. | 103 // before giving up on the current page. |
96 static const int kMaxTotalTimeMs; | 104 static const int kMaxTotalTimeMs; |
97 | 105 |
98 // The size of the cache that we use to determine if we can avoid lower | |
99 // casing, hashing, and UTF conversion. | |
100 static const int kMaxNegativeWordCacheSize; | |
101 | |
102 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs | 106 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
103 // until a predefined maximum amount of time has elapsed, then posts a task | 107 // until a predefined maximum amount of time has elapsed, then posts a task |
104 // to the current MessageLoop to continue extraction. When extraction | 108 // to the current MessageLoop to continue extraction. When extraction |
105 // finishes, calls RunCallback(). | 109 // finishes, calls RunCallback(). |
106 void ExtractFeaturesWithTimeout(); | 110 void ExtractFeaturesWithTimeout(); |
107 | 111 |
108 // Handles a single word in the page text. | 112 // Handles a single word in the page text. |
109 void HandleWord(const base::StringPiece16& word); | 113 void HandleWord(const base::StringPiece16& word); |
110 | 114 |
111 // Helper to verify that there is no pending feature extraction. Dies in | 115 // Helper to verify that there is no pending feature extraction. Dies in |
(...skipping 16 matching lines...) Expand all Loading... |
128 // quick out in the common case that the current word we are processing | 132 // quick out in the common case that the current word we are processing |
129 // doesn't contain any part of one of our terms. | 133 // doesn't contain any part of one of our terms. |
130 const base::hash_set<uint32>* page_word_hashes_; | 134 const base::hash_set<uint32>* page_word_hashes_; |
131 | 135 |
132 // The maximum number of words in an n-gram. | 136 // The maximum number of words in an n-gram. |
133 const size_t max_words_per_term_; | 137 const size_t max_words_per_term_; |
134 | 138 |
135 // The seed for murmurhash3. | 139 // The seed for murmurhash3. |
136 const uint32 murmurhash3_seed_; | 140 const uint32 murmurhash3_seed_; |
137 | 141 |
138 // This cache is used to see if we need to check the word at all, as | 142 // The maximum number of unique shingle hashes we extract in a page. |
139 // converting to UTF8, lowercasing, and hashing are all relatively expensive | 143 const size_t max_shingles_per_page_; |
140 // operations. Though this is called an MRU cache, it seems to behave like | 144 |
141 // an LRU cache (i.e. it evicts the oldest accesses first). | 145 // The number of words in a shingle. |
142 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; | 146 const size_t shingle_size_; |
143 WordCache negative_word_cache_; | |
144 | 147 |
145 // Non-owned pointer to our clock. | 148 // Non-owned pointer to our clock. |
146 FeatureExtractorClock* clock_; | 149 FeatureExtractorClock* clock_; |
147 | 150 |
148 // The output parameters from the most recent call to ExtractFeatures(). | 151 // The output parameters from the most recent call to ExtractFeatures(). |
149 const base::string16* page_text_; // The caller keeps ownership of this. | 152 const base::string16* page_text_; // The caller keeps ownership of this. |
150 FeatureMap* features_; // The caller keeps ownership of this. | 153 FeatureMap* features_; // The caller keeps ownership of this. |
| 154 std::set<uint32>* shingle_hashes_; |
151 DoneCallback done_callback_; | 155 DoneCallback done_callback_; |
152 | 156 |
153 // Stores the current state of term extraction from |page_text_|. | 157 // Stores the current state of term extraction from |page_text_|. |
154 scoped_ptr<ExtractionState> state_; | 158 scoped_ptr<ExtractionState> state_; |
155 | 159 |
156 // Used in scheduling ExtractFeaturesWithTimeout tasks. | 160 // Used in scheduling ExtractFeaturesWithTimeout tasks. |
157 // These pointers are invalidated if extraction is cancelled. | 161 // These pointers are invalidated if extraction is cancelled. |
158 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; | 162 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; |
159 | 163 |
160 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 164 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
161 }; | 165 }; |
162 | 166 |
163 } // namespace safe_browsing | 167 } // namespace safe_browsing |
164 | 168 |
165 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 169 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
OLD | NEW |