OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
10 // | 10 // |
(...skipping 23 matching lines...) Expand all Loading... |
34 class PhishingTermFeatureExtractor { | 34 class PhishingTermFeatureExtractor { |
35 public: | 35 public: |
36 // Callback to be run when feature extraction finishes. The callback | 36 // Callback to be run when feature extraction finishes. The callback |
37 // argument is true if extraction was successful, false otherwise. | 37 // argument is true if extraction was successful, false otherwise. |
38 typedef Callback1<bool>::Type DoneCallback; | 38 typedef Callback1<bool>::Type DoneCallback; |
39 | 39 |
40 // Creates a PhishingTermFeatureExtractor which will extract features for | 40 // Creates a PhishingTermFeatureExtractor which will extract features for |
41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These | 41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These |
42 // terms may be multi-word n-grams, with at most |max_words_per_term| words. | 42 // terms may be multi-word n-grams, with at most |max_words_per_term| words. |
43 // | 43 // |
44 // |page_word_hashes| contains the hashes for all of the individual words | 44 // |page_word_hashes| contains the murmur3 hashes for all of the individual |
45 // that make up the terms. Both sets of strings are UTF-8 encoded and | 45 // words that make up the terms. The terms and words are UTF-8 encoded and |
46 // lowercased prior to hashing. The caller owns both sets of strings, and | 46 // lowercased prior to hashing. The caller owns both hash sets, and |
47 // must ensure that they are valid until the PhishingTermFeatureExtractor is | 47 // must ensure that they are valid until the PhishingTermFeatureExtractor is |
48 // destroyed. | 48 // destroyed. |
49 // | 49 // |
50 // |clock| is used for timing feature extractor operations, and may be mocked | 50 // |clock| is used for timing feature extractor operations, and may be mocked |
51 // for testing. The caller keeps ownership of the clock. | 51 // for testing. The caller keeps ownership of the clock. |
52 PhishingTermFeatureExtractor( | 52 PhishingTermFeatureExtractor( |
53 const base::hash_set<std::string>* page_term_hashes, | 53 const base::hash_set<std::string>* page_term_hashes, |
54 const base::hash_set<std::string>* page_word_hashes, | 54 const base::hash_set<uint32>* page_word_hashes, |
55 size_t max_words_per_term, | 55 size_t max_words_per_term, |
| 56 uint32 murmurhash3_seed, |
56 FeatureExtractorClock* clock); | 57 FeatureExtractorClock* clock); |
57 ~PhishingTermFeatureExtractor(); | 58 ~PhishingTermFeatureExtractor(); |
58 | 59 |
59 // Begins extracting features from |page_text| into the given FeatureMap. | 60 // Begins extracting features from |page_text| into the given FeatureMap. |
60 // |page_text| should contain the plain text of a web page, including any | 61 // |page_text| should contain the plain text of a web page, including any |
61 // subframes, as returned by RenderView::CaptureText(). | 62 // subframes, as returned by RenderView::CaptureText(). |
62 // | 63 // |
63 // To avoid blocking the render thread for too long, the feature extractor | 64 // To avoid blocking the render thread for too long, the feature extractor |
64 // may run in several chunks of work, posting a task to the current | 65 // may run in several chunks of work, posting a task to the current |
65 // MessageLoop to continue processing. Once feature extraction is complete, | 66 // MessageLoop to continue processing. Once feature extraction is complete, |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
114 | 115 |
115 // Runs |done_callback_| and then clears all internal state. | 116 // Runs |done_callback_| and then clears all internal state. |
116 void RunCallback(bool success); | 117 void RunCallback(bool success); |
117 | 118 |
118 // Clears all internal feature extraction state. | 119 // Clears all internal feature extraction state. |
119 void Clear(); | 120 void Clear(); |
120 | 121 |
121 // All of the term hashes that we are looking for in the page. | 122 // All of the term hashes that we are looking for in the page. |
122 const base::hash_set<std::string>* page_term_hashes_; | 123 const base::hash_set<std::string>* page_term_hashes_; |
123 | 124 |
124 // Hashes of all the individual words in page_term_hashes_. If | 125 // Murmur3 hashes of all the individual words in page_term_hashes_. If |
125 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | 126 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
126 // would contain (hashed) "one" and "two". We do this so that we can have a | 127 // would contain (hashed) "one" and "two". We do this so that we can have a |
127 // quick out in the common case that the current word we are processing | 128 // quick out in the common case that the current word we are processing |
128 // doesn't contain any part of one of our terms. | 129 // doesn't contain any part of one of our terms. |
129 const base::hash_set<std::string>* page_word_hashes_; | 130 const base::hash_set<uint32>* page_word_hashes_; |
130 | 131 |
131 // The maximum number of words in an n-gram. | 132 // The maximum number of words in an n-gram. |
132 size_t max_words_per_term_; | 133 const size_t max_words_per_term_; |
| 134 |
| 135 // The seed for murmurhash3. |
| 136 const uint32 murmurhash3_seed_; |
133 | 137 |
134 // This cache is used to see if we need to check the word at all, as | 138 // This cache is used to see if we need to check the word at all, as |
135 // converting to UTF8, lowercasing, and hashing are all relatively expensive | 139 // converting to UTF8, lowercasing, and hashing are all relatively expensive |
136 // operations. Though this is called an MRU cache, it seems to behave like | 140 // operations. Though this is called an MRU cache, it seems to behave like |
137 // an LRU cache (i.e. it evicts the oldest accesses first). | 141 // an LRU cache (i.e. it evicts the oldest accesses first). |
138 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; | 142 typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache; |
139 WordCache negative_word_cache_; | 143 WordCache negative_word_cache_; |
140 | 144 |
141 // Non-owned pointer to our clock. | 145 // Non-owned pointer to our clock. |
142 FeatureExtractorClock* clock_; | 146 FeatureExtractorClock* clock_; |
143 | 147 |
144 // The parameters from the most recent call to ExtractFeatures(). | 148 // The parameters from the most recent call to ExtractFeatures(). |
145 const string16* page_text_; // The caller keeps ownership of this. | 149 const string16* page_text_; // The caller keeps ownership of this. |
146 FeatureMap* features_; // The caller keeps ownership of this. | 150 FeatureMap* features_; // The caller keeps ownership of this. |
147 scoped_ptr<DoneCallback> done_callback_; | 151 scoped_ptr<DoneCallback> done_callback_; |
148 | 152 |
149 // Stores the current state of term extraction from |page_text_|. | 153 // Stores the current state of term extraction from |page_text_|. |
150 scoped_ptr<ExtractionState> state_; | 154 scoped_ptr<ExtractionState> state_; |
151 | 155 |
152 // Used to create ExtractFeaturesWithTimeout tasks. | 156 // Used to create ExtractFeaturesWithTimeout tasks. |
153 // These tasks are revoked if extraction is cancelled. | 157 // These tasks are revoked if extraction is cancelled. |
154 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; | 158 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; |
155 | 159 |
156 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 160 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
157 }; | 161 }; |
158 | 162 |
159 } // namespace safe_browsing | 163 } // namespace safe_browsing |
160 | 164 |
161 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 165 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
OLD | NEW |