| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // PhishingTermFeatureExtractor handles computing term features from the text | 5 // PhishingTermFeatureExtractor handles computing term features from the text |
| 6 // of a web page for the client-side phishing detection model. To do this, it | 6 // of a web page for the client-side phishing detection model. To do this, it |
| 7 // takes a list of terms that appear in the model, and scans through the page | 7 // takes a list of terms that appear in the model, and scans through the page |
| 8 // text looking for them. Any terms that appear will cause a corresponding | 8 // text looking for them. Any terms that appear will cause a corresponding |
| 9 // features::kPageTerm feature to be added to the FeatureMap. | 9 // features::kPageTerm feature to be added to the FeatureMap. |
| 10 // | 10 // |
| 11 // To make it harder for a phisher to enumerate all of the relevant terms in | 11 // To make it harder for a phisher to enumerate all of the relevant terms in |
| 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. |
| 13 // | 13 // |
| 14 // There is one PhishingTermFeatureExtractor per RenderView. | 14 // There is one PhishingTermFeatureExtractor per RenderView. |
| 15 | 15 |
| 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| 18 | 18 |
| | 19 #include <stddef.h> |
| | 20 #include <stdint.h> |
| | 21 |
| 19 #include <set> | 22 #include <set> |
| 20 #include <string> | 23 #include <string> |
| 21 | 24 |
| 22 #include "base/basictypes.h" | |
| 23 #include "base/callback.h" | 25 #include "base/callback.h" |
| 24 #include "base/containers/hash_tables.h" | 26 #include "base/containers/hash_tables.h" |
| | 27 #include "base/macros.h" |
| 25 #include "base/memory/scoped_ptr.h" | 28 #include "base/memory/scoped_ptr.h" |
| 26 #include "base/memory/weak_ptr.h" | 29 #include "base/memory/weak_ptr.h" |
| 27 #include "base/strings/string16.h" | 30 #include "base/strings/string16.h" |
| 28 #include "base/strings/string_piece.h" | 31 #include "base/strings/string_piece.h" |
| 29 | 32 |
| 30 namespace safe_browsing { | 33 namespace safe_browsing { |
| 31 class FeatureExtractorClock; | 34 class FeatureExtractorClock; |
| 32 class FeatureMap; | 35 class FeatureMap; |
| 33 | 36 |
| 34 class PhishingTermFeatureExtractor { | 37 class PhishingTermFeatureExtractor { |
| (...skipping 14 matching lines...) | (...skipping 14 matching lines...) |
| 49 // | 52 // |
| 50 // In addition to extracting page terms, we will also extract text shingling | 53 // In addition to extracting page terms, we will also extract text shingling |
| 51 // sketch, which consists of hashes of N-gram-words (referred to as shingles) | 54 // sketch, which consists of hashes of N-gram-words (referred to as shingles) |
| 52 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines | 55 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines |
| 53 // the maximum number of unique shingle hashes we extract per page. | 56 // the maximum number of unique shingle hashes we extract per page. |
| 54 // | 57 // |
| 55 // |clock| is used for timing feature extractor operations, and may be mocked | 58 // |clock| is used for timing feature extractor operations, and may be mocked |
| 56 // for testing. The caller keeps ownership of the clock. | 59 // for testing. The caller keeps ownership of the clock. |
| 57 PhishingTermFeatureExtractor( | 60 PhishingTermFeatureExtractor( |
| 58 const base::hash_set<std::string>* page_term_hashes, | 61 const base::hash_set<std::string>* page_term_hashes, |
| 59 const base::hash_set<uint32>* page_word_hashes, | 62 const base::hash_set<uint32_t>* page_word_hashes, |
| 60 size_t max_words_per_term, | 63 size_t max_words_per_term, |
| 61 uint32 murmurhash3_seed, | 64 uint32_t murmurhash3_seed, |
| 62 size_t max_shingles_per_page, | 65 size_t max_shingles_per_page, |
| 63 size_t shingle_size, | 66 size_t shingle_size, |
| 64 FeatureExtractorClock* clock); | 67 FeatureExtractorClock* clock); |
| 65 ~PhishingTermFeatureExtractor(); | 68 ~PhishingTermFeatureExtractor(); |
| 66 | 69 |
| 67 // Begins extracting features from |page_text| into the given FeatureMap. | 70 // Begins extracting features from |page_text| into the given FeatureMap. |
| 68 // |page_text| should contain the plain text of a web page, including any | 71 // |page_text| should contain the plain text of a web page, including any |
| 69 // subframes, as returned by RenderView::CaptureText(). | 72 // subframes, as returned by RenderView::CaptureText(). |
| 70 // | 73 // |
| 71 // To avoid blocking the render thread for too long, the feature extractor | 74 // To avoid blocking the render thread for too long, the feature extractor |
| 72 // may run in several chunks of work, posting a task to the current | 75 // may run in several chunks of work, posting a task to the current |
| 73 // MessageLoop to continue processing. Once feature extraction is complete, | 76 // MessageLoop to continue processing. Once feature extraction is complete, |
| 74 // |done_callback| is run on the current thread. | 77 // |done_callback| is run on the current thread. |
| 75 // PhishingTermFeatureExtractor takes ownership of the callback. | 78 // PhishingTermFeatureExtractor takes ownership of the callback. |
| 76 // | 79 // |
| 77 // |page_text|, |features|, and |shingle_hashes| are owned by the caller, | 80 // |page_text|, |features|, and |shingle_hashes| are owned by the caller, |
| 78 // and must not be destroyed until either |done_callback| is run or | 81 // and must not be destroyed until either |done_callback| is run or |
| 79 // CancelPendingExtraction() is called. | 82 // CancelPendingExtraction() is called. |
| 80 void ExtractFeatures(const base::string16* page_text, | 83 void ExtractFeatures(const base::string16* page_text, |
| 81 FeatureMap* features, | 84 FeatureMap* features, |
| 82 std::set<uint32>* shingle_hashes, | 85 std::set<uint32_t>* shingle_hashes, |
| 83 const DoneCallback& done_callback); | 86 const DoneCallback& done_callback); |
| 84 | 87 |
| 85 // Cancels any pending feature extraction. The DoneCallback will not be run. | 88 // Cancels any pending feature extraction. The DoneCallback will not be run. |
| 86 // Must be called if there is a feature extraction in progress when the page | 89 // Must be called if there is a feature extraction in progress when the page |
| 87 // is unloaded or the PhishingTermFeatureExtractor is destroyed. | 90 // is unloaded or the PhishingTermFeatureExtractor is destroyed. |
| 88 void CancelPendingExtraction(); | 91 void CancelPendingExtraction(); |
| 89 | 92 |
| 90 private: | 93 private: |
| 91 struct ExtractionState; | 94 struct ExtractionState; |
| 92 | 95 |
| (...skipping 31 matching lines...) | (...skipping 31 matching lines...) |
| 124 void Clear(); | 127 void Clear(); |
| 125 | 128 |
| 126 // All of the term hashes that we are looking for in the page. | 129 // All of the term hashes that we are looking for in the page. |
| 127 const base::hash_set<std::string>* page_term_hashes_; | 130 const base::hash_set<std::string>* page_term_hashes_; |
| 128 | 131 |
| 129 // Murmur3 hashes of all the individual words in page_term_hashes_. If | 132 // Murmur3 hashes of all the individual words in page_term_hashes_. If |
| 130 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | 133 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
| 131 // would contain (hashed) "one" and "two". We do this so that we can have a | 134 // would contain (hashed) "one" and "two". We do this so that we can have a |
| 132 // quick out in the common case that the current word we are processing | 135 // quick out in the common case that the current word we are processing |
| 133 // doesn't contain any part of one of our terms. | 136 // doesn't contain any part of one of our terms. |
| 134 const base::hash_set<uint32>* page_word_hashes_; | 137 const base::hash_set<uint32_t>* page_word_hashes_; |
| 135 | 138 |
| 136 // The maximum number of words in an n-gram. | 139 // The maximum number of words in an n-gram. |
| 137 const size_t max_words_per_term_; | 140 const size_t max_words_per_term_; |
| 138 | 141 |
| 139 // The seed for murmurhash3. | 142 // The seed for murmurhash3. |
| 140 const uint32 murmurhash3_seed_; | 143 const uint32_t murmurhash3_seed_; |
| 141 | 144 |
| 142 // The maximum number of unique shingle hashes we extract in a page. | 145 // The maximum number of unique shingle hashes we extract in a page. |
| 143 const size_t max_shingles_per_page_; | 146 const size_t max_shingles_per_page_; |
| 144 | 147 |
| 145 // The number of words in a shingle. | 148 // The number of words in a shingle. |
| 146 const size_t shingle_size_; | 149 const size_t shingle_size_; |
| 147 | 150 |
| 148 // Non-owned pointer to our clock. | 151 // Non-owned pointer to our clock. |
| 149 FeatureExtractorClock* clock_; | 152 FeatureExtractorClock* clock_; |
| 150 | 153 |
| 151 // The input and output parameters from the most recent call to ExtractFeatures(). | 154 // The input and output parameters from the most recent call to ExtractFeatures(). |
| 152 const base::string16* page_text_; // The caller keeps ownership of this. | 155 const base::string16* page_text_; // The caller keeps ownership of this. |
| 153 FeatureMap* features_; // The caller keeps ownership of this. | 156 FeatureMap* features_; // The caller keeps ownership of this. |
| 154 std::set<uint32>* shingle_hashes_; | 157 std::set<uint32_t>* shingle_hashes_; |
| 155 DoneCallback done_callback_; | 158 DoneCallback done_callback_; |
| 156 | 159 |
| 157 // Stores the current state of term extraction from |page_text_|. | 160 // Stores the current state of term extraction from |page_text_|. |
| 158 scoped_ptr<ExtractionState> state_; | 161 scoped_ptr<ExtractionState> state_; |
| 159 | 162 |
| 160 // Used in scheduling ExtractFeaturesWithTimeout tasks. | 163 // Used in scheduling ExtractFeaturesWithTimeout tasks. |
| 161 // These pointers are invalidated if extraction is cancelled. | 164 // These pointers are invalidated if extraction is cancelled. |
| 162 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; | 165 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; |
| 163 | 166 |
| 164 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 167 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
| 165 }; | 168 }; |
| 166 | 169 |
| 167 } // namespace safe_browsing | 170 } // namespace safe_browsing |
| 168 | 171 |
| 169 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 172 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| OLD | NEW |