| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 // | |
| 5 // PhishingTermFeatureExtractor handles computing term features from the text | |
| 6 // of a web page for the client-side phishing detection model. To do this, it | |
| 7 // takes a list of terms that appear in the model, and scans through the page | |
| 8 // text looking for them. Any terms that appear will cause a corresponding | |
| 9 // features::kPageTerm feature to be added to the FeatureMap. | |
| 10 // | |
| 11 // To make it harder for a phisher to enumerate all of the relevant terms in | |
| 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | |
| 13 // | |
| 14 // There is one PhishingTermFeatureExtractor per RenderView. | |
| 15 | |
| 16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | |
| 17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | |
| 18 | |
| 19 #include <stddef.h> | |
| 20 #include <stdint.h> | |
| 21 | |
| 22 #include <memory> | |
| 23 #include <set> | |
| 24 #include <string> | |
| 25 | |
| 26 #include "base/callback.h" | |
| 27 #include "base/containers/hash_tables.h" | |
| 28 #include "base/macros.h" | |
| 29 #include "base/memory/weak_ptr.h" | |
| 30 #include "base/strings/string16.h" | |
| 31 #include "base/strings/string_piece.h" | |
| 32 | |
| 33 namespace safe_browsing { | |
| 34 class FeatureExtractorClock; | |
| 35 class FeatureMap; | |
| 36 | |
| 37 class PhishingTermFeatureExtractor { | |
| 38 public: | |
| 39 // Callback to be run when feature extraction finishes. The callback | |
| 40 // argument is true if extraction was successful, false otherwise. | |
| 41 typedef base::Callback<void(bool)> DoneCallback; | |
| 42 | |
| 43 // Creates a PhishingTermFeatureExtractor which will extract features for | |
| 44 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These | |
| 45 // terms may be multi-word n-grams, with at most |max_words_per_term| words. | |
| 46 // | |
| 47 // |page_word_hashes| contains the murmur3 hashes for all of the individual | |
| 48 // words that make up the terms. Both sets of strings are UTF-8 encoded and | |
| 49 // lowercased prior to hashing. The caller owns both hash sets, and | |
| 50 // must ensure that they are valid until the PhishingTermFeatureExtractor is | |
| 51 // destroyed. |murmurhash3_seed| is the seed for murmurhash3. | |
| 52 // | |
| 53 // In addition to extracting page terms, we will also extract a text shingling | |
| 54 // sketch, which consists of hashes of N-gram-words (referred to as shingles) | |
| 55 // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines | |
| 56 // the maximum number of unique shingle hashes we extract per page. | |
| 57 // | |
| 58 // |clock| is used for timing feature extractor operations, and may be mocked | |
| 59 // for testing. The caller keeps ownership of the clock. | |
| 60 PhishingTermFeatureExtractor( | |
| 61 const base::hash_set<std::string>* page_term_hashes, | |
| 62 const base::hash_set<uint32_t>* page_word_hashes, | |
| 63 size_t max_words_per_term, | |
| 64 uint32_t murmurhash3_seed, | |
| 65 size_t max_shingles_per_page, | |
| 66 size_t shingle_size, | |
| 67 FeatureExtractorClock* clock); | |
| 68 ~PhishingTermFeatureExtractor(); | |
| 69 | |
| 70 // Begins extracting features from |page_text| into the given FeatureMap. | |
| 71 // |page_text| should contain the plain text of a web page, including any | |
| 72 // subframes, as returned by RenderView::CaptureText(). | |
| 73 // | |
| 74 // To avoid blocking the render thread for too long, the feature extractor | |
| 75 // may run in several chunks of work, posting a task to the current | |
| 76 // MessageLoop to continue processing. Once feature extraction is complete, | |
| 77 // |done_callback| is run on the current thread. | |
| 78 // PhishingTermFeatureExtractor takes ownership of the callback. | |
| 79 // | |
| 80 // |page_text|, |features|, and |shingle_hashes| are owned by the caller, | |
| 81 // and must not be destroyed until either |done_callback| is run or | |
| 82 // CancelPendingExtraction() is called. | |
| 83 void ExtractFeatures(const base::string16* page_text, | |
| 84 FeatureMap* features, | |
| 85 std::set<uint32_t>* shingle_hashes, | |
| 86 const DoneCallback& done_callback); | |
| 87 | |
| 88 // Cancels any pending feature extraction. The DoneCallback will not be run. | |
| 89 // Must be called if there is a feature extraction in progress when the page | |
| 90 // is unloaded or the PhishingTermFeatureExtractor is destroyed. | |
| 91 void CancelPendingExtraction(); | |
| 92 | |
| 93 private: | |
| 94 struct ExtractionState; | |
| 95 | |
| 96 // The maximum amount of wall time that we will spend on a single extraction | |
| 97 // iteration before pausing to let other MessageLoop tasks run. | |
| 98 static const int kMaxTimePerChunkMs; | |
| 99 | |
| 100 // The number of words that we will process before checking to see whether | |
| 101 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be | |
| 102 // slow, we don't do this on every word processed. | |
| 103 static const int kClockCheckGranularity; | |
| 104 | |
| 105 // The maximum total amount of time that the feature extractor will run | |
| 106 // before giving up on the current page. | |
| 107 static const int kMaxTotalTimeMs; | |
| 108 | |
| 109 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs | |
| 110 // until a predefined maximum amount of time has elapsed, then posts a task | |
| 111 // to the current MessageLoop to continue extraction. When extraction | |
| 112 // finishes, calls RunCallback(). | |
| 113 void ExtractFeaturesWithTimeout(); | |
| 114 | |
| 115 // Handles a single word in the page text. | |
| 116 void HandleWord(const base::StringPiece16& word); | |
| 117 | |
| 118 // Helper to verify that there is no pending feature extraction. Dies in | |
| 119 // debug builds if the state is not as expected. This is a no-op in release | |
| 120 // builds. | |
| 121 void CheckNoPendingExtraction(); | |
| 122 | |
| 123 // Runs |done_callback_| and then clears all internal state. | |
| 124 void RunCallback(bool success); | |
| 125 | |
| 126 // Clears all internal feature extraction state. | |
| 127 void Clear(); | |
| 128 | |
| 129 // All of the term hashes that we are looking for in the page. | |
| 130 const base::hash_set<std::string>* page_term_hashes_; | |
| 131 | |
| 132 // Murmur3 hashes of all the individual words in page_term_hashes_. If | |
| 133 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | |
| 134 // would contain (hashed) "one" and "two". We do this so that we can have a | |
| 135 // quick out in the common case that the current word we are processing | |
| 136 // doesn't contain any part of one of our terms. | |
| 137 const base::hash_set<uint32_t>* page_word_hashes_; | |
| 138 | |
| 139 // The maximum number of words in an n-gram. | |
| 140 const size_t max_words_per_term_; | |
| 141 | |
| 142 // The seed for murmurhash3. | |
| 143 const uint32_t murmurhash3_seed_; | |
| 144 | |
| 145 // The maximum number of unique shingle hashes we extract in a page. | |
| 146 const size_t max_shingles_per_page_; | |
| 147 | |
| 148 // The number of words in a shingle. | |
| 149 const size_t shingle_size_; | |
| 150 | |
| 151 // Non-owned pointer to our clock. | |
| 152 FeatureExtractorClock* clock_; | |
| 153 | |
| 154 // Arguments of the latest ExtractFeatures() call; pointees are caller-owned. | |
| 155 const base::string16* page_text_; // The caller keeps ownership of this. | |
| 156 FeatureMap* features_; // The caller keeps ownership of this. | |
| 157 std::set<uint32_t>* shingle_hashes_; | |
| 158 DoneCallback done_callback_; | |
| 159 | |
| 160 // Stores the current state of term extraction from |page_text_|. | |
| 161 std::unique_ptr<ExtractionState> state_; | |
| 162 | |
| 163 // Used in scheduling ExtractFeaturesWithTimeout tasks. | |
| 164 // These pointers are invalidated if extraction is cancelled. | |
| 165 base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_; | |
| 166 | |
| 167 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | |
| 168 }; | |
| 169 | |
| 170 } // namespace safe_browsing | |
| 171 | |
| 172 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | |
| OLD | NEW |