OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 // |
| 5 // PhishingTermFeatureExtractor handles computing term features from the text |
| 6 // of a web page for the client-side phishing detection model. To do this, it |
| 7 // takes a list of terms that appear in the model, and scans through the page |
| 8 // text looking for them. Any terms that appear will cause a corresponding |
| 9 // features::kPageTerm feature to be added to the FeatureMap. |
| 10 // |
| 11 // To make it harder for a phisher to enumerate all of the relevant terms in |
| 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. |
| 13 // |
| 14 // TODO(bryner): When we compute the score, all of the features in the |
| 15 // FeatureMap will be hashed so that they can be compared against the model. |
| 16 // When this is implemented, add a comment about it here. |
| 17 // |
| 18 // There is one PhishingTermFeatureExtractor per RenderView. |
| 19 |
| 20 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| 21 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
| 22 |
| 23 #include <string> |
| 24 |
| 25 #include "base/basictypes.h" |
| 26 #include "base/callback.h" |
| 27 #include "base/hash_tables.h" |
| 28 #include "base/scoped_ptr.h" |
| 29 #include "base/string16.h" |
| 30 #include "base/task.h" |
| 31 |
| 32 namespace safe_browsing { |
| 33 class FeatureExtractorClock; |
| 34 class FeatureMap; |
| 35 |
| 36 class PhishingTermFeatureExtractor { |
| 37 public: |
| 38 // Callback to be run when feature extraction finishes. The callback |
| 39 // argument is true if extraction was successful, false otherwise. |
| 40 typedef Callback1<bool>::Type DoneCallback; |
| 41 |
| 42 // Creates a PhishingTermFeatureExtractor which will extract features for |
| 43 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These |
| 44 // terms may be multi-word n-grams, with at most |max_words_per_term| words. |
| 45 // |
| 46 // |page_word_hashes| contains the hashes for all of the individual words |
| 47 // that make up the terms. Both sets of strings are UTF-8 encoded and |
| 48 // lowercased prior to hashing. The caller owns both sets of strings, and |
| 49 // must ensure that they are valid until the PhishingTermFeatureExtractor is |
| 50 // destroyed. |
| 51 // |
| 52 // |clock| is used for timing feature extractor operations, and may be mocked |
| 53 // for testing. PhishingTermFeatureExtractor takes ownership of the clock. |
| 54 PhishingTermFeatureExtractor( |
| 55 const base::hash_set<std::string>* page_term_hashes, |
| 56 const base::hash_set<std::string>* page_word_hashes, |
| 57 size_t max_words_per_term, |
| 58 FeatureExtractorClock* clock); |
| 59 ~PhishingTermFeatureExtractor(); |
| 60 |
| 61 // Begins extracting features from |page_text| into the given FeatureMap. |
| 62 // |page_text| should contain the plain text of a web page, including any |
| 63 // subframes, as returned by RenderView::CaptureText(). |
| 64 // |
| 65 // To avoid blocking the render thread for too long, the feature extractor |
| 66 // may run in several chunks of work, posting a task to the current |
| 67 // MessageLoop to continue processing. Once feature extraction is complete, |
| 68 // |done_callback| is run on the current thread. |
| 69 // PhishingTermFeatureExtractor takes ownership of the callback. |
| 70 // |
| 71 // |page_text| and |features| are owned by the caller, and must not be |
| 72 // destroyed until either |done_callback| is run or |
| 73 // CancelPendingExtraction() is called. |
| 74 void ExtractFeatures(const string16* page_text, |
| 75 FeatureMap* features, |
| 76 DoneCallback* done_callback); |
| 77 |
| 78 // Cancels any pending feature extraction. The DoneCallback will not be run. |
| 79 // Must be called if there is a feature extraction in progress when the page |
| 80 // is unloaded or the PhishingTermFeatureExtractor is destroyed. |
| 81 void CancelPendingExtraction(); |
| 82 |
| 83 private: |
| 84 struct ExtractionState; |
| 85 |
| 86 // The maximum amount of wall time that we will spend on a single extraction |
| 87 // iteration before pausing to let other MessageLoop tasks run. |
| 88 static const int kMaxTimePerChunkMs; |
| 89 |
| 90 // The number of words that we will process before checking to see whether |
| 91 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be |
| 92 // slow, we don't do this on every word processed. |
| 93 static const int kClockCheckGranularity; |
| 94 |
| 95 // The maximum total amount of time that the feature extractor will run |
| 96 // before giving up on the current page. |
| 97 static const int kMaxTotalTimeMs; |
| 98 |
| 99 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
| 100 // until a predefined maximum amount of time has elapsed, then posts a task |
| 101 // to the current MessageLoop to continue extraction. When extraction |
| 102 // finishes, calls RunCallback(). |
| 103 void ExtractFeaturesWithTimeout(); |
| 104 |
| 105 // Handles a single word in the page text. |
| 106 void HandleWord(const string16& word); |
| 107 |
| 108 // Helper to verify that there is no pending feature extraction. Dies in |
| 109 // debug builds if the state is not as expected. This is a no-op in release |
| 110 // builds. |
| 111 void CheckNoPendingExtraction(); |
| 112 |
| 113 // Runs |done_callback_| and then clears all internal state. |
| 114 void RunCallback(bool success); |
| 115 |
| 116 // Clears all internal feature extraction state. |
| 117 void Clear(); |
| 118 |
| 119 // All of the term hashes that we are looking for in the page. |
| 120 const base::hash_set<std::string>* page_term_hashes_; |
| 121 |
| 122 // Hashes of all the individual words in page_term_hashes_. If |
| 123 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
| 124 // would contain (hashed) "one" and "two". We do this so that we can have a |
| 125 // quick out in the common case that the current word we are processing |
| 126 // is not part of any of our terms. |
| 127 const base::hash_set<std::string>* page_word_hashes_; |
| 128 |
| 129 // The maximum number of words in an n-gram. |
| 130 size_t max_words_per_term_; |
| 131 |
| 132 // Owned pointer to our clock. |
| 133 scoped_ptr<FeatureExtractorClock> clock_; |
| 134 |
| 135 // The parameters from the most recent call to ExtractFeatures(). |
| 136 const string16* page_text_; // The caller keeps ownership of this. |
| 137 FeatureMap* features_; // The caller keeps ownership of this. |
| 138 scoped_ptr<DoneCallback> done_callback_; |
| 139 |
| 140 // Stores the current state of term extraction from |page_text_|. |
| 141 scoped_ptr<ExtractionState> state_; |
| 142 |
| 143 // Used to create ExtractFeaturesWithTimeout tasks. |
| 144 // These tasks are revoked if extraction is cancelled. |
| 145 ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; |
| 146 |
| 147 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
| 148 }; |
| 149 |
| 150 } // namespace safe_browsing |
| 151 |
| 152 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
OLD | NEW |