| OLD | NEW | 
|---|
| (Empty) |  | 
|  | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 
|  | 2 // Use of this source code is governed by a BSD-style license that can be | 
|  | 3 // found in the LICENSE file. | 
|  | 4 // | 
|  | 5 // PhishingTermFeatureExtractor handles computing term features from the text | 
|  | 6 // of a web page for the client-side phishing detection model.  To do this, it | 
|  | 7 // takes a list of terms that appear in the model, and scans through the page | 
|  | 8 // text looking for them.  Any terms that appear will cause a corresponding | 
|  | 9 // features::kPageTerm feature to be added to the FeatureMap. | 
|  | 10 // | 
|  | 11 // To make it harder for a phisher to enumerate all of the relevant terms in | 
|  | 12 // the model, the terms are provided as SHA-256 hashes, rather than plain text. | 
|  | 13 // | 
|  | 14 // TODO(bryner): When we compute the score, all of the features in the | 
|  | 15 // FeatureMap will be hashed so that they can be compared against the model. | 
|  | 16 // When this is implemented, add a comment about it here. | 
|  | 17 // | 
|  | 18 // There is one PhishingTermFeatureExtractor per RenderView. | 
|  | 19 | 
|  | 20 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 
|  | 21 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 
|  | 22 | 
|  | 23 #include <string> | 
|  | 24 | 
|  | 25 #include "base/basictypes.h" | 
|  | 26 #include "base/callback.h" | 
|  | 27 #include "base/hash_tables.h" | 
|  | 28 #include "base/scoped_ptr.h" | 
|  | 29 #include "base/string16.h" | 
|  | 30 #include "base/task.h" | 
|  | 31 | 
|  | 32 namespace safe_browsing { | 
|  | 33 class FeatureExtractorClock; | 
|  | 34 class FeatureMap; | 
|  | 35 | 
|  | 36 class PhishingTermFeatureExtractor { | 
|  | 37  public: | 
|  | 38   // Callback to be run when feature extraction finishes.  The callback | 
|  | 39   // argument is true if extraction was successful, false otherwise. | 
|  | 40   typedef Callback1<bool>::Type DoneCallback; | 
|  | 41 | 
|  | 42   // Creates a PhishingTermFeatureExtractor which will extract features for | 
|  | 43   // all of the terms whose SHA-256 hashes are in |page_term_hashes|.  These | 
|  | 44   // terms may be multi-word n-grams, with at most |max_words_per_term| words. | 
|  | 45   // | 
|  | 46   // |page_word_hashes| contains the hashes for all of the individual words | 
|  | 47   // that make up the terms.  Both sets of strings are UTF-8 encoded and | 
|  | 48   // lowercased prior to hashing.  The caller owns both sets of strings, and | 
|  | 49   // must ensure that they are valid until the PhishingTermFeatureExtractor is | 
|  | 50   // destroyed. | 
|  | 51   // | 
|  | 52   // |clock| is used for timing feature extractor operations, and may be mocked | 
|  | 53   // for testing.  PhishingTermFeatureExtractor takes ownership of the clock. | 
|  | 54   PhishingTermFeatureExtractor( | 
|  | 55       const base::hash_set<std::string>* page_term_hashes, | 
|  | 56       const base::hash_set<std::string>* page_word_hashes, | 
|  | 57       size_t max_words_per_term, | 
|  | 58       FeatureExtractorClock* clock); | 
|  | 59   ~PhishingTermFeatureExtractor(); | 
|  | 60 | 
|  | 61   // Begins extracting features from |page_text| into the given FeatureMap. | 
|  | 62   // |page_text| should contain the plain text of a web page, including any | 
|  | 63   // subframes, as returned by RenderView::CaptureText(). | 
|  | 64   // | 
|  | 65   // To avoid blocking the render thread for too long, the feature extractor | 
|  | 66   // may run in several chunks of work, posting a task to the current | 
|  | 67   // MessageLoop to continue processing.  Once feature extraction is complete, | 
|  | 68   // |done_callback| is run on the current thread. | 
|  | 69   // PhishingTermFeatureExtractor takes ownership of the callback. | 
|  | 70   // | 
|  | 71   // |page_text| and |features| are owned by the caller, and must not be | 
|  | 72   // destroyed until either |done_callback| is run or | 
|  | 73   // CancelPendingExtraction() is called. | 
|  | 74   void ExtractFeatures(const string16* page_text, | 
|  | 75                        FeatureMap* features, | 
|  | 76                        DoneCallback* done_callback); | 
|  | 77 | 
|  | 78   // Cancels any pending feature extraction.  The DoneCallback will not be run. | 
|  | 79   // Must be called if there is a feature extraction in progress when the page | 
|  | 80   // is unloaded or the PhishingTermFeatureExtractor is destroyed. | 
|  | 81   void CancelPendingExtraction(); | 
|  | 82 | 
|  | 83  private: | 
|  | 84   struct ExtractionState; | 
|  | 85 | 
|  | 86   // The maximum amount of wall time that we will spend on a single extraction | 
|  | 87   // iteration before pausing to let other MessageLoop tasks run. | 
|  | 88   static const int kMaxTimePerChunkMs; | 
|  | 89 | 
|  | 90   // The number of words that we will process before checking to see whether | 
|  | 91   // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be | 
|  | 92   // slow, we don't do this on every word processed. | 
|  | 93   static const int kClockCheckGranularity; | 
|  | 94 | 
|  | 95   // The maximum total amount of time that the feature extractor will run | 
|  | 96   // before giving up on the current page. | 
|  | 97   static const int kMaxTotalTimeMs; | 
|  | 98 | 
|  | 99   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs | 
|  | 100   // until a predefined maximum amount of time has elapsed, then posts a task | 
|  | 101   // to the current MessageLoop to continue extraction.  When extraction | 
|  | 102   // finishes, calls RunCallback(). | 
|  | 103   void ExtractFeaturesWithTimeout(); | 
|  | 104 | 
|  | 105   // Handles a single word in the page text. | 
|  | 106   void HandleWord(const string16& word); | 
|  | 107 | 
|  | 108   // Helper to verify that there is no pending feature extraction.  Dies in | 
|  | 109   // debug builds if the state is not as expected.  This is a no-op in release | 
|  | 110   // builds. | 
|  | 111   void CheckNoPendingExtraction(); | 
|  | 112 | 
|  | 113   // Runs |done_callback_| and then clears all internal state. | 
|  | 114   void RunCallback(bool success); | 
|  | 115 | 
|  | 116   // Clears all internal feature extraction state. | 
|  | 117   void Clear(); | 
|  | 118 | 
|  | 119   // All of the term hashes that we are looking for in the page.  Not owned. | 
|  | 120   const base::hash_set<std::string>* page_term_hashes_; | 
|  | 121 | 
|  | 122   // Hashes of all the individual words in page_term_hashes_.  If | 
|  | 123   // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ | 
|  | 124   // would contain (hashed) "one" and "two".  We do this so that we can have a | 
|  | 125   // quick out in the common case that the current word we are processing | 
|  | 126   // doesn't contain any part of one of our terms.  Not owned. | 
|  | 127   const base::hash_set<std::string>* page_word_hashes_; | 
|  | 128 | 
|  | 129   // The maximum number of words in a single term (n-gram). | 
|  | 130   size_t max_words_per_term_; | 
|  | 131 | 
|  | 132   // Owned pointer to our clock, used for timing extraction operations. | 
|  | 133   scoped_ptr<FeatureExtractorClock> clock_; | 
|  | 134 | 
|  | 135   // The parameters from the most recent call to ExtractFeatures(). | 
|  | 136   const string16* page_text_;  // The caller keeps ownership of this. | 
|  | 137   FeatureMap* features_;  // The caller keeps ownership of this. | 
|  | 138   scoped_ptr<DoneCallback> done_callback_;  // Owned by this object. | 
|  | 139 | 
|  | 140   // Stores the current state of term extraction from |page_text_|. | 
|  | 141   scoped_ptr<ExtractionState> state_; | 
|  | 142 | 
|  | 143   // Used to create ExtractFeaturesWithTimeout tasks. | 
|  | 144   // These tasks are revoked if extraction is cancelled. | 
|  | 145   ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; | 
|  | 146 | 
|  | 147   DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); | 
|  | 148 }; | 
|  | 149 | 
|  | 150 }  // namespace safe_browsing | 
|  | 151 | 
|  | 152 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ | 
| OLD | NEW | 
|---|