Index: chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..d34ad6664ea341bdd7f28b472ef9cc56a02a4c1a |
--- /dev/null |
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h |
@@ -0,0 +1,152 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+// |
+// PhishingTermFeatureExtractor handles computing term features from the text |
+// of a web page for the client-side phishing detection model. To do this, it |
+// takes a list of terms that appear in the model, and scans through the page |
+// text looking for them. Any terms that appear will cause a corresponding |
+// features::kPageTerm feature to be added to the FeatureMap. |
+// |
+// To make it harder for a phisher to enumerate all of the relevant terms in |
+// the model, the terms are provided as SHA-256 hashes, rather than plain text. |
+// |
+// TODO(bryner): When we compute the score, all of the features in the |
+// FeatureMap will be hashed so that they can be compared against the model. |
+// When this is implemented, add a comment about it here. |
+// |
+// There is one PhishingTermFeatureExtractor per RenderView. |
+ |
+#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
+#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |
+ |
+#include <string> |
+ |
+#include "base/basictypes.h" |
+#include "base/callback.h" |
+#include "base/hash_tables.h" |
+#include "base/scoped_ptr.h" |
+#include "base/string16.h" |
+#include "base/task.h" |
+ |
+namespace safe_browsing { |
+class FeatureExtractorClock; |
+class FeatureMap; |
+ |
+class PhishingTermFeatureExtractor { |
+ public: |
+ // Callback to be run when feature extraction finishes. The callback |
+ // argument is true if extraction was successful, false otherwise. |
+ typedef Callback1<bool>::Type DoneCallback; |
+ |
+ // Creates a PhishingTermFeatureExtractor which will extract features for |
+ // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These |
+ // terms may be multi-word n-grams, with at most |max_words_per_term| words. |
+ // |
+ // |page_word_hashes| contains the hashes for all of the individual words |
+ // that make up the terms. Both sets of strings are UTF-8 encoded and |
+ // lowercased prior to hashing. The caller owns both sets of strings, and |
+ // must ensure that they are valid until the PhishingTermFeatureExtractor is |
+ // destroyed. |
+ // |
+ // |clock| is used for timing feature extractor operations, and may be mocked |
+ // for testing. PhishingTermFeatureExtractor takes ownership of the clock. |
+ PhishingTermFeatureExtractor( |
+ const base::hash_set<std::string>* page_term_hashes, |
+ const base::hash_set<std::string>* page_word_hashes, |
+ size_t max_words_per_term, |
+ FeatureExtractorClock* clock); |
+ ~PhishingTermFeatureExtractor(); |
+ |
+ // Begins extracting features from |page_text| into the given FeatureMap. |
+ // |page_text| should contain the plain text of a web page, including any |
+ // subframes, as returned by RenderView::CaptureText(). |
+ // |
+ // To avoid blocking the render thread for too long, the feature extractor |
+ // may run in several chunks of work, posting a task to the current |
+ // MessageLoop to continue processing. Once feature extraction is complete, |
+ // |done_callback| is run on the current thread. |
+ // PhishingTermFeatureExtractor takes ownership of the callback. |
+ // |
+ // |page_text| and |features| are owned by the caller, and must not be |
+ // destroyed until either |done_callback| is run or |
+ // CancelPendingExtraction() is called. |
+ void ExtractFeatures(const string16* page_text, |
+ FeatureMap* features, |
+ DoneCallback* done_callback); |
+ |
+ // Cancels any pending feature extraction. The DoneCallback will not be run. |
+ // Must be called if there is a feature extraction in progress when the page |
+ // is unloaded or the PhishingTermFeatureExtractor is destroyed. |
+ void CancelPendingExtraction(); |
+ |
+ private: |
+ struct ExtractionState; |
+ |
+ // The maximum amount of wall time that we will spend on a single extraction |
+ // iteration before pausing to let other MessageLoop tasks run. |
+ static const int kMaxTimePerChunkMs; |
+ |
+ // The number of words that we will process before checking to see whether |
+ // kMaxTimePerChunkMs has elapsed. Since checking the current time can be |
+ // slow, we don't do this on every word processed. |
+ static const int kClockCheckGranularity; |
+ |
+ // The maximum total amount of time that the feature extractor will run |
+ // before giving up on the current page. |
+ static const int kMaxTotalTimeMs; |
+ |
+ // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
+ // until a predefined maximum amount of time has elapsed, then posts a task |
+ // to the current MessageLoop to continue extraction. When extraction |
+ // finishes, calls RunCallback(). |
+ void ExtractFeaturesWithTimeout(); |
+ |
+ // Handles a single word in the page text. |
+ void HandleWord(const string16& word); |
+ |
+ // Helper to verify that there is no pending feature extraction. Dies in |
+ // debug builds if the state is not as expected. This is a no-op in release |
+ // builds. |
+ void CheckNoPendingExtraction(); |
+ |
+ // Runs |done_callback_| and then clears all internal state. |
+ void RunCallback(bool success); |
+ |
+ // Clears all internal feature extraction state. |
+ void Clear(); |
+ |
+ // All of the term hashes that we are looking for in the page. |
+ const base::hash_set<std::string>* page_term_hashes_; |
+ |
+ // Hashes of all the individual words in page_term_hashes_. If |
+ // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ |
+ // would contain (hashed) "one" and "two". We do this so that we can have a |
+ // quick out in the common case that the current word we are processing |
+ // doesn't contain any part of one of our terms. |
+ const base::hash_set<std::string>* page_word_hashes_; |
+ |
+ // The maximum number of words in an n-gram. |
+ size_t max_words_per_term_; |
+ |
+ // Owned pointer to our clock. |
+ scoped_ptr<FeatureExtractorClock> clock_; |
+ |
+ // The output parameters from the most recent call to ExtractFeatures(). |
+ const string16* page_text_; // The caller keeps ownership of this. |
+ FeatureMap* features_; // The caller keeps ownership of this. |
+ scoped_ptr<DoneCallback> done_callback_; |
+ |
+ // Stores the current state of term extraction from |page_text_|. |
+ scoped_ptr<ExtractionState> state_; |
+ |
+ // Used to create ExtractFeaturesWithTimeout tasks. |
+ // These tasks are revoked if extraction is cancelled. |
+ ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); |
+}; |
+ |
+} // namespace safe_browsing |
+ |
+#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |