Index: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h |
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..bc9d5996aa2ea7d8632510af4536aa3f846c4cb8 |
--- /dev/null |
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h |
@@ -0,0 +1,128 @@ |
+// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+// |
+// PhishingDOMFeatureExtractor handles computing DOM-based features for the |
+// client-side phishing detection model. These include the presence of various |
+// types of elements, ratios of external and secure links, and tokens for |
+// external domains linked to. |
+ |
+#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ |
+#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ |
+ |
+#include <string> |
+ |
+#include "base/basictypes.h" |
+#include "base/callback.h" |
+#include "base/scoped_ptr.h" |
+#include "base/task.h" |
+ |
+class GURL; |
+class RenderView; |
+ |
+namespace WebKit { |
+class WebElement; |
+class WebFrame; |
+} |
+ |
+namespace safe_browsing { |
+class FeatureMap; |
+ |
+class PhishingDOMFeatureExtractor { |
+ public: |
+ // Callback to be run when feature extraction finishes. The callback |
+ // argument is true if extraction was successful, false otherwise. |
+ typedef Callback1<bool>::Type DoneCallback; |
+ |
+ // Creates a PhishingDOMFeatureExtractor for |render_view|, which is not |
+ // owned. The PhishingDOMFeatureExtractor should be destroyed prior to |
+ // destroying the RenderView. |
+ explicit PhishingDOMFeatureExtractor(RenderView* render_view); |
+ ~PhishingDOMFeatureExtractor(); |
+ |
+ // Begins extracting features into |features| (which stays owned by the |
+ // caller) for the page currently loaded in this object's RenderView. To |
+ // avoid blocking the render thread for too long, the extractor may run in |
+ // several chunks of work, posting a task to the current MessageLoop to |
+ // continue processing. Once feature extraction is complete, |done_callback| |
+ // is run. PhishingDOMFeatureExtractor takes ownership of the callback. |
+ void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback); |
+ |
+ // Cancels any pending feature extraction. The DoneCallback will not be run. |
+ // Must be called if there is a feature extraction in progress when the page |
+ // is unloaded or the PhishingDOMFeatureExtractor is destroyed. |
+ void CancelPendingExtraction(); |
+ |
+ private: |
+ struct FrameData; |
+ struct PageFeatureState; |
+ |
+ // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
+ // until a predefined maximum amount of time has elapsed, then posts a task |
+ // to the current MessageLoop to continue extraction. When extraction |
+ // finishes, calls RunCallback(). |
+ void ExtractFeaturesWithTimeout(); |
+ |
+ // Handlers for the various HTML elements that we compute features for. |
+ // Since some of the features (such as ratios) cannot be computed until |
+ // feature extraction is finished, these handlers do not add to the feature |
+ // map directly. Instead, they update the values in the PageFeatureState. |
+ void HandleLink(const WebKit::WebElement& element); |
+ void HandleForm(const WebKit::WebElement& element); |
+ void HandleImage(const WebKit::WebElement& element); |
+ void HandleInput(const WebKit::WebElement& element); |
+ void HandleScript(const WebKit::WebElement& element); |
+ |
+ // Helper to verify that there is no pending feature extraction. Dies in |
+ // debug builds if the state is not as expected. This is a no-op in release |
+ // builds. |
+ void CheckNoPendingExtraction(); |
+ |
+ // Runs |done_callback_| and then clears all internal state. |
+ void RunCallback(bool success); |
+ |
+ // Clears all internal feature extraction state. |
+ void Clear(); |
+ |
+ // Called after advancing |cur_frame_| to update the state in |
+ // |cur_frame_data_|. Returns true if the state was updated successfully. |
+ bool ResetFrameData(); |
+ |
+ // Given a URL, checks whether the domain is different from the domain of |
+ // the current frame's URL. If so, stores the domain in |domain| and returns |
+ // true, otherwise returns false. |
+ bool IsExternalDomain(const GURL& url, std::string* domain) const; |
+ |
+ // Called once all frames have been processed to compute features from the |
+ // PageFeatureState and add them to |features_|. See features.h for a |
+ // description of which features are computed. |
+ void InsertFeatures(); |
+ |
+ // Non-owned pointer to the view that we will extract features from. |
+ RenderView* render_view_; |
+ |
+ // The output parameters from the most recent call to ExtractFeatures(). |
+ FeatureMap* features_; // The caller keeps ownership of this. |
+ scoped_ptr<DoneCallback> done_callback_; |
+ |
+ // Non-owned pointer to the current frame that we are processing. |
+ WebKit::WebFrame* cur_frame_; |
+ |
+ // Stores extra state for |cur_frame_| that will be persisted until we |
+ // advance to the next frame. |
+ scoped_ptr<FrameData> cur_frame_data_; |
+ |
+ // Stores the intermediate data used to create features. This data is |
+ // accumulated across all frames in the RenderView. |
+ scoped_ptr<PageFeatureState> page_feature_state_; |
+ |
+ // Used to create ExtractFeaturesWithTimeout tasks. |
+ // These tasks are revoked if extraction is cancelled. |
+ ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor); |
+}; |
+ |
+} // namespace safe_browsing |
+ |
+#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ |