OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 // |
| 5 // PhishingDOMFeatureExtractor handles computing DOM-based features for the |
| 6 // client-side phishing detection model. These include the presence of various |
| 7 // types of elements, ratios of external and secure links, and tokens for |
| 8 // external domains linked to. |
| 9 |
| 10 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ |
| 11 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ |
| 12 |
| 13 #include <string> |
| 14 |
| 15 #include "base/basictypes.h" |
| 16 #include "base/callback.h" |
| 17 #include "base/scoped_ptr.h" |
| 18 #include "base/task.h" |
| 19 |
| 20 class GURL; |
| 21 class RenderView; |
| 22 |
| 23 namespace WebKit { |
| 24 class WebElement; |
| 25 class WebFrame; |
| 26 } // namespace WebKit |
| 27 |
| 28 namespace safe_browsing { |
| 29 class FeatureMap; |
| 30 |
| 31 class PhishingDOMFeatureExtractor { |
| 32 public: |
| 33 // Callback to be run when feature extraction finishes. The callback |
| 34 // argument is true if extraction was successful, false otherwise. |
| 35 typedef Callback1<bool>::Type DoneCallback; |
| 36 |
| 37 // Creates a PhishingDOMFeatureExtractor for the specified RenderView. |
| 38 // The PhishingDOMFeatureExtractor should be destroyed prior to destroying |
| 39 // the RenderView. |
| 40 explicit PhishingDOMFeatureExtractor(RenderView* render_view); |
| 41 ~PhishingDOMFeatureExtractor(); |
| 42 |
| 43 // Begins extracting features into the given FeatureMap for the page |
| 44 // currently loaded in this object's RenderView. To avoid blocking the |
| 45 // render thread for too long, extraction may run in several chunks of |
| 46 // work, posting a task to the current MessageLoop to continue. Once |
| 47 // extraction is complete, |done_callback| is run. The caller keeps |
| 48 // ownership of |features|; this object takes ownership of |done_callback|. |
| 49 void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback); |
| 50 |
| 51 // Cancels any pending feature extraction. The DoneCallback will not be run. |
| 52 // Must be called if there is a feature extraction in progress when the page |
| 53 // is unloaded or the PhishingDOMFeatureExtractor is destroyed. |
| 54 void CancelPendingExtraction(); |
| 55 |
| 56 private: |
| 57 struct FrameData; |
| 58 struct PageFeatureState; |
| 59 |
| 60 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs |
| 61 // until a predefined maximum amount of time has elapsed, then posts a task |
| 62 // to the current MessageLoop to continue extraction. When extraction |
| 63 // finishes, calls RunCallback(). |
| 64 void ExtractFeaturesWithTimeout(); |
| 65 |
| 66 // Handlers for the various HTML elements that we compute features for. |
| 67 // Since some of the features (such as ratios) cannot be computed until |
| 68 // feature extraction is finished, these handlers do not add to the feature |
| 69 // map directly. Instead, they update the values in the PageFeatureState. |
| 70 void HandleLink(const WebKit::WebElement& element); |
| 71 void HandleForm(const WebKit::WebElement& element); |
| 72 void HandleImage(const WebKit::WebElement& element); |
| 73 void HandleInput(const WebKit::WebElement& element); |
| 74 void HandleScript(const WebKit::WebElement& element); |
| 75 |
| 76 // Helper to verify that there is no pending feature extraction. Dies in |
| 77 // debug builds if the state is not as expected. This is a no-op in release |
| 78 // builds. |
| 79 void CheckNoPendingExtraction(); |
| 80 |
| 81 // Runs |done_callback_| and then clears all internal state. |
| 82 void RunCallback(bool success); |
| 83 |
| 84 // Clears all internal feature extraction state. |
| 85 void Clear(); |
| 86 |
| 87 // Called after advancing |cur_frame_| to update the state in |
| 88 // |cur_frame_data_|. Returns true if the state was updated successfully. |
| 89 bool ResetFrameData(); |
| 90 |
| 91 // Given a URL, checks whether the domain is different from the domain of |
| 92 // the current frame's URL. If so, stores the domain in |domain| and returns |
| 93 // true, otherwise returns false. |
| 94 bool IsExternalDomain(const GURL& url, std::string* domain) const; |
| 95 |
| 96 // Called once all frames have been processed to compute features from the |
| 97 // PageFeatureState and add them to |features_|. See features.h for a |
| 98 // description of which features are computed. |
| 99 void InsertFeatures(); |
| 100 |
| 101 // Non-owned pointer to the view that we will extract features from. |
| 102 RenderView* render_view_; |
| 103 |
| 104 // The output parameters from the most recent call to ExtractFeatures(). |
| 105 FeatureMap* features_; // The caller keeps ownership of this. |
| 106 scoped_ptr<DoneCallback> done_callback_; // Owned by this object. |
| 107 |
| 108 // Non-owned pointer to the current frame that we are processing. |
| 109 WebKit::WebFrame* cur_frame_; |
| 110 |
| 111 // Stores extra state for |cur_frame_| that will be persisted until we |
| 112 // advance to the next frame. |
| 113 scoped_ptr<FrameData> cur_frame_data_; |
| 114 |
| 115 // Stores the intermediate data used to create features. This data is |
| 116 // accumulated across all frames in the RenderView. |
| 117 scoped_ptr<PageFeatureState> page_feature_state_; |
| 118 |
| 119 // Used to create ExtractFeaturesWithTimeout tasks. |
| 120 // These tasks are revoked if extraction is cancelled. |
| 121 ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_; |
| 122 |
| 123 DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor); |
| 124 }; |
| 125 |
| 126 } // namespace safe_browsing |
| 127 |
| 128 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ |
OLD | NEW |