| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 // | |
| 5 // This class handles the process of extracting all of the features from a | |
| 6 // page and computing a phishiness score. The basic steps are: | |
| 7 // - Run each feature extractor over the page, building up a FeatureMap of | |
| 8 // feature -> value. | |
| 9 // - SHA-256 hash all of the feature names in the map so that they match the | |
| 10 // supplied model. | |
| 11 // - Hand the hashed map off to a Scorer, which computes the probability that | |
| 12 // the page is phishy. | |
| 13 // - If the page is phishy, run the supplied callback. | |
| 14 // | |
| 15 // For more details, see phishing_*_feature_extractor.h, scorer.h, and | |
| 16 // client_model.proto. | |
| 17 | |
| 18 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ | |
| 19 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ | |
| 20 | |
| 21 #include <stdint.h> | |
| 22 | |
| 23 #include <memory> | |
| 24 #include <set> | |
| 25 | |
| 26 #include "base/callback.h" | |
| 27 #include "base/macros.h" | |
| 28 #include "base/memory/weak_ptr.h" | |
| 29 #include "base/strings/string16.h" | |
| 30 | |
| 31 namespace content { | |
| 32 class RenderFrame; | |
| 33 } | |
| 34 | |
| 35 namespace safe_browsing { | |
| 36 class ClientPhishingRequest; | |
| 37 class FeatureExtractorClock; | |
| 38 class FeatureMap; | |
| 39 class PhishingDOMFeatureExtractor; | |
| 40 class PhishingTermFeatureExtractor; | |
| 41 class PhishingUrlFeatureExtractor; | |
| 42 class Scorer; | |
| 43 | |
| 44 class PhishingClassifier { | |
| 45 public: | |
| 46 // Callback to be run when phishing classification finishes. The verdict | |
| 47 // is a ClientPhishingRequest which contains the verdict computed by the | |
| 48 // classifier as well as the extracted features. If the verdict.is_phishing() | |
| 49 // is true, the page is considered phishy by the client-side model, | |
| 50 // and the browser should ping back to get a final verdict. The | |
| 51 // verdict.client_score() is set to kInvalidScore if classification failed. | |
| 52 using DoneCallback = | |
| 53 base::Callback<void(const ClientPhishingRequest& /* verdict */)>; | |
| 54 | |
| 55 static const float kInvalidScore; | |
| 56 | |
| 57 // Creates a new PhishingClassifier object that will operate on | |
| 58 // |render_frame|. |clock| is used to time feature extractor operations, and | |
| 59 // the PhishingClassifier takes ownership of this object. Note that the | |
| 60 // classifier will not be 'ready' until set_phishing_scorer() is called. | |
| 61 PhishingClassifier(content::RenderFrame* render_frame, | |
| 62 FeatureExtractorClock* clock); | |
| 63 virtual ~PhishingClassifier(); | |
| 64 | |
| 65 // Sets a scorer for the classifier to use in computing the phishiness score. | |
| 66 // This must live at least as long as the PhishingClassifier. The caller is | |
| 67 // expected to cancel any pending classification before setting a phishing | |
| 68 // scorer. | |
| 69 void set_phishing_scorer(const Scorer* scorer); | |
| 70 | |
| 71 // Returns true if the classifier is ready to classify pages, i.e. it | |
| 72 // has had a scorer set via set_phishing_scorer(). | |
| 73 bool is_ready() const; | |
| 74 | |
| 75 // Called by the RenderView when a page has finished loading. This begins | |
| 76 // the feature extraction and scoring process. |page_text| should contain | |
| 77 // the plain text of a web page, including any subframes, as returned by | |
| 78 // RenderView::CaptureText(). |page_text| is owned by the caller, and must | |
| 79 // not be destroyed until either |callback| is run or | |
| 80 // CancelPendingClassification() is called. | |
| 81 // | |
| 82 // To avoid blocking the render thread for too long, phishing classification | |
| 83 // may run in several chunks of work, posting a task to the current | |
| 84 // MessageLoop to continue processing. Once the scoring process is complete, | |
| 85 // |callback| is run on the current thread. PhishingClassifier takes | |
| 86 // ownership of the callback. | |
| 87 // | |
| 88 // It is an error to call BeginClassification if the classifier is not yet | |
| 89 // ready. | |
| 90 virtual void BeginClassification(const base::string16* page_text, | |
| 91 const DoneCallback& callback); | |
| 92 | |
| 93 // Called by the RenderView (on the render thread) when a page is unloading | |
| 94 // or the RenderView is being destroyed. This cancels any extraction that | |
| 95 // is in progress. It is an error to call CancelPendingClassification if | |
| 96 // the classifier is not yet ready. | |
| 97 virtual void CancelPendingClassification(); | |
| 98 | |
| 99 private: | |
| 100 // Any score equal to or above this value is considered phishy. | |
| 101 static const float kPhishyThreshold; | |
| 102 | |
| 103 // Begins the feature extraction process, by extracting URL features and | |
| 104 // beginning DOM feature extraction. | |
| 105 void BeginFeatureExtraction(); | |
| 106 | |
| 107 // Callback to be run when DOM feature extraction is complete. | |
| 108 // If it was successful, begins term feature extraction, otherwise | |
| 109 // runs the DoneCallback with a non-phishy verdict. | |
| 110 void DOMExtractionFinished(bool success); | |
| 111 | |
| 112 // Callback to be run when term feature extraction is complete. | |
| 113 // If it was successful, computes a score and runs the DoneCallback. | |
| 114 // If extraction was unsuccessful, runs the DoneCallback with a | |
| 115 // non-phishy verdict. | |
| 116 void TermExtractionFinished(bool success); | |
| 117 | |
| 118 // Helper to verify that there is no pending phishing classification. Dies | |
| 119 // in debug builds if the state is not as expected. This is a no-op in | |
| 120 // release builds. | |
| 121 void CheckNoPendingClassification(); | |
| 122 | |
| 123 // Helper method to run the DoneCallback and clear the state. | |
| 124 void RunCallback(const ClientPhishingRequest& verdict); | |
| 125 | |
| 126 // Helper to run the DoneCallback when feature extraction has failed. | |
| 127 // This always signals a non-phishy verdict for the page, with kInvalidScore. | |
| 128 void RunFailureCallback(); | |
| 129 | |
| 130 // Clears the current state of the PhishingClassifier. | |
| 131 void Clear(); | |
| 132 | |
| 133 content::RenderFrame* render_frame_; // owns us | |
| 134 const Scorer* scorer_; // owned by the caller | |
| 135 std::unique_ptr<FeatureExtractorClock> clock_; | |
| 136 std::unique_ptr<PhishingUrlFeatureExtractor> url_extractor_; | |
| 137 std::unique_ptr<PhishingDOMFeatureExtractor> dom_extractor_; | |
| 138 std::unique_ptr<PhishingTermFeatureExtractor> term_extractor_; | |
| 139 | |
| 140 // State for any in-progress extraction. | |
| 141 std::unique_ptr<FeatureMap> features_; | |
| 142 std::unique_ptr<std::set<uint32_t>> shingle_hashes_; | |
| 143 const base::string16* page_text_; // owned by the caller | |
| 144 DoneCallback done_callback_; | |
| 145 | |
| 146 // Used in scheduling BeginFeatureExtraction tasks. Weak pointers obtained | |
| 147 // from this factory are invalidated if classification is cancelled. | |
| 148 base::WeakPtrFactory<PhishingClassifier> weak_factory_; | |
| 149 | |
| 150 DISALLOW_COPY_AND_ASSIGN(PhishingClassifier); | |
| 151 }; | |
| 152 | |
| 153 } // namespace safe_browsing | |
| 154 | |
| 155 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_ | |
| OLD | NEW |