| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 // | |
| 5 // Common types and constants for extracting and evaluating features in the | |
| 6 // client-side phishing detection model. A feature is simply a string and an | |
| 7 // associated floating-point value between 0 and 1. The phishing | |
| 8 // classification model contains rules which give an appropriate weight to each | |
| 9 // feature or combination of features. These values can then be summed to | |
| 10 // compute a final phishiness score. | |
| 11 // | |
| 12 // Some features are boolean features. If these features are set, they always | |
| 13 // have a value of 0.0 or 1.0. In practice, the features are only set if the | |
| 14 // value is true (1.0). | |
| 15 // | |
| 16 // We also use token features. These features have a unique name that is | |
| 17 // constructed from the URL or page contents that we are classifying, for | |
| 18 // example, "UrlDomain=chromium". These features are also always set to 1.0 | |
| 19 // if they are present. | |
| 20 // | |
| 21 // The intermediate storage of the features for a URL is a FeatureMap, which is | |
| 22 // just a thin wrapper around a map of feature name to value. The entire set | |
| 23 // of features for a URL is extracted before we do any scoring. | |
| 24 | |
| 25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | |
| 26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | |
| 27 | |
| 28 #include <stddef.h> | |
| 29 #include <string> | |
| 30 | |
| 31 #include "base/containers/hash_tables.h" | |
| 32 #include "base/macros.h" | |
| 33 | |
| 34 namespace safe_browsing { | |
| 35 | |
| 36 // Container for a map of feature names to values, which enforces behavior | |
| 37 // such as a maximum number of features in the map. | |
| 38 class FeatureMap { | |
| 39 public: | |
| 40 FeatureMap(); | |
| 41 ~FeatureMap(); | |
| 42 | |
| 43 // Adds the boolean feature |name| to the FeatureMap with a value of 1.0. | |
| 44 // Returns true on success, or false if the feature map exceeds | |
| 45 // kMaxFeatureMapSize. | |
| 46 bool AddBooleanFeature(const std::string& name); | |
| 47 | |
| 48 // Adds the real-valued feature |name| to the FeatureMap with |value|. | |
| 49 // |value| must always be in the range [0.0, 1.0]. Returns true on | |
| 50 // success, or false if the feature map exceeds kMaxFeatureMapSize | |
| 51 // or |value| is outside of the allowed range. | |
| 52 bool AddRealFeature(const std::string& name, double value); | |
| 53 | |
| 54 // Provides read-only access to the current map of feature name to value. | |
| 55 const base::hash_map<std::string, double>& features() const { | |
| 56 return features_; | |
| 57 } | |
| 58 | |
| 59 // Clears the set of features in the map. | |
| 60 void Clear(); | |
| 61 | |
| 62 // This is an upper bound on the number of features that will be extracted. | |
| 63 // We should never hit this cap; it is intended as a sanity check to prevent | |
| 64 // the FeatureMap from growing too large. | |
| 65 static const size_t kMaxFeatureMapSize; | |
| 66 | |
| 67 private: | |
| 68 base::hash_map<std::string, double> features_; | |
| 69 | |
| 70 DISALLOW_COPY_AND_ASSIGN(FeatureMap); | |
| 71 }; | |
| 72 | |
| 73 namespace features { | |
| 74 // Constants for the various feature names that we use. | |
| 75 // | |
| 76 // IMPORTANT: when adding new features, you must update kAllowedFeatures in | |
| 77 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature | |
| 78 // should be sent in sanitized pingbacks. | |
| 79 | |
| 80 //////////////////////////////////////////////////// | |
| 81 // URL host features | |
| 82 //////////////////////////////////////////////////// | |
| 83 | |
| 84 // Set if the URL's hostname is an IP address. | |
| 85 extern const char kUrlHostIsIpAddress[]; | |
| 86 // Token feature containing the portion of the hostname controlled by a | |
| 87 // registrar, for example "com" or "co.uk". | |
| 88 extern const char kUrlTldToken[]; | |
| 89 // Token feature containing the first host component below the registrar. | |
| 90 // For example, in "www.google.com", the domain would be "google". | |
| 91 extern const char kUrlDomainToken[]; | |
| 92 // Token feature containing each host component below the domain. | |
| 93 // For example, in "www.host.example.com", both "www" and "host" would be | |
| 94 // "other host tokens". | |
| 95 extern const char kUrlOtherHostToken[]; | |
| 96 | |
| 97 //////////////////////////////////////////////////// | |
| 98 // Aggregate features for URL host tokens | |
| 99 //////////////////////////////////////////////////// | |
| 100 | |
| 101 // Set if the number of "other" host tokens for a URL is greater than one. | |
| 102 // Longer hostnames, regardless of the specific tokens, can be a signal that | |
| 103 // the URL is phishy. | |
| 104 extern const char kUrlNumOtherHostTokensGTOne[]; | |
| 105 // Set if the number of "other" host tokens for a URL is greater than three. | |
| 106 extern const char kUrlNumOtherHostTokensGTThree[]; | |
| 107 | |
| 108 //////////////////////////////////////////////////// | |
| 109 // URL path token features | |
| 110 //////////////////////////////////////////////////// | |
| 111 | |
| 112 // Token feature containing each alphanumeric string in the path that is at | |
| 113 // least 3 characters long. For example, "/abc/d/efg" would have 2 path | |
| 114 // token features, "abc" and "efg". Query parameters are not included. | |
| 115 extern const char kUrlPathToken[]; | |
| 116 | |
| 117 //////////////////////////////////////////////////// | |
| 118 // DOM HTML form features | |
| 119 //////////////////////////////////////////////////// | |
| 120 | |
| 121 // Set if the page has any <form> elements. | |
| 122 extern const char kPageHasForms[]; | |
| 123 // The fraction of form elements whose |action| attribute points to a | |
| 124 // URL on a different domain from the document URL. | |
| 125 extern const char kPageActionOtherDomainFreq[]; | |
| 126 // Token feature containing each URL that an |action| attribute | |
| 127 // points to. | |
| 128 extern const char kPageActionURL[]; | |
| 129 // Set if the page has any <input type="text"> elements | |
| 130 // (includes inputs with missing or unknown types). | |
| 131 extern const char kPageHasTextInputs[]; | |
| 132 // Set if the page has any <input type="password"> elements. | |
| 133 extern const char kPageHasPswdInputs[]; | |
| 134 // Set if the page has any <input type="radio"> elements. | |
| 135 extern const char kPageHasRadioInputs[]; | |
| 136 // Set if the page has any <input type="checkbox"> elements. | |
| 137 extern const char kPageHasCheckInputs[]; | |
| 138 | |
| 139 //////////////////////////////////////////////////// | |
| 140 // DOM HTML link features | |
| 141 //////////////////////////////////////////////////// | |
| 142 | |
| 143 // The fraction of links in the page which point to a domain other than the | |
| 144 // domain of the document. See "URL host features" above for a discussion | |
| 145 // of how the domain is computed. | |
| 146 extern const char kPageExternalLinksFreq[]; | |
| 147 // Token feature containing each external domain that is linked to. | |
| 148 extern const char kPageLinkDomain[]; | |
| 149 // Fraction of links in the page that use https. | |
| 150 extern const char kPageSecureLinksFreq[]; | |
| 151 | |
| 152 //////////////////////////////////////////////////// | |
| 153 // DOM HTML script features | |
| 154 //////////////////////////////////////////////////// | |
| 155 | |
| 156 // Set if the number of <script> elements in the page is greater than 1. | |
| 157 extern const char kPageNumScriptTagsGTOne[]; | |
| 158 // Set if the number of <script> elements in the page is greater than 6. | |
| 159 extern const char kPageNumScriptTagsGTSix[]; | |
| 160 | |
| 161 //////////////////////////////////////////////////// | |
| 162 // Other DOM HTML features | |
| 163 //////////////////////////////////////////////////// | |
| 164 | |
| 165 // The fraction of images whose src attribute points to an external domain. | |
| 166 extern const char kPageImgOtherDomainFreq[]; | |
| 167 | |
| 168 //////////////////////////////////////////////////// | |
| 169 // Page term features | |
| 170 //////////////////////////////////////////////////// | |
| 171 | |
| 172 // Token feature for a term (whitespace-delimited) on a page. Terms can be | |
| 173 // single words or multi-word n-grams. Rather than adding this feature for | |
| 174 // every possible token on a page, only the terms that are mentioned in the | |
| 175 // classification model are added. | |
| 176 extern const char kPageTerm[]; | |
| 177 | |
| 178 } // namespace features | |
| 179 } // namespace safe_browsing | |
| 180 | |
| 181 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | |
| OLD | NEW |