| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Common types and constants for extracting and evaluating features in the | 5 // Common types and constants for extracting and evaluating features in the |
| 6 // client-side phishing detection model. A feature is simply a string and an | 6 // client-side phishing detection model. A feature is simply a string and an |
| 7 // associated floating-point value between 0 and 1. The phishing | 7 // associated floating-point value between 0 and 1. The phishing |
| 8 // classification model contains rules which give an appropriate weight to each | 8 // classification model contains rules which give an appropriate weight to each |
| 9 // feature or combination of features. These values can then be summed to | 9 // feature or combination of features. These values can then be summed to |
| 10 // compute a final phishiness score. | 10 // compute a final phishiness score. |
| 11 // | 11 // |
| 12 // Some features are boolean features. If these features are set, they always | 12 // Some features are boolean features. If these features are set, they always |
| 13 // have a value of 0.0 or 1.0. In practice, the features are only set if the | 13 // have a value of 0.0 or 1.0. In practice, the features are only set if the |
| 14 // value is true (1.0). | 14 // value is true (1.0). |
| 15 // | 15 // |
| 16 // We also use token features. These features have a unique name that is | 16 // We also use token features. These features have a unique name that is |
| 17 // constructed from the URL or page contents that we are classifying, for | 17 // constructed from the URL or page contents that we are classifying, for |
| 18 // example, "UrlDomain=chromium". These features are also always set to 1.0 | 18 // example, "UrlDomain=chromium". These features are also always set to 1.0 |
| 19 // if they are present. | 19 // if they are present. |
| 20 // | 20 // |
| 21 // The intermediate storage of the features for a URL is a FeatureMap, which is | 21 // The intermediate storage of the features for a URL is a FeatureMap, which is |
| 22 // just a thin wrapper around a map of feature name to value. The entire set | 22 // just a thin wrapper around a map of feature name to value. The entire set |
| 23 // of features for a URL is extracted before we do any scoring. | 23 // of features for a URL is extracted before we do any scoring. |
| 24 | 24 |
| 25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | 25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ |
| 26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | 26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ |
| 27 | 27 |
| 28 #include <stddef.h> | 28 #include <stddef.h> |
| 29 #include <string> | 29 #include <string> |
| 30 #include <unordered_map> |
| 30 | 31 |
| 31 #include "base/containers/hash_tables.h" | |
| 32 #include "base/macros.h" | 32 #include "base/macros.h" |
| 33 | 33 |
| 34 namespace safe_browsing { | 34 namespace safe_browsing { |
| 35 | 35 |
| 36 // Container for a map of features to values, which enforces behavior | 36 // Container for a map of features to values, which enforces behavior |
| 37 // such as a maximum number of features in the map. | 37 // such as a maximum number of features in the map. |
| 38 class FeatureMap { | 38 class FeatureMap { |
| 39 public: | 39 public: |
| 40 FeatureMap(); | 40 FeatureMap(); |
| 41 ~FeatureMap(); | 41 ~FeatureMap(); |
| 42 | 42 |
| 43 // Adds a boolean feature to a FeatureMap with a value of 1.0. | 43 // Adds a boolean feature to a FeatureMap with a value of 1.0. |
| 44 // Returns true on success, or false if the feature map exceeds | 44 // Returns true on success, or false if the feature map exceeds |
| 45 // kMaxFeatureMapSize. | 45 // kMaxFeatureMapSize. |
| 46 bool AddBooleanFeature(const std::string& name); | 46 bool AddBooleanFeature(const std::string& name); |
| 47 | 47 |
| 48 // Adds a real-valued feature to a FeatureMap with the given value. | 48 // Adds a real-valued feature to a FeatureMap with the given value. |
| 49 // Values must always be in the range [0.0, 1.0]. Returns true on | 49 // Values must always be in the range [0.0, 1.0]. Returns true on |
| 50 // success, or false if the feature map exceeds kMaxFeatureMapSize | 50 // success, or false if the feature map exceeds kMaxFeatureMapSize |
| 51 // or the value is outside of the allowed range. | 51 // or the value is outside of the allowed range. |
| 52 bool AddRealFeature(const std::string& name, double value); | 52 bool AddRealFeature(const std::string& name, double value); |
| 53 | 53 |
| 54 // Provides read-only access to the current set of features. | 54 // Provides read-only access to the current set of features. |
| 55 const base::hash_map<std::string, double>& features() const { | 55 const std::unordered_map<std::string, double>& features() const { |
| 56 return features_; | 56 return features_; |
| 57 } | 57 } |
| 58 | 58 |
| 59 // Clears the set of features in the map. | 59 // Clears the set of features in the map. |
| 60 void Clear(); | 60 void Clear(); |
| 61 | 61 |
| 62 // This is an upper bound on the number of features that will be extracted. | 62 // This is an upper bound on the number of features that will be extracted. |
| 63 // We should never hit this cap; it is intended as a sanity check to prevent | 63 // We should never hit this cap; it is intended as a sanity check to prevent |
| 64 // the FeatureMap from growing too large. | 64 // the FeatureMap from growing too large. |
| 65 static const size_t kMaxFeatureMapSize; | 65 static const size_t kMaxFeatureMapSize; |
| 66 | 66 |
| 67 private: | 67 private: |
| 68 base::hash_map<std::string, double> features_; | 68 std::unordered_map<std::string, double> features_; |
| 69 | 69 |
| 70 DISALLOW_COPY_AND_ASSIGN(FeatureMap); | 70 DISALLOW_COPY_AND_ASSIGN(FeatureMap); |
| 71 }; | 71 }; |
| 72 | 72 |
| 73 namespace features { | 73 namespace features { |
| 74 // Constants for the various feature names that we use. | 74 // Constants for the various feature names that we use. |
| 75 // | 75 // |
| 76 // IMPORTANT: when adding new features, you must update kAllowedFeatures in | 76 // IMPORTANT: when adding new features, you must update kAllowedFeatures in |
| 77 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature | 77 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature |
| 78 // should be sent in sanitized pingbacks. | 78 // should be sent in sanitized pingbacks. |
| (...skipping 93 matching lines...) |
| 172 // Token feature for a term (whitespace-delimited) on a page. Terms can be | 172 // Token feature for a term (whitespace-delimited) on a page. Terms can be |
| 173 // single words or multi-word n-grams. Rather than adding this feature for | 173 // single words or multi-word n-grams. Rather than adding this feature for |
| 174 // every possible token on a page, only the terms that are mentioned in the | 174 // every possible token on a page, only the terms that are mentioned in the |
| 175 // classification model are added. | 175 // classification model are added. |
| 176 extern const char kPageTerm[]; | 176 extern const char kPageTerm[]; |
| 177 | 177 |
| 178 } // namespace features | 178 } // namespace features |
| 179 } // namespace safe_browsing | 179 } // namespace safe_browsing |
| 180 | 180 |
| 181 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | 181 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ |
| OLD | NEW |