OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // Common types and constants for extracting and evaluating features in the | 5 // Common types and constants for extracting and evaluating features in the |
6 // client-side phishing detection model. A feature is simply a string and an | 6 // client-side phishing detection model. A feature is simply a string and an |
7 // associated floating-point value between 0 and 1. The phishing | 7 // associated floating-point value between 0 and 1. The phishing |
8 // classification model contains rules which give an appropriate weight to each | 8 // classification model contains rules which give an appropriate weight to each |
9 // feature or combination of features. These values can then be summed to | 9 // feature or combination of features. These values can then be summed to |
10 // compute a final phishiness score. | 10 // compute a final phishiness score. |
11 // | 11 // |
12 // Some features are boolean features. If these features are set, they always | 12 // Some features are boolean features. If these features are set, they always |
13 // have a value of 0.0 or 1.0. In practice, the features are only set if the | 13 // have a value of 0.0 or 1.0. In practice, the features are only set if the |
14 // value is true (1.0). | 14 // value is true (1.0). |
15 // | 15 // |
16 // We also use token features. These features have a unique name that is | 16 // We also use token features. These features have a unique name that is |
17 // constructed from the URL or page contents that we are classifying, for | 17 // constructed from the URL or page contents that we are classifying, for |
18 // example, "UrlDomain=chromium". These features are also always set to 1.0 | 18 // example, "UrlDomain=chromium". These features are also always set to 1.0 |
19 // if they are present. | 19 // if they are present. |
20 // | 20 // |
21 // The intermediate storage of the features for a URL is a FeatureMap, which is | 21 // The intermediate storage of the features for a URL is a FeatureMap, which is |
22 // just a thin wrapper around a map of feature name to value. The entire set | 22 // just a thin wrapper around a map of feature name to value. The entire set |
23 // of features for a URL is extracted before we do any scoring. | 23 // of features for a URL is extracted before we do any scoring. |
24 | 24 |
25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | 25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ |
26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | 26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ |
27 | 27 |
28 #include <stddef.h> | 28 #include <stddef.h> |
29 #include <string> | 29 #include <string> |
| 30 #include <unordered_map> |
30 | 31 |
31 #include "base/containers/hash_tables.h" | |
32 #include "base/macros.h" | 32 #include "base/macros.h" |
33 | 33 |
34 namespace safe_browsing { | 34 namespace safe_browsing { |
35 | 35 |
36 // Container for a map of features to values, which enforces behavior | 36 // Container for a map of features to values, which enforces behavior |
37 // such as a maximum number of features in the map. | 37 // such as a maximum number of features in the map. |
38 class FeatureMap { | 38 class FeatureMap { |
39 public: | 39 public: |
40 FeatureMap(); | 40 FeatureMap(); |
41 ~FeatureMap(); | 41 ~FeatureMap(); |
42 | 42 |
43 // Adds a boolean feature to a FeatureMap with a value of 1.0. | 43 // Adds a boolean feature to a FeatureMap with a value of 1.0. |
44 // Returns true on success, or false if the feature map exceeds | 44 // Returns true on success, or false if the feature map exceeds |
45 // kMaxFeatureMapSize. | 45 // kMaxFeatureMapSize. |
46 bool AddBooleanFeature(const std::string& name); | 46 bool AddBooleanFeature(const std::string& name); |
47 | 47 |
48 // Adds a real-valued feature to a FeatureMap with the given value. | 48 // Adds a real-valued feature to a FeatureMap with the given value. |
49 // Values must always be in the range [0.0, 1.0]. Returns true on | 49 // Values must always be in the range [0.0, 1.0]. Returns true on |
50 // success, or false if the feature map exceeds kMaxFeatureMapSize | 50 // success, or false if the feature map exceeds kMaxFeatureMapSize |
51 // or the value is outside of the allowed range. | 51 // or the value is outside of the allowed range. |
52 bool AddRealFeature(const std::string& name, double value); | 52 bool AddRealFeature(const std::string& name, double value); |
53 | 53 |
54 // Provides read-only access to the current set of features. | 54 // Provides read-only access to the current set of features. |
// The map is returned by const reference, so callers see features_ itself | // The map is returned by const reference, so callers see features_ itself |
// rather than a copy. | // rather than a copy. |
55 const base::hash_map<std::string, double>& features() const { | 55 const std::unordered_map<std::string, double>& features() const { |
56 return features_; | 56 return features_; |
57 } | 57 } |
58 | 58 |
59 // Clears the set of features in the map. | 59 // Clears the set of features in the map. |
60 void Clear(); | 60 void Clear(); |
61 | 61 |
62 // This is an upper bound on the number of features that will be extracted. | 62 // This is an upper bound on the number of features that will be extracted. |
63 // We should never hit this cap; it is intended as a sanity check to prevent | 63 // We should never hit this cap; it is intended as a sanity check to prevent |
64 // the FeatureMap from growing too large. | 64 // the FeatureMap from growing too large. |
// Only declared here; the value itself is not visible in this header. | // Only declared here; the value itself is not visible in this header. |
65 static const size_t kMaxFeatureMapSize; | 65 static const size_t kMaxFeatureMapSize; |
66 | 66 |
67 private: | 67 private: |
// Feature name -> feature value; this is the map exposed by features(). | // Feature name -> feature value; this is the map exposed by features(). |
68 base::hash_map<std::string, double> features_; | 68 std::unordered_map<std::string, double> features_; |
69 | 69 |
// NOTE(review): presumably makes FeatureMap non-copyable (the macro comes | // NOTE(review): presumably makes FeatureMap non-copyable (the macro comes |
// from base/macros.h) -- confirm. | // from base/macros.h) -- confirm. |
70 DISALLOW_COPY_AND_ASSIGN(FeatureMap); | 70 DISALLOW_COPY_AND_ASSIGN(FeatureMap); |
71 }; | 71 }; |
72 | 72 |
73 namespace features { | 73 namespace features { |
74 // Constants for the various feature names that we use. | 74 // Constants for the various feature names that we use. |
75 // | 75 // |
76 // IMPORTANT: when adding new features, you must update kAllowedFeatures in | 76 // IMPORTANT: when adding new features, you must update kAllowedFeatures in |
77 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature | 77 // chrome/browser/safe_browsing/client_side_detection_service.cc if the feature |
78 // should be sent in sanitized pingbacks. | 78 // should be sent in sanitized pingbacks. |
(...skipping 93 matching lines...)
172 // Token feature for a term (whitespace-delimited) on a page. Terms can be | 172 // Token feature for a term (whitespace-delimited) on a page. Terms can be |
173 // single words or multi-word n-grams. Rather than adding this feature for | 173 // single words or multi-word n-grams. Rather than adding this feature for |
174 // every possible token on a page, only the terms that are mentioned in the | 174 // every possible token on a page, only the terms that are mentioned in the |
175 // classification model are added. | 175 // classification model are added. |
176 extern const char kPageTerm[]; | 176 extern const char kPageTerm[]; |
177 | 177 |
178 } // namespace features | 178 } // namespace features |
179 } // namespace safe_browsing | 179 } // namespace safe_browsing |
180 | 180 |
181 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ | 181 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ |
OLD | NEW |