OLD | NEW |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This proto represents a machine learning model which is used to compute | 5 // This proto represents a machine learning model which is used to compute |
6 // the probability that a particular page visited by Chrome is phishing. | 6 // the probability that a particular page visited by Chrome is phishing. |
7 // | 7 // |
8 // Note: since the machine learning model is trained on the server-side and then | 8 // Note: since the machine learning model is trained on the server-side and then |
9 // downloaded onto the client, it is important that this proto file stays in | 9 // downloaded onto the client, it is important that this proto file stays in |
10 // sync with the server-side copy. Otherwise, the client may not be able to | 10 // sync with the server-side copy. Otherwise, the client may not be able to |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
53 } | 53 } |
54 | 54 |
55 // List of rules which make up the model | 55 // List of rules which make up the model |
56 repeated Rule rule = 2; | 56 repeated Rule rule = 2; |
57 | 57 |
58 // List of indexes that point to the hashed page terms that appear in | 58 // List of indexes that point to the hashed page terms that appear in |
59 // the model. The hashes are computed over page terms that are encoded | 59 // the model. The hashes are computed over page terms that are encoded |
60 // as lowercase UTF-8 strings. | 60 // as lowercase UTF-8 strings. |
61 repeated int32 page_term = 3; | 61 repeated int32 page_term = 3; |
62 | 62 |
63 // List of indexes that point to the hashed page words. The page words | 63 // List of hashed page words. The page words correspond to all words that |
64 // correspond to all words that appear in page terms. If the term | 64 // appear in page terms. If the term "one two" is in the list of page terms |
65 // "one two" is in the list of page terms then "one" and "two" will be | 65 // then "one" and "two" will be in the list of page words. For page words |
66 // in the list of page words. | 66 // we don't use SHA256 because it is too expensive. We use MurmurHash3 |
67 repeated int32 page_word = 4; | 67 // instead. See: http://code.google.com/p/smhasher. |
| 68 repeated fixed32 page_word = 4; |
68 | 69 |
69 // Page terms in page_term contain at most this many page words. | 70 // Page terms in page_term contain at most this many page words. |
70 required int32 max_words_per_term = 5; | 71 required int32 max_words_per_term = 5; |
71 | 72 |
72 // Model version number. Every model that we train should have a different | 73 // Model version number. Every model that we train should have a different |
73 // version number and it should always be larger than the previous model | 74 // version number and it should always be larger than the previous model |
74 // version. | 75 // version. |
75 optional int32 version = 6; | 76 optional int32 version = 6; |
76 | 77 |
77 // List of known bad IP subnets. | 78 // List of known bad IP subnets. |
78 message IPSubnet { | 79 message IPSubnet { |
79 // The subnet prefix is a valid 16-byte IPv6 address (in network order) that | 80 // The subnet prefix is a valid 16-byte IPv6 address (in network order) that |
80 // is hashed using SHA256. | 81 // is hashed using SHA256. |
81 required bytes prefix = 1; | 82 required bytes prefix = 1; |
82 | 83 |
83 // Network prefix size in bits. Default is an exact-host match. | 84 // Network prefix size in bits. Default is an exact-host match. |
84 optional int32 size = 2 [default = 128]; | 85 optional int32 size = 2 [default = 128]; |
85 }; | 86 }; |
86 repeated IPSubnet bad_subnet = 7; | 87 repeated IPSubnet bad_subnet = 7; |
| 88 |
| 89 // Murmur hash seed that was used to hash the page words. |
| 90 optional fixed32 murmur_hash_seed = 8; |
87 } | 91 } |
OLD | NEW |