OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // This utility program exists to process the False Start blacklist file into | 5 // This utility program exists to process the False Start blacklist file into |
6 // a static hash table so that it can be efficiently queried by Chrome. | 6 // a static hash table so that it can be efficiently queried by Chrome. |
7 | 7 |
8 #include <stdio.h> | 8 #include <algorithm> |
9 #include <stdlib.h> | 9 #include <cstdio> |
10 | |
11 #include <set> | 10 #include <set> |
11 #include <sstream> | |
12 #include <string> | 12 #include <string> |
13 #include <vector> | 13 #include <vector> |
14 | 14 |
15 #include "base/basictypes.h" | 15 #include "base/basictypes.h" |
16 #include "base/file_util.h" | |
17 #include "base/string_util.h" | |
16 #include "net/base/ssl_false_start_blacklist.h" | 18 #include "net/base/ssl_false_start_blacklist.h" |
17 | 19 |
18 using net::SSLFalseStartBlacklist; | 20 typedef std::vector<std::string> Hosts; |
19 | 21 |
20 static const unsigned kBuckets = SSLFalseStartBlacklist::kBuckets; | 22 // Parses |input| as a blacklist data file, and returns the set of hosts it |
21 | 23 // contains. |
22 static bool verbose = false; | 24 Hosts ParseHosts(const std::string& input) { |
23 | 25 Hosts hosts; |
24 static int | 26 size_t line_start = 0; |
25 usage(const char* argv0) { | 27 bool is_comment = false; |
26 fprintf(stderr, "Usage: %s <blacklist file> <output .c file>\n", argv0); | 28 bool non_whitespace_seen = false; |
27 return 1; | 29 for (size_t i = 0; i <= input.size(); ++i) { |
30 if (i == input.size() || input[i] == '\n') { | |
31 if (!is_comment && non_whitespace_seen) { | |
32 size_t len = i - line_start; | |
33 if (i > 0 && input[i - 1] == '\r') | |
34 len--; | |
35 hosts.push_back(input.substr(line_start, len)); | |
36 } | |
37 is_comment = false; | |
38 non_whitespace_seen = false; | |
39 line_start = i + 1; | |
40 } else if (input[i] != ' ' && input[i] != '\t' && input[i] != '\r') { | |
41 non_whitespace_seen = true; | |
42 if (i == line_start && input[i] == '#') | |
43 is_comment = true; | |
44 } | |
45 } | |
46 VLOG(1) << "Have " << hosts.size() << " hosts after parse"; | |
47 return hosts; | |
28 } | 48 } |
29 | 49 |
30 // StripWWWPrefix removes "www." from the beginning of any elements of the | 50 // Returns |host| with any initial "www." and trailing dots removed. Partly |
31 // vector. | 51 // based on net::StripWWW(). |
32 static void StripWWWPrefix(std::vector<std::string>* hosts) { | 52 std::string StripWWWAndTrailingDots(const std::string& host) { |
33 static const char kPrefix[] = "www."; | 53 const std::string www("www."); |
34 static const unsigned kPrefixLen = sizeof(kPrefix) - 1; | 54 const size_t start = StartsWithASCII(host, www, true) ? www.length() : 0; |
35 | 55 const size_t end = host.find_last_not_of('.'); |
36 for (size_t i = 0; i < hosts->size(); i++) { | 56 return (end == std::string::npos) ? |
37 const std::string& h = (*hosts)[i]; | 57 std::string() : host.substr(start, end - start + 1); |
38 if (h.size() >= kPrefixLen && | |
39 memcmp(h.data(), kPrefix, kPrefixLen) == 0) { | |
40 (*hosts)[i] = h.substr(kPrefixLen, h.size() - kPrefixLen); | |
41 } | |
42 } | |
43 } | 58 } |
44 | 59 |
45 // RemoveDuplicateEntries removes all duplicates from |hosts|. | 60 // Removes all duplicates from |hosts|. |
46 static void RemoveDuplicateEntries(std::vector<std::string>* hosts) { | 61 static void RemoveDuplicateEntries(std::vector<std::string>* hosts) { |
47 std::set<std::string> hosts_set; | 62 std::sort(hosts->begin(), hosts->end()); |
48 std::vector<std::string> ret; | 63 const Hosts::iterator first_dupe(std::unique(hosts->begin(), hosts->end())); |
64 for (Hosts::const_iterator i(first_dupe); i != hosts->end(); ++i) | |
agl
2011/08/02 21:39:16
The SGI STL spec says that the iterators after |fi
Peter Kasting
2011/08/02 23:37:03
Done.
| |
65 VLOG(1) << "Removing duplicate entry for " << *i; | |
66 hosts->erase(first_dupe, hosts->end()); | |
67 VLOG(1) << "Have " << hosts->size() << " hosts after removing duplicates"; | |
68 } | |
49 | 69 |
50 for (std::vector<std::string>::const_iterator | 70 // Returns the parent domain for |host|, or the empty string if the name is a |
51 i = hosts->begin(); i != hosts->end(); i++) { | 71 // top-level domain. |
52 if (hosts_set.count(*i)) { | 72 static std::string ParentDomain(const std::string& host) { |
53 if (verbose) | 73 const size_t first_dot = host.find('.'); |
54 fprintf(stderr, "Removing duplicate entry for %s\n", i->c_str()); | 74 return (first_dot == std::string::npos) ? |
55 continue; | 75 std::string() : host.substr(first_dot + 1); |
76 } | |
77 | |
78 // Predicate which returns true when a hostname has a parent domain in the set | |
79 // of hosts provided at construction time. | |
80 class ParentInSet : public std::unary_function<std::string, bool> { | |
81 public: | |
82 explicit ParentInSet(const std::set<std::string>& hosts) : hosts_(hosts) {} | |
83 | |
84 bool operator()(const std::string& host) const { | |
85 for (std::string parent(ParentDomain(host)); !parent.empty(); | |
86 parent = ParentDomain(parent)) { | |
87 if (hosts_.count(parent)) { | |
88 VLOG(1) << "Removing " << host << " as redundant"; | |
89 return true; | |
90 } | |
56 } | 91 } |
57 hosts_set.insert(*i); | 92 return false; |
58 ret.push_back(*i); | |
59 } | 93 } |
60 | 94 |
61 hosts->swap(ret); | 95 private: |
96 const std::set<std::string>& hosts_; | |
97 }; | |
98 | |
99 // Removes any hosts which are subdomains of other hosts. E.g. | |
100 // "foo.example.com" would be removed if "example.com" were also included. | |
101 static void RemoveRedundantEntries(Hosts* hosts) { | |
102 std::set<std::string> hosts_set; | |
103 for (Hosts::const_iterator i(hosts->begin()); i != hosts->end(); ++i) | |
104 hosts_set.insert(*i); | |
105 hosts->erase(std::remove_if(hosts->begin(), hosts->end(), ParentInSet(hosts_se t)), | |
agl
2011/08/02 21:39:16
(nit) 80 chars
Peter Kasting
2011/08/02 23:37:03
Fixed.
| |
106 hosts->end()); | |
107 VLOG(1) << "Have " << hosts->size() << " hosts after removing redundants"; | |
62 } | 108 } |
63 | 109 |
64 // ParentDomain returns the parent domain for a given domain name or the empty | 110 // Returns true iff all |hosts| are less than 256 bytes long (not including the |
65 // string if the name is a top-level domain. | 111 // terminating NUL) and contain two or more dot-separated components. |
66 static std::string ParentDomain(const std::string& in) { | 112 static bool CheckLengths(const Hosts& hosts) { |
67 for (size_t i = 0; i < in.size(); i++) { | 113 for (Hosts::const_iterator i(hosts.begin()); i != hosts.end(); ++i) { |
68 if (in[i] == '.') { | |
69 return in.substr(i + 1, in.size() - i - 1); | |
70 } | |
71 } | |
72 | |
73 return std::string(); | |
74 } | |
75 | |
76 // RemoveRedundantEntries removes any entries which are subdomains of other | |
77 // entries. (i.e. foo.example.com would be removed if example.com were also | |
78 // included.) | |
79 static void RemoveRedundantEntries(std::vector<std::string>* hosts) { | |
80 std::set<std::string> hosts_set; | |
81 std::vector<std::string> ret; | |
82 | |
83 for (std::vector<std::string>::const_iterator | |
84 i = hosts->begin(); i != hosts->end(); i++) { | |
85 hosts_set.insert(*i); | |
86 } | |
87 | |
88 for (std::vector<std::string>::const_iterator | |
89 i = hosts->begin(); i != hosts->end(); i++) { | |
90 std::string parent = ParentDomain(*i); | |
91 while (!parent.empty()) { | |
92 if (hosts_set.count(parent)) | |
93 break; | |
94 parent = ParentDomain(parent); | |
95 } | |
96 if (parent.empty()) { | |
97 ret.push_back(*i); | |
98 } else { | |
99 if (verbose) | |
100 fprintf(stderr, "Removing %s as redundant\n", i->c_str()); | |
101 } | |
102 } | |
103 | |
104 hosts->swap(ret); | |
105 } | |
106 | |
107 // CheckLengths returns true iff every host is less than 256 bytes long (not | |
108 // including the terminating NUL) and contains two or more labels. | |
109 static bool CheckLengths(const std::vector<std::string>& hosts) { | |
110 for (std::vector<std::string>::const_iterator | |
111 i = hosts.begin(); i != hosts.end(); i++) { | |
112 if (i->size() >= 256) { | 114 if (i->size() >= 256) { |
113 fprintf(stderr, "Entry %s is too large\n", i->c_str()); | 115 fprintf(stderr, "Entry '%s' is too large\n", i->c_str()); |
114 return false; | 116 return false; |
115 } | 117 } |
116 if (SSLFalseStartBlacklist::LastTwoLabels(i->c_str()) == NULL) { | 118 if (net::SSLFalseStartBlacklist::LastTwoComponents(*i).empty()) { |
117 fprintf(stderr, "Entry %s contains too few labels\n", i->c_str()); | 119 fprintf(stderr, "Entry '%s' contains too few labels\n", i->c_str()); |
118 return false; | 120 return false; |
119 } | 121 } |
120 } | 122 } |
121 | 123 |
122 return true; | 124 return true; |
123 } | 125 } |
124 | 126 |
125 int main(int argc, char** argv) { | 127 // Returns the contents of the output file to be written. |
126 if (argc != 3) | 128 std::string GenerateOutput(const Hosts& hosts) { |
127 return usage(argv[0]); | 129 // Hash each host into its appropriate bucket. |
128 | 130 VLOG(1) << "Using " << net::SSLFalseStartBlacklist::kBuckets |
129 const char* input_file = argv[1]; | 131 << " entry hash table"; |
130 const char* output_file = argv[2]; | 132 Hosts buckets[net::SSLFalseStartBlacklist::kBuckets]; |
131 FILE* input = fopen(input_file, "rb"); | 133 for (Hosts::const_iterator i(hosts.begin()); i != hosts.end(); ++i) { |
132 if (!input) { | 134 buckets[net::SSLFalseStartBlacklist::Hash( |
agl
2011/08/02 21:39:16
I think that this is too much for one line. Maybe:
Peter Kasting
2011/08/02 23:37:03
Yeah, it's a big chunk. Split into two pieces.
| |
133 perror("open"); | 135 net::SSLFalseStartBlacklist::LastTwoComponents(*i)) & |
134 return usage(argv[0]); | 136 (net::SSLFalseStartBlacklist::kBuckets - 1)].push_back(*i); |
135 } | 137 } |
136 | 138 |
137 if (fseek(input, 0, SEEK_END)) { | 139 // Write header. |
138 perror("fseek"); | 140 std::ostringstream output; |
141 output << "// Copyright (c) 2011 The Chromium Authors. All rights reserved.\n" | |
142 "// Use of this source code is governed by a BSD-style license that" | |
143 " can be\n// found in the LICENSE file.\n\n// WARNING: This code is" | |
144 " generated by ssl_false_start_blacklist_process.cc.\n// Do not " | |
145 "edit.\n\n#include \"net/base/ssl_false_start_blacklist.h\"\n\n" | |
146 "namespace net {\n\nconst uint32 " | |
147 "SSLFalseStartBlacklist::kHashTable[" | |
148 << net::SSLFalseStartBlacklist::kBuckets << " + 1] = {\n 0,\n"; | |
149 | |
150 // Construct data table, writing out the size as each bucket is appended. | |
151 std::string table_data; | |
152 size_t max_bucket_size = 0; | |
153 for (size_t i = 0; i < net::SSLFalseStartBlacklist::kBuckets; i++) { | |
154 max_bucket_size = std::max(max_bucket_size, buckets[i].size()); | |
155 for (Hosts::const_iterator j(buckets[i].begin()); j != buckets[i].end(); | |
156 ++j) { | |
157 table_data.push_back(static_cast<char>(j->size())); | |
158 table_data.append(*j); | |
159 } | |
160 output << " " << table_data.size() << ",\n"; | |
161 } | |
162 output << "};\n\n"; | |
163 VLOG(1) << "Largest bucket has " << max_bucket_size << " entries"; | |
164 | |
165 // Write data table, breaking lines after 72+ (2 indent, 70+ data) characters. | |
166 output << "const char SSLFalseStartBlacklist::kHashData[] = {\n"; | |
167 for (size_t i = 0, line_length = 0; i < table_data.size(); i++) { | |
168 if (line_length == 0) | |
169 output << " "; | |
170 std::ostringstream::pos_type current_length = output.tellp(); | |
171 output << static_cast<int>(table_data[i]) << ", "; | |
172 line_length += output.tellp() - current_length; | |
173 if (i == table_data.size() - 1) { | |
174 output << "\n};\n"; | |
175 } else if (line_length >= 70) { | |
176 output << "\n"; | |
177 line_length = 0; | |
178 } | |
179 } | |
180 output << "\n} // namespace net\n"; | |
181 return output.str(); | |
182 } | |
183 | |
184 #if defined(OS_WIN) | |
185 int wmain(int argc, wchar_t* argv[], wchar_t* envp[]) { | |
186 #elif defined(OS_POSIX) | |
187 int main(int argc, char* argv[], char* envp[]) { | |
188 #endif | |
189 if (argc != 3) { | |
190 fprintf(stderr, "Usage: %s <blacklist file> <output .c file>\n", argv[0]); | |
139 return 1; | 191 return 1; |
140 } | 192 } |
141 | 193 |
142 const long input_size = ftell(input); | 194 // Read input file. |
143 if (input_size < 0) { | 195 std::string input; |
144 perror("ftell"); | 196 if (!file_util::ReadFileToString(FilePath(argv[1]), &input)) { |
145 return 1; | 197 fprintf(stderr, "Failed to read input file '%s'\n", argv[1]); |
146 } | |
147 | |
148 if (fseek(input, 0, SEEK_SET)) { | |
149 perror("fseek"); | |
150 return 1; | |
151 } | |
152 | |
153 char* buffer = static_cast<char*>(malloc(input_size)); | |
154 long done = 0; | |
155 while (done < input_size) { | |
156 size_t n = fread(buffer + done, 1, input_size - done, input); | |
157 if (n == 0) { | |
158 perror("fread"); | |
159 free(buffer); | |
160 fclose(input); | |
161 return 1; | |
162 } | |
163 done += n; | |
164 } | |
165 fclose(input); | |
166 | |
167 std::vector<std::string> hosts; | |
168 | |
169 off_t line_start = 0; | |
170 bool is_comment = false; | |
171 bool non_whitespace_seen = false; | |
172 for (long i = 0; i <= input_size; i++) { | |
173 if (i == input_size || buffer[i] == '\n') { | |
174 if (!is_comment && non_whitespace_seen) { | |
175 long len = i - line_start; | |
176 if (i > 0 && buffer[i-1] == '\r') | |
177 len--; | |
178 hosts.push_back(std::string(&buffer[line_start], len)); | |
179 } | |
180 is_comment = false; | |
181 non_whitespace_seen = false; | |
182 line_start = i + 1; | |
183 continue; | |
184 } | |
185 | |
186 if (i == line_start && buffer[i] == '#') | |
187 is_comment = true; | |
188 if (buffer[i] != ' ' && buffer[i] != '\t' && buffer[i] != '\r') | |
189 non_whitespace_seen = true; | |
190 } | |
191 free(buffer); | |
192 | |
193 fprintf(stderr, "Have %d hosts after parse\n", (int) hosts.size()); | |
194 StripWWWPrefix(&hosts); | |
195 RemoveDuplicateEntries(&hosts); | |
196 fprintf(stderr, "Have %d hosts after removing duplicates\n", (int) hosts.size( )); | |
197 RemoveRedundantEntries(&hosts); | |
198 fprintf(stderr, "Have %d hosts after removing redundants\n", (int) hosts.size( )); | |
199 if (!CheckLengths(hosts)) { | |
200 fprintf(stderr, "One or more entries is too large or too small\n"); | |
201 return 2; | 198 return 2; |
202 } | 199 } |
200 Hosts hosts(ParseHosts(input)); | |
203 | 201 |
204 fprintf(stderr, "Using %d entry hash table\n", kBuckets); | 202 // Sanitize |hosts|. |
205 uint32 table[kBuckets]; | 203 std::transform(hosts.begin(), hosts.end(), hosts.begin(), |
206 std::vector<std::string> buckets[kBuckets]; | 204 StripWWWAndTrailingDots); |
205 RemoveDuplicateEntries(&hosts); | |
206 RemoveRedundantEntries(&hosts); | |
207 if (!CheckLengths(hosts)) | |
208 return 3; | |
207 | 209 |
208 for (std::vector<std::string>::const_iterator | 210 // Write output file. |
209 i = hosts.begin(); i != hosts.end(); i++) { | 211 const std::string output_str(GenerateOutput(hosts)); |
210 const char* last_two_labels = | 212 if (file_util::WriteFile(FilePath(argv[2]), output_str.data(), |
211 SSLFalseStartBlacklist::LastTwoLabels(i->c_str()); | 213 output_str.size()) == static_cast<int>(output_str.size())) |
212 const unsigned h = SSLFalseStartBlacklist::Hash(last_two_labels); | 214 return 0; |
213 buckets[h & (kBuckets - 1)].push_back(*i); | 215 fprintf(stderr, "Failed to write output file '%s'\n", argv[2]); |
214 } | 216 return 4; |
215 | |
216 std::string table_data; | |
217 unsigned max_bucket_size = 0; | |
218 for (unsigned i = 0; i < kBuckets; i++) { | |
219 if (buckets[i].size() > max_bucket_size) | |
220 max_bucket_size = buckets[i].size(); | |
221 | |
222 table[i] = table_data.size(); | |
223 for (std::vector<std::string>::const_iterator | |
224 j = buckets[i].begin(); j != buckets[i].end(); j++) { | |
225 table_data.push_back((char) j->size()); | |
226 table_data.append(*j); | |
227 } | |
228 } | |
229 | |
230 fprintf(stderr, "Largest bucket has %d entries\n", max_bucket_size); | |
231 | |
232 FILE* out = fopen(output_file, "w+"); | |
233 if (!out) { | |
234 perror("opening output file"); | |
235 return 4; | |
236 } | |
237 | |
238 fprintf(out, "// Copyright (c) 2010 The Chromium Authors. All rights " | |
239 "reserved.\n// Use of this source code is governed by a BSD-style " | |
240 "license that can be\n// found in the LICENSE file.\n\n"); | |
241 fprintf(out, "// WARNING: this code is generated by\n" | |
242 "// ssl_false_start_blacklist_process.cc. Do not edit.\n\n"); | |
243 fprintf(out, "#include \"base/basictypes.h\"\n\n"); | |
244 fprintf(out, "#include \"net/base/ssl_false_start_blacklist.h\"\n\n"); | |
245 fprintf(out, "namespace net {\n\n"); | |
246 fprintf(out, "const uint32 SSLFalseStartBlacklist::kHashTable[%d + 1] = {\n", | |
247 kBuckets); | |
248 for (unsigned i = 0; i < kBuckets; i++) { | |
249 fprintf(out, " %u,\n", (unsigned) table[i]); | |
250 } | |
251 fprintf(out, " %u,\n", (unsigned) table_data.size()); | |
252 fprintf(out, "};\n\n"); | |
253 | |
254 fprintf(out, "const char SSLFalseStartBlacklist::kHashData[] = {\n"); | |
255 for (unsigned i = 0, line_length = 0; i < table_data.size(); i++) { | |
256 if (line_length == 0) | |
257 fprintf(out, " "); | |
258 uint8 c = static_cast<uint8>(table_data[i]); | |
259 line_length += fprintf(out, "%d, ", c); | |
260 if (i == table_data.size() - 1) { | |
261 fprintf(out, "\n};\n"); | |
262 } else if (line_length >= 70) { | |
263 fprintf(out, "\n"); | |
264 line_length = 0; | |
265 } | |
266 } | |
267 fprintf(out, "\n} // namespace net\n"); | |
268 fclose(out); | |
269 | |
270 return 0; | |
271 } | 217 } |
OLD | NEW |