| OLD | NEW |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // This utility program exists to process the False Start blacklist file into | 5 // This utility program exists to process the False Start blacklist file into |
| 6 // a static hash table so that it can be efficiently queried by Chrome. | 6 // a static hash table so that it can be efficiently queried by Chrome. |
| 7 | 7 |
| 8 #include <algorithm> | 8 #include <stdio.h> |
| 9 #include <cstdio> | 9 #include <stdlib.h> |
| 10 |
| 10 #include <set> | 11 #include <set> |
| 11 #include <sstream> | |
| 12 #include <string> | 12 #include <string> |
| 13 #include <vector> | 13 #include <vector> |
| 14 | 14 |
| 15 #include "base/basictypes.h" | 15 #include "base/basictypes.h" |
| 16 #include "base/file_util.h" | |
| 17 #include "base/string_util.h" | |
| 18 #include "net/base/ssl_false_start_blacklist.h" | 16 #include "net/base/ssl_false_start_blacklist.h" |
| 19 | 17 |
| 20 typedef std::vector<std::string> Hosts; | 18 using net::SSLFalseStartBlacklist; |
| 21 | 19 |
| 22 // Parses |input| as a blacklist data file, and returns the set of hosts it | 20 static const unsigned kBuckets = SSLFalseStartBlacklist::kBuckets; |
| 23 // contains. | 21 |
| 24 Hosts ParseHosts(const std::string& input) { | 22 static bool verbose = false; |
| 25 Hosts hosts; | 23 |
| 26 size_t line_start = 0; | 24 static int |
| 27 bool is_comment = false; | 25 usage(const char* argv0) { |
| 28 bool non_whitespace_seen = false; | 26 fprintf(stderr, "Usage: %s <blacklist file> <output .c file>\n", argv0); |
| 29 for (size_t i = 0; i <= input.size(); ++i) { | 27 return 1; |
| 30 if (i == input.size() || input[i] == '\n') { | 28 } |
| 31 if (!is_comment && non_whitespace_seen) { | 29 |
| 32 size_t len = i - line_start; | 30 // StripWWWPrefix removes "www." from the beginning of any elements of the |
| 33 if (i > 0 && input[i - 1] == '\r') | 31 // vector. |
| 34 len--; | 32 static void StripWWWPrefix(std::vector<std::string>* hosts) { |
| 35 hosts.push_back(input.substr(line_start, len)); | 33 static const char kPrefix[] = "www."; |
| 36 } | 34 static const unsigned kPrefixLen = sizeof(kPrefix) - 1; |
| 37 is_comment = false; | 35 |
| 38 non_whitespace_seen = false; | 36 for (size_t i = 0; i < hosts->size(); i++) { |
| 39 line_start = i + 1; | 37 const std::string& h = (*hosts)[i]; |
| 40 } else if (input[i] != ' ' && input[i] != '\t' && input[i] != '\r') { | 38 if (h.size() >= kPrefixLen && |
| 41 non_whitespace_seen = true; | 39 memcmp(h.data(), kPrefix, kPrefixLen) == 0) { |
| 42 if (i == line_start && input[i] == '#') | 40 (*hosts)[i] = h.substr(kPrefixLen, h.size() - kPrefixLen); |
| 43 is_comment = true; | |
| 44 } | 41 } |
| 45 } | 42 } |
| 46 VLOG(1) << "Have " << hosts.size() << " hosts after parse"; | |
| 47 return hosts; | |
| 48 } | 43 } |
| 49 | 44 |
| 50 // Returns |host| with any initial "www." and trailing dots removed. Partly | 45 // RemoveDuplicateEntries removes all duplicates from |hosts|. |
| 51 // based on net::StripWWW(). | 46 static void RemoveDuplicateEntries(std::vector<std::string>* hosts) { |
| 52 std::string StripWWWAndTrailingDots(const std::string& host) { | 47 std::set<std::string> hosts_set; |
| 53 const std::string www("www."); | 48 std::vector<std::string> ret; |
| 54 const size_t start = StartsWithASCII(host, www, true) ? www.length() : 0; | 49 |
| 55 const size_t end = host.find_last_not_of('.'); | 50 for (std::vector<std::string>::const_iterator |
| 56 return (end == std::string::npos) ? | 51 i = hosts->begin(); i != hosts->end(); i++) { |
| 57 std::string() : host.substr(start, end - start + 1); | 52 if (hosts_set.count(*i)) { |
| 53 if (verbose) |
| 54 fprintf(stderr, "Removing duplicate entry for %s\n", i->c_str()); |
| 55 continue; |
| 56 } |
| 57 hosts_set.insert(*i); |
| 58 ret.push_back(*i); |
| 59 } |
| 60 |
| 61 hosts->swap(ret); |
| 58 } | 62 } |
| 59 | 63 |
| 60 // Removes all duplicates from |hosts|. | 64 // ParentDomain returns the parent domain for a given domain name or the empty |
| 61 static void RemoveDuplicateEntries(std::vector<std::string>* hosts) { | 65 // string if the name is a top-level domain. |
| 62 std::sort(hosts->begin(), hosts->end()); | 66 static std::string ParentDomain(const std::string& in) { |
| 63 hosts->erase(std::unique(hosts->begin(), hosts->end()), hosts->end()); | 67 for (size_t i = 0; i < in.size(); i++) { |
| 64 VLOG(1) << "Have " << hosts->size() << " hosts after removing duplicates"; | 68 if (in[i] == '.') { |
| 69 return in.substr(i + 1, in.size() - i - 1); |
| 70 } |
| 71 } |
| 72 |
| 73 return std::string(); |
| 65 } | 74 } |
| 66 | 75 |
| 67 // Returns the parent domain for |host|, or the empty string if the name is a | 76 // RemoveRedundantEntries removes any entries which are subdomains of other |
| 68 // top-level domain. | 77 // entries. (i.e. foo.example.com would be removed if example.com were also |
| 69 static std::string ParentDomain(const std::string& host) { | 78 // included.) |
| 70 const size_t first_dot = host.find('.'); | 79 static void RemoveRedundantEntries(std::vector<std::string>* hosts) { |
| 71 return (first_dot == std::string::npos) ? | 80 std::set<std::string> hosts_set; |
| 72 std::string() : host.substr(first_dot + 1); | 81 std::vector<std::string> ret; |
| 82 |
| 83 for (std::vector<std::string>::const_iterator |
| 84 i = hosts->begin(); i != hosts->end(); i++) { |
| 85 hosts_set.insert(*i); |
| 86 } |
| 87 |
| 88 for (std::vector<std::string>::const_iterator |
| 89 i = hosts->begin(); i != hosts->end(); i++) { |
| 90 std::string parent = ParentDomain(*i); |
| 91 while (!parent.empty()) { |
| 92 if (hosts_set.count(parent)) |
| 93 break; |
| 94 parent = ParentDomain(parent); |
| 95 } |
| 96 if (parent.empty()) { |
| 97 ret.push_back(*i); |
| 98 } else { |
| 99 if (verbose) |
| 100 fprintf(stderr, "Removing %s as redundant\n", i->c_str()); |
| 101 } |
| 102 } |
| 103 |
| 104 hosts->swap(ret); |
| 73 } | 105 } |
| 74 | 106 |
| 75 // Predicate which returns true when a hostname has a parent domain in the set | 107 // CheckLengths returns true iff every host is less than 256 bytes long (not |
| 76 // of hosts provided at construction time. | 108 // including the terminating NUL) and contains two or more labels. |
| 77 class ParentInSet : public std::unary_function<std::string, bool> { | 109 static bool CheckLengths(const std::vector<std::string>& hosts) { |
| 78 public: | 110 for (std::vector<std::string>::const_iterator |
| 79 explicit ParentInSet(const std::set<std::string>& hosts) : hosts_(hosts) {} | 111 i = hosts.begin(); i != hosts.end(); i++) { |
| 80 | |
| 81 bool operator()(const std::string& host) const { | |
| 82 for (std::string parent(ParentDomain(host)); !parent.empty(); | |
| 83 parent = ParentDomain(parent)) { | |
| 84 if (hosts_.count(parent)) { | |
| 85 VLOG(1) << "Removing " << host << " as redundant"; | |
| 86 return true; | |
| 87 } | |
| 88 } | |
| 89 return false; | |
| 90 } | |
| 91 | |
| 92 private: | |
| 93 const std::set<std::string>& hosts_; | |
| 94 }; | |
| 95 | |
| 96 // Removes any hosts which are subdomains of other hosts. E.g. | |
| 97 // "foo.example.com" would be removed if "example.com" were also included. | |
| 98 static void RemoveRedundantEntries(Hosts* hosts) { | |
| 99 std::set<std::string> hosts_set; | |
| 100 for (Hosts::const_iterator i(hosts->begin()); i != hosts->end(); ++i) | |
| 101 hosts_set.insert(*i); | |
| 102 hosts->erase(std::remove_if(hosts->begin(), hosts->end(), | |
| 103 ParentInSet(hosts_set)), hosts->end()); | |
| 104 VLOG(1) << "Have " << hosts->size() << " hosts after removing redundants"; | |
| 105 } | |
| 106 | |
| 107 // Returns true iff all |hosts| are less than 256 bytes long (not including the | |
| 108 // terminating NUL) and contain two or more dot-separated components. | |
| 109 static bool CheckLengths(const Hosts& hosts) { | |
| 110 for (Hosts::const_iterator i(hosts.begin()); i != hosts.end(); ++i) { | |
| 111 if (i->size() >= 256) { | 112 if (i->size() >= 256) { |
| 112 fprintf(stderr, "Entry '%s' is too large\n", i->c_str()); | 113 fprintf(stderr, "Entry %s is too large\n", i->c_str()); |
| 113 return false; | 114 return false; |
| 114 } | 115 } |
| 115 if (net::SSLFalseStartBlacklist::LastTwoComponents(*i).empty()) { | 116 if (SSLFalseStartBlacklist::LastTwoLabels(i->c_str()) == NULL) { |
| 116 fprintf(stderr, "Entry '%s' contains too few labels\n", i->c_str()); | 117 fprintf(stderr, "Entry %s contains too few labels\n", i->c_str()); |
| 117 return false; | 118 return false; |
| 118 } | 119 } |
| 119 } | 120 } |
| 120 | 121 |
| 121 return true; | 122 return true; |
| 122 } | 123 } |
| 123 | 124 |
| 124 // Returns the contents of the output file to be written. | 125 int main(int argc, char** argv) { |
| 125 std::string GenerateOutput(const Hosts& hosts) { | 126 if (argc != 3) |
| 126 // Hash each host into its appropriate bucket. | 127 return usage(argv[0]); |
| 127 VLOG(1) << "Using " << net::SSLFalseStartBlacklist::kBuckets | 128 |
| 128 << " entry hash table"; | 129 const char* input_file = argv[1]; |
| 129 Hosts buckets[net::SSLFalseStartBlacklist::kBuckets]; | 130 const char* output_file = argv[2]; |
| 130 for (Hosts::const_iterator i(hosts.begin()); i != hosts.end(); ++i) { | 131 FILE* input = fopen(input_file, "rb"); |
| 131 const uint32 hash = net::SSLFalseStartBlacklist::Hash( | 132 if (!input) { |
| 132 net::SSLFalseStartBlacklist::LastTwoComponents(*i)); | 133 perror("open"); |
| 133 buckets[hash & (net::SSLFalseStartBlacklist::kBuckets - 1)].push_back(*i); | 134 return usage(argv[0]); |
| 134 } | 135 } |
| 135 | 136 |
| 136 // Write header. | 137 if (fseek(input, 0, SEEK_END)) { |
| 137 std::ostringstream output; | 138 perror("fseek"); |
| 138 output << "// Copyright (c) 2011 The Chromium Authors. All rights reserved.\n" | 139 return 1; |
| 139 "// Use of this source code is governed by a BSD-style license that" | 140 } |
| 140 " can be\n// found in the LICENSE file.\n\n// WARNING: This code is" | |
| 141 " generated by ssl_false_start_blacklist_process.cc.\n// Do not " | |
| 142 "edit.\n\n#include \"net/base/ssl_false_start_blacklist.h\"\n\n" | |
| 143 "namespace net {\n\nconst uint32 " | |
| 144 "SSLFalseStartBlacklist::kHashTable[" | |
| 145 << net::SSLFalseStartBlacklist::kBuckets << " + 1] = {\n 0,\n"; | |
| 146 | 141 |
| 147 // Construct data table, writing out the size as each bucket is appended. | 142 const long input_size = ftell(input); |
| 143 if (input_size < 0) { |
| 144 perror("ftell"); |
| 145 return 1; |
| 146 } |
| 147 |
| 148 if (fseek(input, 0, SEEK_SET)) { |
| 149 perror("fseek"); |
| 150 return 1; |
| 151 } |
| 152 |
| 153 char* buffer = static_cast<char*>(malloc(input_size)); |
| 154 long done = 0; |
| 155 while (done < input_size) { |
| 156 size_t n = fread(buffer + done, 1, input_size - done, input); |
| 157 if (n == 0) { |
| 158 perror("fread"); |
| 159 free(buffer); |
| 160 fclose(input); |
| 161 return 1; |
| 162 } |
| 163 done += n; |
| 164 } |
| 165 fclose(input); |
| 166 |
| 167 std::vector<std::string> hosts; |
| 168 |
| 169 off_t line_start = 0; |
| 170 bool is_comment = false; |
| 171 bool non_whitespace_seen = false; |
| 172 for (long i = 0; i <= input_size; i++) { |
| 173 if (i == input_size || buffer[i] == '\n') { |
| 174 if (!is_comment && non_whitespace_seen) { |
| 175 long len = i - line_start; |
| 176 if (i > 0 && buffer[i-1] == '\r') |
| 177 len--; |
| 178 hosts.push_back(std::string(&buffer[line_start], len)); |
| 179 } |
| 180 is_comment = false; |
| 181 non_whitespace_seen = false; |
| 182 line_start = i + 1; |
| 183 continue; |
| 184 } |
| 185 |
| 186 if (i == line_start && buffer[i] == '#') |
| 187 is_comment = true; |
| 188 if (buffer[i] != ' ' && buffer[i] != '\t' && buffer[i] != '\r') |
| 189 non_whitespace_seen = true; |
| 190 } |
| 191 free(buffer); |
| 192 |
| 193 fprintf(stderr, "Have %d hosts after parse\n", (int) hosts.size()); |
| 194 StripWWWPrefix(&hosts); |
| 195 RemoveDuplicateEntries(&hosts); |
| 196 fprintf(stderr, "Have %d hosts after removing duplicates\n", (int) hosts.size(
)); |
| 197 RemoveRedundantEntries(&hosts); |
| 198 fprintf(stderr, "Have %d hosts after removing redundants\n", (int) hosts.size(
)); |
| 199 if (!CheckLengths(hosts)) { |
| 200 fprintf(stderr, "One or more entries is too large or too small\n"); |
| 201 return 2; |
| 202 } |
| 203 |
| 204 fprintf(stderr, "Using %d entry hash table\n", kBuckets); |
| 205 uint32 table[kBuckets]; |
| 206 std::vector<std::string> buckets[kBuckets]; |
| 207 |
| 208 for (std::vector<std::string>::const_iterator |
| 209 i = hosts.begin(); i != hosts.end(); i++) { |
| 210 const char* last_two_labels = |
| 211 SSLFalseStartBlacklist::LastTwoLabels(i->c_str()); |
| 212 const unsigned h = SSLFalseStartBlacklist::Hash(last_two_labels); |
| 213 buckets[h & (kBuckets - 1)].push_back(*i); |
| 214 } |
| 215 |
| 148 std::string table_data; | 216 std::string table_data; |
| 149 size_t max_bucket_size = 0; | 217 unsigned max_bucket_size = 0; |
| 150 for (size_t i = 0; i < net::SSLFalseStartBlacklist::kBuckets; i++) { | 218 for (unsigned i = 0; i < kBuckets; i++) { |
| 151 max_bucket_size = std::max(max_bucket_size, buckets[i].size()); | 219 if (buckets[i].size() > max_bucket_size) |
| 152 for (Hosts::const_iterator j(buckets[i].begin()); j != buckets[i].end(); | 220 max_bucket_size = buckets[i].size(); |
| 153 ++j) { | 221 |
| 154 table_data.push_back(static_cast<char>(j->size())); | 222 table[i] = table_data.size(); |
| 223 for (std::vector<std::string>::const_iterator |
| 224 j = buckets[i].begin(); j != buckets[i].end(); j++) { |
| 225 table_data.push_back((char) j->size()); |
| 155 table_data.append(*j); | 226 table_data.append(*j); |
| 156 } | 227 } |
| 157 output << " " << table_data.size() << ",\n"; | |
| 158 } | 228 } |
| 159 output << "};\n\n"; | |
| 160 VLOG(1) << "Largest bucket has " << max_bucket_size << " entries"; | |
| 161 | 229 |
| 162 // Write data table, breaking lines after 72+ (2 indent, 70+ data) characters. | 230 fprintf(stderr, "Largest bucket has %d entries\n", max_bucket_size); |
| 163 output << "const char SSLFalseStartBlacklist::kHashData[] = {\n"; | 231 |
| 164 for (size_t i = 0, line_length = 0; i < table_data.size(); i++) { | 232 FILE* out = fopen(output_file, "w+"); |
| 233 if (!out) { |
| 234 perror("opening output file"); |
| 235 return 4; |
| 236 } |
| 237 |
| 238 fprintf(out, "// Copyright (c) 2010 The Chromium Authors. All rights " |
| 239 "reserved.\n// Use of this source code is governed by a BSD-style " |
| 240 "license that can be\n// found in the LICENSE file.\n\n"); |
| 241 fprintf(out, "// WARNING: this code is generated by\n" |
| 242 "// ssl_false_start_blacklist_process.cc. Do not edit.\n\n"); |
| 243 fprintf(out, "#include \"base/basictypes.h\"\n\n"); |
| 244 fprintf(out, "#include \"net/base/ssl_false_start_blacklist.h\"\n\n"); |
| 245 fprintf(out, "namespace net {\n\n"); |
| 246 fprintf(out, "const uint32 SSLFalseStartBlacklist::kHashTable[%d + 1] = {\n", |
| 247 kBuckets); |
| 248 for (unsigned i = 0; i < kBuckets; i++) { |
| 249 fprintf(out, " %u,\n", (unsigned) table[i]); |
| 250 } |
| 251 fprintf(out, " %u,\n", (unsigned) table_data.size()); |
| 252 fprintf(out, "};\n\n"); |
| 253 |
| 254 fprintf(out, "const char SSLFalseStartBlacklist::kHashData[] = {\n"); |
| 255 for (unsigned i = 0, line_length = 0; i < table_data.size(); i++) { |
| 165 if (line_length == 0) | 256 if (line_length == 0) |
| 166 output << " "; | 257 fprintf(out, " "); |
| 167 std::ostringstream::pos_type current_length = output.tellp(); | 258 uint8 c = static_cast<uint8>(table_data[i]); |
| 168 output << static_cast<int>(table_data[i]) << ", "; | 259 line_length += fprintf(out, "%d, ", c); |
| 169 line_length += output.tellp() - current_length; | |
| 170 if (i == table_data.size() - 1) { | 260 if (i == table_data.size() - 1) { |
| 171 output << "\n};\n"; | 261 fprintf(out, "\n};\n"); |
| 172 } else if (line_length >= 70) { | 262 } else if (line_length >= 70) { |
| 173 output << "\n"; | 263 fprintf(out, "\n"); |
| 174 line_length = 0; | 264 line_length = 0; |
| 175 } | 265 } |
| 176 } | 266 } |
| 177 output << "\n} // namespace net\n"; | 267 fprintf(out, "\n} // namespace net\n"); |
| 178 return output.str(); | 268 fclose(out); |
| 269 |
| 270 return 0; |
| 179 } | 271 } |
| 180 | |
| 181 #if defined(OS_WIN) | |
| 182 int wmain(int argc, wchar_t* argv[], wchar_t* envp[]) { | |
| 183 #elif defined(OS_POSIX) | |
| 184 int main(int argc, char* argv[], char* envp[]) { | |
| 185 #endif | |
| 186 if (argc != 3) { | |
| 187 fprintf(stderr, "Usage: %s <blacklist file> <output .c file>\n", argv[0]); | |
| 188 return 1; | |
| 189 } | |
| 190 | |
| 191 // Read input file. | |
| 192 std::string input; | |
| 193 if (!file_util::ReadFileToString(FilePath(argv[1]), &input)) { | |
| 194 fprintf(stderr, "Failed to read input file '%s'\n", argv[1]); | |
| 195 return 2; | |
| 196 } | |
| 197 Hosts hosts(ParseHosts(input)); | |
| 198 | |
| 199 // Sanitize |hosts|. | |
| 200 std::transform(hosts.begin(), hosts.end(), hosts.begin(), | |
| 201 StripWWWAndTrailingDots); | |
| 202 RemoveDuplicateEntries(&hosts); | |
| 203 RemoveRedundantEntries(&hosts); | |
| 204 if (!CheckLengths(hosts)) | |
| 205 return 3; | |
| 206 | |
| 207 // Write output file. | |
| 208 const std::string output_str(GenerateOutput(hosts)); | |
| 209 if (file_util::WriteFile(FilePath(argv[2]), output_str.data(), | |
| 210 output_str.size()) == static_cast<int>(output_str.size())) | |
| 211 return 0; | |
| 212 fprintf(stderr, "Failed to write output file '%s'\n", argv[2]); | |
| 213 return 4; | |
| 214 } | |
| OLD | NEW |