| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // This command-line program converts an effective-TLD data file in UTF-8 from | 5 // This command-line program converts an effective-TLD data file in UTF-8 from |
| 6 // the format provided by Mozilla to the format expected by Chrome. This | 6 // the format provided by Mozilla to the format expected by Chrome. This |
| 7 // program generates an intermediate file which is then used by gperf to | 7 // program generates an intermediate file which is then used by gperf to |
| 8 // generate a perfect hash map. The benefit of this approach is that no time is | 8 // generate a perfect hash map. The benefit of this approach is that no time is |
| 9 // spent on program initialization to generate the map of this data. | 9 // spent on program initialization to generate the map of this data. |
| 10 // | 10 // |
| 11 // Running this program finds "effective_tld_names.dat" in the expected location | 11 // Running this program finds "effective_tld_names.dat" in the expected location |
| 12 // in the source checkout and generates "effective_tld_names.gperf" next to it. | 12 // in the source checkout and generates "effective_tld_names.gperf" next to it. |
| 13 // | 13 // |
| 14 // Any errors or warnings from this program are recorded in tld_cleanup.log. | 14 // Any errors or warnings from this program are recorded in tld_cleanup.log. |
| 15 // | 15 // |
| 16 // In particular, it | 16 // In particular, it |
| 17 // * Strips blank lines and comments, as well as notes for individual rules. | 17 // * Strips blank lines and comments, as well as notes for individual rules. |
| 18 // * Strips a single leading and/or trailing dot from each rule, if present. | 18 // * Strips a single leading and/or trailing dot from each rule, if present. |
| 19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning | 19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning |
| 20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) | 20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) |
| 21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. | 21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. |
| 22 // * Canonicalizes each rule's domain by converting it to a GURL and back. | 22 // * Canonicalizes each rule's domain by converting it to a GURL and back. |
| 23 // * Adds explicit rules for true TLDs found in any rule. | 23 // * Adds explicit rules for true TLDs found in any rule. |
| 24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. | 24 // * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS===" |
| 25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" | 25 // and "// ===END PRIVATE DOMAINS===" as private. |
| 26 // and "// ===END PRIVATE DOMAINS===". | |
| 27 | |
| 28 #include <map> | |
| 29 #include <set> | |
| 30 #include <string> | |
| 31 | 26 |
| 32 #include "base/at_exit.h" | 27 #include "base/at_exit.h" |
| 33 #include "base/command_line.h" | 28 #include "base/command_line.h" |
| 34 #include "base/file_util.h" | 29 #include "base/file_util.h" |
| 35 #include "base/file_util.h" | |
| 36 #include "base/files/file_path.h" | 30 #include "base/files/file_path.h" |
| 37 #include "base/i18n/icu_util.h" | 31 #include "base/i18n/icu_util.h" |
| 38 #include "base/logging.h" | 32 #include "base/logging.h" |
| 39 #include "base/path_service.h" | 33 #include "base/path_service.h" |
| 40 #include "base/process_util.h" | 34 #include "base/process_util.h" |
| 41 #include "base/string_util.h" | 35 #include "net/tools/tld_cleanup/tld_cleanup_util.h" |
| 42 #include "googleurl/src/gurl.h" | |
| 43 #include "googleurl/src/url_parse.h" | |
| 44 | |
namespace {

// Flags attached to a single domain rule. Both flags start out false so that
// a freshly created Rule describes a plain rule (neither exception nor
// wildcard) without the creator having to clear them by hand.
struct Rule {
  Rule() : exception(false), wildcard(false) {}

  // Set when the rule was written with a leading '!'.
  bool exception;
  // Set when the rule was written with a leading '*.'.
  bool wildcard;
};

// Rules keyed by domain text.
typedef std::map<std::string, Rule> RuleMap;
// A set of bare domain strings.
typedef std::set<std::string> RuleSet;

// Comment lines that open and close the private-domain section of the input
// data file.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

}  // namespace
| 57 | |
| 58 // Writes the list of domain rules contained in the 'rules' set to the | |
| 59 // 'outfile', with each rule terminated by a LF. The file must already have | |
| 60 // been created with write access. | |
| 61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | |
| 62 std::string data; | |
| 63 data.append( | |
| 64 "%{\n" | |
| 65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" | |
| 66 "// Use of this source code is governed by a BSD-style license that can be\n" | |
| 67 "// found in the LICENSE file.\n\n" | |
| 68 "// This file is generated by net/tools/tld_cleanup/.\n" | |
| 69 "// DO NOT MANUALLY EDIT!\n" | |
| 70 "%}\n" | |
| 71 "struct DomainRule {\n" | |
| 72 " const char *name;\n" | |
| 73 " int type; // 1: exception, 2: wildcard\n" | |
| 74 "};\n" | |
| 75 "%%\n" | |
| 76 ); | |
| 77 | |
| 78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { | |
| 79 data.append(i->first); | |
| 80 data.append(", "); | |
| 81 if (i->second.exception) { | |
| 82 data.append("1"); | |
| 83 } else if (i->second.wildcard) { | |
| 84 data.append("2"); | |
| 85 } else { | |
| 86 data.append("0"); | |
| 87 } | |
| 88 data.append("\n"); | |
| 89 } | |
| 90 | |
| 91 data.append("%%\n"); | |
| 92 | |
| 93 int written = file_util::WriteFile(outfile, data.data(), data.size()); | |
| 94 | |
| 95 return written == static_cast<int>(data.size()); | |
| 96 } | |
| 97 | |
// Outcome of normalizing a rule or a whole file. The numeric values must stay
// in increasing order of severity, because results are compared and combined
// by taking the larger one.
enum NormalizeResult {
  kSuccess = 0,
  kWarning = 1,
  kError = 2
};
| 104 | |
| 105 // Adjusts the rule to a standard form: removes single extraneous dots and | |
| 106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | |
| 107 // valid; logs a warning and returns kWarning if it is probably invalid; and | |
| 108 // logs an error and returns kError if the rule is (almost) certainly invalid. | |
| 109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | |
| 110 NormalizeResult result = kSuccess; | |
| 111 | |
| 112 // Strip single leading and trailing dots. | |
| 113 if (domain->at(0) == '.') | |
| 114 domain->erase(0, 1); | |
| 115 if (domain->empty()) { | |
| 116 LOG(WARNING) << "Ignoring empty rule"; | |
| 117 return kWarning; | |
| 118 } | |
| 119 if (domain->at(domain->size() - 1) == '.') | |
| 120 domain->erase(domain->size() - 1, 1); | |
| 121 if (domain->empty()) { | |
| 122 LOG(WARNING) << "Ignoring empty rule"; | |
| 123 return kWarning; | |
| 124 } | |
| 125 | |
| 126 // Allow single leading '*.' or '!', saved here so it's not canonicalized. | |
| 127 size_t start_offset = 0; | |
| 128 if (domain->at(0) == '!') { | |
| 129 domain->erase(0, 1); | |
| 130 rule->exception = true; | |
| 131 } else if (domain->find("*.") == 0) { | |
| 132 domain->erase(0, 2); | |
| 133 rule->wildcard = true; | |
| 134 } | |
| 135 if (domain->empty()) { | |
| 136 LOG(WARNING) << "Ignoring empty rule"; | |
| 137 return kWarning; | |
| 138 } | |
| 139 | |
| 140 // Warn about additional '*.' or '!'. | |
| 141 if (domain->find("*.", start_offset) != std::string::npos || | |
| 142 domain->find('!', start_offset) != std::string::npos) { | |
| 143 LOG(WARNING) << "Keeping probably invalid rule: " << *domain; | |
| 144 result = kWarning; | |
| 145 } | |
| 146 | |
| 147 // Make a GURL and normalize it, then get the host back out. | |
| 148 std::string url = "http://"; | |
| 149 url.append(*domain); | |
| 150 GURL gurl(url); | |
| 151 const std::string& spec = gurl.possibly_invalid_spec(); | |
| 152 url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; | |
| 153 if (host.len < 0) { | |
| 154 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; | |
| 155 return kError; | |
| 156 } | |
| 157 if (!gurl.is_valid()) { | |
| 158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | |
| 159 result = kWarning; | |
| 160 } | |
| 161 domain->assign(spec.substr(host.begin, host.len)); | |
| 162 | |
| 163 return result; | |
| 164 } | |
| 165 | |
| 166 // Loads the file described by 'in_filename', converts it to the desired format | |
| 167 // (see the file comments above), and saves it into 'out_filename'. Returns | |
| 168 // the most severe of the result codes encountered when normalizing the rules. | |
| 169 NormalizeResult NormalizeFile(const base::FilePath& in_filename, | |
| 170 const base::FilePath& out_filename) { | |
| 171 std::string data; | |
| 172 if (!file_util::ReadFileToString(in_filename, &data)) { | |
| 173 LOG(ERROR) << "Unable to read file"; | |
| 174 // We return success since we've already reported the error. | |
| 175 return kSuccess; | |
| 176 } | |
| 177 | |
| 178 // We do a lot of string assignment during parsing, but simplicity is more | |
| 179 // important than performance here. | |
| 180 std::string domain; | |
| 181 NormalizeResult result = kSuccess; | |
| 182 size_t line_start = 0; | |
| 183 size_t line_end = 0; | |
| 184 RuleMap rules; | |
| 185 RuleSet extra_rules; | |
| 186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | |
| 187 while (line_start < data.size()) { | |
| 188 // Skip the entire section of private domains. | |
| 189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed. | |
| 190 if (line_start + begin_private_length < data.size() && | |
| 191 !data.compare(line_start, begin_private_length, | |
| 192 kBeginPrivateDomainsComment)) { | |
| 193 line_end = data.find(kEndPrivateDomainsComment, line_start); | |
| 194 if (line_end == std::string::npos) { | |
| 195 LOG(WARNING) << "Private-domain section had no end marker."; | |
| 196 line_end = data.size(); | |
| 197 } | |
| 198 } else if (line_start + 1 < data.size() && | |
| 199 data[line_start] == '/' && | |
| 200 data[line_start + 1] == '/') { | |
| 201 // Skip comments. | |
| 202 line_end = data.find_first_of("\r\n", line_start); | |
| 203 if (line_end == std::string::npos) | |
| 204 line_end = data.size(); | |
| 205 } else { | |
| 206 // Truncate at first whitespace. | |
| 207 line_end = data.find_first_of("\r\n \t", line_start); | |
| 208 if (line_end == std::string::npos) | |
| 209 line_end = data.size(); | |
| 210 domain.assign(data.data(), line_start, line_end - line_start); | |
| 211 | |
| 212 Rule rule; | |
| 213 rule.wildcard = false; | |
| 214 rule.exception = false; | |
| 215 NormalizeResult new_result = NormalizeRule(&domain, &rule); | |
| 216 if (new_result != kError) { | |
| 217 // Check the existing rules to make sure we don't have an exception and | |
| 218 // wildcard for the same rule. If we did, we'd have to update our | |
| 219 // parsing code to handle this case. | |
| 220 CHECK(rules.find(domain) == rules.end()); | |
| 221 | |
| 222 rules[domain] = rule; | |
| 223 // Add true TLD for multi-level rules. We don't add them right now, in | |
| 224 // case there's an exception or wild card that either exists or might be | |
| 225 // added in a later iteration. In those cases, there's no need to add | |
| 226 // it and it would just slow down parsing the data. | |
| 227 size_t tld_start = domain.find_last_of('.'); | |
| 228 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) | |
| 229 extra_rules.insert(domain.substr(tld_start + 1)); | |
| 230 } | |
| 231 result = std::max(result, new_result); | |
| 232 } | |
| 233 | |
| 234 // Find beginning of next non-empty line. | |
| 235 line_start = data.find_first_of("\r\n", line_end); | |
| 236 if (line_start == std::string::npos) | |
| 237 line_start = data.size(); | |
| 238 line_start = data.find_first_not_of("\r\n", line_start); | |
| 239 if (line_start == std::string::npos) | |
| 240 line_start = data.size(); | |
| 241 } | |
| 242 | |
| 243 for (RuleSet::const_iterator iter = extra_rules.begin(); | |
| 244 iter != extra_rules.end(); | |
| 245 ++iter) { | |
| 246 if (rules.find(*iter) == rules.end()) { | |
| 247 Rule rule; | |
| 248 rule.exception = false; | |
| 249 rule.wildcard = false; | |
| 250 rules[*iter] = rule; | |
| 251 } | |
| 252 } | |
| 253 | |
| 254 if (!WriteRules(rules, out_filename)) { | |
| 255 LOG(ERROR) << "Error(s) writing output file"; | |
| 256 result = kError; | |
| 257 } | |
| 258 | |
| 259 return result; | |
| 260 } | |
| 261 | 36 |
| 262 int main(int argc, const char* argv[]) { | 37 int main(int argc, const char* argv[]) { |
| 263 base::EnableTerminationOnHeapCorruption(); | 38 base::EnableTerminationOnHeapCorruption(); |
| 264 if (argc != 1) { | 39 if (argc != 1) { |
| 265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); | 40 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); |
| 266 fprintf(stderr, "Usage: %s\n", argv[0]); | 41 fprintf(stderr, "Usage: %s\n", argv[0]); |
| 267 return 1; | 42 return 1; |
| 268 } | 43 } |
| 269 | 44 |
| 270 // Manages the destruction of singletons. | 45 // Manages the destruction of singletons. |
| (...skipping 29 matching lines...) Expand all Loading... |
| 300 "registry_controlled_domains")) | 75 "registry_controlled_domains")) |
| 301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); | 76 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); |
| 302 base::FilePath output_file; | 77 base::FilePath output_file; |
| 303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); | 78 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); |
| 304 output_file = output_file.Append(FILE_PATH_LITERAL("net")) | 79 output_file = output_file.Append(FILE_PATH_LITERAL("net")) |
| 305 .Append(FILE_PATH_LITERAL("base")) | 80 .Append(FILE_PATH_LITERAL("base")) |
| 306 .Append(FILE_PATH_LITERAL( | 81 .Append(FILE_PATH_LITERAL( |
| 307 "registry_controlled_domains")) | 82 "registry_controlled_domains")) |
| 308 .Append(FILE_PATH_LITERAL( | 83 .Append(FILE_PATH_LITERAL( |
| 309 "effective_tld_names.gperf")); | 84 "effective_tld_names.gperf")); |
| 310 NormalizeResult result = NormalizeFile(input_file, output_file); | 85 net::tld_cleanup::NormalizeResult result = |
| 311 if (result != kSuccess) { | 86 net::tld_cleanup::NormalizeFile(input_file, output_file); |
| 87 if (result != net::tld_cleanup::kSuccess) { |
| 312 fprintf(stderr, | 88 fprintf(stderr, |
| 313 "Errors or warnings processing file. See log in tld_cleanup.log."); | 89 "Errors or warnings processing file. See log in tld_cleanup.log."); |
| 314 } | 90 } |
| 315 | 91 |
| 316 if (result == kError) | 92 if (result == net::tld_cleanup::kError) |
| 317 return 1; | 93 return 1; |
| 318 return 0; | 94 return 0; |
| 319 } | 95 } |
| OLD | NEW |