| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // This command-line program converts an effective-TLD data file in UTF-8 from | 5 #include "net/tools/tld_cleanup/tld_cleanup_util.h" |
| 6 // the format provided by Mozilla to the format expected by Chrome. This | |
| 7 // program generates an intermediate file which is then used by gperf to | |
| 8 // generate a perfect hash map. The benefit of this approach is that no time is | |
| 9 // spent on program initialization to generate the map of this data. | |
| 10 // | |
| 11 // Running this program finds "effective_tld_names.dat" in the expected location | |
| 12 // in the source checkout and generates "effective_tld_names.gperf" next to it. | |
| 13 // | |
| 14 // Any errors or warnings from this program are recorded in tld_cleanup.log. | |
| 15 // | |
| 16 // In particular, it | |
| 17 // * Strips blank lines and comments, as well as notes for individual rules. | |
| 18 // * Strips a single leading and/or trailing dot from each rule, if present. | |
| 19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning | |
| 20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) | |
| 21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. | |
| 22 // * Canonicalizes each rule's domain by converting it to a GURL and back. | |
| 23 // * Adds explicit rules for true TLDs found in any rule. | |
| 24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. | |
| 25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" | |
| 26 // and "// ===END PRIVATE DOMAINS===". | |
| 27 | 6 |
| 28 #include <map> | |
| 29 #include <set> | |
| 30 #include <string> | |
| 31 | |
| 32 #include "base/at_exit.h" | |
| 33 #include "base/command_line.h" | |
| 34 #include "base/file_util.h" | 7 #include "base/file_util.h" |
| 35 #include "base/file_util.h" | |
| 36 #include "base/files/file_path.h" | |
| 37 #include "base/i18n/icu_util.h" | |
| 38 #include "base/logging.h" | 8 #include "base/logging.h" |
| 39 #include "base/path_service.h" | |
| 40 #include "base/process_util.h" | |
| 41 #include "base/string_util.h" | 9 #include "base/string_util.h" |
| 42 #include "googleurl/src/gurl.h" | 10 #include "googleurl/src/gurl.h" |
| 43 #include "googleurl/src/url_parse.h" | 11 #include "googleurl/src/url_parse.h" |
| 44 | 12 |
| 45 namespace { | 13 namespace { |
| 46 struct Rule { | |
| 47 bool exception; | |
| 48 bool wildcard; | |
| 49 }; | |
| 50 | |
| 51 typedef std::map<std::string, Rule> RuleMap; | |
| 52 typedef std::set<std::string> RuleSet; | |
| 53 | 14 |
| 54 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; | 15 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; |
| 55 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; | 16 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; |
| 56 } | 17 } |
| 57 | 18 |
| 19 namespace net { |
| 20 namespace tld_cleanup { |
| 21 |
| 58 // Writes the list of domain rules contained in the 'rules' set to the | 22 // Writes the list of domain rules contained in the 'rules' set to the |
| 59 // 'outfile', with each rule terminated by a LF. The file must already have | 23 // 'outfile', with each rule terminated by a LF. The file must already have |
| 60 // been created with write access. | 24 // been created with write access. |
| 61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | 25 bool WriteRules(const RuleMap& rules, base::FilePath* outfile) { |
| 62 std::string data; | 26 std::string data; |
| 63 data.append( | 27 data.append( |
| 64 "%{\n" | 28 "%{\n" |
| 65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" | 29 "// Copyright 2012 The Chromium Authors. All rights reserved.\n" |
| 66 "// Use of this source code is governed by a BSD-style license that can be\n" | 30 "// Use of this source code is governed by a BSD-style license that can be\n" |
| 67 "// found in the LICENSE file.\n\n" | 31 "// found in the LICENSE file.\n\n" |
| 68 "// This file is generated by net/tools/tld_cleanup/.\n" | 32 "// This file is generated by net/tools/tld_cleanup/.\n" |
| 69 "// DO NOT MANUALLY EDIT!\n" | 33 "// DO NOT MANUALLY EDIT!\n" |
| 70 "%}\n" | 34 "%}\n" |
| 71 "struct DomainRule {\n" | 35 "struct DomainRule {\n" |
| 72 " const char *name;\n" | 36 " const char *name;\n" |
| 73 " int type; // 1: exception, 2: wildcard\n" | 37 " int type; // 1: exception, 2: wildcard\n" |
| 38 " bool is_private;\n" |
| 74 "};\n" | 39 "};\n" |
| 75 "%%\n" | 40 "%%\n" |
| 76 ); | 41 ); |
| 77 | 42 |
| 78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { | 43 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { |
| 79 data.append(i->first); | 44 data.append(i->first); |
| 80 data.append(", "); | 45 data.append(", "); |
| 81 if (i->second.exception) { | 46 if (i->second.exception) { |
| 82 data.append("1"); | 47 data.append("1"); |
| 83 } else if (i->second.wildcard) { | 48 } else if (i->second.wildcard) { |
| 84 data.append("2"); | 49 data.append("2"); |
| 85 } else { | 50 } else { |
| 86 data.append("0"); | 51 data.append("0"); |
| 87 } | 52 } |
| 53 if (i->second.is_private) { |
| 54 data.append(", true"); |
| 55 } else { |
| 56 data.append(", false"); |
| 57 } |
| 88 data.append("\n"); | 58 data.append("\n"); |
| 89 } | 59 } |
| 90 | 60 |
| 91 data.append("%%\n"); | 61 data.append("%%\n"); |
| 92 | 62 |
| 93 int written = file_util::WriteFile(outfile, data.data(), data.size()); | 63 int written = file_util::WriteFile(*outfile, data.data(), data.size()); |
| 94 | 64 |
| 95 return written == static_cast<int>(data.size()); | 65 return written == static_cast<int>(data.size()); |
| 96 } | 66 } |
| 97 | 67 |
| 98 // These result codes should be in increasing order of severity. | |
| 99 typedef enum { | |
| 100 kSuccess, | |
| 101 kWarning, | |
| 102 kError, | |
| 103 } NormalizeResult; | |
| 104 | |
| 105 // Adjusts the rule to a standard form: removes single extraneous dots and | 68 // Adjusts the rule to a standard form: removes single extraneous dots and |
| 106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | 69 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
| 107 // valid; logs a warning and returns kWarning if it is probably invalid; and | 70 // valid; logs a warning and returns kWarning if it is probably invalid; and |
| 108 // logs an error and returns kError if the rule is (almost) certainly invalid. | 71 // logs an error and returns kError if the rule is (almost) certainly invalid. |
| 109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | 72 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { |
| 110 NormalizeResult result = kSuccess; | 73 NormalizeResult result = kSuccess; |
| 111 | 74 |
| 112 // Strip single leading and trailing dots. | 75 // Strip single leading and trailing dots. |
| 113 if (domain->at(0) == '.') | 76 if (domain->at(0) == '.') |
| 114 domain->erase(0, 1); | 77 domain->erase(0, 1); |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 156 } | 119 } |
| 157 if (!gurl.is_valid()) { | 120 if (!gurl.is_valid()) { |
| 158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | 121 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; |
| 159 result = kWarning; | 122 result = kWarning; |
| 160 } | 123 } |
| 161 domain->assign(spec.substr(host.begin, host.len)); | 124 domain->assign(spec.substr(host.begin, host.len)); |
| 162 | 125 |
| 163 return result; | 126 return result; |
| 164 } | 127 } |
| 165 | 128 |
| 166 // Loads the file described by 'in_filename', converts it to the desired format | 129 NormalizeResult NormalizeDataToRuleMap(const std::string data, |
| 167 // (see the file comments above), and saves it into 'out_filename'. Returns | 130 RuleMap* rules) { |
| 168 // the most severe of the result codes encountered when normalizing the rules. | 131 CHECK(rules); |
| 169 NormalizeResult NormalizeFile(const base::FilePath& in_filename, | |
| 170 const base::FilePath& out_filename) { | |
| 171 std::string data; | |
| 172 if (!file_util::ReadFileToString(in_filename, &data)) { | |
| 173 LOG(ERROR) << "Unable to read file"; | |
| 174 // We return success since we've already reported the error. | |
| 175 return kSuccess; | |
| 176 } | |
| 177 | |
| 178 // We do a lot of string assignment during parsing, but simplicity is more | 132 // We do a lot of string assignment during parsing, but simplicity is more |
| 179 // important than performance here. | 133 // important than performance here. |
| 180 std::string domain; | 134 std::string domain; |
| 181 NormalizeResult result = kSuccess; | 135 NormalizeResult result = kSuccess; |
| 182 size_t line_start = 0; | 136 size_t line_start = 0; |
| 183 size_t line_end = 0; | 137 size_t line_end = 0; |
| 184 RuleMap rules; | 138 bool is_private = false; |
| 185 RuleSet extra_rules; | 139 RuleMap extra_rules; |
| 186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | 140 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; |
| 141 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; |
| 187 while (line_start < data.size()) { | 142 while (line_start < data.size()) { |
| 188 // Skip the entire section of private domains. | |
| 189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed. | |
| 190 if (line_start + begin_private_length < data.size() && | 143 if (line_start + begin_private_length < data.size() && |
| 191 !data.compare(line_start, begin_private_length, | 144 !data.compare(line_start, begin_private_length, |
| 192 kBeginPrivateDomainsComment)) { | 145 kBeginPrivateDomainsComment)) { |
| 193 line_end = data.find(kEndPrivateDomainsComment, line_start); | 146 is_private = true; |
| 194 if (line_end == std::string::npos) { | 147 line_end = line_start + begin_private_length; |
| 195 LOG(WARNING) << "Private-domain section had no end marker."; | 148 } else if (line_start + end_private_length < data.size() && |
| 196 line_end = data.size(); | 149 !data.compare(line_start, end_private_length, |
| 197 } | 150 kEndPrivateDomainsComment)) { |
| 151 is_private = false; |
| 152 line_end = line_start + end_private_length; |
| 198 } else if (line_start + 1 < data.size() && | 153 } else if (line_start + 1 < data.size() && |
| 199 data[line_start] == '/' && | 154 data[line_start] == '/' && |
| 200 data[line_start + 1] == '/') { | 155 data[line_start + 1] == '/') { |
| 201 // Skip comments. | 156 // Skip comments. |
| 202 line_end = data.find_first_of("\r\n", line_start); | 157 line_end = data.find_first_of("\r\n", line_start); |
| 203 if (line_end == std::string::npos) | 158 if (line_end == std::string::npos) |
| 204 line_end = data.size(); | 159 line_end = data.size(); |
| 205 } else { | 160 } else { |
| 206 // Truncate at first whitespace. | 161 // Truncate at first whitespace. |
| 207 line_end = data.find_first_of("\r\n \t", line_start); | 162 line_end = data.find_first_of("\r\n \t", line_start); |
| 208 if (line_end == std::string::npos) | 163 if (line_end == std::string::npos) |
| 209 line_end = data.size(); | 164 line_end = data.size(); |
| 210 domain.assign(data.data(), line_start, line_end - line_start); | 165 domain.assign(data.data(), line_start, line_end - line_start); |
| 211 | 166 |
| 212 Rule rule; | 167 Rule rule; |
| 213 rule.wildcard = false; | 168 rule.wildcard = false; |
| 214 rule.exception = false; | 169 rule.exception = false; |
| 170 rule.is_private = is_private; |
| 215 NormalizeResult new_result = NormalizeRule(&domain, &rule); | 171 NormalizeResult new_result = NormalizeRule(&domain, &rule); |
| 216 if (new_result != kError) { | 172 if (new_result != kError) { |
| 217 // Check the existing rules to make sure we don't have an exception and | 173 // Check the existing rules to make sure we don't have an exception and |
| 218 // wildcard for the same rule. If we did, we'd have to update our | 174 // wildcard for the same rule, or that the same domain is listed as both |
| 175 // private and not private. If we did, we'd have to update our |
| 219 // parsing code to handle this case. | 176 // parsing code to handle this case. |
| 220 CHECK(rules.find(domain) == rules.end()); | 177 CHECK(rules->find(domain) == rules->end()); |
| 221 | 178 |
| 222 rules[domain] = rule; | 179 (*rules)[domain] = rule; |
| 223 // Add true TLD for multi-level rules. We don't add them right now, in | 180 // Add true TLD for multi-level rules. We don't add them right now, in |
| 224 // case there's an exception or wild card that either exists or might be | 181 // case there's an exception or wild card that either exists or might be |
| 225 // added in a later iteration. In those cases, there's no need to add | 182 // added in a later iteration. In those cases, there's no need to add |
| 226 // it and it would just slow down parsing the data. | 183 // it and it would just slow down parsing the data. |
| 227 size_t tld_start = domain.find_last_of('.'); | 184 size_t tld_start = domain.find_last_of('.'); |
| 228 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) | 185 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { |
| 229 extra_rules.insert(domain.substr(tld_start + 1)); | 186 std::string extra_rule_domain = domain.substr(tld_start + 1); |
| 187 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); |
| 188 Rule extra_rule; |
| 189 extra_rule.exception = false; |
| 190 extra_rule.wildcard = false; |
| 191 if (iter == extra_rules.end()) { |
| 192 extra_rule.is_private = is_private; |
| 193 } else { |
| 194 // A rule already exists, so we ensure that if any of the entries is |
| 195 // not private the result should be that the entry is not private. |
| 196 // An example is .au which is not listed as a real TLD, but only |
| 197 // lists second-level domains such as com.au. Subdomains of .au |
| 198 // (eg. blogspot.com.au) are also listed in the private section, |
| 199 // which is processed later, so this ensures that the real TLD |
| 200 // (eg. .au) is listed as public. |
| 201 extra_rule.is_private = is_private && iter->second.is_private; |
| 202 } |
| 203 extra_rules[extra_rule_domain] = extra_rule; |
| 204 } |
| 230 } | 205 } |
| 231 result = std::max(result, new_result); | 206 result = std::max(result, new_result); |
| 232 } | 207 } |
| 233 | 208 |
| 234 // Find beginning of next non-empty line. | 209 // Find beginning of next non-empty line. |
| 235 line_start = data.find_first_of("\r\n", line_end); | 210 line_start = data.find_first_of("\r\n", line_end); |
| 236 if (line_start == std::string::npos) | 211 if (line_start == std::string::npos) |
| 237 line_start = data.size(); | 212 line_start = data.size(); |
| 238 line_start = data.find_first_not_of("\r\n", line_start); | 213 line_start = data.find_first_not_of("\r\n", line_start); |
| 239 if (line_start == std::string::npos) | 214 if (line_start == std::string::npos) |
| 240 line_start = data.size(); | 215 line_start = data.size(); |
| 241 } | 216 } |
| 242 | 217 |
| 243 for (RuleSet::const_iterator iter = extra_rules.begin(); | 218 for (RuleMap::const_iterator iter = extra_rules.begin(); |
| 244 iter != extra_rules.end(); | 219 iter != extra_rules.end(); |
| 245 ++iter) { | 220 ++iter) { |
| 246 if (rules.find(*iter) == rules.end()) { | 221 if (rules->find(iter->first) == rules->end()) { |
| 247 Rule rule; | 222 (*rules)[iter->first] = iter->second; |
| 248 rule.exception = false; | |
| 249 rule.wildcard = false; | |
| 250 rules[*iter] = rule; | |
| 251 } | 223 } |
| 252 } | 224 } |
| 253 | 225 |
| 226 return result; |
| 227 } |
| 228 |
| 229 NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
| 230 base::FilePath* out_filename) { |
| 231 DCHECK(out_filename); |
| 232 RuleMap rules; |
| 233 std::string data; |
| 234 if (!file_util::ReadFileToString(in_filename, &data)) { |
| 235 LOG(ERROR) << "Unable to read file"; |
| 236 // We return success since we've already reported the error. |
| 237 return kSuccess; |
| 238 } |
| 239 |
| 240 NormalizeResult result = NormalizeDataToRuleMap(data, &rules); |
| 241 |
| 254 if (!WriteRules(rules, out_filename)) { | 242 if (!WriteRules(rules, out_filename)) { |
| 255 LOG(ERROR) << "Error(s) writing output file"; | 243 LOG(ERROR) << "Error(s) writing output file"; |
| 256 result = kError; | 244 result = kError; |
| 257 } | 245 } |
| 258 | 246 |
| 259 return result; | 247 return result; |
| 260 } | 248 } |
| 261 | 249 |
| 262 int main(int argc, const char* argv[]) { | |
| 263 base::EnableTerminationOnHeapCorruption(); | |
| 264 if (argc != 1) { | |
| 265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); | |
| 266 fprintf(stderr, "Usage: %s\n", argv[0]); | |
| 267 return 1; | |
| 268 } | |
| 269 | 250 |
| 270 // Manages the destruction of singletons. | 251 } // namespace tld_cleanup |
| 271 base::AtExitManager exit_manager; | 252 } // namespace net |
| 272 | |
| 273 // Only use OutputDebugString in debug mode. | |
| 274 #ifdef NDEBUG | |
| 275 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE; | |
| 276 #else | |
| 277 logging::LoggingDestination destination = | |
| 278 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG; | |
| 279 #endif | |
| 280 | |
| 281 CommandLine::Init(argc, argv); | |
| 282 | |
| 283 base::FilePath log_filename; | |
| 284 PathService::Get(base::DIR_EXE, &log_filename); | |
| 285 log_filename = log_filename.AppendASCII("tld_cleanup.log"); | |
| 286 logging::InitLogging( | |
| 287 log_filename.value().c_str(), | |
| 288 destination, | |
| 289 logging::LOCK_LOG_FILE, | |
| 290 logging::DELETE_OLD_LOG_FILE, | |
| 291 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS); | |
| 292 | |
| 293 icu_util::Initialize(); | |
| 294 | |
| 295 base::FilePath input_file; | |
| 296 PathService::Get(base::DIR_SOURCE_ROOT, &input_file); | |
| 297 input_file = input_file.Append(FILE_PATH_LITERAL("net")) | |
| 298 .Append(FILE_PATH_LITERAL("base")) | |
| 299 .Append(FILE_PATH_LITERAL( | |
| 300 "registry_controlled_domains")) | |
| 301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); | |
| 302 base::FilePath output_file; | |
| 303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); | |
| 304 output_file = output_file.Append(FILE_PATH_LITERAL("net")) | |
| 305 .Append(FILE_PATH_LITERAL("base")) | |
| 306 .Append(FILE_PATH_LITERAL( | |
| 307 "registry_controlled_domains")) | |
| 308 .Append(FILE_PATH_LITERAL( | |
| 309 "effective_tld_names.gperf")); | |
| 310 NormalizeResult result = NormalizeFile(input_file, output_file); | |
| 311 if (result != kSuccess) { | |
| 312 fprintf(stderr, | |
| 313 "Errors or warnings processing file. See log in tld_cleanup.log."); | |
| 314 } | |
| 315 | |
| 316 if (result == kError) | |
| 317 return 1; | |
| 318 return 0; | |
| 319 } | |
| OLD | NEW |