| OLD | NEW |
| (Empty) |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "net/tools/tld_cleanup/tld_cleanup_util.h" | |
| 6 | |
| 7 #include "base/files/file_util.h" | |
| 8 #include "base/logging.h" | |
| 9 #include "base/strings/string_number_conversions.h" | |
| 10 #include "base/strings/string_util.h" | |
| 11 #include "url/gurl.h" | |
| 12 #include "url/url_parse.h" | |
| 13 | |
namespace {

// Marker comments in the input data that open and close the section whose
// rules are flagged as private.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Values written to the 'type' column of the generated file. Exception and
// wildcard are mutually exclusive; the private value is added on top.
const int kExceptionRule = 1 << 0;
const int kWildcardRule = 1 << 1;
const int kPrivateRule = 1 << 2;

}  // namespace
| 23 | |
| 24 namespace net { | |
| 25 namespace tld_cleanup { | |
| 26 | |
| 27 // Writes the list of domain rules contained in the 'rules' set to the | |
| 28 // 'outfile', with each rule terminated by a LF. The file must already have | |
| 29 // been created with write access. | |
| 30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | |
| 31 std::string data; | |
| 32 data.append("%{\n" | |
| 33 "// Copyright 2012 The Chromium Authors. All rights reserved.\n" | |
| 34 "// Use of this source code is governed by a BSD-style license " | |
| 35 "that can be\n" | |
| 36 "// found in the LICENSE file.\n\n" | |
| 37 "// This file is generated by net/tools/tld_cleanup/.\n" | |
| 38 "// DO NOT MANUALLY EDIT!\n" | |
| 39 "%}\n" | |
| 40 "struct DomainRule {\n" | |
| 41 " int name_offset;\n" | |
| 42 " int type; // flags: 1: exception, 2: wildcard, 4: private\n" | |
| 43 "};\n" | |
| 44 "%%\n"); | |
| 45 | |
| 46 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { | |
| 47 data.append(i->first); | |
| 48 data.append(", "); | |
| 49 int type = 0; | |
| 50 if (i->second.exception) { | |
| 51 type = kExceptionRule; | |
| 52 } else if (i->second.wildcard) { | |
| 53 type = kWildcardRule; | |
| 54 } | |
| 55 if (i->second.is_private) { | |
| 56 type += kPrivateRule; | |
| 57 } | |
| 58 data.append(base::IntToString(type)); | |
| 59 data.append("\n"); | |
| 60 } | |
| 61 | |
| 62 data.append("%%\n"); | |
| 63 | |
| 64 int written = base::WriteFile(outfile, | |
| 65 data.data(), | |
| 66 static_cast<int>(data.size())); | |
| 67 | |
| 68 return written == static_cast<int>(data.size()); | |
| 69 } | |
| 70 | |
| 71 // Adjusts the rule to a standard form: removes single extraneous dots and | |
| 72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | |
| 73 // valid; logs a warning and returns kWarning if it is probably invalid; and | |
| 74 // logs an error and returns kError if the rule is (almost) certainly invalid. | |
| 75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | |
| 76 NormalizeResult result = kSuccess; | |
| 77 | |
| 78 // Strip single leading and trailing dots. | |
| 79 if (domain->at(0) == '.') | |
| 80 domain->erase(0, 1); | |
| 81 if (domain->empty()) { | |
| 82 LOG(WARNING) << "Ignoring empty rule"; | |
| 83 return kWarning; | |
| 84 } | |
| 85 if (domain->at(domain->size() - 1) == '.') | |
| 86 domain->erase(domain->size() - 1, 1); | |
| 87 if (domain->empty()) { | |
| 88 LOG(WARNING) << "Ignoring empty rule"; | |
| 89 return kWarning; | |
| 90 } | |
| 91 | |
| 92 // Allow single leading '*.' or '!', saved here so it's not canonicalized. | |
| 93 size_t start_offset = 0; | |
| 94 if (domain->at(0) == '!') { | |
| 95 domain->erase(0, 1); | |
| 96 rule->exception = true; | |
| 97 } else if (domain->find("*.") == 0) { | |
| 98 domain->erase(0, 2); | |
| 99 rule->wildcard = true; | |
| 100 } | |
| 101 if (domain->empty()) { | |
| 102 LOG(WARNING) << "Ignoring empty rule"; | |
| 103 return kWarning; | |
| 104 } | |
| 105 | |
| 106 // Warn about additional '*.' or '!'. | |
| 107 if (domain->find("*.", start_offset) != std::string::npos || | |
| 108 domain->find('!', start_offset) != std::string::npos) { | |
| 109 LOG(WARNING) << "Keeping probably invalid rule: " << *domain; | |
| 110 result = kWarning; | |
| 111 } | |
| 112 | |
| 113 // Make a GURL and normalize it, then get the host back out. | |
| 114 std::string url = "http://"; | |
| 115 url.append(*domain); | |
| 116 GURL gurl(url); | |
| 117 const std::string& spec = gurl.possibly_invalid_spec(); | |
| 118 url::Component host = gurl.parsed_for_possibly_invalid_spec().host; | |
| 119 if (host.len < 0) { | |
| 120 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; | |
| 121 return kError; | |
| 122 } | |
| 123 if (!gurl.is_valid()) { | |
| 124 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | |
| 125 result = kWarning; | |
| 126 } | |
| 127 domain->assign(spec.substr(host.begin, host.len)); | |
| 128 | |
| 129 return result; | |
| 130 } | |
| 131 | |
| 132 NormalizeResult NormalizeDataToRuleMap(const std::string data, | |
| 133 RuleMap* rules) { | |
| 134 CHECK(rules); | |
| 135 // We do a lot of string assignment during parsing, but simplicity is more | |
| 136 // important than performance here. | |
| 137 std::string domain; | |
| 138 NormalizeResult result = kSuccess; | |
| 139 size_t line_start = 0; | |
| 140 size_t line_end = 0; | |
| 141 bool is_private = false; | |
| 142 RuleMap extra_rules; | |
| 143 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | |
| 144 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; | |
| 145 while (line_start < data.size()) { | |
| 146 if (line_start + begin_private_length < data.size() && | |
| 147 !data.compare(line_start, begin_private_length, | |
| 148 kBeginPrivateDomainsComment)) { | |
| 149 is_private = true; | |
| 150 line_end = line_start + begin_private_length; | |
| 151 } else if (line_start + end_private_length < data.size() && | |
| 152 !data.compare(line_start, end_private_length, | |
| 153 kEndPrivateDomainsComment)) { | |
| 154 is_private = false; | |
| 155 line_end = line_start + end_private_length; | |
| 156 } else if (line_start + 1 < data.size() && | |
| 157 data[line_start] == '/' && | |
| 158 data[line_start + 1] == '/') { | |
| 159 // Skip comments. | |
| 160 line_end = data.find_first_of("\r\n", line_start); | |
| 161 if (line_end == std::string::npos) | |
| 162 line_end = data.size(); | |
| 163 } else { | |
| 164 // Truncate at first whitespace. | |
| 165 line_end = data.find_first_of("\r\n \t", line_start); | |
| 166 if (line_end == std::string::npos) | |
| 167 line_end = data.size(); | |
| 168 domain.assign(data.data(), line_start, line_end - line_start); | |
| 169 | |
| 170 Rule rule; | |
| 171 rule.wildcard = false; | |
| 172 rule.exception = false; | |
| 173 rule.is_private = is_private; | |
| 174 NormalizeResult new_result = NormalizeRule(&domain, &rule); | |
| 175 if (new_result != kError) { | |
| 176 // Check the existing rules to make sure we don't have an exception and | |
| 177 // wildcard for the same rule, or that the same domain is listed as both | |
| 178 // private and not private. If we did, we'd have to update our | |
| 179 // parsing code to handle this case. | |
| 180 CHECK(rules->find(domain) == rules->end()) | |
| 181 << "Duplicate rule found for " << domain; | |
| 182 | |
| 183 (*rules)[domain] = rule; | |
| 184 // Add true TLD for multi-level rules. We don't add them right now, in | |
| 185 // case there's an exception or wild card that either exists or might be | |
| 186 // added in a later iteration. In those cases, there's no need to add | |
| 187 // it and it would just slow down parsing the data. | |
| 188 size_t tld_start = domain.find_last_of('.'); | |
| 189 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { | |
| 190 std::string extra_rule_domain = domain.substr(tld_start + 1); | |
| 191 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); | |
| 192 Rule extra_rule; | |
| 193 extra_rule.exception = false; | |
| 194 extra_rule.wildcard = false; | |
| 195 if (iter == extra_rules.end()) { | |
| 196 extra_rule.is_private = is_private; | |
| 197 } else { | |
| 198 // A rule already exists, so we ensure that if any of the entries is | |
| 199 // not private the result should be that the entry is not private. | |
| 200 // An example is .au which is not listed as a real TLD, but only | |
| 201 // lists second-level domains such as com.au. Subdomains of .au | |
| 202 // (eg. blogspot.com.au) are also listed in the private section, | |
| 203 // which is processed later, so this ensures that the real TLD | |
| 204 // (eg. .au) is listed as public. | |
| 205 extra_rule.is_private = is_private && iter->second.is_private; | |
| 206 } | |
| 207 extra_rules[extra_rule_domain] = extra_rule; | |
| 208 } | |
| 209 } | |
| 210 result = std::max(result, new_result); | |
| 211 } | |
| 212 | |
| 213 // Find beginning of next non-empty line. | |
| 214 line_start = data.find_first_of("\r\n", line_end); | |
| 215 if (line_start == std::string::npos) | |
| 216 line_start = data.size(); | |
| 217 line_start = data.find_first_not_of("\r\n", line_start); | |
| 218 if (line_start == std::string::npos) | |
| 219 line_start = data.size(); | |
| 220 } | |
| 221 | |
| 222 for (RuleMap::const_iterator iter = extra_rules.begin(); | |
| 223 iter != extra_rules.end(); | |
| 224 ++iter) { | |
| 225 if (rules->find(iter->first) == rules->end()) { | |
| 226 (*rules)[iter->first] = iter->second; | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 return result; | |
| 231 } | |
| 232 | |
| 233 NormalizeResult NormalizeFile(const base::FilePath& in_filename, | |
| 234 const base::FilePath& out_filename) { | |
| 235 RuleMap rules; | |
| 236 std::string data; | |
| 237 if (!base::ReadFileToString(in_filename, &data)) { | |
| 238 LOG(ERROR) << "Unable to read file"; | |
| 239 // We return success since we've already reported the error. | |
| 240 return kSuccess; | |
| 241 } | |
| 242 | |
| 243 NormalizeResult result = NormalizeDataToRuleMap(data, &rules); | |
| 244 | |
| 245 if (!WriteRules(rules, out_filename)) { | |
| 246 LOG(ERROR) << "Error(s) writing output file"; | |
| 247 result = kError; | |
| 248 } | |
| 249 | |
| 250 return result; | |
| 251 } | |
| 252 | |
| 253 | |
| 254 } // namespace tld_cleanup | |
| 255 } // namespace net | |
| OLD | NEW |