OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // This command-line program converts an effective-TLD data file in UTF-8 from | 5 #include "net/tools/tld_cleanup/tld_cleanup_util.h" |
6 // the format provided by Mozilla to the format expected by Chrome. This | |
7 // program generates an intermediate file which is then used by gperf to | |
8 // generate a perfect hash map. The benefit of this approach is that no time is | |
9 // spent on program initialization to generate the map of this data. | |
10 // | |
11 // Running this program finds "effective_tld_names.dat" in the expected location | |
12 // in the source checkout and generates "effective_tld_names.gperf" next to it. | |
13 // | |
14 // Any errors or warnings from this program are recorded in tld_cleanup.log. | |
15 // | |
16 // In particular, it | |
17 // * Strips blank lines and comments, as well as notes for individual rules. | |
18 // * Strips a single leading and/or trailing dot from each rule, if present. | |
19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning | |
20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) | |
21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. | |
22 // * Canonicalizes each rule's domain by converting it to a GURL and back. | |
23 // * Adds explicit rules for true TLDs found in any rule. | |
24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. | |
25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" | |
26 // and "// ===END PRIVATE DOMAINS===". | |
27 | 6 |
28 #include <map> | |
29 #include <set> | |
30 #include <string> | |
31 | |
32 #include "base/at_exit.h" | |
33 #include "base/command_line.h" | |
34 #include "base/file_util.h" | 7 #include "base/file_util.h" |
35 #include "base/file_util.h" | |
36 #include "base/files/file_path.h" | |
37 #include "base/i18n/icu_util.h" | |
38 #include "base/logging.h" | 8 #include "base/logging.h" |
39 #include "base/path_service.h" | |
40 #include "base/process_util.h" | |
41 #include "base/string_util.h" | 9 #include "base/string_util.h" |
42 #include "googleurl/src/gurl.h" | 10 #include "googleurl/src/gurl.h" |
43 #include "googleurl/src/url_parse.h" | 11 #include "googleurl/src/url_parse.h" |
44 | 12 |
namespace {

// Comment lines in the input data that mark where the private-domain section
// starts and stops. They are matched literally at the beginning of a line.
const char kBeginPrivateDomainsComment[] =
    "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] =
    "// ===END PRIVATE DOMAINS===";

}  // namespace
57 | 18 |
| 19 namespace net { |
| 20 namespace tld_cleanup { |
| 21 |
58 // Writes the list of domain rules contained in the 'rules' set to the | 22 // Writes the list of domain rules contained in the 'rules' set to the |
59 // 'outfile', with each rule terminated by a LF. The file must already have | 23 // 'outfile', with each rule terminated by a LF. The file must already have |
60 // been created with write access. | 24 // been created with write access. |
61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | 25 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { |
62 std::string data; | 26 std::string data; |
63 data.append( | 27 data.append( |
64 "%{\n" | 28 "%{\n" |
65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" | 29 "// Copyright 2012 The Chromium Authors. All rights reserved.\n" |
66 "// Use of this source code is governed by a BSD-style license that can be\n" | 30 "// Use of this source code is governed by a BSD-style license that can be\n" |
67 "// found in the LICENSE file.\n\n" | 31 "// found in the LICENSE file.\n\n" |
68 "// This file is generated by net/tools/tld_cleanup/.\n" | 32 "// This file is generated by net/tools/tld_cleanup/.\n" |
69 "// DO NOT MANUALLY EDIT!\n" | 33 "// DO NOT MANUALLY EDIT!\n" |
70 "%}\n" | 34 "%}\n" |
71 "struct DomainRule {\n" | 35 "struct DomainRule {\n" |
72 " const char *name;\n" | 36 " const char *name;\n" |
73 " int type; // 1: exception, 2: wildcard\n" | 37 " int type; // 1: exception, 2: wildcard\n" |
| 38 " bool is_private;\n" |
74 "};\n" | 39 "};\n" |
75 "%%\n" | 40 "%%\n" |
76 ); | 41 ); |
77 | 42 |
78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { | 43 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { |
79 data.append(i->first); | 44 data.append(i->first); |
80 data.append(", "); | 45 data.append(", "); |
81 if (i->second.exception) { | 46 if (i->second.exception) { |
82 data.append("1"); | 47 data.append("1"); |
83 } else if (i->second.wildcard) { | 48 } else if (i->second.wildcard) { |
84 data.append("2"); | 49 data.append("2"); |
85 } else { | 50 } else { |
86 data.append("0"); | 51 data.append("0"); |
87 } | 52 } |
| 53 if (i->second.is_private) { |
| 54 data.append(", true"); |
| 55 } else { |
| 56 data.append(", false"); |
| 57 } |
88 data.append("\n"); | 58 data.append("\n"); |
89 } | 59 } |
90 | 60 |
91 data.append("%%\n"); | 61 data.append("%%\n"); |
92 | 62 |
93 int written = file_util::WriteFile(outfile, data.data(), data.size()); | 63 int written = file_util::WriteFile(outfile, |
| 64 data.data(), |
| 65 static_cast<int>(data.size())); |
94 | 66 |
95 return written == static_cast<int>(data.size()); | 67 return written == static_cast<int>(data.size()); |
96 } | 68 } |
97 | 69 |
98 // These result codes should be in increasing order of severity. | |
99 typedef enum { | |
100 kSuccess, | |
101 kWarning, | |
102 kError, | |
103 } NormalizeResult; | |
104 | |
105 // Adjusts the rule to a standard form: removes single extraneous dots and | 70 // Adjusts the rule to a standard form: removes single extraneous dots and |
106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | 71 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
107 // valid; logs a warning and returns kWarning if it is probably invalid; and | 72 // valid; logs a warning and returns kWarning if it is probably invalid; and |
108 // logs an error and returns kError if the rule is (almost) certainly invalid. | 73 // logs an error and returns kError if the rule is (almost) certainly invalid. |
109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | 74 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { |
110 NormalizeResult result = kSuccess; | 75 NormalizeResult result = kSuccess; |
111 | 76 |
112 // Strip single leading and trailing dots. | 77 // Strip single leading and trailing dots. |
113 if (domain->at(0) == '.') | 78 if (domain->at(0) == '.') |
114 domain->erase(0, 1); | 79 domain->erase(0, 1); |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
156 } | 121 } |
157 if (!gurl.is_valid()) { | 122 if (!gurl.is_valid()) { |
158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | 123 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; |
159 result = kWarning; | 124 result = kWarning; |
160 } | 125 } |
161 domain->assign(spec.substr(host.begin, host.len)); | 126 domain->assign(spec.substr(host.begin, host.len)); |
162 | 127 |
163 return result; | 128 return result; |
164 } | 129 } |
165 | 130 |
166 // Loads the file described by 'in_filename', converts it to the desired format | 131 NormalizeResult NormalizeDataToRuleMap(const std::string data, |
167 // (see the file comments above), and saves it into 'out_filename'. Returns | 132 RuleMap* rules) { |
168 // the most severe of the result codes encountered when normalizing the rules. | 133 CHECK(rules); |
169 NormalizeResult NormalizeFile(const base::FilePath& in_filename, | |
170 const base::FilePath& out_filename) { | |
171 std::string data; | |
172 if (!file_util::ReadFileToString(in_filename, &data)) { | |
173 LOG(ERROR) << "Unable to read file"; | |
174 // We return success since we've already reported the error. | |
175 return kSuccess; | |
176 } | |
177 | |
178 // We do a lot of string assignment during parsing, but simplicity is more | 134 // We do a lot of string assignment during parsing, but simplicity is more |
179 // important than performance here. | 135 // important than performance here. |
180 std::string domain; | 136 std::string domain; |
181 NormalizeResult result = kSuccess; | 137 NormalizeResult result = kSuccess; |
182 size_t line_start = 0; | 138 size_t line_start = 0; |
183 size_t line_end = 0; | 139 size_t line_end = 0; |
184 RuleMap rules; | 140 bool is_private = false; |
185 RuleSet extra_rules; | 141 RuleMap extra_rules; |
186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | 142 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; |
| 143 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; |
187 while (line_start < data.size()) { | 144 while (line_start < data.size()) { |
188 // Skip the entire section of private domains. | |
189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed. | |
190 if (line_start + begin_private_length < data.size() && | 145 if (line_start + begin_private_length < data.size() && |
191 !data.compare(line_start, begin_private_length, | 146 !data.compare(line_start, begin_private_length, |
192 kBeginPrivateDomainsComment)) { | 147 kBeginPrivateDomainsComment)) { |
193 line_end = data.find(kEndPrivateDomainsComment, line_start); | 148 is_private = true; |
194 if (line_end == std::string::npos) { | 149 line_end = line_start + begin_private_length; |
195 LOG(WARNING) << "Private-domain section had no end marker."; | 150 } else if (line_start + end_private_length < data.size() && |
196 line_end = data.size(); | 151 !data.compare(line_start, end_private_length, |
197 } | 152 kEndPrivateDomainsComment)) { |
| 153 is_private = false; |
| 154 line_end = line_start + end_private_length; |
198 } else if (line_start + 1 < data.size() && | 155 } else if (line_start + 1 < data.size() && |
199 data[line_start] == '/' && | 156 data[line_start] == '/' && |
200 data[line_start + 1] == '/') { | 157 data[line_start + 1] == '/') { |
201 // Skip comments. | 158 // Skip comments. |
202 line_end = data.find_first_of("\r\n", line_start); | 159 line_end = data.find_first_of("\r\n", line_start); |
203 if (line_end == std::string::npos) | 160 if (line_end == std::string::npos) |
204 line_end = data.size(); | 161 line_end = data.size(); |
205 } else { | 162 } else { |
206 // Truncate at first whitespace. | 163 // Truncate at first whitespace. |
207 line_end = data.find_first_of("\r\n \t", line_start); | 164 line_end = data.find_first_of("\r\n \t", line_start); |
208 if (line_end == std::string::npos) | 165 if (line_end == std::string::npos) |
209 line_end = data.size(); | 166 line_end = data.size(); |
210 domain.assign(data.data(), line_start, line_end - line_start); | 167 domain.assign(data.data(), line_start, line_end - line_start); |
211 | 168 |
212 Rule rule; | 169 Rule rule; |
213 rule.wildcard = false; | 170 rule.wildcard = false; |
214 rule.exception = false; | 171 rule.exception = false; |
| 172 rule.is_private = is_private; |
215 NormalizeResult new_result = NormalizeRule(&domain, &rule); | 173 NormalizeResult new_result = NormalizeRule(&domain, &rule); |
216 if (new_result != kError) { | 174 if (new_result != kError) { |
217 // Check the existing rules to make sure we don't have an exception and | 175 // Check the existing rules to make sure we don't have an exception and |
218 // wildcard for the same rule. If we did, we'd have to update our | 176 // wildcard for the same rule, or that the same domain is listed as both |
| 177 // private and not private. If we did, we'd have to update our |
219 // parsing code to handle this case. | 178 // parsing code to handle this case. |
220 CHECK(rules.find(domain) == rules.end()); | 179 CHECK(rules->find(domain) == rules->end()); |
221 | 180 |
222 rules[domain] = rule; | 181 (*rules)[domain] = rule; |
223 // Add true TLD for multi-level rules. We don't add them right now, in | 182 // Add true TLD for multi-level rules. We don't add them right now, in |
224 // case there's an exception or wild card that either exists or might be | 183 // case there's an exception or wild card that either exists or might be |
225 // added in a later iteration. In those cases, there's no need to add | 184 // added in a later iteration. In those cases, there's no need to add |
226 // it and it would just slow down parsing the data. | 185 // it and it would just slow down parsing the data. |
227 size_t tld_start = domain.find_last_of('.'); | 186 size_t tld_start = domain.find_last_of('.'); |
228 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) | 187 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { |
229 extra_rules.insert(domain.substr(tld_start + 1)); | 188 std::string extra_rule_domain = domain.substr(tld_start + 1); |
| 189 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); |
| 190 Rule extra_rule; |
| 191 extra_rule.exception = false; |
| 192 extra_rule.wildcard = false; |
| 193 if (iter == extra_rules.end()) { |
| 194 extra_rule.is_private = is_private; |
| 195 } else { |
| 196 // A rule already exists, so we ensure that if any of the entries is |
| 197 // not private the result should be that the entry is not private. |
| 198 // An example is .au which is not listed as a real TLD, but only |
| 199 // lists second-level domains such as com.au. Subdomains of .au |
| 200 // (eg. blogspot.com.au) are also listed in the private section, |
| 201 // which is processed later, so this ensures that the real TLD |
| 202 // (eg. .au) is listed as public. |
| 203 extra_rule.is_private = is_private && iter->second.is_private; |
| 204 } |
| 205 extra_rules[extra_rule_domain] = extra_rule; |
| 206 } |
230 } | 207 } |
231 result = std::max(result, new_result); | 208 result = std::max(result, new_result); |
232 } | 209 } |
233 | 210 |
234 // Find beginning of next non-empty line. | 211 // Find beginning of next non-empty line. |
235 line_start = data.find_first_of("\r\n", line_end); | 212 line_start = data.find_first_of("\r\n", line_end); |
236 if (line_start == std::string::npos) | 213 if (line_start == std::string::npos) |
237 line_start = data.size(); | 214 line_start = data.size(); |
238 line_start = data.find_first_not_of("\r\n", line_start); | 215 line_start = data.find_first_not_of("\r\n", line_start); |
239 if (line_start == std::string::npos) | 216 if (line_start == std::string::npos) |
240 line_start = data.size(); | 217 line_start = data.size(); |
241 } | 218 } |
242 | 219 |
243 for (RuleSet::const_iterator iter = extra_rules.begin(); | 220 for (RuleMap::const_iterator iter = extra_rules.begin(); |
244 iter != extra_rules.end(); | 221 iter != extra_rules.end(); |
245 ++iter) { | 222 ++iter) { |
246 if (rules.find(*iter) == rules.end()) { | 223 if (rules->find(iter->first) == rules->end()) { |
247 Rule rule; | 224 (*rules)[iter->first] = iter->second; |
248 rule.exception = false; | |
249 rule.wildcard = false; | |
250 rules[*iter] = rule; | |
251 } | 225 } |
252 } | 226 } |
253 | 227 |
| 228 return result; |
| 229 } |
| 230 |
| 231 NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
| 232 const base::FilePath& out_filename) { |
| 233 RuleMap rules; |
| 234 std::string data; |
| 235 if (!file_util::ReadFileToString(in_filename, &data)) { |
| 236 LOG(ERROR) << "Unable to read file"; |
| 237 // We return success since we've already reported the error. |
| 238 return kSuccess; |
| 239 } |
| 240 |
| 241 NormalizeResult result = NormalizeDataToRuleMap(data, &rules); |
| 242 |
254 if (!WriteRules(rules, out_filename)) { | 243 if (!WriteRules(rules, out_filename)) { |
255 LOG(ERROR) << "Error(s) writing output file"; | 244 LOG(ERROR) << "Error(s) writing output file"; |
256 result = kError; | 245 result = kError; |
257 } | 246 } |
258 | 247 |
259 return result; | 248 return result; |
260 } | 249 } |
261 | 250 |
262 int main(int argc, const char* argv[]) { | |
263 base::EnableTerminationOnHeapCorruption(); | |
264 if (argc != 1) { | |
265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); | |
266 fprintf(stderr, "Usage: %s\n", argv[0]); | |
267 return 1; | |
268 } | |
269 | 251 |
270 // Manages the destruction of singletons. | 252 } // namespace tld_cleanup |
271 base::AtExitManager exit_manager; | 253 } // namespace net |
272 | |
273 // Only use OutputDebugString in debug mode. | |
274 #ifdef NDEBUG | |
275 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE; | |
276 #else | |
277 logging::LoggingDestination destination = | |
278 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG; | |
279 #endif | |
280 | |
281 CommandLine::Init(argc, argv); | |
282 | |
283 base::FilePath log_filename; | |
284 PathService::Get(base::DIR_EXE, &log_filename); | |
285 log_filename = log_filename.AppendASCII("tld_cleanup.log"); | |
286 logging::InitLogging( | |
287 log_filename.value().c_str(), | |
288 destination, | |
289 logging::LOCK_LOG_FILE, | |
290 logging::DELETE_OLD_LOG_FILE, | |
291 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS); | |
292 | |
293 icu_util::Initialize(); | |
294 | |
295 base::FilePath input_file; | |
296 PathService::Get(base::DIR_SOURCE_ROOT, &input_file); | |
297 input_file = input_file.Append(FILE_PATH_LITERAL("net")) | |
298 .Append(FILE_PATH_LITERAL("base")) | |
299 .Append(FILE_PATH_LITERAL( | |
300 "registry_controlled_domains")) | |
301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); | |
302 base::FilePath output_file; | |
303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); | |
304 output_file = output_file.Append(FILE_PATH_LITERAL("net")) | |
305 .Append(FILE_PATH_LITERAL("base")) | |
306 .Append(FILE_PATH_LITERAL( | |
307 "registry_controlled_domains")) | |
308 .Append(FILE_PATH_LITERAL( | |
309 "effective_tld_names.gperf")); | |
310 NormalizeResult result = NormalizeFile(input_file, output_file); | |
311 if (result != kSuccess) { | |
312 fprintf(stderr, | |
313 "Errors or warnings processing file. See log in tld_cleanup.log."); | |
314 } | |
315 | |
316 if (result == kError) | |
317 return 1; | |
318 return 0; | |
319 } | |
OLD | NEW |