OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // This command-line program converts an effective-TLD data file in UTF-8 from | 5 // This command-line program converts an effective-TLD data file in UTF-8 from |
6 // the format provided by Mozilla to the format expected by Chrome. This | 6 // the format provided by Mozilla to the format expected by Chrome. This |
7 // program generates an intermediate file which is then used by gperf to | 7 // program generates an intermediate file which is then used by gperf to |
8 // generate a perfect hash map. The benefit of this approach is that no time is | 8 // generate a perfect hash map. The benefit of this approach is that no time is |
9 // spent on program initialization to generate the map of this data. | 9 // spent on program initialization to generate the map of this data. |
10 // | 10 // |
11 // Running this program finds "effective_tld_names.dat" in the expected location | 11 // Running this program finds "effective_tld_names.dat" in the expected location |
12 // in the source checkout and generates "effective_tld_names.gperf" next to it. | 12 // in the source checkout and generates "effective_tld_names.gperf" next to it. |
13 // | 13 // |
14 // Any errors or warnings from this program are recorded in tld_cleanup.log. | 14 // Any errors or warnings from this program are recorded in tld_cleanup.log. |
15 // | 15 // |
16 // In particular, it | 16 // In particular, it |
17 // * Strips blank lines and comments, as well as notes for individual rules. | 17 // * Strips blank lines and comments, as well as notes for individual rules. |
18 // * Strips a single leading and/or trailing dot from each rule, if present. | 18 // * Strips a single leading and/or trailing dot from each rule, if present. |
19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning | 19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning |
20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) | 20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) |
21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. | 21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. |
22 // * Canonicalizes each rule's domain by converting it to a GURL and back. | 22 // * Canonicalizes each rule's domain by converting it to a GURL and back. |
23 // * Adds explicit rules for true TLDs found in any rule. | 23 // * Adds explicit rules for true TLDs found in any rule. |
24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. | 24 // * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS===" |
25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" | 25 // and "// ===END PRIVATE DOMAINS===" as private. |
26 // and "// ===END PRIVATE DOMAINS===". | |
27 | |
28 #include <map> | |
29 #include <set> | |
30 #include <string> | |
31 | 26 |
32 #include "base/at_exit.h" | 27 #include "base/at_exit.h" |
33 #include "base/command_line.h" | 28 #include "base/command_line.h" |
34 #include "base/file_util.h" | 29 #include "base/file_util.h" |
35 #include "base/file_util.h" | |
36 #include "base/files/file_path.h" | 30 #include "base/files/file_path.h" |
37 #include "base/i18n/icu_util.h" | 31 #include "base/i18n/icu_util.h" |
38 #include "base/logging.h" | 32 #include "base/logging.h" |
39 #include "base/path_service.h" | 33 #include "base/path_service.h" |
40 #include "base/process_util.h" | 34 #include "base/process_util.h" |
41 #include "base/string_util.h" | 35 #include "net/tools/tld_cleanup/tld_cleanup_util.h" |
42 #include "googleurl/src/gurl.h" | |
43 #include "googleurl/src/url_parse.h" | |
44 | |
namespace {

// Flags attached to one effective-TLD rule. At most one of them is expected
// to be set for a given rule; both false means an ordinary rule.
struct Rule {
  // Set when the rule was written with a leading '!'.
  bool exception;
  // Set when the rule was written with a leading '*.'.
  bool wildcard;
};

// Normalized domain -> flags for that domain.
typedef std::map<std::string, Rule> RuleMap;
// Plain set of domain names (used for the implied true-TLD rules).
typedef std::set<std::string> RuleSet;

// Comment lines that bracket the private-domain section of the input file.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

}  // namespace
57 | |
58 // Writes the list of domain rules contained in the 'rules' set to the | |
59 // 'outfile', with each rule terminated by a LF. The file must already have | |
60 // been created with write access. | |
61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | |
62 std::string data; | |
63 data.append( | |
64 "%{\n" | |
65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" | |
66 "// Use of this source code is governed by a BSD-style license that can be\n" | |
67 "// found in the LICENSE file.\n\n" | |
68 "// This file is generated by net/tools/tld_cleanup/.\n" | |
69 "// DO NOT MANUALLY EDIT!\n" | |
70 "%}\n" | |
71 "struct DomainRule {\n" | |
72 " const char *name;\n" | |
73 " int type; // 1: exception, 2: wildcard\n" | |
74 "};\n" | |
75 "%%\n" | |
76 ); | |
77 | |
78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { | |
79 data.append(i->first); | |
80 data.append(", "); | |
81 if (i->second.exception) { | |
82 data.append("1"); | |
83 } else if (i->second.wildcard) { | |
84 data.append("2"); | |
85 } else { | |
86 data.append("0"); | |
87 } | |
88 data.append("\n"); | |
89 } | |
90 | |
91 data.append("%%\n"); | |
92 | |
93 int written = file_util::WriteFile(outfile, data.data(), data.size()); | |
94 | |
95 return written == static_cast<int>(data.size()); | |
96 } | |
97 | |
// Outcome of normalizing a rule or a whole file. The enumerators are listed
// from least to most severe so that results can be compared with '<' / '>'
// and the worst one kept; do not reorder them.
enum NormalizeResult {
  kSuccess,  // Nothing suspicious was found.
  kWarning,  // Something looked wrong but processing continued.
  kError,    // The rule (or the output) could not be processed.
};
104 | |
105 // Adjusts the rule to a standard form: removes single extraneous dots and | |
106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | |
107 // valid; logs a warning and returns kWarning if it is probably invalid; and | |
108 // logs an error and returns kError if the rule is (almost) certainly invalid. | |
109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | |
110 NormalizeResult result = kSuccess; | |
111 | |
112 // Strip single leading and trailing dots. | |
113 if (domain->at(0) == '.') | |
114 domain->erase(0, 1); | |
115 if (domain->empty()) { | |
116 LOG(WARNING) << "Ignoring empty rule"; | |
117 return kWarning; | |
118 } | |
119 if (domain->at(domain->size() - 1) == '.') | |
120 domain->erase(domain->size() - 1, 1); | |
121 if (domain->empty()) { | |
122 LOG(WARNING) << "Ignoring empty rule"; | |
123 return kWarning; | |
124 } | |
125 | |
126 // Allow single leading '*.' or '!', saved here so it's not canonicalized. | |
127 size_t start_offset = 0; | |
128 if (domain->at(0) == '!') { | |
129 domain->erase(0, 1); | |
130 rule->exception = true; | |
131 } else if (domain->find("*.") == 0) { | |
132 domain->erase(0, 2); | |
133 rule->wildcard = true; | |
134 } | |
135 if (domain->empty()) { | |
136 LOG(WARNING) << "Ignoring empty rule"; | |
137 return kWarning; | |
138 } | |
139 | |
140 // Warn about additional '*.' or '!'. | |
141 if (domain->find("*.", start_offset) != std::string::npos || | |
142 domain->find('!', start_offset) != std::string::npos) { | |
143 LOG(WARNING) << "Keeping probably invalid rule: " << *domain; | |
144 result = kWarning; | |
145 } | |
146 | |
147 // Make a GURL and normalize it, then get the host back out. | |
148 std::string url = "http://"; | |
149 url.append(*domain); | |
150 GURL gurl(url); | |
151 const std::string& spec = gurl.possibly_invalid_spec(); | |
152 url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; | |
153 if (host.len < 0) { | |
154 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; | |
155 return kError; | |
156 } | |
157 if (!gurl.is_valid()) { | |
158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | |
159 result = kWarning; | |
160 } | |
161 domain->assign(spec.substr(host.begin, host.len)); | |
162 | |
163 return result; | |
164 } | |
165 | |
166 // Loads the file described by 'in_filename', converts it to the desired format | |
167 // (see the file comments above), and saves it into 'out_filename'. Returns | |
168 // the most severe of the result codes encountered when normalizing the rules. | |
169 NormalizeResult NormalizeFile(const base::FilePath& in_filename, | |
170 const base::FilePath& out_filename) { | |
171 std::string data; | |
172 if (!file_util::ReadFileToString(in_filename, &data)) { | |
173 LOG(ERROR) << "Unable to read file"; | |
174 // We return success since we've already reported the error. | |
175 return kSuccess; | |
176 } | |
177 | |
178 // We do a lot of string assignment during parsing, but simplicity is more | |
179 // important than performance here. | |
180 std::string domain; | |
181 NormalizeResult result = kSuccess; | |
182 size_t line_start = 0; | |
183 size_t line_end = 0; | |
184 RuleMap rules; | |
185 RuleSet extra_rules; | |
186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | |
187 while (line_start < data.size()) { | |
188 // Skip the entire section of private domains. | |
189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed. | |
190 if (line_start + begin_private_length < data.size() && | |
191 !data.compare(line_start, begin_private_length, | |
192 kBeginPrivateDomainsComment)) { | |
193 line_end = data.find(kEndPrivateDomainsComment, line_start); | |
194 if (line_end == std::string::npos) { | |
195 LOG(WARNING) << "Private-domain section had no end marker."; | |
196 line_end = data.size(); | |
197 } | |
198 } else if (line_start + 1 < data.size() && | |
199 data[line_start] == '/' && | |
200 data[line_start + 1] == '/') { | |
201 // Skip comments. | |
202 line_end = data.find_first_of("\r\n", line_start); | |
203 if (line_end == std::string::npos) | |
204 line_end = data.size(); | |
205 } else { | |
206 // Truncate at first whitespace. | |
207 line_end = data.find_first_of("\r\n \t", line_start); | |
208 if (line_end == std::string::npos) | |
209 line_end = data.size(); | |
210 domain.assign(data.data(), line_start, line_end - line_start); | |
211 | |
212 Rule rule; | |
213 rule.wildcard = false; | |
214 rule.exception = false; | |
215 NormalizeResult new_result = NormalizeRule(&domain, &rule); | |
216 if (new_result != kError) { | |
217 // Check the existing rules to make sure we don't have an exception and | |
218 // wildcard for the same rule. If we did, we'd have to update our | |
219 // parsing code to handle this case. | |
220 CHECK(rules.find(domain) == rules.end()); | |
221 | |
222 rules[domain] = rule; | |
223 // Add true TLD for multi-level rules. We don't add them right now, in | |
224 // case there's an exception or wild card that either exists or might be | |
225 // added in a later iteration. In those cases, there's no need to add | |
226 // it and it would just slow down parsing the data. | |
227 size_t tld_start = domain.find_last_of('.'); | |
228 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) | |
229 extra_rules.insert(domain.substr(tld_start + 1)); | |
230 } | |
231 result = std::max(result, new_result); | |
232 } | |
233 | |
234 // Find beginning of next non-empty line. | |
235 line_start = data.find_first_of("\r\n", line_end); | |
236 if (line_start == std::string::npos) | |
237 line_start = data.size(); | |
238 line_start = data.find_first_not_of("\r\n", line_start); | |
239 if (line_start == std::string::npos) | |
240 line_start = data.size(); | |
241 } | |
242 | |
243 for (RuleSet::const_iterator iter = extra_rules.begin(); | |
244 iter != extra_rules.end(); | |
245 ++iter) { | |
246 if (rules.find(*iter) == rules.end()) { | |
247 Rule rule; | |
248 rule.exception = false; | |
249 rule.wildcard = false; | |
250 rules[*iter] = rule; | |
251 } | |
252 } | |
253 | |
254 if (!WriteRules(rules, out_filename)) { | |
255 LOG(ERROR) << "Error(s) writing output file"; | |
256 result = kError; | |
257 } | |
258 | |
259 return result; | |
260 } | |
261 | 36 |
262 int main(int argc, const char* argv[]) { | 37 int main(int argc, const char* argv[]) { |
263 base::EnableTerminationOnHeapCorruption(); | 38 base::EnableTerminationOnHeapCorruption(); |
264 if (argc != 1) { | 39 if (argc != 1) { |
265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); | 40 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); |
266 fprintf(stderr, "Usage: %s\n", argv[0]); | 41 fprintf(stderr, "Usage: %s\n", argv[0]); |
267 return 1; | 42 return 1; |
268 } | 43 } |
269 | 44 |
270 // Manages the destruction of singletons. | 45 // Manages the destruction of singletons. |
(...skipping 29 matching lines...) Expand all Loading... |
300 "registry_controlled_domains")) | 75 "registry_controlled_domains")) |
301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); | 76 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); |
302 base::FilePath output_file; | 77 base::FilePath output_file; |
303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); | 78 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); |
304 output_file = output_file.Append(FILE_PATH_LITERAL("net")) | 79 output_file = output_file.Append(FILE_PATH_LITERAL("net")) |
305 .Append(FILE_PATH_LITERAL("base")) | 80 .Append(FILE_PATH_LITERAL("base")) |
306 .Append(FILE_PATH_LITERAL( | 81 .Append(FILE_PATH_LITERAL( |
307 "registry_controlled_domains")) | 82 "registry_controlled_domains")) |
308 .Append(FILE_PATH_LITERAL( | 83 .Append(FILE_PATH_LITERAL( |
309 "effective_tld_names.gperf")); | 84 "effective_tld_names.gperf")); |
310 NormalizeResult result = NormalizeFile(input_file, output_file); | 85 net::tld_cleanup::NormalizeResult result = |
311 if (result != kSuccess) { | 86 net::tld_cleanup::NormalizeFile(input_file, output_file); |
| 87 if (result != net::tld_cleanup::kSuccess) { |
312 fprintf(stderr, | 88 fprintf(stderr, |
313 "Errors or warnings processing file. See log in tld_cleanup.log."); | 89 "Errors or warnings processing file. See log in tld_cleanup.log."); |
314 } | 90 } |
315 | 91 |
316 if (result == kError) | 92 if (result == net::tld_cleanup::kError) |
317 return 1; | 93 return 1; |
318 return 0; | 94 return 0; |
319 } | 95 } |
OLD | NEW |