Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(247)

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup_util.cc

Issue 15140003: Add support for split Public Suffix List distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebased again Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // This command-line program converts an effective-TLD data file in UTF-8 from 5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
6 // the format provided by Mozilla to the format expected by Chrome. This
7 // program generates an intermediate file which is then used by gperf to
8 // generate a perfect hash map. The benefit of this approach is that no time is
9 // spent on program initialization to generate the map of this data.
10 //
11 // Running this program finds "effective_tld_names.dat" in the expected location
12 // in the source checkout and generates "effective_tld_names.gperf" next to it.
13 //
14 // Any errors or warnings from this program are recorded in tld_cleanup.log.
15 //
16 // In particular, it
17 // * Strips blank lines and comments, as well as notes for individual rules.
18 // * Strips a single leading and/or trailing dot from each rule, if present.
19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)
21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22 // * Canonicalizes each rule's domain by converting it to a GURL and back.
23 // * Adds explicit rules for true TLDs found in any rule.
24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
26 // and "// ===END PRIVATE DOMAINS===".
27 6
28 #include <map>
29 #include <set>
30 #include <string>
31
32 #include "base/at_exit.h"
33 #include "base/command_line.h"
34 #include "base/file_util.h" 7 #include "base/file_util.h"
35 #include "base/file_util.h"
36 #include "base/files/file_path.h"
37 #include "base/i18n/icu_util.h"
38 #include "base/logging.h" 8 #include "base/logging.h"
39 #include "base/path_service.h"
40 #include "base/process_util.h"
41 #include "base/string_util.h" 9 #include "base/string_util.h"
42 #include "googleurl/src/gurl.h" 10 #include "googleurl/src/gurl.h"
43 #include "googleurl/src/url_parse.h" 11 #include "googleurl/src/url_parse.h"
44 12
45 namespace { 13 namespace {
46 struct Rule {
47 bool exception;
48 bool wildcard;
49 };
50
51 typedef std::map<std::string, Rule> RuleMap;
52 typedef std::set<std::string> RuleSet;
53 14
54 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; 15 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
55 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; 16 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
56 } 17 }
57 18
19 namespace net {
20 namespace tld_cleanup {
21
58 // Writes the list of domain rules contained in the 'rules' set to the 22 // Writes the list of domain rules contained in the 'rules' set to the
59 // 'outfile', with each rule terminated by a LF. The file must already have 23 // 'outfile', with each rule terminated by a LF. The file must already have
60 // been created with write access. 24 // been created with write access.
61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { 25 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
62 std::string data; 26 std::string data;
63 data.append( 27 data.append(
64 "%{\n" 28 "%{\n"
65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" 29 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
66 "// Use of this source code is governed by a BSD-style license that can be\n" 30 "// Use of this source code is governed by a BSD-style license that can be\n"
67 "// found in the LICENSE file.\n\n" 31 "// found in the LICENSE file.\n\n"
68 "// This file is generated by net/tools/tld_cleanup/.\n" 32 "// This file is generated by net/tools/tld_cleanup/.\n"
69 "// DO NOT MANUALLY EDIT!\n" 33 "// DO NOT MANUALLY EDIT!\n"
70 "%}\n" 34 "%}\n"
71 "struct DomainRule {\n" 35 "struct DomainRule {\n"
72 " const char *name;\n" 36 " const char *name;\n"
73 " int type; // 1: exception, 2: wildcard\n" 37 " int type; // 1: exception, 2: wildcard\n"
38 " bool is_private;\n"
74 "};\n" 39 "};\n"
75 "%%\n" 40 "%%\n"
76 ); 41 );
77 42
78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { 43 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
79 data.append(i->first); 44 data.append(i->first);
80 data.append(", "); 45 data.append(", ");
81 if (i->second.exception) { 46 if (i->second.exception) {
82 data.append("1"); 47 data.append("1");
83 } else if (i->second.wildcard) { 48 } else if (i->second.wildcard) {
84 data.append("2"); 49 data.append("2");
85 } else { 50 } else {
86 data.append("0"); 51 data.append("0");
87 } 52 }
53 if (i->second.is_private) {
54 data.append(", true");
55 } else {
56 data.append(", false");
57 }
88 data.append("\n"); 58 data.append("\n");
89 } 59 }
90 60
91 data.append("%%\n"); 61 data.append("%%\n");
92 62
93 int written = file_util::WriteFile(outfile, data.data(), data.size()); 63 int written = file_util::WriteFile(outfile,
64 data.data(),
65 static_cast<int>(data.size()));
94 66
95 return written == static_cast<int>(data.size()); 67 return written == static_cast<int>(data.size());
96 } 68 }
97 69
98 // These result codes should be in increasing order of severity.
99 typedef enum {
100 kSuccess,
101 kWarning,
102 kError,
103 } NormalizeResult;
104
105 // Adjusts the rule to a standard form: removes single extraneous dots and 70 // Adjusts the rule to a standard form: removes single extraneous dots and
106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as 71 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
107 // valid; logs a warning and returns kWarning if it is probably invalid; and 72 // valid; logs a warning and returns kWarning if it is probably invalid; and
108 // logs an error and returns kError if the rule is (almost) certainly invalid. 73 // logs an error and returns kError if the rule is (almost) certainly invalid.
109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { 74 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
110 NormalizeResult result = kSuccess; 75 NormalizeResult result = kSuccess;
111 76
112 // Strip single leading and trailing dots. 77 // Strip single leading and trailing dots.
113 if (domain->at(0) == '.') 78 if (domain->at(0) == '.')
114 domain->erase(0, 1); 79 domain->erase(0, 1);
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
156 } 121 }
157 if (!gurl.is_valid()) { 122 if (!gurl.is_valid()) {
158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; 123 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
159 result = kWarning; 124 result = kWarning;
160 } 125 }
161 domain->assign(spec.substr(host.begin, host.len)); 126 domain->assign(spec.substr(host.begin, host.len));
162 127
163 return result; 128 return result;
164 } 129 }
165 130
166 // Loads the file described by 'in_filename', converts it to the desired format 131 NormalizeResult NormalizeDataToRuleMap(const std::string data,
167 // (see the file comments above), and saves it into 'out_filename'. Returns 132 RuleMap* rules) {
168 // the most severe of the result codes encountered when normalizing the rules. 133 CHECK(rules);
169 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
170 const base::FilePath& out_filename) {
171 std::string data;
172 if (!file_util::ReadFileToString(in_filename, &data)) {
173 LOG(ERROR) << "Unable to read file";
174 // We return success since we've already reported the error.
175 return kSuccess;
176 }
177
178 // We do a lot of string assignment during parsing, but simplicity is more 134 // We do a lot of string assignment during parsing, but simplicity is more
179 // important than performance here. 135 // important than performance here.
180 std::string domain; 136 std::string domain;
181 NormalizeResult result = kSuccess; 137 NormalizeResult result = kSuccess;
182 size_t line_start = 0; 138 size_t line_start = 0;
183 size_t line_end = 0; 139 size_t line_end = 0;
184 RuleMap rules; 140 bool is_private = false;
185 RuleSet extra_rules; 141 RuleMap extra_rules;
186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; 142 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
143 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
187 while (line_start < data.size()) { 144 while (line_start < data.size()) {
188 // Skip the entire section of private domains.
189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
190 if (line_start + begin_private_length < data.size() && 145 if (line_start + begin_private_length < data.size() &&
191 !data.compare(line_start, begin_private_length, 146 !data.compare(line_start, begin_private_length,
192 kBeginPrivateDomainsComment)) { 147 kBeginPrivateDomainsComment)) {
193 line_end = data.find(kEndPrivateDomainsComment, line_start); 148 is_private = true;
194 if (line_end == std::string::npos) { 149 line_end = line_start + begin_private_length;
195 LOG(WARNING) << "Private-domain section had no end marker."; 150 } else if (line_start + end_private_length < data.size() &&
196 line_end = data.size(); 151 !data.compare(line_start, end_private_length,
197 } 152 kEndPrivateDomainsComment)) {
153 is_private = false;
154 line_end = line_start + end_private_length;
198 } else if (line_start + 1 < data.size() && 155 } else if (line_start + 1 < data.size() &&
199 data[line_start] == '/' && 156 data[line_start] == '/' &&
200 data[line_start + 1] == '/') { 157 data[line_start + 1] == '/') {
201 // Skip comments. 158 // Skip comments.
202 line_end = data.find_first_of("\r\n", line_start); 159 line_end = data.find_first_of("\r\n", line_start);
203 if (line_end == std::string::npos) 160 if (line_end == std::string::npos)
204 line_end = data.size(); 161 line_end = data.size();
205 } else { 162 } else {
206 // Truncate at first whitespace. 163 // Truncate at first whitespace.
207 line_end = data.find_first_of("\r\n \t", line_start); 164 line_end = data.find_first_of("\r\n \t", line_start);
208 if (line_end == std::string::npos) 165 if (line_end == std::string::npos)
209 line_end = data.size(); 166 line_end = data.size();
210 domain.assign(data.data(), line_start, line_end - line_start); 167 domain.assign(data.data(), line_start, line_end - line_start);
211 168
212 Rule rule; 169 Rule rule;
213 rule.wildcard = false; 170 rule.wildcard = false;
214 rule.exception = false; 171 rule.exception = false;
172 rule.is_private = is_private;
215 NormalizeResult new_result = NormalizeRule(&domain, &rule); 173 NormalizeResult new_result = NormalizeRule(&domain, &rule);
216 if (new_result != kError) { 174 if (new_result != kError) {
217 // Check the existing rules to make sure we don't have an exception and 175 // Check the existing rules to make sure we don't have an exception and
218 // wildcard for the same rule. If we did, we'd have to update our 176 // wildcard for the same rule, or that the same domain is listed as both
177 // private and not private. If we did, we'd have to update our
219 // parsing code to handle this case. 178 // parsing code to handle this case.
220 CHECK(rules.find(domain) == rules.end()); 179 CHECK(rules->find(domain) == rules->end());
221 180
222 rules[domain] = rule; 181 (*rules)[domain] = rule;
223 // Add true TLD for multi-level rules. We don't add them right now, in 182 // Add true TLD for multi-level rules. We don't add them right now, in
224 // case there's an exception or wild card that either exists or might be 183 // case there's an exception or wild card that either exists or might be
225 // added in a later iteration. In those cases, there's no need to add 184 // added in a later iteration. In those cases, there's no need to add
226 // it and it would just slow down parsing the data. 185 // it and it would just slow down parsing the data.
227 size_t tld_start = domain.find_last_of('.'); 186 size_t tld_start = domain.find_last_of('.');
228 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) 187 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
229 extra_rules.insert(domain.substr(tld_start + 1)); 188 std::string extra_rule_domain = domain.substr(tld_start + 1);
189 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
190 Rule extra_rule;
191 extra_rule.exception = false;
192 extra_rule.wildcard = false;
193 if (iter == extra_rules.end()) {
194 extra_rule.is_private = is_private;
195 } else {
196 // A rule already exists, so we ensure that if any of the entries is
197 // not private, the resulting entry is not private either.
198 // An example is .au, which is not listed as a real TLD itself; the list
199 // only contains second-level domains such as com.au. Subdomains of .au
200 // (e.g. blogspot.com.au) are also listed in the private section,
201 // which is processed later, so this ensures that the real TLD
202 // (e.g. .au) is listed as public.
203 extra_rule.is_private = is_private && iter->second.is_private;
204 }
205 extra_rules[extra_rule_domain] = extra_rule;
206 }
230 } 207 }
231 result = std::max(result, new_result); 208 result = std::max(result, new_result);
232 } 209 }
233 210
234 // Find beginning of next non-empty line. 211 // Find beginning of next non-empty line.
235 line_start = data.find_first_of("\r\n", line_end); 212 line_start = data.find_first_of("\r\n", line_end);
236 if (line_start == std::string::npos) 213 if (line_start == std::string::npos)
237 line_start = data.size(); 214 line_start = data.size();
238 line_start = data.find_first_not_of("\r\n", line_start); 215 line_start = data.find_first_not_of("\r\n", line_start);
239 if (line_start == std::string::npos) 216 if (line_start == std::string::npos)
240 line_start = data.size(); 217 line_start = data.size();
241 } 218 }
242 219
243 for (RuleSet::const_iterator iter = extra_rules.begin(); 220 for (RuleMap::const_iterator iter = extra_rules.begin();
244 iter != extra_rules.end(); 221 iter != extra_rules.end();
245 ++iter) { 222 ++iter) {
246 if (rules.find(*iter) == rules.end()) { 223 if (rules->find(iter->first) == rules->end()) {
247 Rule rule; 224 (*rules)[iter->first] = iter->second;
248 rule.exception = false;
249 rule.wildcard = false;
250 rules[*iter] = rule;
251 } 225 }
252 } 226 }
253 227
228 return result;
229 }
230
231 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
232 const base::FilePath& out_filename) {
233 RuleMap rules;
234 std::string data;
235 if (!file_util::ReadFileToString(in_filename, &data)) {
236 LOG(ERROR) << "Unable to read file";
237 // We return success since we've already reported the error.
238 return kSuccess;
239 }
240
241 NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
242
254 if (!WriteRules(rules, out_filename)) { 243 if (!WriteRules(rules, out_filename)) {
255 LOG(ERROR) << "Error(s) writing output file"; 244 LOG(ERROR) << "Error(s) writing output file";
256 result = kError; 245 result = kError;
257 } 246 }
258 247
259 return result; 248 return result;
260 } 249 }
261 250
262 int main(int argc, const char* argv[]) {
263 base::EnableTerminationOnHeapCorruption();
264 if (argc != 1) {
265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
266 fprintf(stderr, "Usage: %s\n", argv[0]);
267 return 1;
268 }
269 251
270 // Manages the destruction of singletons. 252 } // namespace tld_cleanup
271 base::AtExitManager exit_manager; 253 } // namespace net
272
273 // Only use OutputDebugString in debug mode.
274 #ifdef NDEBUG
275 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
276 #else
277 logging::LoggingDestination destination =
278 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
279 #endif
280
281 CommandLine::Init(argc, argv);
282
283 base::FilePath log_filename;
284 PathService::Get(base::DIR_EXE, &log_filename);
285 log_filename = log_filename.AppendASCII("tld_cleanup.log");
286 logging::InitLogging(
287 log_filename.value().c_str(),
288 destination,
289 logging::LOCK_LOG_FILE,
290 logging::DELETE_OLD_LOG_FILE,
291 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
292
293 icu_util::Initialize();
294
295 base::FilePath input_file;
296 PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
297 input_file = input_file.Append(FILE_PATH_LITERAL("net"))
298 .Append(FILE_PATH_LITERAL("base"))
299 .Append(FILE_PATH_LITERAL(
300 "registry_controlled_domains"))
301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
302 base::FilePath output_file;
303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
304 output_file = output_file.Append(FILE_PATH_LITERAL("net"))
305 .Append(FILE_PATH_LITERAL("base"))
306 .Append(FILE_PATH_LITERAL(
307 "registry_controlled_domains"))
308 .Append(FILE_PATH_LITERAL(
309 "effective_tld_names.gperf"));
310 NormalizeResult result = NormalizeFile(input_file, output_file);
311 if (result != kSuccess) {
312 fprintf(stderr,
313 "Errors or warnings processing file. See log in tld_cleanup.log.");
314 }
315
316 if (result == kError)
317 return 1;
318 return 0;
319 }
OLDNEW
« no previous file with comments | « net/tools/tld_cleanup/tld_cleanup_util.h ('k') | net/tools/tld_cleanup/tld_cleanup_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698