Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(386)

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup_util.cc

Issue 13979002: Add support for split PSL list distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Added const modifiers Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // This command-line program converts an effective-TLD data file in UTF-8 from 5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
6 // the format provided by Mozilla to the format expected by Chrome. This
7 // program generates an intermediate file which is then used by gperf to
8 // generate a perfect hash map. The benefit of this approach is that no time is
9 // spent on program initialization to generate the map of this data.
10 //
11 // Running this program finds "effective_tld_names.dat" in the expected location
12 // in the source checkout and generates "effective_tld_names.gperf" next to it.
13 //
14 // Any errors or warnings from this program are recorded in tld_cleanup.log.
15 //
16 // In particular, it
17 // * Strips blank lines and comments, as well as notes for individual rules.
18 // * Strips a single leading and/or trailing dot from each rule, if present.
19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)
21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22 // * Canonicalizes each rule's domain by converting it to a GURL and back.
23 // * Adds explicit rules for true TLDs found in any rule.
24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
26 // and "// ===END PRIVATE DOMAINS===".
27 6
28 #include <map>
29 #include <set>
30 #include <string>
31
32 #include "base/at_exit.h"
33 #include "base/command_line.h"
34 #include "base/file_util.h" 7 #include "base/file_util.h"
35 #include "base/file_util.h"
36 #include "base/files/file_path.h"
37 #include "base/i18n/icu_util.h"
38 #include "base/logging.h" 8 #include "base/logging.h"
39 #include "base/path_service.h"
40 #include "base/process_util.h"
41 #include "base/string_util.h" 9 #include "base/string_util.h"
42 #include "googleurl/src/gurl.h" 10 #include "googleurl/src/gurl.h"
43 #include "googleurl/src/url_parse.h" 11 #include "googleurl/src/url_parse.h"
44 12
45 namespace { 13 namespace {
46 struct Rule {
47 bool exception;
48 bool wildcard;
49 };
50
51 typedef std::map<std::string, Rule> RuleMap;
52 typedef std::set<std::string> RuleSet;
53 14
54 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; 15 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
55 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; 16 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
56 } 17 }
57 18
19 namespace net {
20 namespace tld_cleanup {
21
58 // Writes the list of domain rules contained in the 'rules' set to the 22 // Writes the list of domain rules contained in the 'rules' set to the
59 // 'outfile', with each rule terminated by a LF. The file must already have 23 // 'outfile', with each rule terminated by a LF. The file must already have
60 // been created with write access. 24 // been created with write access.
61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { 25 bool WriteRules(const RuleMap& rules, base::FilePath* outfile) {
62 std::string data; 26 std::string data;
63 data.append( 27 data.append(
64 "%{\n" 28 "%{\n"
65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" 29 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
66 "// Use of this source code is governed by a BSD-style license that can be\n" 30 "// Use of this source code is governed by a BSD-style license that can be\n"
67 "// found in the LICENSE file.\n\n" 31 "// found in the LICENSE file.\n\n"
68 "// This file is generated by net/tools/tld_cleanup/.\n" 32 "// This file is generated by net/tools/tld_cleanup/.\n"
69 "// DO NOT MANUALLY EDIT!\n" 33 "// DO NOT MANUALLY EDIT!\n"
70 "%}\n" 34 "%}\n"
71 "struct DomainRule {\n" 35 "struct DomainRule {\n"
72 " const char *name;\n" 36 " const char *name;\n"
73 " int type; // 1: exception, 2: wildcard\n" 37 " int type; // 1: exception, 2: wildcard\n"
38 " bool is_private;\n"
74 "};\n" 39 "};\n"
75 "%%\n" 40 "%%\n"
76 ); 41 );
77 42
78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { 43 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
79 data.append(i->first); 44 data.append(i->first);
80 data.append(", "); 45 data.append(", ");
81 if (i->second.exception) { 46 if (i->second.exception) {
82 data.append("1"); 47 data.append("1");
83 } else if (i->second.wildcard) { 48 } else if (i->second.wildcard) {
84 data.append("2"); 49 data.append("2");
85 } else { 50 } else {
86 data.append("0"); 51 data.append("0");
87 } 52 }
53 if (i->second.is_private) {
54 data.append(", true");
55 } else {
56 data.append(", false");
57 }
88 data.append("\n"); 58 data.append("\n");
89 } 59 }
90 60
91 data.append("%%\n"); 61 data.append("%%\n");
92 62
93 int written = file_util::WriteFile(outfile, data.data(), data.size()); 63 int written = file_util::WriteFile(*outfile, data.data(), data.size());
94 64
95 return written == static_cast<int>(data.size()); 65 return written == static_cast<int>(data.size());
96 } 66 }
97 67
98 // These result codes should be in increasing order of severity.
99 typedef enum {
100 kSuccess,
101 kWarning,
102 kError,
103 } NormalizeResult;
104
105 // Adjusts the rule to a standard form: removes single extraneous dots and 68 // Adjusts the rule to a standard form: removes single extraneous dots and
106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as 69 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
107 // valid; logs a warning and returns kWarning if it is probably invalid; and 70 // valid; logs a warning and returns kWarning if it is probably invalid; and
108 // logs an error and returns kError if the rule is (almost) certainly invalid. 71 // logs an error and returns kError if the rule is (almost) certainly invalid.
109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { 72 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
110 NormalizeResult result = kSuccess; 73 NormalizeResult result = kSuccess;
111 74
112 // Strip single leading and trailing dots. 75 // Strip single leading and trailing dots.
113 if (domain->at(0) == '.') 76 if (domain->at(0) == '.')
114 domain->erase(0, 1); 77 domain->erase(0, 1);
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
156 } 119 }
157 if (!gurl.is_valid()) { 120 if (!gurl.is_valid()) {
158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; 121 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
159 result = kWarning; 122 result = kWarning;
160 } 123 }
161 domain->assign(spec.substr(host.begin, host.len)); 124 domain->assign(spec.substr(host.begin, host.len));
162 125
163 return result; 126 return result;
164 } 127 }
165 128
166 // Loads the file described by 'in_filename', converts it to the desired format 129 NormalizeResult NormalizeDataToRuleMap(const std::string data,
167 // (see the file comments above), and saves it into 'out_filename'. Returns 130 RuleMap* rules) {
168 // the most severe of the result codes encountered when normalizing the rules. 131 CHECK(rules);
169 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
170 const base::FilePath& out_filename) {
171 std::string data;
172 if (!file_util::ReadFileToString(in_filename, &data)) {
173 LOG(ERROR) << "Unable to read file";
174 // We return success since we've already reported the error.
175 return kSuccess;
176 }
177
178 // We do a lot of string assignment during parsing, but simplicity is more 132 // We do a lot of string assignment during parsing, but simplicity is more
179 // important than performance here. 133 // important than performance here.
180 std::string domain; 134 std::string domain;
181 NormalizeResult result = kSuccess; 135 NormalizeResult result = kSuccess;
182 size_t line_start = 0; 136 size_t line_start = 0;
183 size_t line_end = 0; 137 size_t line_end = 0;
184 RuleMap rules; 138 bool is_private = false;
185 RuleSet extra_rules; 139 RuleMap extra_rules;
186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; 140 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
141 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
187 while (line_start < data.size()) { 142 while (line_start < data.size()) {
188 // Skip the entire section of private domains.
189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
190 if (line_start + begin_private_length < data.size() && 143 if (line_start + begin_private_length < data.size() &&
191 !data.compare(line_start, begin_private_length, 144 !data.compare(line_start, begin_private_length,
192 kBeginPrivateDomainsComment)) { 145 kBeginPrivateDomainsComment)) {
193 line_end = data.find(kEndPrivateDomainsComment, line_start); 146 is_private = true;
194 if (line_end == std::string::npos) { 147 line_end = line_start + begin_private_length;
195 LOG(WARNING) << "Private-domain section had no end marker."; 148 } else if (line_start + end_private_length < data.size() &&
196 line_end = data.size(); 149 !data.compare(line_start, end_private_length,
197 } 150 kEndPrivateDomainsComment)) {
151 is_private = false;
152 line_end = line_start + end_private_length;
198 } else if (line_start + 1 < data.size() && 153 } else if (line_start + 1 < data.size() &&
199 data[line_start] == '/' && 154 data[line_start] == '/' &&
200 data[line_start + 1] == '/') { 155 data[line_start + 1] == '/') {
201 // Skip comments. 156 // Skip comments.
202 line_end = data.find_first_of("\r\n", line_start); 157 line_end = data.find_first_of("\r\n", line_start);
203 if (line_end == std::string::npos) 158 if (line_end == std::string::npos)
204 line_end = data.size(); 159 line_end = data.size();
205 } else { 160 } else {
206 // Truncate at first whitespace. 161 // Truncate at first whitespace.
207 line_end = data.find_first_of("\r\n \t", line_start); 162 line_end = data.find_first_of("\r\n \t", line_start);
208 if (line_end == std::string::npos) 163 if (line_end == std::string::npos)
209 line_end = data.size(); 164 line_end = data.size();
210 domain.assign(data.data(), line_start, line_end - line_start); 165 domain.assign(data.data(), line_start, line_end - line_start);
211 166
212 Rule rule; 167 Rule rule;
213 rule.wildcard = false; 168 rule.wildcard = false;
214 rule.exception = false; 169 rule.exception = false;
170 rule.is_private = is_private;
215 NormalizeResult new_result = NormalizeRule(&domain, &rule); 171 NormalizeResult new_result = NormalizeRule(&domain, &rule);
216 if (new_result != kError) { 172 if (new_result != kError) {
217 // Check the existing rules to make sure we don't have an exception and 173 // Check the existing rules to make sure we don't have an exception and
218 // wildcard for the same rule. If we did, we'd have to update our 174 // wildcard for the same rule, or that the same domain is listed as both
175 // private and not private. If we did, we'd have to update our
219 // parsing code to handle this case. 176 // parsing code to handle this case.
220 CHECK(rules.find(domain) == rules.end()); 177 CHECK(rules->find(domain) == rules->end());
221 178
222 rules[domain] = rule; 179 (*rules)[domain] = rule;
223 // Add true TLD for multi-level rules. We don't add them right now, in 180 // Add true TLD for multi-level rules. We don't add them right now, in
224 // case there's an exception or wild card that either exists or might be 181 // case there's an exception or wild card that either exists or might be
225 // added in a later iteration. In those cases, there's no need to add 182 // added in a later iteration. In those cases, there's no need to add
226 // it and it would just slow down parsing the data. 183 // it and it would just slow down parsing the data.
227 size_t tld_start = domain.find_last_of('.'); 184 size_t tld_start = domain.find_last_of('.');
228 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) 185 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
229 extra_rules.insert(domain.substr(tld_start + 1)); 186 std::string extra_rule_domain = domain.substr(tld_start + 1);
187 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
188 Rule extra_rule;
189 extra_rule.exception = false;
190 extra_rule.wildcard = false;
191 if (iter == extra_rules.end()) {
192 extra_rule.is_private = is_private;
193 } else {
194 // A rule already exists for this TLD. Keep it private only if every
195 // entry that implies it is private; a single public entry makes the
196 // TLD public. For example, .au is not itself listed as a TLD; only
197 // second-level domains such as com.au are. Subdomains of .au
198 // (e.g. blogspot.com.au) also appear in the private section,
199 // which is processed later, so this ensures that the real TLD
200 // (e.g. .au) is listed as public.
201 extra_rule.is_private = is_private && iter->second.is_private;
202 }
203 extra_rules[extra_rule_domain] = extra_rule;
204 }
230 } 205 }
231 result = std::max(result, new_result); 206 result = std::max(result, new_result);
232 } 207 }
233 208
234 // Find beginning of next non-empty line. 209 // Find beginning of next non-empty line.
235 line_start = data.find_first_of("\r\n", line_end); 210 line_start = data.find_first_of("\r\n", line_end);
236 if (line_start == std::string::npos) 211 if (line_start == std::string::npos)
237 line_start = data.size(); 212 line_start = data.size();
238 line_start = data.find_first_not_of("\r\n", line_start); 213 line_start = data.find_first_not_of("\r\n", line_start);
239 if (line_start == std::string::npos) 214 if (line_start == std::string::npos)
240 line_start = data.size(); 215 line_start = data.size();
241 } 216 }
242 217
243 for (RuleSet::const_iterator iter = extra_rules.begin(); 218 for (RuleMap::const_iterator iter = extra_rules.begin();
244 iter != extra_rules.end(); 219 iter != extra_rules.end();
245 ++iter) { 220 ++iter) {
246 if (rules.find(*iter) == rules.end()) { 221 if (rules->find(iter->first) == rules->end()) {
247 Rule rule; 222 (*rules)[iter->first] = iter->second;
248 rule.exception = false;
249 rule.wildcard = false;
250 rules[*iter] = rule;
251 } 223 }
252 } 224 }
253 225
226 return result;
227 }
228
229 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
230 base::FilePath* out_filename) {
231 DCHECK(out_filename);
232 RuleMap rules;
233 std::string data;
234 if (!file_util::ReadFileToString(in_filename, &data)) {
235 LOG(ERROR) << "Unable to read file";
236 // We return success since we've already reported the error.
237 return kSuccess;
238 }
239
240 NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
241
254 if (!WriteRules(rules, out_filename)) { 242 if (!WriteRules(rules, out_filename)) {
255 LOG(ERROR) << "Error(s) writing output file"; 243 LOG(ERROR) << "Error(s) writing output file";
256 result = kError; 244 result = kError;
257 } 245 }
258 246
259 return result; 247 return result;
260 } 248 }
261 249
262 int main(int argc, const char* argv[]) {
263 base::EnableTerminationOnHeapCorruption();
264 if (argc != 1) {
265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
266 fprintf(stderr, "Usage: %s\n", argv[0]);
267 return 1;
268 }
269 250
270 // Manages the destruction of singletons. 251 } // namespace tld_cleanup
271 base::AtExitManager exit_manager; 252 } // namespace net
272
273 // Only use OutputDebugString in debug mode.
274 #ifdef NDEBUG
275 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
276 #else
277 logging::LoggingDestination destination =
278 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
279 #endif
280
281 CommandLine::Init(argc, argv);
282
283 base::FilePath log_filename;
284 PathService::Get(base::DIR_EXE, &log_filename);
285 log_filename = log_filename.AppendASCII("tld_cleanup.log");
286 logging::InitLogging(
287 log_filename.value().c_str(),
288 destination,
289 logging::LOCK_LOG_FILE,
290 logging::DELETE_OLD_LOG_FILE,
291 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
292
293 icu_util::Initialize();
294
295 base::FilePath input_file;
296 PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
297 input_file = input_file.Append(FILE_PATH_LITERAL("net"))
298 .Append(FILE_PATH_LITERAL("base"))
299 .Append(FILE_PATH_LITERAL(
300 "registry_controlled_domains"))
301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
302 base::FilePath output_file;
303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
304 output_file = output_file.Append(FILE_PATH_LITERAL("net"))
305 .Append(FILE_PATH_LITERAL("base"))
306 .Append(FILE_PATH_LITERAL(
307 "registry_controlled_domains"))
308 .Append(FILE_PATH_LITERAL(
309 "effective_tld_names.gperf"));
310 NormalizeResult result = NormalizeFile(input_file, output_file);
311 if (result != kSuccess) {
312 fprintf(stderr,
313 "Errors or warnings processing file. See log in tld_cleanup.log.");
314 }
315
316 if (result == kError)
317 return 1;
318 return 0;
319 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698