Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(555)

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup.cc

Issue 15140003: Add support for split Public Suffix List distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebased again Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « net/tools/tld_cleanup/README ('k') | net/tools/tld_cleanup/tld_cleanup.gyp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // This command-line program converts an effective-TLD data file in UTF-8 from 5 // This command-line program converts an effective-TLD data file in UTF-8 from
6 // the format provided by Mozilla to the format expected by Chrome. This 6 // the format provided by Mozilla to the format expected by Chrome. This
7 // program generates an intermediate file which is then used by gperf to 7 // program generates an intermediate file which is then used by gperf to
8 // generate a perfect hash map. The benefit of this approach is that no time is 8 // generate a perfect hash map. The benefit of this approach is that no time is
9 // spent on program initialization to generate the map of this data. 9 // spent on program initialization to generate the map of this data.
10 // 10 //
11 // Running this program finds "effective_tld_names.dat" in the expected location 11 // Running this program finds "effective_tld_names.dat" in the expected location
12 // in the source checkout and generates "effective_tld_names.gperf" next to it. 12 // in the source checkout and generates "effective_tld_names.gperf" next to it.
13 // 13 //
14 // Any errors or warnings from this program are recorded in tld_cleanup.log. 14 // Any errors or warnings from this program are recorded in tld_cleanup.log.
15 // 15 //
16 // In particular, it 16 // In particular, it
17 // * Strips blank lines and comments, as well as notes for individual rules. 17 // * Strips blank lines and comments, as well as notes for individual rules.
18 // * Strips a single leading and/or trailing dot from each rule, if present. 18 // * Strips a single leading and/or trailing dot from each rule, if present.
19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning 19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning
20 // of the rule. (This also catches multiple ! or *. at the start of a rule.) 20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)
21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule. 21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
22 // * Canonicalizes each rule's domain by converting it to a GURL and back. 22 // * Canonicalizes each rule's domain by converting it to a GURL and back.
23 // * Adds explicit rules for true TLDs found in any rule. 23 // * Adds explicit rules for true TLDs found in any rule.
24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. 24 // * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" 25 // and "// ===END PRIVATE DOMAINS===" as private.
26 // and "// ===END PRIVATE DOMAINS===".
27
28 #include <map>
29 #include <set>
30 #include <string>
31 26
32 #include "base/at_exit.h" 27 #include "base/at_exit.h"
33 #include "base/command_line.h" 28 #include "base/command_line.h"
34 #include "base/file_util.h" 29 #include "base/file_util.h"
35 #include "base/file_util.h"
36 #include "base/files/file_path.h" 30 #include "base/files/file_path.h"
37 #include "base/i18n/icu_util.h" 31 #include "base/i18n/icu_util.h"
38 #include "base/logging.h" 32 #include "base/logging.h"
39 #include "base/path_service.h" 33 #include "base/path_service.h"
40 #include "base/process_util.h" 34 #include "base/process_util.h"
41 #include "base/string_util.h" 35 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
42 #include "googleurl/src/gurl.h"
43 #include "googleurl/src/url_parse.h"
44
namespace {
// A single entry parsed from the effective-TLD data file.  The parser sets at
// most one of these flags per rule ('!' exception vs. '*.' wildcard are
// handled by an if/else-if chain in NormalizeRule).
struct Rule {
  bool exception;  // Rule began with '!'.
  bool wildcard;   // Rule began with '*.'.
};

// Maps a canonicalized domain name to the Rule parsed for it.
typedef std::map<std::string, Rule> RuleMap;
// Bare TLDs implied by multi-level rules, added as explicit rules later.
typedef std::set<std::string> RuleSet;

// Markers delimiting the private-domains section of the input file, which
// this tool currently skips entirely (see http://crbug.com/96086).
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
}  // namespace
57
58 // Writes the list of domain rules contained in the 'rules' set to the
59 // 'outfile', with each rule terminated by a LF. The file must already have
60 // been created with write access.
61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
62 std::string data;
63 data.append(
64 "%{\n"
65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"
66 "// Use of this source code is governed by a BSD-style license that can be\n"
67 "// found in the LICENSE file.\n\n"
68 "// This file is generated by net/tools/tld_cleanup/.\n"
69 "// DO NOT MANUALLY EDIT!\n"
70 "%}\n"
71 "struct DomainRule {\n"
72 " const char *name;\n"
73 " int type; // 1: exception, 2: wildcard\n"
74 "};\n"
75 "%%\n"
76 );
77
78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
79 data.append(i->first);
80 data.append(", ");
81 if (i->second.exception) {
82 data.append("1");
83 } else if (i->second.wildcard) {
84 data.append("2");
85 } else {
86 data.append("0");
87 }
88 data.append("\n");
89 }
90
91 data.append("%%\n");
92
93 int written = file_util::WriteFile(outfile, data.data(), data.size());
94
95 return written == static_cast<int>(data.size());
96 }
97
98 // These result codes should be in increasing order of severity.
99 typedef enum {
100 kSuccess,
101 kWarning,
102 kError,
103 } NormalizeResult;
104
105 // Adjusts the rule to a standard form: removes single extraneous dots and
106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
107 // valid; logs a warning and returns kWarning if it is probably invalid; and
108 // logs an error and returns kError if the rule is (almost) certainly invalid.
109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
110 NormalizeResult result = kSuccess;
111
112 // Strip single leading and trailing dots.
113 if (domain->at(0) == '.')
114 domain->erase(0, 1);
115 if (domain->empty()) {
116 LOG(WARNING) << "Ignoring empty rule";
117 return kWarning;
118 }
119 if (domain->at(domain->size() - 1) == '.')
120 domain->erase(domain->size() - 1, 1);
121 if (domain->empty()) {
122 LOG(WARNING) << "Ignoring empty rule";
123 return kWarning;
124 }
125
126 // Allow single leading '*.' or '!', saved here so it's not canonicalized.
127 size_t start_offset = 0;
128 if (domain->at(0) == '!') {
129 domain->erase(0, 1);
130 rule->exception = true;
131 } else if (domain->find("*.") == 0) {
132 domain->erase(0, 2);
133 rule->wildcard = true;
134 }
135 if (domain->empty()) {
136 LOG(WARNING) << "Ignoring empty rule";
137 return kWarning;
138 }
139
140 // Warn about additional '*.' or '!'.
141 if (domain->find("*.", start_offset) != std::string::npos ||
142 domain->find('!', start_offset) != std::string::npos) {
143 LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
144 result = kWarning;
145 }
146
147 // Make a GURL and normalize it, then get the host back out.
148 std::string url = "http://";
149 url.append(*domain);
150 GURL gurl(url);
151 const std::string& spec = gurl.possibly_invalid_spec();
152 url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
153 if (host.len < 0) {
154 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
155 return kError;
156 }
157 if (!gurl.is_valid()) {
158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
159 result = kWarning;
160 }
161 domain->assign(spec.substr(host.begin, host.len));
162
163 return result;
164 }
165
// Loads the file described by 'in_filename', converts it to the desired format
// (see the file comments above), and saves it into 'out_filename'.  Returns
// the most severe of the result codes encountered when normalizing the rules.
NormalizeResult NormalizeFile(const base::FilePath& in_filename,
                              const base::FilePath& out_filename) {
  std::string data;
  if (!file_util::ReadFileToString(in_filename, &data)) {
    LOG(ERROR) << "Unable to read file";
    // We return success since we've already reported the error.
    return kSuccess;
  }

  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  std::string domain;
  NormalizeResult result = kSuccess;
  size_t line_start = 0;  // Offset of the current line within |data|.
  size_t line_end = 0;    // Offset just past the current line's content.
  RuleMap rules;          // Canonicalized rules, keyed by domain.
  RuleSet extra_rules;    // Bare TLDs implied by multi-level rules.
  // arraysize() counts the trailing NUL of the string literal, hence the -1.
  int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
  while (line_start < data.size()) {
    // Skip the entire section of private domains.
    // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
    if (line_start + begin_private_length < data.size() &&
        !data.compare(line_start, begin_private_length,
                      kBeginPrivateDomainsComment)) {
      // Jump line_end straight to the end marker; the next-line scan below
      // then resumes after the marker's line.
      line_end = data.find(kEndPrivateDomainsComment, line_start);
      if (line_end == std::string::npos) {
        LOG(WARNING) << "Private-domain section had no end marker.";
        line_end = data.size();
      }
    } else if (line_start + 1 < data.size() &&
               data[line_start] == '/' &&
               data[line_start + 1] == '/') {
      // Skip comments.
      line_end = data.find_first_of("\r\n", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
    } else {
      // Truncate at first whitespace, which also drops any per-rule notes.
      line_end = data.find_first_of("\r\n \t", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
      domain.assign(data.data(), line_start, line_end - line_start);

      Rule rule;
      rule.wildcard = false;
      rule.exception = false;
      NormalizeResult new_result = NormalizeRule(&domain, &rule);
      if (new_result != kError) {
        // Check the existing rules to make sure we don't have an exception and
        // wildcard for the same rule. If we did, we'd have to update our
        // parsing code to handle this case.
        CHECK(rules.find(domain) == rules.end());

        rules[domain] = rule;
        // Add true TLD for multi-level rules. We don't add them right now, in
        // case there's an exception or wild card that either exists or might be
        // added in a later iteration. In those cases, there's no need to add
        // it and it would just slow down parsing the data.
        size_t tld_start = domain.find_last_of('.');
        if (tld_start != std::string::npos && tld_start + 1 < domain.size())
          extra_rules.insert(domain.substr(tld_start + 1));
      }
      // Keep the most severe result seen so far (codes are ordered).
      result = std::max(result, new_result);
    }

    // Find beginning of next non-empty line.
    line_start = data.find_first_of("\r\n", line_end);
    if (line_start == std::string::npos)
      line_start = data.size();
    line_start = data.find_first_not_of("\r\n", line_start);
    if (line_start == std::string::npos)
      line_start = data.size();
  }

  // Materialize the implied TLD rules collected above, but only where no
  // explicit rule already exists for that TLD.
  for (RuleSet::const_iterator iter = extra_rules.begin();
       iter != extra_rules.end();
       ++iter) {
    if (rules.find(*iter) == rules.end()) {
      Rule rule;
      rule.exception = false;
      rule.wildcard = false;
      rules[*iter] = rule;
    }
  }

  if (!WriteRules(rules, out_filename)) {
    LOG(ERROR) << "Error(s) writing output file";
    result = kError;
  }

  return result;
}
261 36
262 int main(int argc, const char* argv[]) { 37 int main(int argc, const char* argv[]) {
263 base::EnableTerminationOnHeapCorruption(); 38 base::EnableTerminationOnHeapCorruption();
264 if (argc != 1) { 39 if (argc != 1) {
265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); 40 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
266 fprintf(stderr, "Usage: %s\n", argv[0]); 41 fprintf(stderr, "Usage: %s\n", argv[0]);
267 return 1; 42 return 1;
268 } 43 }
269 44
270 // Manages the destruction of singletons. 45 // Manages the destruction of singletons.
(...skipping 29 matching lines...) Expand all
300 "registry_controlled_domains")) 75 "registry_controlled_domains"))
301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); 76 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
302 base::FilePath output_file; 77 base::FilePath output_file;
303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file); 78 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
304 output_file = output_file.Append(FILE_PATH_LITERAL("net")) 79 output_file = output_file.Append(FILE_PATH_LITERAL("net"))
305 .Append(FILE_PATH_LITERAL("base")) 80 .Append(FILE_PATH_LITERAL("base"))
306 .Append(FILE_PATH_LITERAL( 81 .Append(FILE_PATH_LITERAL(
307 "registry_controlled_domains")) 82 "registry_controlled_domains"))
308 .Append(FILE_PATH_LITERAL( 83 .Append(FILE_PATH_LITERAL(
309 "effective_tld_names.gperf")); 84 "effective_tld_names.gperf"));
310 NormalizeResult result = NormalizeFile(input_file, output_file); 85 net::tld_cleanup::NormalizeResult result =
311 if (result != kSuccess) { 86 net::tld_cleanup::NormalizeFile(input_file, output_file);
87 if (result != net::tld_cleanup::kSuccess) {
312 fprintf(stderr, 88 fprintf(stderr,
313 "Errors or warnings processing file. See log in tld_cleanup.log."); 89 "Errors or warnings processing file. See log in tld_cleanup.log.");
314 } 90 }
315 91
316 if (result == kError) 92 if (result == net::tld_cleanup::kError)
317 return 1; 93 return 1;
318 return 0; 94 return 0;
319 } 95 }
OLDNEW
« no previous file with comments | « net/tools/tld_cleanup/README ('k') | net/tools/tld_cleanup/tld_cleanup.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698