OLD | NEW |
| (Empty) |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "net/tools/tld_cleanup/tld_cleanup_util.h" | |
6 | |
7 #include "base/files/file_util.h" | |
8 #include "base/logging.h" | |
9 #include "base/strings/string_number_conversions.h" | |
10 #include "base/strings/string_util.h" | |
11 #include "url/gurl.h" | |
12 #include "url/url_parse.h" | |
13 | |
namespace {

// Marker comments that delimit the private-domain section of the input data.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags written to the "type" column of the generated output. A rule is
// either an exception or a wildcard (never both); the private bit may be
// combined with either of them.
const int kExceptionRule = 1 << 0;  // 1
const int kWildcardRule = 1 << 1;   // 2
const int kPrivateRule = 1 << 2;    // 4

}  // namespace
23 | |
24 namespace net { | |
25 namespace tld_cleanup { | |
26 | |
27 // Writes the list of domain rules contained in the 'rules' set to the | |
28 // 'outfile', with each rule terminated by a LF. The file must already have | |
29 // been created with write access. | |
30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | |
31 std::string data; | |
32 data.append("%{\n" | |
33 "// Copyright 2012 The Chromium Authors. All rights reserved.\n" | |
34 "// Use of this source code is governed by a BSD-style license " | |
35 "that can be\n" | |
36 "// found in the LICENSE file.\n\n" | |
37 "// This file is generated by net/tools/tld_cleanup/.\n" | |
38 "// DO NOT MANUALLY EDIT!\n" | |
39 "%}\n" | |
40 "struct DomainRule {\n" | |
41 " int name_offset;\n" | |
42 " int type; // flags: 1: exception, 2: wildcard, 4: private\n" | |
43 "};\n" | |
44 "%%\n"); | |
45 | |
46 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { | |
47 data.append(i->first); | |
48 data.append(", "); | |
49 int type = 0; | |
50 if (i->second.exception) { | |
51 type = kExceptionRule; | |
52 } else if (i->second.wildcard) { | |
53 type = kWildcardRule; | |
54 } | |
55 if (i->second.is_private) { | |
56 type += kPrivateRule; | |
57 } | |
58 data.append(base::IntToString(type)); | |
59 data.append("\n"); | |
60 } | |
61 | |
62 data.append("%%\n"); | |
63 | |
64 int written = base::WriteFile(outfile, | |
65 data.data(), | |
66 static_cast<int>(data.size())); | |
67 | |
68 return written == static_cast<int>(data.size()); | |
69 } | |
70 | |
71 // Adjusts the rule to a standard form: removes single extraneous dots and | |
72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | |
73 // valid; logs a warning and returns kWarning if it is probably invalid; and | |
74 // logs an error and returns kError if the rule is (almost) certainly invalid. | |
75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | |
76 NormalizeResult result = kSuccess; | |
77 | |
78 // Strip single leading and trailing dots. | |
79 if (domain->at(0) == '.') | |
80 domain->erase(0, 1); | |
81 if (domain->empty()) { | |
82 LOG(WARNING) << "Ignoring empty rule"; | |
83 return kWarning; | |
84 } | |
85 if (domain->at(domain->size() - 1) == '.') | |
86 domain->erase(domain->size() - 1, 1); | |
87 if (domain->empty()) { | |
88 LOG(WARNING) << "Ignoring empty rule"; | |
89 return kWarning; | |
90 } | |
91 | |
92 // Allow single leading '*.' or '!', saved here so it's not canonicalized. | |
93 size_t start_offset = 0; | |
94 if (domain->at(0) == '!') { | |
95 domain->erase(0, 1); | |
96 rule->exception = true; | |
97 } else if (domain->find("*.") == 0) { | |
98 domain->erase(0, 2); | |
99 rule->wildcard = true; | |
100 } | |
101 if (domain->empty()) { | |
102 LOG(WARNING) << "Ignoring empty rule"; | |
103 return kWarning; | |
104 } | |
105 | |
106 // Warn about additional '*.' or '!'. | |
107 if (domain->find("*.", start_offset) != std::string::npos || | |
108 domain->find('!', start_offset) != std::string::npos) { | |
109 LOG(WARNING) << "Keeping probably invalid rule: " << *domain; | |
110 result = kWarning; | |
111 } | |
112 | |
113 // Make a GURL and normalize it, then get the host back out. | |
114 std::string url = "http://"; | |
115 url.append(*domain); | |
116 GURL gurl(url); | |
117 const std::string& spec = gurl.possibly_invalid_spec(); | |
118 url::Component host = gurl.parsed_for_possibly_invalid_spec().host; | |
119 if (host.len < 0) { | |
120 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; | |
121 return kError; | |
122 } | |
123 if (!gurl.is_valid()) { | |
124 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | |
125 result = kWarning; | |
126 } | |
127 domain->assign(spec.substr(host.begin, host.len)); | |
128 | |
129 return result; | |
130 } | |
131 | |
132 NormalizeResult NormalizeDataToRuleMap(const std::string data, | |
133 RuleMap* rules) { | |
134 CHECK(rules); | |
135 // We do a lot of string assignment during parsing, but simplicity is more | |
136 // important than performance here. | |
137 std::string domain; | |
138 NormalizeResult result = kSuccess; | |
139 size_t line_start = 0; | |
140 size_t line_end = 0; | |
141 bool is_private = false; | |
142 RuleMap extra_rules; | |
143 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | |
144 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; | |
145 while (line_start < data.size()) { | |
146 if (line_start + begin_private_length < data.size() && | |
147 !data.compare(line_start, begin_private_length, | |
148 kBeginPrivateDomainsComment)) { | |
149 is_private = true; | |
150 line_end = line_start + begin_private_length; | |
151 } else if (line_start + end_private_length < data.size() && | |
152 !data.compare(line_start, end_private_length, | |
153 kEndPrivateDomainsComment)) { | |
154 is_private = false; | |
155 line_end = line_start + end_private_length; | |
156 } else if (line_start + 1 < data.size() && | |
157 data[line_start] == '/' && | |
158 data[line_start + 1] == '/') { | |
159 // Skip comments. | |
160 line_end = data.find_first_of("\r\n", line_start); | |
161 if (line_end == std::string::npos) | |
162 line_end = data.size(); | |
163 } else { | |
164 // Truncate at first whitespace. | |
165 line_end = data.find_first_of("\r\n \t", line_start); | |
166 if (line_end == std::string::npos) | |
167 line_end = data.size(); | |
168 domain.assign(data.data(), line_start, line_end - line_start); | |
169 | |
170 Rule rule; | |
171 rule.wildcard = false; | |
172 rule.exception = false; | |
173 rule.is_private = is_private; | |
174 NormalizeResult new_result = NormalizeRule(&domain, &rule); | |
175 if (new_result != kError) { | |
176 // Check the existing rules to make sure we don't have an exception and | |
177 // wildcard for the same rule, or that the same domain is listed as both | |
178 // private and not private. If we did, we'd have to update our | |
179 // parsing code to handle this case. | |
180 CHECK(rules->find(domain) == rules->end()) | |
181 << "Duplicate rule found for " << domain; | |
182 | |
183 (*rules)[domain] = rule; | |
184 // Add true TLD for multi-level rules. We don't add them right now, in | |
185 // case there's an exception or wild card that either exists or might be | |
186 // added in a later iteration. In those cases, there's no need to add | |
187 // it and it would just slow down parsing the data. | |
188 size_t tld_start = domain.find_last_of('.'); | |
189 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { | |
190 std::string extra_rule_domain = domain.substr(tld_start + 1); | |
191 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); | |
192 Rule extra_rule; | |
193 extra_rule.exception = false; | |
194 extra_rule.wildcard = false; | |
195 if (iter == extra_rules.end()) { | |
196 extra_rule.is_private = is_private; | |
197 } else { | |
198 // A rule already exists, so we ensure that if any of the entries is | |
199 // not private the result should be that the entry is not private. | |
200 // An example is .au which is not listed as a real TLD, but only | |
201 // lists second-level domains such as com.au. Subdomains of .au | |
202 // (eg. blogspot.com.au) are also listed in the private section, | |
203 // which is processed later, so this ensures that the real TLD | |
204 // (eg. .au) is listed as public. | |
205 extra_rule.is_private = is_private && iter->second.is_private; | |
206 } | |
207 extra_rules[extra_rule_domain] = extra_rule; | |
208 } | |
209 } | |
210 result = std::max(result, new_result); | |
211 } | |
212 | |
213 // Find beginning of next non-empty line. | |
214 line_start = data.find_first_of("\r\n", line_end); | |
215 if (line_start == std::string::npos) | |
216 line_start = data.size(); | |
217 line_start = data.find_first_not_of("\r\n", line_start); | |
218 if (line_start == std::string::npos) | |
219 line_start = data.size(); | |
220 } | |
221 | |
222 for (RuleMap::const_iterator iter = extra_rules.begin(); | |
223 iter != extra_rules.end(); | |
224 ++iter) { | |
225 if (rules->find(iter->first) == rules->end()) { | |
226 (*rules)[iter->first] = iter->second; | |
227 } | |
228 } | |
229 | |
230 return result; | |
231 } | |
232 | |
233 NormalizeResult NormalizeFile(const base::FilePath& in_filename, | |
234 const base::FilePath& out_filename) { | |
235 RuleMap rules; | |
236 std::string data; | |
237 if (!base::ReadFileToString(in_filename, &data)) { | |
238 LOG(ERROR) << "Unable to read file"; | |
239 // We return success since we've already reported the error. | |
240 return kSuccess; | |
241 } | |
242 | |
243 NormalizeResult result = NormalizeDataToRuleMap(data, &rules); | |
244 | |
245 if (!WriteRules(rules, out_filename)) { | |
246 LOG(ERROR) << "Error(s) writing output file"; | |
247 result = kError; | |
248 } | |
249 | |
250 return result; | |
251 } | |
252 | |
253 | |
254 } // namespace tld_cleanup | |
255 } // namespace net | |
OLD | NEW |