net/tools/tld_cleanup/tld_cleanup.cc - Issue 15140003: Add support for split Public Suffix List distinctions.

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup.cc

Issue 15140003: Add support for split Public Suffix List distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased again Created 7 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // This command-line program converts an effective-TLD data file in UTF-8 from	5 // This command-line program converts an effective-TLD data file in UTF-8 from

6 // the format provided by Mozilla to the format expected by Chrome. This	6 // the format provided by Mozilla to the format expected by Chrome. This

7 // program generates an intermediate file which is then used by gperf to	7 // program generates an intermediate file which is then used by gperf to

8 // generate a perfect hash map. The benefit of this approach is that no time is	8 // generate a perfect hash map. The benefit of this approach is that no time is

9 // spent on program initialization to generate the map of this data.	9 // spent on program initialization to generate the map of this data.

10 //	10 //

11 // Running this program finds "effective_tld_names.dat" in the expected location	11 // Running this program finds "effective_tld_names.dat" in the expected location

12 // in the source checkout and generates "effective_tld_names.gperf" next to it.	12 // in the source checkout and generates "effective_tld_names.gperf" next to it.

13 //	13 //

14 // Any errors or warnings from this program are recorded in tld_cleanup.log.	14 // Any errors or warnings from this program are recorded in tld_cleanup.log.

15 //	15 //

16 // In particular, it	16 // In particular, it

17 // * Strips blank lines and comments, as well as notes for individual rules.	17 // * Strips blank lines and comments, as well as notes for individual rules.

18 // * Strips a single leading and/or trailing dot from each rule, if present.	18 // * Strips a single leading and/or trailing dot from each rule, if present.

19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning	19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning

20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)	20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)

21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.	21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.

22 // * Canonicalizes each rule's domain by converting it to a GURL and back.	22 // * Canonicalizes each rule's domain by converting it to a GURL and back.

23 // * Adds explicit rules for true TLDs found in any rule.	23 // * Adds explicit rules for true TLDs found in any rule.

24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.	24 // * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS==="

25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="	25 // and "// ===END PRIVATE DOMAINS===" as private.

26 // and "// ===END PRIVATE DOMAINS===".

27

28 #include <map>

29 #include <set>

30 #include <string>

31	26

32 #include "base/at_exit.h"	27 #include "base/at_exit.h"

33 #include "base/command_line.h"	28 #include "base/command_line.h"

34 #include "base/file_util.h"	29 #include "base/file_util.h"

35 #include "base/file_util.h"

36 #include "base/files/file_path.h"	30 #include "base/files/file_path.h"

37 #include "base/i18n/icu_util.h"	31 #include "base/i18n/icu_util.h"

38 #include "base/logging.h"	32 #include "base/logging.h"

39 #include "base/path_service.h"	33 #include "base/path_service.h"

40 #include "base/process_util.h"	34 #include "base/process_util.h"

41 #include "base/string_util.h"	35 #include "net/tools/tld_cleanup/tld_cleanup_util.h"

42 #include "googleurl/src/gurl.h"

43 #include "googleurl/src/url_parse.h"

44

45 namespace {

46 struct Rule {

47 bool exception;

48 bool wildcard;

49 };

50

51 typedef std::map<std::string, Rule> RuleMap;

52 typedef std::set<std::string> RuleSet;

53

54 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";

55 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

56 }

57

58 // Writes the list of domain rules contained in the 'rules' set to the

59 // 'outfile', with each rule terminated by a LF. The file must already have

60 // been created with write access.

61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {

62 std::string data;

63 data.append(

64 "%{\n"

65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"

66 "// Use of this source code is governed by a BSD-style license that can be\n"

67 "// found in the LICENSE file.\n\n"

68 "// This file is generated by net/tools/tld_cleanup/.\n"

69 "// DO NOT MANUALLY EDIT!\n"

70 "%}\n"

71 "struct DomainRule {\n"

72 " const char *name;\n"

73 " int type; // 1: exception, 2: wildcard\n"

74 "};\n"

75 "%%\n"

76 );

77

78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {

79 data.append(i->first);

80 data.append(", ");

81 if (i->second.exception) {

82 data.append("1");

83 } else if (i->second.wildcard) {

84 data.append("2");

85 } else {

86 data.append("0");

87 }

88 data.append("\n");

89 }

90

91 data.append("%%\n");

92

93 int written = file_util::WriteFile(outfile, data.data(), data.size());

94

95 return written == static_cast<int>(data.size());

96 }

97

98 // These result codes should be in increasing order of severity.

99 typedef enum {

100 kSuccess,

101 kWarning,

102 kError,

103 } NormalizeResult;

104

105 // Adjusts the rule to a standard form: removes single extraneous dots and

106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as

107 // valid; logs a warning and returns kWarning if it is probably invalid; and

108 // logs an error and returns kError if the rule is (almost) certainly invalid.

109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {

110 NormalizeResult result = kSuccess;

111

112 // Strip single leading and trailing dots.

113 if (domain->at(0) == '.')

114 domain->erase(0, 1);

115 if (domain->empty()) {

116 LOG(WARNING) << "Ignoring empty rule";

117 return kWarning;

118 }

119 if (domain->at(domain->size() - 1) == '.')

120 domain->erase(domain->size() - 1, 1);

121 if (domain->empty()) {

122 LOG(WARNING) << "Ignoring empty rule";

123 return kWarning;

124 }

125

126 // Allow single leading '*.' or '!', saved here so it's not canonicalized.

127 size_t start_offset = 0;

128 if (domain->at(0) == '!') {

129 domain->erase(0, 1);

130 rule->exception = true;

131 } else if (domain->find("*.") == 0) {

132 domain->erase(0, 2);

133 rule->wildcard = true;

134 }

135 if (domain->empty()) {

136 LOG(WARNING) << "Ignoring empty rule";

137 return kWarning;

138 }

139

140 // Warn about additional '*.' or '!'.

141 if (domain->find("*.", start_offset) != std::string::npos ||

142 domain->find('!', start_offset) != std::string::npos) {

143 LOG(WARNING) << "Keeping probably invalid rule: " << *domain;

144 result = kWarning;

145 }

146

147 // Make a GURL and normalize it, then get the host back out.

148 std::string url = "http://";

149 url.append(*domain);

150 GURL gurl(url);

151 const std::string& spec = gurl.possibly_invalid_spec();

152 url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;

153 if (host.len < 0) {

154 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;

155 return kError;

156 }

157 if (!gurl.is_valid()) {

158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;

159 result = kWarning;

160 }

161 domain->assign(spec.substr(host.begin, host.len));

162

163 return result;

164 }

165

166 // Loads the file described by 'in_filename', converts it to the desired format

167 // (see the file comments above), and saves it into 'out_filename'. Returns

168 // the most severe of the result codes encountered when normalizing the rules.

169 NormalizeResult NormalizeFile(const base::FilePath& in_filename,

170 const base::FilePath& out_filename) {

171 std::string data;

172 if (!file_util::ReadFileToString(in_filename, &data)) {

173 LOG(ERROR) << "Unable to read file";

174 // We return success since we've already reported the error.

175 return kSuccess;

176 }

177

178 // We do a lot of string assignment during parsing, but simplicity is more

179 // important than performance here.

180 std::string domain;

181 NormalizeResult result = kSuccess;

182 size_t line_start = 0;

183 size_t line_end = 0;

184 RuleMap rules;

185 RuleSet extra_rules;

186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;

187 while (line_start < data.size()) {

188 // Skip the entire section of private domains.

189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed.

190 if (line_start + begin_private_length < data.size() &&

191 !data.compare(line_start, begin_private_length,

192 kBeginPrivateDomainsComment)) {

193 line_end = data.find(kEndPrivateDomainsComment, line_start);

194 if (line_end == std::string::npos) {

195 LOG(WARNING) << "Private-domain section had no end marker.";

196 line_end = data.size();

197 }

198 } else if (line_start + 1 < data.size() &&

199 data[line_start] == '/' &&

200 data[line_start + 1] == '/') {

201 // Skip comments.

202 line_end = data.find_first_of("\r\n", line_start);

203 if (line_end == std::string::npos)

204 line_end = data.size();

205 } else {

206 // Truncate at first whitespace.

207 line_end = data.find_first_of("\r\n \t", line_start);

208 if (line_end == std::string::npos)

209 line_end = data.size();

210 domain.assign(data.data(), line_start, line_end - line_start);

211

212 Rule rule;

213 rule.wildcard = false;

214 rule.exception = false;

215 NormalizeResult new_result = NormalizeRule(&domain, &rule);

216 if (new_result != kError) {

217 // Check the existing rules to make sure we don't have an exception and

218 // wildcard for the same rule. If we did, we'd have to update our

219 // parsing code to handle this case.

220 CHECK(rules.find(domain) == rules.end());

221

222 rules[domain] = rule;

223 // Add true TLD for multi-level rules. We don't add them right now, in

224 // case there's an exception or wild card that either exists or might be

225 // added in a later iteration. In those cases, there's no need to add

226 // it and it would just slow down parsing the data.

227 size_t tld_start = domain.find_last_of('.');

228 if (tld_start != std::string::npos && tld_start + 1 < domain.size())

229 extra_rules.insert(domain.substr(tld_start + 1));

230 }

231 result = std::max(result, new_result);

232 }

233

234 // Find beginning of next non-empty line.

235 line_start = data.find_first_of("\r\n", line_end);

236 if (line_start == std::string::npos)

237 line_start = data.size();

238 line_start = data.find_first_not_of("\r\n", line_start);

239 if (line_start == std::string::npos)

240 line_start = data.size();

241 }

242

243 for (RuleSet::const_iterator iter = extra_rules.begin();

244 iter != extra_rules.end();

245 ++iter) {

246 if (rules.find(*iter) == rules.end()) {

247 Rule rule;

248 rule.exception = false;

249 rule.wildcard = false;

250 rules[*iter] = rule;

251 }

252 }

253

254 if (!WriteRules(rules, out_filename)) {

255 LOG(ERROR) << "Error(s) writing output file";

256 result = kError;

257 }

258

259 return result;

260 }

261	36

262 int main(int argc, const char* argv[]) {	37 int main(int argc, const char* argv[]) {

263 base::EnableTerminationOnHeapCorruption();	38 base::EnableTerminationOnHeapCorruption();

264 if (argc != 1) {	39 if (argc != 1) {

265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");	40 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");

266 fprintf(stderr, "Usage: %s\n", argv[0]);	41 fprintf(stderr, "Usage: %s\n", argv[0]);

267 return 1;	42 return 1;

268 }	43 }

269	44

270 // Manages the destruction of singletons.	45 // Manages the destruction of singletons.

(...skipping 29 matching lines...) Expand all Loading...
300 "registry_controlled_domains"))	75 "registry_controlled_domains"))

301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));	76 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));

302 base::FilePath output_file;	77 base::FilePath output_file;

303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);	78 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);

304 output_file = output_file.Append(FILE_PATH_LITERAL("net"))	79 output_file = output_file.Append(FILE_PATH_LITERAL("net"))

305 .Append(FILE_PATH_LITERAL("base"))	80 .Append(FILE_PATH_LITERAL("base"))

306 .Append(FILE_PATH_LITERAL(	81 .Append(FILE_PATH_LITERAL(

307 "registry_controlled_domains"))	82 "registry_controlled_domains"))

308 .Append(FILE_PATH_LITERAL(	83 .Append(FILE_PATH_LITERAL(

309 "effective_tld_names.gperf"));	84 "effective_tld_names.gperf"));

310 NormalizeResult result = NormalizeFile(input_file, output_file);	85 net::tld_cleanup::NormalizeResult result =

311 if (result != kSuccess) {	86 net::tld_cleanup::NormalizeFile(input_file, output_file);

	87 if (result != net::tld_cleanup::kSuccess) {

312 fprintf(stderr,	88 fprintf(stderr,

313 "Errors or warnings processing file. See log in tld_cleanup.log.");	89 "Errors or warnings processing file. See log in tld_cleanup.log.");

314 }	90 }

315	91

316 if (result == kError)	92 if (result == net::tld_cleanup::kError)

317 return 1;	93 return 1;

318 return 0;	94 return 0;

319 }	95 }

OLD	NEW

« no previous file with comments | « net/tools/tld_cleanup/README ('k') | net/tools/tld_cleanup/tld_cleanup.gyp » ('j') | no next file with comments »