net/tools/tld_cleanup/tld_cleanup_util.cc - Issue 15140003: Add support for split Public Suffix List distinctions.

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup_util.cc

Issue 15140003: Add support for split Public Suffix List distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased again. Created 7 years, 7 months ago.

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // This command-line program converts an effective-TLD data file in UTF-8 from	5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"

6 // the format provided by Mozilla to the format expected by Chrome. This

7 // program generates an intermediate file which is then used by gperf to

8 // generate a perfect hash map. The benefit of this approach is that no time is

9 // spent on program initialization to generate the map of this data.

10 //

11 // Running this program finds "effective_tld_names.dat" in the expected location

12 // in the source checkout and generates "effective_tld_names.gperf" next to it.

13 //

14 // Any errors or warnings from this program are recorded in tld_cleanup.log.

15 //

16 // In particular, it

17 // * Strips blank lines and comments, as well as notes for individual rules.

18 // * Strips a single leading and/or trailing dot from each rule, if present.

19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning

20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)

21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.

22 // * Canonicalizes each rule's domain by converting it to a GURL and back.

23 // * Adds explicit rules for true TLDs found in any rule.

24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.

25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="

26 // and "// ===END PRIVATE DOMAINS===".

27	6

28 #include <map>

29 #include <set>

30 #include <string>

31

32 #include "base/at_exit.h"

33 #include "base/command_line.h"

34 #include "base/file_util.h"	7 #include "base/file_util.h"

35 #include "base/file_util.h"

36 #include "base/files/file_path.h"

37 #include "base/i18n/icu_util.h"

38 #include "base/logging.h"	8 #include "base/logging.h"

39 #include "base/path_service.h"

40 #include "base/process_util.h"

41 #include "base/string_util.h"	9 #include "base/string_util.h"

42 #include "googleurl/src/gurl.h"	10 #include "googleurl/src/gurl.h"

43 #include "googleurl/src/url_parse.h"	11 #include "googleurl/src/url_parse.h"

44	12

45 namespace {	13 namespace {

46 struct Rule {

47 bool exception;

48 bool wildcard;

49 };

50

51 typedef std::map<std::string, Rule> RuleMap;

52 typedef std::set<std::string> RuleSet;

53	14

54 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";	15 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";

55 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";	16 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

56 }	17 }

57	18

	19 namespace net {

	20 namespace tld_cleanup {

	21

58 // Writes the list of domain rules contained in the 'rules' set to the	22 // Writes the list of domain rules contained in the 'rules' set to the

59 // 'outfile', with each rule terminated by a LF. The file must already have	23 // 'outfile', with each rule terminated by a LF. The file must already have

60 // been created with write access.	24 // been created with write access.

61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {	25 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {

62 std::string data;	26 std::string data;

63 data.append(	27 data.append(

64 "%{\n"	28 "%{\n"

65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"	29 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"

66 "// Use of this source code is governed by a BSD-style license that can be\n"	30 "// Use of this source code is governed by a BSD-style license that can be\n"

67 "// found in the LICENSE file.\n\n"	31 "// found in the LICENSE file.\n\n"

68 "// This file is generated by net/tools/tld_cleanup/.\n"	32 "// This file is generated by net/tools/tld_cleanup/.\n"

69 "// DO NOT MANUALLY EDIT!\n"	33 "// DO NOT MANUALLY EDIT!\n"

70 "%}\n"	34 "%}\n"

71 "struct DomainRule {\n"	35 "struct DomainRule {\n"

72 " const char *name;\n"	36 " const char *name;\n"

73 " int type; // 1: exception, 2: wildcard\n"	37 " int type; // 1: exception, 2: wildcard\n"

	38 " bool is_private;\n"

74 "};\n"	39 "};\n"

75 "%%\n"	40 "%%\n"

76 );	41 );

77	42

78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {	43 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {

79 data.append(i->first);	44 data.append(i->first);

80 data.append(", ");	45 data.append(", ");

81 if (i->second.exception) {	46 if (i->second.exception) {

82 data.append("1");	47 data.append("1");

83 } else if (i->second.wildcard) {	48 } else if (i->second.wildcard) {

84 data.append("2");	49 data.append("2");

85 } else {	50 } else {

86 data.append("0");	51 data.append("0");

87 }	52 }

	53 if (i->second.is_private) {

	54 data.append(", true");

	55 } else {

	56 data.append(", false");

	57 }

88 data.append("\n");	58 data.append("\n");

89 }	59 }

90	60

91 data.append("%%\n");	61 data.append("%%\n");

92	62

93 int written = file_util::WriteFile(outfile, data.data(), data.size());	63 int written = file_util::WriteFile(outfile,

	64 data.data(),

	65 static_cast<int>(data.size()));

94	66

95 return written == static_cast<int>(data.size());	67 return written == static_cast<int>(data.size());

96 }	68 }

97	69

98 // These result codes should be in increasing order of severity.

99 typedef enum {

100 kSuccess,

101 kWarning,

102 kError,

103 } NormalizeResult;

104

105 // Adjusts the rule to a standard form: removes single extraneous dots and	70 // Adjusts the rule to a standard form: removes single extraneous dots and

106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as	71 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as

107 // valid; logs a warning and returns kWarning if it is probably invalid; and	72 // valid; logs a warning and returns kWarning if it is probably invalid; and

108 // logs an error and returns kError if the rule is (almost) certainly invalid.	73 // logs an error and returns kError if the rule is (almost) certainly invalid.

109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {	74 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {

110 NormalizeResult result = kSuccess;	75 NormalizeResult result = kSuccess;

111	76

112 // Strip single leading and trailing dots.	77 // Strip single leading and trailing dots.

113 if (domain->at(0) == '.')	78 if (domain->at(0) == '.')

114 domain->erase(0, 1);	79 domain->erase(0, 1);

(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
156 }	121 }

157 if (!gurl.is_valid()) {	122 if (!gurl.is_valid()) {

158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;	123 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;

159 result = kWarning;	124 result = kWarning;

160 }	125 }

161 domain->assign(spec.substr(host.begin, host.len));	126 domain->assign(spec.substr(host.begin, host.len));

162	127

163 return result;	128 return result;

164 }	129 }

165	130

166 // Loads the file described by 'in_filename', converts it to the desired format	131 NormalizeResult NormalizeDataToRuleMap(const std::string data,

167 // (see the file comments above), and saves it into 'out_filename'. Returns	132 RuleMap* rules) {

168 // the most severe of the result codes encountered when normalizing the rules.	133 CHECK(rules);

169 NormalizeResult NormalizeFile(const base::FilePath& in_filename,

170 const base::FilePath& out_filename) {

171 std::string data;

172 if (!file_util::ReadFileToString(in_filename, &data)) {

173 LOG(ERROR) << "Unable to read file";

174 // We return success since we've already reported the error.

175 return kSuccess;

176 }

177

178 // We do a lot of string assignment during parsing, but simplicity is more	134 // We do a lot of string assignment during parsing, but simplicity is more

179 // important than performance here.	135 // important than performance here.

180 std::string domain;	136 std::string domain;

181 NormalizeResult result = kSuccess;	137 NormalizeResult result = kSuccess;

182 size_t line_start = 0;	138 size_t line_start = 0;

183 size_t line_end = 0;	139 size_t line_end = 0;

184 RuleMap rules;	140 bool is_private = false;

185 RuleSet extra_rules;	141 RuleMap extra_rules;

186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;	142 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;

	143 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;

187 while (line_start < data.size()) {	144 while (line_start < data.size()) {

188 // Skip the entire section of private domains.

189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed.

190 if (line_start + begin_private_length < data.size() &&	145 if (line_start + begin_private_length < data.size() &&

191 !data.compare(line_start, begin_private_length,	146 !data.compare(line_start, begin_private_length,

192 kBeginPrivateDomainsComment)) {	147 kBeginPrivateDomainsComment)) {

193 line_end = data.find(kEndPrivateDomainsComment, line_start);	148 is_private = true;

194 if (line_end == std::string::npos) {	149 line_end = line_start + begin_private_length;

195 LOG(WARNING) << "Private-domain section had no end marker.";	150 } else if (line_start + end_private_length < data.size() &&

196 line_end = data.size();	151 !data.compare(line_start, end_private_length,

197 }	152 kEndPrivateDomainsComment)) {

	153 is_private = false;

	154 line_end = line_start + end_private_length;

198 } else if (line_start + 1 < data.size() &&	155 } else if (line_start + 1 < data.size() &&

199 data[line_start] == '/' &&	156 data[line_start] == '/' &&

200 data[line_start + 1] == '/') {	157 data[line_start + 1] == '/') {

201 // Skip comments.	158 // Skip comments.

202 line_end = data.find_first_of("\r\n", line_start);	159 line_end = data.find_first_of("\r\n", line_start);

203 if (line_end == std::string::npos)	160 if (line_end == std::string::npos)

204 line_end = data.size();	161 line_end = data.size();

205 } else {	162 } else {

206 // Truncate at first whitespace.	163 // Truncate at first whitespace.

207 line_end = data.find_first_of("\r\n \t", line_start);	164 line_end = data.find_first_of("\r\n \t", line_start);

208 if (line_end == std::string::npos)	165 if (line_end == std::string::npos)

209 line_end = data.size();	166 line_end = data.size();

210 domain.assign(data.data(), line_start, line_end - line_start);	167 domain.assign(data.data(), line_start, line_end - line_start);

211	168

212 Rule rule;	169 Rule rule;

213 rule.wildcard = false;	170 rule.wildcard = false;

214 rule.exception = false;	171 rule.exception = false;

	172 rule.is_private = is_private;

215 NormalizeResult new_result = NormalizeRule(&domain, &rule);	173 NormalizeResult new_result = NormalizeRule(&domain, &rule);

216 if (new_result != kError) {	174 if (new_result != kError) {

217 // Check the existing rules to make sure we don't have an exception and	175 // Check the existing rules to make sure we don't have an exception and

218 // wildcard for the same rule. If we did, we'd have to update our	176 // wildcard for the same rule, or that the same domain is listed as both

	177 // private and not private. If we did, we'd have to update our

219 // parsing code to handle this case.	178 // parsing code to handle this case.

220 CHECK(rules.find(domain) == rules.end());	179 CHECK(rules->find(domain) == rules->end());

221	180

222 rules[domain] = rule;	181 (*rules)[domain] = rule;

223 // Add true TLD for multi-level rules. We don't add them right now, in	182 // Add true TLD for multi-level rules. We don't add them right now, in

224 // case there's an exception or wild card that either exists or might be	183 // case there's an exception or wild card that either exists or might be

225 // added in a later iteration. In those cases, there's no need to add	184 // added in a later iteration. In those cases, there's no need to add

226 // it and it would just slow down parsing the data.	185 // it and it would just slow down parsing the data.

227 size_t tld_start = domain.find_last_of('.');	186 size_t tld_start = domain.find_last_of('.');

228 if (tld_start != std::string::npos && tld_start + 1 < domain.size())	187 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {

229 extra_rules.insert(domain.substr(tld_start + 1));	188 std::string extra_rule_domain = domain.substr(tld_start + 1);

	189 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);

	190 Rule extra_rule;

	191 extra_rule.exception = false;

	192 extra_rule.wildcard = false;

	193 if (iter == extra_rules.end()) {

	194 extra_rule.is_private = is_private;

	195 } else {

	196 // A rule already exists, so we ensure that if any of the entries is

	197 // not private the result should be that the entry is not private.

	198 // An example is .au which is not listed as a real TLD, but only

	199 // lists second-level domains such as com.au. Subdomains of .au

	200 // (eg. blogspot.com.au) are also listed in the private section,

	201 // which is processed later, so this ensures that the real TLD

	202 // (eg. .au) is listed as public.

	203 extra_rule.is_private = is_private && iter->second.is_private;

	204 }

	205 extra_rules[extra_rule_domain] = extra_rule;

	206 }

230 }	207 }

231 result = std::max(result, new_result);	208 result = std::max(result, new_result);

232 }	209 }

233	210

234 // Find beginning of next non-empty line.	211 // Find beginning of next non-empty line.

235 line_start = data.find_first_of("\r\n", line_end);	212 line_start = data.find_first_of("\r\n", line_end);

236 if (line_start == std::string::npos)	213 if (line_start == std::string::npos)

237 line_start = data.size();	214 line_start = data.size();

238 line_start = data.find_first_not_of("\r\n", line_start);	215 line_start = data.find_first_not_of("\r\n", line_start);

239 if (line_start == std::string::npos)	216 if (line_start == std::string::npos)

240 line_start = data.size();	217 line_start = data.size();

241 }	218 }

242	219

243 for (RuleSet::const_iterator iter = extra_rules.begin();	220 for (RuleMap::const_iterator iter = extra_rules.begin();

244 iter != extra_rules.end();	221 iter != extra_rules.end();

245 ++iter) {	222 ++iter) {

246 if (rules.find(*iter) == rules.end()) {	223 if (rules->find(iter->first) == rules->end()) {

247 Rule rule;	224 (*rules)[iter->first] = iter->second;

248 rule.exception = false;

249 rule.wildcard = false;

250 rules[*iter] = rule;

251 }	225 }

252 }	226 }

253	227

	228 return result;

	229 }

	230

	231 NormalizeResult NormalizeFile(const base::FilePath& in_filename,

	232 const base::FilePath& out_filename) {

	233 RuleMap rules;

	234 std::string data;

	235 if (!file_util::ReadFileToString(in_filename, &data)) {

	236 LOG(ERROR) << "Unable to read file";

	237 // We return success since we've already reported the error.

	238 return kSuccess;

	239 }

	240

	241 NormalizeResult result = NormalizeDataToRuleMap(data, &rules);

	242

254 if (!WriteRules(rules, out_filename)) {	243 if (!WriteRules(rules, out_filename)) {

255 LOG(ERROR) << "Error(s) writing output file";	244 LOG(ERROR) << "Error(s) writing output file";

256 result = kError;	245 result = kError;

257 }	246 }

258	247

259 return result;	248 return result;

260 }	249 }

261	250

262 int main(int argc, const char* argv[]) {

263 base::EnableTerminationOnHeapCorruption();

264 if (argc != 1) {

265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");

266 fprintf(stderr, "Usage: %s\n", argv[0]);

267 return 1;

268 }

269	251

270 // Manages the destruction of singletons.	252 } // namespace tld_cleanup

271 base::AtExitManager exit_manager;	253 } // namespace net

272

273 // Only use OutputDebugString in debug mode.

274 #ifdef NDEBUG

275 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;

276 #else

277 logging::LoggingDestination destination =

278 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;

279 #endif

280

281 CommandLine::Init(argc, argv);

282

283 base::FilePath log_filename;

284 PathService::Get(base::DIR_EXE, &log_filename);

285 log_filename = log_filename.AppendASCII("tld_cleanup.log");

286 logging::InitLogging(

287 log_filename.value().c_str(),

288 destination,

289 logging::LOCK_LOG_FILE,

290 logging::DELETE_OLD_LOG_FILE,

291 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);

292

293 icu_util::Initialize();

294

295 base::FilePath input_file;

296 PathService::Get(base::DIR_SOURCE_ROOT, &input_file);

297 input_file = input_file.Append(FILE_PATH_LITERAL("net"))

298 .Append(FILE_PATH_LITERAL("base"))

299 .Append(FILE_PATH_LITERAL(

300 "registry_controlled_domains"))

301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));

302 base::FilePath output_file;

303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);

304 output_file = output_file.Append(FILE_PATH_LITERAL("net"))

305 .Append(FILE_PATH_LITERAL("base"))

306 .Append(FILE_PATH_LITERAL(

307 "registry_controlled_domains"))

308 .Append(FILE_PATH_LITERAL(

309 "effective_tld_names.gperf"));

310 NormalizeResult result = NormalizeFile(input_file, output_file);

311 if (result != kSuccess) {

312 fprintf(stderr,

313 "Errors or warnings processing file. See log in tld_cleanup.log.");

314 }

315

316 if (result == kError)

317 return 1;

318 return 0;

319 }

OLD	NEW

« no previous file with comments | « net/tools/tld_cleanup/tld_cleanup_util.h ('k') | net/tools/tld_cleanup/tld_cleanup_util_unittest.cc » ('j') | no next file with comments »