net/tools/tld_cleanup/tld_cleanup_util.cc - Issue 13979002: Add support for split PSL list distinctions.

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup_util.cc

Issue 13979002: Add support for split PSL list distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Added const modifiers (created 7 years, 7 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // This command-line program converts an effective-TLD data file in UTF-8 from	5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"

6 // the format provided by Mozilla to the format expected by Chrome. This

7 // program generates an intermediate file which is then used by gperf to

8 // generate a perfect hash map. The benefit of this approach is that no time is

9 // spent on program initialization to generate the map of this data.

10 //

11 // Running this program finds "effective_tld_names.dat" in the expected location

12 // in the source checkout and generates "effective_tld_names.gperf" next to it.

13 //

14 // Any errors or warnings from this program are recorded in tld_cleanup.log.

15 //

16 // In particular, it

17 // * Strips blank lines and comments, as well as notes for individual rules.

18 // * Strips a single leading and/or trailing dot from each rule, if present.

19 // * Logs a warning if a rule contains '!' or '*.' other than at the beginning

20 // of the rule. (This also catches multiple ! or *. at the start of a rule.)

21 // * Logs a warning if GURL reports a rule as invalid, but keeps the rule.

22 // * Canonicalizes each rule's domain by converting it to a GURL and back.

23 // * Adds explicit rules for true TLDs found in any rule.

24 // TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.

25 // * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="

26 // and "// ===END PRIVATE DOMAINS===".

27	6

28 #include <map>

29 #include <set>

30 #include <string>

31

32 #include "base/at_exit.h"

33 #include "base/command_line.h"

34 #include "base/file_util.h"	7 #include "base/file_util.h"

35 #include "base/file_util.h"

36 #include "base/files/file_path.h"

37 #include "base/i18n/icu_util.h"

38 #include "base/logging.h"	8 #include "base/logging.h"

39 #include "base/path_service.h"

40 #include "base/process_util.h"

41 #include "base/string_util.h"	9 #include "base/string_util.h"

42 #include "googleurl/src/gurl.h"	10 #include "googleurl/src/gurl.h"

43 #include "googleurl/src/url_parse.h"	11 #include "googleurl/src/url_parse.h"

44	12

45 namespace {	13 namespace {

46 struct Rule {

47 bool exception;

48 bool wildcard;

49 };

50

51 typedef std::map<std::string, Rule> RuleMap;

52 typedef std::set<std::string> RuleSet;

53	14

54 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";	15 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";

55 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";	16 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

56 }	17 }

57	18

	19 namespace net {

	20 namespace tld_cleanup {

	21

58 // Writes the list of domain rules contained in the 'rules' set to the	22 // Writes the list of domain rules contained in the 'rules' set to the

59 // 'outfile', with each rule terminated by a LF. The file must already have	23 // 'outfile', with each rule terminated by a LF. The file must already have

60 // been created with write access.	24 // been created with write access.

61 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {	25 bool WriteRules(const RuleMap& rules, base::FilePath* outfile) {

62 std::string data;	26 std::string data;

63 data.append(	27 data.append(

64 "%{\n"	28 "%{\n"

65 "// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"	29 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"

66 "// Use of this source code is governed by a BSD-style license that can be\n"	30 "// Use of this source code is governed by a BSD-style license that can be\n"

67 "// found in the LICENSE file.\n\n"	31 "// found in the LICENSE file.\n\n"

68 "// This file is generated by net/tools/tld_cleanup/.\n"	32 "// This file is generated by net/tools/tld_cleanup/.\n"

69 "// DO NOT MANUALLY EDIT!\n"	33 "// DO NOT MANUALLY EDIT!\n"

70 "%}\n"	34 "%}\n"

71 "struct DomainRule {\n"	35 "struct DomainRule {\n"

72 " const char *name;\n"	36 " const char *name;\n"

73 " int type; // 1: exception, 2: wildcard\n"	37 " int type; // 1: exception, 2: wildcard\n"

	38 " bool is_private;\n"

74 "};\n"	39 "};\n"

75 "%%\n"	40 "%%\n"

76 );	41 );

77	42

78 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {	43 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {

79 data.append(i->first);	44 data.append(i->first);

80 data.append(", ");	45 data.append(", ");

81 if (i->second.exception) {	46 if (i->second.exception) {

82 data.append("1");	47 data.append("1");

83 } else if (i->second.wildcard) {	48 } else if (i->second.wildcard) {

84 data.append("2");	49 data.append("2");

85 } else {	50 } else {

86 data.append("0");	51 data.append("0");

87 }	52 }

	53 if (i->second.is_private) {

	54 data.append(", true");

	55 } else {

	56 data.append(", false");

	57 }

88 data.append("\n");	58 data.append("\n");

89 }	59 }

90	60

91 data.append("%%\n");	61 data.append("%%\n");

92	62

93 int written = file_util::WriteFile(outfile, data.data(), data.size());	63 int written = file_util::WriteFile(*outfile, data.data(), data.size());

94	64

95 return written == static_cast<int>(data.size());	65 return written == static_cast<int>(data.size());

96 }	66 }

97	67

98 // These result codes should be in increasing order of severity.

99 typedef enum {

100 kSuccess,

101 kWarning,

102 kError,

103 } NormalizeResult;

104

105 // Adjusts the rule to a standard form: removes single extraneous dots and	68 // Adjusts the rule to a standard form: removes single extraneous dots and

106 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as	69 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as

107 // valid; logs a warning and returns kWarning if it is probably invalid; and	70 // valid; logs a warning and returns kWarning if it is probably invalid; and

108 // logs an error and returns kError if the rule is (almost) certainly invalid.	71 // logs an error and returns kError if the rule is (almost) certainly invalid.

109 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {	72 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {

110 NormalizeResult result = kSuccess;	73 NormalizeResult result = kSuccess;

111	74

112 // Strip single leading and trailing dots.	75 // Strip single leading and trailing dots.

113 if (domain->at(0) == '.')	76 if (domain->at(0) == '.')

114 domain->erase(0, 1);	77 domain->erase(0, 1);

(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
156 }	119 }

157 if (!gurl.is_valid()) {	120 if (!gurl.is_valid()) {

158 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;	121 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;

159 result = kWarning;	122 result = kWarning;

160 }	123 }

161 domain->assign(spec.substr(host.begin, host.len));	124 domain->assign(spec.substr(host.begin, host.len));

162	125

163 return result;	126 return result;

164 }	127 }

165	128

166 // Loads the file described by 'in_filename', converts it to the desired format	129 NormalizeResult NormalizeDataToRuleMap(const std::string data,

167 // (see the file comments above), and saves it into 'out_filename'. Returns	130 RuleMap* rules) {

168 // the most severe of the result codes encountered when normalizing the rules.	131 CHECK(rules);

169 NormalizeResult NormalizeFile(const base::FilePath& in_filename,

170 const base::FilePath& out_filename) {

171 std::string data;

172 if (!file_util::ReadFileToString(in_filename, &data)) {

173 LOG(ERROR) << "Unable to read file";

174 // We return success since we've already reported the error.

175 return kSuccess;

176 }

177

178 // We do a lot of string assignment during parsing, but simplicity is more	132 // We do a lot of string assignment during parsing, but simplicity is more

179 // important than performance here.	133 // important than performance here.

180 std::string domain;	134 std::string domain;

181 NormalizeResult result = kSuccess;	135 NormalizeResult result = kSuccess;

182 size_t line_start = 0;	136 size_t line_start = 0;

183 size_t line_end = 0;	137 size_t line_end = 0;

184 RuleMap rules;	138 bool is_private = false;

185 RuleSet extra_rules;	139 RuleMap extra_rules;

186 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;	140 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;

	141 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;

187 while (line_start < data.size()) {	142 while (line_start < data.size()) {

188 // Skip the entire section of private domains.

189 // TODO(pamg): remove this when http://crbug.com/96086 is fixed.

190 if (line_start + begin_private_length < data.size() &&	143 if (line_start + begin_private_length < data.size() &&

191 !data.compare(line_start, begin_private_length,	144 !data.compare(line_start, begin_private_length,

192 kBeginPrivateDomainsComment)) {	145 kBeginPrivateDomainsComment)) {

193 line_end = data.find(kEndPrivateDomainsComment, line_start);	146 is_private = true;

194 if (line_end == std::string::npos) {	147 line_end = line_start + begin_private_length;

195 LOG(WARNING) << "Private-domain section had no end marker.";	148 } else if (line_start + end_private_length < data.size() &&

196 line_end = data.size();	149 !data.compare(line_start, end_private_length,

197 }	150 kEndPrivateDomainsComment)) {

	151 is_private = false;

	152 line_end = line_start + end_private_length;

198 } else if (line_start + 1 < data.size() &&	153 } else if (line_start + 1 < data.size() &&

199 data[line_start] == '/' &&	154 data[line_start] == '/' &&

200 data[line_start + 1] == '/') {	155 data[line_start + 1] == '/') {

201 // Skip comments.	156 // Skip comments.

202 line_end = data.find_first_of("\r\n", line_start);	157 line_end = data.find_first_of("\r\n", line_start);

203 if (line_end == std::string::npos)	158 if (line_end == std::string::npos)

204 line_end = data.size();	159 line_end = data.size();

205 } else {	160 } else {

206 // Truncate at first whitespace.	161 // Truncate at first whitespace.

207 line_end = data.find_first_of("\r\n \t", line_start);	162 line_end = data.find_first_of("\r\n \t", line_start);

208 if (line_end == std::string::npos)	163 if (line_end == std::string::npos)

209 line_end = data.size();	164 line_end = data.size();

210 domain.assign(data.data(), line_start, line_end - line_start);	165 domain.assign(data.data(), line_start, line_end - line_start);

211	166

212 Rule rule;	167 Rule rule;

213 rule.wildcard = false;	168 rule.wildcard = false;

214 rule.exception = false;	169 rule.exception = false;

	170 rule.is_private = is_private;

215 NormalizeResult new_result = NormalizeRule(&domain, &rule);	171 NormalizeResult new_result = NormalizeRule(&domain, &rule);

216 if (new_result != kError) {	172 if (new_result != kError) {

217 // Check the existing rules to make sure we don't have an exception and	173 // Check the existing rules to make sure we don't have an exception and

218 // wildcard for the same rule. If we did, we'd have to update our	174 // wildcard for the same rule, or that the same domain is listed as both

	175 // private and not private. If we did, we'd have to update our

219 // parsing code to handle this case.	176 // parsing code to handle this case.

220 CHECK(rules.find(domain) == rules.end());	177 CHECK(rules->find(domain) == rules->end());

221	178

222 rules[domain] = rule;	179 (*rules)[domain] = rule;

223 // Add true TLD for multi-level rules. We don't add them right now, in	180 // Add true TLD for multi-level rules. We don't add them right now, in

224 // case there's an exception or wild card that either exists or might be	181 // case there's an exception or wild card that either exists or might be

225 // added in a later iteration. In those cases, there's no need to add	182 // added in a later iteration. In those cases, there's no need to add

226 // it and it would just slow down parsing the data.	183 // it and it would just slow down parsing the data.

227 size_t tld_start = domain.find_last_of('.');	184 size_t tld_start = domain.find_last_of('.');

228 if (tld_start != std::string::npos && tld_start + 1 < domain.size())	185 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {

229 extra_rules.insert(domain.substr(tld_start + 1));	186 std::string extra_rule_domain = domain.substr(tld_start + 1);

	187 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);

	188 Rule extra_rule;

	189 extra_rule.exception = false;

	190 extra_rule.wildcard = false;

	191 if (iter == extra_rules.end()) {

	192 extra_rule.is_private = is_private;

	193 } else {

	194 // A rule already exists, so we ensure that if any of the entries is

	195 // not private the result should be that the entry is not private.

	196 // An example is .au which is not listed as a real TLD, but only

	197 // lists second-level domains such as com.au. Subdomains of .au

	198 // (eg. blogspot.com.au) are also listed in the private section,

	199 // which is processed later, so this ensures that the real TLD

	200 // (eg. .au) is listed as public.

	201 extra_rule.is_private = is_private && iter->second.is_private;

	202 }

	203 extra_rules[extra_rule_domain] = extra_rule;

	204 }

230 }	205 }

231 result = std::max(result, new_result);	206 result = std::max(result, new_result);

232 }	207 }

233	208

234 // Find beginning of next non-empty line.	209 // Find beginning of next non-empty line.

235 line_start = data.find_first_of("\r\n", line_end);	210 line_start = data.find_first_of("\r\n", line_end);

236 if (line_start == std::string::npos)	211 if (line_start == std::string::npos)

237 line_start = data.size();	212 line_start = data.size();

238 line_start = data.find_first_not_of("\r\n", line_start);	213 line_start = data.find_first_not_of("\r\n", line_start);

239 if (line_start == std::string::npos)	214 if (line_start == std::string::npos)

240 line_start = data.size();	215 line_start = data.size();

241 }	216 }

242	217

243 for (RuleSet::const_iterator iter = extra_rules.begin();	218 for (RuleMap::const_iterator iter = extra_rules.begin();

244 iter != extra_rules.end();	219 iter != extra_rules.end();

245 ++iter) {	220 ++iter) {

246 if (rules.find(*iter) == rules.end()) {	221 if (rules->find(iter->first) == rules->end()) {

247 Rule rule;	222 (*rules)[iter->first] = iter->second;

248 rule.exception = false;

249 rule.wildcard = false;

250 rules[*iter] = rule;

251 }	223 }

252 }	224 }

253	225

	226 return result;

	227 }

	228

	229 NormalizeResult NormalizeFile(const base::FilePath& in_filename,

	230 base::FilePath* out_filename) {

	231 DCHECK(out_filename);

	232 RuleMap rules;

	233 std::string data;

	234 if (!file_util::ReadFileToString(in_filename, &data)) {

	235 LOG(ERROR) << "Unable to read file";

	236 // We return success since we've already reported the error.

	237 return kSuccess;

	238 }

	239

	240 NormalizeResult result = NormalizeDataToRuleMap(data, &rules);

	241

254 if (!WriteRules(rules, out_filename)) {	242 if (!WriteRules(rules, out_filename)) {

255 LOG(ERROR) << "Error(s) writing output file";	243 LOG(ERROR) << "Error(s) writing output file";

256 result = kError;	244 result = kError;

257 }	245 }

258	246

259 return result;	247 return result;

260 }	248 }

261	249

262 int main(int argc, const char* argv[]) {

263 base::EnableTerminationOnHeapCorruption();

264 if (argc != 1) {

265 fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");

266 fprintf(stderr, "Usage: %s\n", argv[0]);

267 return 1;

268 }

269	250

270 // Manages the destruction of singletons.	251 } // namespace tld_cleanup

271 base::AtExitManager exit_manager;	252 } // namespace net

272

273 // Only use OutputDebugString in debug mode.

274 #ifdef NDEBUG

275 logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;

276 #else

277 logging::LoggingDestination destination =

278 logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;

279 #endif

280

281 CommandLine::Init(argc, argv);

282

283 base::FilePath log_filename;

284 PathService::Get(base::DIR_EXE, &log_filename);

285 log_filename = log_filename.AppendASCII("tld_cleanup.log");

286 logging::InitLogging(

287 log_filename.value().c_str(),

288 destination,

289 logging::LOCK_LOG_FILE,

290 logging::DELETE_OLD_LOG_FILE,

291 logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);

292

293 icu_util::Initialize();

294

295 base::FilePath input_file;

296 PathService::Get(base::DIR_SOURCE_ROOT, &input_file);

297 input_file = input_file.Append(FILE_PATH_LITERAL("net"))

298 .Append(FILE_PATH_LITERAL("base"))

299 .Append(FILE_PATH_LITERAL(

300 "registry_controlled_domains"))

301 .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));

302 base::FilePath output_file;

303 PathService::Get(base::DIR_SOURCE_ROOT, &output_file);

304 output_file = output_file.Append(FILE_PATH_LITERAL("net"))

305 .Append(FILE_PATH_LITERAL("base"))

306 .Append(FILE_PATH_LITERAL(

307 "registry_controlled_domains"))

308 .Append(FILE_PATH_LITERAL(

309 "effective_tld_names.gperf"));

310 NormalizeResult result = NormalizeFile(input_file, output_file);

311 if (result != kSuccess) {

312 fprintf(stderr,

313 "Errors or warnings processing file. See log in tld_cleanup.log.");

314 }

315

316 if (result == kError)

317 return 1;

318 return 0;

319 }

OLD	NEW

« net/base/static_cookie_policy.cc ('K') | « net/tools/tld_cleanup/tld_cleanup_util.h ('k') | net/tools/tld_cleanup/tld_cleanup_util_unittest.cc » ('j') | no next file with comments »