net/tools/tld_cleanup/tld_cleanup_util.cc - Issue 992733002: Remove //net (except for Android test stuff) and sdch

Side by Side Diff: net/tools/tld_cleanup/tld_cleanup_util.cc

Issue 992733002: Remove //net (except for Android test stuff) and sdch (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"

6

7 #include "base/files/file_util.h"

8 #include "base/logging.h"

9 #include "base/strings/string_number_conversions.h"

10 #include "base/strings/string_util.h"

11 #include "url/gurl.h"

12 #include "url/url_parse.h"

13

namespace {

// Marker comments in the input data that open and close the section whose
// rules are flagged as private.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Flag values written into the "type" column of the generated rule table.
// kExceptionRule and kWildcardRule are mutually exclusive; kPrivateRule is
// combined with either of them (or with 0) for rules from the private section.
const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;

}  // namespace

23

24 namespace net {

25 namespace tld_cleanup {

26

27 // Writes the list of domain rules contained in the 'rules' set to the

28 // 'outfile', with each rule terminated by a LF. The file must already have

29 // been created with write access.

30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {

31 std::string data;

32 data.append("%{\n"

33 "// Copyright 2012 The Chromium Authors. All rights reserved.\n"

34 "// Use of this source code is governed by a BSD-style license "

35 "that can be\n"

36 "// found in the LICENSE file.\n\n"

37 "// This file is generated by net/tools/tld_cleanup/.\n"

38 "// DO NOT MANUALLY EDIT!\n"

39 "%}\n"

40 "struct DomainRule {\n"

41 " int name_offset;\n"

42 " int type; // flags: 1: exception, 2: wildcard, 4: private\n"

43 "};\n"

44 "%%\n");

45

46 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {

47 data.append(i->first);

48 data.append(", ");

49 int type = 0;

50 if (i->second.exception) {

51 type = kExceptionRule;

52 } else if (i->second.wildcard) {

53 type = kWildcardRule;

54 }

55 if (i->second.is_private) {

56 type += kPrivateRule;

57 }

58 data.append(base::IntToString(type));

59 data.append("\n");

60 }

61

62 data.append("%%\n");

63

64 int written = base::WriteFile(outfile,

65 data.data(),

66 static_cast<int>(data.size()));

67

68 return written == static_cast<int>(data.size());

69 }

70

71 // Adjusts the rule to a standard form: removes single extraneous dots and

72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as

73 // valid; logs a warning and returns kWarning if it is probably invalid; and

74 // logs an error and returns kError if the rule is (almost) certainly invalid.

75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {

76 NormalizeResult result = kSuccess;

77

78 // Strip single leading and trailing dots.

79 if (domain->at(0) == '.')

80 domain->erase(0, 1);

81 if (domain->empty()) {

82 LOG(WARNING) << "Ignoring empty rule";

83 return kWarning;

84 }

85 if (domain->at(domain->size() - 1) == '.')

86 domain->erase(domain->size() - 1, 1);

87 if (domain->empty()) {

88 LOG(WARNING) << "Ignoring empty rule";

89 return kWarning;

90 }

91

92 // Allow single leading '*.' or '!', saved here so it's not canonicalized.

93 size_t start_offset = 0;

94 if (domain->at(0) == '!') {

95 domain->erase(0, 1);

96 rule->exception = true;

97 } else if (domain->find("*.") == 0) {

98 domain->erase(0, 2);

99 rule->wildcard = true;

100 }

101 if (domain->empty()) {

102 LOG(WARNING) << "Ignoring empty rule";

103 return kWarning;

104 }

105

106 // Warn about additional '*.' or '!'.

107 if (domain->find("*.", start_offset) != std::string::npos \|\|

108 domain->find('!', start_offset) != std::string::npos) {

109 LOG(WARNING) << "Keeping probably invalid rule: " << *domain;

110 result = kWarning;

111 }

112

113 // Make a GURL and normalize it, then get the host back out.

114 std::string url = "http://";

115 url.append(*domain);

116 GURL gurl(url);

117 const std::string& spec = gurl.possibly_invalid_spec();

118 url::Component host = gurl.parsed_for_possibly_invalid_spec().host;

119 if (host.len < 0) {

120 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;

121 return kError;

122 }

123 if (!gurl.is_valid()) {

124 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;

125 result = kWarning;

126 }

127 domain->assign(spec.substr(host.begin, host.len));

128

129 return result;

130 }

131

132 NormalizeResult NormalizeDataToRuleMap(const std::string data,

133 RuleMap* rules) {

134 CHECK(rules);

135 // We do a lot of string assignment during parsing, but simplicity is more

136 // important than performance here.

137 std::string domain;

138 NormalizeResult result = kSuccess;

139 size_t line_start = 0;

140 size_t line_end = 0;

141 bool is_private = false;

142 RuleMap extra_rules;

143 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;

144 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;

145 while (line_start < data.size()) {

146 if (line_start + begin_private_length < data.size() &&

147 !data.compare(line_start, begin_private_length,

148 kBeginPrivateDomainsComment)) {

149 is_private = true;

150 line_end = line_start + begin_private_length;

151 } else if (line_start + end_private_length < data.size() &&

152 !data.compare(line_start, end_private_length,

153 kEndPrivateDomainsComment)) {

154 is_private = false;

155 line_end = line_start + end_private_length;

156 } else if (line_start + 1 < data.size() &&

157 data[line_start] == '/' &&

158 data[line_start + 1] == '/') {

159 // Skip comments.

160 line_end = data.find_first_of("\r\n", line_start);

161 if (line_end == std::string::npos)

162 line_end = data.size();

163 } else {

164 // Truncate at first whitespace.

165 line_end = data.find_first_of("\r\n \t", line_start);

166 if (line_end == std::string::npos)

167 line_end = data.size();

168 domain.assign(data.data(), line_start, line_end - line_start);

169

170 Rule rule;

171 rule.wildcard = false;

172 rule.exception = false;

173 rule.is_private = is_private;

174 NormalizeResult new_result = NormalizeRule(&domain, &rule);

175 if (new_result != kError) {

176 // Check the existing rules to make sure we don't have an exception and

177 // wildcard for the same rule, or that the same domain is listed as both

178 // private and not private. If we did, we'd have to update our

179 // parsing code to handle this case.

180 CHECK(rules->find(domain) == rules->end())

181 << "Duplicate rule found for " << domain;

182

183 (*rules)[domain] = rule;

184 // Add true TLD for multi-level rules. We don't add them right now, in

185 // case there's an exception or wild card that either exists or might be

186 // added in a later iteration. In those cases, there's no need to add

187 // it and it would just slow down parsing the data.

188 size_t tld_start = domain.find_last_of('.');

189 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {

190 std::string extra_rule_domain = domain.substr(tld_start + 1);

191 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);

192 Rule extra_rule;

193 extra_rule.exception = false;

194 extra_rule.wildcard = false;

195 if (iter == extra_rules.end()) {

196 extra_rule.is_private = is_private;

197 } else {

198 // A rule already exists, so we ensure that if any of the entries is

199 // not private the result should be that the entry is not private.

200 // An example is .au which is not listed as a real TLD, but only

201 // lists second-level domains such as com.au. Subdomains of .au

202 // (eg. blogspot.com.au) are also listed in the private section,

203 // which is processed later, so this ensures that the real TLD

204 // (eg. .au) is listed as public.

205 extra_rule.is_private = is_private && iter->second.is_private;

206 }

207 extra_rules[extra_rule_domain] = extra_rule;

208 }

209 }

210 result = std::max(result, new_result);

211 }

212

213 // Find beginning of next non-empty line.

214 line_start = data.find_first_of("\r\n", line_end);

215 if (line_start == std::string::npos)

216 line_start = data.size();

217 line_start = data.find_first_not_of("\r\n", line_start);

218 if (line_start == std::string::npos)

219 line_start = data.size();

220 }

221

222 for (RuleMap::const_iterator iter = extra_rules.begin();

223 iter != extra_rules.end();

224 ++iter) {

225 if (rules->find(iter->first) == rules->end()) {

226 (*rules)[iter->first] = iter->second;

227 }

228 }

229

230 return result;

231 }

232

233 NormalizeResult NormalizeFile(const base::FilePath& in_filename,

234 const base::FilePath& out_filename) {

235 RuleMap rules;

236 std::string data;

237 if (!base::ReadFileToString(in_filename, &data)) {

238 LOG(ERROR) << "Unable to read file";

239 // We return success since we've already reported the error.

240 return kSuccess;

241 }

242

243 NormalizeResult result = NormalizeDataToRuleMap(data, &rules);

244

245 if (!WriteRules(rules, out_filename)) {

246 LOG(ERROR) << "Error(s) writing output file";

247 result = kError;

248 }

249

250 return result;

251 }

252

253

254 } // namespace tld_cleanup

255 } // namespace net

OLD	NEW

« no previous file with comments | « net/tools/tld_cleanup/tld_cleanup_util.h ('k') | net/tools/tld_cleanup/tld_cleanup_util_unittest.cc » ('j') | no next file with comments »