Index: net/tools/tld_cleanup/tld_cleanup.cc |
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc |
index 1162d983a700a893e5467e95e926452a88ff6235..485bece3db16d1ef2f1629d687136829b43023dd 100644 |
--- a/net/tools/tld_cleanup/tld_cleanup.cc |
+++ b/net/tools/tld_cleanup/tld_cleanup.cc |
@@ -21,243 +21,18 @@ |
// * Logs a warning if GURL reports a rule as invalid, but keeps the rule. |
// * Canonicalizes each rule's domain by converting it to a GURL and back. |
// * Adds explicit rules for true TLDs found in any rule. |
-// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. |
-// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" |
-// and "// ===END PRIVATE DOMAINS===". |
- |
-#include <map> |
-#include <set> |
-#include <string> |
+// * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS===" |
+// and "// ===END PRIVATE DOMAINS===" as private. |
#include "base/at_exit.h" |
#include "base/command_line.h" |
#include "base/file_util.h" |
-#include "base/file_util.h" |
#include "base/files/file_path.h" |
#include "base/i18n/icu_util.h" |
#include "base/logging.h" |
#include "base/path_service.h" |
#include "base/process_util.h" |
-#include "base/string_util.h" |
-#include "googleurl/src/gurl.h" |
-#include "googleurl/src/url_parse.h" |
- |
-namespace { |
-struct Rule { |
- bool exception; |
- bool wildcard; |
-}; |
- |
-typedef std::map<std::string, Rule> RuleMap; |
-typedef std::set<std::string> RuleSet; |
- |
-const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; |
-const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; |
-} |
- |
-// Writes the list of domain rules contained in the 'rules' set to the |
-// 'outfile', with each rule terminated by a LF. The file must already have |
-// been created with write access. |
-bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { |
- std::string data; |
- data.append( |
-"%{\n" |
-"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" |
-"// Use of this source code is governed by a BSD-style license that can be\n" |
-"// found in the LICENSE file.\n\n" |
-"// This file is generated by net/tools/tld_cleanup/.\n" |
-"// DO NOT MANUALLY EDIT!\n" |
-"%}\n" |
-"struct DomainRule {\n" |
-" const char *name;\n" |
-" int type; // 1: exception, 2: wildcard\n" |
-"};\n" |
-"%%\n" |
- ); |
- |
- for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { |
- data.append(i->first); |
- data.append(", "); |
- if (i->second.exception) { |
- data.append("1"); |
- } else if (i->second.wildcard) { |
- data.append("2"); |
- } else { |
- data.append("0"); |
- } |
- data.append("\n"); |
- } |
- |
- data.append("%%\n"); |
- |
- int written = file_util::WriteFile(outfile, data.data(), data.size()); |
- |
- return written == static_cast<int>(data.size()); |
-} |
- |
-// These result codes should be in increasing order of severity. |
-typedef enum { |
- kSuccess, |
- kWarning, |
- kError, |
-} NormalizeResult; |
- |
-// Adjusts the rule to a standard form: removes single extraneous dots and |
-// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
-// valid; logs a warning and returns kWarning if it is probably invalid; and |
-// logs an error and returns kError if the rule is (almost) certainly invalid. |
-NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { |
- NormalizeResult result = kSuccess; |
- |
- // Strip single leading and trailing dots. |
- if (domain->at(0) == '.') |
- domain->erase(0, 1); |
- if (domain->empty()) { |
- LOG(WARNING) << "Ignoring empty rule"; |
- return kWarning; |
- } |
- if (domain->at(domain->size() - 1) == '.') |
- domain->erase(domain->size() - 1, 1); |
- if (domain->empty()) { |
- LOG(WARNING) << "Ignoring empty rule"; |
- return kWarning; |
- } |
- |
- // Allow single leading '*.' or '!', saved here so it's not canonicalized. |
- size_t start_offset = 0; |
- if (domain->at(0) == '!') { |
- domain->erase(0, 1); |
- rule->exception = true; |
- } else if (domain->find("*.") == 0) { |
- domain->erase(0, 2); |
- rule->wildcard = true; |
- } |
- if (domain->empty()) { |
- LOG(WARNING) << "Ignoring empty rule"; |
- return kWarning; |
- } |
- |
- // Warn about additional '*.' or '!'. |
- if (domain->find("*.", start_offset) != std::string::npos || |
- domain->find('!', start_offset) != std::string::npos) { |
- LOG(WARNING) << "Keeping probably invalid rule: " << *domain; |
- result = kWarning; |
- } |
- |
- // Make a GURL and normalize it, then get the host back out. |
- std::string url = "http://"; |
- url.append(*domain); |
- GURL gurl(url); |
- const std::string& spec = gurl.possibly_invalid_spec(); |
- url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host; |
- if (host.len < 0) { |
- LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; |
- return kError; |
- } |
- if (!gurl.is_valid()) { |
- LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; |
- result = kWarning; |
- } |
- domain->assign(spec.substr(host.begin, host.len)); |
- |
- return result; |
-} |
- |
-// Loads the file described by 'in_filename', converts it to the desired format |
-// (see the file comments above), and saves it into 'out_filename'. Returns |
-// the most severe of the result codes encountered when normalizing the rules. |
-NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
- const base::FilePath& out_filename) { |
- std::string data; |
- if (!file_util::ReadFileToString(in_filename, &data)) { |
- LOG(ERROR) << "Unable to read file"; |
- // We return success since we've already reported the error. |
- return kSuccess; |
- } |
- |
- // We do a lot of string assignment during parsing, but simplicity is more |
- // important than performance here. |
- std::string domain; |
- NormalizeResult result = kSuccess; |
- size_t line_start = 0; |
- size_t line_end = 0; |
- RuleMap rules; |
- RuleSet extra_rules; |
- int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; |
- while (line_start < data.size()) { |
- // Skip the entire section of private domains. |
- // TODO(pamg): remove this when http://crbug.com/96086 is fixed. |
- if (line_start + begin_private_length < data.size() && |
- !data.compare(line_start, begin_private_length, |
- kBeginPrivateDomainsComment)) { |
- line_end = data.find(kEndPrivateDomainsComment, line_start); |
- if (line_end == std::string::npos) { |
- LOG(WARNING) << "Private-domain section had no end marker."; |
- line_end = data.size(); |
- } |
- } else if (line_start + 1 < data.size() && |
- data[line_start] == '/' && |
- data[line_start + 1] == '/') { |
- // Skip comments. |
- line_end = data.find_first_of("\r\n", line_start); |
- if (line_end == std::string::npos) |
- line_end = data.size(); |
- } else { |
- // Truncate at first whitespace. |
- line_end = data.find_first_of("\r\n \t", line_start); |
- if (line_end == std::string::npos) |
- line_end = data.size(); |
- domain.assign(data.data(), line_start, line_end - line_start); |
- |
- Rule rule; |
- rule.wildcard = false; |
- rule.exception = false; |
- NormalizeResult new_result = NormalizeRule(&domain, &rule); |
- if (new_result != kError) { |
- // Check the existing rules to make sure we don't have an exception and |
- // wildcard for the same rule. If we did, we'd have to update our |
- // parsing code to handle this case. |
- CHECK(rules.find(domain) == rules.end()); |
- |
- rules[domain] = rule; |
- // Add true TLD for multi-level rules. We don't add them right now, in |
- // case there's an exception or wild card that either exists or might be |
- // added in a later iteration. In those cases, there's no need to add |
- // it and it would just slow down parsing the data. |
- size_t tld_start = domain.find_last_of('.'); |
- if (tld_start != std::string::npos && tld_start + 1 < domain.size()) |
- extra_rules.insert(domain.substr(tld_start + 1)); |
- } |
- result = std::max(result, new_result); |
- } |
- |
- // Find beginning of next non-empty line. |
- line_start = data.find_first_of("\r\n", line_end); |
- if (line_start == std::string::npos) |
- line_start = data.size(); |
- line_start = data.find_first_not_of("\r\n", line_start); |
- if (line_start == std::string::npos) |
- line_start = data.size(); |
- } |
- |
- for (RuleSet::const_iterator iter = extra_rules.begin(); |
- iter != extra_rules.end(); |
- ++iter) { |
- if (rules.find(*iter) == rules.end()) { |
- Rule rule; |
- rule.exception = false; |
- rule.wildcard = false; |
- rules[*iter] = rule; |
- } |
- } |
- |
- if (!WriteRules(rules, out_filename)) { |
- LOG(ERROR) << "Error(s) writing output file"; |
- result = kError; |
- } |
- |
- return result; |
-} |
+#include "net/tools/tld_cleanup/tld_cleanup_util.h" |
int main(int argc, const char* argv[]) { |
base::EnableTerminationOnHeapCorruption(); |
@@ -307,13 +82,14 @@ int main(int argc, const char* argv[]) { |
"registry_controlled_domains")) |
.Append(FILE_PATH_LITERAL( |
"effective_tld_names.gperf")); |
- NormalizeResult result = NormalizeFile(input_file, output_file); |
- if (result != kSuccess) { |
+ net::tld_cleanup::NormalizeResult result = |
+ net::tld_cleanup::NormalizeFile(input_file, output_file); |
+ if (result != net::tld_cleanup::kSuccess) { |
fprintf(stderr, |
"Errors or warnings processing file. See log in tld_cleanup.log."); |
} |
- if (result == kError) |
+ if (result == net::tld_cleanup::kError) |
return 1; |
return 0; |
} |