| Index: net/tools/tld_cleanup/tld_cleanup_util.cc
|
| diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup_util.cc
|
| similarity index 54%
|
| copy from net/tools/tld_cleanup/tld_cleanup.cc
|
| copy to net/tools/tld_cleanup/tld_cleanup_util.cc
|
| index 1162d983a700a893e5467e95e926452a88ff6235..2f5496e0a56286a2686aa21c35bc2861ea8b15c5 100644
|
| --- a/net/tools/tld_cleanup/tld_cleanup.cc
|
| +++ b/net/tools/tld_cleanup/tld_cleanup_util.cc
|
| @@ -1,60 +1,24 @@
|
| -// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
| +// Copyright 2013 The Chromium Authors. All rights reserved.
|
| // Use of this source code is governed by a BSD-style license that can be
|
| // found in the LICENSE file.
|
|
|
| -// This command-line program converts an effective-TLD data file in UTF-8 from
|
| -// the format provided by Mozilla to the format expected by Chrome. This
|
| -// program generates an intermediate file which is then used by gperf to
|
| -// generate a perfect hash map. The benefit of this approach is that no time is
|
| -// spent on program initialization to generate the map of this data.
|
| -//
|
| -// Running this program finds "effective_tld_names.dat" in the expected location
|
| -// in the source checkout and generates "effective_tld_names.gperf" next to it.
|
| -//
|
| -// Any errors or warnings from this program are recorded in tld_cleanup.log.
|
| -//
|
| -// In particular, it
|
| -// * Strips blank lines and comments, as well as notes for individual rules.
|
| -// * Strips a single leading and/or trailing dot from each rule, if present.
|
| -// * Logs a warning if a rule contains '!' or '*.' other than at the beginning
|
| -// of the rule. (This also catches multiple ! or *. at the start of a rule.)
|
| -// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
|
| -// * Canonicalizes each rule's domain by converting it to a GURL and back.
|
| -// * Adds explicit rules for true TLDs found in any rule.
|
| -// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
|
| -// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
|
| -// and "// ===END PRIVATE DOMAINS===".
|
| +#include "net/tools/tld_cleanup/tld_cleanup_util.h"
|
|
|
| -#include <map>
|
| -#include <set>
|
| -#include <string>
|
| -
|
| -#include "base/at_exit.h"
|
| -#include "base/command_line.h"
|
| -#include "base/file_util.h"
|
| #include "base/file_util.h"
|
| -#include "base/files/file_path.h"
|
| -#include "base/i18n/icu_util.h"
|
| #include "base/logging.h"
|
| -#include "base/path_service.h"
|
| -#include "base/process_util.h"
|
| #include "base/string_util.h"
|
| #include "googleurl/src/gurl.h"
|
| #include "googleurl/src/url_parse.h"
|
|
|
| namespace {
|
| -struct Rule {
|
| - bool exception;
|
| - bool wildcard;
|
| -};
|
| -
|
| -typedef std::map<std::string, Rule> RuleMap;
|
| -typedef std::set<std::string> RuleSet;
|
|
|
| const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
|
| const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
|
| }
|
|
|
| +namespace net {
|
| +namespace tld_cleanup {
|
| +
|
| // Writes the list of domain rules contained in the 'rules' set to the
|
| // 'outfile', with each rule terminated by a LF. The file must already have
|
| // been created with write access.
|
| @@ -62,7 +26,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| std::string data;
|
| data.append(
|
| "%{\n"
|
| -"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"
|
| +"// Copyright 2012 The Chromium Authors. All rights reserved.\n"
|
| "// Use of this source code is governed by a BSD-style license that can be\n"
|
| "// found in the LICENSE file.\n\n"
|
| "// This file is generated by net/tools/tld_cleanup/.\n"
|
| @@ -71,6 +35,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| "struct DomainRule {\n"
|
| " const char *name;\n"
|
| " int type; // 1: exception, 2: wildcard\n"
|
| +" bool is_private;\n"
|
| "};\n"
|
| "%%\n"
|
| );
|
| @@ -85,6 +50,11 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| } else {
|
| data.append("0");
|
| }
|
| + if (i->second.is_private) {
|
| + data.append(", true");
|
| + } else {
|
| + data.append(", false");
|
| + }
|
| data.append("\n");
|
| }
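
The net effect of the struct and serialization changes above is a third
column in the generated gperf data. Below is a minimal, self-contained
sketch of the per-rule output; the Rule fields mirror the patch, but the
sample domains and the 1/exception, 2/wildcard ordering are assumptions
taken from the struct comment rather than from code shown in this diff.

    #include <cstdio>
    #include <map>
    #include <string>

    // Mirrors the Rule struct as extended by this patch (assumed to be
    // declared in tld_cleanup_util.h).
    struct Rule {
      bool exception;
      bool wildcard;
      bool is_private;
    };

    int main() {
      // Hypothetical rules; the real input is effective_tld_names.dat.
      std::map<std::string, Rule> rules = {
          {"jp", {false, false, false}},           // ordinary public TLD
          {"blogspot.com", {false, false, true}},  // private-section entry
      };
      std::string data;
      for (const auto& entry : rules) {
        data.append(entry.first);
        data.append(", ");
        if (entry.second.exception)
          data.append("1");
        else if (entry.second.wildcard)
          data.append("2");
        else
          data.append("0");
        // The new third column added by this patch.
        data.append(entry.second.is_private ? ", true" : ", false");
        data.append("\n");
      }
      // Prints (std::map iterates alphabetically):
      //   blogspot.com, 0, true
      //   jp, 0, false
      std::printf("%s", data.c_str());
      return 0;
    }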
|
|
|
| @@ -95,13 +65,6 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| return written == static_cast<int>(data.size());
|
| }
|
|
|
| -// These result codes should be in increasing order of severity.
|
| -typedef enum {
|
| - kSuccess,
|
| - kWarning,
|
| - kError,
|
| -} NormalizeResult;
|
| -
|
| // Adjusts the rule to a standard form: removes single extraneous dots and
|
| // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
|
| // valid; logs a warning and returns kWarning if it is probably invalid; and
|
| @@ -163,38 +126,30 @@ NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
|
| return result;
|
| }
|
|
|
| -// Loads the file described by 'in_filename', converts it to the desired format
|
| -// (see the file comments above), and saves it into 'out_filename'. Returns
|
| -// the most severe of the result codes encountered when normalizing the rules.
|
| -NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| - const base::FilePath& out_filename) {
|
| - std::string data;
|
| - if (!file_util::ReadFileToString(in_filename, &data)) {
|
| - LOG(ERROR) << "Unable to read file";
|
| - // We return success since we've already reported the error.
|
| - return kSuccess;
|
| - }
|
| -
|
| +NormalizeResult NormalizeDataToRuleMap(const std::string data,
|
| + RuleMap* rules) {
|
| + CHECK(rules);
|
| // We do a lot of string assignment during parsing, but simplicity is more
|
| // important than performance here.
|
| std::string domain;
|
| NormalizeResult result = kSuccess;
|
| size_t line_start = 0;
|
| size_t line_end = 0;
|
| - RuleMap rules;
|
| - RuleSet extra_rules;
|
| + bool is_private = false;
|
| + RuleMap extra_rules;
|
| int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
|
| + int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
|
| while (line_start < data.size()) {
|
| - // Skip the entire section of private domains.
|
| - // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
|
| if (line_start + begin_private_length < data.size() &&
|
| !data.compare(line_start, begin_private_length,
|
| kBeginPrivateDomainsComment)) {
|
| - line_end = data.find(kEndPrivateDomainsComment, line_start);
|
| - if (line_end == std::string::npos) {
|
| - LOG(WARNING) << "Private-domain section had no end marker.";
|
| - line_end = data.size();
|
| - }
|
| + is_private = true;
|
| + line_end = line_start + begin_private_length;
|
| + } else if (line_start + end_private_length < data.size() &&
|
| + !data.compare(line_start, end_private_length,
|
| + kEndPrivateDomainsComment)) {
|
| + is_private = false;
|
| + line_end = line_start + end_private_length;
|
| } else if (line_start + 1 < data.size() &&
|
| data[line_start] == '/' &&
|
| data[line_start + 1] == '/') {
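
Instead of skipping everything between the two markers (the behavior removed
above), the parser now just flips an is_private flag when it reaches them.
Here is a stripped-down sketch of that toggle, reading whole lines for
simplicity rather than the byte-offset scanning the real code uses; the
marker strings are copied from the file, and the sample input is made up.

    #include <iostream>
    #include <string>

    const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
    const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

    int main() {
      const std::string data =
          "jp\n"
          "// ===BEGIN PRIVATE DOMAINS===\n"
          "blogspot.com\n"
          "// ===END PRIVATE DOMAINS===\n";

      bool is_private = false;
      size_t line_start = 0;
      while (line_start < data.size()) {
        size_t line_end = data.find('\n', line_start);
        if (line_end == std::string::npos)
          line_end = data.size();
        const std::string line = data.substr(line_start, line_end - line_start);
        if (line == kBeginPrivateDomainsComment) {
          is_private = true;   // everything below is in the private section
        } else if (line == kEndPrivateDomainsComment) {
          is_private = false;  // back to the public section
        } else if (!line.empty() && line.compare(0, 2, "//") != 0) {
          // A rule line: in the real parser it would get
          // rule.is_private = is_private.
          std::cout << line << (is_private ? " [private]" : " [public]") << "\n";
        }
        line_start = line_end + 1;
      }
      return 0;
    }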
|
| @@ -212,21 +167,41 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| Rule rule;
|
| rule.wildcard = false;
|
| rule.exception = false;
|
| + rule.is_private = is_private;
|
| NormalizeResult new_result = NormalizeRule(&domain, &rule);
|
| if (new_result != kError) {
|
| // Check the existing rules to make sure we don't have an exception and
|
| - // wildcard for the same rule. If we did, we'd have to update our
|
| + // wildcard for the same rule, or the same domain listed as both
|
| + // private and not private. If we did, we'd have to update our
|
| // parsing code to handle this case.
|
| - CHECK(rules.find(domain) == rules.end());
|
| + CHECK(rules->find(domain) == rules->end());
|
|
|
| - rules[domain] = rule;
|
| + (*rules)[domain] = rule;
|
| // Add true TLD for multi-level rules. We don't add them right now, in
|
| // case there's an exception or wild card that either exists or might be
|
| // added in a later iteration. In those cases, there's no need to add
|
| // it and it would just slow down parsing the data.
|
| size_t tld_start = domain.find_last_of('.');
|
| - if (tld_start != std::string::npos && tld_start + 1 < domain.size())
|
| - extra_rules.insert(domain.substr(tld_start + 1));
|
| + if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
|
| + std::string extra_rule_domain = domain.substr(tld_start + 1);
|
| + RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
|
| + Rule extra_rule;
|
| + extra_rule.exception = false;
|
| + extra_rule.wildcard = false;
|
| + if (iter == extra_rules.end()) {
|
| + extra_rule.is_private = is_private;
|
| + } else {
|
| + // A rule for this TLD already exists. If any of the rules that
|
| + // imply it is public (not private), the merged entry must be public too.
|
| + // An example is .au, which is not listed as a TLD itself; only its
|
| + // second-level domains, such as com.au, are listed. Subdomains of .au
|
| + // (e.g. blogspot.com.au) are also listed in the private section,
|
| + // which is processed later, so this ensures that the implied TLD
|
| + // (e.g. .au) is listed as public.
|
| + extra_rule.is_private = is_private && iter->second.is_private;
|
| + }
|
| + extra_rules[extra_rule_domain] = extra_rule;
|
| + }
|
| }
|
| result = std::max(result, new_result);
|
| }
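
The merge in this hunk is easiest to check with a worked example: com.au
appears in the public part of the data and blogspot.com.au in the private
part, and the implied TLD "au" has to come out public. A small standalone
check of that rule follows; the helper name and inline rules are
hypothetical, and only the is_private && ... merge is taken from the patch.

    #include <cassert>
    #include <map>
    #include <string>

    struct Rule {
      bool exception;
      bool wildcard;
      bool is_private;
    };

    // Hypothetical helper mirroring the extra_rules merge: the implied TLD of
    // a multi-level rule stays public if any rule that implies it is public.
    void AddImpliedTld(const std::string& domain, bool is_private,
                       std::map<std::string, Rule>* extra_rules) {
      size_t tld_start = domain.find_last_of('.');
      if (tld_start == std::string::npos || tld_start + 1 >= domain.size())
        return;
      std::string tld = domain.substr(tld_start + 1);
      std::map<std::string, Rule>::const_iterator iter = extra_rules->find(tld);
      Rule extra_rule = {false, false, false};
      extra_rule.is_private = (iter == extra_rules->end())
                                  ? is_private
                                  : (is_private && iter->second.is_private);
      (*extra_rules)[tld] = extra_rule;
    }

    int main() {
      std::map<std::string, Rule> extra_rules;
      AddImpliedTld("com.au", false /* public section */, &extra_rules);
      AddImpliedTld("blogspot.com.au", true /* private section */, &extra_rules);
      // "au" was implied by at least one public rule, so it stays public.
      assert(!extra_rules["au"].is_private);
      return 0;
    }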
|
| @@ -240,80 +215,37 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| line_start = data.size();
|
| }
|
|
|
| - for (RuleSet::const_iterator iter = extra_rules.begin();
|
| + for (RuleMap::const_iterator iter = extra_rules.begin();
|
| iter != extra_rules.end();
|
| ++iter) {
|
| - if (rules.find(*iter) == rules.end()) {
|
| - Rule rule;
|
| - rule.exception = false;
|
| - rule.wildcard = false;
|
| - rules[*iter] = rule;
|
| + if (rules->find(iter->first) == rules->end()) {
|
| + (*rules)[iter->first] = iter->second;
|
| }
|
| }
|
|
|
| - if (!WriteRules(rules, out_filename)) {
|
| - LOG(ERROR) << "Error(s) writing output file";
|
| - result = kError;
|
| - }
|
| -
|
| return result;
|
| }
|
|
|
| -int main(int argc, const char* argv[]) {
|
| - base::EnableTerminationOnHeapCorruption();
|
| - if (argc != 1) {
|
| - fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
|
| - fprintf(stderr, "Usage: %s\n", argv[0]);
|
| - return 1;
|
| +NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| + const base::FilePath& out_filename) {
|
| + RuleMap rules;
|
| + std::string data;
|
| + if (!file_util::ReadFileToString(in_filename, &data)) {
|
| + LOG(ERROR) << "Unable to read file";
|
| + // We return success since we've already reported the error.
|
| + return kSuccess;
|
| }
|
|
|
| - // Manages the destruction of singletons.
|
| - base::AtExitManager exit_manager;
|
| -
|
| - // Only use OutputDebugString in debug mode.
|
| -#ifdef NDEBUG
|
| - logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
|
| -#else
|
| - logging::LoggingDestination destination =
|
| - logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
|
| -#endif
|
| -
|
| - CommandLine::Init(argc, argv);
|
| + NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
|
|
|
| - base::FilePath log_filename;
|
| - PathService::Get(base::DIR_EXE, &log_filename);
|
| - log_filename = log_filename.AppendASCII("tld_cleanup.log");
|
| - logging::InitLogging(
|
| - log_filename.value().c_str(),
|
| - destination,
|
| - logging::LOCK_LOG_FILE,
|
| - logging::DELETE_OLD_LOG_FILE,
|
| - logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
|
| -
|
| - icu_util::Initialize();
|
| -
|
| - base::FilePath input_file;
|
| - PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
|
| - input_file = input_file.Append(FILE_PATH_LITERAL("net"))
|
| - .Append(FILE_PATH_LITERAL("base"))
|
| - .Append(FILE_PATH_LITERAL(
|
| - "registry_controlled_domains"))
|
| - .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
|
| - base::FilePath output_file;
|
| - PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
|
| - output_file = output_file.Append(FILE_PATH_LITERAL("net"))
|
| - .Append(FILE_PATH_LITERAL("base"))
|
| - .Append(FILE_PATH_LITERAL(
|
| - "registry_controlled_domains"))
|
| - .Append(FILE_PATH_LITERAL(
|
| - "effective_tld_names.gperf"));
|
| - NormalizeResult result = NormalizeFile(input_file, output_file);
|
| - if (result != kSuccess) {
|
| - fprintf(stderr,
|
| - "Errors or warnings processing file. See log in tld_cleanup.log.");
|
| + if (!WriteRules(rules, out_filename)) {
|
| + LOG(ERROR) << "Error(s) writing output file";
|
| + result = kError;
|
| }
|
|
|
| - if (result == kError)
|
| - return 1;
|
| - return 0;
|
| + return result;
|
| }
|
| +
|
| +
|
| +} // namespace tld_cleanup
|
| +} // namespace net
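
The point of pulling NormalizeDataToRuleMap out of main() is that the parsing
becomes testable without touching the filesystem. Below is a hedged sketch of
how a gtest unit test might exercise it; the test file name is hypothetical,
and it assumes Rule, RuleMap, NormalizeResult and the function declarations
all live in tld_cleanup_util.h, which is not shown in this diff.

    // Hypothetical net/tools/tld_cleanup/tld_cleanup_util_unittest.cc
    #include "net/tools/tld_cleanup/tld_cleanup_util.h"

    #include <string>

    #include "testing/gtest/include/gtest/gtest.h"

    namespace net {
    namespace tld_cleanup {

    TEST(TldCleanupUtilTest, PrivateSectionMarksRulesPrivate) {
      std::string data =
          "com\n"
          "// ===BEGIN PRIVATE DOMAINS===\n"
          "blogspot.com\n"
          "// ===END PRIVATE DOMAINS===\n";
      RuleMap rules;
      EXPECT_EQ(kSuccess, NormalizeDataToRuleMap(data, &rules));
      // "com" is a single-level public rule; "blogspot.com" comes from the
      // private section, and its implied TLD "com" already exists.
      ASSERT_EQ(2u, rules.size());
      EXPECT_FALSE(rules["com"].is_private);
      EXPECT_TRUE(rules["blogspot.com"].is_private);
    }

    }  // namespace tld_cleanup
    }  // namespace net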
|
|
|