Index: net/tools/tld_cleanup/tld_cleanup_util.cc |
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup_util.cc |
similarity index 54% |
copy from net/tools/tld_cleanup/tld_cleanup.cc |
copy to net/tools/tld_cleanup/tld_cleanup_util.cc |
index 1162d983a700a893e5467e95e926452a88ff6235..e291b02239db9c244bbb9b3e65c7d56543678495 100644 |
--- a/net/tools/tld_cleanup/tld_cleanup.cc |
+++ b/net/tools/tld_cleanup/tld_cleanup_util.cc |
@@ -1,60 +1,24 @@ |
-// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
+// Copyright 2013 The Chromium Authors. All rights reserved. |
// Use of this source code is governed by a BSD-style license that can be |
// found in the LICENSE file. |
-// This command-line program converts an effective-TLD data file in UTF-8 from |
-// the format provided by Mozilla to the format expected by Chrome. This |
-// program generates an intermediate file which is then used by gperf to |
-// generate a perfect hash map. The benefit of this approach is that no time is |
-// spent on program initialization to generate the map of this data. |
-// |
-// Running this program finds "effective_tld_names.dat" in the expected location |
-// in the source checkout and generates "effective_tld_names.gperf" next to it. |
-// |
-// Any errors or warnings from this program are recorded in tld_cleanup.log. |
-// |
-// In particular, it |
-// * Strips blank lines and comments, as well as notes for individual rules. |
-// * Strips a single leading and/or trailing dot from each rule, if present. |
-// * Logs a warning if a rule contains '!' or '*.' other than at the beginning |
-// of the rule. (This also catches multiple ! or *. at the start of a rule.) |
-// * Logs a warning if GURL reports a rule as invalid, but keeps the rule. |
-// * Canonicalizes each rule's domain by converting it to a GURL and back. |
-// * Adds explicit rules for true TLDs found in any rule. |
-// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed. |
-// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS===" |
-// and "// ===END PRIVATE DOMAINS===". |
+#include "net/tools/tld_cleanup/tld_cleanup_util.h" |
-#include <map> |
-#include <set> |
-#include <string> |
- |
-#include "base/at_exit.h" |
-#include "base/command_line.h" |
-#include "base/file_util.h" |
#include "base/file_util.h" |
-#include "base/files/file_path.h" |
-#include "base/i18n/icu_util.h" |
#include "base/logging.h" |
-#include "base/path_service.h" |
-#include "base/process_util.h" |
#include "base/string_util.h" |
#include "googleurl/src/gurl.h" |
#include "googleurl/src/url_parse.h" |
namespace { |
-struct Rule { |
- bool exception; |
- bool wildcard; |
-}; |
- |
-typedef std::map<std::string, Rule> RuleMap; |
-typedef std::set<std::string> RuleSet; |
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; |
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; |
} |
+namespace net { |
+namespace tld_cleanup { |
+ |
// Writes the list of domain rules contained in the 'rules' set to the |
// 'outfile', with each rule terminated by a LF. The file must already have |
// been created with write access. |
@@ -62,7 +26,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { |
std::string data; |
data.append( |
"%{\n" |
-"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n" |
+"// Copyright 2012 The Chromium Authors. All rights reserved.\n" |
"// Use of this source code is governed by a BSD-style license that can be\n" |
"// found in the LICENSE file.\n\n" |
"// This file is generated by net/tools/tld_cleanup/.\n" |
@@ -71,6 +35,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { |
"struct DomainRule {\n" |
" const char *name;\n" |
" int type; // 1: exception, 2: wildcard\n" |
+" bool is_private;\n" |
"};\n" |
"%%\n" |
); |
@@ -85,23 +50,23 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { |
} else { |
data.append("0"); |
} |
+ if (i->second.is_private) { |
+ data.append(", true"); |
+ } else { |
+ data.append(", false"); |
+ } |
data.append("\n"); |
} |
data.append("%%\n"); |
- int written = file_util::WriteFile(outfile, data.data(), data.size()); |
+ int written = file_util::WriteFile(outfile, |
+ data.data(), |
+ static_cast<int>(data.size())); |
return written == static_cast<int>(data.size()); |
} |
-// These result codes should be in increasing order of severity. |
-typedef enum { |
- kSuccess, |
- kWarning, |
- kError, |
-} NormalizeResult; |
- |
// Adjusts the rule to a standard form: removes single extraneous dots and |
// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
// valid; logs a warning and returns kWarning if it is probably invalid; and |
@@ -163,38 +128,30 @@ NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { |
return result; |
} |
-// Loads the file described by 'in_filename', converts it to the desired format |
-// (see the file comments above), and saves it into 'out_filename'. Returns |
-// the most severe of the result codes encountered when normalizing the rules. |
-NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
- const base::FilePath& out_filename) { |
- std::string data; |
- if (!file_util::ReadFileToString(in_filename, &data)) { |
- LOG(ERROR) << "Unable to read file"; |
- // We return success since we've already reported the error. |
- return kSuccess; |
- } |
- |
+NormalizeResult NormalizeDataToRuleMap(const std::string data, |
+ RuleMap* rules) { |
+ CHECK(rules); |
// We do a lot of string assignment during parsing, but simplicity is more |
// important than performance here. |
std::string domain; |
NormalizeResult result = kSuccess; |
size_t line_start = 0; |
size_t line_end = 0; |
- RuleMap rules; |
- RuleSet extra_rules; |
+ bool is_private = false; |
+ RuleMap extra_rules; |
int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; |
+ int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; |
while (line_start < data.size()) { |
- // Skip the entire section of private domains. |
- // TODO(pamg): remove this when http://crbug.com/96086 is fixed. |
if (line_start + begin_private_length < data.size() && |
!data.compare(line_start, begin_private_length, |
kBeginPrivateDomainsComment)) { |
- line_end = data.find(kEndPrivateDomainsComment, line_start); |
- if (line_end == std::string::npos) { |
- LOG(WARNING) << "Private-domain section had no end marker."; |
- line_end = data.size(); |
- } |
+ is_private = true; |
+ line_end = line_start + begin_private_length; |
+ } else if (line_start + end_private_length < data.size() && |
+ !data.compare(line_start, end_private_length, |
+ kEndPrivateDomainsComment)) { |
+ is_private = false; |
+ line_end = line_start + end_private_length; |
} else if (line_start + 1 < data.size() && |
data[line_start] == '/' && |
data[line_start + 1] == '/') { |
@@ -212,21 +169,41 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
Rule rule; |
rule.wildcard = false; |
rule.exception = false; |
+ rule.is_private = is_private; |
NormalizeResult new_result = NormalizeRule(&domain, &rule); |
if (new_result != kError) { |
// Check the existing rules to make sure we don't have an exception and |
- // wildcard for the same rule. If we did, we'd have to update our |
+ // wildcard for the same rule, or that the same domain is listed as both |
+ // private and not private. If we did, we'd have to update our |
// parsing code to handle this case. |
- CHECK(rules.find(domain) == rules.end()); |
+ CHECK(rules->find(domain) == rules->end()); |
- rules[domain] = rule; |
+ (*rules)[domain] = rule; |
// Add true TLD for multi-level rules. We don't add them right now, in |
// case there's an exception or wild card that either exists or might be |
// added in a later iteration. In those cases, there's no need to add |
// it and it would just slow down parsing the data. |
size_t tld_start = domain.find_last_of('.'); |
- if (tld_start != std::string::npos && tld_start + 1 < domain.size()) |
- extra_rules.insert(domain.substr(tld_start + 1)); |
+ if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { |
+ std::string extra_rule_domain = domain.substr(tld_start + 1); |
+ RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); |
+ Rule extra_rule; |
+ extra_rule.exception = false; |
+ extra_rule.wildcard = false; |
+ if (iter == extra_rules.end()) { |
+ extra_rule.is_private = is_private; |
+ } else { |
+ // A rule already exists for this TLD, so the merged entry is marked |
+ // private only if every rule that contributed to it is private. |
+ // An example is .au, which is not itself listed as a TLD; only its |
+ // second-level domains, such as com.au, are listed. Subdomains of .au |
+ // (e.g. blogspot.com.au) are also listed in the private section, |
+ // which is processed later, so this ensures that the real TLD |
+ // (e.g. .au) is listed as public. |
+ extra_rule.is_private = is_private && iter->second.is_private; |
+ } |
+ extra_rules[extra_rule_domain] = extra_rule; |
+ } |
} |
result = std::max(result, new_result); |
} |
@@ -240,80 +217,37 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
line_start = data.size(); |
} |
- for (RuleSet::const_iterator iter = extra_rules.begin(); |
+ for (RuleMap::const_iterator iter = extra_rules.begin(); |
iter != extra_rules.end(); |
++iter) { |
- if (rules.find(*iter) == rules.end()) { |
- Rule rule; |
- rule.exception = false; |
- rule.wildcard = false; |
- rules[*iter] = rule; |
+ if (rules->find(iter->first) == rules->end()) { |
+ (*rules)[iter->first] = iter->second; |
} |
} |
- if (!WriteRules(rules, out_filename)) { |
- LOG(ERROR) << "Error(s) writing output file"; |
- result = kError; |
- } |
- |
return result; |
} |
-int main(int argc, const char* argv[]) { |
- base::EnableTerminationOnHeapCorruption(); |
- if (argc != 1) { |
- fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n"); |
- fprintf(stderr, "Usage: %s\n", argv[0]); |
- return 1; |
+NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
+ const base::FilePath& out_filename) { |
+ RuleMap rules; |
+ std::string data; |
+ if (!file_util::ReadFileToString(in_filename, &data)) { |
+ LOG(ERROR) << "Unable to read file"; |
+ // We return success since we've already reported the error. |
+ return kSuccess; |
} |
- // Manages the destruction of singletons. |
- base::AtExitManager exit_manager; |
- |
- // Only use OutputDebugString in debug mode. |
-#ifdef NDEBUG |
- logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE; |
-#else |
- logging::LoggingDestination destination = |
- logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG; |
-#endif |
- |
- CommandLine::Init(argc, argv); |
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules); |
- base::FilePath log_filename; |
- PathService::Get(base::DIR_EXE, &log_filename); |
- log_filename = log_filename.AppendASCII("tld_cleanup.log"); |
- logging::InitLogging( |
- log_filename.value().c_str(), |
- destination, |
- logging::LOCK_LOG_FILE, |
- logging::DELETE_OLD_LOG_FILE, |
- logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS); |
- |
- icu_util::Initialize(); |
- |
- base::FilePath input_file; |
- PathService::Get(base::DIR_SOURCE_ROOT, &input_file); |
- input_file = input_file.Append(FILE_PATH_LITERAL("net")) |
- .Append(FILE_PATH_LITERAL("base")) |
- .Append(FILE_PATH_LITERAL( |
- "registry_controlled_domains")) |
- .Append(FILE_PATH_LITERAL("effective_tld_names.dat")); |
- base::FilePath output_file; |
- PathService::Get(base::DIR_SOURCE_ROOT, &output_file); |
- output_file = output_file.Append(FILE_PATH_LITERAL("net")) |
- .Append(FILE_PATH_LITERAL("base")) |
- .Append(FILE_PATH_LITERAL( |
- "registry_controlled_domains")) |
- .Append(FILE_PATH_LITERAL( |
- "effective_tld_names.gperf")); |
- NormalizeResult result = NormalizeFile(input_file, output_file); |
- if (result != kSuccess) { |
- fprintf(stderr, |
- "Errors or warnings processing file. See log in tld_cleanup.log."); |
+ if (!WriteRules(rules, out_filename)) { |
+ LOG(ERROR) << "Error(s) writing output file"; |
+ result = kError; |
} |
- if (result == kError) |
- return 1; |
- return 0; |
+ return result; |
} |
+ |
+ |
+} // namespace tld_cleanup |
+} // namespace net |