Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1629)

Unified Diff: net/tools/tld_cleanup/tld_cleanup_util.cc

Issue 15140003: Add support for split Public Suffix List distinctions. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebased again Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « net/tools/tld_cleanup/tld_cleanup_util.h ('k') | net/tools/tld_cleanup/tld_cleanup_util_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: net/tools/tld_cleanup/tld_cleanup_util.cc
diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup_util.cc
similarity index 54%
copy from net/tools/tld_cleanup/tld_cleanup.cc
copy to net/tools/tld_cleanup/tld_cleanup_util.cc
index 1162d983a700a893e5467e95e926452a88ff6235..e291b02239db9c244bbb9b3e65c7d56543678495 100644
--- a/net/tools/tld_cleanup/tld_cleanup.cc
+++ b/net/tools/tld_cleanup/tld_cleanup_util.cc
@@ -1,60 +1,24 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-// This command-line program converts an effective-TLD data file in UTF-8 from
-// the format provided by Mozilla to the format expected by Chrome. This
-// program generates an intermediate file which is then used by gperf to
-// generate a perfect hash map. The benefit of this approach is that no time is
-// spent on program initialization to generate the map of this data.
-//
-// Running this program finds "effective_tld_names.dat" in the expected location
-// in the source checkout and generates "effective_tld_names.gperf" next to it.
-//
-// Any errors or warnings from this program are recorded in tld_cleanup.log.
-//
-// In particular, it
-// * Strips blank lines and comments, as well as notes for individual rules.
-// * Strips a single leading and/or trailing dot from each rule, if present.
-// * Logs a warning if a rule contains '!' or '*.' other than at the beginning
-// of the rule. (This also catches multiple ! or *. at the start of a rule.)
-// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
-// * Canonicalizes each rule's domain by converting it to a GURL and back.
-// * Adds explicit rules for true TLDs found in any rule.
-// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
-// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
-// and "// ===END PRIVATE DOMAINS===".
+#include "net/tools/tld_cleanup/tld_cleanup_util.h"
-#include <map>
-#include <set>
-#include <string>
-
-#include "base/at_exit.h"
-#include "base/command_line.h"
-#include "base/file_util.h"
#include "base/file_util.h"
-#include "base/files/file_path.h"
-#include "base/i18n/icu_util.h"
#include "base/logging.h"
-#include "base/path_service.h"
-#include "base/process_util.h"
#include "base/string_util.h"
#include "googleurl/src/gurl.h"
#include "googleurl/src/url_parse.h"
namespace {
-struct Rule {
- bool exception;
- bool wildcard;
-};
-
-typedef std::map<std::string, Rule> RuleMap;
-typedef std::set<std::string> RuleSet;
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
}
+namespace net {
+namespace tld_cleanup {
+
// Writes the list of domain rules contained in the 'rules' set to the
// 'outfile', with each rule terminated by a LF. The file must already have
// been created with write access.
@@ -62,7 +26,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
std::string data;
data.append(
"%{\n"
-"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"
+"// Copyright 2012 The Chromium Authors. All rights reserved.\n"
"// Use of this source code is governed by a BSD-style license that can be\n"
"// found in the LICENSE file.\n\n"
"// This file is generated by net/tools/tld_cleanup/.\n"
@@ -71,6 +35,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
"struct DomainRule {\n"
" const char *name;\n"
" int type; // 1: exception, 2: wildcard\n"
+" bool is_private;\n"
"};\n"
"%%\n"
);
@@ -85,23 +50,23 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
} else {
data.append("0");
}
+ if (i->second.is_private) {
+ data.append(", true");
+ } else {
+ data.append(", false");
+ }
data.append("\n");
}
data.append("%%\n");
- int written = file_util::WriteFile(outfile, data.data(), data.size());
+ int written = file_util::WriteFile(outfile,
+ data.data(),
+ static_cast<int>(data.size()));
return written == static_cast<int>(data.size());
}
-// These result codes should be in increasing order of severity.
-typedef enum {
- kSuccess,
- kWarning,
- kError,
-} NormalizeResult;
-
// Adjusts the rule to a standard form: removes single extraneous dots and
// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
// valid; logs a warning and returns kWarning if it is probably invalid; and
@@ -163,38 +128,30 @@ NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
return result;
}
-// Loads the file described by 'in_filename', converts it to the desired format
-// (see the file comments above), and saves it into 'out_filename'. Returns
-// the most severe of the result codes encountered when normalizing the rules.
-NormalizeResult NormalizeFile(const base::FilePath& in_filename,
- const base::FilePath& out_filename) {
- std::string data;
- if (!file_util::ReadFileToString(in_filename, &data)) {
- LOG(ERROR) << "Unable to read file";
- // We return success since we've already reported the error.
- return kSuccess;
- }
-
+NormalizeResult NormalizeDataToRuleMap(const std::string data,
+ RuleMap* rules) {
+ CHECK(rules);
// We do a lot of string assignment during parsing, but simplicity is more
// important than performance here.
std::string domain;
NormalizeResult result = kSuccess;
size_t line_start = 0;
size_t line_end = 0;
- RuleMap rules;
- RuleSet extra_rules;
+ bool is_private = false;
+ RuleMap extra_rules;
int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
+ int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
while (line_start < data.size()) {
- // Skip the entire section of private domains.
- // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
if (line_start + begin_private_length < data.size() &&
!data.compare(line_start, begin_private_length,
kBeginPrivateDomainsComment)) {
- line_end = data.find(kEndPrivateDomainsComment, line_start);
- if (line_end == std::string::npos) {
- LOG(WARNING) << "Private-domain section had no end marker.";
- line_end = data.size();
- }
+ is_private = true;
+ line_end = line_start + begin_private_length;
+ } else if (line_start + end_private_length < data.size() &&
+ !data.compare(line_start, end_private_length,
+ kEndPrivateDomainsComment)) {
+ is_private = false;
+ line_end = line_start + end_private_length;
} else if (line_start + 1 < data.size() &&
data[line_start] == '/' &&
data[line_start + 1] == '/') {
@@ -212,21 +169,41 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename,
Rule rule;
rule.wildcard = false;
rule.exception = false;
+ rule.is_private = is_private;
NormalizeResult new_result = NormalizeRule(&domain, &rule);
if (new_result != kError) {
// Check the existing rules to make sure we don't have an exception and
- // wildcard for the same rule. If we did, we'd have to update our
+ // wildcard for the same rule, and that the same domain is not listed as
+ // both private and not private. If we did, we'd have to update our
// parsing code to handle this case.
- CHECK(rules.find(domain) == rules.end());
+ CHECK(rules->find(domain) == rules->end());
- rules[domain] = rule;
+ (*rules)[domain] = rule;
// Add true TLD for multi-level rules. We don't add them right now, in
// case there's an exception or wild card that either exists or might be
// added in a later iteration. In those cases, there's no need to add
// it and it would just slow down parsing the data.
size_t tld_start = domain.find_last_of('.');
- if (tld_start != std::string::npos && tld_start + 1 < domain.size())
- extra_rules.insert(domain.substr(tld_start + 1));
+ if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
+ std::string extra_rule_domain = domain.substr(tld_start + 1);
+ RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
+ Rule extra_rule;
+ extra_rule.exception = false;
+ extra_rule.wildcard = false;
+ if (iter == extra_rules.end()) {
+ extra_rule.is_private = is_private;
+ } else {
+ // A rule already exists for this TLD, so if any of the entries that
+ // produced it is not private, the resulting entry must not be private.
+ // An example is .au, which is not itself listed as a real TLD; only
+ // its second-level domains, such as com.au, are listed. Subdomains of
+ // .au (e.g. blogspot.com.au) are also listed in the private section,
+ // which is processed later, so this ensures that the real TLD
+ // (e.g. .au) is listed as public.
+ extra_rule.is_private = is_private && iter->second.is_private;
+ }
+ extra_rules[extra_rule_domain] = extra_rule;
+ }
}
result = std::max(result, new_result);
}
@@ -240,80 +217,37 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename,
line_start = data.size();
}
- for (RuleSet::const_iterator iter = extra_rules.begin();
+ for (RuleMap::const_iterator iter = extra_rules.begin();
iter != extra_rules.end();
++iter) {
- if (rules.find(*iter) == rules.end()) {
- Rule rule;
- rule.exception = false;
- rule.wildcard = false;
- rules[*iter] = rule;
+ if (rules->find(iter->first) == rules->end()) {
+ (*rules)[iter->first] = iter->second;
}
}
- if (!WriteRules(rules, out_filename)) {
- LOG(ERROR) << "Error(s) writing output file";
- result = kError;
- }
-
return result;
}
-int main(int argc, const char* argv[]) {
- base::EnableTerminationOnHeapCorruption();
- if (argc != 1) {
- fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
- fprintf(stderr, "Usage: %s\n", argv[0]);
- return 1;
+NormalizeResult NormalizeFile(const base::FilePath& in_filename,
+ const base::FilePath& out_filename) {
+ RuleMap rules;
+ std::string data;
+ if (!file_util::ReadFileToString(in_filename, &data)) {
+ LOG(ERROR) << "Unable to read file";
+ // We return success since we've already reported the error.
+ return kSuccess;
}
- // Manages the destruction of singletons.
- base::AtExitManager exit_manager;
-
- // Only use OutputDebugString in debug mode.
-#ifdef NDEBUG
- logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
-#else
- logging::LoggingDestination destination =
- logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
-#endif
-
- CommandLine::Init(argc, argv);
+ NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
- base::FilePath log_filename;
- PathService::Get(base::DIR_EXE, &log_filename);
- log_filename = log_filename.AppendASCII("tld_cleanup.log");
- logging::InitLogging(
- log_filename.value().c_str(),
- destination,
- logging::LOCK_LOG_FILE,
- logging::DELETE_OLD_LOG_FILE,
- logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
-
- icu_util::Initialize();
-
- base::FilePath input_file;
- PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
- input_file = input_file.Append(FILE_PATH_LITERAL("net"))
- .Append(FILE_PATH_LITERAL("base"))
- .Append(FILE_PATH_LITERAL(
- "registry_controlled_domains"))
- .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
- base::FilePath output_file;
- PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
- output_file = output_file.Append(FILE_PATH_LITERAL("net"))
- .Append(FILE_PATH_LITERAL("base"))
- .Append(FILE_PATH_LITERAL(
- "registry_controlled_domains"))
- .Append(FILE_PATH_LITERAL(
- "effective_tld_names.gperf"));
- NormalizeResult result = NormalizeFile(input_file, output_file);
- if (result != kSuccess) {
- fprintf(stderr,
- "Errors or warnings processing file. See log in tld_cleanup.log.");
+ if (!WriteRules(rules, out_filename)) {
+ LOG(ERROR) << "Error(s) writing output file";
+ result = kError;
}
- if (result == kError)
- return 1;
- return 0;
+ return result;
}
+
+
+} // namespace tld_cleanup
+} // namespace net
« no previous file with comments | « net/tools/tld_cleanup/tld_cleanup_util.h ('k') | net/tools/tld_cleanup/tld_cleanup_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698