| Index: net/tools/tld_cleanup/tld_cleanup_util.cc
|
| diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup_util.cc
|
| similarity index 54%
|
| copy from net/tools/tld_cleanup/tld_cleanup.cc
|
| copy to net/tools/tld_cleanup/tld_cleanup_util.cc
|
| index 1162d983a700a893e5467e95e926452a88ff6235..2f5496e0a56286a2686aa21c35bc2861ea8b15c5 100644
|
| --- a/net/tools/tld_cleanup/tld_cleanup.cc
|
| +++ b/net/tools/tld_cleanup/tld_cleanup_util.cc
|
| @@ -1,60 +1,24 @@
|
| -// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
| +// Copyright 2013 The Chromium Authors. All rights reserved.
|
| // Use of this source code is governed by a BSD-style license that can be
|
| // found in the LICENSE file.
|
|
|
| -// This command-line program converts an effective-TLD data file in UTF-8 from
|
| -// the format provided by Mozilla to the format expected by Chrome. This
|
| -// program generates an intermediate file which is then used by gperf to
|
| -// generate a perfect hash map. The benefit of this approach is that no time is
|
| -// spent on program initialization to generate the map of this data.
|
| -//
|
| -// Running this program finds "effective_tld_names.dat" in the expected location
|
| -// in the source checkout and generates "effective_tld_names.gperf" next to it.
|
| -//
|
| -// Any errors or warnings from this program are recorded in tld_cleanup.log.
|
| -//
|
| -// In particular, it
|
| -// * Strips blank lines and comments, as well as notes for individual rules.
|
| -// * Strips a single leading and/or trailing dot from each rule, if present.
|
| -// * Logs a warning if a rule contains '!' or '*.' other than at the beginning
|
| -// of the rule. (This also catches multiple ! or *. at the start of a rule.)
|
| -// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
|
| -// * Canonicalizes each rule's domain by converting it to a GURL and back.
|
| -// * Adds explicit rules for true TLDs found in any rule.
|
| -// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.
|
| -// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="
|
| -// and "// ===END PRIVATE DOMAINS===".
|
| +#include "net/tools/tld_cleanup/tld_cleanup_util.h"
|
|
|
| -#include <map>
|
| -#include <set>
|
| -#include <string>
|
| -
|
| -#include "base/at_exit.h"
|
| -#include "base/command_line.h"
|
| -#include "base/file_util.h"
|
| #include "base/file_util.h"
|
| -#include "base/files/file_path.h"
|
| -#include "base/i18n/icu_util.h"
|
| #include "base/logging.h"
|
| -#include "base/path_service.h"
|
| -#include "base/process_util.h"
|
| #include "base/string_util.h"
|
| #include "googleurl/src/gurl.h"
|
| #include "googleurl/src/url_parse.h"
|
|
|
| namespace {
|
| -struct Rule {
|
| - bool exception;
|
| - bool wildcard;
|
| -};
|
| -
|
| -typedef std::map<std::string, Rule> RuleMap;
|
| -typedef std::set<std::string> RuleSet;
|
|
|
| const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
|
| const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
|
| }
|
|
|
| +namespace net {
|
| +namespace tld_cleanup {
|
| +
|
| // Writes the list of domain rules contained in the 'rules' set to the
|
| // 'outfile', with each rule terminated by a LF. The file must already have
|
| // been created with write access.
|
| @@ -62,7 +26,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| std::string data;
|
| data.append(
|
| "%{\n"
|
| -"// Copyright (c) 2012 The Chromium Authors. All rights reserved.\n"
|
| +"// Copyright 2012 The Chromium Authors. All rights reserved.\n"
|
| "// Use of this source code is governed by a BSD-style license that can be\n"
|
| "// found in the LICENSE file.\n\n"
|
| "// This file is generated by net/tools/tld_cleanup/.\n"
|
| @@ -71,6 +35,7 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| "struct DomainRule {\n"
|
| " const char *name;\n"
|
| " int type; // 1: exception, 2: wildcard\n"
|
| +" bool is_private;\n"
|
| "};\n"
|
| "%%\n"
|
| );
|
| @@ -85,6 +50,11 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| } else {
|
| data.append("0");
|
| }
|
| + if (i->second.is_private) {
|
| + data.append(", true");
|
| + } else {
|
| + data.append(", false");
|
| + }
|
| data.append("\n");
|
| }
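
The net effect of the struct and serialization changes above is a third
column in the generated gperf data. Below is a minimal, self-contained
sketch of the per-rule output; the Rule fields mirror the patch, but the
sample domains and the 1/exception, 2/wildcard ordering are assumptions
taken from the struct comment rather than from code shown in this diff.

    #include <cstdio>
    #include <map>
    #include <string>

    // Mirrors the Rule struct as extended by this patch (assumed to be
    // declared in tld_cleanup_util.h).
    struct Rule {
      bool exception;
      bool wildcard;
      bool is_private;
    };

    int main() {
      // Hypothetical rules; the real input is effective_tld_names.dat.
      std::map<std::string, Rule> rules = {
          {"jp", {false, false, false}},           // ordinary public TLD
          {"blogspot.com", {false, false, true}},  // private-section entry
      };
      std::string data;
      for (const auto& entry : rules) {
        data.append(entry.first);
        data.append(", ");
        if (entry.second.exception)
          data.append("1");
        else if (entry.second.wildcard)
          data.append("2");
        else
          data.append("0");
        // The new third column added by this patch.
        data.append(entry.second.is_private ? ", true" : ", false");
        data.append("\n");
      }
      // Prints (std::map iterates alphabetically):
      //   blogspot.com, 0, true
      //   jp, 0, false
      std::printf("%s", data.c_str());
      return 0;
    }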
|
|
|
| @@ -95,13 +65,6 @@ bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
|
| return written == static_cast<int>(data.size());
|
| }
|
|
|
| -// These result codes should be in increasing order of severity.
|
| -typedef enum {
|
| - kSuccess,
|
| - kWarning,
|
| - kError,
|
| -} NormalizeResult;
|
| -
|
| // Adjusts the rule to a standard form: removes single extraneous dots and
|
| // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
|
| // valid; logs a warning and returns kWarning if it is probably invalid; and
|
| @@ -163,38 +126,30 @@ NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
|
| return result;
|
| }
|
|
|
| -// Loads the file described by 'in_filename', converts it to the desired format
|
| -// (see the file comments above), and saves it into 'out_filename'. Returns
|
| -// the most severe of the result codes encountered when normalizing the rules.
|
| -NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| - const base::FilePath& out_filename) {
|
| - std::string data;
|
| - if (!file_util::ReadFileToString(in_filename, &data)) {
|
| - LOG(ERROR) << "Unable to read file";
|
| - // We return success since we've already reported the error.
|
| - return kSuccess;
|
| - }
|
| -
|
| +NormalizeResult NormalizeDataToRuleMap(const std::string data,
|
| + RuleMap* rules) {
|
| + CHECK(rules);
|
| // We do a lot of string assignment during parsing, but simplicity is more
|
| // important than performance here.
|
| std::string domain;
|
| NormalizeResult result = kSuccess;
|
| size_t line_start = 0;
|
| size_t line_end = 0;
|
| - RuleMap rules;
|
| - RuleSet extra_rules;
|
| + bool is_private = false;
|
| + RuleMap extra_rules;
|
| int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
|
| + int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
|
| while (line_start < data.size()) {
|
| - // Skip the entire section of private domains.
|
| - // TODO(pamg): remove this when http://crbug.com/96086 is fixed.
|
| if (line_start + begin_private_length < data.size() &&
|
| !data.compare(line_start, begin_private_length,
|
| kBeginPrivateDomainsComment)) {
|
| - line_end = data.find(kEndPrivateDomainsComment, line_start);
|
| - if (line_end == std::string::npos) {
|
| - LOG(WARNING) << "Private-domain section had no end marker.";
|
| - line_end = data.size();
|
| - }
|
| + is_private = true;
|
| + line_end = line_start + begin_private_length;
|
| + } else if (line_start + end_private_length < data.size() &&
|
| + !data.compare(line_start, end_private_length,
|
| + kEndPrivateDomainsComment)) {
|
| + is_private = false;
|
| + line_end = line_start + end_private_length;
|
| } else if (line_start + 1 < data.size() &&
|
| data[line_start] == '/' &&
|
| data[line_start + 1] == '/') {
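
Instead of skipping everything between the two markers (the behavior removed
above), the parser now just flips an is_private flag when it reaches them.
Here is a stripped-down sketch of that toggle, reading whole lines for
simplicity rather than the byte-offset scanning the real code uses; the
marker strings are copied from the file, and the sample input is made up.

    #include <iostream>
    #include <string>

    const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
    const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

    int main() {
      const std::string data =
          "jp\n"
          "// ===BEGIN PRIVATE DOMAINS===\n"
          "blogspot.com\n"
          "// ===END PRIVATE DOMAINS===\n";

      bool is_private = false;
      size_t line_start = 0;
      while (line_start < data.size()) {
        size_t line_end = data.find('\n', line_start);
        if (line_end == std::string::npos)
          line_end = data.size();
        const std::string line = data.substr(line_start, line_end - line_start);
        if (line == kBeginPrivateDomainsComment) {
          is_private = true;   // everything below is in the private section
        } else if (line == kEndPrivateDomainsComment) {
          is_private = false;  // back to the public section
        } else if (!line.empty() && line.compare(0, 2, "//") != 0) {
          // A rule line: in the real parser it would get
          // rule.is_private = is_private.
          std::cout << line << (is_private ? " [private]" : " [public]") << "\n";
        }
        line_start = line_end + 1;
      }
      return 0;
    }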
|
| @@ -212,21 +167,41 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| Rule rule;
|
| rule.wildcard = false;
|
| rule.exception = false;
|
| + rule.is_private = is_private;
|
| NormalizeResult new_result = NormalizeRule(&domain, &rule);
|
| if (new_result != kError) {
|
| // Check the existing rules to make sure we don't have an exception and
|
| - // wildcard for the same rule. If we did, we'd have to update our
|
| + // wildcard for the same rule, or the same domain listed as both
|
| + // private and not private. If we did, we'd have to update our
|
| // parsing code to handle this case.
|
| - CHECK(rules.find(domain) == rules.end());
|
| + CHECK(rules->find(domain) == rules->end());
|
|
|
| - rules[domain] = rule;
|
| + (*rules)[domain] = rule;
|
| // Add true TLD for multi-level rules. We don't add them right now, in
|
| // case there's an exception or wild card that either exists or might be
|
| // added in a later iteration. In those cases, there's no need to add
|
| // it and it would just slow down parsing the data.
|
| size_t tld_start = domain.find_last_of('.');
|
| - if (tld_start != std::string::npos && tld_start + 1 < domain.size())
|
| - extra_rules.insert(domain.substr(tld_start + 1));
|
| + if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
|
| + std::string extra_rule_domain = domain.substr(tld_start + 1);
|
| + RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
|
| + Rule extra_rule;
|
| + extra_rule.exception = false;
|
| + extra_rule.wildcard = false;
|
| + if (iter == extra_rules.end()) {
|
| + extra_rule.is_private = is_private;
|
| + } else {
|
| + // A rule for this TLD already exists. If any of the rules that
|
| + // imply it is public (not private), the merged entry must be public too.
|
| + // An example is .au, which is not listed as a TLD itself; only its
|
| + // second-level domains, such as com.au, are listed. Subdomains of .au
|
| + // (e.g. blogspot.com.au) are also listed in the private section,
|
| + // which is processed later, so this ensures that the implied TLD
|
| + // (e.g. .au) is listed as public.
|
| + extra_rule.is_private = is_private && iter->second.is_private;
|
| + }
|
| + extra_rules[extra_rule_domain] = extra_rule;
|
| + }
|
| }
|
| result = std::max(result, new_result);
|
| }
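
The merge in this hunk is easiest to check with a worked example: com.au
appears in the public part of the data and blogspot.com.au in the private
part, and the implied TLD "au" has to come out public. A small standalone
check of that rule follows; the helper name and inline rules are
hypothetical, and only the is_private && ... merge is taken from the patch.

    #include <cassert>
    #include <map>
    #include <string>

    struct Rule {
      bool exception;
      bool wildcard;
      bool is_private;
    };

    // Hypothetical helper mirroring the extra_rules merge: the implied TLD of
    // a multi-level rule stays public if any rule that implies it is public.
    void AddImpliedTld(const std::string& domain, bool is_private,
                       std::map<std::string, Rule>* extra_rules) {
      size_t tld_start = domain.find_last_of('.');
      if (tld_start == std::string::npos || tld_start + 1 >= domain.size())
        return;
      std::string tld = domain.substr(tld_start + 1);
      std::map<std::string, Rule>::const_iterator iter = extra_rules->find(tld);
      Rule extra_rule = {false, false, false};
      extra_rule.is_private = (iter == extra_rules->end())
                                  ? is_private
                                  : (is_private && iter->second.is_private);
      (*extra_rules)[tld] = extra_rule;
    }

    int main() {
      std::map<std::string, Rule> extra_rules;
      AddImpliedTld("com.au", false /* public section */, &extra_rules);
      AddImpliedTld("blogspot.com.au", true /* private section */, &extra_rules);
      // "au" was implied by at least one public rule, so it stays public.
      assert(!extra_rules["au"].is_private);
      return 0;
    }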
|
| @@ -240,80 +215,37 @@ NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| line_start = data.size();
|
| }
|
|
|
| - for (RuleSet::const_iterator iter = extra_rules.begin();
|
| + for (RuleMap::const_iterator iter = extra_rules.begin();
|
| iter != extra_rules.end();
|
| ++iter) {
|
| - if (rules.find(*iter) == rules.end()) {
|
| - Rule rule;
|
| - rule.exception = false;
|
| - rule.wildcard = false;
|
| - rules[*iter] = rule;
|
| + if (rules->find(iter->first) == rules->end()) {
|
| + (*rules)[iter->first] = iter->second;
|
| }
|
| }
|
|
|
| - if (!WriteRules(rules, out_filename)) {
|
| - LOG(ERROR) << "Error(s) writing output file";
|
| - result = kError;
|
| - }
|
| -
|
| return result;
|
| }
|
|
|
| -int main(int argc, const char* argv[]) {
|
| - base::EnableTerminationOnHeapCorruption();
|
| - if (argc != 1) {
|
| - fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
|
| - fprintf(stderr, "Usage: %s\n", argv[0]);
|
| - return 1;
|
| +NormalizeResult NormalizeFile(const base::FilePath& in_filename,
|
| + const base::FilePath& out_filename) {
|
| + RuleMap rules;
|
| + std::string data;
|
| + if (!file_util::ReadFileToString(in_filename, &data)) {
|
| + LOG(ERROR) << "Unable to read file";
|
| + // We return success since we've already reported the error.
|
| + return kSuccess;
|
| }
|
|
|
| - // Manages the destruction of singletons.
|
| - base::AtExitManager exit_manager;
|
| -
|
| - // Only use OutputDebugString in debug mode.
|
| -#ifdef NDEBUG
|
| - logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
|
| -#else
|
| - logging::LoggingDestination destination =
|
| - logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
|
| -#endif
|
| -
|
| - CommandLine::Init(argc, argv);
|
| + NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
|
|
|
| - base::FilePath log_filename;
|
| - PathService::Get(base::DIR_EXE, &log_filename);
|
| - log_filename = log_filename.AppendASCII("tld_cleanup.log");
|
| - logging::InitLogging(
|
| - log_filename.value().c_str(),
|
| - destination,
|
| - logging::LOCK_LOG_FILE,
|
| - logging::DELETE_OLD_LOG_FILE,
|
| - logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
|
| -
|
| - icu_util::Initialize();
|
| -
|
| - base::FilePath input_file;
|
| - PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
|
| - input_file = input_file.Append(FILE_PATH_LITERAL("net"))
|
| - .Append(FILE_PATH_LITERAL("base"))
|
| - .Append(FILE_PATH_LITERAL(
|
| - "registry_controlled_domains"))
|
| - .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
|
| - base::FilePath output_file;
|
| - PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
|
| - output_file = output_file.Append(FILE_PATH_LITERAL("net"))
|
| - .Append(FILE_PATH_LITERAL("base"))
|
| - .Append(FILE_PATH_LITERAL(
|
| - "registry_controlled_domains"))
|
| - .Append(FILE_PATH_LITERAL(
|
| - "effective_tld_names.gperf"));
|
| - NormalizeResult result = NormalizeFile(input_file, output_file);
|
| - if (result != kSuccess) {
|
| - fprintf(stderr,
|
| - "Errors or warnings processing file. See log in tld_cleanup.log.");
|
| + if (!WriteRules(rules, out_filename)) {
|
| + LOG(ERROR) << "Error(s) writing output file";
|
| + result = kError;
|
| }
|
|
|
| - if (result == kError)
|
| - return 1;
|
| - return 0;
|
| + return result;
|
| }
|
| +
|
| +
|
| +} // namespace tld_cleanup
|
| +} // namespace net
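
The point of pulling NormalizeDataToRuleMap out of main() is that the parsing
becomes testable without touching the filesystem. Below is a hedged sketch of
how a gtest unit test might exercise it; the test file name is hypothetical,
and it assumes Rule, RuleMap, NormalizeResult and the function declarations
all live in tld_cleanup_util.h, which is not shown in this diff.

    // Hypothetical net/tools/tld_cleanup/tld_cleanup_util_unittest.cc
    #include "net/tools/tld_cleanup/tld_cleanup_util.h"

    #include <string>

    #include "testing/gtest/include/gtest/gtest.h"

    namespace net {
    namespace tld_cleanup {

    TEST(TldCleanupUtilTest, PrivateSectionMarksRulesPrivate) {
      std::string data =
          "com\n"
          "// ===BEGIN PRIVATE DOMAINS===\n"
          "blogspot.com\n"
          "// ===END PRIVATE DOMAINS===\n";
      RuleMap rules;
      EXPECT_EQ(kSuccess, NormalizeDataToRuleMap(data, &rules));
      // "com" is a single-level public rule; "blogspot.com" comes from the
      // private section, and its implied TLD "com" already exists.
      ASSERT_EQ(2u, rules.size());
      EXPECT_FALSE(rules["com"].is_private);
      EXPECT_TRUE(rules["blogspot.com"].is_private);
    }

    }  // namespace tld_cleanup
    }  // namespace net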
|
|
|