net/tools/tld_cleanup/tld_cleanup.cc - Issue 13979002: Add support for split PSL list distinctions.

Unified Diff: net/tools/tld_cleanup/tld_cleanup.cc

Issue 13979002: Add support for split PSL list distinctions. (Closed) | Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased | Created 7 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: net/tools/tld_cleanup/tld_cleanup.cc

diff --git a/net/tools/tld_cleanup/tld_cleanup.cc b/net/tools/tld_cleanup/tld_cleanup.cc

index 1162d983a700a893e5467e95e926452a88ff6235..485bece3db16d1ef2f1629d687136829b43023dd 100644

--- a/net/tools/tld_cleanup/tld_cleanup.cc

+++ b/net/tools/tld_cleanup/tld_cleanup.cc

@@ -21,243 +21,18 @@

// * Logs a warning if GURL reports a rule as invalid, but keeps the rule.

// * Canonicalizes each rule's domain by converting it to a GURL and back.

// * Adds explicit rules for true TLDs found in any rule.

-// TODO(pamg): Remove this comment when http://crbug.com/96086 is fixed.

-// * Ignores any entries in the file between "// ===BEGIN PRIVATE DOMAINS==="

-// and "// ===END PRIVATE DOMAINS===".

-#include <map>

-#include <set>

-#include <string>

+// * Marks entries in the file between "// ===BEGIN PRIVATE DOMAINS==="

+// and "// ===END PRIVATE DOMAINS===" as private.

#include "base/at_exit.h"

#include "base/command_line.h"

#include "base/file_util.h"

-#include "base/file_util.h"

#include "base/files/file_path.h"

#include "base/i18n/icu_util.h"

#include "base/logging.h"

#include "base/path_service.h"

#include "base/process_util.h"

-#include "base/string_util.h"

-#include "googleurl/src/gurl.h"

-#include "googleurl/src/url_parse.h"

-namespace {

-struct Rule {

- bool exception;

- bool wildcard;

-};

-typedef std::map<std::string, Rule> RuleMap;

-typedef std::set<std::string> RuleSet;

-const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";

-const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

-// Writes the list of domain rules contained in the 'rules' set to the

-// 'outfile', with each rule terminated by a LF. The file must already have

-// been created with write access.

-bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {

- std::string data;

- data.append(

-"%{\n"

-"// Use of this source code is governed by a BSD-style license that can be\n"

-"// found in the LICENSE file.\n\n"

-"// This file is generated by net/tools/tld_cleanup/.\n"

-"// DO NOT MANUALLY EDIT!\n"

-"%}\n"

-"struct DomainRule {\n"

-" const char *name;\n"

-" int type; // 1: exception, 2: wildcard\n"

-"};\n"

-"%%\n"

- );

- for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {

- data.append(i->first);

- data.append(", ");

- if (i->second.exception) {

- data.append("1");

- } else if (i->second.wildcard) {

- data.append("2");

- } else {

- data.append("0");

- }

- data.append("\n");

- }

- data.append("%%\n");

- int written = file_util::WriteFile(outfile, data.data(), data.size());

- return written == static_cast<int>(data.size());

-// These result codes should be in increasing order of severity.

-typedef enum {

- kSuccess,

- kWarning,

- kError,

-} NormalizeResult;

-// Adjusts the rule to a standard form: removes single extraneous dots and

-// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as

-// valid; logs a warning and returns kWarning if it is probably invalid; and

-// logs an error and returns kError if the rule is (almost) certainly invalid.

-NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {

- NormalizeResult result = kSuccess;

- // Strip single leading and trailing dots.

- if (domain->at(0) == '.')

- domain->erase(0, 1);

- if (domain->empty()) {

- LOG(WARNING) << "Ignoring empty rule";

- return kWarning;

- }

- if (domain->at(domain->size() - 1) == '.')

- domain->erase(domain->size() - 1, 1);

- if (domain->empty()) {

- LOG(WARNING) << "Ignoring empty rule";

- return kWarning;

- }

- // Allow single leading '*.' or '!', saved here so it's not canonicalized.

- size_t start_offset = 0;

- if (domain->at(0) == '!') {

- domain->erase(0, 1);

- rule->exception = true;

- } else if (domain->find("*.") == 0) {

- domain->erase(0, 2);

- rule->wildcard = true;

- }

- if (domain->empty()) {

- LOG(WARNING) << "Ignoring empty rule";

- return kWarning;

- }

- // Warn about additional '*.' or '!'.

- if (domain->find("*.", start_offset) != std::string::npos ||

- domain->find('!', start_offset) != std::string::npos) {

- LOG(WARNING) << "Keeping probably invalid rule: " << *domain;

- result = kWarning;

- }

- // Make a GURL and normalize it, then get the host back out.

- std::string url = "http://";

- url.append(*domain);

- GURL gurl(url);

- const std::string& spec = gurl.possibly_invalid_spec();

- url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;

- if (host.len < 0) {

- LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;

- return kError;

- }

- if (!gurl.is_valid()) {

- LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;

- result = kWarning;

- }

- domain->assign(spec.substr(host.begin, host.len));

- return result;

-// Loads the file described by 'in_filename', converts it to the desired format

-// (see the file comments above), and saves it into 'out_filename'. Returns

-// the most severe of the result codes encountered when normalizing the rules.

-NormalizeResult NormalizeFile(const base::FilePath& in_filename,

- const base::FilePath& out_filename) {

- std::string data;

- if (!file_util::ReadFileToString(in_filename, &data)) {

- LOG(ERROR) << "Unable to read file";

- // We return success since we've already reported the error.

- return kSuccess;

- }

- // We do a lot of string assignment during parsing, but simplicity is more

- // important than performance here.

- std::string domain;

- NormalizeResult result = kSuccess;

- size_t line_start = 0;

- size_t line_end = 0;

- RuleMap rules;

- RuleSet extra_rules;

- int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;

- while (line_start < data.size()) {

- // Skip the entire section of private domains.

- // TODO(pamg): remove this when http://crbug.com/96086 is fixed.

- if (line_start + begin_private_length < data.size() &&

- !data.compare(line_start, begin_private_length,

- kBeginPrivateDomainsComment)) {

- line_end = data.find(kEndPrivateDomainsComment, line_start);

- if (line_end == std::string::npos) {

- LOG(WARNING) << "Private-domain section had no end marker.";

- line_end = data.size();

- }

- } else if (line_start + 1 < data.size() &&

- data[line_start] == '/' &&

- data[line_start + 1] == '/') {

- // Skip comments.

- line_end = data.find_first_of("\r\n", line_start);

- if (line_end == std::string::npos)

- line_end = data.size();

- } else {

- // Truncate at first whitespace.

- line_end = data.find_first_of("\r\n \t", line_start);

- if (line_end == std::string::npos)

- line_end = data.size();

- domain.assign(data.data(), line_start, line_end - line_start);

- Rule rule;

- rule.wildcard = false;

- rule.exception = false;

- NormalizeResult new_result = NormalizeRule(&domain, &rule);

- if (new_result != kError) {

- // Check the existing rules to make sure we don't have an exception and

- // wildcard for the same rule. If we did, we'd have to update our

- // parsing code to handle this case.

- CHECK(rules.find(domain) == rules.end());

- rules[domain] = rule;

- // Add true TLD for multi-level rules. We don't add them right now, in

- // case there's an exception or wild card that either exists or might be

- // added in a later iteration. In those cases, there's no need to add

- // it and it would just slow down parsing the data.

- size_t tld_start = domain.find_last_of('.');

- if (tld_start != std::string::npos && tld_start + 1 < domain.size())

- extra_rules.insert(domain.substr(tld_start + 1));

- }

- result = std::max(result, new_result);

- }

- // Find beginning of next non-empty line.

- line_start = data.find_first_of("\r\n", line_end);

- if (line_start == std::string::npos)

- line_start = data.size();

- line_start = data.find_first_not_of("\r\n", line_start);

- if (line_start == std::string::npos)

- line_start = data.size();

- }

- for (RuleSet::const_iterator iter = extra_rules.begin();

- iter != extra_rules.end();

- ++iter) {

- if (rules.find(*iter) == rules.end()) {

- Rule rule;

- rule.exception = false;

- rule.wildcard = false;

- rules[*iter] = rule;

- }

- if (!WriteRules(rules, out_filename)) {

- LOG(ERROR) << "Error(s) writing output file";

- result = kError;

- }

- return result;

+#include "net/tools/tld_cleanup/tld_cleanup_util.h"

int main(int argc, const char* argv[]) {

base::EnableTerminationOnHeapCorruption();

@@ -307,13 +82,14 @@ int main(int argc, const char* argv[]) {

"registry_controlled_domains"))

.Append(FILE_PATH_LITERAL(

"effective_tld_names.gperf"));

- NormalizeResult result = NormalizeFile(input_file, output_file);

- if (result != kSuccess) {

+ net::tld_cleanup::NormalizeResult result =

+ net::tld_cleanup::NormalizeFile(input_file, output_file);

+ if (result != net::tld_cleanup::kSuccess) {

fprintf(stderr,

"Errors or warnings processing file. See log in tld_cleanup.log.");

}

- if (result == kError)

+ if (result == net::tld_cleanup::kError)

return 1;

return 0;

}

« no previous file with comments | « net/tools/tld_cleanup/README ('k') | net/tools/tld_cleanup/tld_cleanup.gyp » ('j') | no next file with comments »