| Index: components/url_formatter/top_domains/make_top_domain_list.cc
|
| diff --git a/components/url_formatter/top_domains/make_top_domain_list.cc b/components/url_formatter/top_domains/make_top_domain_list.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..f497cf886d410129a7d6da563c21803666d51278
|
| --- /dev/null
|
| +++ b/components/url_formatter/top_domains/make_top_domain_list.cc
|
| @@ -0,0 +1,126 @@
|
| +// Copyright (c) 2017 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include <algorithm>
|
| +#include <iostream>
|
| +#include <sstream>
|
| +#include <string>
|
| +#include <vector>
|
| +
|
| +#include "base/base_paths.h"
|
| +#include "base/files/file_path.h"
|
| +#include "base/files/file_util.h"
|
| +#include "base/i18n/icu_util.h"
|
| +#include "base/path_service.h"
|
| +#include "base/strings/string_split.h"
|
| +#include "base/strings/string_util.h"
|
| +#include "third_party/icu/source/common/unicode/unistr.h"
|
| +#include "third_party/icu/source/common/unicode/utypes.h"
|
| +#include "third_party/icu/source/i18n/unicode/uspoof.h"
|
| +
|
| +// Computes the confusability skeleton of |domain| using |spoof_checker|.
|
| +// |domain| is interpreted as UTF-8 and the skeleton is returned as UTF-8.
|
| +// Returns an empty string if ICU reports a failure.
|
| +std::string GetSkeleton(const std::string& domain,
|
| +                        const USpoofChecker* spoof_checker) {
|
| +  const icu::UnicodeString input = icu::UnicodeString::fromUTF8(domain);
|
| +  icu::UnicodeString skeleton_ustr;
|
| +  UErrorCode error = U_ZERO_ERROR;
|
| +  // The second argument (type) is not used.
|
| +  uspoof_getSkeletonUnicodeString(spoof_checker, 0, input, skeleton_ustr,
|
| +                                  &error);
|
| +
|
| +  std::string result;
|
| +  if (U_FAILURE(error))
|
| +    return result;
|
| +  skeleton_ustr.toUTF8String(result);
|
| +  return result;
|
| +}
|
| +
|
| +// Returns the path of |basename| inside
|
| +// <source root>/components/url_formatter/top_domains.
|
| +//
|
| +// If the source root cannot be resolved, the failure is reported on stderr and
|
| +// an empty path is returned, rather than building a relative path from an
|
| +// empty root that would depend on the current working directory.
|
| +base::FilePath GetPath(base::StringPiece basename) {
|
| +  base::FilePath source_root;
|
| +  if (!base::PathService::Get(base::DIR_SOURCE_ROOT, &source_root)) {
|
| +    std::cerr << "failed to locate the source root directory\n";
|
| +    return base::FilePath();
|
| +  }
|
| +  return source_root.Append(FILE_PATH_LITERAL("components"))
|
| +      .Append(FILE_PATH_LITERAL("url_formatter"))
|
| +      .Append(FILE_PATH_LITERAL("top_domains"))
|
| +      .AppendASCII(basename);
|
| +}
|
| +
|
| +// Writes |content| to the file that GetPath() resolves for |basename|.
|
| +// Returns true unless base::WriteFile() reports an error (-1), in which case
|
| +// the destination path is logged to stderr and false is returned.
|
| +bool WriteToFile(const std::string& content, base::StringPiece basename) {
|
| +  const base::FilePath destination = GetPath(basename);
|
| +  const int write_result =
|
| +      base::WriteFile(destination, content.data(), content.size());
|
| +  if (write_result != -1)
|
| +    return true;
|
| +
|
| +  std::cerr << "failed to write to " << destination.AsUTF8Unsafe()
|
| +            << '\n';
|
| +  return false;
|
| +}
|
| +
|
| +// Reads alexa_10k_domains.list and writes alexa_10k_names_and_skeletons.gperf;
|
| +// both file names are resolved through GetPath(). For every domain in the
|
| +// input the output holds its skeleton (value 0) followed by the domain itself
|
| +// (value 1). Blank lines and lines starting with '#' are ignored.
|
| +//
|
| +// Takes no command-line arguments. Returns 0 on success and 1 on failure.
|
| +int main(int argc, const char** argv) {
|
| +  if (argc != 1) {
|
| +    std::cerr << "Generate top domain names and skeletons list to use as\n"
|
| +              << "input to base/dafsa/make_dafsa.py\n";
|
| +    std::cerr << "Usage: " << argv[0] << '\n';
|
| +    return 1;
|
| +  }
|
| +
|
| +  // Do not continue if ICU could not be initialized: the skeletons below are
|
| +  // computed by ICU.
|
| +  if (!base::i18n::InitializeICU()) {
|
| +    std::cerr << "failed to initialize ICU\n";
|
| +    return 1;
|
| +  }
|
| +
|
| +  base::FilePath input_file = GetPath("alexa_10k_domains.list");
|
| +  std::string input_content;
|
| +  if (!base::ReadFileToString(input_file, &input_content)) {
|
| +    std::cerr << "failed to read the input file " << input_file.AsUTF8Unsafe()
|
| +              << '\n';
|
| +    return 1;
|
| +  }
|
| +
|
| +  UErrorCode status = U_ZERO_ERROR;
|
| +  USpoofChecker* spoof_checker = uspoof_open(&status);
|
| +  if (U_FAILURE(status)) {
|
| +    std::cerr << "failed to create an ICU uspoof_checker due to "
|
| +              << u_errorName(status) << ".\n";
|
| +    return 1;
|
| +  }
|
| +
|
| +  std::stringstream input(input_content);
|
| +  std::stringstream output;
|
| +
|
| +  output << "// Copyright 2017 The Chromium Authors. All rights reserved.\n"
|
| +         << "// Use of this source code is governed by a BSD-style license that"
|
| +         << " can be\n"
|
| +         << "// found in the LICENSE file.\n\n"
|
| +         << "// This file is generated by "
|
| +         << "components/url_formatter/make_top_domain_list.\n"
|
| +         << "// DO NOT MANUALLY EDIT!\n\n"
|
| +         << "// Each entry has one of the following two values.\n"
|
| +         << "// 0: skeletons for confusable look-up.\n"
|
| +         << "// 1: original domain name.\n"
|
| +         << "%%\n";
|
| +
|
| +  std::string line;
|
| +  size_t max_labels = 0;
|
| +  std::string domains_with_max_labels;
|
| +  while (std::getline(input, line)) {
|
| +    // Strip surrounding whitespace (e.g. the '\r' of a CRLF line ending) so
|
| +    // that it does not end up in the generated entries.
|
| +    std::string domain;
|
| +    base::TrimWhitespaceASCII(line, base::TRIM_ALL, &domain);
|
| +    // Skip blank lines and comments. A blank line would otherwise be written
|
| +    // out as an entry with an empty domain name.
|
| +    if (domain.empty() || domain[0] == '#')
|
| +      continue;
|
| +
|
| +    std::string skeleton = GetSkeleton(domain, spoof_checker);
|
| +    if (!skeleton.empty()) {
|
| +      output << skeleton << ", 0\n";
|
| +    } else {
|
| +      // Keep a trace of the domain whose skeleton is missing in the output.
|
| +      std::cerr << "failed to generate the skeleton of " << domain << '\n';
|
| +      output << "// " << domain << '\n';
|
| +    }
|
| +    output << domain << ", 1\n";
|
| +
|
| +    // Track the first domain that has the largest number of labels; it is
|
| +    // reported on stdout once the output file has been written.
|
| +    std::vector<base::StringPiece> labels =
|
| +        base::SplitStringPiece(domain, ".", base::TRIM_WHITESPACE,
|
| +                               base::SPLIT_WANT_ALL);
|
| +    if (labels.size() > max_labels) {
|
| +      domains_with_max_labels = domain;
|
| +      max_labels = labels.size();
|
| +    }
|
| +  }
|
| +
|
| +  // The checker is not used past this point; release it.
|
| +  uspoof_close(spoof_checker);
|
| +
|
| +  output << "%%\n";
|
| +
|
| +  if (!WriteToFile(output.str(), "alexa_10k_names_and_skeletons.gperf"))
|
| +    return 1;
|
| +
|
| +  std::cout << "The first domain with the largest number of labels is "
|
| +            << domains_with_max_labels << " and has " << max_labels
|
| +            << " labels.\n";
|
| +
|
| +  return 0;
|
| +}
|
|
|