Chromium Code Reviews| Index: components/url_formatter/top_domains/make_top_domain_gperf.cc |
| diff --git a/components/url_formatter/top_domains/make_top_domain_gperf.cc b/components/url_formatter/top_domains/make_top_domain_gperf.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..3921bd35fd8fe955aa2fb99a43155d3457167257 |
| --- /dev/null |
| +++ b/components/url_formatter/top_domains/make_top_domain_gperf.cc |
| @@ -0,0 +1,124 @@ |
| +// Copyright (c) 2017 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include <algorithm> |
| +#include <iostream> |
| +#include <sstream> |
| +#include <string> |
| +#include <vector> |
| + |
| +#include "base/base_paths.h" |
| +#include "base/files/file_path.h" |
| +#include "base/files/file_util.h" |
| +#include "base/i18n/icu_util.h" |
| +#include "base/path_service.h" |
| +#include "base/strings/string_split.h" |
| +#include "base/strings/string_util.h" |
| +#include "third_party/icu/source/common/unicode/unistr.h" |
| +#include "third_party/icu/source/common/unicode/utypes.h" |
| +#include "third_party/icu/source/i18n/unicode/uspoof.h" |
| + |
| +// Computes the ICU confusability skeleton of |domain| (given as UTF-8) using |
| +// |spoof_checker|. Returns the skeleton encoded as UTF-8, or an empty string |
| +// when ICU reports a failure. |
| +std::string GetSkeleton(const std::string& domain, |
| +                        const USpoofChecker* spoof_checker) { |
| +  UErrorCode error = U_ZERO_ERROR; |
| +  icu::UnicodeString skeleton_utf16; |
| +  uspoof_getSkeletonUnicodeString( |
| +      spoof_checker, 0 /* not used */, icu::UnicodeString::fromUTF8(domain), |
| +      skeleton_utf16, &error); |
| + |
| +  std::string skeleton_utf8; |
| +  if (U_FAILURE(error)) |
| +    return skeleton_utf8;  // Still empty: signals failure to the caller. |
| + |
| +  skeleton_utf16.toUTF8String(skeleton_utf8); |
| +  return skeleton_utf8; |
| +} |
| + |
| +// Returns the path of |basename| inside components/url_formatter/top_domains |
| +// under the directory that PathService reports for base::DIR_SOURCE_ROOT. |
| +// The result of the PathService lookup is not checked. |
| +base::FilePath GetPath(base::StringPiece basename) { |
| +  base::FilePath source_root; |
| +  base::PathService::Get(base::DIR_SOURCE_ROOT, &source_root); |
| + |
| +  const base::FilePath top_domains_dir = |
| +      source_root.Append(FILE_PATH_LITERAL("components")) |
| +          .Append(FILE_PATH_LITERAL("url_formatter")) |
| +          .Append(FILE_PATH_LITERAL("top_domains")); |
| +  return top_domains_dir.AppendASCII(basename); |
| +} |
| + |
| +// Writes |content| to the file named |basename| in the top_domains directory |
| +// (resolved via GetPath). Logs to stderr and returns false when |
| +// base::WriteFile reports -1; returns true otherwise. |
| +bool WriteToFile(const std::string& content, base::StringPiece basename) { |
| +  const base::FilePath destination = GetPath(basename); |
| +  const bool write_failed = |
| +      base::WriteFile(destination, content.data(), content.size()) == -1; |
| +  if (write_failed) { |
| +    std::cerr << "failed to write to " << destination.AsUTF8Unsafe()

Peter Kasting
2017/05/09 01:37:02
Nit: Initial caps? (several places)
jungshik at Google
2017/05/10 18:05:12
Done.
|
| +              << '\n'; |
| +  } |
| +  return !write_failed;

Peter Kasting
2017/05/09 01:37:02
Nit: Shorter:
bool succeeded = base::WriteFile(
jungshik at Google
2017/05/10 18:05:12
Ok. Turned it into CHECK.
|
| +} |
| + |
| +int main(int argc, const char** argv) { |
| + if (argc != 1) { |
| + std::cerr << "Generate the list of top domain skeletons to use as\n" |
|
Peter Kasting
2017/05/09 01:37:03
Nit: Generates
Seems like this could be wrapped c
jungshik at Google
2017/05/10 18:05:12
Done.
|
| + << "input to base/dafsa/make_dafsa.py\n"; |
|
Peter Kasting
2017/05/09 01:37:02
Nit: Leading << not necessary when continuing prev
jungshik at Google
2017/05/10 18:05:12
Done.
|
| + std::cerr << "Usage: " << argv[0] << '\n'; |
|
Peter Kasting
2017/05/09 01:37:02
Nit: Why not just continue << from the previous st
jungshik at Google
2017/05/10 18:05:13
Done.
|
| + return 1; |
| + } |
| + |
| + |
| + base::i18n::InitializeICU(); |
| + |
| + base::FilePath input_file = GetPath("alexa_domains.list"); |
| + std::string input_content; |
| + if (!base::ReadFileToString(input_file, &input_content)) { |
| + std::cerr << "failed to read the input file " << input_file.AsUTF8Unsafe() |
| + << '\n'; |
| + return 1; |
| + } |
| + |
| + UErrorCode status = U_ZERO_ERROR; |
| + USpoofChecker* spoof_checker = uspoof_open(&status); |
| + if (U_FAILURE(status)) { |
| + std::cerr << "failed to create an ICU uspoof_checker due to " |
| + << u_errorName(status) << ".\n"; |
| + return 1; |
| + } |
| + |
| + std::stringstream input(input_content); |
| + std::stringstream output; |
|
Peter Kasting
2017/05/09 01:37:02
As far as I can tell, you just append unformatted
jungshik at Google
2017/05/10 18:05:13
You.re right. Thank you for catching it.
At first
|
| + |
| + output << "// Copyright 2017 The Chromium Authors. All rights reserved.\n" |
| + << "// Use of this source code is governed by a BSD-style license that" |
| + << " can be\n" |
| + << "// found in the LICENSE file.\n\n" |
| + << "// This file is generated by " |
| + << "components/url_formatter/make_top_domain_gperf.\n" |
| + << "// DO NOT MANUALLY EDIT!\n\n" |
| + << "// Each entry is the skeleton of a top domain for the " |
| + << "confusability check.\n" |
|
Peter Kasting
2017/05/09 01:37:02
Nit: "...for the confusability check in <sourcefil
jungshik at Google
2017/05/10 18:05:12
Done.
|
| + << "%%\n"; |
| + |
| + std::string domain; |
| + size_t max_labels = 0; |
| + std::string domains_with_max_labels; |
|
Peter Kasting
2017/05/09 01:37:02
Nit: domain, singular?
jungshik at Google
2017/05/10 18:05:12
Done.
|
| + while (std::getline(input, domain)) { |
| + if (domain[0] == '#') continue; |
| + std::string skeleton = GetSkeleton(domain, spoof_checker); |
| + if (skeleton.empty()) { |
| + std::cerr << "failed to generate the skeleton of " << domain << '\n'; |
| + output << "// " << domain << '\n'; |
| + } else { |
| + output << skeleton << ", 1\n"; |
| + } |
| + std::vector<base::StringPiece> labels = |
| + base::SplitStringPiece(domain, ".", base::TRIM_WHITESPACE, |
| + base::SPLIT_WANT_ALL); |
| + if (labels.size() > max_labels) { |
| + domains_with_max_labels = domain; |
| + max_labels = labels.size(); |
| + } |
| + |
|
Peter Kasting
2017/05/09 01:37:02
Nit: No blank line
jungshik at Google
2017/05/10 18:05:12
Done.
|
| + } |
| + |
| + output << "%%\n"; |
| + |
| + if (!WriteToFile(output.str(), "alexa_skeletons.gperf")) |
| + return 1; |
| + |
| + std::cout << "The first domain with the largest number of labels is " |
| + << domains_with_max_labels << " and has " << max_labels |
| + << " labels.\n"; |
| + |
| + return 0; |
| +} |