OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2017 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include <algorithm> |
| 6 #include <iostream> |
| 7 #include <sstream> |
| 8 #include <string> |
| 9 #include <vector> |
| 10 |
| 11 #include "base/base_paths.h" |
| 12 #include "base/files/file_path.h" |
| 13 #include "base/files/file_util.h" |
| 14 #include "base/i18n/icu_util.h" |
| 15 #include "base/path_service.h" |
| 16 #include "base/strings/string_split.h" |
| 17 #include "base/strings/string_util.h" |
| 18 #include "third_party/icu/source/common/unicode/unistr.h" |
| 19 #include "third_party/icu/source/common/unicode/utypes.h" |
| 20 #include "third_party/icu/source/i18n/unicode/uspoof.h" |
| 21 |
| 22 std::string GetSkeleton(const std::string& domain, |
| 23 const USpoofChecker* spoof_checker) { |
| 24 UErrorCode status = U_ZERO_ERROR; |
| 25 icu::UnicodeString ustr_skeleton; |
| 26 uspoof_getSkeletonUnicodeString(spoof_checker, 0, /* not used */ |
| 27 icu::UnicodeString::fromUTF8(domain), |
| 28 ustr_skeleton, &status); |
| 29 std::string skeleton; |
| 30 return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton; |
| 31 } |
| 32 |
| 33 base::FilePath GetPath(base::StringPiece basename) { |
| 34 base::FilePath path; |
| 35 base::PathService::Get(base::DIR_SOURCE_ROOT, &path); |
| 36 return path.Append(FILE_PATH_LITERAL("components")) |
| 37 .Append(FILE_PATH_LITERAL("url_formatter")) |
| 38 .Append(FILE_PATH_LITERAL("top_domains")).AppendASCII(basename); |
| 39 } |
| 40 |
| 41 bool WriteToFile(const std::string& content, base::StringPiece basename) { |
| 42 base::FilePath path = GetPath(basename); |
| 43 if (base::WriteFile(path, content.data(), content.size()) == -1) { |
| 44 std::cerr << "failed to write to " << path.AsUTF8Unsafe() |
| 45 << '\n'; |
| 46 return false; |
| 47 } |
| 48 return true; |
| 49 } |
| 50 |
| 51 int main(int argc, const char** argv) { |
| 52 if (argc != 1) { |
| 53 std::cerr << "Generate top domain names and skeletons list to use as\n" |
| 54 << "input to base/dafsa/make_dafsa.py\n"; |
| 55 std::cerr << "Usage: " << argv[0] << '\n'; |
| 56 return 1; |
| 57 } |
| 58 |
| 59 |
| 60 base::i18n::InitializeICU(); |
| 61 |
| 62 base::FilePath input_file = GetPath("alexa_domains.list"); |
| 63 std::string input_content; |
| 64 if (!base::ReadFileToString(input_file, &input_content)) { |
| 65 std::cerr << "failed to read the input file " << input_file.AsUTF8Unsafe() |
| 66 << '\n'; |
| 67 return 1; |
| 68 } |
| 69 |
| 70 UErrorCode status = U_ZERO_ERROR; |
| 71 USpoofChecker* spoof_checker = uspoof_open(&status); |
| 72 if (U_FAILURE(status)) { |
| 73 std::cerr << "failed to create an ICU uspoof_checker due to " |
| 74 << u_errorName(status) << ".\n"; |
| 75 return 1; |
| 76 } |
| 77 |
| 78 std::stringstream input(input_content); |
| 79 std::stringstream output; |
| 80 |
| 81 output << "// Copyright 2017 The Chromium Authors. All rights reserved.\n" |
| 82 << "// Use of this source code is governed by a BSD-style license that" |
| 83 << " can be\n" |
| 84 << "// found in the LICENSE file.\n\n" |
| 85 << "// This file is generated by " |
| 86 << "components/url_formatter/make_top_domain_list.\n" |
| 87 << "// DO NOT MANUALLY EDIT!\n\n" |
| 88 << "// Each entry has one of the following two values.\n" |
| 89 << "// 1: original domain name.\n" |
| 90 << "// 2: skeletons for confusable look-up.\n" |
| 91 << "// 3: domain name and skeleton.\n" |
| 92 << "%%\n"; |
| 93 |
| 94 std::string domain; |
| 95 size_t max_labels = 0; |
| 96 std::string domains_with_max_labels; |
| 97 while (std::getline(input, domain)) { |
| 98 if (domain[0] == '#') continue; |
| 99 std::string skeleton = GetSkeleton(domain, spoof_checker); |
| 100 if (skeleton.empty()) { |
| 101 std::cerr << "failed to generate the skeleton of " << domain << '\n'; |
| 102 output << "// " << domain << '\n'; |
| 103 } else if (skeleton != domain) { |
| 104 output << domain << ", 1\n"; |
| 105 output << skeleton << ", 2\n"; |
| 106 } else { |
| 107 output << domain << ", 3\n"; |
| 108 } |
| 109 std::vector<base::StringPiece> labels = |
| 110 base::SplitStringPiece(domain, ".", base::TRIM_WHITESPACE, |
| 111 base::SPLIT_WANT_ALL); |
| 112 if (labels.size() > max_labels) { |
| 113 domains_with_max_labels = domain; |
| 114 max_labels = labels.size(); |
| 115 } |
| 116 |
| 117 } |
| 118 |
| 119 output << "%%\n"; |
| 120 |
| 121 if (!WriteToFile(output.str(), "alexa_names_and_skeletons.gperf")) |
| 122 return 1; |
| 123 |
| 124 std::cout << "The first domain with the largest number of labels is " |
| 125 << domains_with_max_labels << " and has " << max_labels |
| 126 << " labels.\n"; |
| 127 |
| 128 return 0; |
| 129 } |
OLD | NEW |