OLD | NEW |
| (Empty) |
1 // Copyright (c) 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <algorithm> | |
6 #include <iostream> | |
7 #include <sstream> | |
8 #include <string> | |
9 #include <vector> | |
10 | |
11 #include "base/base_paths.h" | |
12 #include "base/files/file_path.h" | |
13 #include "base/files/file_util.h" | |
14 #include "base/i18n/icu_util.h" | |
15 #include "base/path_service.h" | |
16 #include "base/strings/string_split.h" | |
17 #include "base/strings/string_util.h" | |
18 #include "third_party/icu/source/common/unicode/unistr.h" | |
19 #include "third_party/icu/source/common/unicode/utypes.h" | |
20 #include "third_party/icu/source/i18n/unicode/uspoof.h" | |
21 | |
22 std::string GetSkeleton(const std::string& domain, | |
23 const USpoofChecker* spoof_checker) { | |
24 UErrorCode status = U_ZERO_ERROR; | |
25 icu::UnicodeString ustr_skeleton; | |
26 uspoof_getSkeletonUnicodeString(spoof_checker, 0 /* not used */, | |
27 icu::UnicodeString::fromUTF8(domain), | |
28 ustr_skeleton, &status); | |
29 std::string skeleton; | |
30 return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton; | |
31 } | |
32 | |
33 base::FilePath GetPath(base::StringPiece basename) { | |
34 base::FilePath path; | |
35 base::PathService::Get(base::DIR_SOURCE_ROOT, &path); | |
36 return path.Append(FILE_PATH_LITERAL("components")) | |
37 .Append(FILE_PATH_LITERAL("url_formatter")) | |
38 .Append(FILE_PATH_LITERAL("top_domains")) | |
39 .AppendASCII(basename); | |
40 } | |
41 | |
42 bool WriteToFile(const std::string& content, base::StringPiece basename) { | |
43 base::FilePath path = GetPath(basename); | |
44 size_t size = content.size(); | |
45 bool succeeded = | |
46 static_cast<size_t>(base::WriteFile(path, content.data(), size)) == size; | |
47 if (!succeeded) | |
48 std::cerr << "Failed to write to " << path.AsUTF8Unsafe() << '\n'; | |
49 return succeeded; | |
50 } | |
51 | |
52 int main(int argc, const char** argv) { | |
53 if (argc != 1) { | |
54 std::cerr << "Generates the list of top domain skeletons to use as input to" | |
55 "\nbase/dafsa/make_dafsa.py.\nUsage: " | |
56 << argv[0] << '\n'; | |
57 return 1; | |
58 } | |
59 | |
60 base::i18n::InitializeICU(); | |
61 base::FilePath input_file = GetPath("alexa_domains.list"); | |
62 std::string input_content; | |
63 if (!base::ReadFileToString(input_file, &input_content)) { | |
64 std::cerr << "Failed to read the input file " << input_file.AsUTF8Unsafe() | |
65 << '\n'; | |
66 return 1; | |
67 } | |
68 | |
69 UErrorCode status = U_ZERO_ERROR; | |
70 USpoofChecker* spoof_checker = uspoof_open(&status); | |
71 if (U_FAILURE(status)) { | |
72 std::cerr << "Failed to create an ICU uspoof_checker due to " | |
73 << u_errorName(status) << ".\n"; | |
74 return 1; | |
75 } | |
76 | |
77 std::stringstream input(input_content); | |
78 std::string output = | |
79 R"(// Copyright 2017 The Chromium Authors. All rights reserved. | |
80 // Use of this source code is governed by a BSD-style license that can be | |
81 // found in the LICENSE file. | |
82 | |
83 // This file is generated by components/url_formatter/make_top_domain_gperf.cc | |
84 // DO NOT MANUALLY EDIT! | |
85 | |
86 // Each entry is the skeleton of a top domain for the confusability check | |
87 // in components/url_formatter/url_formatter.cc. | |
88 %% | |
89 )"; | |
90 | |
91 std::string domain; | |
92 size_t max_labels = 0; | |
93 std::string domain_with_max_labels; | |
94 while (std::getline(input, domain)) { | |
95 if (domain[0] == '#') | |
96 continue; | |
97 std::string skeleton = GetSkeleton(domain, spoof_checker); | |
98 if (skeleton.empty()) { | |
99 std::cerr << "Failed to generate the skeleton of " << domain << '\n'; | |
100 output += "// " + domain + '\n'; | |
101 } else { | |
102 output += skeleton + ", 1\n"; | |
103 } | |
104 std::vector<base::StringPiece> labels = base::SplitStringPiece( | |
105 domain, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); | |
106 if (labels.size() > max_labels) { | |
107 domain_with_max_labels = domain; | |
108 max_labels = labels.size(); | |
109 } | |
110 } | |
111 | |
112 output += "%%\n"; | |
113 | |
114 if (!WriteToFile(output, "alexa_skeletons.gperf")) | |
115 return 1; | |
116 | |
117 std::cout << "The first domain with the largest number of labels is " | |
118 << domain_with_max_labels << " and has " << max_labels | |
119 << " labels.\n"; | |
120 | |
121 return 0; | |
122 } | |
OLD | NEW |