OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <algorithm> | |
6 #include <iostream> | |
7 #include <sstream> | |
8 #include <string> | |
9 #include <vector> | |
10 | |
11 #include "base/base_paths.h" | |
12 #include "base/files/file_path.h" | |
13 #include "base/files/file_util.h" | |
14 #include "base/i18n/icu_util.h" | |
15 #include "base/path_service.h" | |
16 #include "base/strings/string_split.h" | |
17 #include "base/strings/string_util.h" | |
18 #include "third_party/icu/source/common/unicode/unistr.h" | |
19 #include "third_party/icu/source/common/unicode/utypes.h" | |
20 #include "third_party/icu/source/i18n/unicode/uspoof.h" | |
21 | |
22 std::string GetSkeleton(const std::string& domain, | |
23 const USpoofChecker* spoof_checker) { | |
24 UErrorCode status = U_ZERO_ERROR; | |
25 icu::UnicodeString ustr_skeleton; | |
26 uspoof_getSkeletonUnicodeString(spoof_checker, 0, /* not used */ | |
27 icu::UnicodeString::fromUTF8(domain), | |
28 ustr_skeleton, &status); | |
29 std::string skeleton; | |
30 return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton; | |
31 } | |
32 | |
33 base::FilePath GetPath(base::StringPiece basename) { | |
34 base::FilePath path; | |
35 base::PathService::Get(base::DIR_SOURCE_ROOT, &path); | |
36 return path.Append(FILE_PATH_LITERAL("components")) | |
37 .Append(FILE_PATH_LITERAL("url_formatter")) | |
38 .Append(FILE_PATH_LITERAL("top_domains")).AppendASCII(basename); | |
39 } | |
40 | |
41 bool WriteToFile(const std::string& content, base::StringPiece basename) { | |
42 base::FilePath path = GetPath(basename); | |
43 if (base::WriteFile(path, content.data(), content.size()) == -1) { | |
44 std::cerr << "failed to write to " << path.AsUTF8Unsafe() | |
Peter Kasting
2017/05/09 01:37:02
Nit: Initial caps? (several places)
jungshik at Google
2017/05/10 18:05:12
Done.
| |
45 << '\n'; | |
46 return false; | |
47 } | |
48 return true; | |
Peter Kasting
2017/05/09 01:37:02
Nit: Shorter:
bool succeeded = base::WriteFile(
jungshik at Google
2017/05/10 18:05:12
Ok. Turned it into CHECK.
| |
49 } | |
50 | |
51 int main(int argc, const char** argv) { | |
52 if (argc != 1) { | |
53 std::cerr << "Generate the list of top domain skeletons to use as\n" | |
Peter Kasting
2017/05/09 01:37:03
Nit: Generates
Seems like this could be wrapped c
jungshik at Google
2017/05/10 18:05:12
Done.
| |
54 << "input to base/dafsa/make_dafsa.py\n"; | |
Peter Kasting
2017/05/09 01:37:02
Nit: Leading << not necessary when continuing prev
jungshik at Google
2017/05/10 18:05:12
Done.
| |
55 std::cerr << "Usage: " << argv[0] << '\n'; | |
Peter Kasting
2017/05/09 01:37:02
Nit: Why not just continue << from the previous st
jungshik at Google
2017/05/10 18:05:13
Done.
| |
56 return 1; | |
57 } | |
58 | |
59 | |
60 base::i18n::InitializeICU(); | |
61 | |
62 base::FilePath input_file = GetPath("alexa_domains.list"); | |
63 std::string input_content; | |
64 if (!base::ReadFileToString(input_file, &input_content)) { | |
65 std::cerr << "failed to read the input file " << input_file.AsUTF8Unsafe() | |
66 << '\n'; | |
67 return 1; | |
68 } | |
69 | |
70 UErrorCode status = U_ZERO_ERROR; | |
71 USpoofChecker* spoof_checker = uspoof_open(&status); | |
72 if (U_FAILURE(status)) { | |
73 std::cerr << "failed to create an ICU uspoof_checker due to " | |
74 << u_errorName(status) << ".\n"; | |
75 return 1; | |
76 } | |
77 | |
78 std::stringstream input(input_content); | |
79 std::stringstream output; | |
Peter Kasting
2017/05/09 01:37:02
As far as I can tell, you just append unformatted
jungshik at Google
2017/05/10 18:05:13
You.re right. Thank you for catching it.
At first
| |
80 | |
81 output << "// Copyright 2017 The Chromium Authors. All rights reserved.\n" | |
82 << "// Use of this source code is governed by a BSD-style license that" | |
83 << " can be\n" | |
84 << "// found in the LICENSE file.\n\n" | |
85 << "// This file is generated by " | |
86 << "components/url_formatter/make_top_domain_gperf.\n" | |
87 << "// DO NOT MANUALLY EDIT!\n\n" | |
88 << "// Each entry is the skeleton of a top domain for the " | |
89 << "confusability check.\n" | |
Peter Kasting
2017/05/09 01:37:02
Nit: "...for the confusability check in <sourcefil
jungshik at Google
2017/05/10 18:05:12
Done.
| |
90 << "%%\n"; | |
91 | |
92 std::string domain; | |
93 size_t max_labels = 0; | |
94 std::string domains_with_max_labels; | |
Peter Kasting
2017/05/09 01:37:02
Nit: domain, singular?
jungshik at Google
2017/05/10 18:05:12
Done.
| |
95 while (std::getline(input, domain)) { | |
96 if (domain[0] == '#') continue; | |
97 std::string skeleton = GetSkeleton(domain, spoof_checker); | |
98 if (skeleton.empty()) { | |
99 std::cerr << "failed to generate the skeleton of " << domain << '\n'; | |
100 output << "// " << domain << '\n'; | |
101 } else { | |
102 output << skeleton << ", 1\n"; | |
103 } | |
104 std::vector<base::StringPiece> labels = | |
105 base::SplitStringPiece(domain, ".", base::TRIM_WHITESPACE, | |
106 base::SPLIT_WANT_ALL); | |
107 if (labels.size() > max_labels) { | |
108 domains_with_max_labels = domain; | |
109 max_labels = labels.size(); | |
110 } | |
111 | |
Peter Kasting
2017/05/09 01:37:02
Nit: No blank line
jungshik at Google
2017/05/10 18:05:12
Done.
| |
112 } | |
113 | |
114 output << "%%\n"; | |
115 | |
116 if (!WriteToFile(output.str(), "alexa_skeletons.gperf")) | |
117 return 1; | |
118 | |
119 std::cout << "The first domain with the largest number of labels is " | |
120 << domains_with_max_labels << " and has " << max_labels | |
121 << " labels.\n"; | |
122 | |
123 return 0; | |
124 } | |
OLD | NEW |