OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <algorithm> | |
6 #include <iostream> | |
7 #include <sstream> | |
8 #include <string> | |
9 #include <vector> | |
10 | |
11 #include "base/base_paths.h" | |
12 #include "base/files/file_path.h" | |
13 #include "base/files/file_util.h" | |
14 #include "base/i18n/icu_util.h" | |
15 #include "base/path_service.h" | |
16 #include "base/strings/string_split.h" | |
17 #include "base/strings/string_util.h" | |
18 #include "third_party/icu/source/common/unicode/unistr.h" | |
19 #include "third_party/icu/source/common/unicode/utypes.h" | |
20 #include "third_party/icu/source/i18n/unicode/uspoof.h" | |
21 | |
22 std::string GetSkeleton(const std::string& domain, | |
23 const USpoofChecker* spoof_checker) { | |
24 UErrorCode status = U_ZERO_ERROR; | |
25 icu::UnicodeString ustr_skeleton; | |
26 uspoof_getSkeletonUnicodeString(spoof_checker, 0, /* not used */ | |
Peter Kasting
2017/05/10 22:38:47
Nit: Suggest ',' after */ instead of before
jungshik at Google
2017/05/14 09:36:23
Done.
| |
27 icu::UnicodeString::fromUTF8(domain), | |
28 ustr_skeleton, &status); | |
29 std::string skeleton; | |
30 return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton; | |
31 } | |
32 | |
33 base::FilePath GetPath(base::StringPiece basename) { | |
34 base::FilePath path; | |
35 base::PathService::Get(base::DIR_SOURCE_ROOT, &path); | |
36 return path.Append(FILE_PATH_LITERAL("components")) | |
37 .Append(FILE_PATH_LITERAL("url_formatter")) | |
Peter Kasting
2017/05/10 22:38:47
Nit: Is this how git cl format indented this? I'd
jungshik at Google
2017/05/14 09:36:23
Yes, even if I get dots aligned manually, 'git cl
| |
38 .Append(FILE_PATH_LITERAL("top_domains")) | |
39 .AppendASCII(basename); | |
40 } | |
41 | |
42 void WriteToFile(const std::string& content, base::StringPiece basename) { | |
43 base::FilePath path = GetPath(basename); | |
44 bool succeeded = base::WriteFile(path, content.data(), content.size()) != -1; | |
Peter Kasting
2017/05/10 22:38:47
Again, shouldn't this check "== content.size()"?
jungshik at Google
2017/05/14 09:36:23
Done.
| |
45 CHECK(succeeded) << "Failed to write to " << path.AsUTF8Unsafe() << '\n'; | |
Peter Kasting
2017/05/10 22:38:47
Nit: Maybe rather than CHECK, return |succeeded|,
jungshik at Google
2017/05/14 09:36:23
Done.
| |
46 } | |
47 | |
48 int main(int argc, const char** argv) { | |
49 if (argc != 1) { | |
50 std::cerr << "Generates the list of top domain skeletons to use as input to" | |
51 "\nbase/dafsa/make_dafsa.py.\nUsage: " | |
52 << argv[0] << '\n'; | |
53 return 1; | |
54 } | |
55 | |
56 base::i18n::InitializeICU(); | |
57 base::FilePath input_file = GetPath("alexa_domains.list"); | |
58 std::string input_content; | |
59 if (!base::ReadFileToString(input_file, &input_content)) { | |
60 std::cerr << "Failed to read the input file " << input_file.AsUTF8Unsafe() | |
61 << '\n'; | |
62 return 1; | |
63 } | |
64 | |
65 UErrorCode status = U_ZERO_ERROR; | |
66 USpoofChecker* spoof_checker = uspoof_open(&status); | |
67 if (U_FAILURE(status)) { | |
68 std::cerr << "Failed to create an ICU uspoof_checker due to " | |
69 << u_errorName(status) << ".\n"; | |
70 return 1; | |
71 } | |
72 | |
73 std::stringstream input(input_content); | |
74 std::string output = | |
75 "// Copyright 2017 The Chromium Authors. All rights reserved.\n" | |
Peter Kasting
2017/05/10 22:38:47
Nit: Maybe consider a raw string literal
jungshik at Google
2017/05/14 09:36:23
Done.
| |
76 "// Use of this source code is governed by a BSD-style license that can " | |
77 "be\n" | |
78 "// found in the LICENSE file.\n\n" | |
79 "// This file is generated by components/url_formatter/" | |
80 "make_top_domain_gperf.cc\n" | |
81 "// DO NOT MANUALLY EDIT!\n\n" | |
82 "// Each entry is the skeleton of a top domain for the confusability " | |
83 "check\n" | |
84 "// in components/url_formatter/url_formatter.cc.\n" | |
85 "%%\n"; | |
86 | |
87 std::string domain; | |
88 size_t max_labels = 0; | |
89 std::string domain_with_max_labels; | |
90 while (std::getline(input, domain)) { | |
91 if (domain[0] == '#') | |
92 continue; | |
93 std::string skeleton = GetSkeleton(domain, spoof_checker); | |
94 if (skeleton.empty()) { | |
95 std::cerr << "Failed to generate the skeleton of " << domain << '\n'; | |
96 output += "// " + domain + '\n'; | |
97 } else { | |
98 output += skeleton + ", 1\n"; | |
99 } | |
100 std::vector<base::StringPiece> labels = base::SplitStringPiece( | |
101 domain, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); | |
102 if (labels.size() > max_labels) { | |
103 domain_with_max_labels = domain; | |
104 max_labels = labels.size(); | |
105 } | |
106 } | |
107 | |
108 output += "%%\n"; | |
109 | |
110 WriteToFile(output, "alexa_skeletons.gperf"); | |
111 | |
112 std::cout << "The first domain with the largest number of labels is " | |
113 << domain_with_max_labels << " and has " << max_labels | |
114 << " labels.\n"; | |
115 | |
116 return 0; | |
117 } | |
OLD | NEW |