Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: components/url_formatter/top_domains/make_top_domain_list.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: add similarity check unittests Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <algorithm>
6 #include <iostream>
7 #include <sstream>
8 #include <string>
9 #include <vector>
10
11 #include "base/base_paths.h"
12 #include "base/files/file_path.h"
13 #include "base/files/file_util.h"
14 #include "base/i18n/icu_util.h"
15 #include "base/path_service.h"
16 #include "base/strings/string_split.h"
17 #include "base/strings/string_util.h"
18 #include "third_party/icu/source/common/unicode/unistr.h"
19 #include "third_party/icu/source/common/unicode/utypes.h"
20 #include "third_party/icu/source/i18n/unicode/uspoof.h"
21
22 std::string GetSkeleton(const std::string& domain,
23 const USpoofChecker* spoof_checker) {
24 UErrorCode status = U_ZERO_ERROR;
25 icu::UnicodeString ustr_skeleton;
26 uspoof_getSkeletonUnicodeString(spoof_checker, 0, /* not used */
27 icu::UnicodeString::fromUTF8(domain),
28 ustr_skeleton, &status);
29 std::string skeleton;
30 return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton;
31 }
32
33 base::FilePath GetPath(base::StringPiece basename) {
34 base::FilePath path;
35 base::PathService::Get(base::DIR_SOURCE_ROOT, &path);
36 return path.Append(FILE_PATH_LITERAL("components"))
37 .Append(FILE_PATH_LITERAL("url_formatter"))
38 .Append(FILE_PATH_LITERAL("top_domains")).AppendASCII(basename);
39 }
40
41 bool WriteToFile(const std::string& content, base::StringPiece basename) {
42 base::FilePath path = GetPath(basename);
43 if (base::WriteFile(path, content.data(), content.size()) == -1) {
44 std::cerr << "failed to write to " << path.AsUTF8Unsafe()
45 << '\n';
46 return false;
47 }
48 return true;
49 }
50
51 int main(int argc, const char** argv) {
52 if (argc != 1) {
53 std::cerr << "Generate top domain names and skeletons list to use as\n"
54 << "input to base/dafsa/make_dafsa.py\n";
55 std::cerr << "Usage: " << argv[0] << '\n';
56 return 1;
57 }
58
59
60 base::i18n::InitializeICU();
61
62 base::FilePath input_file = GetPath("alexa_10k_domains.list");
63 std::string input_content;
64 if (!base::ReadFileToString(input_file, &input_content)) {
65 std::cerr << "failed to read the input file " << input_file.AsUTF8Unsafe()
66 << '\n';
67 return 1;
68 }
69
70 UErrorCode status = U_ZERO_ERROR;
71 USpoofChecker* spoof_checker = uspoof_open(&status);
72 if (U_FAILURE(status)) {
73 std::cerr << "failed to create an ICU uspoof_checker due to "
74 << u_errorName(status) << ".\n";
75 return 1;
76 }
77
78 std::stringstream input(input_content);
79 std::stringstream output;
80
81 output << "// Copyright 2017 The Chromium Authors. All rights reserved.\n"
82 << "// Use of this source code is governed by a BSD-style license that"
83 << " can be\n"
84 << "// found in the LICENSE file.\n\n"
85 << "// This file is generated by "
86 << "components/url_formatter/make_top_domain_list.\n"
87 << "// DO NOT MANUALLY EDIT!\n\n"
88 << "// Each entry has one of the following two values.\n"
89 << "// 0: skeletons for confusable look-up.\n"
90 << "// 1: original domain name.\n"
91 << "%%\n";
92
93 std::string domain;
94 size_t max_labels = 0;
95 std::string domains_with_max_labels;
96 while (std::getline(input, domain)) {
97 if (domain[0] == '#') continue;
98 std::string skeleton = GetSkeleton(domain, spoof_checker);
99 if (!skeleton.empty()) {
100 output << skeleton << ", 0\n";
101 } else {
102 std::cerr << "failed to generate the skeleton of " << domain << '\n';
103 output << "// " << domain << '\n';
104 }
105 output << domain << ", 1\n";
106 std::vector<base::StringPiece> labels =
107 base::SplitStringPiece(domain, ".", base::TRIM_WHITESPACE,
108 base::SPLIT_WANT_ALL);
109 if (labels.size() > max_labels) {
110 domains_with_max_labels = domain;
111 max_labels = labels.size();
112 }
113
114 }
115
116 output << "%%\n";
117
118 if (!WriteToFile(output.str(), "alexa_10k_names_and_skeletons.gperf"))
119 return 1;
120
121 std::cout << "The first domain with the largest number of labels is "
122 << domains_with_max_labels << " and has " << max_labels
123 << " labels.\n";
124
125 return 0;
126 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698