Chromium Code Reviews

| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright (c) 2017 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include <algorithm> | |
| 6 #include <iostream> | |
| 7 #include <sstream> | |
| 8 #include <string> | |
| 9 #include <vector> | |
| 10 | |
| 11 #include "base/base_paths.h" | |
| 12 #include "base/files/file_path.h" | |
| 13 #include "base/files/file_util.h" | |
| 14 #include "base/i18n/icu_util.h" | |
| 15 #include "base/path_service.h" | |
| 16 #include "base/strings/string_split.h" | |
| 17 #include "base/strings/string_util.h" | |
| 18 #include "third_party/icu/source/common/unicode/unistr.h" | |
| 19 #include "third_party/icu/source/common/unicode/utypes.h" | |
| 20 #include "third_party/icu/source/i18n/unicode/uspoof.h" | |
| 21 | |
| // Returns the UTF-8 form of the skeleton that uspoof_getSkeletonUnicodeString | |
| // produces for `domain`, or an empty string when ICU reports a failure. | |
| 22 std::string GetSkeleton(const std::string& domain, | |
| 23 const USpoofChecker* spoof_checker) { | |
| 24 UErrorCode status = U_ZERO_ERROR; | |
| 25 icu::UnicodeString ustr_skeleton; | |
| 26 uspoof_getSkeletonUnicodeString(spoof_checker, 0, /* not used */ | |

Peter Kasting
2017/05/10 22:38:47
Nit: Suggest ',' after */ instead of before
jungshik at Google
2017/05/14 09:36:23
Done.
| |
| 27 icu::UnicodeString::fromUTF8(domain), | |
| 28 ustr_skeleton, &status); | |
| 29 std::string skeleton; | |
| // `skeleton` is still empty at this point, so the failure branch returns "". | |
| 30 return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton; | |
| 31 } | |
| 32 | |
| // Returns components/url_formatter/top_domains/basename appended to the | |
| // directory that PathService reports for base::DIR_SOURCE_ROOT. | |
| 33 base::FilePath GetPath(base::StringPiece basename) { | |
| 34 base::FilePath path; | |
| // NOTE(review): the return value of PathService::Get is ignored here; confirm | |
| // that continuing with a default-constructed path on failure is acceptable. | |
| 35 base::PathService::Get(base::DIR_SOURCE_ROOT, &path); | |
| 36 return path.Append(FILE_PATH_LITERAL("components")) | |
| 37 .Append(FILE_PATH_LITERAL("url_formatter")) | |

Peter Kasting
2017/05/10 22:38:47
Nit: Is this how git cl format indented this? I'd
jungshik at Google
2017/05/14 09:36:23
Yes, even if I get dots aligned manually, 'git cl
| |
| 38 .Append(FILE_PATH_LITERAL("top_domains")) | |
| 39 .AppendASCII(basename); | |
| 40 } | |
| 41 | |
| // Writes `content` to the file that GetPath(basename) resolves to and | |
| // CHECK-fails, naming the path, unless the whole buffer was written. | |
| 42 void WriteToFile(const std::string& content, base::StringPiece basename) { | |
| 43 base::FilePath path = GetPath(basename); | |
| // base::WriteFile returns the number of bytes written (or -1 on error), so | |
| // compare against the full size: a short write must count as a failure too. | |
| 44 bool succeeded = base::WriteFile(path, content.data(), content.size()) == static_cast<int>(content.size()); | |

Peter Kasting
2017/05/10 22:38:47
Again, shouldn't this check "== content.size()"?
jungshik at Google
2017/05/14 09:36:23
Done.
| |
| 45 CHECK(succeeded) << "Failed to write to " << path.AsUTF8Unsafe() << '\n'; | |

Peter Kasting
2017/05/10 22:38:47
Nit: Maybe rather than CHECK, return |succeeded|,
jungshik at Google
2017/05/14 09:36:23
Done.
| |
| 46 } | |
| 47 | |
| // Reads alexa_domains.list, computes the skeleton of every listed domain and | |
| // writes the result to alexa_skeletons.gperf. Also prints the first domain | |
| // that has the largest number of labels. Returns 1 on a usage, read or ICU | |
| // set-up error and 0 otherwise. | |
| 48 int main(int argc, const char** argv) { | |
| // The tool takes no arguments; anything extra just prints the usage text. | |
| 49 if (argc != 1) { | |
| 50 std::cerr << "Generates the list of top domain skeletons to use as input to" | |
| 51 "\nbase/dafsa/make_dafsa.py.\nUsage: " | |
| 52 << argv[0] << '\n'; | |
| 53 return 1; | |
| 54 } | |
| 55 | |
| 56 base::i18n::InitializeICU(); | |
| 57 base::FilePath input_file = GetPath("alexa_domains.list"); | |
| 58 std::string input_content; | |
| 59 if (!base::ReadFileToString(input_file, &input_content)) { | |
| 60 std::cerr << "Failed to read the input file " << input_file.AsUTF8Unsafe() | |
| 61 << '\n'; | |
| 62 return 1; | |
| 63 } | |
| 64 | |
| 65 UErrorCode status = U_ZERO_ERROR; | |
| 66 USpoofChecker* spoof_checker = uspoof_open(&status); | |
| 67 if (U_FAILURE(status)) { | |
| 68 std::cerr << "Failed to create an ICU uspoof_checker due to " | |
| 69 << u_errorName(status) << ".\n"; | |
| 70 return 1; | |
| 71 } | |
| 72 | |
| 73 std::stringstream input(input_content); | |
| // Header of the generated file; the skeleton entries are appended between | |
| // the opening and closing "%%" lines. | |
| 74 std::string output = | |
| 75 "// Copyright 2017 The Chromium Authors. All rights reserved.\n" | |

Peter Kasting
2017/05/10 22:38:47
Nit: Maybe consider a raw string literal
jungshik at Google
2017/05/14 09:36:23
Done.
| |
| 76 "// Use of this source code is governed by a BSD-style license that can " | |
| 77 "be\n" | |
| 78 "// found in the LICENSE file.\n\n" | |
| 79 "// This file is generated by components/url_formatter/" | |
| 80 "make_top_domain_gperf.cc\n" | |
| 81 "// DO NOT MANUALLY EDIT!\n\n" | |
| 82 "// Each entry is the skeleton of a top domain for the confusability " | |
| 83 "check\n" | |
| 84 "// in components/url_formatter/url_formatter.cc.\n" | |
| 85 "%%\n"; | |
| 86 | |
| 87 std::string domain; | |
| 88 size_t max_labels = 0; | |
| 89 std::string domain_with_max_labels; | |
| 90 while (std::getline(input, domain)) { | |
| // Skip blank lines as well as '#' comment lines; a blank line would | |
| // otherwise be reported as a failure and emitted as an empty "// " entry. | |
| 91 if (domain.empty() || domain[0] == '#') | |
| 92 continue; | |
| 93 std::string skeleton = GetSkeleton(domain, spoof_checker); | |
| 94 if (skeleton.empty()) { | |
| // Record the domain as a comment line so the failure is visible in the | |
| // generated file as well as on stderr. | |
| 95 std::cerr << "Failed to generate the skeleton of " << domain << '\n'; | |
| 96 output += "// " + domain + '\n'; | |
| 97 } else { | |
| 98 output += skeleton + ", 1\n"; | |
| 99 } | |
| // Track the first domain with the most dot-separated labels; the strict | |
| // '>' keeps the earliest one on ties. | |
| 100 std::vector<base::StringPiece> labels = base::SplitStringPiece( | |
| 101 domain, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); | |
| 102 if (labels.size() > max_labels) { | |
| 103 domain_with_max_labels = domain; | |
| 104 max_labels = labels.size(); | |
| 105 } | |
| 106 } | |
| // The spoof checker is not used past this point; release it. | |
| uspoof_close(spoof_checker); | |
| 107 | |
| 108 output += "%%\n"; | |
| 109 | |
| 110 WriteToFile(output, "alexa_skeletons.gperf"); | |
| 111 | |
| 112 std::cout << "The first domain with the largest number of labels is " | |
| 113 << domain_with_max_labels << " and has " << max_labels | |
| 114 << " labels.\n"; | |
| 115 | |
| 116 return 0; | |
| 117 } | |
| OLD | NEW |