Index: components/url_formatter/top_domains/make_top_domain_gperf.cc |
diff --git a/components/url_formatter/top_domains/make_top_domain_gperf.cc b/components/url_formatter/top_domains/make_top_domain_gperf.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..9a6d2049b872fc92d5db8265be7ee5d4a62690e0 |
--- /dev/null |
+++ b/components/url_formatter/top_domains/make_top_domain_gperf.cc |
@@ -0,0 +1,117 @@ |
+// Copyright (c) 2017 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include <algorithm> |
+#include <iostream> |
+#include <sstream> |
+#include <string> |
+#include <vector> |
+ |
+#include "base/base_paths.h" |
+#include "base/files/file_path.h" |
+#include "base/files/file_util.h" |
+#include "base/i18n/icu_util.h" |
+#include "base/path_service.h" |
+#include "base/strings/string_split.h" |
+#include "base/strings/string_util.h" |
+#include "third_party/icu/source/common/unicode/unistr.h" |
+#include "third_party/icu/source/common/unicode/utypes.h" |
+#include "third_party/icu/source/i18n/unicode/uspoof.h" |
+ |
+std::string GetSkeleton(const std::string& domain, |
+ const USpoofChecker* spoof_checker) { |
+ UErrorCode status = U_ZERO_ERROR; |
+ icu::UnicodeString ustr_skeleton; |
+ uspoof_getSkeletonUnicodeString(spoof_checker, 0, /* not used */ |
Peter Kasting
2017/05/10 22:38:47
Nit: Suggest ',' after */ instead of before
jungshik at Google
2017/05/14 09:36:23
Done.
|
+ icu::UnicodeString::fromUTF8(domain), |
+ ustr_skeleton, &status); |
+ std::string skeleton; |
+ return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton; |
+} |
+ |
+base::FilePath GetPath(base::StringPiece basename) { |
+ base::FilePath path; |
+ base::PathService::Get(base::DIR_SOURCE_ROOT, &path); |
+ return path.Append(FILE_PATH_LITERAL("components")) |
+ .Append(FILE_PATH_LITERAL("url_formatter")) |
Peter Kasting
2017/05/10 22:38:47
Nit: Is this how git cl format indented this? I'd
jungshik at Google
2017/05/14 09:36:23
Yes, even if I get dots aligned manually, 'git cl
|
+ .Append(FILE_PATH_LITERAL("top_domains")) |
+ .AppendASCII(basename); |
+} |
+ |
+void WriteToFile(const std::string& content, base::StringPiece basename) { |
+ base::FilePath path = GetPath(basename); |
+ bool succeeded = base::WriteFile(path, content.data(), content.size()) != -1; |
Peter Kasting
2017/05/10 22:38:47
Again, shouldn't this check "== content.size()"?
jungshik at Google
2017/05/14 09:36:23
Done.
|
+ CHECK(succeeded) << "Failed to write to " << path.AsUTF8Unsafe() << '\n'; |
Peter Kasting
2017/05/10 22:38:47
Nit: Maybe rather than CHECK, return |succeeded|,
jungshik at Google
2017/05/14 09:36:23
Done.
|
+} |
+ |
+int main(int argc, const char** argv) { |
+ if (argc != 1) { |
+ std::cerr << "Generates the list of top domain skeletons to use as input to" |
+ "\nbase/dafsa/make_dafsa.py.\nUsage: " |
+ << argv[0] << '\n'; |
+ return 1; |
+ } |
+ |
+ base::i18n::InitializeICU(); |
+ base::FilePath input_file = GetPath("alexa_domains.list"); |
+ std::string input_content; |
+ if (!base::ReadFileToString(input_file, &input_content)) { |
+ std::cerr << "Failed to read the input file " << input_file.AsUTF8Unsafe() |
+ << '\n'; |
+ return 1; |
+ } |
+ |
+ UErrorCode status = U_ZERO_ERROR; |
+ USpoofChecker* spoof_checker = uspoof_open(&status); |
+ if (U_FAILURE(status)) { |
+ std::cerr << "Failed to create an ICU uspoof_checker due to " |
+ << u_errorName(status) << ".\n"; |
+ return 1; |
+ } |
+ |
+ std::stringstream input(input_content); |
+ std::string output = |
+ "// Copyright 2017 The Chromium Authors. All rights reserved.\n" |
Peter Kasting
2017/05/10 22:38:47
Nit: Maybe consider a raw string literal
jungshik at Google
2017/05/14 09:36:23
Done.
|
+ "// Use of this source code is governed by a BSD-style license that can " |
+ "be\n" |
+ "// found in the LICENSE file.\n\n" |
+ "// This file is generated by components/url_formatter/" |
+ "make_top_domain_gperf.cc\n" |
+ "// DO NOT MANUALLY EDIT!\n\n" |
+ "// Each entry is the skeleton of a top domain for the confusability " |
+ "check\n" |
+ "// in components/url_formatter/url_formatter.cc.\n" |
+ "%%\n"; |
+ |
+ std::string domain; |
+ size_t max_labels = 0; |
+ std::string domain_with_max_labels; |
+ while (std::getline(input, domain)) { |
+ if (domain[0] == '#') |
+ continue; |
+ std::string skeleton = GetSkeleton(domain, spoof_checker); |
+ if (skeleton.empty()) { |
+ std::cerr << "Failed to generate the skeleton of " << domain << '\n'; |
+ output += "// " + domain + '\n'; |
+ } else { |
+ output += skeleton + ", 1\n"; |
+ } |
+ std::vector<base::StringPiece> labels = base::SplitStringPiece( |
+ domain, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); |
+ if (labels.size() > max_labels) { |
+ domain_with_max_labels = domain; |
+ max_labels = labels.size(); |
+ } |
+ } |
+ |
+ output += "%%\n"; |
+ |
+ WriteToFile(output, "alexa_skeletons.gperf"); |
+ |
+ std::cout << "The first domain with the largest number of labels is " |
+ << domain_with_max_labels << " and has " << max_labels |
+ << " labels.\n"; |
+ |
+ return 0; |
+} |