Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(195)

Unified Diff: components/url_formatter/top_domains/make_top_domain_gperf.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: add back U+04CF (ӏ) -> 'l' map Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/url_formatter/top_domains/make_top_domain_gperf.cc
diff --git a/components/url_formatter/top_domains/make_top_domain_gperf.cc b/components/url_formatter/top_domains/make_top_domain_gperf.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3921bd35fd8fe955aa2fb99a43155d3457167257
--- /dev/null
+++ b/components/url_formatter/top_domains/make_top_domain_gperf.cc
@@ -0,0 +1,124 @@
+// Copyright (c) 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "base/base_paths.h"
+#include "base/files/file_path.h"
+#include "base/files/file_util.h"
+#include "base/i18n/icu_util.h"
+#include "base/path_service.h"
+#include "base/strings/string_split.h"
+#include "base/strings/string_util.h"
+#include "third_party/icu/source/common/unicode/unistr.h"
+#include "third_party/icu/source/common/unicode/utypes.h"
+#include "third_party/icu/source/i18n/unicode/uspoof.h"
+
+// Asks |spoof_checker| for the confusability skeleton of the UTF-8 string
+// |domain| and returns that skeleton as UTF-8. When ICU reports a failure
+// status, an empty string is returned instead.
+std::string GetSkeleton(const std::string& domain,
+                        const USpoofChecker* spoof_checker) {
+  const icu::UnicodeString domain_ustr = icu::UnicodeString::fromUTF8(domain);
+  icu::UnicodeString skeleton_ustr;
+  UErrorCode error = U_ZERO_ERROR;
+  uspoof_getSkeletonUnicodeString(spoof_checker, 0 /* not used */,
+                                  domain_ustr, skeleton_ustr, &error);
+
+  std::string skeleton_utf8;
+  if (U_FAILURE(error))
+    return skeleton_utf8;  // Still empty at this point.
+  skeleton_ustr.toUTF8String(skeleton_utf8);
+  return skeleton_utf8;
+}
+
+// Returns the path of |basename| under components/url_formatter/top_domains,
+// rooted at whatever PathService reports for base::DIR_SOURCE_ROOT.
+base::FilePath GetPath(base::StringPiece basename) {
+  base::FilePath source_root;
+  base::PathService::Get(base::DIR_SOURCE_ROOT, &source_root);
+
+  const base::FilePath top_domains_dir =
+      source_root.Append(FILE_PATH_LITERAL("components"))
+          .Append(FILE_PATH_LITERAL("url_formatter"))
+          .Append(FILE_PATH_LITERAL("top_domains"));
+  return top_domains_dir.AppendASCII(basename);
+}
+
+bool WriteToFile(const std::string& content, base::StringPiece basename) {  // Writes |content| to the file that GetPath(basename) names; returns true unless the write fails.
+ base::FilePath path = GetPath(basename);
+ if (base::WriteFile(path, content.data(), content.size()) == -1) {  // A result of -1 is treated as the failure case.
+ std::cerr << "failed to write to " << path.AsUTF8Unsafe()  // Report the destination path on stderr before giving up.
Peter Kasting 2017/05/09 01:37:02 Nit: Initial caps? (several places)
jungshik at Google 2017/05/10 18:05:12 Done.
+ << '\n';
+ return false;
+ }
+ return true;
Peter Kasting 2017/05/09 01:37:02 Nit: Shorter: bool succeeded = base::WriteFile(
jungshik at Google 2017/05/10 18:05:12 Ok. Turned it into CHECK.
+}
+
+int main(int argc, const char** argv) {  // Reads alexa_domains.list, writes one skeleton entry per domain to alexa_skeletons.gperf; returns 0 on success and 1 on any failure.
+ if (argc != 1) {  // The tool takes no arguments; anything extra prints the usage text and exits with 1.
+ std::cerr << "Generate the list of top domain skeletons to use as\n"
Peter Kasting 2017/05/09 01:37:03 Nit: Generates Seems like this could be wrapped c
jungshik at Google 2017/05/10 18:05:12 Done.
+ << "input to base/dafsa/make_dafsa.py\n";
Peter Kasting 2017/05/09 01:37:02 Nit: Leading << not necessary when continuing prev
jungshik at Google 2017/05/10 18:05:12 Done.
+ std::cerr << "Usage: " << argv[0] << '\n';
Peter Kasting 2017/05/09 01:37:02 Nit: Why not just continue << from the previous st
jungshik at Google 2017/05/10 18:05:13 Done.
+ return 1;
+ }
+
+ // Initialize ICU before the spoof checker is opened below.
+ base::i18n::InitializeICU();
+ // Load the whole input list into memory.
+ base::FilePath input_file = GetPath("alexa_domains.list");
+ std::string input_content;
+ if (!base::ReadFileToString(input_file, &input_content)) {
+ std::cerr << "failed to read the input file " << input_file.AsUTF8Unsafe()
+ << '\n';
+ return 1;
+ }
+ // Open the ICU spoof checker that GetSkeleton() is given for every domain.
+ UErrorCode status = U_ZERO_ERROR;
+ USpoofChecker* spoof_checker = uspoof_open(&status);  // NOTE(review): no matching uspoof_close() is visible in this function.
+ if (U_FAILURE(status)) {
+ std::cerr << "failed to create an ICU uspoof_checker due to "
+ << u_errorName(status) << ".\n";
+ return 1;
+ }
+
+ std::stringstream input(input_content);
+ std::stringstream output;
Peter Kasting 2017/05/09 01:37:02 As far as I can tell, you just append unformatted
jungshik at Google 2017/05/10 18:05:13 You.re right. Thank you for catching it. At first
+ // Emit the generated-file header, ending with the first "%%" separator line.
+ output << "// Copyright 2017 The Chromium Authors. All rights reserved.\n"
+ << "// Use of this source code is governed by a BSD-style license that"
+ << " can be\n"
+ << "// found in the LICENSE file.\n\n"
+ << "// This file is generated by "
+ << "components/url_formatter/make_top_domain_gperf.\n"
+ << "// DO NOT MANUALLY EDIT!\n\n"
+ << "// Each entry is the skeleton of a top domain for the "
+ << "confusability check.\n"
Peter Kasting 2017/05/09 01:37:02 Nit: "...for the confusability check in <sourcefil
jungshik at Google 2017/05/10 18:05:12 Done.
+ << "%%\n";
+ // Convert the input one line at a time, remembering the first domain that has the most labels.
+ std::string domain;
+ size_t max_labels = 0;
+ std::string domains_with_max_labels;
Peter Kasting 2017/05/09 01:37:02 Nit: domain, singular?
jungshik at Google 2017/05/10 18:05:12 Done.
+ while (std::getline(input, domain)) {
+ if (domain[0] == '#') continue;  // Lines starting with '#' are skipped entirely.
+ std::string skeleton = GetSkeleton(domain, spoof_checker);
+ if (skeleton.empty()) {  // No skeleton: log it and keep the domain only as a "// " line in the output.
+ std::cerr << "failed to generate the skeleton of " << domain << '\n';
+ output << "// " << domain << '\n';
+ } else {
+ output << skeleton << ", 1\n";  // Each entry is written as "<skeleton>, 1".
+ }
+ std::vector<base::StringPiece> labels =
+ base::SplitStringPiece(domain, ".", base::TRIM_WHITESPACE,
+ base::SPLIT_WANT_ALL);
+ if (labels.size() > max_labels) {  // Strict '>' keeps the earliest domain when several tie.
+ domains_with_max_labels = domain;
+ max_labels = labels.size();
+ }
+
Peter Kasting 2017/05/09 01:37:02 Nit: No blank line
jungshik at Google 2017/05/10 18:05:12 Done.
+ }
+
+ output << "%%\n";  // Closing "%%" separator line.
+
+ if (!WriteToFile(output.str(), "alexa_skeletons.gperf"))
+ return 1;
+
+ std::cout << "The first domain with the largest number of labels is "
+ << domains_with_max_labels << " and has " << max_labels
+ << " labels.\n";
+
+ return 0;
+}

Powered by Google App Engine
This is Rietveld 408576698