Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(972)

Unified Diff: components/url_formatter/url_formatter.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: check in alex_names_and_skeletons.gperf Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/url_formatter/url_formatter.cc
diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc
index d54b67913ce76d400907266963475bae6920cdd9..0671c5dbf95e7d7109cedfe61c11385410a8b700 100644
--- a/components/url_formatter/url_formatter.cc
+++ b/components/url_formatter/url_formatter.cc
@@ -6,21 +6,26 @@
#include <algorithm>
#include <utility>
+#include <vector>
#include "base/lazy_instance.h"
#include "base/macros.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/string_piece.h"
+#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "base/threading/thread_local_storage.h"
+#include "net/base/lookup_string_in_fixed_set.h"
#include "third_party/icu/source/common/unicode/schriter.h"
#include "third_party/icu/source/common/unicode/uidna.h"
#include "third_party/icu/source/common/unicode/uniset.h"
#include "third_party/icu/source/common/unicode/uscript.h"
+#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/common/unicode/uvernum.h"
#include "third_party/icu/source/i18n/unicode/regex.h"
+#include "third_party/icu/source/i18n/unicode/translit.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
#include "url/gurl.h"
#include "url/third_party/mozilla/url_parse.h"
@@ -191,6 +196,55 @@ base::string16 FormatViewSourceUrl(
return result;
}
+// A helper class for IDN Spoof checking, used to ensure that no IDN input is
+// spoofable per Chromium's standard of spoofability. For a more thorough
+// explanation of how spoof checking works in Chromium, see
+// http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .
+class IDNSpoofChecker {
+ public:
+ IDNSpoofChecker();
+
+ // Returns true if |label| is safe to display as Unicode. When the TLD is
+ // ASCII, check if a label is entirely made of Cyrillic letters that look like
+ // Latin letters. In the event of library failure, all IDN inputs will be
+ // treated as unsafe.
+ bool Check(base::StringPiece16 label, bool is_tld_ascii);
+
+ // Returns true if |hostname| or the last few components of |hostname| look
+ // similar to one of the top N domains (N=500). Two checks are done:
ncarter (slow) 2017/04/26 18:46:29 The (N=500) comment seems likely to fall out of da
jungshik at Google 2017/04/26 19:36:21 Done.
+ // 1. Calculate the skeleton of |hostname| based on the Unicode confusable
+ // character list and look it up in the pre-calculated skeleton list of
+ // the top N domains.
+ // 2. Look up the diacritic-free version of |hostname| in the list of
+ // the top N domains.
ncarter (slow) 2017/04/26 18:46:29 Should this document what happens if |hostname| is
jungshik at Google 2017/04/26 19:36:21 Non-IDN hostnames will not reach here (they're not
+ bool SimilarToTopDomains(base::StringPiece16 hostname);
+
+ private:
+ void SetAllowedUnicodeSet(UErrorCode* status);
+ bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);
+ bool GetSkeleton(base::StringPiece16 hostname, std::string* skeleton);
+ bool RemoveDiacritics(base::StringPiece16 input, std::string* accent_free);
+
+ USpoofChecker* checker_;
+ icu::UnicodeSet deviation_characters_;
+ icu::UnicodeSet non_ascii_latin_letters_;
+ icu::UnicodeSet kana_letters_exceptions_;
+ icu::UnicodeSet cyrillic_letters_;
+ icu::UnicodeSet cyrillic_letters_latin_alike_;
+ icu::UnicodeSet latin_letters_n_ascii_;
+ icu::Transliterator* transliterator_;
+
+ DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
+};
+
+base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
+ LAZY_INSTANCE_INITIALIZER;
+base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
+
+void OnThreadTermination(void* regex_matcher) {
+ delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
+}
+
// TODO(brettw): We may want to skip this step in the case of file URLs to
// allow unicode UNC hostnames regardless of encodings.
base::string16 IDNToUnicodeWithAdjustments(
@@ -212,6 +266,7 @@ base::string16 IDNToUnicodeWithAdjustments(
// Do each component of the host separately, since we enforce script matching
// on a per-component basis.
base::string16 out16;
+ bool has_idn_component = false;
for (size_t component_start = 0, component_end;
component_start < input16.length();
component_start = component_end + 1) {
@@ -227,6 +282,7 @@ base::string16 IDNToUnicodeWithAdjustments(
converted_idn =
IDNToUnicodeOneComponent(input16.data() + component_start,
component_length, is_tld_ascii, &out16);
+ has_idn_component = has_idn_component || converted_idn;
}
size_t new_component_length = out16.length() - new_component_start;
@@ -239,43 +295,14 @@ base::string16 IDNToUnicodeWithAdjustments(
if (component_end < input16.length())
out16.push_back('.');
}
- return out16;
-}
-
-// A helper class for IDN Spoof checking, used to ensure that no IDN input is
-// spoofable per Chromium's standard of spoofability. For a more thorough
-// explanation of how spoof checking works in Chromium, see
-// http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .
-class IDNSpoofChecker {
- public:
- IDNSpoofChecker();
-
- // Returns true if |label| is safe to display as Unicode. When the TLD is
- // ASCII, check if a label is entirely made of Cyrillic letters that look like
- // Latin letters. In the event of library failure, all IDN inputs will be
- // treated as unsafe.
- bool Check(base::StringPiece16 label, bool is_tld_ascii);
-
- private:
- void SetAllowedUnicodeSet(UErrorCode* status);
- bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);
-
- USpoofChecker* checker_;
- icu::UnicodeSet deviation_characters_;
- icu::UnicodeSet non_ascii_latin_letters_;
- icu::UnicodeSet kana_letters_exceptions_;
- icu::UnicodeSet cyrillic_letters_;
- icu::UnicodeSet cyrillic_letters_latin_alike_;
-
- DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
-};
-
-base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
- LAZY_INSTANCE_INITIALIZER;
-base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
-void OnThreadTermination(void* regex_matcher) {
- delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
+ if (has_idn_component &&
+ g_idn_spoof_checker.Get().SimilarToTopDomains(out16)) {
+ if (adjustments)
+ adjustments->clear();
+ return input16;
+ }
+ return out16;
}
IDNSpoofChecker::IDNSpoofChecker() {
@@ -339,7 +366,32 @@ IDNSpoofChecker::IDNSpoofChecker() {
icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);
cyrillic_letters_.freeze();
- DCHECK(U_SUCCESS(status));
+ // This set is used to decide whether to apply a slow transliteration that
+ // removes diacritics from a given hostname for accent-free comparison with
+ // top domain names. If the hostname has any character outside the set, the
+ // expensive step is skipped because the hostname cannot match any of the
+ // top domain names.
+ // The last set ([\u0300-\u0331]) is a shorthand for "[:Identifier_Status=Allowed:]
+ // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a
+ // subset of the former, but that does not matter because hostnames with
+ // characters outside the latter set would be rejected in an earlier step.
+ latin_letters_n_ascii_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[[:Latin:] [0-9\\u002e_\\u002d] [\\u0300-\\u0331]]"), status);
+ latin_letters_n_ascii_.freeze();
+
+ // Used for diacritics-agnostic comparison. The rules "ł > l; ø > o; đ > d"
+ // are added because "NFD; Nonspacing mark removal; NFC" does not handle them.
+ UParseError parse_error;
+ transliterator_ = icu::Transliterator::createFromRules(
+ UNICODE_STRING_SIMPLE("DropAcc"),
+ icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
+ " ł > l; ø > o; đ > d;"),
+ UTRANS_FORWARD, parse_error, status);
+ DCHECK(U_SUCCESS(status))
+ << "Spoofchecker initalization failed due to an error: "
+ << u_errorName(status);
+ if (U_FAILURE(status))
+ transliterator_ = nullptr;
}
bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {
@@ -437,6 +489,80 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {
return !dangerous_pattern->find();
}
+bool IDNSpoofChecker::GetSkeleton(base::StringPiece16 hostname,
+ std::string* skeleton) {
+ skeleton->clear();
+ icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname.length());
+ // TODO(jshin): Consider supplementing the confusable list by replacing some
+ // characters with their confusable counterpart (e.g. U+04CF => 'l').
+ UErrorCode status = U_ZERO_ERROR;
+ icu::UnicodeString ustr_skeleton;
+ uspoof_getSkeletonUnicodeString(checker_, 0, /* not used. deprecated. */
+ ustr_host, ustr_skeleton, &status);
+ if (U_FAILURE(status))
+ return false;
+ ustr_skeleton.toUTF8String(*skeleton);
+ return true;
+}
+
+#include "components/url_formatter/top_domains/alexa_names_and_skeletons-inc.cc"
+// All the domains in the above file have 3 or fewer labels.
+const size_t kNumberOfLabelsToCheck = 3;
+
+bool LookupStringInSet(base::StringPiece needle,
+ const unsigned char* fixed_set,
+ size_t set_len,
+ int mask) {
+ int type = net::LookupStringInFixedSet(fixed_set, set_len, needle.data(),
+ needle.length());
+ return (type != net::kDafsaNotFound) && ((type & mask) != 0);
+}
+
+bool LookupMatchInTopDomains(base::StringPiece hostname, int mask) {
+ // When 'hostname' is a skeleton instead of an actual hostname, it's assumed
+ // that no character other than '.' among those allowed in IDN will have
+ // '.' as its skeleton.
+ auto labels = base::SplitStringPiece(hostname, ".", base::KEEP_WHITESPACE,
+ base::SPLIT_WANT_ALL);
ncarter (slow) 2017/04/26 18:46:29 Is it possible for hostname to end in ".", or has
jungshik at Google 2017/04/26 19:36:21 That's taken care of in the loop in the caller (se
+
+ while (labels.size() > kNumberOfLabelsToCheck)
+ labels.erase(labels.begin());
+
+ while (labels.size() > 1) {
+ std::string partial_hostname = base::JoinString(labels, ".");
+ if (LookupStringInSet(partial_hostname, kDafsa, arraysize(kDafsa), mask))
+ return true;
+ labels.erase(labels.begin());
+ }
+ return false;
+}
+
+bool IDNSpoofChecker::RemoveDiacritics(base::StringPiece16 input,
+ std::string* accent_free) {
+ if (!transliterator_)
+ return false;
+ icu::UnicodeString ustr_input(FALSE, input.data(), input.length());
+ // If the input has any characters outside Latin and [._-], there is no point
+ // in getting rid of diacritics because the input will not match any of the
+ // top domain names even after diacritics removal.
+ if (latin_letters_n_ascii_.span(ustr_input, 0, USET_SPAN_CONTAINED) !=
+ ustr_input.length())
+ return false;
+ transliterator_->transliterate(ustr_input);
+ ustr_input.toUTF8String(*accent_free);
+ return true;
+}
+
+bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
+ std::string skeleton;
+ if (GetSkeleton(hostname, &skeleton) && LookupMatchInTopDomains(skeleton, 2))
+ return true;
+
+ std::string accent_free_name;
+ return RemoveDiacritics(hostname, &accent_free_name) &&
+ LookupMatchInTopDomains(accent_free_name, 1);
+}
+
bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
const icu::UnicodeString& label_string) {
// Collect all the Cyrillic letters in |label_string| and see if they're
« no previous file with comments | « components/url_formatter/top_domains/make_top_domain_gperf.cc ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698