components/url_formatter/idn_spoof_checker.cc - Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters.

Unified Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters. (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: components/url_formatter/idn_spoof_checker.cc

diff --git a/components/url_formatter/idn_spoof_checker.cc b/components/url_formatter/idn_spoof_checker.cc

index d90442c84cf64b6f052a919984042e6151367796..156f0cd65ac81acc0907a01936837326926ef061 100644

--- a/components/url_formatter/idn_spoof_checker.cc

+++ b/components/url_formatter/idn_spoof_checker.cc

@@ -8,11 +8,9 @@

#include "base/strings/string_split.h"

#include "base/strings/string_util.h"

#include "base/threading/thread_local_storage.h"

-#include "net/base/lookup_string_in_fixed_set.h"

#include "third_party/icu/source/common/unicode/schriter.h"

#include "third_party/icu/source/common/unicode/unistr.h"

#include "third_party/icu/source/i18n/unicode/regex.h"

-#include "third_party/icu/source/i18n/unicode/translit.h"

#include "third_party/icu/source/i18n/unicode/uspoof.h"

namespace url_formatter {

@@ -22,31 +20,6 @@

void OnThreadTermination(void* regex_matcher) {

delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

-#include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"

-// All the domains in the above file have 3 or fewer labels.

-const size_t kNumberOfLabelsToCheck = 3;

-bool LookupMatchInTopDomains(base::StringPiece skeleton) {

- DCHECK_NE(skeleton.back(), '.');

- auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,

- base::SPLIT_WANT_ALL);

- if (labels.size() > kNumberOfLabelsToCheck) {

- labels.erase(labels.begin(),

- labels.begin() + labels.size() - kNumberOfLabelsToCheck);

- }

- while (labels.size() > 1) {

- std::string partial_skeleton = base::JoinString(labels, ".");

- if (net::LookupStringInFixedSet(

- kDafsa, arraysize(kDafsa), partial_skeleton.data(),

- partial_skeleton.length()) != net::kDafsaNotFound)

- return true;

- labels.erase(labels.begin());

- }

- return false;

}

} // namespace

@@ -95,14 +68,11 @@

icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

non_ascii_latin_letters_.freeze();

- // The following two sets are parts of |dangerous_patterns_|.

+ // These letters are parts of |dangerous_patterns_|.

kana_letters_exceptions_ = icu::UnicodeSet(

UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

status);

kana_letters_exceptions_.freeze();

- combining_diacritics_exceptions_ =

- icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);

- combining_diacritics_exceptions_.freeze();

// These Cyrillic letters look like Latin. A domain label entirely made of

// these letters is blocked as a simplified whole-script-spoofable.

@@ -115,36 +85,6 @@

cyrillic_letters_.freeze();

DCHECK(U_SUCCESS(status));

- // This set is used to determine whether or not to apply a slow

- // transliteration to remove diacritics to a given hostname before the

- // confusable skeleton calculation for comparison with top domain names. If

- // it has any character outside the set, the expensive step will be skipped

- // because it cannot match any of top domain names.

- // The last ([\u0300-\u0339]) is a shorthand for "[:Identifier_Status=Allowed:]

- // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a

- // subset of the former but it does not matter because hostnames with

- // characters outside the latter set would be rejected in an earlier step.

- lgc_letters_n_ascii_ = icu::UnicodeSet(

- UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"

- "\\u002d][\\u0300-\\u0339]]"),

- status);

- lgc_letters_n_ascii_.freeze();

- // Used for diacritics-removal before the skeleton calculation. Add

- // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark

- // removal; NFC". On top of that, supplement the Unicode confusable list by

- // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by

- // 'k', 'l' and 'n', respectively.

- // TODO(jshin): Revisit "ł > l; ø > o" mapping.

- UParseError parse_error;

- transliterator_.reset(icu::Transliterator::createFromRules(

- UNICODE_STRING_SIMPLE("DropAcc"),

- icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"

- " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),

- UTRANS_FORWARD, parse_error, status));

- DCHECK(U_SUCCESS(status))

- << "Spoofchecker initalization failed due to an error: "

- << u_errorName(status);

}

IDNSpoofChecker::~IDNSpoofChecker() {

@@ -180,13 +120,9 @@

return false;

// If there's no script mixing, the input is regarded as safe without any

- // extra check unless it falls into one of three categories:

- // - contains Kana letter exceptions

- // - the TLD is ASCII and the input is made entirely of Cyrillic letters

- // that look like Latin letters.

- // - it has combining diacritic marks.

- // Note that the following combinations of scripts are treated as a 'logical'

- // single script.

+ // extra check unless it contains Kana letter exceptions or it's made entirely

+ // of Cyrillic letters that look like Latin letters. Note that the following

+ // combinations of scripts are treated as a 'logical' single script.

// - Chinese: Han, Bopomofo, Common

// - Japanese: Han, Hiragana, Katakana, Common

// - Korean: Hangul, Han, Common

@@ -194,19 +130,14 @@

if (result == USPOOF_ASCII)

return true;

if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

- kana_letters_exceptions_.containsNone(label_string) &&

- combining_diacritics_exceptions_.containsNone(label_string)) {

+ kana_letters_exceptions_.containsNone(label_string)) {

// Check Cyrillic confusable only for ASCII TLDs.

return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);

}

// Additional checks for |label| with multiple scripts, one of which is Latin.

// Disallow non-ASCII Latin letters to mix with a non-Latin script.

- // Note that the non-ASCII Latin check should not be applied when the entire

- // label is made of Latin. Checking with lgc_letters set here should be fine

- // because script mixing of LGC is already rejected.

- if (non_ascii_latin_letters_.containsSome(label_string) &&

- !lgc_letters_n_ascii_.containsAll(label_string))

+ if (non_ascii_latin_letters_.containsSome(label_string))

return false;

if (!tls_index.initialized())

@@ -236,9 +167,6 @@

// Letter Co) to be next to Latin.

// - Disallow Latin 'o' and 'g' next to Armenian.

// - Disallow mixing of Latin and Canadian Syllabary.

- // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC

- // character. Other combining diacritical marks are not in the allowed

- // character set.

dangerous_pattern = new icu::RegexMatcher(

icu::UnicodeString(

R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"

@@ -253,8 +181,7 @@

R"([a-z][\u0585\u0581]+[a-z]|)"

R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"

R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"

- R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"

- R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",

+ R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}])",

-1, US_INV),

0, status);

tls_index.Set(dangerous_pattern);

@@ -263,33 +190,10 @@

return !dangerous_pattern->find();

}

-bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {

- size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);

- icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);

- // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],

- // there is no point in getting rid of diacritics because combining marks

- // attached to non-LGC characters are already blocked.

- if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==

- ustr_host.length())

- transliterator_.get()->transliterate(ustr_host);

- UErrorCode status = U_ZERO_ERROR;

- icu::UnicodeString ustr_skeleton;

- uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,

- &status);

- if (U_FAILURE(status))

- return false;

- std::string skeleton;

- ustr_skeleton.toUTF8String(skeleton);

- return LookupMatchInTopDomains(skeleton);

bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

const icu::UnicodeString& label) {

- // Collect all the Cyrillic letters in |label_string| and see if they're

- // a subset of |cyrillic_letters_latin_alike_|.

// A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

- // [_-] and checking if the set contains all letters of |label|

+ // [_-] and checking if the set contains all letters of |label_string|

// would work in most cases, but not if a label has non-letters outside

// ASCII.

icu::UnicodeSet cyrillic_in_label;

« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »