components/url_formatter/url_formatter.cc - Issue 2683793010: Block domain labels made of Cyrillic letters that look alike Latin

Unified Diff: components/url_formatter/url_formatter.cc

Issue 2683793010: Block domain labels made of Cyrillic letters that look alike Latin (Closed)

Patch Set: check Cyrl-Latn alikes only for non-IDN tlds Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: components/url_formatter/url_formatter.cc

diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc

index a93bf1154c333fbd7b268855f60a3eac3b250353..f90f52807b978a626c48b592d59a3025aa9aaeb4 100644

--- a/components/url_formatter/url_formatter.cc

+++ b/components/url_formatter/url_formatter.cc

@@ -31,9 +31,8 @@ namespace {

base::string16 IDNToUnicodeWithAdjustments(

base::StringPiece host,

base::OffsetAdjuster::Adjustments* adjustments);

-bool IDNToUnicodeOneComponent(const base::char16* comp,

- size_t comp_len,

- base::string16* out);

+bool IDNToUnicodeOneComponent(const base::char16* comp, size_t comp_len,

+ bool is_tld_ascii, base::string16* out);

class AppendComponentTransform {

public:

@@ -200,6 +199,16 @@ base::string16 IDNToUnicodeWithAdjustments(

input16.reserve(host.length());

input16.insert(input16.end(), host.begin(), host.end());

+ bool is_tld_ascii = true;

+ size_t last_dot = host.rfind('.');

+ static const char* kAcePrefix = "xn--";

+ const size_t kAcePrefixLen = 4;

+ if (last_dot != base::StringPiece::npos &&

+ last_dot + kAcePrefixLen < host.length() &&

+ memcmp(kAcePrefix, host.data() + last_dot + 1, kAcePrefixLen) == 0) {

+ is_tld_ascii = false;

sffc 2017/02/15 19:57:30 I don't really understand what this part of the co

jungshik at Google 2017/02/15 20:55:52 It's checking if the TLD starts with 'xn--'. Other

+ }

// Do each component of the host separately, since we enforce script matching

// on a per-component basis.

base::string16 out16;

@@ -217,7 +226,7 @@ base::string16 IDNToUnicodeWithAdjustments(

// Add the substring that we just found.

converted_idn =

IDNToUnicodeOneComponent(input16.data() + component_start,

- component_length, &out16);

+ component_length, is_tld_ascii, &out16);

}

size_t new_component_length = out16.length() - new_component_start;

@@ -241,9 +250,11 @@ class IDNSpoofChecker {

public:

IDNSpoofChecker();

- // Returns true if |label| is safe to display as Unicode. In the event of

- // library failure, all IDN inputs will be treated as unsafe.

- bool Check(base::StringPiece16 label);

+ // Returns true if |label| is safe to display as Unicode. When

+ // TLD is ASCII, check if a label is entirely made of

+ // Cyrillic letters that look like Latin letters. In the event of library

+ // failure, all IDN inputs will be treated as unsafe.

+ bool Check(base::StringPiece16 label, bool is_tld_ascii);

private:

void SetAllowedUnicodeSet(UErrorCode* status);

@@ -252,6 +263,7 @@ class IDNSpoofChecker {

icu::UnicodeSet deviation_characters_;

icu::UnicodeSet non_ascii_latin_letters_;

icu::UnicodeSet kana_letters_exceptions_;

+ icu::UnicodeSet cyrillic_letters_latin_alike_;

DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

};

@@ -313,11 +325,17 @@ IDNSpoofChecker::IDNSpoofChecker() {

kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(

"[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);

kana_letters_exceptions_.freeze();

+ // These Cyrillic letters look like Latin letters. A domain label entirely

+ // made of these letters is blocked as a poor man's whole-script-spoofable.

+ cyrillic_letters_latin_alike_ = icu::UnicodeSet(

+ icu::UnicodeString("[аеорсухьѕіјһмӏтнв]"), status);

jungshik at Google 2017/02/15 18:50:21 "м, т, н, в" look like smallcap Latin and it's deb

sffc 2017/02/15 19:57:30 I'd look at the capital letters too. Here's a pos

lgarron 2017/02/15 20:24:49 Also consider ԁ, Ӏ, and maybe ѵ, listed at the bot

lgarron 2017/02/15 20:25:56 (Note that Ӏ is a spoof of l, not just I.)

jungshik at Google 2017/02/15 20:55:52 Well, uppercase letters will not 'survive' (they'l

jungshik at Google 2017/02/15 20:55:52 Thanks. U+0501 and U+0475 I'll consider. As for U+

+ cyrillic_letters_latin_alike_.freeze();

DCHECK(U_SUCCESS(status));

}

-bool IDNSpoofChecker::Check(base::StringPiece16 label) {

+bool IDNSpoofChecker::Check(base::StringPiece16 label,

+ bool is_tld_ascii) {

UErrorCode status = U_ZERO_ERROR;

int32_t result = uspoof_check(checker_, label.data(),

base::checked_cast<int32_t>(label.size()),

@@ -345,16 +363,20 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {

return false;

// If there's no script mixing, the input is regarded as safe without any

- // extra check unless it contains Kana letter exceptions. Note that

+ // extra check unless it contains Kana letter exceptions or it's made entirely

+ // of Cyrillic letters that look like Latin letters. Note that

// the following combinations of scripts are treated as a 'logical' single

// script.

// - Chinese: Han, Bopomofo, Common

// - Japanese: Han, Hiragana, Katakana, Common

// - Korean: Hangul, Han, Common

result &= USPOOF_RESTRICTION_LEVEL_MASK;

- if (result == USPOOF_ASCII ||

- (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

- kana_letters_exceptions_.containsNone(label_string)))

+ if (result == USPOOF_ASCII) return true;

+ // Check Cyrillic confusable only for ASCII TLDs.

+ if (is_tld_ascii && cyrillic_letters_latin_alike_.containsAll(label_string))

sffc 2017/02/15 19:57:30 I think you should compare only the letter charact

jungshik at Google 2017/02/15 20:55:52 That's a good point. Thanks !

+ return false;

+ if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

+ kana_letters_exceptions_.containsNone(label_string))

return true;

// Additional checks for |label| with multiple scripts, one of which is Latin.

@@ -481,8 +503,8 @@ void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {

// user. Note that this function does not deal with pure ASCII domain labels at

// all even though it's possible to make up look-alike labels with ASCII

// characters alone.

-bool IsIDNComponentSafe(base::StringPiece16 label) {

- return g_idn_spoof_checker.Get().Check(label);

+bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) {

+ return g_idn_spoof_checker.Get().Check(label, is_tld_ascii);

}

// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to

@@ -527,6 +549,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;

// Returns whether any conversion was performed.

bool IDNToUnicodeOneComponent(const base::char16* comp,

size_t comp_len,

+ bool is_tld_ascii,

base::string16* out) {

DCHECK(out);

if (comp_len == 0)

@@ -559,7 +582,8 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,

out->resize(original_length + output_length);

if (IsIDNComponentSafe(

base::StringPiece16(out->data() + original_length,

- base::checked_cast<size_t>(output_length))))

+ base::checked_cast<size_t>(output_length)),

+ is_tld_ascii))

return true;

}

« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »