Index: components/url_formatter/url_formatter.cc |
diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc |
index a93bf1154c333fbd7b268855f60a3eac3b250353..3d5740fd3778550c5c98bce8d533d44885a6fdf9 100644 |
--- a/components/url_formatter/url_formatter.cc |
+++ b/components/url_formatter/url_formatter.cc |
@@ -15,6 +15,7 @@ |
#include "base/strings/utf_offset_string_conversions.h" |
#include "base/strings/utf_string_conversions.h" |
#include "base/threading/thread_local_storage.h" |
+#include "third_party/icu/source/common/unicode/schriter.h" |
#include "third_party/icu/source/common/unicode/uidna.h" |
#include "third_party/icu/source/common/unicode/uniset.h" |
#include "third_party/icu/source/common/unicode/uscript.h" |
@@ -33,6 +34,7 @@ base::string16 IDNToUnicodeWithAdjustments( |
base::OffsetAdjuster::Adjustments* adjustments); |
bool IDNToUnicodeOneComponent(const base::char16* comp, |
size_t comp_len, |
+ bool is_tld_ascii, |
base::string16* out); |
class AppendComponentTransform { |
@@ -200,6 +202,13 @@ base::string16 IDNToUnicodeWithAdjustments( |
input16.reserve(host.length()); |
input16.insert(input16.end(), host.begin(), host.end()); |
+ bool is_tld_ascii = true; |
+ size_t last_dot = host.rfind('.'); |
+ if (last_dot != base::StringPiece::npos && |
+ host.substr(last_dot).starts_with(".xn--")) { |
+ is_tld_ascii = false; |
+ } |
+ |
// Do each component of the host separately, since we enforce script matching |
// on a per-component basis. |
base::string16 out16; |
@@ -217,7 +226,7 @@ base::string16 IDNToUnicodeWithAdjustments( |
// Add the substring that we just found. |
converted_idn = |
IDNToUnicodeOneComponent(input16.data() + component_start, |
- component_length, &out16); |
+ component_length, is_tld_ascii, &out16); |
} |
size_t new_component_length = out16.length() - new_component_start; |
@@ -241,17 +250,22 @@ class IDNSpoofChecker { |
public: |
IDNSpoofChecker(); |
- // Returns true if |label| is safe to display as Unicode. In the event of |
- // library failure, all IDN inputs will be treated as unsafe. |
- bool Check(base::StringPiece16 label); |
+ // Returns true if |label| is safe to display as Unicode. When |
+ // |is_tld_ascii| is true, a label whose Cyrillic letters all look like |
+ // Latin letters is treated as unsafe. In the event of library failure, all |
+ // IDN inputs will be treated as unsafe. |
+ bool Check(base::StringPiece16 label, bool is_tld_ascii); |
private: |
void SetAllowedUnicodeSet(UErrorCode* status); |
+ bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string); |
USpoofChecker* checker_; |
icu::UnicodeSet deviation_characters_; |
icu::UnicodeSet non_ascii_latin_letters_; |
icu::UnicodeSet kana_letters_exceptions_; |
+ icu::UnicodeSet cyrillic_letters_; |
+ icu::UnicodeSet cyrillic_letters_latin_alike_; |
DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); |
}; |
@@ -314,10 +328,20 @@ IDNSpoofChecker::IDNSpoofChecker() { |
"[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); |
kana_letters_exceptions_.freeze(); |
+ // These Cyrillic letters look like Latin. A domain label entirely made of |
+ // these letters is blocked, as a simplified whole-script-spoofable check. |
+ cyrillic_letters_latin_alike_ = |
+ icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); |
+ cyrillic_letters_latin_alike_.freeze(); |
+ |
+ cyrillic_letters_ = |
+ icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); |
+ cyrillic_letters_.freeze(); |
+ |
DCHECK(U_SUCCESS(status)); |
} |
-bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
+bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) { |
UErrorCode status = U_ZERO_ERROR; |
int32_t result = uspoof_check(checker_, label.data(), |
base::checked_cast<int32_t>(label.size()), |
@@ -345,17 +369,19 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
return false; |
// If there's no script mixing, the input is regarded as safe without any |
- // extra check unless it contains Kana letter exceptions. Note that |
- // the following combinations of scripts are treated as a 'logical' single |
- // script. |
+ // extra check unless it contains Kana letter exceptions or it's made entirely |
+ // of Cyrillic letters that look like Latin letters. Note that the following |
+ // combinations of scripts are treated as a 'logical' single script. |
// - Chinese: Han, Bopomofo, Common |
// - Japanese: Han, Hiragana, Katakana, Common |
// - Korean: Hangul, Han, Common |
result &= USPOOF_RESTRICTION_LEVEL_MASK; |
- if (result == USPOOF_ASCII || |
- (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
- kana_letters_exceptions_.containsNone(label_string))) |
- return true; |
+ if (result == USPOOF_ASCII) return true; |
+ if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
+ kana_letters_exceptions_.containsNone(label_string)) { |
+ // Check Cyrillic confusable only for ASCII TLDs. |
+ return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string); |
+ } |
// Additional checks for |label| with multiple scripts, one of which is Latin. |
// Disallow non-ASCII Latin letters to mix with a non-Latin script. |
@@ -407,6 +433,25 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
return !dangerous_pattern->find(); |
} |
+bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( |
+ const icu::UnicodeString& label_string) { |
+ // Returns true if |label_string| has at least one character from |
+ // |cyrillic_letters_| and all such characters are also in |
+ // |cyrillic_letters_latin_alike_|. Other characters are ignored. |
+ // A shortcut of adding [0-9] and [_-] to cyrillic_letters_latin_alike_ and |
+ // checking that it contains every character of |label_string| would work |
+ // in most cases, but not if a label has non-letters outside ASCII. |
+ icu::UnicodeSet cyrillic_in_label; |
+ icu::StringCharacterIterator it(label_string); |
+ for (it.setToStart(); it.hasNext();) { |
+ const UChar32 c = it.next32PostInc(); |
+ if (cyrillic_letters_.contains(c)) |
+ cyrillic_in_label.add(c); |
+ } |
+ return !cyrillic_in_label.isEmpty() && |
+ cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label); |
+} |
+ |
void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { |
if (U_FAILURE(*status)) |
return; |
@@ -481,8 +526,8 @@ void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { |
// user. Note that this function does not deal with pure ASCII domain labels at |
// all even though it's possible to make up look-alike labels with ASCII |
// characters alone. |
-bool IsIDNComponentSafe(base::StringPiece16 label) { |
- return g_idn_spoof_checker.Get().Check(label); |
+bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) { |
+ return g_idn_spoof_checker.Get().Check(label, is_tld_ascii); |
} |
// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to |
@@ -527,6 +572,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER; |
// Returns whether any conversion was performed. |
bool IDNToUnicodeOneComponent(const base::char16* comp, |
size_t comp_len, |
+ bool is_tld_ascii, |
base::string16* out) { |
DCHECK(out); |
if (comp_len == 0) |
@@ -558,8 +604,9 @@ bool IDNToUnicodeOneComponent(const base::char16* comp, |
// can be safely displayed to the user. |
out->resize(original_length + output_length); |
if (IsIDNComponentSafe( |
- base::StringPiece16(out->data() + original_length, |
- base::checked_cast<size_t>(output_length)))) |
+ base::StringPiece16(out->data() + original_length, |
+ base::checked_cast<size_t>(output_length)), |
+ is_tld_ascii)) |
return true; |
} |