Chromium Code Reviews| Index: components/url_formatter/url_formatter.cc |
| diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc |
| index a93bf1154c333fbd7b268855f60a3eac3b250353..f59351428cf5e91f6bea01f54c2617d9375cbec7 100644 |
| --- a/components/url_formatter/url_formatter.cc |
| +++ b/components/url_formatter/url_formatter.cc |
| @@ -15,6 +15,7 @@ |
| #include "base/strings/utf_offset_string_conversions.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/threading/thread_local_storage.h" |
| +#include "third_party/icu/source/common/unicode/schriter.h" |
| #include "third_party/icu/source/common/unicode/uidna.h" |
| #include "third_party/icu/source/common/unicode/uniset.h" |
| #include "third_party/icu/source/common/unicode/uscript.h" |
| @@ -33,6 +34,7 @@ base::string16 IDNToUnicodeWithAdjustments( |
| base::OffsetAdjuster::Adjustments* adjustments); |
| bool IDNToUnicodeOneComponent(const base::char16* comp, |
| size_t comp_len, |
| + bool is_tld_ascii, |
| base::string16* out); |
| class AppendComponentTransform { |
| @@ -200,6 +202,13 @@ base::string16 IDNToUnicodeWithAdjustments( |
| input16.reserve(host.length()); |
| input16.insert(input16.end(), host.begin(), host.end()); |
| + bool is_tld_ascii = true; |
| + size_t last_dot = host.rfind('.'); |
| + if (last_dot != base::StringPiece::npos && |
| + host.substr(last_dot).starts_with(".xn--")) { |
| + is_tld_ascii = false; |
| + } |
| + |
| // Do each component of the host separately, since we enforce script matching |
| // on a per-component basis. |
| base::string16 out16; |
| @@ -217,7 +226,7 @@ base::string16 IDNToUnicodeWithAdjustments( |
| // Add the substring that we just found. |
| converted_idn = |
| IDNToUnicodeOneComponent(input16.data() + component_start, |
| - component_length, &out16); |
| + component_length, is_tld_ascii, &out16); |
| } |
| size_t new_component_length = out16.length() - new_component_start; |
| @@ -241,17 +250,22 @@ class IDNSpoofChecker { |
| public: |
| IDNSpoofChecker(); |
| - // Returns true if |label| is safe to display as Unicode. In the event of |
| - // library failure, all IDN inputs will be treated as unsafe. |
| - bool Check(base::StringPiece16 label); |
| + // Returns true if |label| is safe to display as Unicode. When |
|
Peter Kasting
2017/03/22 22:14:08
Nit: When -> When the
Wrap at 80 columns.
|
| + // TLD is ASCII, check if a label is entirely made of |
| + // Cyrillic letters that look like Latin letters. In the event of library |
| + // failure, all IDN inputs will be treated as unsafe. |
| + bool Check(base::StringPiece16 label, bool is_tld_ascii); |
| private: |
| void SetAllowedUnicodeSet(UErrorCode* status); |
| + bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string); |
| USpoofChecker* checker_; |
| icu::UnicodeSet deviation_characters_; |
| icu::UnicodeSet non_ascii_latin_letters_; |
| icu::UnicodeSet kana_letters_exceptions_; |
| + icu::UnicodeSet cyrillic_letters_; |
| + icu::UnicodeSet cyrillic_letters_latin_alike_; |
| DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); |
| }; |
| @@ -313,11 +327,22 @@ IDNSpoofChecker::IDNSpoofChecker() { |
| kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( |
| "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); |
| kana_letters_exceptions_.freeze(); |
| + // These Cyrillic letters look like Latin. A domain label entirely |
|
Peter Kasting
2017/03/22 22:14:08
Nit: Blank line above this
|
| + // made of these letters are blocked as a poorman's whole-script-spoofable. |
|
Peter Kasting
2017/03/22 22:14:08
Nit: are -> is; poorman's -> simplified
|
| + cyrillic_letters_latin_alike_ = |
| + icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); |
| + // ӕъЬвҽвԍнкпгмтцѵѡүӔѴҲ |
| + // icu::UnicodeString("[аеорсухьѕіјһмӏтнв]"), status); |
|
Peter Kasting
2017/03/22 22:14:08
Looks like these comments are leftovers?
|
| + cyrillic_letters_latin_alike_.freeze(); |
| + |
| + cyrillic_letters_ = |
| + icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); |
| + cyrillic_letters_.freeze(); |
| DCHECK(U_SUCCESS(status)); |
| } |
| -bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
| +bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) { |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t result = uspoof_check(checker_, label.data(), |
| base::checked_cast<int32_t>(label.size()), |
| @@ -345,17 +370,21 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
| return false; |
| // If there's no script mixing, the input is regarded as safe without any |
| - // extra check unless it contains Kana letter exceptions. Note that |
| + // extra check unless it contains Kana letter exceptions or it's made enitrely |
|
Peter Kasting
2017/03/22 22:14:08
Nit: entirely
|
| + // of Cyrillic letters that look alike Latin letters. Note that |
|
Peter Kasting
2017/03/22 22:14:08
Nit: alike -> like
|
| // the following combinations of scripts are treated as a 'logical' single |
| // script. |
| // - Chinese: Han, Bopomofo, Common |
| // - Japanese: Han, Hiragana, Katakana, Common |
| // - Korean: Hangul, Han, Common |
| result &= USPOOF_RESTRICTION_LEVEL_MASK; |
| - if (result == USPOOF_ASCII || |
| - (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
| - kana_letters_exceptions_.containsNone(label_string))) |
| + if (result == USPOOF_ASCII) |
| return true; |
| + if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
| + kana_letters_exceptions_.containsNone(label_string)) { |
| + // Check Cyrillic confusable only for ASCII TLDs. |
| + return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string); |
| + } |
| // Additional checks for |label| with multiple scripts, one of which is Latin. |
| // Disallow non-ASCII Latin letters to mix with a non-Latin script. |
| @@ -407,6 +436,26 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
| return !dangerous_pattern->find(); |
| } |
| +bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( |
| + const icu::UnicodeString& label_string) { |
| + // Collect all the Cyrillic letters in |label_string| and see if they're |
| + // a subset of |cyrillic_letters_latin_alike_|. |
| + // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and |
| + // [_-] and checking if the set contains all letters of |label_string| |
| + // would work in most cases, but it'd not if a label has non-letters outside |
|
Peter Kasting
2017/03/22 22:14:08
Nit: Remove "it'd"
|
| + // ASCII. |
| + icu::UnicodeSet cyrillic_in_label; |
| + icu::StringCharacterIterator it(label_string); |
| + UChar32 c; |
|
Peter Kasting
2017/03/22 22:14:08
Nit: Define inside loop (and can then be const)
|
| + for (it.setToStart(); it.hasNext();) { |
| + c = it.next32PostInc(); |
| + if (cyrillic_letters_.contains(c)) |
| + cyrillic_in_label.add(c); |
| + } |
| + return !cyrillic_in_label.isEmpty() && |
| + cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label); |
| +} |
| + |
| void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { |
| if (U_FAILURE(*status)) |
| return; |
| @@ -481,8 +530,8 @@ void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { |
| // user. Note that this function does not deal with pure ASCII domain labels at |
| // all even though it's possible to make up look-alike labels with ASCII |
| // characters alone. |
| -bool IsIDNComponentSafe(base::StringPiece16 label) { |
| - return g_idn_spoof_checker.Get().Check(label); |
| +bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) { |
| + return g_idn_spoof_checker.Get().Check(label, is_tld_ascii); |
| } |
| // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to |
| @@ -527,6 +576,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER; |
| // Returns whether any conversion was performed. |
| bool IDNToUnicodeOneComponent(const base::char16* comp, |
| size_t comp_len, |
| + bool is_tld_ascii, |
| base::string16* out) { |
| DCHECK(out); |
| if (comp_len == 0) |
| @@ -558,8 +608,9 @@ bool IDNToUnicodeOneComponent(const base::char16* comp, |
| // can be safely displayed to the user. |
| out->resize(original_length + output_length); |
| if (IsIDNComponentSafe( |
| - base::StringPiece16(out->data() + original_length, |
| - base::checked_cast<size_t>(output_length)))) |
| + base::StringPiece16(out->data() + original_length, |
| + base::checked_cast<size_t>(output_length)), |
| + is_tld_ascii)) |
| return true; |
| } |