Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(50)

Unified Diff: components/url_formatter/url_formatter.cc

Issue 2683793010: Block domain labels made of Cyrillic letters that look alike Latin (Closed)
Patch Set: go back to ps11 Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/url_formatter/url_formatter.cc
diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc
index a93bf1154c333fbd7b268855f60a3eac3b250353..3d5740fd3778550c5c98bce8d533d44885a6fdf9 100644
--- a/components/url_formatter/url_formatter.cc
+++ b/components/url_formatter/url_formatter.cc
@@ -15,6 +15,7 @@
#include "base/strings/utf_offset_string_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "base/threading/thread_local_storage.h"
+#include "third_party/icu/source/common/unicode/schriter.h"
#include "third_party/icu/source/common/unicode/uidna.h"
#include "third_party/icu/source/common/unicode/uniset.h"
#include "third_party/icu/source/common/unicode/uscript.h"
@@ -33,6 +34,7 @@ base::string16 IDNToUnicodeWithAdjustments(
base::OffsetAdjuster::Adjustments* adjustments);
bool IDNToUnicodeOneComponent(const base::char16* comp,
size_t comp_len,
+ bool is_tld_ascii,
base::string16* out);
class AppendComponentTransform {
@@ -200,6 +202,13 @@ base::string16 IDNToUnicodeWithAdjustments(
input16.reserve(host.length());
input16.insert(input16.end(), host.begin(), host.end());
+ bool is_tld_ascii = true;
+ size_t last_dot = host.rfind('.');
+ if (last_dot != base::StringPiece::npos &&
+ host.substr(last_dot).starts_with(".xn--")) {
+ is_tld_ascii = false;
+ }
+
// Do each component of the host separately, since we enforce script matching
// on a per-component basis.
base::string16 out16;
@@ -217,7 +226,7 @@ base::string16 IDNToUnicodeWithAdjustments(
// Add the substring that we just found.
converted_idn =
IDNToUnicodeOneComponent(input16.data() + component_start,
- component_length, &out16);
+ component_length, is_tld_ascii, &out16);
}
size_t new_component_length = out16.length() - new_component_start;
@@ -241,17 +250,22 @@ class IDNSpoofChecker {
public:
IDNSpoofChecker();
- // Returns true if |label| is safe to display as Unicode. In the event of
- // library failure, all IDN inputs will be treated as unsafe.
- bool Check(base::StringPiece16 label);
+ // Returns true if |label| is safe to display as Unicode. When the TLD is
+ // ASCII, check if a label is entirely made of Cyrillic letters that look like
+ // Latin letters. In the event of library failure, all IDN inputs will be
+ // treated as unsafe.
+ bool Check(base::StringPiece16 label, bool is_tld_ascii);
private:
void SetAllowedUnicodeSet(UErrorCode* status);
+ bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);
USpoofChecker* checker_;
icu::UnicodeSet deviation_characters_;
icu::UnicodeSet non_ascii_latin_letters_;
icu::UnicodeSet kana_letters_exceptions_;
+ icu::UnicodeSet cyrillic_letters_;
+ icu::UnicodeSet cyrillic_letters_latin_alike_;
DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
};
@@ -314,10 +328,20 @@ IDNSpoofChecker::IDNSpoofChecker() {
"[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);
kana_letters_exceptions_.freeze();
+ // These Cyrillic letters look like Latin. A domain label entirely made of
+ // these letters is blocked as a simpliified whole-script-spoofable.
+ cyrillic_letters_latin_alike_ =
+ icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);
+ cyrillic_letters_latin_alike_.freeze();
+
+ cyrillic_letters_ =
+ icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);
+ cyrillic_letters_.freeze();
+
DCHECK(U_SUCCESS(status));
}
-bool IDNSpoofChecker::Check(base::StringPiece16 label) {
+bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {
UErrorCode status = U_ZERO_ERROR;
int32_t result = uspoof_check(checker_, label.data(),
base::checked_cast<int32_t>(label.size()),
@@ -345,17 +369,19 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {
return false;
// If there's no script mixing, the input is regarded as safe without any
- // extra check unless it contains Kana letter exceptions. Note that
- // the following combinations of scripts are treated as a 'logical' single
- // script.
+ // extra check unless it contains Kana letter exceptions or it's made entirely
+ // of Cyrillic letters that look like Latin letters. Note that the following
+ // combinations of scripts are treated as a 'logical' single script.
// - Chinese: Han, Bopomofo, Common
// - Japanese: Han, Hiragana, Katakana, Common
// - Korean: Hangul, Han, Common
result &= USPOOF_RESTRICTION_LEVEL_MASK;
- if (result == USPOOF_ASCII ||
- (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
- kana_letters_exceptions_.containsNone(label_string)))
- return true;
+ if (result == USPOOF_ASCII) return true;
+ if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
+ kana_letters_exceptions_.containsNone(label_string)) {
+ // Check Cyrillic confusable only for ASCII TLDs.
+ return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
+ }
// Additional checks for |label| with multiple scripts, one of which is Latin.
// Disallow non-ASCII Latin letters to mix with a non-Latin script.
@@ -407,6 +433,25 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {
return !dangerous_pattern->find();
}
+bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
+ const icu::UnicodeString& label_string) {
+ // Collect all the Cyrillic letters in |label_string| and see if they're
+ // a subset of |cyrillic_letters_latin_alike_|.
+ // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and
+ // [_-] and checking if the set contains all letters of |label_string|
+ // would work in most cases, but not if a label has non-letters outside
+ // ASCII.
+ icu::UnicodeSet cyrillic_in_label;
+ icu::StringCharacterIterator it(label_string);
+ for (it.setToStart(); it.hasNext();) {
+ const UChar32 c = it.next32PostInc();
+ if (cyrillic_letters_.contains(c))
+ cyrillic_in_label.add(c);
+ }
+ return !cyrillic_in_label.isEmpty() &&
+ cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);
+}
+
void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {
if (U_FAILURE(*status))
return;
@@ -481,8 +526,8 @@ void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {
// user. Note that this function does not deal with pure ASCII domain labels at
// all even though it's possible to make up look-alike labels with ASCII
// characters alone.
-bool IsIDNComponentSafe(base::StringPiece16 label) {
- return g_idn_spoof_checker.Get().Check(label);
+bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) {
+ return g_idn_spoof_checker.Get().Check(label, is_tld_ascii);
}
// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
@@ -527,6 +572,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
// Returns whether any conversion was performed.
bool IDNToUnicodeOneComponent(const base::char16* comp,
size_t comp_len,
+ bool is_tld_ascii,
base::string16* out) {
DCHECK(out);
if (comp_len == 0)
@@ -558,8 +604,9 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
// can be safely displayed to the user.
out->resize(original_length + output_length);
if (IsIDNComponentSafe(
- base::StringPiece16(out->data() + original_length,
- base::checked_cast<size_t>(output_length))))
+ base::StringPiece16(out->data() + original_length,
+ base::checked_cast<size_t>(output_length)),
+ is_tld_ascii))
return true;
}
« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698