components/url_formatter/idn_spoof_checker.cc - Issue 2784933002: Mitigate spoofing attempt using Latin letters.

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)

Patch Set: pull IDNSpoofChecker to separate h/cc files Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/url_formatter/idn_spoof_checker.h ('K') | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | components/url_formatter/top_domains/README » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2017 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4 //

	5 #include "components/url_formatter/idn_spoof_checker.h"

	6

	7 #include "base/numerics/safe_conversions.h"

	8 #include "base/strings/string_split.h"

	9 #include "base/strings/string_util.h"

	10 #include "base/threading/thread_local_storage.h"

	11 #include "net/base/lookup_string_in_fixed_set.h"

	12 #include "third_party/icu/source/common/unicode/schriter.h"

	13 #include "third_party/icu/source/i18n/unicode/translit.h"

	14

	15 namespace url_formatter {

	16

	17 namespace {

// Thread-local slot that caches one icu::RegexMatcher per thread. It is
// initialized and filled in lazily by
// IDNSpoofChecker::SafeToDisplayAsUnicode().
base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

	19

	20 void OnThreadTermination(void* regex_matcher) {

	21 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

	22 }

	23

	24 #include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"

	25 // All the domains in the above file have 3 or fewer labels.

	26 const size_t kNumberOfLabelsToCheck = 3;

	27

	28 bool LookupMatchInTopDomains(base::StringPiece skeleton) {

	29 DCHECK(skeleton.back() != '.');
	Peter Kasting 2017/05/10 22:38:46 Nit: DCHECK_NE('.', skeleton.back())? Nit: DCHECK_NE('.', skeleton.back())? jungshik at Google 2017/05/14 09:36:22 Done. Show quoted text On 2017/05/10 22:38:46, Peter Kasting wrote: > Nit: DCHECK_NE('.', skeleton.back())? Done.
	30 auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,

	31 base::SPLIT_WANT_ALL);

	32

	33 if (labels.size() > kNumberOfLabelsToCheck) {

	34 labels.erase(labels.begin(),

	35 labels.begin() + labels.size() - kNumberOfLabelsToCheck);

	36 }

	37

	38 while (labels.size() > 1) {

	39 std::string partial_skeleton = base::JoinString(labels, ".");

	40 if (net::LookupStringInFixedSet(

	41 kDafsa, arraysize(kDafsa), partial_skeleton.data(),

	42 partial_skeleton.length()) != net::kDafsaNotFound)

	43 return true;

	44 labels.erase(labels.begin());

	45 }

	46 return false;

	47 }

	48

	49 } // namespace

	50

	51 IDNSpoofChecker::IDNSpoofChecker() {

	52 UErrorCode status = U_ZERO_ERROR;

	53 checker_ = uspoof_open(&status);

	54 if (U_FAILURE(status)) {

	55 checker_ = nullptr;

	56 return;

	57 }

	58

	59 // At this point, USpoofChecker has all the checks enabled except

	60 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE,

	61 // MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, MIXED_NUMBERS, ANY_CASE})

	62 // This default configuration is adjusted below as necessary.

	63

	64 // Set the restriction level to moderate. It allows mixing Latin with another

	65 // script (+ COMMON and INHERITED). Except for Chinese(Han + Bopomofo),

	66 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one

	67 // script other than Common and Inherited can be mixed with Latin. Cyrillic

	68 // and Greek are not allowed to mix with Latin.

	69 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection

	70 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);

	71

	72 // Sets allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.

	73 SetAllowedUnicodeSet(&status);

	74

	75 // Enable the return of auxillary (non-error) information.

	76 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of

	77 // ICU 58.1, WSC is a no-op in a single string check API.

	78 int32_t checks = uspoof_getChecks(checker_, &status) \| USPOOF_AUX_INFO;

	79 uspoof_setChecks(checker_, checks, &status);

	80

	81 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46

	82 // transitional processing treats them as IDNA 2003 does; maps U+00DF and

	83 // U+03C2 and drops U+200[CD].

	84 deviation_characters_ = icu::UnicodeSet(

	85 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);

	86 deviation_characters_.freeze();

	87

	88 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

	89 // because additional characters pulled in with scx=Latn are not included in

	90 // the allowed set.

	91 non_ascii_latin_letters_ =

	92 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

	93 non_ascii_latin_letters_.freeze();

	94

	95 // The following two sets are parts of \|dangerous_patterns_\|.

	96 kana_letters_exceptions_ = icu::UnicodeSet(

	97 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

	98 status);

	99 kana_letters_exceptions_.freeze();

	100 combining_diacritics_exceptions_ =

	101 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);

	102 combining_diacritics_exceptions_.freeze();

	103

	104 // These Cyrillic letters look like Latin. A domain label entirely made of

	105 // these letters is blocked as a simplified whole-script-spoofable.

	106 cyrillic_letters_latin_alike_ =

	107 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

	108 cyrillic_letters_latin_alike_.freeze();

	109

	110 cyrillic_letters_ =

	111 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

	112 cyrillic_letters_.freeze();

	113

	114 // This set is used to determine whether or not to apply a slow
	Peter Kasting 2017/05/10 22:38:46 Since you're both moving this code to this file an Since you're both moving this code to this file and modifying it, it's hard to tell what the functional vs. nonfunctional diff from this change is. Consider splitting off a separate initial CL that just moves the code out to its own file without modifying it (more than is necessary), then having this CL only make functional changes. jungshik at Google 2017/05/14 09:36:22 Ok. A file reorg CL is https://codereview.chromiu Show quoted text On 2017/05/10 22:38:46, Peter Kasting wrote: > Since you're both moving this code to this file and modifying it, it's hard to > tell what the functional vs. nonfunctional diff from this change is. Consider > splitting off a separate initial CL that just moves the code out to its own file > without modifying it (more than is necessary), then having this CL only make > functional changes. Ok. A file reorg CL is https://codereview.chromium.org/2877973003 .
	115 // transliteration to remove diacritics to a given hostname before the

	116 // confusable skeleton calculation for comparison with top domain names. If

	117 // it has any character outside the set, the expensive step will be skipped

	118 // because it cannot match any of top domain names.

	119 // The last ([\u0300-\u0339] is a shorthand for "[:Identifier_Status=Allowed:]

	120 // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a

	121 // subset of the former but it does not matter because hostnames with

	122 // characters outside the latter set would be rejected in an earlier step.

	123 lgc_letters_n_ascii_ = icu::UnicodeSet(

	124 UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"

	125 "\\u002d][\\u0300-\\u0339]]"),

	126 status);

	127 lgc_letters_n_ascii_.freeze();

	128

	129 // Used for diacritics-removal before the skeleton calculation. Add

	130 // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark

	131 // removal; NFC". On top of that, supplement the Unicode confusable list by

	132 // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by

	133 // 'k', 'l' and 'n', respectively.

	134 // TODO(jshin): Revisit "ł > l; ø > o" mapping.

	135 UParseError parse_error;

	136 transliterator_.reset(icu::Transliterator::createFromRules(

	137 UNICODE_STRING_SIMPLE("DropAcc"),

	138 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"

	139 " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),

	140 UTRANS_FORWARD, parse_error, status));

	141 DCHECK(U_SUCCESS(status))

	142 << "Spoofchecker initalization failed due to an error: "

	143 << u_errorName(status);

	144 }

	145

// Closes the ICU spoof checker that the constructor opened with
// uspoof_open().
IDNSpoofChecker::~IDNSpoofChecker() {
  uspoof_close(checker_);
}

	149

	150 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,

	151 bool is_tld_ascii) {

	152 UErrorCode status = U_ZERO_ERROR;

	153 int32_t result =

	154 uspoof_check(checker_, label.data(),

	155 base::checked_cast<int32_t>(label.size()), NULL, &status);

	156 // If uspoof_check fails (due to library failure), or if any of the checks

	157 // fail, treat the IDN as unsafe.

	158 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))

	159 return false;

	160

	161 icu::UnicodeString label_string(FALSE, label.data(),

	162 base::checked_cast<int32_t>(label.size()));

	163

	164 // A punycode label with 'xn--' prefix is not subject to the URL

	165 // canonicalization and is stored as it is in GURL. If it encodes a deviation

	166 // character (UTS 46; e.g. U+00DF/sharp-s), it should be still shown in

	167 // punycode instead of Unicode. Without this check, xn--fu-hia for

	168 // 'fu<sharp-s>' would be converted to 'fu<sharp-s>' for display because

	169 // "UTS 46 section 4 Processing step 4" applies validity criteria for

	170 // non-transitional processing (i.e. do not map deviation characters) to any

	171 // punycode labels regardless of whether transitional or non-transitional is

	172 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

	173 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

	174 // such. See http://crbug.com/595263 .

	175 if (deviation_characters_.containsSome(label_string))

	176 return false;

	177

	178 // If there's no script mixing, the input is regarded as safe without any

	179 // extra check unless it falls into one of three categories:

	180 // - contains Kana letter exceptions

	181 // - the TLD is ASCII and the input is made entirely of Cyrillic letters

	182 // that look like Latin letters.

	183 // - it has combining diacritic marks.

	184 // Note that the following combinations of scripts are treated as a 'logical'

	185 // single script.

	186 // - Chinese: Han, Bopomofo, Common

	187 // - Japanese: Han, Hiragana, Katakana, Common

	188 // - Korean: Hangul, Han, Common

	189 result &= USPOOF_RESTRICTION_LEVEL_MASK;

	190 if (result == USPOOF_ASCII)

	191 return true;

	192 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

	193 kana_letters_exceptions_.containsNone(label_string) &&

	194 combining_diacritics_exceptions_.containsNone(label_string)) {

	195 // Check Cyrillic confusable only for ASCII TLDs.

	196 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);

	197 }

	198

	199 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

	200 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

	201 // Note that the non-ASCII Latin check should not be applied when the entire

	202 // label is made of Latin. Checking with lgc_letters set here should be fine

	203 // because script mixing of LGC is already rejected.

	204 if (non_ascii_latin_letters_.containsSome(label_string) &&

	205 !lgc_letters_n_ascii_.containsAll(label_string))

	206 return false;

	207

	208 if (!tls_index.initialized())

	209 tls_index.Initialize(&OnThreadTermination);

	210 icu::RegexMatcher* dangerous_pattern =

	211 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

	212 if (!dangerous_pattern) {

	213 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

	214 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

	215 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

	216 // non-Japanese script on either side is disallowed, legitimate cases like

	217 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

	218 // characters when used alone as a label is futile because those cases

	219 // would not reach here.

	220 // Also disallow what used to be blocked by mixed-script-confusable (MSC)

	221 // detection. ICU 58 does not detect MSC any more for a single input string.

	222 // See http://bugs.icu-project.org/trac/ticket/12823 .

	223 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

	224 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

	225 // Prolonged Sound) used out-of-context.

	226 // - Dislallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

	227 // unless they're preceded by a Katakana.

	228 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

	229 // (U+30D[8-A]) that look exactly like each other when they're used in a

	230 // label otherwise entirely in Katakna or Hiragana.

	231 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small

	232 // Letter Co) to be next to Latin.

	233 // - Disallow Latin 'o' and 'g' next to Armenian.

	234 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC

	235 // character. Other combining diacritical marks are not in the allowed

	236 // character set.

	237 dangerous_pattern = new icu::RegexMatcher(

	238 icu::UnicodeString(

	239 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

	240 "[\\u30ce\\u30f3\\u30bd\\u30be]"

	241 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"

	242 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|^\\u30fc\|"

	243 "[^\\p{scx=kana}][\\u30fd\\u30fe]\|^[\\u30fd\\u30fe]\|"

	244 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"

	245 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"

	246 "[a-z]\\u30fb\|\\u30fb[a-z]\|"

	247 "^[\\u0585\\u0581]+[a-z]\|[a-z][\\u0585\\u0581]+$\|"

	248 "[a-z][\\u0585\\u0581]+[a-z]\|"

	249 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"

	250 "[\\p{scx=armn}][og]+[\\p{scx=armn}]\|"

	251 "[^\\p{scx=latn}\\p{scx=grek}\\p{scx=cyrl}][\\u0300-\\u0339]",

	252 -1, US_INV),

	253 0, status);

	254 tls_index.Set(dangerous_pattern);

	255 }

	256 dangerous_pattern->reset(label_string);

	257 return !dangerous_pattern->find();

	258 }

	259

	260 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {

	261 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);

	262 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);

	263 // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],

	264 // there is no point in getting rid of diacritics because combining marks

	265 // attached to non-LGC characters are already blocked.

	266 if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==

	267 ustr_host.length())

	268 transliterator_.get()->transliterate(ustr_host);

	269

	270 UErrorCode status = U_ZERO_ERROR;

	271 icu::UnicodeString ustr_skeleton;

	272 uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,

	273 &status);

	274 if (U_FAILURE(status))

	275 return false;

	276 std::string skeleton;

	277 ustr_skeleton.toUTF8String(skeleton);

	278 return LookupMatchInTopDomains(skeleton);

	279 }

	280

	281 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

	282 const icu::UnicodeString& label_string) {

	283 // Collect all the Cyrillic letters in \|label_string\| and see if they're

	284 // a subset of \|cyrillic_letters_latin_alike_\|.
	Peter Kasting 2017/05/10 22:38:46 Nit: The sentence above is the sort of thing I'd e Nit: The sentence above is the sort of thing I'd expect to see on the function declaration instead. jungshik at Google 2017/05/14 09:36:22 Added to the header. Show quoted text On 2017/05/10 22:38:46, Peter Kasting wrote: > Nit: The sentence above is the sort of thing I'd expect to see on the function > declaration instead. Added to the header.
	285 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

	286 // [_-] and checking if the set contains all letters of \|label_string\|

	287 // would work in most cases, but not if a label has non-letters outside

	288 // ASCII.

	289 icu::UnicodeSet cyrillic_in_label;

	290 icu::StringCharacterIterator it(label_string);

	291 for (it.setToStart(); it.hasNext();) {

	292 const UChar32 c = it.next32PostInc();

	293 if (cyrillic_letters_.contains(c))

	294 cyrillic_in_label.add(c);

	295 }

	296 return !cyrillic_in_label.isEmpty() &&

	297 cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);

	298 }

	299

	300 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {

	301 if (U_FAILURE(*status))

	302 return;

	303

	304 // The recommended set is a set of characters for identifiers in a

	305 // security-sensitive environment taken from UTR 39

	306 // (http://unicode.org/reports/tr39/) and

	307 // http://www.unicode.org/Public/security/latest/xidmodifications.txt .

	308 // The inclusion set comes from "Candidate Characters for Inclusion

	309 // in idenfiers" of UTR 31 (http://www.unicode.org/reports/tr31). The list

	310 // may change over the time and will be updated whenever the version of ICU

	311 // used in Chromium is updated.

	312 const icu::UnicodeSet* recommended_set =

	313 uspoof_getRecommendedUnicodeSet(status);

	314 icu::UnicodeSet allowed_set;

	315 allowed_set.addAll(*recommended_set);

	316 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status);

	317 allowed_set.addAll(*inclusion_set);

	318

	319 // Five aspirational scripts are taken from UTR 31 Table 6 at

	320 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts .

	321 // Not all the characters of aspirational scripts are suitable for

	322 // identifiers. Therefore, only characters belonging to

	323 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational'

	324 // section at

	325 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are

	326 // are added to the allowed set. The list has to be updated when a new

	327 // version of Unicode is released. The current version is 9.0.0 and ICU 60

	328 // will have Unicode 10.0 data.

	329 #if U_ICU_VERSION_MAJOR_NUM < 60

	330 const icu::UnicodeSet aspirational_scripts(

	331 icu::UnicodeString(

	332 // Unified Canadian Syllabics

	333 "[\\u1401-\\u166C\\u166F-\\u167F"

	334 // Mongolian

	335 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA"

	336 // Unified Canadian Syllabics

	337 "\\u18B0-\\u18F5"

	338 // Tifinagh

	339 "\\u2D30-\\u2D67\\u2D7F"

	340 // Yi

	341 "\\uA000-\\uA48C"

	342 // Miao

	343 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E"

	344 "\\U00016F8F-\\U00016F9F]",

	345 -1, US_INV),

	346 *status);

	347 allowed_set.addAll(aspirational_scripts);

	348 #else

	349 #error "Update aspirational_scripts per Unicode 10.0"

	350 #endif

	351

	352 // U+0338 is included in the recommended set, while U+05F4 and U+2027 are in

	353 // the inclusion set. However, they are blacklisted as a part of Mozilla's

	354 // IDN blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars).

	355 // U+2010 is in the inclusion set, but we drop it because it can be confused

	356 // with an ASCII U+002D (Hyphen-Minus).

	357 // U+0338 and U+2027 are dropped; the former can look like a slash when

	358 // rendered with a broken font, and the latter can be confused with U+30FB

	359 // (Katakana Middle Dot). U+05F4 (Hebrew Punctuation Gershayim) is kept,

	360 // even though it can look like a double quotation mark. Using it in Hebrew

	361 // should be safe. When used with a non-Hebrew script, it'd be filtered by

	362 // other checks in place.

	363 allowed_set.remove(0x338u); // Combining Long Solidus Overlay

	364 allowed_set.remove(0x2010u); // Hyphen

	365 allowed_set.remove(0x2027u); // Hyphenation Point

	366

	367 #if defined(OS_MACOSX)

	368 // The following characters are reported as present in the default macOS

	369 // system UI font, but they render as blank. Remove them from the allowed

	370 // set to prevent spoofing.

	371 // Tibetan characters used for transliteration of ancient texts:

	372 allowed_set.remove(0x0F8Cu);

	373 allowed_set.remove(0x0F8Du);

	374 allowed_set.remove(0x0F8Eu);

	375 allowed_set.remove(0x0F8Fu);

	376 #endif

	377

	378 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

	379 }

	380

	381 } // namespace url_formatter

OLD	NEW