components/url_formatter/idn_spoof_checker.cc - Issue 2784933002: Mitigate spoofing attempt using Latin letters.

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)

Patch Set: use checked_cast and make win64 happy Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2017 The Chromium Authors. All rights reserved.	1 // Copyright 2017 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/idn_spoof_checker.h"	5 #include "components/url_formatter/idn_spoof_checker.h"

6	6

7 #include "base/numerics/safe_conversions.h"	7 #include "base/numerics/safe_conversions.h"

8 #include "base/strings/string_split.h"	8 #include "base/strings/string_split.h"

9 #include "base/strings/string_util.h"	9 #include "base/strings/string_util.h"

10 #include "base/threading/thread_local_storage.h"	10 #include "base/threading/thread_local_storage.h"

	11 #include "net/base/lookup_string_in_fixed_set.h"

11 #include "third_party/icu/source/common/unicode/schriter.h"	12 #include "third_party/icu/source/common/unicode/schriter.h"

12 #include "third_party/icu/source/common/unicode/unistr.h"	13 #include "third_party/icu/source/common/unicode/unistr.h"

13 #include "third_party/icu/source/i18n/unicode/regex.h"	14 #include "third_party/icu/source/i18n/unicode/regex.h"

	15 #include "third_party/icu/source/i18n/unicode/translit.h"

14 #include "third_party/icu/source/i18n/unicode/uspoof.h"	16 #include "third_party/icu/source/i18n/unicode/uspoof.h"

15	17

16 namespace url_formatter {	18 namespace url_formatter {

17	19

18 namespace {	20 namespace {

19 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;	21 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

20	22

21 void OnThreadTermination(void* regex_matcher) {	23 void OnThreadTermination(void* regex_matcher) {

22 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);	24 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

23 }	25 }

24	26

	27 #include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"

	28 // All the domains in the above file have 3 or fewer labels.

	29 const size_t kNumberOfLabelsToCheck = 3;

	30

	31 bool LookupMatchInTopDomains(base::StringPiece skeleton) {

	32 DCHECK_NE(skeleton.back(), '.');

	33 auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,

	34 base::SPLIT_WANT_ALL);

	35

	36 if (labels.size() > kNumberOfLabelsToCheck) {

	37 labels.erase(labels.begin(),

	38 labels.begin() + labels.size() - kNumberOfLabelsToCheck);

	39 }

	40

	41 while (labels.size() > 1) {

	42 std::string partial_skeleton = base::JoinString(labels, ".");

	43 if (net::LookupStringInFixedSet(

	44 kDafsa, arraysize(kDafsa), partial_skeleton.data(),

	45 partial_skeleton.length()) != net::kDafsaNotFound)

	46 return true;

	47 labels.erase(labels.begin());

	48 }

	49 return false;

	50 }

	51

25 } // namespace	52 } // namespace

26	53

27 IDNSpoofChecker::IDNSpoofChecker() {	54 IDNSpoofChecker::IDNSpoofChecker() {

28 UErrorCode status = U_ZERO_ERROR;	55 UErrorCode status = U_ZERO_ERROR;

29 checker_ = uspoof_open(&status);	56 checker_ = uspoof_open(&status);

30 if (U_FAILURE(status)) {	57 if (U_FAILURE(status)) {

31 checker_ = nullptr;	58 checker_ = nullptr;

32 return;	59 return;

33 }	60 }

34	61

(...skipping 26 matching lines...) Expand all Loading...
61 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);	88 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);

62 deviation_characters_.freeze();	89 deviation_characters_.freeze();

63	90

64 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary	91 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

65 // because additional characters pulled in with scx=Latn are not included in	92 // because additional characters pulled in with scx=Latn are not included in

66 // the allowed set.	93 // the allowed set.

67 non_ascii_latin_letters_ =	94 non_ascii_latin_letters_ =

68 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);	95 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

69 non_ascii_latin_letters_.freeze();	96 non_ascii_latin_letters_.freeze();

70	97

71 // These letters are parts of |dangerous_patterns_|.	98 // The following two sets are parts of |dangerous_patterns_|.

72 kana_letters_exceptions_ = icu::UnicodeSet(	99 kana_letters_exceptions_ = icu::UnicodeSet(

73 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),	100 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

74 status);	101 status);

75 kana_letters_exceptions_.freeze();	102 kana_letters_exceptions_.freeze();

	103 combining_diacritics_exceptions_ =

	104 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);

	105 combining_diacritics_exceptions_.freeze();

76	106

77 // These Cyrillic letters look like Latin. A domain label entirely made of	107 // These Cyrillic letters look like Latin. A domain label entirely made of

78 // these letters is blocked as a simplified whole-script-spoofable.	108 // these letters is blocked as a simplified whole-script-spoofable.

79 cyrillic_letters_latin_alike_ =	109 cyrillic_letters_latin_alike_ =

80 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);	110 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

81 cyrillic_letters_latin_alike_.freeze();	111 cyrillic_letters_latin_alike_.freeze();

82	112

83 cyrillic_letters_ =	113 cyrillic_letters_ =

84 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);	114 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

85 cyrillic_letters_.freeze();	115 cyrillic_letters_.freeze();

86	116

87 DCHECK(U_SUCCESS(status));	117 DCHECK(U_SUCCESS(status));

	118 // This set is used to determine whether or not to apply a slow

	119 // transliteration to remove diacritics to a given hostname before the

	120 // confusable skeleton calculation for comparison with top domain names. If

	121 // it has any character outside the set, the expensive step will be skipped

	122 // because it cannot match any of the top domain names.

	123 // The last ([\u0300-\u0339]) is a shorthand for "[:Identifier_Status=Allowed:]

	124 // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a

	125 // subset of the former but it does not matter because hostnames with

	126 // characters outside the latter set would be rejected in an earlier step.

	127 lgc_letters_n_ascii_ = icu::UnicodeSet(

	128 UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"

	129 "\\u002d][\\u0300-\\u0339]]"),

	130 status);

	131 lgc_letters_n_ascii_.freeze();

	132

	133 // Used for diacritics-removal before the skeleton calculation. Add

	134 // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark

	135 // removal; NFC". On top of that, supplement the Unicode confusable list by

	136 // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by

	137 // 'k', 'l' and 'n', respectively.

	138 // TODO(jshin): Revisit "ł > l; ø > o" mapping.

	139 UParseError parse_error;

	140 transliterator_.reset(icu::Transliterator::createFromRules(

	141 UNICODE_STRING_SIMPLE("DropAcc"),

	142 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"

	143 " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),

	144 UTRANS_FORWARD, parse_error, status));

	145 DCHECK(U_SUCCESS(status))

	146 << "Spoofchecker initalization failed due to an error: "

	147 << u_errorName(status);

88 }	148 }

89	149

90 IDNSpoofChecker::~IDNSpoofChecker() {	150 IDNSpoofChecker::~IDNSpoofChecker() {

91 uspoof_close(checker_);	151 uspoof_close(checker_);

92 }	152 }

93	153

94 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,	154 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,

95 bool is_tld_ascii) {	155 bool is_tld_ascii) {

96 UErrorCode status = U_ZERO_ERROR;	156 UErrorCode status = U_ZERO_ERROR;

97 int32_t result =	157 int32_t result =

(...skipping 15 matching lines...) Expand all Loading...
113 // "UTS 46 section 4 Processing step 4" applies validity criteria for	173 // "UTS 46 section 4 Processing step 4" applies validity criteria for

114 // non-transitional processing (i.e. do not map deviation characters) to any	174 // non-transitional processing (i.e. do not map deviation characters) to any

115 // punycode labels regardless of whether transitional or non-transitional is	175 // punycode labels regardless of whether transitional or non-transitional is

116 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted	176 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

117 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as	177 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

118 // such. See http://crbug.com/595263 .	178 // such. See http://crbug.com/595263 .

119 if (deviation_characters_.containsSome(label_string))	179 if (deviation_characters_.containsSome(label_string))

120 return false;	180 return false;

121	181

122 // If there's no script mixing, the input is regarded as safe without any	182 // If there's no script mixing, the input is regarded as safe without any

123 // extra check unless it contains Kana letter exceptions or it's made entirely	183 // extra check unless it falls into one of three categories:

124 // of Cyrillic letters that look like Latin letters. Note that the following	184 // - contains Kana letter exceptions

125 // combinations of scripts are treated as a 'logical' single script.	185 // - the TLD is ASCII and the input is made entirely of Cyrillic letters

	186 // that look like Latin letters.

	187 // - it has combining diacritic marks.

	188 // Note that the following combinations of scripts are treated as a 'logical'

	189 // single script.

126 // - Chinese: Han, Bopomofo, Common	190 // - Chinese: Han, Bopomofo, Common

127 // - Japanese: Han, Hiragana, Katakana, Common	191 // - Japanese: Han, Hiragana, Katakana, Common

128 // - Korean: Hangul, Han, Common	192 // - Korean: Hangul, Han, Common

129 result &= USPOOF_RESTRICTION_LEVEL_MASK;	193 result &= USPOOF_RESTRICTION_LEVEL_MASK;

130 if (result == USPOOF_ASCII)	194 if (result == USPOOF_ASCII)

131 return true;	195 return true;

132 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&	196 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

133 kana_letters_exceptions_.containsNone(label_string)) {	197 kana_letters_exceptions_.containsNone(label_string) &&

	198 combining_diacritics_exceptions_.containsNone(label_string)) {

134 // Check Cyrillic confusable only for ASCII TLDs.	199 // Check Cyrillic confusable only for ASCII TLDs.

135 return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);	200 return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);

136 }	201 }

137	202

138 // Additional checks for |label| with multiple scripts, one of which is Latin.	203 // Additional checks for |label| with multiple scripts, one of which is Latin.

139 // Disallow non-ASCII Latin letters to mix with a non-Latin script.	204 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

140 if (non_ascii_latin_letters_.containsSome(label_string))	205 // Note that the non-ASCII Latin check should not be applied when the entire

	206 // label is made of Latin. Checking with lgc_letters set here should be fine

	207 // because script mixing of LGC is already rejected.

	208 if (non_ascii_latin_letters_.containsSome(label_string) &&

	209 !lgc_letters_n_ascii_.containsAll(label_string))

141 return false;	210 return false;

142	211

143 if (!tls_index.initialized())	212 if (!tls_index.initialized())

144 tls_index.Initialize(&OnThreadTermination);	213 tls_index.Initialize(&OnThreadTermination);

145 icu::RegexMatcher* dangerous_pattern =	214 icu::RegexMatcher* dangerous_pattern =

146 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());	215 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

147 if (!dangerous_pattern) {	216 if (!dangerous_pattern) {

148 // Disallow the katakana no, so, zo, or n, as they may be mistaken for	217 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

149 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts	218 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

150 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a	219 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

151 // non-Japanese script on either side is disallowed, legitimate cases like	220 // non-Japanese script on either side is disallowed, legitimate cases like

152 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those	221 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

153 // characters when used alone as a label is futile because those cases	222 // characters when used alone as a label is futile because those cases

154 // would not reach here.	223 // would not reach here.

155 // Also disallow what used to be blocked by mixed-script-confusable (MSC)	224 // Also disallow what used to be blocked by mixed-script-confusable (MSC)

156 // detection. ICU 58 does not detect MSC any more for a single input string.	225 // detection. ICU 58 does not detect MSC any more for a single input string.

157 // See http://bugs.icu-project.org/trac/ticket/12823 .	226 // See http://bugs.icu-project.org/trac/ticket/12823 .

158 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.	227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

159 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana	228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

160 // Prolonged Sound) used out-of-context.	229 // Prolonged Sound) used out-of-context.

161 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)	230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

162 // unless they're preceded by a Katakana.	231 // unless they're preceded by a Katakana.

163 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters	232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

164 // (U+30D[8-A]) that look exactly like each other when they're used in a	233 // (U+30D[8-A]) that look exactly like each other when they're used in a

165 // label otherwise entirely in Katakana or Hiragana.	234 // label otherwise entirely in Katakana or Hiragana.

166 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small	235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small

167 // Letter Co) to be next to Latin.	236 // Letter Co) to be next to Latin.

168 // - Disallow Latin 'o' and 'g' next to Armenian.	237 // - Disallow Latin 'o' and 'g' next to Armenian.

169 // - Disallow mixing of Latin and Canadian Syllabary.	238 // - Disallow mixing of Latin and Canadian Syllabary.

	239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC

	240 // character. Other combining diacritical marks are not in the allowed

	241 // character set.

170 dangerous_pattern = new icu::RegexMatcher(	242 dangerous_pattern = new icu::RegexMatcher(

171 icu::UnicodeString(	243 icu::UnicodeString(

172 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"	244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"

173 R"([\u30ce\u30f3\u30bd\u30be])"	245 R"([\u30ce\u30f3\u30bd\u30be])"

174 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"	246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"

175 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"	247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"

176 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"	248 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"

177 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"	249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"

178 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"	250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"

179 R"([a-z]\u30fb|\u30fb[a-z]|)"	251 R"([a-z]\u30fb|\u30fb[a-z]|)"

180 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)"	252 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)"

181 R"([a-z][\u0585\u0581]+[a-z]|)"	253 R"([a-z][\u0585\u0581]+[a-z]|)"

182 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"	254 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"

183 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"	255 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"

184 R"([\p{sc=cans}].[a-z]|[a-z].[\p{sc=cans}])",	256 R"([\p{sc=cans}].[a-z]|[a-z].[\p{sc=cans}]|)"

	257 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",

185 -1, US_INV),	258 -1, US_INV),

186 0, status);	259 0, status);

187 tls_index.Set(dangerous_pattern);	260 tls_index.Set(dangerous_pattern);

188 }	261 }

189 dangerous_pattern->reset(label_string);	262 dangerous_pattern->reset(label_string);

190 return !dangerous_pattern->find();	263 return !dangerous_pattern->find();

191 }	264 }

192	265

	266 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {

	267 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);

	268 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);

	269 // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],

	270 // there is no point in getting rid of diacritics because combining marks

	271 // attached to non-LGC characters are already blocked.

	272 if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==

	273 ustr_host.length())

	274 transliterator_.get()->transliterate(ustr_host);

	275

	276 UErrorCode status = U_ZERO_ERROR;

	277 icu::UnicodeString ustr_skeleton;

	278 uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,

	279 &status);

	280 if (U_FAILURE(status))

	281 return false;

	282 std::string skeleton;

	283 ustr_skeleton.toUTF8String(skeleton);

	284 return LookupMatchInTopDomains(skeleton);

	285 }

	286

193 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(	287 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

194 const icu::UnicodeString& label) {	288 const icu::UnicodeString& label) {

	289 // Collect all the Cyrillic letters in |label_string| and see if they're

	290 // a subset of |cyrillic_letters_latin_alike_|.

195 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and	291 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

196 // [_-] and checking if the set contains all letters of |label_string|	292 // [_-] and checking if the set contains all letters of |label|

197 // would work in most cases, but not if a label has non-letters outside	293 // would work in most cases, but not if a label has non-letters outside

198 // ASCII.	294 // ASCII.

199 icu::UnicodeSet cyrillic_in_label;	295 icu::UnicodeSet cyrillic_in_label;

200 icu::StringCharacterIterator it(label);	296 icu::StringCharacterIterator it(label);

201 for (it.setToStart(); it.hasNext();) {	297 for (it.setToStart(); it.hasNext();) {

202 const UChar32 c = it.next32PostInc();	298 const UChar32 c = it.next32PostInc();

203 if (cyrillic_letters_.contains(c))	299 if (cyrillic_letters_.contains(c))

204 cyrillic_in_label.add(c);	300 cyrillic_in_label.add(c);

205 }	301 }

206 return !cyrillic_in_label.isEmpty() &&	302 return !cyrillic_in_label.isEmpty() &&

(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
287 allowed_set.remove(0x0F8Cu);	383 allowed_set.remove(0x0F8Cu);

288 allowed_set.remove(0x0F8Du);	384 allowed_set.remove(0x0F8Du);

289 allowed_set.remove(0x0F8Eu);	385 allowed_set.remove(0x0F8Eu);

290 allowed_set.remove(0x0F8Fu);	386 allowed_set.remove(0x0F8Fu);

291 #endif	387 #endif

292	388

293 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);	389 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

294 }	390 }

295	391

296 } // namespace url_formatter	392 } // namespace url_formatter

OLD	NEW

« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »