components/url_formatter/idn_spoof_checker.cc - Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters.

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters. (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2017 The Chromium Authors. All rights reserved.	1 // Copyright 2017 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/idn_spoof_checker.h"	5 #include "components/url_formatter/idn_spoof_checker.h"

6	6

7 #include "base/numerics/safe_conversions.h"	7 #include "base/numerics/safe_conversions.h"

8 #include "base/strings/string_split.h"	8 #include "base/strings/string_split.h"

9 #include "base/strings/string_util.h"	9 #include "base/strings/string_util.h"

10 #include "base/threading/thread_local_storage.h"	10 #include "base/threading/thread_local_storage.h"

11 #include "net/base/lookup_string_in_fixed_set.h"

12 #include "third_party/icu/source/common/unicode/schriter.h"	11 #include "third_party/icu/source/common/unicode/schriter.h"

13 #include "third_party/icu/source/common/unicode/unistr.h"	12 #include "third_party/icu/source/common/unicode/unistr.h"

14 #include "third_party/icu/source/i18n/unicode/regex.h"	13 #include "third_party/icu/source/i18n/unicode/regex.h"

15 #include "third_party/icu/source/i18n/unicode/translit.h"

16 #include "third_party/icu/source/i18n/unicode/uspoof.h"	14 #include "third_party/icu/source/i18n/unicode/uspoof.h"

17	15

18 namespace url_formatter {	16 namespace url_formatter {

19	17

20 namespace {	18 namespace {

21 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;	19 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

22	20

23 void OnThreadTermination(void* regex_matcher) {	21 void OnThreadTermination(void* regex_matcher) {

24 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);	22 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

25 }	23 }

26	24

27 #include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"

28 // All the domains in the above file have 3 or fewer labels.

29 const size_t kNumberOfLabelsToCheck = 3;

30

31 bool LookupMatchInTopDomains(base::StringPiece skeleton) {

32 DCHECK_NE(skeleton.back(), '.');

33 auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,

34 base::SPLIT_WANT_ALL);

35

36 if (labels.size() > kNumberOfLabelsToCheck) {

37 labels.erase(labels.begin(),

38 labels.begin() + labels.size() - kNumberOfLabelsToCheck);

39 }

40

41 while (labels.size() > 1) {

42 std::string partial_skeleton = base::JoinString(labels, ".");

43 if (net::LookupStringInFixedSet(

44 kDafsa, arraysize(kDafsa), partial_skeleton.data(),

45 partial_skeleton.length()) != net::kDafsaNotFound)

46 return true;

47 labels.erase(labels.begin());

48 }

49 return false;

50 }

51

52 } // namespace	25 } // namespace

53	26

54 IDNSpoofChecker::IDNSpoofChecker() {	27 IDNSpoofChecker::IDNSpoofChecker() {

55 UErrorCode status = U_ZERO_ERROR;	28 UErrorCode status = U_ZERO_ERROR;

56 checker_ = uspoof_open(&status);	29 checker_ = uspoof_open(&status);

57 if (U_FAILURE(status)) {	30 if (U_FAILURE(status)) {

58 checker_ = nullptr;	31 checker_ = nullptr;

59 return;	32 return;

60 }	33 }

61	34

(...skipping 26 matching lines...) Expand all Loading...
88 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);	61 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);

89 deviation_characters_.freeze();	62 deviation_characters_.freeze();

90	63

91 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary	64 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

92 // because additional characters pulled in with scx=Latn are not included in	65 // because additional characters pulled in with scx=Latn are not included in

93 // the allowed set.	66 // the allowed set.

94 non_ascii_latin_letters_ =	67 non_ascii_latin_letters_ =

95 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);	68 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

96 non_ascii_latin_letters_.freeze();	69 non_ascii_latin_letters_.freeze();

97	70

98 // The following two sets are parts of \|dangerous_patterns_\|.	71 // These letters are parts of \|dangerous_patterns_\|.

99 kana_letters_exceptions_ = icu::UnicodeSet(	72 kana_letters_exceptions_ = icu::UnicodeSet(

100 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),	73 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

101 status);	74 status);

102 kana_letters_exceptions_.freeze();	75 kana_letters_exceptions_.freeze();

103 combining_diacritics_exceptions_ =

104 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);

105 combining_diacritics_exceptions_.freeze();

106	76

107 // These Cyrillic letters look like Latin. A domain label entirely made of	77 // These Cyrillic letters look like Latin. A domain label entirely made of

108 // these letters is blocked as a simplified whole-script-spoofable.	78 // these letters is blocked as a simplified whole-script-spoofable.

109 cyrillic_letters_latin_alike_ =	79 cyrillic_letters_latin_alike_ =

110 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);	80 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

111 cyrillic_letters_latin_alike_.freeze();	81 cyrillic_letters_latin_alike_.freeze();

112	82

113 cyrillic_letters_ =	83 cyrillic_letters_ =

114 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);	84 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

115 cyrillic_letters_.freeze();	85 cyrillic_letters_.freeze();

116	86

117 DCHECK(U_SUCCESS(status));	87 DCHECK(U_SUCCESS(status));

118 // This set is used to determine whether or not to apply a slow

119 // transliteration to remove diacritics to a given hostname before the

120 // confusable skeleton calculation for comparison with top domain names. If

121 // it has any character outside the set, the expensive step will be skipped

122 // because it cannot match any of top domain names.

123 // The last set ([\u0300-\u0339]) is a shorthand for "[:Identifier_Status=Allowed:]

124 // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a

125 // subset of the former but it does not matter because hostnames with

126 // characters outside the latter set would be rejected in an earlier step.

127 lgc_letters_n_ascii_ = icu::UnicodeSet(

128 UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"

129 "\\u002d][\\u0300-\\u0339]]"),

130 status);

131 lgc_letters_n_ascii_.freeze();

132

133 // Used for diacritics-removal before the skeleton calculation. Add

134 // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark

135 // removal; NFC". On top of that, supplement the Unicode confusable list by

136 // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by

137 // 'k', 'l' and 'n', respectively.

138 // TODO(jshin): Revisit "ł > l; ø > o" mapping.

139 UParseError parse_error;

140 transliterator_.reset(icu::Transliterator::createFromRules(

141 UNICODE_STRING_SIMPLE("DropAcc"),

142 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"

143 " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),

144 UTRANS_FORWARD, parse_error, status));

145 DCHECK(U_SUCCESS(status))

146 << "Spoofchecker initalization failed due to an error: "

147 << u_errorName(status);

148 }	88 }

149	89

150 IDNSpoofChecker::~IDNSpoofChecker() {	90 IDNSpoofChecker::~IDNSpoofChecker() {

151 uspoof_close(checker_);	91 uspoof_close(checker_);

152 }	92 }

153	93

154 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,	94 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,

155 bool is_tld_ascii) {	95 bool is_tld_ascii) {

156 UErrorCode status = U_ZERO_ERROR;	96 UErrorCode status = U_ZERO_ERROR;

157 int32_t result =	97 int32_t result =

(...skipping 15 matching lines...) Expand all Loading...
173 // "UTS 46 section 4 Processing step 4" applies validity criteria for	113 // "UTS 46 section 4 Processing step 4" applies validity criteria for

174 // non-transitional processing (i.e. do not map deviation characters) to any	114 // non-transitional processing (i.e. do not map deviation characters) to any

175 // punycode labels regardless of whether transitional or non-transitional is	115 // punycode labels regardless of whether transitional or non-transitional is

176 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted	116 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

177 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as	117 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

178 // such. See http://crbug.com/595263 .	118 // such. See http://crbug.com/595263 .

179 if (deviation_characters_.containsSome(label_string))	119 if (deviation_characters_.containsSome(label_string))

180 return false;	120 return false;

181	121

182 // If there's no script mixing, the input is regarded as safe without any	122 // If there's no script mixing, the input is regarded as safe without any

183 // extra check unless it falls into one of three categories:	123 // extra check unless it contains Kana letter exceptions or it's made entirely

184 // - contains Kana letter exceptions	124 // of Cyrillic letters that look like Latin letters. Note that the following

185 // - the TLD is ASCII and the input is made entirely of Cyrillic letters	125 // combinations of scripts are treated as a 'logical' single script.

186 // that look like Latin letters.

187 // - it has combining diacritic marks.

188 // Note that the following combinations of scripts are treated as a 'logical'

189 // single script.

190 // - Chinese: Han, Bopomofo, Common	126 // - Chinese: Han, Bopomofo, Common

191 // - Japanese: Han, Hiragana, Katakana, Common	127 // - Japanese: Han, Hiragana, Katakana, Common

192 // - Korean: Hangul, Han, Common	128 // - Korean: Hangul, Han, Common

193 result &= USPOOF_RESTRICTION_LEVEL_MASK;	129 result &= USPOOF_RESTRICTION_LEVEL_MASK;

194 if (result == USPOOF_ASCII)	130 if (result == USPOOF_ASCII)

195 return true;	131 return true;

196 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&	132 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

197 kana_letters_exceptions_.containsNone(label_string) &&	133 kana_letters_exceptions_.containsNone(label_string)) {

198 combining_diacritics_exceptions_.containsNone(label_string)) {

199 // Check Cyrillic confusable only for ASCII TLDs.	134 // Check Cyrillic confusable only for ASCII TLDs.

200 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);	135 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);

201 }	136 }

202	137

203 // Additional checks for \|label\| with multiple scripts, one of which is Latin.	138 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

204 // Disallow non-ASCII Latin letters to mix with a non-Latin script.	139 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

205 // Note that the non-ASCII Latin check should not be applied when the entire	140 if (non_ascii_latin_letters_.containsSome(label_string))

206 // label is made of Latin. Checking with lgc_letters set here should be fine

207 // because script mixing of LGC is already rejected.

208 if (non_ascii_latin_letters_.containsSome(label_string) &&

209 !lgc_letters_n_ascii_.containsAll(label_string))

210 return false;	141 return false;

211	142

212 if (!tls_index.initialized())	143 if (!tls_index.initialized())

213 tls_index.Initialize(&OnThreadTermination);	144 tls_index.Initialize(&OnThreadTermination);

214 icu::RegexMatcher* dangerous_pattern =	145 icu::RegexMatcher* dangerous_pattern =

215 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());	146 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

216 if (!dangerous_pattern) {	147 if (!dangerous_pattern) {

217 // Disallow the katakana no, so, zo, or n, as they may be mistaken for	148 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

218 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts	149 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

219 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a	150 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

220 // non-Japanese script on either side is disallowed, legitimate cases like	151 // non-Japanese script on either side is disallowed, legitimate cases like

221 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those	152 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

222 // characters when used alone as a label is futile because those cases	153 // characters when used alone as a label is futile because those cases

223 // would not reach here.	154 // would not reach here.

224 // Also disallow what used to be blocked by mixed-script-confusable (MSC)	155 // Also disallow what used to be blocked by mixed-script-confusable (MSC)

225 // detection. ICU 58 does not detect MSC any more for a single input string.	156 // detection. ICU 58 does not detect MSC any more for a single input string.

226 // See http://bugs.icu-project.org/trac/ticket/12823 .	157 // See http://bugs.icu-project.org/trac/ticket/12823 .

227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.	158 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana	159 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

229 // Prolonged Sound) used out-of-context.	160 // Prolonged Sound) used out-of-context.

230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)	161 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

231 // unless they're preceded by a Katakana.	162 // unless they're preceded by a Katakana.

232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters	163 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

233 // (U+30D[8-A]) that look exactly like each other when they're used in a	164 // (U+30D[8-A]) that look exactly like each other when they're used in a

234 // label otherwise entirely in Katakana or Hiragana.	165 // label otherwise entirely in Katakana or Hiragana.

235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small	166 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small

236 // Letter Co) to be next to Latin.	167 // Letter Co) to be next to Latin.

237 // - Disallow Latin 'o' and 'g' next to Armenian.	168 // - Disallow Latin 'o' and 'g' next to Armenian.

238 // - Disallow mixing of Latin and Canadian Syllabary.	169 // - Disallow mixing of Latin and Canadian Syllabary.

239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC

240 // character. Other combining diacritical marks are not in the allowed

241 // character set.

242 dangerous_pattern = new icu::RegexMatcher(	170 dangerous_pattern = new icu::RegexMatcher(

243 icu::UnicodeString(	171 icu::UnicodeString(

244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"	172 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"

245 R"([\u30ce\u30f3\u30bd\u30be])"	173 R"([\u30ce\u30f3\u30bd\u30be])"

246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]\|)"	174 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]\|)"

247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc\|^\u30fc\|)"	175 R"([^\p{scx=kana}\p{scx=hira}]\u30fc\|^\u30fc\|)"

248 R"([^\p{scx=kana}][\u30fd\u30fe]\|^[\u30fd\u30fe]\|)"	176 R"([^\p{scx=kana}][\u30fd\u30fe]\|^[\u30fd\u30fe]\|)"

249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$\|)"	177 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$\|)"

250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$\|)"	178 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$\|)"

251 R"([a-z]\u30fb\|\u30fb[a-z]\|)"	179 R"([a-z]\u30fb\|\u30fb[a-z]\|)"

252 R"(^[\u0585\u0581]+[a-z]\|[a-z][\u0585\u0581]+$\|)"	180 R"(^[\u0585\u0581]+[a-z]\|[a-z][\u0585\u0581]+$\|)"

253 R"([a-z][\u0585\u0581]+[a-z]\|)"	181 R"([a-z][\u0585\u0581]+[a-z]\|)"

254 R"(^[og]+[\p{scx=armn}]\|[\p{scx=armn}][og]+$\|)"	182 R"(^[og]+[\p{scx=armn}]\|[\p{scx=armn}][og]+$\|)"

255 R"([\p{scx=armn}][og]+[\p{scx=armn}]\|)"	183 R"([\p{scx=armn}][og]+[\p{scx=armn}]\|)"

256 R"([\p{sc=cans}].[a-z]\|[a-z].[\p{sc=cans}]\|)"	184 R"([\p{sc=cans}].[a-z]\|[a-z].[\p{sc=cans}])",

257 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",

258 -1, US_INV),	185 -1, US_INV),

259 0, status);	186 0, status);

260 tls_index.Set(dangerous_pattern);	187 tls_index.Set(dangerous_pattern);

261 }	188 }

262 dangerous_pattern->reset(label_string);	189 dangerous_pattern->reset(label_string);

263 return !dangerous_pattern->find();	190 return !dangerous_pattern->find();

264 }	191 }

265	192

266 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {

267 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);

268 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);

269 // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],

270 // there is no point in getting rid of diacritics because combining marks

271 // attached to non-LGC characters are already blocked.

272 if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==

273 ustr_host.length())

274 transliterator_.get()->transliterate(ustr_host);

275

276 UErrorCode status = U_ZERO_ERROR;

277 icu::UnicodeString ustr_skeleton;

278 uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,

279 &status);

280 if (U_FAILURE(status))

281 return false;

282 std::string skeleton;

283 ustr_skeleton.toUTF8String(skeleton);

284 return LookupMatchInTopDomains(skeleton);

285 }

286

287 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(	193 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

288 const icu::UnicodeString& label) {	194 const icu::UnicodeString& label) {

289 // Collect all the Cyrillic letters in \|label_string\| and see if they're

290 // a subset of \|cyrillic_letters_latin_alike_\|.

291 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and	195 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

292 // [_-] and checking if the set contains all letters of \|label\|	196 // [_-] and checking if the set contains all letters of \|label_string\|

293 // would work in most cases, but not if a label has non-letters outside	197 // would work in most cases, but not if a label has non-letters outside

294 // ASCII.	198 // ASCII.

295 icu::UnicodeSet cyrillic_in_label;	199 icu::UnicodeSet cyrillic_in_label;

296 icu::StringCharacterIterator it(label);	200 icu::StringCharacterIterator it(label);

297 for (it.setToStart(); it.hasNext();) {	201 for (it.setToStart(); it.hasNext();) {

298 const UChar32 c = it.next32PostInc();	202 const UChar32 c = it.next32PostInc();

299 if (cyrillic_letters_.contains(c))	203 if (cyrillic_letters_.contains(c))

300 cyrillic_in_label.add(c);	204 cyrillic_in_label.add(c);

301 }	205 }

302 return !cyrillic_in_label.isEmpty() &&	206 return !cyrillic_in_label.isEmpty() &&

(...skipping 75 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
378 allowed_set.remove(0x0F8Cu);	282 allowed_set.remove(0x0F8Cu);

379 allowed_set.remove(0x0F8Du);	283 allowed_set.remove(0x0F8Du);

380 allowed_set.remove(0x0F8Eu);	284 allowed_set.remove(0x0F8Eu);

381 allowed_set.remove(0x0F8Fu);	285 allowed_set.remove(0x0F8Fu);

382 #endif	286 #endif

383	287

384 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);	288 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

385 }	289 }

386	290

387 } // namespace url_formatter	291 } // namespace url_formatter

OLD	NEW

« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »