Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(289)

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: use checked_cast and make win64 happy Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/idn_spoof_checker.h" 5 #include "components/url_formatter/idn_spoof_checker.h"
6 6
7 #include "base/numerics/safe_conversions.h" 7 #include "base/numerics/safe_conversions.h"
8 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
10 #include "base/threading/thread_local_storage.h" 10 #include "base/threading/thread_local_storage.h"
11 #include "net/base/lookup_string_in_fixed_set.h"
11 #include "third_party/icu/source/common/unicode/schriter.h" 12 #include "third_party/icu/source/common/unicode/schriter.h"
12 #include "third_party/icu/source/common/unicode/unistr.h" 13 #include "third_party/icu/source/common/unicode/unistr.h"
13 #include "third_party/icu/source/i18n/unicode/regex.h" 14 #include "third_party/icu/source/i18n/unicode/regex.h"
15 #include "third_party/icu/source/i18n/unicode/translit.h"
14 #include "third_party/icu/source/i18n/unicode/uspoof.h" 16 #include "third_party/icu/source/i18n/unicode/uspoof.h"
15 17
16 namespace url_formatter { 18 namespace url_formatter {
17 19
18 namespace { 20 namespace {
19 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; 21 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
20 22
21 void OnThreadTermination(void* regex_matcher) { 23 void OnThreadTermination(void* regex_matcher) {
22 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); 24 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
23 } 25 }
24 26
27 #include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"
28 // All the domains in the above file have 3 or fewer labels.
29 const size_t kNumberOfLabelsToCheck = 3;
30
31 bool LookupMatchInTopDomains(base::StringPiece skeleton) {
32 DCHECK_NE(skeleton.back(), '.');
33 auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,
34 base::SPLIT_WANT_ALL);
35
36 if (labels.size() > kNumberOfLabelsToCheck) {
37 labels.erase(labels.begin(),
38 labels.begin() + labels.size() - kNumberOfLabelsToCheck);
39 }
40
41 while (labels.size() > 1) {
42 std::string partial_skeleton = base::JoinString(labels, ".");
43 if (net::LookupStringInFixedSet(
44 kDafsa, arraysize(kDafsa), partial_skeleton.data(),
45 partial_skeleton.length()) != net::kDafsaNotFound)
46 return true;
47 labels.erase(labels.begin());
48 }
49 return false;
50 }
51
25 } // namespace 52 } // namespace
26 53
27 IDNSpoofChecker::IDNSpoofChecker() { 54 IDNSpoofChecker::IDNSpoofChecker() {
28 UErrorCode status = U_ZERO_ERROR; 55 UErrorCode status = U_ZERO_ERROR;
29 checker_ = uspoof_open(&status); 56 checker_ = uspoof_open(&status);
30 if (U_FAILURE(status)) { 57 if (U_FAILURE(status)) {
31 checker_ = nullptr; 58 checker_ = nullptr;
32 return; 59 return;
33 } 60 }
34 61
(...skipping 26 matching lines...) Expand all
61 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status); 88 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);
62 deviation_characters_.freeze(); 89 deviation_characters_.freeze();
63 90
64 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary 91 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary
65 // because additional characters pulled in with scx=Latn are not included in 92 // because additional characters pulled in with scx=Latn are not included in
66 // the allowed set. 93 // the allowed set.
67 non_ascii_latin_letters_ = 94 non_ascii_latin_letters_ =
68 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); 95 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
69 non_ascii_latin_letters_.freeze(); 96 non_ascii_latin_letters_.freeze();
70 97
71 // These letters are parts of |dangerous_patterns_|. 98 // The following two sets are parts of |dangerous_patterns_|.
72 kana_letters_exceptions_ = icu::UnicodeSet( 99 kana_letters_exceptions_ = icu::UnicodeSet(
73 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"), 100 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),
74 status); 101 status);
75 kana_letters_exceptions_.freeze(); 102 kana_letters_exceptions_.freeze();
103 combining_diacritics_exceptions_ =
104 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);
105 combining_diacritics_exceptions_.freeze();
76 106
77 // These Cyrillic letters look like Latin. A domain label entirely made of 107 // These Cyrillic letters look like Latin. A domain label entirely made of
78 // these letters is blocked as a simplified whole-script-spoofable. 108 // these letters is blocked as a simplified whole-script-spoofable.
79 cyrillic_letters_latin_alike_ = 109 cyrillic_letters_latin_alike_ =
80 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); 110 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);
81 cyrillic_letters_latin_alike_.freeze(); 111 cyrillic_letters_latin_alike_.freeze();
82 112
83 cyrillic_letters_ = 113 cyrillic_letters_ =
84 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); 114 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);
85 cyrillic_letters_.freeze(); 115 cyrillic_letters_.freeze();
86 116
87 DCHECK(U_SUCCESS(status)); 117 DCHECK(U_SUCCESS(status));
118 // This set is used to determine whether or not to apply a slow
119 // transliteration to remove diacritics to a given hostname before the
120 // confusable skeleton calculation for comparison with top domain names. If
121 // it has any character outside the set, the expensive step will be skipped
122 // because it cannot match any of top domain names.
123 // The last ([\u0300-\u0339]) is a shorthand for "[:Identifier_Status=Allowed:]
124 // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a
125 // subset of the former but it does not matter because hostnames with
126 // characters outside the latter set would be rejected in an earlier step.
127 lgc_letters_n_ascii_ = icu::UnicodeSet(
128 UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"
129 "\\u002d][\\u0300-\\u0339]]"),
130 status);
131 lgc_letters_n_ascii_.freeze();
132
133 // Used for diacritics-removal before the skeleton calculation. Add
134 // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
135 // removal; NFC". On top of that, supplement the Unicode confusable list by
136 // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by
137 // 'k', 'l' and 'n', respectively.
138 // TODO(jshin): Revisit "ł > l; ø > o" mapping.
139 UParseError parse_error;
140 transliterator_.reset(icu::Transliterator::createFromRules(
141 UNICODE_STRING_SIMPLE("DropAcc"),
142 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
143 " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),
144 UTRANS_FORWARD, parse_error, status));
145 DCHECK(U_SUCCESS(status))
146 << "Spoofchecker initalization failed due to an error: "
147 << u_errorName(status);
88 } 148 }
89 149
90 IDNSpoofChecker::~IDNSpoofChecker() { 150 IDNSpoofChecker::~IDNSpoofChecker() {
91 uspoof_close(checker_); 151 uspoof_close(checker_);
92 } 152 }
93 153
94 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label, 154 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
95 bool is_tld_ascii) { 155 bool is_tld_ascii) {
96 UErrorCode status = U_ZERO_ERROR; 156 UErrorCode status = U_ZERO_ERROR;
97 int32_t result = 157 int32_t result =
(...skipping 15 matching lines...) Expand all
113 // "UTS 46 section 4 Processing step 4" applies validity criteria for 173 // "UTS 46 section 4 Processing step 4" applies validity criteria for
114 // non-transitional processing (i.e. do not map deviation characters) to any 174 // non-transitional processing (i.e. do not map deviation characters) to any
115 // punycode labels regardless of whether transitional or non-transitional is 175 // punycode labels regardless of whether transitional or non-transitional is
116 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted 176 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted
117 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as 177 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as
118 // such. See http://crbug.com/595263 . 178 // such. See http://crbug.com/595263 .
119 if (deviation_characters_.containsSome(label_string)) 179 if (deviation_characters_.containsSome(label_string))
120 return false; 180 return false;
121 181
122 // If there's no script mixing, the input is regarded as safe without any 182 // If there's no script mixing, the input is regarded as safe without any
123 // extra check unless it contains Kana letter exceptions or it's made entirely 183 // extra check unless it falls into one of three categories:
124 // of Cyrillic letters that look like Latin letters. Note that the following 184 // - contains Kana letter exceptions
125 // combinations of scripts are treated as a 'logical' single script. 185 // - the TLD is ASCII and the input is made entirely of Cyrillic letters
186 // that look like Latin letters.
187 // - it has combining diacritic marks.
188 // Note that the following combinations of scripts are treated as a 'logical'
189 // single script.
126 // - Chinese: Han, Bopomofo, Common 190 // - Chinese: Han, Bopomofo, Common
127 // - Japanese: Han, Hiragana, Katakana, Common 191 // - Japanese: Han, Hiragana, Katakana, Common
128 // - Korean: Hangul, Han, Common 192 // - Korean: Hangul, Han, Common
129 result &= USPOOF_RESTRICTION_LEVEL_MASK; 193 result &= USPOOF_RESTRICTION_LEVEL_MASK;
130 if (result == USPOOF_ASCII) 194 if (result == USPOOF_ASCII)
131 return true; 195 return true;
132 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && 196 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
133 kana_letters_exceptions_.containsNone(label_string)) { 197 kana_letters_exceptions_.containsNone(label_string) &&
198 combining_diacritics_exceptions_.containsNone(label_string)) {
134 // Check Cyrillic confusable only for ASCII TLDs. 199 // Check Cyrillic confusable only for ASCII TLDs.
135 return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string); 200 return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
136 } 201 }
137 202
138 // Additional checks for |label| with multiple scripts, one of which is Latin. 203 // Additional checks for |label| with multiple scripts, one of which is Latin.
139 // Disallow non-ASCII Latin letters to mix with a non-Latin script. 204 // Disallow non-ASCII Latin letters to mix with a non-Latin script.
140 if (non_ascii_latin_letters_.containsSome(label_string)) 205 // Note that the non-ASCII Latin check should not be applied when the entire
206 // label is made of Latin. Checking with lgc_letters set here should be fine
207 // because script mixing of LGC is already rejected.
208 if (non_ascii_latin_letters_.containsSome(label_string) &&
209 !lgc_letters_n_ascii_.containsAll(label_string))
141 return false; 210 return false;
142 211
143 if (!tls_index.initialized()) 212 if (!tls_index.initialized())
144 tls_index.Initialize(&OnThreadTermination); 213 tls_index.Initialize(&OnThreadTermination);
145 icu::RegexMatcher* dangerous_pattern = 214 icu::RegexMatcher* dangerous_pattern =
146 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); 215 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());
147 if (!dangerous_pattern) { 216 if (!dangerous_pattern) {
148 // Disallow the katakana no, so, zo, or n, as they may be mistaken for 217 // Disallow the katakana no, so, zo, or n, as they may be mistaken for
149 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts 218 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts
150 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a 219 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a
151 // non-Japanese script on either side is disallowed, legitimate cases like 220 // non-Japanese script on either side is disallowed, legitimate cases like
152 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those 221 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those
153 // characters when used alone as a label is futile because those cases 222 // characters when used alone as a label is futile because those cases
154 // would not reach here. 223 // would not reach here.
155 // Also disallow what used to be blocked by mixed-script-confusable (MSC) 224 // Also disallow what used to be blocked by mixed-script-confusable (MSC)
156 // detection. ICU 58 does not detect MSC any more for a single input string. 225 // detection. ICU 58 does not detect MSC any more for a single input string.
157 // See http://bugs.icu-project.org/trac/ticket/12823 . 226 // See http://bugs.icu-project.org/trac/ticket/12823 .
158 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. 227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
159 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana 228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana
160 // Prolonged Sound) used out-of-context. 229 // Prolonged Sound) used out-of-context.
161 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) 230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)
162 // unless they're preceded by a Katakana. 231 // unless they're preceded by a Katakana.
163 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters 232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
164 // (U+30D[8-A]) that look exactly like each other when they're used in a 233 // (U+30D[8-A]) that look exactly like each other when they're used in a
165 // label otherwise entirely in Katakana or Hiragana. 234 // label otherwise entirely in Katakana or Hiragana.
166 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small 235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small
167 // Letter Co) to be next to Latin. 236 // Letter Co) to be next to Latin.
168 // - Disallow Latin 'o' and 'g' next to Armenian. 237 // - Disallow Latin 'o' and 'g' next to Armenian.
169 // - Disallow mixing of Latin and Canadian Syllabary. 238 // - Disallow mixing of Latin and Canadian Syllabary.
239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC
240 // character. Other combining diacritical marks are not in the allowed
241 // character set.
170 dangerous_pattern = new icu::RegexMatcher( 242 dangerous_pattern = new icu::RegexMatcher(
171 icu::UnicodeString( 243 icu::UnicodeString(
172 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])" 244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
173 R"([\u30ce\u30f3\u30bd\u30be])" 245 R"([\u30ce\u30f3\u30bd\u30be])"
174 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)" 246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"
175 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)" 247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"
176 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)" 248 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"
177 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)" 249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"
178 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)" 250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"
179 R"([a-z]\u30fb|\u30fb[a-z]|)" 251 R"([a-z]\u30fb|\u30fb[a-z]|)"
180 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)" 252 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)"
181 R"([a-z][\u0585\u0581]+[a-z]|)" 253 R"([a-z][\u0585\u0581]+[a-z]|)"
182 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)" 254 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"
183 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)" 255 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"
184 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}])", 256 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"
257 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",
185 -1, US_INV), 258 -1, US_INV),
186 0, status); 259 0, status);
187 tls_index.Set(dangerous_pattern); 260 tls_index.Set(dangerous_pattern);
188 } 261 }
189 dangerous_pattern->reset(label_string); 262 dangerous_pattern->reset(label_string);
190 return !dangerous_pattern->find(); 263 return !dangerous_pattern->find();
191 } 264 }
192 265
266 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
267 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
268 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
269 // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
270 // there is no point in getting rid of diacritics because combining marks
271 // attached to non-LGC characters are already blocked.
272 if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
273 ustr_host.length())
274 transliterator_.get()->transliterate(ustr_host);
275
276 UErrorCode status = U_ZERO_ERROR;
277 icu::UnicodeString ustr_skeleton;
278 uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,
279 &status);
280 if (U_FAILURE(status))
281 return false;
282 std::string skeleton;
283 ustr_skeleton.toUTF8String(skeleton);
284 return LookupMatchInTopDomains(skeleton);
285 }
286
193 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( 287 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
194 const icu::UnicodeString& label) { 288 const icu::UnicodeString& label) {
289 // Collect all the Cyrillic letters in |label| and see if they're
290 // a subset of |cyrillic_letters_latin_alike_|.
195 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and 291 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and
196 // [_-] and checking if the set contains all letters of |label_string| 292 // [_-] and checking if the set contains all letters of |label|
197 // would work in most cases, but not if a label has non-letters outside 293 // would work in most cases, but not if a label has non-letters outside
198 // ASCII. 294 // ASCII.
199 icu::UnicodeSet cyrillic_in_label; 295 icu::UnicodeSet cyrillic_in_label;
200 icu::StringCharacterIterator it(label); 296 icu::StringCharacterIterator it(label);
201 for (it.setToStart(); it.hasNext();) { 297 for (it.setToStart(); it.hasNext();) {
202 const UChar32 c = it.next32PostInc(); 298 const UChar32 c = it.next32PostInc();
203 if (cyrillic_letters_.contains(c)) 299 if (cyrillic_letters_.contains(c))
204 cyrillic_in_label.add(c); 300 cyrillic_in_label.add(c);
205 } 301 }
206 return !cyrillic_in_label.isEmpty() && 302 return !cyrillic_in_label.isEmpty() &&
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
287 allowed_set.remove(0x0F8Cu); 383 allowed_set.remove(0x0F8Cu);
288 allowed_set.remove(0x0F8Du); 384 allowed_set.remove(0x0F8Du);
289 allowed_set.remove(0x0F8Eu); 385 allowed_set.remove(0x0F8Eu);
290 allowed_set.remove(0x0F8Fu); 386 allowed_set.remove(0x0F8Fu);
291 #endif 387 #endif
292 388
293 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status); 389 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);
294 } 390 }
295 391
296 } // namespace url_formatter 392 } // namespace url_formatter
OLDNEW
« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698