Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(30)

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/idn_spoof_checker.h" 5 #include "components/url_formatter/idn_spoof_checker.h"
6 6
7 #include "base/numerics/safe_conversions.h" 7 #include "base/numerics/safe_conversions.h"
8 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
10 #include "base/threading/thread_local_storage.h" 10 #include "base/threading/thread_local_storage.h"
11 #include "net/base/lookup_string_in_fixed_set.h"
12 #include "third_party/icu/source/common/unicode/schriter.h" 11 #include "third_party/icu/source/common/unicode/schriter.h"
13 #include "third_party/icu/source/common/unicode/unistr.h" 12 #include "third_party/icu/source/common/unicode/unistr.h"
14 #include "third_party/icu/source/i18n/unicode/regex.h" 13 #include "third_party/icu/source/i18n/unicode/regex.h"
15 #include "third_party/icu/source/i18n/unicode/translit.h"
16 #include "third_party/icu/source/i18n/unicode/uspoof.h" 14 #include "third_party/icu/source/i18n/unicode/uspoof.h"
17 15
18 namespace url_formatter { 16 namespace url_formatter {
19 17
20 namespace { 18 namespace {
21 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; 19 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
22 20
23 void OnThreadTermination(void* regex_matcher) { 21 void OnThreadTermination(void* regex_matcher) {
24 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); 22 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
25 } 23 }
26 24
27 #include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"
28 // All the domains in the above file have 3 or fewer labels.
29 const size_t kNumberOfLabelsToCheck = 3;
30
31 bool LookupMatchInTopDomains(base::StringPiece skeleton) {
32 DCHECK_NE(skeleton.back(), '.');
33 auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,
34 base::SPLIT_WANT_ALL);
35
36 if (labels.size() > kNumberOfLabelsToCheck) {
37 labels.erase(labels.begin(),
38 labels.begin() + labels.size() - kNumberOfLabelsToCheck);
39 }
40
41 while (labels.size() > 1) {
42 std::string partial_skeleton = base::JoinString(labels, ".");
43 if (net::LookupStringInFixedSet(
44 kDafsa, arraysize(kDafsa), partial_skeleton.data(),
45 partial_skeleton.length()) != net::kDafsaNotFound)
46 return true;
47 labels.erase(labels.begin());
48 }
49 return false;
50 }
51
52 } // namespace 25 } // namespace
53 26
54 IDNSpoofChecker::IDNSpoofChecker() { 27 IDNSpoofChecker::IDNSpoofChecker() {
55 UErrorCode status = U_ZERO_ERROR; 28 UErrorCode status = U_ZERO_ERROR;
56 checker_ = uspoof_open(&status); 29 checker_ = uspoof_open(&status);
57 if (U_FAILURE(status)) { 30 if (U_FAILURE(status)) {
58 checker_ = nullptr; 31 checker_ = nullptr;
59 return; 32 return;
60 } 33 }
61 34
(...skipping 26 matching lines...) Expand all
88 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status); 61 UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), status);
89 deviation_characters_.freeze(); 62 deviation_characters_.freeze();
90 63
91 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary 64 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary
92 // because additional characters pulled in with scx=Latn are not included in 65 // because additional characters pulled in with scx=Latn are not included in
93 // the allowed set. 66 // the allowed set.
94 non_ascii_latin_letters_ = 67 non_ascii_latin_letters_ =
95 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); 68 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
96 non_ascii_latin_letters_.freeze(); 69 non_ascii_latin_letters_.freeze();
97 70
98 // The following two sets are parts of |dangerous_patterns_|. 71 // These letters are parts of |dangerous_patterns_|.
99 kana_letters_exceptions_ = icu::UnicodeSet( 72 kana_letters_exceptions_ = icu::UnicodeSet(
100 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"), 73 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),
101 status); 74 status);
102 kana_letters_exceptions_.freeze(); 75 kana_letters_exceptions_.freeze();
103 combining_diacritics_exceptions_ =
104 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);
105 combining_diacritics_exceptions_.freeze();
106 76
107 // These Cyrillic letters look like Latin. A domain label entirely made of 77 // These Cyrillic letters look like Latin. A domain label entirely made of
108 // these letters is blocked as a simplified whole-script-spoofable. 78 // these letters is blocked as a simplified whole-script-spoofable.
109 cyrillic_letters_latin_alike_ = 79 cyrillic_letters_latin_alike_ =
110 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); 80 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);
111 cyrillic_letters_latin_alike_.freeze(); 81 cyrillic_letters_latin_alike_.freeze();
112 82
113 cyrillic_letters_ = 83 cyrillic_letters_ =
114 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); 84 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);
115 cyrillic_letters_.freeze(); 85 cyrillic_letters_.freeze();
116 86
117 DCHECK(U_SUCCESS(status)); 87 DCHECK(U_SUCCESS(status));
118 // This set is used to determine whether or not to apply a slow
119 // transliteration to remove diacritics to a given hostname before the
120 // confusable skeleton calculation for comparison with top domain names. If
121 // it has any character outside the set, the expensive step will be skipped
122 // because it cannot match any of top domain names.
123 // The last ([\u0300-\u0339]) is a shorthand for "[:Identifier_Status=Allowed:]
124 // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a
125 // subset of the former but it does not matter because hostnames with
126 // characters outside the latter set would be rejected in an earlier step.
127 lgc_letters_n_ascii_ = icu::UnicodeSet(
128 UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"
129 "\\u002d][\\u0300-\\u0339]]"),
130 status);
131 lgc_letters_n_ascii_.freeze();
132
133 // Used for diacritics-removal before the skeleton calculation. Add
134 // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
135 // removal; NFC". On top of that, supplement the Unicode confusable list by
136 // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by
137 // 'k', 'l' and 'n', respectively.
138 // TODO(jshin): Revisit "ł > l; ø > o" mapping.
139 UParseError parse_error;
140 transliterator_.reset(icu::Transliterator::createFromRules(
141 UNICODE_STRING_SIMPLE("DropAcc"),
142 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
143 " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),
144 UTRANS_FORWARD, parse_error, status));
145 DCHECK(U_SUCCESS(status))
146 << "Spoofchecker initalization failed due to an error: "
147 << u_errorName(status);
148 } 88 }
149 89
150 IDNSpoofChecker::~IDNSpoofChecker() { 90 IDNSpoofChecker::~IDNSpoofChecker() {
151 uspoof_close(checker_); 91 uspoof_close(checker_);
152 } 92 }
153 93
154 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label, 94 bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
155 bool is_tld_ascii) { 95 bool is_tld_ascii) {
156 UErrorCode status = U_ZERO_ERROR; 96 UErrorCode status = U_ZERO_ERROR;
157 int32_t result = 97 int32_t result =
(...skipping 15 matching lines...) Expand all
173 // "UTS 46 section 4 Processing step 4" applies validity criteria for 113 // "UTS 46 section 4 Processing step 4" applies validity criteria for
174 // non-transitional processing (i.e. do not map deviation characters) to any 114 // non-transitional processing (i.e. do not map deviation characters) to any
175 // punycode labels regardless of whether transitional or non-transitional is 115 // punycode labels regardless of whether transitional or non-transitional is
176 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted 116 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted
177 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as 117 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as
178 // such. See http://crbug.com/595263 . 118 // such. See http://crbug.com/595263 .
179 if (deviation_characters_.containsSome(label_string)) 119 if (deviation_characters_.containsSome(label_string))
180 return false; 120 return false;
181 121
182 // If there's no script mixing, the input is regarded as safe without any 122 // If there's no script mixing, the input is regarded as safe without any
183 // extra check unless it falls into one of three categories: 123 // extra check unless it contains Kana letter exceptions or it's made entirely
184 // - contains Kana letter exceptions 124 // of Cyrillic letters that look like Latin letters. Note that the following
185 // - the TLD is ASCII and the input is made entirely of Cyrillic letters 125 // combinations of scripts are treated as a 'logical' single script.
186 // that look like Latin letters.
187 // - it has combining diacritic marks.
188 // Note that the following combinations of scripts are treated as a 'logical'
189 // single script.
190 // - Chinese: Han, Bopomofo, Common 126 // - Chinese: Han, Bopomofo, Common
191 // - Japanese: Han, Hiragana, Katakana, Common 127 // - Japanese: Han, Hiragana, Katakana, Common
192 // - Korean: Hangul, Han, Common 128 // - Korean: Hangul, Han, Common
193 result &= USPOOF_RESTRICTION_LEVEL_MASK; 129 result &= USPOOF_RESTRICTION_LEVEL_MASK;
194 if (result == USPOOF_ASCII) 130 if (result == USPOOF_ASCII)
195 return true; 131 return true;
196 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && 132 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
197 kana_letters_exceptions_.containsNone(label_string) && 133 kana_letters_exceptions_.containsNone(label_string)) {
198 combining_diacritics_exceptions_.containsNone(label_string)) {
199 // Check Cyrillic confusable only for ASCII TLDs. 134 // Check Cyrillic confusable only for ASCII TLDs.
200 return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string); 135 return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
201 } 136 }
202 137
203 // Additional checks for |label| with multiple scripts, one of which is Latin. 138 // Additional checks for |label| with multiple scripts, one of which is Latin.
204 // Disallow non-ASCII Latin letters to mix with a non-Latin script. 139 // Disallow non-ASCII Latin letters to mix with a non-Latin script.
205 // Note that the non-ASCII Latin check should not be applied when the entire 140 if (non_ascii_latin_letters_.containsSome(label_string))
206 // label is made of Latin. Checking with lgc_letters set here should be fine
207 // because script mixing of LGC is already rejected.
208 if (non_ascii_latin_letters_.containsSome(label_string) &&
209 !lgc_letters_n_ascii_.containsAll(label_string))
210 return false; 141 return false;
211 142
212 if (!tls_index.initialized()) 143 if (!tls_index.initialized())
213 tls_index.Initialize(&OnThreadTermination); 144 tls_index.Initialize(&OnThreadTermination);
214 icu::RegexMatcher* dangerous_pattern = 145 icu::RegexMatcher* dangerous_pattern =
215 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); 146 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());
216 if (!dangerous_pattern) { 147 if (!dangerous_pattern) {
217 // Disallow the katakana no, so, zo, or n, as they may be mistaken for 148 // Disallow the katakana no, so, zo, or n, as they may be mistaken for
218 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts 149 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts
219 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a 150 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a
220 // non-Japanese script on either side is disallowed, legitimate cases like 151 // non-Japanese script on either side is disallowed, legitimate cases like
221 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those 152 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those
222 // characters when used alone as a label is futile because those cases 153 // characters when used alone as a label is futile because those cases
223 // would not reach here. 154 // would not reach here.
224 // Also disallow what used to be blocked by mixed-script-confusable (MSC) 155 // Also disallow what used to be blocked by mixed-script-confusable (MSC)
225 // detection. ICU 58 does not detect MSC any more for a single input string. 156 // detection. ICU 58 does not detect MSC any more for a single input string.
226 // See http://bugs.icu-project.org/trac/ticket/12823 . 157 // See http://bugs.icu-project.org/trac/ticket/12823 .
227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. 158 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana 159 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana
229 // Prolonged Sound) used out-of-context. 160 // Prolonged Sound) used out-of-context.
230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) 162 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)
231 // unless they're preceded by a Katakana. 162 // unless they're preceded by a Katakana.
232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters 163 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
233 // (U+30D[8-A]) that look exactly like each other when they're used in a 164 // (U+30D[8-A]) that look exactly like each other when they're used in a
234 // label otherwise entirely in Katakana or Hiragana. 165 // label otherwise entirely in Katakana or Hiragana.
235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small 166 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small
236 // Letter Co) to be next to Latin. 167 // Letter Co) to be next to Latin.
237 // - Disallow Latin 'o' and 'g' next to Armenian. 168 // - Disallow Latin 'o' and 'g' next to Armenian.
238 // - Disallow mixing of Latin and Canadian Syllabary. 169 // - Disallow mixing of Latin and Canadian Syllabary.
239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC
240 // character. Other combining diacritical marks are not in the allowed
241 // character set.
242 dangerous_pattern = new icu::RegexMatcher( 170 dangerous_pattern = new icu::RegexMatcher(
243 icu::UnicodeString( 171 icu::UnicodeString(
244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])" 172 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
245 R"([\u30ce\u30f3\u30bd\u30be])" 173 R"([\u30ce\u30f3\u30bd\u30be])"
246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)" 174 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"
247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)" 175 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"
248 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)" 176 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"
249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)" 177 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"
250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)" 178 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"
251 R"([a-z]\u30fb|\u30fb[a-z]|)" 179 R"([a-z]\u30fb|\u30fb[a-z]|)"
252 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)" 180 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)"
253 R"([a-z][\u0585\u0581]+[a-z]|)" 181 R"([a-z][\u0585\u0581]+[a-z]|)"
254 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)" 182 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"
255 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)" 183 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"
256 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)" 184 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}])",
257 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",
258 -1, US_INV), 185 -1, US_INV),
259 0, status); 186 0, status);
260 tls_index.Set(dangerous_pattern); 187 tls_index.Set(dangerous_pattern);
261 } 188 }
262 dangerous_pattern->reset(label_string); 189 dangerous_pattern->reset(label_string);
263 return !dangerous_pattern->find(); 190 return !dangerous_pattern->find();
264 } 191 }
265 192
266 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
267 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
268 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
269 // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
270 // there is no point in getting rid of diacritics because combining marks
271 // attached to non-LGC characters are already blocked.
272 if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
273 ustr_host.length())
274 transliterator_.get()->transliterate(ustr_host);
275
276 UErrorCode status = U_ZERO_ERROR;
277 icu::UnicodeString ustr_skeleton;
278 uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,
279 &status);
280 if (U_FAILURE(status))
281 return false;
282 std::string skeleton;
283 ustr_skeleton.toUTF8String(skeleton);
284 return LookupMatchInTopDomains(skeleton);
285 }
286
287 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( 193 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
288 const icu::UnicodeString& label) { 194 const icu::UnicodeString& label) {
289 // Collect all the Cyrillic letters in |label_string| and see if they're
290 // a subset of |cyrillic_letters_latin_alike_|.
291 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and 195 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and
292 // [_-] and checking if the set contains all letters of |label| 196 // [_-] and checking if the set contains all letters of |label_string|
293 // would work in most cases, but not if a label has non-letters outside 197 // would work in most cases, but not if a label has non-letters outside
294 // ASCII. 198 // ASCII.
295 icu::UnicodeSet cyrillic_in_label; 199 icu::UnicodeSet cyrillic_in_label;
296 icu::StringCharacterIterator it(label); 200 icu::StringCharacterIterator it(label);
297 for (it.setToStart(); it.hasNext();) { 201 for (it.setToStart(); it.hasNext();) {
298 const UChar32 c = it.next32PostInc(); 202 const UChar32 c = it.next32PostInc();
299 if (cyrillic_letters_.contains(c)) 203 if (cyrillic_letters_.contains(c))
300 cyrillic_in_label.add(c); 204 cyrillic_in_label.add(c);
301 } 205 }
302 return !cyrillic_in_label.isEmpty() && 206 return !cyrillic_in_label.isEmpty() &&
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
378 allowed_set.remove(0x0F8Cu); 282 allowed_set.remove(0x0F8Cu);
379 allowed_set.remove(0x0F8Du); 283 allowed_set.remove(0x0F8Du);
380 allowed_set.remove(0x0F8Eu); 284 allowed_set.remove(0x0F8Eu);
381 allowed_set.remove(0x0F8Fu); 285 allowed_set.remove(0x0F8Fu);
382 #endif 286 #endif
383 287
384 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status); 288 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);
385 } 289 }
386 290
387 } // namespace url_formatter 291 } // namespace url_formatter
OLDNEW
« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698