Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(210)

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: check in alex_names_and_skeletons.gperf Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/url_formatter.h" 5 #include "components/url_formatter/url_formatter.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <utility> 8 #include <utility>
9 #include <vector>
9 10
10 #include "base/lazy_instance.h" 11 #include "base/lazy_instance.h"
11 #include "base/macros.h" 12 #include "base/macros.h"
12 #include "base/numerics/safe_conversions.h" 13 #include "base/numerics/safe_conversions.h"
13 #include "base/strings/string_piece.h" 14 #include "base/strings/string_piece.h"
15 #include "base/strings/string_split.h"
14 #include "base/strings/string_util.h" 16 #include "base/strings/string_util.h"
15 #include "base/strings/utf_offset_string_conversions.h" 17 #include "base/strings/utf_offset_string_conversions.h"
16 #include "base/strings/utf_string_conversions.h" 18 #include "base/strings/utf_string_conversions.h"
17 #include "base/threading/thread_local_storage.h" 19 #include "base/threading/thread_local_storage.h"
20 #include "net/base/lookup_string_in_fixed_set.h"
18 #include "third_party/icu/source/common/unicode/schriter.h" 21 #include "third_party/icu/source/common/unicode/schriter.h"
19 #include "third_party/icu/source/common/unicode/uidna.h" 22 #include "third_party/icu/source/common/unicode/uidna.h"
20 #include "third_party/icu/source/common/unicode/uniset.h" 23 #include "third_party/icu/source/common/unicode/uniset.h"
21 #include "third_party/icu/source/common/unicode/uscript.h" 24 #include "third_party/icu/source/common/unicode/uscript.h"
25 #include "third_party/icu/source/common/unicode/utypes.h"
22 #include "third_party/icu/source/common/unicode/uvernum.h" 26 #include "third_party/icu/source/common/unicode/uvernum.h"
23 #include "third_party/icu/source/i18n/unicode/regex.h" 27 #include "third_party/icu/source/i18n/unicode/regex.h"
28 #include "third_party/icu/source/i18n/unicode/translit.h"
24 #include "third_party/icu/source/i18n/unicode/uspoof.h" 29 #include "third_party/icu/source/i18n/unicode/uspoof.h"
25 #include "url/gurl.h" 30 #include "url/gurl.h"
26 #include "url/third_party/mozilla/url_parse.h" 31 #include "url/third_party/mozilla/url_parse.h"
27 32
28 namespace url_formatter { 33 namespace url_formatter {
29 34
30 namespace { 35 namespace {
31 36
32 base::string16 IDNToUnicodeWithAdjustments( 37 base::string16 IDNToUnicodeWithAdjustments(
33 base::StringPiece host, 38 base::StringPiece host,
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after
184 new_parsed->scheme.len = kViewSourceLength - 1; 189 new_parsed->scheme.len = kViewSourceLength - 1;
185 } 190 }
186 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed); 191 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);
187 192
188 if (prefix_end) 193 if (prefix_end)
189 *prefix_end += kViewSourceLength; 194 *prefix_end += kViewSourceLength;
190 195
191 return result; 196 return result;
192 } 197 }
193 198
199 // A helper class for IDN Spoof checking, used to ensure that no IDN input is
200 // spoofable per Chromium's standard of spoofability. For a more thorough
201 // explanation of how spoof checking works in Chromium, see
202 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .
203 class IDNSpoofChecker {
204 public:
205 IDNSpoofChecker();
206
207 // Returns true if |label| is safe to display as Unicode. When the TLD is
208 // ASCII, check if a label is entirely made of Cyrillic letters that look like
209 // Latin letters. In the event of library failure, all IDN inputs will be
210 // treated as unsafe.
211 bool Check(base::StringPiece16 label, bool is_tld_ascii);
212
213 // Returns true if |hostname| or the last few components of |hostname| look
214 // similar to one of the top N domains (N=500). Two checks are done:
ncarter (slow) 2017/04/26 18:46:29 The (N=500) comment seems likely to fall out of da
jungshik at Google 2017/04/26 19:36:21 Done.
215 // 1. Calculate the skeleton of |hostname| based on the Unicode confusable
216 // character list and look it up in the pre-calculated skeleton list of
217 // top N domains.
218 // 2. Look up the diacritic-free version of |hostname| in the list of
219 // top N domains.
ncarter (slow) 2017/04/26 18:46:29 Should this document what happens if |hostname| is
jungshik at Google 2017/04/26 19:36:21 Non-IDN hostnames will not reach here (they're not
220 bool SimilarToTopDomains(base::StringPiece16 hostname);
221
222 private:
223 void SetAllowedUnicodeSet(UErrorCode* status);
224 bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);
225 bool GetSkeleton(base::StringPiece16 hostname, std::string* skeleton);
226 bool RemoveDiacritics(base::StringPiece16 input, std::string* accent_free);
227
228 USpoofChecker* checker_;
229 icu::UnicodeSet deviation_characters_;
230 icu::UnicodeSet non_ascii_latin_letters_;
231 icu::UnicodeSet kana_letters_exceptions_;
232 icu::UnicodeSet cyrillic_letters_;
233 icu::UnicodeSet cyrillic_letters_latin_alike_;
234 icu::UnicodeSet latin_letters_n_ascii_;
235 icu::Transliterator* transliterator_;
236
237 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
238 };
239
240 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
241 LAZY_INSTANCE_INITIALIZER;
242 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
243
244 void OnThreadTermination(void* regex_matcher) {
245 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
246 }
247
194 // TODO(brettw): We may want to skip this step in the case of file URLs to 248 // TODO(brettw): We may want to skip this step in the case of file URLs to
195 // allow unicode UNC hostnames regardless of encodings. 249 // allow unicode UNC hostnames regardless of encodings.
196 base::string16 IDNToUnicodeWithAdjustments( 250 base::string16 IDNToUnicodeWithAdjustments(
197 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) { 251 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {
198 if (adjustments) 252 if (adjustments)
199 adjustments->clear(); 253 adjustments->clear();
200 // Convert the ASCII input to a base::string16 for ICU. 254 // Convert the ASCII input to a base::string16 for ICU.
201 base::string16 input16; 255 base::string16 input16;
202 input16.reserve(host.length()); 256 input16.reserve(host.length());
203 input16.insert(input16.end(), host.begin(), host.end()); 257 input16.insert(input16.end(), host.begin(), host.end());
204 258
205 bool is_tld_ascii = true; 259 bool is_tld_ascii = true;
206 size_t last_dot = host.rfind('.'); 260 size_t last_dot = host.rfind('.');
207 if (last_dot != base::StringPiece::npos && 261 if (last_dot != base::StringPiece::npos &&
208 host.substr(last_dot).starts_with(".xn--")) { 262 host.substr(last_dot).starts_with(".xn--")) {
209 is_tld_ascii = false; 263 is_tld_ascii = false;
210 } 264 }
211 265
212 // Do each component of the host separately, since we enforce script matching 266 // Do each component of the host separately, since we enforce script matching
213 // on a per-component basis. 267 // on a per-component basis.
214 base::string16 out16; 268 base::string16 out16;
269 bool has_idn_component = false;
215 for (size_t component_start = 0, component_end; 270 for (size_t component_start = 0, component_end;
216 component_start < input16.length(); 271 component_start < input16.length();
217 component_start = component_end + 1) { 272 component_start = component_end + 1) {
218 // Find the end of the component. 273 // Find the end of the component.
219 component_end = input16.find('.', component_start); 274 component_end = input16.find('.', component_start);
220 if (component_end == base::string16::npos) 275 if (component_end == base::string16::npos)
221 component_end = input16.length(); // For getting the last component. 276 component_end = input16.length(); // For getting the last component.
222 size_t component_length = component_end - component_start; 277 size_t component_length = component_end - component_start;
223 size_t new_component_start = out16.length(); 278 size_t new_component_start = out16.length();
224 bool converted_idn = false; 279 bool converted_idn = false;
225 if (component_end > component_start) { 280 if (component_end > component_start) {
226 // Add the substring that we just found. 281 // Add the substring that we just found.
227 converted_idn = 282 converted_idn =
228 IDNToUnicodeOneComponent(input16.data() + component_start, 283 IDNToUnicodeOneComponent(input16.data() + component_start,
229 component_length, is_tld_ascii, &out16); 284 component_length, is_tld_ascii, &out16);
285 has_idn_component = has_idn_component || converted_idn;
230 } 286 }
231 size_t new_component_length = out16.length() - new_component_start; 287 size_t new_component_length = out16.length() - new_component_start;
232 288
233 if (converted_idn && adjustments) { 289 if (converted_idn && adjustments) {
234 adjustments->push_back(base::OffsetAdjuster::Adjustment( 290 adjustments->push_back(base::OffsetAdjuster::Adjustment(
235 component_start, component_length, new_component_length)); 291 component_start, component_length, new_component_length));
236 } 292 }
237 293
238 // Need to add the dot we just found (if we found one). 294 // Need to add the dot we just found (if we found one).
239 if (component_end < input16.length()) 295 if (component_end < input16.length())
240 out16.push_back('.'); 296 out16.push_back('.');
241 } 297 }
298
299 if (has_idn_component &&
300 g_idn_spoof_checker.Get().SimilarToTopDomains(out16)) {
301 if (adjustments)
302 adjustments->clear();
303 return input16;
304 }
242 return out16; 305 return out16;
243 } 306 }
244 307
245 // A helper class for IDN Spoof checking, used to ensure that no IDN input is
246 // spoofable per Chromium's standard of spoofability. For a more thorough
247 // explanation of how spoof checking works in Chromium, see
248 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .
249 class IDNSpoofChecker {
250 public:
251 IDNSpoofChecker();
252
253 // Returns true if |label| is safe to display as Unicode. When the TLD is
254 // ASCII, check if a label is entirely made of Cyrillic letters that look like
255 // Latin letters. In the event of library failure, all IDN inputs will be
256 // treated as unsafe.
257 bool Check(base::StringPiece16 label, bool is_tld_ascii);
258
259 private:
260 void SetAllowedUnicodeSet(UErrorCode* status);
261 bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);
262
263 USpoofChecker* checker_;
264 icu::UnicodeSet deviation_characters_;
265 icu::UnicodeSet non_ascii_latin_letters_;
266 icu::UnicodeSet kana_letters_exceptions_;
267 icu::UnicodeSet cyrillic_letters_;
268 icu::UnicodeSet cyrillic_letters_latin_alike_;
269
270 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
271 };
272
273 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
274 LAZY_INSTANCE_INITIALIZER;
275 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
276
277 void OnThreadTermination(void* regex_matcher) {
278 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
279 }
280
281 IDNSpoofChecker::IDNSpoofChecker() { 308 IDNSpoofChecker::IDNSpoofChecker() {
282 UErrorCode status = U_ZERO_ERROR; 309 UErrorCode status = U_ZERO_ERROR;
283 checker_ = uspoof_open(&status); 310 checker_ = uspoof_open(&status);
284 if (U_FAILURE(status)) { 311 if (U_FAILURE(status)) {
285 checker_ = nullptr; 312 checker_ = nullptr;
286 return; 313 return;
287 } 314 }
288 315
289 // At this point, USpoofChecker has all the checks enabled except 316 // At this point, USpoofChecker has all the checks enabled except
290 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE, 317 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE,
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
332 // These Cyrillic letters look like Latin. A domain label entirely made of 359 // These Cyrillic letters look like Latin. A domain label entirely made of
333 // these letters is blocked as a simplified whole-script-spoofable. 360 // these letters is blocked as a simplified whole-script-spoofable.
334 cyrillic_letters_latin_alike_ = 361 cyrillic_letters_latin_alike_ =
335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); 362 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);
336 cyrillic_letters_latin_alike_.freeze(); 363 cyrillic_letters_latin_alike_.freeze();
337 364
338 cyrillic_letters_ = 365 cyrillic_letters_ =
339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); 366 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);
340 cyrillic_letters_.freeze(); 367 cyrillic_letters_.freeze();
341 368
342 DCHECK(U_SUCCESS(status)); 369 // This set is used to determine whether or not to apply a slow
370 // transliteration (to remove diacritics) to a given hostname for accent-free
371 // comparison with top domain names. If the hostname has any character
372 // outside the set, the expensive step will be skipped because the hostname
373 // cannot match any of the top domain names.
374 // The last set ([\u0300-\u0331]) is a shorthand for
375 // "[:Identifier_Status=Allowed:] & [:Script_Extensions=Inherited:] -
376 // [\\u200C\\u200D]". The latter is a subset of the former, but it does not
377 // matter: hostnames with characters outside the latter are rejected earlier.
378 latin_letters_n_ascii_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(
379 "[[:Latin:] [0-9\\u002e_\\u002d] [\\u0300-\\u0331]]"), status);
380 latin_letters_n_ascii_.freeze();
381
382 // Used for diacritics-agnostic comparison. Add "ł > l; ø > o; đ > d" that
383 // are not handled by "NFD; Nonspacing mark removal; NFC".
384 UParseError parse_error;
385 transliterator_ = icu::Transliterator::createFromRules(
386 UNICODE_STRING_SIMPLE("DropAcc"),
387 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
388 " ł > l; ø > o; đ > d;"),
389 UTRANS_FORWARD, parse_error, status);
390 DCHECK(U_SUCCESS(status))
391 << "Spoofchecker initalization failed due to an error: "
392 << u_errorName(status);
393 if (U_FAILURE(status))
394 transliterator_ = nullptr;
343 } 395 }
344 396
345 bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) { 397 bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {
346 UErrorCode status = U_ZERO_ERROR; 398 UErrorCode status = U_ZERO_ERROR;
347 int32_t result = uspoof_check(checker_, label.data(), 399 int32_t result = uspoof_check(checker_, label.data(),
348 base::checked_cast<int32_t>(label.size()), 400 base::checked_cast<int32_t>(label.size()),
349 NULL, &status); 401 NULL, &status);
350 // If uspoof_check fails (due to library failure), or if any of the checks 402 // If uspoof_check fails (due to library failure), or if any of the checks
351 // fail, treat the IDN as unsafe. 403 // fail, treat the IDN as unsafe.
352 if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS)) 404 if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS))
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
430 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" 482 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|"
431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", 483 "[\\p{scx=armn}][og]+[\\p{scx=armn}]",
432 -1, US_INV), 484 -1, US_INV),
433 0, status); 485 0, status);
434 tls_index.Set(dangerous_pattern); 486 tls_index.Set(dangerous_pattern);
435 } 487 }
436 dangerous_pattern->reset(label_string); 488 dangerous_pattern->reset(label_string);
437 return !dangerous_pattern->find(); 489 return !dangerous_pattern->find();
438 } 490 }
439 491
492 bool IDNSpoofChecker::GetSkeleton(base::StringPiece16 hostname,
493 std::string* skeleton) {
494 skeleton->clear();
495 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname.length());
496 // TODO(jshin): Consider supplementing the confusable list by replacing some
497 // characters with their confusable counterpart (e.g. U+04CF => 'l').
498 UErrorCode status = U_ZERO_ERROR;
499 icu::UnicodeString ustr_skeleton;
500 uspoof_getSkeletonUnicodeString(checker_, 0, /* not used. deprecated. */
501 ustr_host, ustr_skeleton, &status);
502 if (U_FAILURE(status))
503 return false;
504 ustr_skeleton.toUTF8String(*skeleton);
505 return true;
506 }
507
508 #include "components/url_formatter/top_domains/alexa_names_and_skeletons-inc.cc"
509 // All the domains in the above file have 3 or fewer labels.
510 const size_t kNumberOfLabelsToCheck = 3;
511
512 bool LookupStringInSet(base::StringPiece needle,
513 const unsigned char* fixed_set,
514 size_t set_len,
515 int mask) {
516 int type = net::LookupStringInFixedSet(fixed_set, set_len, needle.data(),
517 needle.length());
518 return (type != net::kDafsaNotFound) && ((type & mask) != 0);
519 }
520
521 bool LookupMatchInTopDomains(base::StringPiece hostname, int mask) {
522 // When 'hostname' is a skeleton instead of an actual hostname, it's assumed
523 // that no character other than '.' among those allowed in IDN will have
524 // '.' as its skeleton.
525 auto labels = base::SplitStringPiece(hostname, ".", base::KEEP_WHITESPACE,
526 base::SPLIT_WANT_ALL);
ncarter (slow) 2017/04/26 18:46:29 Is it possible for hostname to end in ".", or has
jungshik at Google 2017/04/26 19:36:21 That's taken care of in the loop in the caller (se
527
528 while (labels.size() > kNumberOfLabelsToCheck)
529 labels.erase(labels.begin());
530
531 while (labels.size() > 1) {
532 std::string partial_hostname = base::JoinString(labels, ".");
533 if (LookupStringInSet(partial_hostname, kDafsa, arraysize(kDafsa), mask))
534 return true;
535 labels.erase(labels.begin());
536 }
537 return false;
538 }
539
540 bool IDNSpoofChecker::RemoveDiacritics(base::StringPiece16 input,
541 std::string* accent_free) {
542 if (!transliterator_)
543 return false;
544 icu::UnicodeString ustr_input(FALSE, input.data(), input.length());
545 // If input has any characters outside Latin and [._-], there is no point in
546 // getting rid of diacritics because the input will not match any of the top
547 // domain names even after diacritics removal.
548 if (latin_letters_n_ascii_.span(ustr_input, 0, USET_SPAN_CONTAINED) !=
549 ustr_input.length())
550 return false;
551 transliterator_->transliterate(ustr_input);
552 ustr_input.toUTF8String(*accent_free);
553 return true;
554 }
555
556 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
557 std::string skeleton;
558 if (GetSkeleton(hostname, &skeleton) && LookupMatchInTopDomains(skeleton, 2))
559 return true;
560
561 std::string accent_free_name;
562 return RemoveDiacritics(hostname, &accent_free_name) &&
563 LookupMatchInTopDomains(accent_free_name, 1);
564 }
565
440 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( 566 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
441 const icu::UnicodeString& label_string) { 567 const icu::UnicodeString& label_string) {
442 // Collect all the Cyrillic letters in |label_string| and see if they're 568 // Collect all the Cyrillic letters in |label_string| and see if they're
443 // a subset of |cyrillic_letters_latin_alike_|. 569 // a subset of |cyrillic_letters_latin_alike_|.
444 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and 570 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and
445 // [_-] and checking if the set contains all letters of |label_string| 571 // [_-] and checking if the set contains all letters of |label_string|
446 // would work in most cases, but not if a label has non-letters outside 572 // would work in most cases, but not if a label has non-letters outside
447 // ASCII. 573 // ASCII.
448 icu::UnicodeSet cyrillic_in_label; 574 icu::UnicodeSet cyrillic_in_label;
449 icu::StringCharacterIterator it(label_string); 575 icu::StringCharacterIterator it(label_string);
(...skipping 400 matching lines...) Expand 10 before | Expand all | Expand 10 after
850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) 976 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)
851 ? text.substr(www.length()) : text; 977 ? text.substr(www.length()) : text;
852 } 978 }
853 979
854 base::string16 StripWWWFromHost(const GURL& url) { 980 base::string16 StripWWWFromHost(const GURL& url) {
855 DCHECK(url.is_valid()); 981 DCHECK(url.is_valid());
856 return StripWWW(base::ASCIIToUTF16(url.host_piece())); 982 return StripWWW(base::ASCIIToUTF16(url.host_piece()));
857 } 983 }
858 984
859 } // namespace url_formatter 985 } // namespace url_formatter
OLDNEW
« no previous file with comments | « components/url_formatter/top_domains/make_top_domain_gperf.cc ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698