| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/url_formatter/url_formatter.h" | 5 #include "components/url_formatter/url_formatter.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <utility> | 8 #include <utility> |
| 9 | 9 |
| 10 #include "base/lazy_instance.h" | 10 #include "base/lazy_instance.h" |
| (...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 243 | 243 |
| 244 // Returns true if |label| is safe to display as Unicode. In the event of | 244 // Returns true if |label| is safe to display as Unicode. In the event of |
| 245 // library failure, all IDN inputs will be treated as unsafe. | 245 // library failure, all IDN inputs will be treated as unsafe. |
| 246 bool Check(base::StringPiece16 label); | 246 bool Check(base::StringPiece16 label); |
| 247 | 247 |
| 248 private: | 248 private: |
| 249 void SetAllowedUnicodeSet(UErrorCode* status); | 249 void SetAllowedUnicodeSet(UErrorCode* status); |
| 250 | 250 |
| 251 USpoofChecker* checker_; | 251 USpoofChecker* checker_; |
| 252 icu::UnicodeSet deviation_characters_; | 252 icu::UnicodeSet deviation_characters_; |
| 253 icu::UnicodeSet latin_letters_; | |
| 254 icu::UnicodeSet non_ascii_latin_letters_; | 253 icu::UnicodeSet non_ascii_latin_letters_; |
| 254 icu::UnicodeSet kana_letters_exceptions_; |
| 255 | 255 |
| 256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); | 256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); |
| 257 }; | 257 }; |
| 258 | 258 |
| 259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = | 259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = |
| 260 LAZY_INSTANCE_INITIALIZER; | 260 LAZY_INSTANCE_INITIALIZER; |
| 261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; | 261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; |
| 262 | 262 |
| 263 void OnThreadTermination(void* regex_matcher) { | 263 void OnThreadTermination(void* regex_matcher) { |
| 264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); | 264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); |
| (...skipping 17 matching lines...) Expand all Loading... |
| 282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one | 282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one |
| 283 // script other than Common and Inherited can be mixed with Latin. Cyrillic | 283 // script other than Common and Inherited can be mixed with Latin. Cyrillic |
| 284 // and Greek are not allowed to mix with Latin. | 284 // and Greek are not allowed to mix with Latin. |
| 285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection | 285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection |
| 286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE); | 286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE); |
| 287 | 287 |
| 288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT. | 288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT. |
| 289 SetAllowedUnicodeSet(&status); | 289 SetAllowedUnicodeSet(&status); |
| 290 | 290 |
| 291 // Enable the return of auxiliary (non-error) information. | 291 // Enable the return of auxiliary (non-error) information. |
| 292 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of |
| 293 // ICU 58.1, WSC is a no-op in a single string check API. |
| 292 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; | 294 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; |
| 293 | |
| 294 // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when | |
| 295 // used against a single string as opposed to comparing a pair of strings. In | |
| 296 // addition, it would also flag a number of common labels including the IDN | |
| 297 // TLD for Russian. | |
| 298 // A possible alternative would be to turn on the check and block a label | |
| 299 // only under the following conditions, but it'd better be done on the | |
| 300 // server-side (e.g. SafeBrowsing): | |
| 301 // 1. The label is whole-script confusable. | |
| 302 // 2. And the skeleton of the label matches the skeleton of one of top | |
| 303 // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection | |
| 304 // for the definition of skeleton. | |
| 305 // 3. And the label is different from the matched top domain label in #2. | |
| 306 checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE; | |
| 307 | |
| 308 uspoof_setChecks(checker_, checks, &status); | 295 uspoof_setChecks(checker_, checks, &status); |
| 309 | 296 |
| 310 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 | 297 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 |
| 311 // transitional processing treats them as IDNA 2003 does; maps U+00DF and | 298 // transitional processing treats them as IDNA 2003 does; maps U+00DF and |
| 312 // U+03C2 and drops U+200[CD]. | 299 // U+03C2 and drops U+200[CD]. |
| 313 deviation_characters_ = | 300 deviation_characters_ = |
| 314 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), | 301 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), |
| 315 status); | 302 status); |
| 316 deviation_characters_.freeze(); | 303 deviation_characters_.freeze(); |
| 317 | 304 |
| 318 latin_letters_ = | |
| 319 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status); | |
| 320 latin_letters_.freeze(); | |
| 321 | |
| 322 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary | 305 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary |
| 323 // because additional characters pulled in with scx=Latn are not included in | 306 // because additional characters pulled in with scx=Latn are not included in |
| 324 // the allowed set. | 307 // the allowed set. |
| 325 non_ascii_latin_letters_ = icu::UnicodeSet( | 308 non_ascii_latin_letters_ = icu::UnicodeSet( |
| 326 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); | 309 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); |
| 327 non_ascii_latin_letters_.freeze(); | 310 non_ascii_latin_letters_.freeze(); |
| 328 | 311 |
| 312 // These letters are parts of the |dangerous_pattern| regex in Check(). |
| 313 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( |
| 314 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); |
| 315 kana_letters_exceptions_.freeze(); |
| 316 |
| 329 DCHECK(U_SUCCESS(status)); | 317 DCHECK(U_SUCCESS(status)); |
| 330 } | 318 } |
| 331 | 319 |
| 332 bool IDNSpoofChecker::Check(base::StringPiece16 label) { | 320 bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
| 333 UErrorCode status = U_ZERO_ERROR; | 321 UErrorCode status = U_ZERO_ERROR; |
| 334 int32_t result = uspoof_check(checker_, label.data(), | 322 int32_t result = uspoof_check(checker_, label.data(), |
| 335 base::checked_cast<int32_t>(label.size()), | 323 base::checked_cast<int32_t>(label.size()), |
| 336 NULL, &status); | 324 NULL, &status); |
| 337 // If uspoof_check fails (due to library failure), or if any of the checks | 325 // If uspoof_check fails (due to library failure), or if any of the checks |
| 338 // fail, treat the IDN as unsafe. | 326 // fail, treat the IDN as unsafe. |
| (...skipping 11 matching lines...) Expand all Loading... |
| 350 // "UTS 46 section 4 Processing step 4" applies validity criteria for | 338 // "UTS 46 section 4 Processing step 4" applies validity criteria for |
| 351 // non-transitional processing (i.e. do not map deviation characters) to any | 339 // non-transitional processing (i.e. do not map deviation characters) to any |
| 352 // punycode labels regardless of whether transitional or non-transitional is | 340 // punycode labels regardless of whether transitional or non-transitional is |
| 353 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted | 341 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted |
| 354 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as | 342 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as |
| 355 // such. See http://crbug.com/595263 . | 343 // such. See http://crbug.com/595263 . |
| 356 if (deviation_characters_.containsSome(label_string)) | 344 if (deviation_characters_.containsSome(label_string)) |
| 357 return false; | 345 return false; |
| 358 | 346 |
| 359 // If there's no script mixing, the input is regarded as safe without any | 347 // If there's no script mixing, the input is regarded as safe without any |
| 360 // extra check. | 348 // extra check unless it contains Kana letter exceptions. Note that |
| 361 result &= USPOOF_RESTRICTION_LEVEL_MASK; | 349 // the following combinations of scripts are treated as a 'logical' single |
| 362 if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE) | 350 // script. |
| 363 return true; | |
| 364 | |
| 365 // When check is passed at 'highly restrictive' level, |label| is | |
| 366 // made up of one of the following script sets optionally mixed with Latin. | |
| 367 // - Chinese: Han, Bopomofo, Common | 351 // - Chinese: Han, Bopomofo, Common |
| 368 // - Japanese: Han, Hiragana, Katakana, Common | 352 // - Japanese: Han, Hiragana, Katakana, Common |
| 369 // - Korean: Hangul, Han, Common | 353 // - Korean: Hangul, Han, Common |
| 370 // Treat this case as a 'logical' single script unless Latin is mixed. | 354 result &= USPOOF_RESTRICTION_LEVEL_MASK; |
| 371 if (result == USPOOF_HIGHLY_RESTRICTIVE && | 355 if (result == USPOOF_ASCII || |
| 372 latin_letters_.containsNone(label_string)) | 356 (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
| 357 kana_letters_exceptions_.containsNone(label_string))) |
| 373 return true; | 358 return true; |
| 374 | 359 |
| 375 // Additional checks for |label| with multiple scripts, one of which is Latin. | 360 // Additional checks for |label| with multiple scripts, one of which is Latin. |
| 376 // Disallow non-ASCII Latin letters to mix with a non-Latin script. | 361 // Disallow non-ASCII Latin letters to mix with a non-Latin script. |
| 377 if (non_ascii_latin_letters_.containsSome(label_string)) | 362 if (non_ascii_latin_letters_.containsSome(label_string)) |
| 378 return false; | 363 return false; |
| 379 | 364 |
| 380 if (!tls_index.initialized()) | 365 if (!tls_index.initialized()) |
| 381 tls_index.Initialize(&OnThreadTermination); | 366 tls_index.Initialize(&OnThreadTermination); |
| 382 icu::RegexMatcher* dangerous_pattern = | 367 icu::RegexMatcher* dangerous_pattern = |
| 383 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); | 368 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); |
| 384 if (!dangerous_pattern) { | 369 if (!dangerous_pattern) { |
| 385 // Disallow the katakana no, so, zo, or n, as they may be mistaken for | 370 // Disallow the katakana no, so, zo, or n, as they may be mistaken for |
| 386 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts | 371 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts |
| 387 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a | 372 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a |
| 388 // non-Japanese script on either side is disallowed, legitimate cases like | 373 // non-Japanese script on either side is disallowed, legitimate cases like |
| 389 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those | 374 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those |
| 390 // characters when used alone as a label is futile because those cases | 375 // characters when used alone as a label is futile because those cases |
| 391 // would not reach here. | 376 // would not reach here. |
| 377 // Also disallow what used to be blocked by mixed-script-confusable (MSC) |
| 378 // detection. ICU 58 does not detect MSC any more for a single input string. |
| 379 // See http://bugs.icu-project.org/trac/ticket/12823 . |
| 380 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. |
| 381 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana |
| 382 // Prolonged Sound Mark) used out-of-context. |
| 383 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters |
| 384 // (U+30D[8-A]) that look exactly like each other when they're used in a |
| 385 // label otherwise entirely in Katakana or Hiragana. |
| 386 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small |
| 387 // Letter Co) to be next to Latin. |
| 388 // - Disallow Latin 'o' and 'g' next to Armenian. |
| 392 dangerous_pattern = new icu::RegexMatcher( | 389 dangerous_pattern = new icu::RegexMatcher( |
| 393 icu::UnicodeString( | 390 icu::UnicodeString( |
| 394 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" | 391 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" |
| 395 "[\\u30ce\\u30f3\\u30bd\\u30be]" | 392 "[\\u30ce\\u30f3\\u30bd\\u30be]" |
| 396 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV), | 393 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" |
| 394 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|" |
| 395 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|" |
| 396 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" |
| 397 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" |
| 398 "[a-z]\\u30fb|\\u30fb[a-z]|" |
| 399 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" |
| 400 "[a-z][\\u0585\\u0581]+[a-z]|" |
| 401 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" |
| 402 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV), |
| 397 0, status); | 403 0, status); |
| 398 tls_index.Set(dangerous_pattern); | 404 tls_index.Set(dangerous_pattern); |
| 399 } | 405 } |
| 400 dangerous_pattern->reset(label_string); | 406 dangerous_pattern->reset(label_string); |
| 401 return !dangerous_pattern->find(); | 407 return !dangerous_pattern->find(); |
| 402 } | 408 } |
| 403 | 409 |
| 404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { | 410 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { |
| 405 if (U_FAILURE(*status)) | 411 if (U_FAILURE(*status)) |
| 406 return; | 412 return; |
| (...skipping 383 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) | 796 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) |
| 791 ? text.substr(www.length()) : text; | 797 ? text.substr(www.length()) : text; |
| 792 } | 798 } |
| 793 | 799 |
| 794 base::string16 StripWWWFromHost(const GURL& url) { | 800 base::string16 StripWWWFromHost(const GURL& url) { |
| 795 DCHECK(url.is_valid()); | 801 DCHECK(url.is_valid()); |
| 796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); | 802 return StripWWW(base::ASCIIToUTF16(url.host_piece())); |
| 797 } | 803 } |
| 798 | 804 |
| 799 } // namespace url_formatter | 805 } // namespace url_formatter |
| OLD | NEW |