| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/url_formatter/url_formatter.h" | 5 #include "components/url_formatter/url_formatter.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <utility> | 8 #include <utility> |
| 9 | 9 |
| 10 #include "base/lazy_instance.h" | 10 #include "base/lazy_instance.h" |
| (...skipping 306 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 317 deviation_characters_.freeze(); | 317 deviation_characters_.freeze(); |
| 318 | 318 |
| 319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary | 319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary |
| 320 // because additional characters pulled in with scx=Latn are not included in | 320 // because additional characters pulled in with scx=Latn are not included in |
| 321 // the allowed set. | 321 // the allowed set. |
| 322 non_ascii_latin_letters_ = icu::UnicodeSet( | 322 non_ascii_latin_letters_ = icu::UnicodeSet( |
| 323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); | 323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); |
| 324 non_ascii_latin_letters_.freeze(); | 324 non_ascii_latin_letters_.freeze(); |
| 325 | 325 |
| 326 // These letters are parts of |dangerous_patterns_|. | 326 // These letters are parts of |dangerous_patterns_|. |
| 327 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( | 327 kana_letters_exceptions_ = icu::UnicodeSet( |
| 328 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); | 328 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"), |
| 329 status); |
| 329 kana_letters_exceptions_.freeze(); | 330 kana_letters_exceptions_.freeze(); |
| 330 | 331 |
| 331 // These Cyrillic letters look like Latin. A domain label entirely made of | 332 // These Cyrillic letters look like Latin. A domain label entirely made of |
| 332 // these letters is blocked as a simplified whole-script-spoofable. | 333 // these letters is blocked as a simplified whole-script-spoofable. |
| 333 cyrillic_letters_latin_alike_ = | 334 cyrillic_letters_latin_alike_ = |
| 334 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); | 335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); |
| 335 cyrillic_letters_latin_alike_.freeze(); | 336 cyrillic_letters_latin_alike_.freeze(); |
| 336 | 337 |
| 337 cyrillic_letters_ = | 338 cyrillic_letters_ = |
| 338 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); | 339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 399 // non-Japanese script on either side is disallowed, legitimate cases like | 400 // non-Japanese script on either side is disallowed, legitimate cases like |
| 400 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those | 401 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those |
| 401 // characters when used alone as a label is futile because those cases | 402 // characters when used alone as a label is futile because those cases |
| 402 // would not reach here. | 403 // would not reach here. |
| 403 // Also disallow what used to be blocked by mixed-script-confusable (MSC) | 404 // Also disallow what used to be blocked by mixed-script-confusable (MSC) |
| 404 // detection. ICU 58 does not detect MSC any more for a single input string. | 405 // detection. ICU 58 does not detect MSC any more for a single input string. |
| 405 // See http://bugs.icu-project.org/trac/ticket/12823 . | 406 // See http://bugs.icu-project.org/trac/ticket/12823 . |
| 406 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. | 407 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. |
| 407 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana | 408 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana |
| 408 // Prolonged Sound) used out-of-context. | 409 // Prolonged Sound) used out-of-context. |
| 410 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) |
| 411 // unless they're preceded by a Katakana. |
| 409 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters | 412 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters |
| 410 // (U+30D[8-A]) that look exactly like each other when they're used in a | 413 // (U+30D[8-A]) that look exactly like each other when they're used in a |
| 411 // label otherwise entirely in Katakana or Hiragana. | 414 // label otherwise entirely in Katakana or Hiragana. |
| 412 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small | 415 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small |
| 413 // Letter Co) to be next to Latin. | 416 // Letter Co) to be next to Latin. |
| 414 // - Disallow Latin 'o' and 'g' next to Armenian. | 417 // - Disallow Latin 'o' and 'g' next to Armenian. |
| 415 dangerous_pattern = new icu::RegexMatcher( | 418 dangerous_pattern = new icu::RegexMatcher( |
| 416 icu::UnicodeString( | 419 icu::UnicodeString( |
| 417 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" | 420 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" |
| 418 "[\\u30ce\\u30f3\\u30bd\\u30be]" | 421 "[\\u30ce\\u30f3\\u30bd\\u30be]" |
| 419 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" | 422 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" |
| 420 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|" | 423 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|^\\u30fc|" |
| 421 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|" | 424 "[^\\p{scx=kana}][\\u30fd\\u30fe]|^[\\u30fd\\u30fe]|" |
| 422 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" | 425 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" |
| 423 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" | 426 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" |
| 424 "[a-z]\\u30fb|\\u30fb[a-z]|" | 427 "[a-z]\\u30fb|\\u30fb[a-z]|" |
| 425 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" | 428 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" |
| 426 "[a-z][\\u0585\\u0581]+[a-z]|" | 429 "[a-z][\\u0585\\u0581]+[a-z]|" |
| 427 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" | 430 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" |
| 428 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV), | 431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", |
| 432 -1, US_INV), |
| 429 0, status); | 433 0, status); |
| 430 tls_index.Set(dangerous_pattern); | 434 tls_index.Set(dangerous_pattern); |
| 431 } | 435 } |
| 432 dangerous_pattern->reset(label_string); | 436 dangerous_pattern->reset(label_string); |
| 433 return !dangerous_pattern->find(); | 437 return !dangerous_pattern->find(); |
| 434 } | 438 } |
| 435 | 439 |
| 436 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( | 440 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( |
| 437 const icu::UnicodeString& label_string) { | 441 const icu::UnicodeString& label_string) { |
| 438 // Collect all the Cyrillic letters in |label_string| and see if they're | 442 // Collect all the Cyrillic letters in |label_string| and see if they're |
| (...skipping 407 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 846 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) | 850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) |
| 847 ? text.substr(www.length()) : text; | 851 ? text.substr(www.length()) : text; |
| 848 } | 852 } |
| 849 | 853 |
| 850 base::string16 StripWWWFromHost(const GURL& url) { | 854 base::string16 StripWWWFromHost(const GURL& url) { |
| 851 DCHECK(url.is_valid()); | 855 DCHECK(url.is_valid()); |
| 852 return StripWWW(base::ASCIIToUTF16(url.host_piece())); | 856 return StripWWW(base::ASCIIToUTF16(url.host_piece())); |
| 853 } | 857 } |
| 854 | 858 |
| 855 } // namespace url_formatter | 859 } // namespace url_formatter |
| OLD | NEW |