OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/url_formatter/url_formatter.h" | 5 #include "components/url_formatter/url_formatter.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <utility> | 8 #include <utility> |
9 | 9 |
10 #include "base/lazy_instance.h" | 10 #include "base/lazy_instance.h" |
(...skipping 306 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
317 deviation_characters_.freeze(); | 317 deviation_characters_.freeze(); |
318 | 318 |
319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary | 319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary |
320 // because additional characters pulled in with scx=Latn are not included in | 320 // because additional characters pulled in with scx=Latn are not included in |
321 // the allowed set. | 321 // the allowed set. |
322 non_ascii_latin_letters_ = icu::UnicodeSet( | 322 non_ascii_latin_letters_ = icu::UnicodeSet( |
323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); | 323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); |
324 non_ascii_latin_letters_.freeze(); | 324 non_ascii_latin_letters_.freeze(); |
325 | 325 |
326 // These letters are parts of |dangerous_patterns_|. | 326 // These letters are parts of |dangerous_patterns_|. |
327 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( | 327 kana_letters_exceptions_ = icu::UnicodeSet( |
328 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); | 328 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"), |
| 329 status); |
329 kana_letters_exceptions_.freeze(); | 330 kana_letters_exceptions_.freeze(); |
330 | 331 |
331 // These Cyrillic letters look like Latin. A domain label entirely made of | 332 // These Cyrillic letters look like Latin. A domain label entirely made of |
332 // these letters is blocked as a simplified whole-script-spoofable. | 333 // these letters is blocked as a simplified whole-script-spoofable. |
333 cyrillic_letters_latin_alike_ = | 334 cyrillic_letters_latin_alike_ = |
334 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); | 335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); |
335 cyrillic_letters_latin_alike_.freeze(); | 336 cyrillic_letters_latin_alike_.freeze(); |
336 | 337 |
337 cyrillic_letters_ = | 338 cyrillic_letters_ = |
338 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); | 339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
399 // non-Japanese script on either side is disallowed, legitimate cases like | 400 // non-Japanese script on either side is disallowed, legitimate cases like |
400 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those | 401 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those |
401 // characters when used alone as a label is futile because those cases | 402 // characters when used alone as a label is futile because those cases |
402 // would not reach here. | 403 // would not reach here. |
403 // Also disallow what used to be blocked by mixed-script-confusable (MSC) | 404 // Also disallow what used to be blocked by mixed-script-confusable (MSC) |
404 // detection. ICU 58 does not detect MSC any more for a single input string. | 405 // detection. ICU 58 does not detect MSC any more for a single input string. |
405 // See http://bugs.icu-project.org/trac/ticket/12823 . | 406 // See http://bugs.icu-project.org/trac/ticket/12823 . |
406 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. | 407 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. |
407 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana | 408 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana |
408 // Prolonged Sound Mark) used out-of-context. | 409 // Prolonged Sound Mark) used out-of-context. |
| 410 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) |
| 411 // unless they're preceded by a Katakana. |
409 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters | 412 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters |
410 // (U+30D[8-A]) that look exactly like each other when they're used in a | 413 // (U+30D[8-A]) that look exactly like each other when they're used in a |
411 // label otherwise entirely in Katakana or Hiragana. | 414 // label otherwise entirely in Katakana or Hiragana. |
412 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small | 415 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small |
413 // Letter Co) to be next to Latin. | 416 // Letter Co) to be next to Latin. |
414 // - Disallow Latin 'o' and 'g' next to Armenian. | 417 // - Disallow Latin 'o' and 'g' next to Armenian. |
415 dangerous_pattern = new icu::RegexMatcher( | 418 dangerous_pattern = new icu::RegexMatcher( |
416 icu::UnicodeString( | 419 icu::UnicodeString( |
417 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" | 420 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" |
418 "[\\u30ce\\u30f3\\u30bd\\u30be]" | 421 "[\\u30ce\\u30f3\\u30bd\\u30be]" |
419 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" | 422 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" |
420 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|" | 423 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|^\\u30fc|" |
421 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|" | 424 "[^\\p{scx=kana}][\\u30fd\\u30fe]|^[\\u30fd\\u30fe]|" |
422 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" | 425 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" |
423 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" | 426 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" |
424 "[a-z]\\u30fb|\\u30fb[a-z]|" | 427 "[a-z]\\u30fb|\\u30fb[a-z]|" |
425 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" | 428 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" |
426 "[a-z][\\u0585\\u0581]+[a-z]|" | 429 "[a-z][\\u0585\\u0581]+[a-z]|" |
427 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" | 430 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" |
428 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV), | 431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", |
| 432 -1, US_INV), |
429 0, status); | 433 0, status); |
430 tls_index.Set(dangerous_pattern); | 434 tls_index.Set(dangerous_pattern); |
431 } | 435 } |
432 dangerous_pattern->reset(label_string); | 436 dangerous_pattern->reset(label_string); |
433 return !dangerous_pattern->find(); | 437 return !dangerous_pattern->find(); |
434 } | 438 } |
435 | 439 |
436 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( | 440 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( |
437 const icu::UnicodeString& label_string) { | 441 const icu::UnicodeString& label_string) { |
438 // Collect all the Cyrillic letters in |label_string| and see if they're | 442 // Collect all the Cyrillic letters in |label_string| and see if they're |
(...skipping 407 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
846 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) | 850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) |
847 ? text.substr(www.length()) : text; | 851 ? text.substr(www.length()) : text; |
848 } | 852 } |
849 | 853 |
850 base::string16 StripWWWFromHost(const GURL& url) { | 854 base::string16 StripWWWFromHost(const GURL& url) { |
851 DCHECK(url.is_valid()); | 855 DCHECK(url.is_valid()); |
852 return StripWWW(base::ASCIIToUTF16(url.host_piece())); | 856 return StripWWW(base::ASCIIToUTF16(url.host_piece())); |
853 } | 857 } |
854 | 858 |
855 } // namespace url_formatter | 859 } // namespace url_formatter |
OLD | NEW |