Index: components/url_formatter/url_formatter.cc |
diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc |
index 2b82c0cc493a0ae06e500980cbb91f09d42e0287..59b072033454744a22f4e662f6ecb3ea5324bfbd 100644 |
--- a/components/url_formatter/url_formatter.cc |
+++ b/components/url_formatter/url_formatter.cc |
@@ -250,8 +250,8 @@ class IDNSpoofChecker { |
USpoofChecker* checker_; |
icu::UnicodeSet deviation_characters_; |
- icu::UnicodeSet latin_letters_; |
icu::UnicodeSet non_ascii_latin_letters_; |
+ icu::UnicodeSet kana_letters_exceptions_; |
DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); |
}; |
@@ -289,22 +289,9 @@ IDNSpoofChecker::IDNSpoofChecker() { |
SetAllowedUnicodeSet(&status); |
// Enable the return of auxillary (non-error) information. |
+ // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of |
+ // ICU 58.1, WSC is a no-op in a single string check API. |
int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; |
- |
- // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when |
- // used against a single string as opposed to comparing a pair of strings. In |
- // addition, it would also flag a number of common labels including the IDN |
- // TLD for Russian. |
- // A possible alternative would be to turn on the check and block a label |
- // only under the following conditions, but it'd better be done on the |
- // server-side (e.g. SafeBrowsing): |
- // 1. The label is whole-script confusable. |
- // 2. And the skeleton of the label matches the skeleton of one of top |
- // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection |
- // for the definition of skeleton. |
- // 3. And the label is different from the matched top domain label in #2. |
- checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE; |
- |
uspoof_setChecks(checker_, checks, &status); |
// Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 |
@@ -315,10 +302,6 @@ IDNSpoofChecker::IDNSpoofChecker() { |
status); |
deviation_characters_.freeze(); |
- latin_letters_ = |
- icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status); |
- latin_letters_.freeze(); |
- |
// Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary |
// because additional characters pulled in with scx=Latn are not included in |
// the allowed set. |
@@ -326,6 +309,11 @@ IDNSpoofChecker::IDNSpoofChecker() { |
UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); |
non_ascii_latin_letters_.freeze(); |
+ // These letters are parts of |dangerous_patterns_|. |
+ kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( |
+ "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); |
+ kana_letters_exceptions_.freeze(); |
+ |
DCHECK(U_SUCCESS(status)); |
} |
@@ -357,19 +345,16 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
return false; |
// If there's no script mixing, the input is regarded as safe without any |
- // extra check. |
- result &= USPOOF_RESTRICTION_LEVEL_MASK; |
- if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE) |
- return true; |
- |
- // When check is passed at 'highly restrictive' level, |label| is |
- // made up of one of the following script sets optionally mixed with Latin. |
+ // extra check unless it contains Kana letter exceptions. Note that |
+ // the following combinations of scripts are treated as a 'logical' single |
+ // script. |
// - Chinese: Han, Bopomofo, Common |
// - Japanese: Han, Hiragana, Katakana, Common |
// - Korean: Hangul, Han, Common |
- // Treat this case as a 'logical' single script unless Latin is mixed. |
- if (result == USPOOF_HIGHLY_RESTRICTIVE && |
- latin_letters_.containsNone(label_string)) |
+ result &= USPOOF_RESTRICTION_LEVEL_MASK; |
+ if (result == USPOOF_ASCII || |
+ (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
+ kana_letters_exceptions_.containsNone(label_string))) |
return true; |
// Additional checks for |label| with multiple scripts, one of which is Latin. |
@@ -389,11 +374,32 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
// '{vitamin in Katakana}b6' are blocked. Note that trying to block those |
// characters when used alone as a label is futile because those cases |
// would not reach here. |
+ // Also disallow what used to be blocked by mixed-script-confusable (MSC) |
+ // detection. ICU 58 does not detect MSC any more for a single input string. |
+ // See http://bugs.icu-project.org/trac/ticket/12823 . |
+ // TODO(jshin): adjust the pattern once the above ICU bug is fixed. |
+ // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana |
+ // Prolonged Sound) used out-of-context. |
+ // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters |
+ // (U+30D[8-A]) that look exactly like each other when they're used in a |
+ // label otherwise entirely in Katakna or Hiragana. |
+ // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small |
+ // Letter Co) to be next to Latin. |
+ // - Disallow Latin 'o' and 'g' next to Armenian. |
dangerous_pattern = new icu::RegexMatcher( |
icu::UnicodeString( |
"[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" |
"[\\u30ce\\u30f3\\u30bd\\u30be]" |
- "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV), |
+ "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" |
+ "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|" |
+ "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|" |
+ "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" |
+ "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" |
+ "[a-z]\\u30fb|\\u30fb[a-z]|" |
+ "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" |
+ "[a-z][\\u0585\\u0581]+[a-z]|" |
+ "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" |
+ "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV), |
0, status); |
tls_index.Set(dangerous_pattern); |
} |