Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(32)

Unified Diff: components/url_formatter/url_formatter.cc

Issue 2447513002: Update ICU to 58.1 (Closed)
Patch Set: fix a typo in html comment Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « base/i18n/number_formatting_unittest.cc ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/url_formatter/url_formatter.cc
diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc
index 2b82c0cc493a0ae06e500980cbb91f09d42e0287..59b072033454744a22f4e662f6ecb3ea5324bfbd 100644
--- a/components/url_formatter/url_formatter.cc
+++ b/components/url_formatter/url_formatter.cc
@@ -250,8 +250,8 @@ class IDNSpoofChecker {
USpoofChecker* checker_;
icu::UnicodeSet deviation_characters_;
- icu::UnicodeSet latin_letters_;
icu::UnicodeSet non_ascii_latin_letters_;
+ icu::UnicodeSet kana_letters_exceptions_;
DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
};
@@ -289,22 +289,9 @@ IDNSpoofChecker::IDNSpoofChecker() {
SetAllowedUnicodeSet(&status);
// Enable the return of auxiliary (non-error) information.
+ // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of
+ // ICU 58.1, WSC is a no-op in a single string check API.
int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO;
-
- // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when
- // used against a single string as opposed to comparing a pair of strings. In
- // addition, it would also flag a number of common labels including the IDN
- // TLD for Russian.
- // A possible alternative would be to turn on the check and block a label
- // only under the following conditions, but it'd better be done on the
- // server-side (e.g. SafeBrowsing):
- // 1. The label is whole-script confusable.
- // 2. And the skeleton of the label matches the skeleton of one of top
- // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection
- // for the definition of skeleton.
- // 3. And the label is different from the matched top domain label in #2.
- checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;
-
uspoof_setChecks(checker_, checks, &status);
// Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46
@@ -315,10 +302,6 @@ IDNSpoofChecker::IDNSpoofChecker() {
status);
deviation_characters_.freeze();
- latin_letters_ =
- icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status);
- latin_letters_.freeze();
-
// Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary
// because additional characters pulled in with scx=Latn are not included in
// the allowed set.
@@ -326,6 +309,11 @@ IDNSpoofChecker::IDNSpoofChecker() {
UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
non_ascii_latin_letters_.freeze();
+ // These letters are parts of |dangerous_patterns_|.
+ kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);
+ kana_letters_exceptions_.freeze();
+
DCHECK(U_SUCCESS(status));
}
@@ -357,19 +345,16 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {
return false;
// If there's no script mixing, the input is regarded as safe without any
- // extra check.
- result &= USPOOF_RESTRICTION_LEVEL_MASK;
- if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE)
- return true;
-
- // When check is passed at 'highly restrictive' level, |label| is
- // made up of one of the following script sets optionally mixed with Latin.
+ // extra check unless it contains Kana letter exceptions. Note that
+ // the following combinations of scripts are treated as a 'logical' single
+ // script.
// - Chinese: Han, Bopomofo, Common
// - Japanese: Han, Hiragana, Katakana, Common
// - Korean: Hangul, Han, Common
- // Treat this case as a 'logical' single script unless Latin is mixed.
- if (result == USPOOF_HIGHLY_RESTRICTIVE &&
- latin_letters_.containsNone(label_string))
+ result &= USPOOF_RESTRICTION_LEVEL_MASK;
+ if (result == USPOOF_ASCII ||
+ (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
+ kana_letters_exceptions_.containsNone(label_string)))
return true;
// Additional checks for |label| with multiple scripts, one of which is Latin.
@@ -389,11 +374,32 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {
// '{vitamin in Katakana}b6' are blocked. Note that trying to block those
// characters when used alone as a label is futile because those cases
// would not reach here.
+ // Also disallow what used to be blocked by mixed-script-confusable (MSC)
+ // detection. ICU 58 does not detect MSC any more for a single input string.
+ // See http://bugs.icu-project.org/trac/ticket/12823 .
+ // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
+ // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana
+ // Prolonged Sound Mark) used out-of-context.
+ // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
+ // (U+30D[8-A]) that look exactly like each other when they're used in a
+ // label otherwise entirely in Katakana or Hiragana.
+ // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small
+ // Letter Co) to be next to Latin.
+ // - Disallow Latin 'o' and 'g' next to Armenian.
dangerous_pattern = new icu::RegexMatcher(
icu::UnicodeString(
"[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"
"[\\u30ce\\u30f3\\u30bd\\u30be]"
- "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV),
+ "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|"
+ "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|"
+ "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|"
+ "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|"
+ "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|"
+ "[a-z]\\u30fb|\\u30fb[a-z]|"
+ "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|"
+ "[a-z][\\u0585\\u0581]+[a-z]|"
+ "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|"
+ "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV),
0, status);
tls_index.Set(dangerous_pattern);
}
« no previous file with comments | « base/i18n/number_formatting_unittest.cc ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698