components/url_formatter/url_formatter.cc - Issue 2447513002: Update ICU to 58.1

Unified Diff: components/url_formatter/url_formatter.cc

Issue 2447513002: Update ICU to 58.1 (Closed)

Patch Set: roll icu to final 58.1 in master Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « base/i18n/number_formatting_unittest.cc ('k') | third_party/WebKit/LayoutTests/fast/text/midword-break-before-surrogate-pair.html » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: components/url_formatter/url_formatter.cc

diff --git a/components/url_formatter/url_formatter.cc b/components/url_formatter/url_formatter.cc

index 2b82c0cc493a0ae06e500980cbb91f09d42e0287..3dc925cb19048e5833e9f3d4323792d180ab67e2 100644

--- a/components/url_formatter/url_formatter.cc

+++ b/components/url_formatter/url_formatter.cc

@@ -250,8 +250,8 @@ class IDNSpoofChecker {

USpoofChecker* checker_;

icu::UnicodeSet deviation_characters_;

- icu::UnicodeSet latin_letters_;

icu::UnicodeSet non_ascii_latin_letters_;

+ icu::UnicodeSet kana_letters_exceptions_;

DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

};

@@ -289,22 +289,9 @@ IDNSpoofChecker::IDNSpoofChecker() {

SetAllowedUnicodeSet(&status);

// Enable the return of auxillary (non-error) information.

+ // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of

+ // ICU 58.1, WSC is a no-op in a single string check API.

int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO;

- // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when

- // used against a single string as opposed to comparing a pair of strings. In

- // addition, it would also flag a number of common labels including the IDN

- // TLD for Russian.

- // A possible alternative would be to turn on the check and block a label

- // only under the following conditions, but it'd better be done on the

- // server-side (e.g. SafeBrowsing):

- // 1. The label is whole-script confusable.

- // 2. And the skeleton of the label matches the skeleton of one of top

- // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection

- // for the definition of skeleton.

- // 3. And the label is different from the matched top domain label in #2.

- checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;

uspoof_setChecks(checker_, checks, &status);

// Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46

@@ -315,10 +302,6 @@ IDNSpoofChecker::IDNSpoofChecker() {

status);

deviation_characters_.freeze();

- latin_letters_ =

- icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status);

- latin_letters_.freeze();

// Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

// because additional characters pulled in with scx=Latn are not included in

// the allowed set.

@@ -326,6 +309,11 @@ IDNSpoofChecker::IDNSpoofChecker() {

UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

non_ascii_latin_letters_.freeze();

+ // These letters are parts of |dangerous_patterns_|.

+ kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(

+ "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);

+ kana_letters_exceptions_.freeze();

DCHECK(U_SUCCESS(status));

}

@@ -357,19 +345,16 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {

return false;

// If there's no script mixing, the input is regarded as safe without any

- // extra check.

- result &= USPOOF_RESTRICTION_LEVEL_MASK;

- if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE)

- return true;

- // When check is passed at 'highly restrictive' level, |label| is

- // made up of one of the following script sets optionally mixed with Latin.

+ // extra check unless it contains Kana letter exceptions. Note that

+ // the following combinations of scripts are treated as a 'logical' single

+ // script.

// - Chinese: Han, Bopomofo, Common

// - Japanese: Han, Hiragana, Katakana, Common

// - Korean: Hangul, Han, Common

- // Treat this case as a 'logical' single script unless Latin is mixed.

- if (result == USPOOF_HIGHLY_RESTRICTIVE &&

- latin_letters_.containsNone(label_string))

+ result &= USPOOF_RESTRICTION_LEVEL_MASK;

+ if (result == USPOOF_ASCII ||

+ (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

+ kana_letters_exceptions_.containsNone(label_string)))

return true;

// Additional checks for |label| with multiple scripts, one of which is Latin.

@@ -389,11 +374,26 @@ bool IDNSpoofChecker::Check(base::StringPiece16 label) {

// '{vitamin in Katakana}b6' are blocked. Note that trying to block those

// characters when used alone as a label is futile because those cases

// would not reach here.

+ // Besides, disallow what used to be blocked by mixed-script-confusable(MSC)

Peter Kasting 2016/10/28 22:46:18 Nit "Besides," -> "Also"; space before '('

+ // detection. ICU 58 does not detect MSC any more for a single input string.

+ // See http://bugs.icu-project.org/trac/ticket/12823 .

+ // - Disallow U+30FB(Katakana Middle Dot) and U+30FC(Hiragana-Katakana

+ // Prolonged Sound) used out-of-context.

+ // - Disallow three Hiragana letters(U+307[8-A]) or Katakana letters

+ // (U+30D[8-A]) that look exactly like each other when they're used in a

+ // label otherwise entirely in Katakna or Hiragana.

+ // - Disallow U+0585 (Armenian Small Letter Oh) to be mixed with Latin.

dangerous_pattern = new icu::RegexMatcher(

icu::UnicodeString(

"[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

"[\\u30ce\\u30f3\\u30bd\\u30be]"

- "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV),

+ "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|"

+ "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|"

+ "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|"

+ "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|"

+ "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|"

+ "[a-z]\\u30fb|\\u30fb[a-z]|"

+ "\\u0585.*[a-z]+|[a-z]+\\u0585", -1, US_INV),

Peter Kasting 2016/10/28 22:46:18 Note, I do not speak regex enough to really review

0, status);

tls_index.Set(dangerous_pattern);

}