Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(229)

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2447513002: Update ICU to 58.1 (Closed)
Patch Set: roll icu to final 58.1 in master Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/url_formatter.h" 5 #include "components/url_formatter/url_formatter.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <utility> 8 #include <utility>
9 9
10 #include "base/lazy_instance.h" 10 #include "base/lazy_instance.h"
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after
243 243
244 // Returns true if |label| is safe to display as Unicode. In the event of 244 // Returns true if |label| is safe to display as Unicode. In the event of
245 // library failure, all IDN inputs will be treated as unsafe. 245 // library failure, all IDN inputs will be treated as unsafe.
246 bool Check(base::StringPiece16 label); 246 bool Check(base::StringPiece16 label);
247 247
248 private: 248 private:
249 void SetAllowedUnicodeSet(UErrorCode* status); 249 void SetAllowedUnicodeSet(UErrorCode* status);
250 250
251 USpoofChecker* checker_; 251 USpoofChecker* checker_;
252 icu::UnicodeSet deviation_characters_; 252 icu::UnicodeSet deviation_characters_;
253 icu::UnicodeSet latin_letters_;
254 icu::UnicodeSet non_ascii_latin_letters_; 253 icu::UnicodeSet non_ascii_latin_letters_;
254 icu::UnicodeSet kana_letters_exceptions_;
255 255
256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); 256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
257 }; 257 };
258 258
259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = 259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
260 LAZY_INSTANCE_INITIALIZER; 260 LAZY_INSTANCE_INITIALIZER;
261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; 261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
262 262
263 void OnThreadTermination(void* regex_matcher) { 263 void OnThreadTermination(void* regex_matcher) {
264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); 264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
(...skipping 17 matching lines...) Expand all
282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one 282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one
283 // script other than Common and Inherited can be mixed with Latin. Cyrillic 283 // script other than Common and Inherited can be mixed with Latin. Cyrillic
284 // and Greek are not allowed to mix with Latin. 284 // and Greek are not allowed to mix with Latin.
285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection 285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE); 286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);
287 287
288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT. 288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.
289 SetAllowedUnicodeSet(&status); 289 SetAllowedUnicodeSet(&status);
290 290
291 // Enable the return of auxiliary (non-error) information. 291 // Enable the return of auxiliary (non-error) information.
292 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of
293 // ICU 58.1, WSC is a no-op in a single string check API.
292 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; 294 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO;
293
294 // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when
295 // used against a single string as opposed to comparing a pair of strings. In
296 // addition, it would also flag a number of common labels including the IDN
297 // TLD for Russian.
298 // A possible alternative would be to turn on the check and block a label
299 // only under the following conditions, but it'd better be done on the
300 // server-side (e.g. SafeBrowsing):
301 // 1. The label is whole-script confusable.
302 // 2. And the skeleton of the label matches the skeleton of one of top
303 // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection
304 // for the definition of skeleton.
305 // 3. And the label is different from the matched top domain label in #2.
306 checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;
307
308 uspoof_setChecks(checker_, checks, &status); 295 uspoof_setChecks(checker_, checks, &status);
309 296
310 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 297 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46
311 // transitional processing treats them as IDNA 2003 does; maps U+00DF and 298 // transitional processing treats them as IDNA 2003 does; maps U+00DF and
312 // U+03C2 and drops U+200[CD]. 299 // U+03C2 and drops U+200[CD].
313 deviation_characters_ = 300 deviation_characters_ =
314 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), 301 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"),
315 status); 302 status);
316 deviation_characters_.freeze(); 303 deviation_characters_.freeze();
317 304
318 latin_letters_ =
319 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status);
320 latin_letters_.freeze();
321
322 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary 305 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary
323 // because additional characters pulled in with scx=Latn are not included in 306 // because additional characters pulled in with scx=Latn are not included in
324 // the allowed set. 307 // the allowed set.
325 non_ascii_latin_letters_ = icu::UnicodeSet( 308 non_ascii_latin_letters_ = icu::UnicodeSet(
326 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); 309 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
327 non_ascii_latin_letters_.freeze(); 310 non_ascii_latin_letters_.freeze();
328 311
312 // These letters are parts of |dangerous_patterns_|.
313 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(
314 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);
315 kana_letters_exceptions_.freeze();
316
329 DCHECK(U_SUCCESS(status)); 317 DCHECK(U_SUCCESS(status));
330 } 318 }
331 319
332 bool IDNSpoofChecker::Check(base::StringPiece16 label) { 320 bool IDNSpoofChecker::Check(base::StringPiece16 label) {
333 UErrorCode status = U_ZERO_ERROR; 321 UErrorCode status = U_ZERO_ERROR;
334 int32_t result = uspoof_check(checker_, label.data(), 322 int32_t result = uspoof_check(checker_, label.data(),
335 base::checked_cast<int32_t>(label.size()), 323 base::checked_cast<int32_t>(label.size()),
336 NULL, &status); 324 NULL, &status);
337 // If uspoof_check fails (due to library failure), or if any of the checks 325 // If uspoof_check fails (due to library failure), or if any of the checks
338 // fail, treat the IDN as unsafe. 326 // fail, treat the IDN as unsafe.
(...skipping 11 matching lines...) Expand all
350 // "UTS 46 section 4 Processing step 4" applies validity criteria for 338 // "UTS 46 section 4 Processing step 4" applies validity criteria for
351 // non-transitional processing (i.e. do not map deviation characters) to any 339 // non-transitional processing (i.e. do not map deviation characters) to any
352 // punycode labels regardless of whether transitional or non-transitional is 340 // punycode labels regardless of whether transitional or non-transitional is
353 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted 341 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted
354 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as 342 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as
355 // such. See http://crbug.com/595263 . 343 // such. See http://crbug.com/595263 .
356 if (deviation_characters_.containsSome(label_string)) 344 if (deviation_characters_.containsSome(label_string))
357 return false; 345 return false;
358 346
359 // If there's no script mixing, the input is regarded as safe without any 347 // If there's no script mixing, the input is regarded as safe without any
360 // extra check. 348 // extra check unless it contains Kana letter exceptions. Note that
361 result &= USPOOF_RESTRICTION_LEVEL_MASK; 349 // the following combinations of scripts are treated as a 'logical' single
362 if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE) 350 // script.
363 return true;
364
365 // When check is passed at 'highly restrictive' level, |label| is
366 // made up of one of the following script sets optionally mixed with Latin.
367 // - Chinese: Han, Bopomofo, Common 351 // - Chinese: Han, Bopomofo, Common
368 // - Japanese: Han, Hiragana, Katakana, Common 352 // - Japanese: Han, Hiragana, Katakana, Common
369 // - Korean: Hangul, Han, Common 353 // - Korean: Hangul, Han, Common
370 // Treat this case as a 'logical' single script unless Latin is mixed. 354 result &= USPOOF_RESTRICTION_LEVEL_MASK;
371 if (result == USPOOF_HIGHLY_RESTRICTIVE && 355 if (result == USPOOF_ASCII ||
372 latin_letters_.containsNone(label_string)) 356 (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
357 kana_letters_exceptions_.containsNone(label_string)))
373 return true; 358 return true;
374 359
375 // Additional checks for |label| with multiple scripts, one of which is Latin. 360 // Additional checks for |label| with multiple scripts, one of which is Latin.
376 // Disallow non-ASCII Latin letters to mix with a non-Latin script. 361 // Disallow non-ASCII Latin letters to mix with a non-Latin script.
377 if (non_ascii_latin_letters_.containsSome(label_string)) 362 if (non_ascii_latin_letters_.containsSome(label_string))
378 return false; 363 return false;
379 364
380 if (!tls_index.initialized()) 365 if (!tls_index.initialized())
381 tls_index.Initialize(&OnThreadTermination); 366 tls_index.Initialize(&OnThreadTermination);
382 icu::RegexMatcher* dangerous_pattern = 367 icu::RegexMatcher* dangerous_pattern =
383 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); 368 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());
384 if (!dangerous_pattern) { 369 if (!dangerous_pattern) {
385 // Disallow the katakana no, so, zo, or n, as they may be mistaken for 370 // Disallow the katakana no, so, zo, or n, as they may be mistaken for
386 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts 371 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts
387 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a 372 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a
388 // non-Japanese script on either side is disallowed, legitimate cases like 373 // non-Japanese script on either side is disallowed, legitimate cases like
389 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those 374 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those
390 // characters when used alone as a label is futile because those cases 375 // characters when used alone as a label is futile because those cases
391 // would not reach here. 376 // would not reach here.
377 // Also, disallow what used to be blocked by mixed-script-confusable (MSC)
Peter Kasting 2016/10/28 22:46:18 Nit "Besides," -> "Also"; space before '('
378 // detection. ICU 58 does not detect MSC any more for a single input string.
379 // See http://bugs.icu-project.org/trac/ticket/12823 .
380 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana
381 // Prolonged Sound) used out-of-context.
382 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
383 // (U+30D[8-A]) that look exactly like each other when they're used in a
384 // label otherwise entirely in Katakana or Hiragana.
385 // - Disallow U+0585 (Armenian Small Letter Oh) to be mixed with Latin.
392 dangerous_pattern = new icu::RegexMatcher( 386 dangerous_pattern = new icu::RegexMatcher(
393 icu::UnicodeString( 387 icu::UnicodeString(
394 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" 388 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"
395 "[\\u30ce\\u30f3\\u30bd\\u30be]" 389 "[\\u30ce\\u30f3\\u30bd\\u30be]"
396 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV), 390 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|"
391 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|"
392 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|"
393 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|"
394 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|"
395 "[a-z]\\u30fb|\\u30fb[a-z]|"
396 "\\u0585.*[a-z]+|[a-z]+\\u0585", -1, US_INV),
Peter Kasting 2016/10/28 22:46:18 Note, I do not speak regex enough to really review
397 0, status); 397 0, status);
398 tls_index.Set(dangerous_pattern); 398 tls_index.Set(dangerous_pattern);
399 } 399 }
400 dangerous_pattern->reset(label_string); 400 dangerous_pattern->reset(label_string);
401 return !dangerous_pattern->find(); 401 return !dangerous_pattern->find();
402 } 402 }
403 403
404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { 404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {
405 if (U_FAILURE(*status)) 405 if (U_FAILURE(*status))
406 return; 406 return;
(...skipping 383 matching lines...) Expand 10 before | Expand all | Expand 10 after
790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) 790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)
791 ? text.substr(www.length()) : text; 791 ? text.substr(www.length()) : text;
792 } 792 }
793 793
794 base::string16 StripWWWFromHost(const GURL& url) { 794 base::string16 StripWWWFromHost(const GURL& url) {
795 DCHECK(url.is_valid()); 795 DCHECK(url.is_valid());
796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); 796 return StripWWW(base::ASCIIToUTF16(url.host_piece()));
797 } 797 }
798 798
799 } // namespace url_formatter 799 } // namespace url_formatter
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698