Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(377)

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2447513002: Update ICU to 58.1 (Closed)
Patch Set: fix a typo in html comment Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/url_formatter.h" 5 #include "components/url_formatter/url_formatter.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <utility> 8 #include <utility>
9 9
10 #include "base/lazy_instance.h" 10 #include "base/lazy_instance.h"
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after
243 243
244 // Returns true if |label| is safe to display as Unicode. In the event of 244 // Returns true if |label| is safe to display as Unicode. In the event of
245 // library failure, all IDN inputs will be treated as unsafe. 245 // library failure, all IDN inputs will be treated as unsafe.
246 bool Check(base::StringPiece16 label); 246 bool Check(base::StringPiece16 label);
247 247
248 private: 248 private:
249 void SetAllowedUnicodeSet(UErrorCode* status); 249 void SetAllowedUnicodeSet(UErrorCode* status);
250 250
251 USpoofChecker* checker_; 251 USpoofChecker* checker_;
252 icu::UnicodeSet deviation_characters_; 252 icu::UnicodeSet deviation_characters_;
253 icu::UnicodeSet latin_letters_;
254 icu::UnicodeSet non_ascii_latin_letters_; 253 icu::UnicodeSet non_ascii_latin_letters_;
254 icu::UnicodeSet kana_letters_exceptions_;
255 255
256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); 256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);
257 }; 257 };
258 258
259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = 259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
260 LAZY_INSTANCE_INITIALIZER; 260 LAZY_INSTANCE_INITIALIZER;
261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; 261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;
262 262
263 void OnThreadTermination(void* regex_matcher) { 263 void OnThreadTermination(void* regex_matcher) {
264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); 264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
(...skipping 17 matching lines...) Expand all
282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one 282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one
283 // script other than Common and Inherited can be mixed with Latin. Cyrillic 283 // script other than Common and Inherited can be mixed with Latin. Cyrillic
284 // and Greek are not allowed to mix with Latin. 284 // and Greek are not allowed to mix with Latin.
285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection 285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection
286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE); 286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);
287 287
288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT. 288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.
289 SetAllowedUnicodeSet(&status); 289 SetAllowedUnicodeSet(&status);
290 290
291 // Enable the return of auxiliary (non-error) information. 291 // Enable the return of auxiliary (non-error) information.
292 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of
293 // ICU 58.1, WSC is a no-op in a single string check API.
292 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; 294 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO;
293
294 // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when
295 // used against a single string as opposed to comparing a pair of strings. In
296 // addition, it would also flag a number of common labels including the IDN
297 // TLD for Russian.
298 // A possible alternative would be to turn on the check and block a label
299 // only under the following conditions, but it'd better be done on the
300 // server-side (e.g. SafeBrowsing):
301 // 1. The label is whole-script confusable.
302 // 2. And the skeleton of the label matches the skeleton of one of top
303 // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection
304 // for the definition of skeleton.
305 // 3. And the label is different from the matched top domain label in #2.
306 checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;
307
308 uspoof_setChecks(checker_, checks, &status); 295 uspoof_setChecks(checker_, checks, &status);
309 296
310 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 297 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46
311 // transitional processing treats them as IDNA 2003 does; maps U+00DF and 298 // transitional processing treats them as IDNA 2003 does; maps U+00DF and
312 // U+03C2 and drops U+200[CD]. 299 // U+03C2 and drops U+200[CD].
313 deviation_characters_ = 300 deviation_characters_ =
314 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), 301 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"),
315 status); 302 status);
316 deviation_characters_.freeze(); 303 deviation_characters_.freeze();
317 304
318 latin_letters_ =
319 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status);
320 latin_letters_.freeze();
321
322 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary 305 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary
323 // because additional characters pulled in with scx=Latn are not included in 306 // because additional characters pulled in with scx=Latn are not included in
324 // the allowed set. 307 // the allowed set.
325 non_ascii_latin_letters_ = icu::UnicodeSet( 308 non_ascii_latin_letters_ = icu::UnicodeSet(
326 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); 309 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
327 non_ascii_latin_letters_.freeze(); 310 non_ascii_latin_letters_.freeze();
328 311
312 // These letters are parts of |dangerous_patterns_|.
313 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(
314 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);
315 kana_letters_exceptions_.freeze();
316
329 DCHECK(U_SUCCESS(status)); 317 DCHECK(U_SUCCESS(status));
330 } 318 }
331 319
332 bool IDNSpoofChecker::Check(base::StringPiece16 label) { 320 bool IDNSpoofChecker::Check(base::StringPiece16 label) {
333 UErrorCode status = U_ZERO_ERROR; 321 UErrorCode status = U_ZERO_ERROR;
334 int32_t result = uspoof_check(checker_, label.data(), 322 int32_t result = uspoof_check(checker_, label.data(),
335 base::checked_cast<int32_t>(label.size()), 323 base::checked_cast<int32_t>(label.size()),
336 NULL, &status); 324 NULL, &status);
337 // If uspoof_check fails (due to library failure), or if any of the checks 325 // If uspoof_check fails (due to library failure), or if any of the checks
338 // fail, treat the IDN as unsafe. 326 // fail, treat the IDN as unsafe.
(...skipping 11 matching lines...) Expand all
350 // "UTS 46 section 4 Processing step 4" applies validity criteria for 338 // "UTS 46 section 4 Processing step 4" applies validity criteria for
351 // non-transitional processing (i.e. do not map deviation characters) to any 339 // non-transitional processing (i.e. do not map deviation characters) to any
352 // punycode labels regardless of whether transitional or non-transitional is 340 // punycode labels regardless of whether transitional or non-transitional is
353 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted 341 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted
354 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as 342 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as
355 // such. See http://crbug.com/595263 . 343 // such. See http://crbug.com/595263 .
356 if (deviation_characters_.containsSome(label_string)) 344 if (deviation_characters_.containsSome(label_string))
357 return false; 345 return false;
358 346
359 // If there's no script mixing, the input is regarded as safe without any 347 // If there's no script mixing, the input is regarded as safe without any
360 // extra check. 348 // extra check unless it contains Kana letter exceptions. Note that
361 result &= USPOOF_RESTRICTION_LEVEL_MASK; 349 // the following combinations of scripts are treated as a 'logical' single
362 if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE) 350 // script.
363 return true;
364
365 // When check is passed at 'highly restrictive' level, |label| is
366 // made up of one of the following script sets optionally mixed with Latin.
367 // - Chinese: Han, Bopomofo, Common 351 // - Chinese: Han, Bopomofo, Common
368 // - Japanese: Han, Hiragana, Katakana, Common 352 // - Japanese: Han, Hiragana, Katakana, Common
369 // - Korean: Hangul, Han, Common 353 // - Korean: Hangul, Han, Common
370 // Treat this case as a 'logical' single script unless Latin is mixed. 354 result &= USPOOF_RESTRICTION_LEVEL_MASK;
371 if (result == USPOOF_HIGHLY_RESTRICTIVE && 355 if (result == USPOOF_ASCII ||
372 latin_letters_.containsNone(label_string)) 356 (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
357 kana_letters_exceptions_.containsNone(label_string)))
373 return true; 358 return true;
374 359
375 // Additional checks for |label| with multiple scripts, one of which is Latin. 360 // Additional checks for |label| with multiple scripts, one of which is Latin.
376 // Disallow non-ASCII Latin letters to mix with a non-Latin script. 361 // Disallow non-ASCII Latin letters to mix with a non-Latin script.
377 if (non_ascii_latin_letters_.containsSome(label_string)) 362 if (non_ascii_latin_letters_.containsSome(label_string))
378 return false; 363 return false;
379 364
380 if (!tls_index.initialized()) 365 if (!tls_index.initialized())
381 tls_index.Initialize(&OnThreadTermination); 366 tls_index.Initialize(&OnThreadTermination);
382 icu::RegexMatcher* dangerous_pattern = 367 icu::RegexMatcher* dangerous_pattern =
383 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); 368 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());
384 if (!dangerous_pattern) { 369 if (!dangerous_pattern) {
385 // Disallow the katakana no, so, zo, or n, as they may be mistaken for 370 // Disallow the katakana no, so, zo, or n, as they may be mistaken for
386 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts 371 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts
387 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a 372 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a
388 // non-Japanese script on either side is disallowed, legitimate cases like 373 // non-Japanese script on either side is disallowed, legitimate cases like
389 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those 374 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those
390 // characters when used alone as a label is futile because those cases 375 // characters when used alone as a label is futile because those cases
391 // would not reach here. 376 // would not reach here.
377 // Also disallow what used to be blocked by mixed-script-confusable (MSC)
378 // detection. ICU 58 does not detect MSC any more for a single input string.
379 // See http://bugs.icu-project.org/trac/ticket/12823 .
380 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
381 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana
382 // Prolonged Sound Mark) used out-of-context.
383 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
384 // (U+30D[8-A]) that look exactly like each other when they're used in a
385 // label otherwise entirely in Katakana or Hiragana.
386 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small
387 // Letter Co) to be next to Latin.
388 // - Disallow Latin 'o' and 'g' next to Armenian.
392 dangerous_pattern = new icu::RegexMatcher( 389 dangerous_pattern = new icu::RegexMatcher(
393 icu::UnicodeString( 390 icu::UnicodeString(
394 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" 391 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"
395 "[\\u30ce\\u30f3\\u30bd\\u30be]" 392 "[\\u30ce\\u30f3\\u30bd\\u30be]"
396 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV), 393 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|"
394 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|"
395 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|"
396 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|"
397 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|"
398 "[a-z]\\u30fb|\\u30fb[a-z]|"
399 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|"
400 "[a-z][\\u0585\\u0581]+[a-z]|"
401 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|"
402 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV),
397 0, status); 403 0, status);
398 tls_index.Set(dangerous_pattern); 404 tls_index.Set(dangerous_pattern);
399 } 405 }
400 dangerous_pattern->reset(label_string); 406 dangerous_pattern->reset(label_string);
401 return !dangerous_pattern->find(); 407 return !dangerous_pattern->find();
402 } 408 }
403 409
404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { 410 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {
405 if (U_FAILURE(*status)) 411 if (U_FAILURE(*status))
406 return; 412 return;
(...skipping 383 matching lines...) Expand 10 before | Expand all | Expand 10 after
790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) 796 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)
791 ? text.substr(www.length()) : text; 797 ? text.substr(www.length()) : text;
792 } 798 }
793 799
794 base::string16 StripWWWFromHost(const GURL& url) { 800 base::string16 StripWWWFromHost(const GURL& url) {
795 DCHECK(url.is_valid()); 801 DCHECK(url.is_valid());
796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); 802 return StripWWW(base::ASCIIToUTF16(url.host_piece()));
797 } 803 }
798 804
799 } // namespace url_formatter 805 } // namespace url_formatter
OLDNEW
« no previous file with comments | « base/i18n/number_formatting_unittest.cc ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698