OLD | NEW |
---|---|
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/url_formatter/url_formatter.h" | 5 #include "components/url_formatter/url_formatter.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <utility> | 8 #include <utility> |
9 | 9 |
10 #include "base/lazy_instance.h" | 10 #include "base/lazy_instance.h" |
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
243 | 243 |
244 // Returns true if |label| is safe to display as Unicode. In the event of | 244 // Returns true if |label| is safe to display as Unicode. In the event of |
245 // library failure, all IDN inputs will be treated as unsafe. | 245 // library failure, all IDN inputs will be treated as unsafe. |
246 bool Check(base::StringPiece16 label); | 246 bool Check(base::StringPiece16 label); |
247 | 247 |
248 private: | 248 private: |
249 void SetAllowedUnicodeSet(UErrorCode* status); | 249 void SetAllowedUnicodeSet(UErrorCode* status); |
250 | 250 |
251 USpoofChecker* checker_; | 251 USpoofChecker* checker_; |
252 icu::UnicodeSet deviation_characters_; | 252 icu::UnicodeSet deviation_characters_; |
253 icu::UnicodeSet latin_letters_; | |
254 icu::UnicodeSet non_ascii_latin_letters_; | 253 icu::UnicodeSet non_ascii_latin_letters_; |
254 icu::UnicodeSet kana_letters_exceptions_; | |
255 | 255 |
256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); | 256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker); |
257 }; | 257 }; |
258 | 258 |
259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = | 259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = |
260 LAZY_INSTANCE_INITIALIZER; | 260 LAZY_INSTANCE_INITIALIZER; |
261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; | 261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER; |
262 | 262 |
263 void OnThreadTermination(void* regex_matcher) { | 263 void OnThreadTermination(void* regex_matcher) { |
264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); | 264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher); |
(...skipping 17 matching lines...) Expand all Loading... | |
282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one | 282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one |
283 // script other than Common and Inherited can be mixed with Latin. Cyrillic | 283 // script other than Common and Inherited can be mixed with Latin. Cyrillic |
284 // and Greek are not allowed to mix with Latin. | 284 // and Greek are not allowed to mix with Latin. |
285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection | 285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection |
286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE); | 286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE); |
287 | 287 |
288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT. | 288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT. |
289 SetAllowedUnicodeSet(&status); | 289 SetAllowedUnicodeSet(&status); |
290 | 290 |
291 // Enable the return of auxiliary (non-error) information. | 291 // Enable the return of auxiliary (non-error) information. |
292 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of | |
293 // ICU 58.1, WSC is a no-op in a single string check API. | |
292 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; | 294 int32_t checks = uspoof_getChecks(checker_, &status) | USPOOF_AUX_INFO; |
293 | |
294 // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when | |
295 // used against a single string as opposed to comparing a pair of strings. In | |
296 // addition, it would also flag a number of common labels including the IDN | |
297 // TLD for Russian. | |
298 // A possible alternative would be to turn on the check and block a label | |
299 // only under the following conditions, but it'd better be done on the | |
300 // server-side (e.g. SafeBrowsing): | |
301 // 1. The label is whole-script confusable. | |
302 // 2. And the skeleton of the label matches the skeleton of one of top | |
303 // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection | |
304 // for the definition of skeleton. | |
305 // 3. And the label is different from the matched top domain label in #2. | |
306 checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE; | |
307 | |
308 uspoof_setChecks(checker_, checks, &status); | 295 uspoof_setChecks(checker_, checks, &status); |
309 | 296 |
310 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 | 297 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46 |
311 // transitional processing treats them as IDNA 2003 does; maps U+00DF and | 298 // transitional processing treats them as IDNA 2003 does; maps U+00DF and |
312 // U+03C2 and drops U+200[CD]. | 299 // U+03C2 and drops U+200[CD]. |
313 deviation_characters_ = | 300 deviation_characters_ = |
314 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), | 301 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"), |
315 status); | 302 status); |
316 deviation_characters_.freeze(); | 303 deviation_characters_.freeze(); |
317 | 304 |
318 latin_letters_ = | |
319 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status); | |
320 latin_letters_.freeze(); | |
321 | |
322 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary | 305 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary |
323 // because additional characters pulled in with scx=Latn are not included in | 306 // because additional characters pulled in with scx=Latn are not included in |
324 // the allowed set. | 307 // the allowed set. |
325 non_ascii_latin_letters_ = icu::UnicodeSet( | 308 non_ascii_latin_letters_ = icu::UnicodeSet( |
326 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); | 309 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); |
327 non_ascii_latin_letters_.freeze(); | 310 non_ascii_latin_letters_.freeze(); |
328 | 311 |
312 // These letters are part of |dangerous_pattern|. | |
313 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( | |
314 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); | |
315 kana_letters_exceptions_.freeze(); | |
316 | |
329 DCHECK(U_SUCCESS(status)); | 317 DCHECK(U_SUCCESS(status)); |
330 } | 318 } |
331 | 319 |
332 bool IDNSpoofChecker::Check(base::StringPiece16 label) { | 320 bool IDNSpoofChecker::Check(base::StringPiece16 label) { |
333 UErrorCode status = U_ZERO_ERROR; | 321 UErrorCode status = U_ZERO_ERROR; |
334 int32_t result = uspoof_check(checker_, label.data(), | 322 int32_t result = uspoof_check(checker_, label.data(), |
335 base::checked_cast<int32_t>(label.size()), | 323 base::checked_cast<int32_t>(label.size()), |
336 NULL, &status); | 324 NULL, &status); |
337 // If uspoof_check fails (due to library failure), or if any of the checks | 325 // If uspoof_check fails (due to library failure), or if any of the checks |
338 // fail, treat the IDN as unsafe. | 326 // fail, treat the IDN as unsafe. |
(...skipping 11 matching lines...) Expand all Loading... | |
350 // "UTS 46 section 4 Processing step 4" applies validity criteria for | 338 // "UTS 46 section 4 Processing step 4" applies validity criteria for |
351 // non-transitional processing (i.e. do not map deviation characters) to any | 339 // non-transitional processing (i.e. do not map deviation characters) to any |
352 // punycode labels regardless of whether transitional or non-transitional is | 340 // punycode labels regardless of whether transitional or non-transitional is |
353 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted | 341 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted |
354 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as | 342 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as |
355 // such. See http://crbug.com/595263 . | 343 // such. See http://crbug.com/595263 . |
356 if (deviation_characters_.containsSome(label_string)) | 344 if (deviation_characters_.containsSome(label_string)) |
357 return false; | 345 return false; |
358 | 346 |
359 // If there's no script mixing, the input is regarded as safe without any | 347 // If there's no script mixing, the input is regarded as safe without any |
360 // extra check. | 348 // extra check unless it contains Kana letter exceptions. Note that |
361 result &= USPOOF_RESTRICTION_LEVEL_MASK; | 349 // the following combinations of scripts are treated as a 'logical' single |
362 if (result == USPOOF_ASCII || result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE) | 350 // script. |
363 return true; | |
364 | |
365 // When check is passed at 'highly restrictive' level, |label| is | |
366 // made up of one of the following script sets optionally mixed with Latin. | |
367 // - Chinese: Han, Bopomofo, Common | 351 // - Chinese: Han, Bopomofo, Common |
368 // - Japanese: Han, Hiragana, Katakana, Common | 352 // - Japanese: Han, Hiragana, Katakana, Common |
369 // - Korean: Hangul, Han, Common | 353 // - Korean: Hangul, Han, Common |
370 // Treat this case as a 'logical' single script unless Latin is mixed. | 354 result &= USPOOF_RESTRICTION_LEVEL_MASK; |
371 if (result == USPOOF_HIGHLY_RESTRICTIVE && | 355 if (result == USPOOF_ASCII || |
372 latin_letters_.containsNone(label_string)) | 356 (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && |
357 kana_letters_exceptions_.containsNone(label_string))) | |
373 return true; | 358 return true; |
374 | 359 |
375 // Additional checks for |label| with multiple scripts, one of which is Latin. | 360 // Additional checks for |label| with multiple scripts, one of which is Latin. |
376 // Disallow non-ASCII Latin letters to mix with a non-Latin script. | 361 // Disallow non-ASCII Latin letters to mix with a non-Latin script. |
377 if (non_ascii_latin_letters_.containsSome(label_string)) | 362 if (non_ascii_latin_letters_.containsSome(label_string)) |
378 return false; | 363 return false; |
379 | 364 |
380 if (!tls_index.initialized()) | 365 if (!tls_index.initialized()) |
381 tls_index.Initialize(&OnThreadTermination); | 366 tls_index.Initialize(&OnThreadTermination); |
382 icu::RegexMatcher* dangerous_pattern = | 367 icu::RegexMatcher* dangerous_pattern = |
383 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); | 368 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get()); |
384 if (!dangerous_pattern) { | 369 if (!dangerous_pattern) { |
385 // Disallow the katakana no, so, zo, or n, as they may be mistaken for | 370 // Disallow the katakana no, so, zo, or n, as they may be mistaken for |
386 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts | 371 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts |
387 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a | 372 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a |
388 // non-Japanese script on either side is disallowed, legitimate cases like | 373 // non-Japanese script on either side is disallowed, legitimate cases like |
389 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those | 374 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those |
390 // characters when used alone as a label is futile because those cases | 375 // characters when used alone as a label is futile because those cases |
391 // would not reach here. | 376 // would not reach here. |
377 // Besides, disallow what used to be blocked by mixed-script-confusable(MSC) | |
Peter Kasting
2016/10/28 22:46:18
Nit "Besides," -> "Also"; space before '('
| |
378 // detection. ICU 58 does not detect MSC any more for a single input string. | |
379 // See http://bugs.icu-project.org/trac/ticket/12823 . | |
380 // - Disallow U+30FB(Katakana Middle Dot) and U+30FC(Katakana-Hiragana | |
381 // Prolonged Sound Mark) used out-of-context. | |
382 // - Disallow three Hiragana letters(U+307[8-A]) or Katakana letters | |
383 // (U+30D[8-A]) that look exactly like each other when they're used in a | |
384 // label otherwise entirely in Katakana or Hiragana. | |
385 // - Disallow U+0585 (Armenian Small Letter Oh) to be mixed with Latin. | |
392 dangerous_pattern = new icu::RegexMatcher( | 386 dangerous_pattern = new icu::RegexMatcher( |
393 icu::UnicodeString( | 387 icu::UnicodeString( |
394 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" | 388 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" |
395 "[\\u30ce\\u30f3\\u30bd\\u30be]" | 389 "[\\u30ce\\u30f3\\u30bd\\u30be]" |
396 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV), | 390 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" |
391 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|" | |
392 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|" | |
393 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" | |
394 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" | |
395 "[a-z]\\u30fb|\\u30fb[a-z]|" | |
396 "\\u0585.*[a-z]+|[a-z]+\\u0585", -1, US_INV), | |
Peter Kasting
2016/10/28 22:46:18
Note, I do not speak regex enough to really review
| |
397 0, status); | 397 0, status); |
398 tls_index.Set(dangerous_pattern); | 398 tls_index.Set(dangerous_pattern); |
399 } | 399 } |
400 dangerous_pattern->reset(label_string); | 400 dangerous_pattern->reset(label_string); |
401 return !dangerous_pattern->find(); | 401 return !dangerous_pattern->find(); |
402 } | 402 } |
403 | 403 |
404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { | 404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) { |
405 if (U_FAILURE(*status)) | 405 if (U_FAILURE(*status)) |
406 return; | 406 return; |
(...skipping 383 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) | 790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) |
791 ? text.substr(www.length()) : text; | 791 ? text.substr(www.length()) : text; |
792 } | 792 } |
793 | 793 |
794 base::string16 StripWWWFromHost(const GURL& url) { | 794 base::string16 StripWWWFromHost(const GURL& url) { |
795 DCHECK(url.is_valid()); | 795 DCHECK(url.is_valid()); |
796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); | 796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); |
797 } | 797 } |
798 | 798 |
799 } // namespace url_formatter | 799 } // namespace url_formatter |
OLD | NEW |