Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(334)

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2780633002: Tweaks handling of U+30F[C-E] (Closed)
Patch Set: fix typos Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/url_formatter.h" 5 #include "components/url_formatter/url_formatter.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <utility> 8 #include <utility>
9 9
10 #include "base/lazy_instance.h" 10 #include "base/lazy_instance.h"
(...skipping 306 matching lines...) Expand 10 before | Expand all | Expand 10 after
317 deviation_characters_.freeze(); 317 deviation_characters_.freeze();
318 318
319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary 319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary
320 // because additional characters pulled in with scx=Latn are not included in 320 // because additional characters pulled in with scx=Latn are not included in
321 // the allowed set. 321 // the allowed set.
322 non_ascii_latin_letters_ = icu::UnicodeSet( 322 non_ascii_latin_letters_ = icu::UnicodeSet(
323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status); 323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
324 non_ascii_latin_letters_.freeze(); 324 non_ascii_latin_letters_.freeze();
325 325
326 // These letters are parts of |dangerous_patterns_|. 326 // These letters are parts of |dangerous_patterns_|.
327 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE( 327 kana_letters_exceptions_ = icu::UnicodeSet(
328 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status); 328 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),
329 status);
329 kana_letters_exceptions_.freeze(); 330 kana_letters_exceptions_.freeze();
330 331
331 // These Cyrillic letters look like Latin. A domain label entirely made of 332 // These Cyrillic letters look like Latin. A domain label entirely made of
332 // these letters is blocked as a simplified whole-script-spoofable. 333 // these letters is blocked as a simplified whole-script-spoofable.
333 cyrillic_letters_latin_alike_ = 334 cyrillic_letters_latin_alike_ =
334 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status); 335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);
335 cyrillic_letters_latin_alike_.freeze(); 336 cyrillic_letters_latin_alike_.freeze();
336 337
337 cyrillic_letters_ = 338 cyrillic_letters_ =
338 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status); 339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
399 // non-Japanese script on either side is disallowed, legitimate cases like 400 // non-Japanese script on either side is disallowed, legitimate cases like
400 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those 401 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those
401 // characters when used alone as a label is futile because those cases 402 // characters when used alone as a label is futile because those cases
402 // would not reach here. 403 // would not reach here.
403 // Also disallow what used to be blocked by mixed-script-confusable (MSC) 404 // Also disallow what used to be blocked by mixed-script-confusable (MSC)
404 // detection. ICU 58 does not detect MSC any more for a single input string. 405 // detection. ICU 58 does not detect MSC any more for a single input string.
405 // See http://bugs.icu-project.org/trac/ticket/12823 . 406 // See http://bugs.icu-project.org/trac/ticket/12823 .
406 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. 407 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
407 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana 408 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Katakana-Hiragana
408 // Prolonged Sound Mark) used out-of-context. 409 // Prolonged Sound Mark) used out-of-context.
410 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)
411 // unless they're preceded by a Katakana.
409 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters 412 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
410 // (U+30D[8-A]) that look exactly like each other when they're used in a 413 // (U+30D[8-A]) that look exactly like each other when they're used in a
411 // label otherwise entirely in Katakana or Hiragana. 414 // label otherwise entirely in Katakana or Hiragana.
412 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small 415 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small
413 // Letter Co) to be next to Latin. 416 // Letter Co) to be next to Latin.
414 // - Disallow Latin 'o' and 'g' next to Armenian. 417 // - Disallow Latin 'o' and 'g' next to Armenian.
415 dangerous_pattern = new icu::RegexMatcher( 418 dangerous_pattern = new icu::RegexMatcher(
416 icu::UnicodeString( 419 icu::UnicodeString(
417 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]" 420 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"
418 "[\\u30ce\\u30f3\\u30bd\\u30be]" 421 "[\\u30ce\\u30f3\\u30bd\\u30be]"
419 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|" 422 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]|"
420 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|" 423 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc|^\\u30fc|"
421 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]|" 424 "[^\\p{scx=kana}][\\u30fd\\u30fe]|^[\\u30fd\\u30fe]|"
422 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|" 425 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$|"
423 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|" 426 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$|"
424 "[a-z]\\u30fb|\\u30fb[a-z]|" 427 "[a-z]\\u30fb|\\u30fb[a-z]|"
425 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|" 428 "^[\\u0585\\u0581]+[a-z]|[a-z][\\u0585\\u0581]+$|"
426 "[a-z][\\u0585\\u0581]+[a-z]|" 429 "[a-z][\\u0585\\u0581]+[a-z]|"
427 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|" 430 "^[og]+[\\p{scx=armn}]|[\\p{scx=armn}][og]+$|"
428 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV), 431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]",
432 -1, US_INV),
429 0, status); 433 0, status);
430 tls_index.Set(dangerous_pattern); 434 tls_index.Set(dangerous_pattern);
431 } 435 }
432 dangerous_pattern->reset(label_string); 436 dangerous_pattern->reset(label_string);
433 return !dangerous_pattern->find(); 437 return !dangerous_pattern->find();
434 } 438 }
435 439
436 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( 440 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
437 const icu::UnicodeString& label_string) { 441 const icu::UnicodeString& label_string) {
438 // Collect all the Cyrillic letters in |label_string| and see if they're 442 // Collect all the Cyrillic letters in |label_string| and see if they're
(...skipping 407 matching lines...) Expand 10 before | Expand all | Expand 10 after
846 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) 850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)
847 ? text.substr(www.length()) : text; 851 ? text.substr(www.length()) : text;
848 } 852 }
849 853
850 base::string16 StripWWWFromHost(const GURL& url) { 854 base::string16 StripWWWFromHost(const GURL& url) {
851 DCHECK(url.is_valid()); 855 DCHECK(url.is_valid());
852 return StripWWW(base::ASCIIToUTF16(url.host_piece())); 856 return StripWWW(base::ASCIIToUTF16(url.host_piece()));
853 } 857 }
854 858
855 } // namespace url_formatter 859 } // namespace url_formatter
OLDNEW
« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698