components/url_formatter/url_formatter.cc - Issue 2780633002: Tweaks handling of U+30F[C-E]

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2780633002: Tweaks handling of U+30F[C-E] (Closed)

Patch Set: fix typos Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <utility>	8 #include <utility>

9	9

10 #include "base/lazy_instance.h"	10 #include "base/lazy_instance.h"

(...skipping 306 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
317 deviation_characters_.freeze();	317 deviation_characters_.freeze();

318	318

319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary	319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

320 // because additional characters pulled in with scx=Latn are not included in	320 // because additional characters pulled in with scx=Latn are not included in

321 // the allowed set.	321 // the allowed set.

322 non_ascii_latin_letters_ = icu::UnicodeSet(	322 non_ascii_latin_letters_ = icu::UnicodeSet(

323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);	323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

324 non_ascii_latin_letters_.freeze();	324 non_ascii_latin_letters_.freeze();

325	325

326 // These letters are parts of \|dangerous_patterns_\|.	326 // These letters are parts of \|dangerous_patterns_\|.

327 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(	327 kana_letters_exceptions_ = icu::UnicodeSet(

328 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);	328 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

	329 status);

329 kana_letters_exceptions_.freeze();	330 kana_letters_exceptions_.freeze();

330	331

331 // These Cyrillic letters look like Latin. A domain label entirely made of	332 // These Cyrillic letters look like Latin. A domain label entirely made of

332 // these letters is blocked as a simplified whole-script-spoofable.	333 // these letters is blocked as a simplified whole-script-spoofable.

333 cyrillic_letters_latin_alike_ =	334 cyrillic_letters_latin_alike_ =

334 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);	335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

335 cyrillic_letters_latin_alike_.freeze();	336 cyrillic_letters_latin_alike_.freeze();

336	337

337 cyrillic_letters_ =	338 cyrillic_letters_ =

338 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);	339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
399 // non-Japanese script on either side is disallowed, legitimate cases like	400 // non-Japanese script on either side is disallowed, legitimate cases like

400 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those	401 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

401 // characters when used alone as a label is futile because those cases	402 // characters when used alone as a label is futile because those cases

402 // would not reach here.	403 // would not reach here.

403 // Also disallow what used to be blocked by mixed-script-confusable (MSC)	404 // Also disallow what used to be blocked by mixed-script-confusable (MSC)

404 // detection. ICU 58 does not detect MSC any more for a single input string.	405 // detection. ICU 58 does not detect MSC any more for a single input string.

405 // See http://bugs.icu-project.org/trac/ticket/12823 .	406 // See http://bugs.icu-project.org/trac/ticket/12823 .

406 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.	407 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

407 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana	408 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

408 // Prolonged Sound) used out-of-context.	409 // Prolonged Sound) used out-of-context.

	410 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

	411 // unless they're preceded by a Katakana.

409 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters	412 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

410 // (U+30D[8-A]) that look exactly like each other when they're used in a	413 // (U+30D[8-A]) that look exactly like each other when they're used in a

411 // label otherwise entirely in Katakana or Hiragana.	414 // label otherwise entirely in Katakana or Hiragana.

412 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small	415 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small

413 // Letter Co) to be next to Latin.	416 // Letter Co) to be next to Latin.

414 // - Disallow Latin 'o' and 'g' next to Armenian.	417 // - Disallow Latin 'o' and 'g' next to Armenian.

415 dangerous_pattern = new icu::RegexMatcher(	418 dangerous_pattern = new icu::RegexMatcher(

416 icu::UnicodeString(	419 icu::UnicodeString(

417 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"	420 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

418 "[\\u30ce\\u30f3\\u30bd\\u30be]"	421 "[\\u30ce\\u30f3\\u30bd\\u30be]"

419 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"	422 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"

420 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|"	423 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|^\\u30fc\|"

421 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]\|"	424 "[^\\p{scx=kana}][\\u30fd\\u30fe]\|^[\\u30fd\\u30fe]\|"

422 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"	425 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"

423 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"	426 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"

424 "[a-z]\\u30fb\|\\u30fb[a-z]\|"	427 "[a-z]\\u30fb\|\\u30fb[a-z]\|"

425 "^[\\u0585\\u0581]+[a-z]\|[a-z][\\u0585\\u0581]+$\|"	428 "^[\\u0585\\u0581]+[a-z]\|[a-z][\\u0585\\u0581]+$\|"

426 "[a-z][\\u0585\\u0581]+[a-z]\|"	429 "[a-z][\\u0585\\u0581]+[a-z]\|"

427 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"	430 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"

428 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV),	431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]",

	432 -1, US_INV),

429 0, status);	433 0, status);

430 tls_index.Set(dangerous_pattern);	434 tls_index.Set(dangerous_pattern);

431 }	435 }

432 dangerous_pattern->reset(label_string);	436 dangerous_pattern->reset(label_string);

433 return !dangerous_pattern->find();	437 return !dangerous_pattern->find();

434 }	438 }

435	439

436 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(	440 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

437 const icu::UnicodeString& label_string) {	441 const icu::UnicodeString& label_string) {

438 // Collect all the Cyrillic letters in \|label_string\| and see if they're	442 // Collect all the Cyrillic letters in \|label_string\| and see if they're

(...skipping 407 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
846 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

847 ? text.substr(www.length()) : text;	851 ? text.substr(www.length()) : text;

848 }	852 }

849	853

850 base::string16 StripWWWFromHost(const GURL& url) {	854 base::string16 StripWWWFromHost(const GURL& url) {

851 DCHECK(url.is_valid());	855 DCHECK(url.is_valid());

852 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	856 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

853 }	857 }

854	858

855 } // namespace url_formatter	859 } // namespace url_formatter

OLD	NEW

« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »