components/url_formatter/url_formatter.cc - Issue 2877973003: Pull out IDN_Spoof_Checker to separate cc/h files.

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2877973003: Pull out IDN_Spoof_Checker to separate cc/h files. (Closed)

Patch Set: fix a typo in regex Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <utility>	8 #include <utility>

	9 #include <vector>

9	10

10 #include "base/lazy_instance.h"	11 #include "base/lazy_instance.h"

11 #include "base/macros.h"	12 #include "base/macros.h"

12 #include "base/numerics/safe_conversions.h"	13 #include "base/numerics/safe_conversions.h"

13 #include "base/strings/string_piece.h"	14 #include "base/strings/string_piece.h"

14 #include "base/strings/string_util.h"	15 #include "base/strings/string_util.h"

15 #include "base/strings/utf_offset_string_conversions.h"	16 #include "base/strings/utf_offset_string_conversions.h"

16 #include "base/strings/utf_string_conversions.h"	17 #include "base/strings/utf_string_conversions.h"

17 #include "base/threading/thread_local_storage.h"	18 #include "base/threading/thread_local_storage.h"

18 #include "third_party/icu/source/common/unicode/schriter.h"	19 #include "components/url_formatter/idn_spoof_checker.h"

19 #include "third_party/icu/source/common/unicode/uidna.h"	20 #include "third_party/icu/source/common/unicode/uidna.h"

20 #include "third_party/icu/source/common/unicode/uniset.h"	21 #include "third_party/icu/source/common/unicode/utypes.h"

21 #include "third_party/icu/source/common/unicode/uscript.h"

22 #include "third_party/icu/source/common/unicode/uvernum.h"

23 #include "third_party/icu/source/i18n/unicode/regex.h"

24 #include "third_party/icu/source/i18n/unicode/uspoof.h"

25 #include "url/gurl.h"	22 #include "url/gurl.h"

26 #include "url/third_party/mozilla/url_parse.h"	23 #include "url/third_party/mozilla/url_parse.h"

27	24

28 namespace url_formatter {	25 namespace url_formatter {

29	26

30 namespace {	27 namespace {

31	28

32 base::string16 IDNToUnicodeWithAdjustments(	29 base::string16 IDNToUnicodeWithAdjustments(

33 base::StringPiece host,	30 base::StringPiece host,

34 base::OffsetAdjuster::Adjustments* adjustments);	31 base::OffsetAdjuster::Adjustments* adjustments);

(...skipping 149 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
184 new_parsed->scheme.len = kViewSourceLength - 1;	181 new_parsed->scheme.len = kViewSourceLength - 1;

185 }	182 }

186 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);	183 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);

187	184

188 if (prefix_end)	185 if (prefix_end)

189 *prefix_end += kViewSourceLength;	186 *prefix_end += kViewSourceLength;

190	187

191 return result;	188 return result;

192 }	189 }

193	190

	191 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

	192 LAZY_INSTANCE_INITIALIZER;

	193

194 // TODO(brettw): We may want to skip this step in the case of file URLs to	194 // TODO(brettw): We may want to skip this step in the case of file URLs to

195 // allow unicode UNC hostnames regardless of encodings.	195 // allow unicode UNC hostnames regardless of encodings.

196 base::string16 IDNToUnicodeWithAdjustments(	196 base::string16 IDNToUnicodeWithAdjustments(

197 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {	197 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {

198 if (adjustments)	198 if (adjustments)

199 adjustments->clear();	199 adjustments->clear();

200 // Convert the ASCII input to a base::string16 for ICU.	200 // Convert the ASCII input to a base::string16 for ICU.

201 base::string16 input16;	201 base::string16 input16;

202 input16.reserve(host.length());	202 input16.reserve(host.length());

203 input16.insert(input16.end(), host.begin(), host.end());	203 input16.insert(input16.end(), host.begin(), host.end());

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
235 component_start, component_length, new_component_length));	235 component_start, component_length, new_component_length));

236 }	236 }

237	237

238 // Need to add the dot we just found (if we found one).	238 // Need to add the dot we just found (if we found one).

239 if (component_end < input16.length())	239 if (component_end < input16.length())

240 out16.push_back('.');	240 out16.push_back('.');

241 }	241 }

242 return out16;	242 return out16;

243 }	243 }

244	244

245 // A helper class for IDN Spoof checking, used to ensure that no IDN input is

246 // spoofable per Chromium's standard of spoofability. For a more thorough

247 // explanation of how spoof checking works in Chromium, see

248 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .

249 class IDNSpoofChecker {

250 public:

251 IDNSpoofChecker();

252

253 // Returns true if \|label\| is safe to display as Unicode. When the TLD is

254 // ASCII, check if a label is entirely made of Cyrillic letters that look like

255 // Latin letters. In the event of library failure, all IDN inputs will be

256 // treated as unsafe.

257 bool Check(base::StringPiece16 label, bool is_tld_ascii);

258

259 private:

260 void SetAllowedUnicodeSet(UErrorCode* status);

261 bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);

262

263 USpoofChecker* checker_;

264 icu::UnicodeSet deviation_characters_;

265 icu::UnicodeSet non_ascii_latin_letters_;

266 icu::UnicodeSet kana_letters_exceptions_;

267 icu::UnicodeSet cyrillic_letters_;

268 icu::UnicodeSet cyrillic_letters_latin_alike_;

269

270 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

271 };

272

273 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

274 LAZY_INSTANCE_INITIALIZER;

275 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

276

277 void OnThreadTermination(void* regex_matcher) {

278 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

279 }

280

281 IDNSpoofChecker::IDNSpoofChecker() {

282 UErrorCode status = U_ZERO_ERROR;

283 checker_ = uspoof_open(&status);

284 if (U_FAILURE(status)) {

285 checker_ = nullptr;

286 return;

287 }

288

289 // At this point, USpoofChecker has all the checks enabled except

290 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE,

291 // MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, MIXED_NUMBERS, ANY_CASE})

292 // This default configuration is adjusted below as necessary.

293

294 // Set the restriction level to moderate. It allows mixing Latin with another

295 // script (+ COMMON and INHERITED). Except for Chinese(Han + Bopomofo),

296 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one

297 // script other than Common and Inherited can be mixed with Latin. Cyrillic

298 // and Greek are not allowed to mix with Latin.

299 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection

300 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);

301

302 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.

303 SetAllowedUnicodeSet(&status);

304

305 // Enable the return of auxiliary (non-error) information.

306 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of

307 // ICU 58.1, WSC is a no-op in a single string check API.

308 int32_t checks = uspoof_getChecks(checker_, &status) \| USPOOF_AUX_INFO;

309 uspoof_setChecks(checker_, checks, &status);

310

311 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46

312 // transitional processing treats them as IDNA 2003 does; maps U+00DF and

313 // U+03C2 and drops U+200[CD].

314 deviation_characters_ =

315 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"),

316 status);

317 deviation_characters_.freeze();

318

319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

320 // because additional characters pulled in with scx=Latn are not included in

321 // the allowed set.

322 non_ascii_latin_letters_ = icu::UnicodeSet(

323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

324 non_ascii_latin_letters_.freeze();

325

326 // These letters are parts of \|dangerous_patterns_\|.

327 kana_letters_exceptions_ = icu::UnicodeSet(

328 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

329 status);

330 kana_letters_exceptions_.freeze();

331

332 // These Cyrillic letters look like Latin. A domain label entirely made of

333 // these letters is blocked as a simplified whole-script-spoofable.

334 cyrillic_letters_latin_alike_ =

335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

336 cyrillic_letters_latin_alike_.freeze();

337

338 cyrillic_letters_ =

339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

340 cyrillic_letters_.freeze();

341

342 DCHECK(U_SUCCESS(status));

343 }

344

345 bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {

346 UErrorCode status = U_ZERO_ERROR;

347 int32_t result = uspoof_check(checker_, label.data(),

348 base::checked_cast<int32_t>(label.size()),

349 NULL, &status);

350 // If uspoof_check fails (due to library failure), or if any of the checks

351 // fail, treat the IDN as unsafe.

352 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))

353 return false;

354

355 icu::UnicodeString label_string(FALSE, label.data(),

356 base::checked_cast<int32_t>(label.size()));

357

358 // A punycode label with 'xn--' prefix is not subject to the URL

359 // canonicalization and is stored as it is in GURL. If it encodes a deviation

360 // character (UTS 46; e.g. U+00DF/sharp-s), it should be still shown in

361 // punycode instead of Unicode. Without this check, xn--fu-hia for

362 // 'fu<sharp-s>' would be converted to 'fu<sharp-s>' for display because

363 // "UTS 46 section 4 Processing step 4" applies validity criteria for

364 // non-transitional processing (i.e. do not map deviation characters) to any

365 // punycode labels regardless of whether transitional or non-transitional is

366 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

367 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

368 // such. See http://crbug.com/595263 .

369 if (deviation_characters_.containsSome(label_string))

370 return false;

371

372 // If there's no script mixing, the input is regarded as safe without any

373 // extra check unless it contains Kana letter exceptions or it's made entirely

374 // of Cyrillic letters that look like Latin letters. Note that the following

375 // combinations of scripts are treated as a 'logical' single script.

376 // - Chinese: Han, Bopomofo, Common

377 // - Japanese: Han, Hiragana, Katakana, Common

378 // - Korean: Hangul, Han, Common

379 result &= USPOOF_RESTRICTION_LEVEL_MASK;

380 if (result == USPOOF_ASCII) return true;

381 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

382 kana_letters_exceptions_.containsNone(label_string)) {

383 // Check Cyrillic confusable only for ASCII TLDs.

384 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);

385 }

386

387 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

388 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

389 if (non_ascii_latin_letters_.containsSome(label_string))

390 return false;

391

392 if (!tls_index.initialized())

393 tls_index.Initialize(&OnThreadTermination);

394 icu::RegexMatcher* dangerous_pattern =

395 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

396 if (!dangerous_pattern) {

397 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

398 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

399 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

400 // non-Japanese script on either side is disallowed, legitimate cases like

401 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

402 // characters when used alone as a label is futile because those cases

403 // would not reach here.

404 // Also disallow what used to be blocked by mixed-script-confusable (MSC)

405 // detection. ICU 58 does not detect MSC any more for a single input string.

406 // See http://bugs.icu-project.org/trac/ticket/12823 .

407 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

408 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

409 // Prolonged Sound) used out-of-context.

410 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

411 // unless they're preceded by a Katakana.

412 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

413 // (U+30D[8-A]) that look exactly like each other when they're used in a

414 // label otherwise entirely in Katakana or Hiragana.

415 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small

416 // Letter Co) to be next to Latin.

417 // - Disallow Latin 'o' and 'g' next to Armenian.

418 // - Disallow mixing of Latin and Canadian Syllabics.

419 dangerous_pattern = new icu::RegexMatcher(

420 icu::UnicodeString(

421 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

422 "[\\u30ce\\u30f3\\u30bd\\u30be]"

423 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"

424 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|^\\u30fc\|"

425 "[^\\p{scx=kana}][\\u30fd\\u30fe]\|^[\\u30fd\\u30fe]\|"

426 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"

427 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"

428 "[a-z]\\u30fb\|\\u30fb[a-z]\|"

429 "^[\\u0585\\u0581]+[a-z]\|[a-z][\\u0585\\u0581]+$\|"

430 "[a-z][\\u0585\\u0581]+[a-z]\|"

431 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"

432 "[\\p{scx=armn}][og]+[\\p{scx=armn}]\|"

433 "[\\p{sc=cans}].[a-z]\|[a-z].[\\p{sc=cans}]",

434 -1, US_INV),

435 0, status);

436 tls_index.Set(dangerous_pattern);

437 }

438 dangerous_pattern->reset(label_string);

439 return !dangerous_pattern->find();

440 }

441

442 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

443 const icu::UnicodeString& label_string) {

444 // Collect all the Cyrillic letters in \|label_string\| and see if they're

445 // a subset of \|cyrillic_letters_latin_alike_\|.

446 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

447 // [_-] and checking if the set contains all letters of \|label_string\|

448 // would work in most cases, but not if a label has non-letters outside

449 // ASCII.

450 icu::UnicodeSet cyrillic_in_label;

451 icu::StringCharacterIterator it(label_string);

452 for (it.setToStart(); it.hasNext();) {

453 const UChar32 c = it.next32PostInc();

454 if (cyrillic_letters_.contains(c))

455 cyrillic_in_label.add(c);

456 }

457 return !cyrillic_in_label.isEmpty() &&

458 cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);

459 }

460

461 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {

462 if (U_FAILURE(*status))

463 return;

464

465 // The recommended set is a set of characters for identifiers in a

466 // security-sensitive environment taken from UTR 39

467 // (http://unicode.org/reports/tr39/) and

468 // http://www.unicode.org/Public/security/latest/xidmodifications.txt .

469 // The inclusion set comes from "Candidate Characters for Inclusion

470 // in Identifiers" of UTR 31 (http://www.unicode.org/reports/tr31). The list

471 // may change over the time and will be updated whenever the version of ICU

472 // used in Chromium is updated.

473 const icu::UnicodeSet* recommended_set =

474 uspoof_getRecommendedUnicodeSet(status);

475 icu::UnicodeSet allowed_set;

476 allowed_set.addAll(*recommended_set);

477 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status);

478 allowed_set.addAll(*inclusion_set);

479

480 // Five aspirational scripts are taken from UTR 31 Table 6 at

481 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts .

482 // Not all the characters of aspirational scripts are suitable for

483 // identifiers. Therefore, only characters belonging to

484 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational'

485 // section at

486 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are

487 // added to the allowed set. The list has to be updated when a new

488 // version of Unicode is released. The current version is 9.0.0 and ICU 60

489 // will have Unicode 10.0 data.

490 #if U_ICU_VERSION_MAJOR_NUM < 60

491 const icu::UnicodeSet aspirational_scripts(

492 icu::UnicodeString(

493 // Unified Canadian Syllabics

494 "[\\u1401-\\u166C\\u166F-\\u167F"

495 // Mongolian

496 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA"

497 // Unified Canadian Syllabics

498 "\\u18B0-\\u18F5"

499 // Tifinagh

500 "\\u2D30-\\u2D67\\u2D7F"

501 // Yi

502 "\\uA000-\\uA48C"

503 // Miao

504 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E"

505 "\\U00016F8F-\\U00016F9F]",

506 -1, US_INV),

507 *status);

508 allowed_set.addAll(aspirational_scripts);

509 #else

510 #error "Update aspirational_scripts per Unicode 10.0"

511 #endif

512

513 // U+0338 is included in the recommended set, while U+05F4 and U+2027 are in

514 // the inclusion set. However, they are blacklisted as a part of Mozilla's

515 // IDN blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars).

516 // U+2010 is in the inclusion set, but we drop it because it can be confused

517 // with an ASCII U+002D (Hyphen-Minus).

518 // U+0338 and U+2027 are dropped; the former can look like a slash when

519 // rendered with a broken font, and the latter can be confused with U+30FB

520 // (Katakana Middle Dot). U+05F4 (Hebrew Punctuation Gershayim) is kept,

521 // even though it can look like a double quotation mark. Using it in Hebrew

522 // should be safe. When used with a non-Hebrew script, it'd be filtered by

523 // other checks in place.

524 allowed_set.remove(0x338u); // Combining Long Solidus Overlay

525 allowed_set.remove(0x2010u); // Hyphen

526 allowed_set.remove(0x2027u); // Hyphenation Point

527

528 #if defined(OS_MACOSX)

529 // The following characters are reported as present in the default macOS

530 // system UI font, but they render as blank. Remove them from the allowed

531 // set to prevent spoofing.

532 // Tibetan characters used for transliteration of ancient texts:

533 allowed_set.remove(0x0F8Cu);

534 allowed_set.remove(0x0F8Du);

535 allowed_set.remove(0x0F8Eu);

536 allowed_set.remove(0x0F8Fu);

537 #endif

538

539 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

540 }

541

542 // Returns true if the given Unicode host component is safe to display to the	245 // Returns true if the given Unicode host component is safe to display to the

543 // user. Note that this function does not deal with pure ASCII domain labels at	246 // user. Note that this function does not deal with pure ASCII domain labels at

544 // all even though it's possible to make up look-alike labels with ASCII	247 // all even though it's possible to make up look-alike labels with ASCII

545 // characters alone.	248 // characters alone.

546 bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) {	249 bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) {

547 return g_idn_spoof_checker.Get().Check(label, is_tld_ascii);	250 return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label, is_tld_ascii);

548 }	251 }

549	252

550 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to	253 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to

551 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().	254 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().

552 //	255 //

553 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with the	256 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with the

554 // backward compatibility in mind. What it does:	257 // backward compatibility in mind. What it does:

555 //	258 //

556 // 1. Use the up-to-date Unicode data.	259 // 1. Use the up-to-date Unicode data.

557 // 2. Define a case folding/mapping with the up-to-date Unicode data as in	260 // 2. Define a case folding/mapping with the up-to-date Unicode data as in

(...skipping 305 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
863 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	566 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

864 ? text.substr(www.length()) : text;	567 ? text.substr(www.length()) : text;

865 }	568 }

866	569

867 base::string16 StripWWWFromHost(const GURL& url) {	570 base::string16 StripWWWFromHost(const GURL& url) {

868 DCHECK(url.is_valid());	571 DCHECK(url.is_valid());

869 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	572 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

870 }	573 }

871	574

872 } // namespace url_formatter	575 } // namespace url_formatter

OLD	NEW

« no previous file with comments | « components/url_formatter/idn_spoof_checker.cc ('k') | no next file » | no next file with comments »