components/url_formatter/url_formatter.cc - Issue 2683793010: Block domain labels made of Cyrillic letters that look alike Latin

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2683793010: Block domain labels made of Cyrillic letters that look alike Latin (Closed)

Patch Set: add a test with U+00B7 and Latin-alike Cyrillics Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <utility>	8 #include <utility>

9	9

10 #include "base/lazy_instance.h"	10 #include "base/lazy_instance.h"

11 #include "base/macros.h"	11 #include "base/macros.h"

12 #include "base/numerics/safe_conversions.h"	12 #include "base/numerics/safe_conversions.h"

13 #include "base/strings/string_piece.h"	13 #include "base/strings/string_piece.h"

14 #include "base/strings/string_util.h"	14 #include "base/strings/string_util.h"

15 #include "base/strings/utf_offset_string_conversions.h"	15 #include "base/strings/utf_offset_string_conversions.h"

16 #include "base/strings/utf_string_conversions.h"	16 #include "base/strings/utf_string_conversions.h"

17 #include "base/threading/thread_local_storage.h"	17 #include "base/threading/thread_local_storage.h"

	18 #include "third_party/icu/source/common/unicode/schriter.h"

18 #include "third_party/icu/source/common/unicode/uidna.h"	19 #include "third_party/icu/source/common/unicode/uidna.h"

19 #include "third_party/icu/source/common/unicode/uniset.h"	20 #include "third_party/icu/source/common/unicode/uniset.h"

20 #include "third_party/icu/source/common/unicode/uscript.h"	21 #include "third_party/icu/source/common/unicode/uscript.h"

21 #include "third_party/icu/source/common/unicode/uvernum.h"	22 #include "third_party/icu/source/common/unicode/uvernum.h"

22 #include "third_party/icu/source/i18n/unicode/regex.h"	23 #include "third_party/icu/source/i18n/unicode/regex.h"

23 #include "third_party/icu/source/i18n/unicode/uspoof.h"	24 #include "third_party/icu/source/i18n/unicode/uspoof.h"

24 #include "url/gurl.h"	25 #include "url/gurl.h"

25 #include "url/third_party/mozilla/url_parse.h"	26 #include "url/third_party/mozilla/url_parse.h"

26	27

27 namespace url_formatter {	28 namespace url_formatter {

28	29

29 namespace {	30 namespace {

30	31

31 base::string16 IDNToUnicodeWithAdjustments(	32 base::string16 IDNToUnicodeWithAdjustments(

32 base::StringPiece host,	33 base::StringPiece host,

33 base::OffsetAdjuster::Adjustments* adjustments);	34 base::OffsetAdjuster::Adjustments* adjustments);

34 bool IDNToUnicodeOneComponent(const base::char16* comp,	35 bool IDNToUnicodeOneComponent(const base::char16* comp,

35 size_t comp_len,	36 size_t comp_len,

	37 bool is_tld_ascii,

36 base::string16* out);	38 base::string16* out);

37	39

38 class AppendComponentTransform {	40 class AppendComponentTransform {

39 public:	41 public:

40 AppendComponentTransform() {}	42 AppendComponentTransform() {}

41 virtual ~AppendComponentTransform() {}	43 virtual ~AppendComponentTransform() {}

42	44

43 virtual base::string16 Execute(	45 virtual base::string16 Execute(

44 const std::string& component_text,	46 const std::string& component_text,

45 base::OffsetAdjuster::Adjustments* adjustments) const = 0;	47 base::OffsetAdjuster::Adjustments* adjustments) const = 0;

(...skipping 147 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
193 // allow unicode UNC hostnames regardless of encodings.	195 // allow unicode UNC hostnames regardless of encodings.

194 base::string16 IDNToUnicodeWithAdjustments(	196 base::string16 IDNToUnicodeWithAdjustments(

195 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {	197 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {

196 if (adjustments)	198 if (adjustments)

197 adjustments->clear();	199 adjustments->clear();

198 // Convert the ASCII input to a base::string16 for ICU.	200 // Convert the ASCII input to a base::string16 for ICU.

199 base::string16 input16;	201 base::string16 input16;

200 input16.reserve(host.length());	202 input16.reserve(host.length());

201 input16.insert(input16.end(), host.begin(), host.end());	203 input16.insert(input16.end(), host.begin(), host.end());

202	204

	205 bool is_tld_ascii = true;

	206 size_t last_dot = host.rfind('.');

	207 if (last_dot != base::StringPiece::npos &&

	208 host.substr(last_dot).starts_with(".xn--")) {

	209 is_tld_ascii = false;

	210 }

	211

203 // Do each component of the host separately, since we enforce script matching	212 // Do each component of the host separately, since we enforce script matching

204 // on a per-component basis.	213 // on a per-component basis.

205 base::string16 out16;	214 base::string16 out16;

206 for (size_t component_start = 0, component_end;	215 for (size_t component_start = 0, component_end;

207 component_start < input16.length();	216 component_start < input16.length();

208 component_start = component_end + 1) {	217 component_start = component_end + 1) {

209 // Find the end of the component.	218 // Find the end of the component.

210 component_end = input16.find('.', component_start);	219 component_end = input16.find('.', component_start);

211 if (component_end == base::string16::npos)	220 if (component_end == base::string16::npos)

212 component_end = input16.length(); // For getting the last component.	221 component_end = input16.length(); // For getting the last component.

213 size_t component_length = component_end - component_start;	222 size_t component_length = component_end - component_start;

214 size_t new_component_start = out16.length();	223 size_t new_component_start = out16.length();

215 bool converted_idn = false;	224 bool converted_idn = false;

216 if (component_end > component_start) {	225 if (component_end > component_start) {

217 // Add the substring that we just found.	226 // Add the substring that we just found.

218 converted_idn =	227 converted_idn =

219 IDNToUnicodeOneComponent(input16.data() + component_start,	228 IDNToUnicodeOneComponent(input16.data() + component_start,

220 component_length, &out16);	229 component_length, is_tld_ascii, &out16);

221 }	230 }

222 size_t new_component_length = out16.length() - new_component_start;	231 size_t new_component_length = out16.length() - new_component_start;

223	232

224 if (converted_idn && adjustments) {	233 if (converted_idn && adjustments) {

225 adjustments->push_back(base::OffsetAdjuster::Adjustment(	234 adjustments->push_back(base::OffsetAdjuster::Adjustment(

226 component_start, component_length, new_component_length));	235 component_start, component_length, new_component_length));

227 }	236 }

228	237

229 // Need to add the dot we just found (if we found one).	238 // Need to add the dot we just found (if we found one).

230 if (component_end < input16.length())	239 if (component_end < input16.length())

231 out16.push_back('.');	240 out16.push_back('.');

232 }	241 }

233 return out16;	242 return out16;

234 }	243 }

235	244

236 // A helper class for IDN Spoof checking, used to ensure that no IDN input is	245 // A helper class for IDN Spoof checking, used to ensure that no IDN input is

237 // spoofable per Chromium's standard of spoofability. For a more thorough	246 // spoofable per Chromium's standard of spoofability. For a more thorough

238 // explanation of how spoof checking works in Chromium, see	247 // explanation of how spoof checking works in Chromium, see

239 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .	248 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .

240 class IDNSpoofChecker {	249 class IDNSpoofChecker {

241 public:	250 public:

242 IDNSpoofChecker();	251 IDNSpoofChecker();

243	252

244 // Returns true if \|label\| is safe to display as Unicode. In the event of	253 // Returns true if \|label\| is safe to display as Unicode. When
	Peter Kasting 2017/03/22 22:14:08 Nit: When -> When the. Wrap at 80 columns.
245 // library failure, all IDN inputs will be treated as unsafe.	254 // TLD is ASCII, check if a label is entirely made of

246 bool Check(base::StringPiece16 label);	255 // Cyrillic letters that look like Latin letters. In the event of library

	256 // failure, all IDN inputs will be treated as unsafe.

	257 bool Check(base::StringPiece16 label, bool is_tld_ascii);

247	258

248 private:	259 private:

249 void SetAllowedUnicodeSet(UErrorCode* status);	260 void SetAllowedUnicodeSet(UErrorCode* status);

	261 bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);

250	262

251 USpoofChecker* checker_;	263 USpoofChecker* checker_;

252 icu::UnicodeSet deviation_characters_;	264 icu::UnicodeSet deviation_characters_;

253 icu::UnicodeSet non_ascii_latin_letters_;	265 icu::UnicodeSet non_ascii_latin_letters_;

254 icu::UnicodeSet kana_letters_exceptions_;	266 icu::UnicodeSet kana_letters_exceptions_;

	267 icu::UnicodeSet cyrillic_letters_;

	268 icu::UnicodeSet cyrillic_letters_latin_alike_;

255	269

256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);	270 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

257 };	271 };

258	272

259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =	273 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

260 LAZY_INSTANCE_INITIALIZER;	274 LAZY_INSTANCE_INITIALIZER;

261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;	275 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

262	276

263 void OnThreadTermination(void* regex_matcher) {	277 void OnThreadTermination(void* regex_matcher) {

264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);	278 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
306 // because additional characters pulled in with scx=Latn are not included in	320 // because additional characters pulled in with scx=Latn are not included in

307 // the allowed set.	321 // the allowed set.

308 non_ascii_latin_letters_ = icu::UnicodeSet(	322 non_ascii_latin_letters_ = icu::UnicodeSet(

309 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);	323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

310 non_ascii_latin_letters_.freeze();	324 non_ascii_latin_letters_.freeze();

311	325

312 // These letters are parts of \|dangerous_patterns_\|.	326 // These letters are parts of \|dangerous_patterns_\|.

313 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(	327 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(

314 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);	328 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);

315 kana_letters_exceptions_.freeze();	329 kana_letters_exceptions_.freeze();

	330 // These Cyrillic letters look like Latin. A domain label entirely
	Peter Kasting 2017/03/22 22:14:08 Nit: Blank line above this
	331 // made of these letters are blocked as a poorman's whole-script-spoofable.
	Peter Kasting 2017/03/22 22:14:08 Nit: are -> is; poorman's -> simplified
	332 cyrillic_letters_latin_alike_ =

	333 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

	334 // ӕъЬвҽвԍнкпгмтцѵѡүӔѴҲ

	335 // icu::UnicodeString("[аеорсухьѕіјһмӏтнв]"), status);
	Peter Kasting 2017/03/22 22:14:08 Looks like these comments are leftovers?
	336 cyrillic_letters_latin_alike_.freeze();

	337

	338 cyrillic_letters_ =

	339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

	340 cyrillic_letters_.freeze();

316	341

317 DCHECK(U_SUCCESS(status));	342 DCHECK(U_SUCCESS(status));

318 }	343 }

319	344

320 bool IDNSpoofChecker::Check(base::StringPiece16 label) {	345 bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {

321 UErrorCode status = U_ZERO_ERROR;	346 UErrorCode status = U_ZERO_ERROR;

322 int32_t result = uspoof_check(checker_, label.data(),	347 int32_t result = uspoof_check(checker_, label.data(),

323 base::checked_cast<int32_t>(label.size()),	348 base::checked_cast<int32_t>(label.size()),

324 NULL, &status);	349 NULL, &status);

325 // If uspoof_check fails (due to library failure), or if any of the checks	350 // If uspoof_check fails (due to library failure), or if any of the checks

326 // fail, treat the IDN as unsafe.	351 // fail, treat the IDN as unsafe.

327 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))	352 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))

328 return false;	353 return false;

329	354

330 icu::UnicodeString label_string(FALSE, label.data(),	355 icu::UnicodeString label_string(FALSE, label.data(),

331 base::checked_cast<int32_t>(label.size()));	356 base::checked_cast<int32_t>(label.size()));

332	357

333 // A punycode label with 'xn--' prefix is not subject to the URL	358 // A punycode label with 'xn--' prefix is not subject to the URL

334 // canonicalization and is stored as it is in GURL. If it encodes a deviation	359 // canonicalization and is stored as it is in GURL. If it encodes a deviation

335 // character (UTS 46; e.g. U+00DF/sharp-s), it should be still shown in	360 // character (UTS 46; e.g. U+00DF/sharp-s), it should be still shown in

336 // punycode instead of Unicode. Without this check, xn--fu-hia for	361 // punycode instead of Unicode. Without this check, xn--fu-hia for

337 // 'fu<sharp-s>' would be converted to 'fu<sharp-s>' for display because	362 // 'fu<sharp-s>' would be converted to 'fu<sharp-s>' for display because

338 // "UTS 46 section 4 Processing step 4" applies validity criteria for	363 // "UTS 46 section 4 Processing step 4" applies validity criteria for

339 // non-transitional processing (i.e. do not map deviation characters) to any	364 // non-transitional processing (i.e. do not map deviation characters) to any

340 // punycode labels regardless of whether transitional or non-transitional is	365 // punycode labels regardless of whether transitional or non-transitional is

341 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted	366 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

342 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as	367 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

343 // such. See http://crbug.com/595263 .	368 // such. See http://crbug.com/595263 .

344 if (deviation_characters_.containsSome(label_string))	369 if (deviation_characters_.containsSome(label_string))

345 return false;	370 return false;

346	371

347 // If there's no script mixing, the input is regarded as safe without any	372 // If there's no script mixing, the input is regarded as safe without any

348 // extra check unless it contains Kana letter exceptions. Note that	373 // extra check unless it contains Kana letter exceptions or it's made enitrely
	Peter Kasting 2017/03/22 22:14:08 Nit: enitrely -> entirely
	374 // of Cyrillic letters that look alike Latin letters. Note that
	Peter Kasting 2017/03/22 22:14:08 Nit: alike -> like
349 // the following combinations of scripts are treated as a 'logical' single	375 // the following combinations of scripts are treated as a 'logical' single

350 // script.	376 // script.

351 // - Chinese: Han, Bopomofo, Common	377 // - Chinese: Han, Bopomofo, Common

352 // - Japanese: Han, Hiragana, Katakana, Common	378 // - Japanese: Han, Hiragana, Katakana, Common

353 // - Korean: Hangul, Han, Common	379 // - Korean: Hangul, Han, Common

354 result &= USPOOF_RESTRICTION_LEVEL_MASK;	380 result &= USPOOF_RESTRICTION_LEVEL_MASK;

355 if (result == USPOOF_ASCII \|\|	381 if (result == USPOOF_ASCII)

356 (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

357 kana_letters_exceptions_.containsNone(label_string)))

358 return true;	382 return true;

	383 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

	384 kana_letters_exceptions_.containsNone(label_string)) {

	385 // Check Cyrillic confusable only for ASCII TLDs.

	386 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);

	387 }

359	388

360 // Additional checks for \|label\| with multiple scripts, one of which is Latin.	389 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

361 // Disallow non-ASCII Latin letters to mix with a non-Latin script.	390 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

362 if (non_ascii_latin_letters_.containsSome(label_string))	391 if (non_ascii_latin_letters_.containsSome(label_string))

363 return false;	392 return false;

364	393

365 if (!tls_index.initialized())	394 if (!tls_index.initialized())

366 tls_index.Initialize(&OnThreadTermination);	395 tls_index.Initialize(&OnThreadTermination);

367 icu::RegexMatcher* dangerous_pattern =	396 icu::RegexMatcher* dangerous_pattern =

368 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());	397 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
400 "[a-z][\\u0585\\u0581]+[a-z]\|"	429 "[a-z][\\u0585\\u0581]+[a-z]\|"

401 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"	430 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"

402 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV),	431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]", -1, US_INV),

403 0, status);	432 0, status);

404 tls_index.Set(dangerous_pattern);	433 tls_index.Set(dangerous_pattern);

405 }	434 }

406 dangerous_pattern->reset(label_string);	435 dangerous_pattern->reset(label_string);

407 return !dangerous_pattern->find();	436 return !dangerous_pattern->find();

408 }	437 }

409	438

	439 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

	440 const icu::UnicodeString& label_string) {

	441 // Collect all the Cyrillic letters in \|label_string\| and see if they're

	442 // a subset of \|cyrillic_letters_latin_alike_\|.

	443 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

	444 // [_-] and checking if the set contains all letters of \|label_string\|

	445 // would work in most cases, but it'd not if a label has non-letters outside
	Peter Kasting 2017/03/22 22:14:08 Nit: Remove "it'd"
	446 // ASCII.

	447 icu::UnicodeSet cyrillic_in_label;

	448 icu::StringCharacterIterator it(label_string);

	449 UChar32 c;
	Peter Kasting 2017/03/22 22:14:08 Nit: Define inside loop (and can then be const)
	450 for (it.setToStart(); it.hasNext();) {

	451 c = it.next32PostInc();

	452 if (cyrillic_letters_.contains(c))

	453 cyrillic_in_label.add(c);

	454 }

	455 return !cyrillic_in_label.isEmpty() &&

	456 cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);

	457 }

	458

410 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {	459 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {

411 if (U_FAILURE(*status))	460 if (U_FAILURE(*status))

412 return;	461 return;

413	462

414 // The recommended set is a set of characters for identifiers in a	463 // The recommended set is a set of characters for identifiers in a

415 // security-sensitive environment taken from UTR 39	464 // security-sensitive environment taken from UTR 39

416 // (http://unicode.org/reports/tr39/) and	465 // (http://unicode.org/reports/tr39/) and

417 // http://www.unicode.org/Public/security/latest/xidmodifications.txt .	466 // http://www.unicode.org/Public/security/latest/xidmodifications.txt .

418 // The inclusion set comes from "Candidate Characters for Inclusion	467 // The inclusion set comes from "Candidate Characters for Inclusion

419 // in idenfiers" of UTR 31 (http://www.unicode.org/reports/tr31). The list	468 // in idenfiers" of UTR 31 (http://www.unicode.org/reports/tr31). The list

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
474 allowed_set.remove(0x2010u); // Hyphen	523 allowed_set.remove(0x2010u); // Hyphen

475 allowed_set.remove(0x2027u); // Hyphenation Point	524 allowed_set.remove(0x2027u); // Hyphenation Point

476	525

477 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);	526 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

478 }	527 }

479	528

480 // Returns true if the given Unicode host component is safe to display to the	529 // Returns true if the given Unicode host component is safe to display to the

481 // user. Note that this function does not deal with pure ASCII domain labels at	530 // user. Note that this function does not deal with pure ASCII domain labels at

482 // all even though it's possible to make up look-alike labels with ASCII	531 // all even though it's possible to make up look-alike labels with ASCII

483 // characters alone.	532 // characters alone.

484 bool IsIDNComponentSafe(base::StringPiece16 label) {	533 bool IsIDNComponentSafe(base::StringPiece16 label, bool is_tld_ascii) {

485 return g_idn_spoof_checker.Get().Check(label);	534 return g_idn_spoof_checker.Get().Check(label, is_tld_ascii);

486 }	535 }

487	536

488 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to	537 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to

489 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().	538 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().

490 //	539 //

491 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with the	540 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with the

492 // backward compatibility in mind. What it does:	541 // backward compatibility in mind. What it does:

493 //	542 //

494 // 1. Use the up-to-date Unicode data.	543 // 1. Use the up-to-date Unicode data.

495 // 2. Define a case folding/mapping with the up-to-date Unicode data as in	544 // 2. Define a case folding/mapping with the up-to-date Unicode data as in

(...skipping 24 matching lines...) Expand all Loading...
520	569

521 base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;	570 base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;

522	571

523 // Converts one component (label) of a host (between dots) to Unicode if safe.	572 // Converts one component (label) of a host (between dots) to Unicode if safe.

524 // The result will be APPENDED to the given output string and will be the	573 // The result will be APPENDED to the given output string and will be the

525 // same as the input if it is not IDN in ACE/punycode or the IDN is unsafe to	574 // same as the input if it is not IDN in ACE/punycode or the IDN is unsafe to

526 // display.	575 // display.

527 // Returns whether any conversion was performed.	576 // Returns whether any conversion was performed.

528 bool IDNToUnicodeOneComponent(const base::char16* comp,	577 bool IDNToUnicodeOneComponent(const base::char16* comp,

529 size_t comp_len,	578 size_t comp_len,

	579 bool is_tld_ascii,

530 base::string16* out) {	580 base::string16* out) {

531 DCHECK(out);	581 DCHECK(out);

532 if (comp_len == 0)	582 if (comp_len == 0)

533 return false;	583 return false;

534	584

535 // Only transform if the input can be an IDN component.	585 // Only transform if the input can be an IDN component.

536 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};	586 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};

537 if ((comp_len > arraysize(kIdnPrefix)) &&	587 if ((comp_len > arraysize(kIdnPrefix)) &&

538 !memcmp(comp, kIdnPrefix, sizeof(kIdnPrefix))) {	588 !memcmp(comp, kIdnPrefix, sizeof(kIdnPrefix))) {

539 UIDNA* uidna = g_uidna.Get().value;	589 UIDNA* uidna = g_uidna.Get().value;

(...skipping 11 matching lines...) Expand all Loading...
551 output_length = uidna_labelToUnicode(	601 output_length = uidna_labelToUnicode(

552 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],	602 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],

553 output_length, &info, &status);	603 output_length, &info, &status);

554 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));	604 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));

555	605

556 if (U_SUCCESS(status) && info.errors == 0) {	606 if (U_SUCCESS(status) && info.errors == 0) {

557 // Converted successfully. Ensure that the converted component	607 // Converted successfully. Ensure that the converted component

558 // can be safely displayed to the user.	608 // can be safely displayed to the user.

559 out->resize(original_length + output_length);	609 out->resize(original_length + output_length);

560 if (IsIDNComponentSafe(	610 if (IsIDNComponentSafe(

561 base::StringPiece16(out->data() + original_length,	611 base::StringPiece16(out->data() + original_length,

562 base::checked_cast<size_t>(output_length))))	612 base::checked_cast<size_t>(output_length)),

	613 is_tld_ascii))

563 return true;	614 return true;

564 }	615 }

565	616

566 // Something went wrong. Revert to original string.	617 // Something went wrong. Revert to original string.

567 out->resize(original_length);	618 out->resize(original_length);

568 }	619 }

569	620

570 // We get here with no IDN or on error, in which case we just append the	621 // We get here with no IDN or on error, in which case we just append the

571 // literal input.	622 // literal input.

572 out->append(comp, comp_len);	623 out->append(comp, comp_len);

(...skipping 226 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
799 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

800 ? text.substr(www.length()) : text;	851 ? text.substr(www.length()) : text;

801 }	852 }

802	853

803 base::string16 StripWWWFromHost(const GURL& url) {	854 base::string16 StripWWWFromHost(const GURL& url) {

804 DCHECK(url.is_valid());	855 DCHECK(url.is_valid());

805 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	856 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

806 }	857 }

807	858

808 } // namespace url_formatter	859 } // namespace url_formatter

OLD	NEW

« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »