components/url_formatter/url_formatter.cc - Issue 2447513002: Update ICU to 58.1

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2447513002: Update ICU to 58.1 (Closed)

Patch Set: roll icu to final 58.1 in master Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « base/i18n/number_formatting_unittest.cc ('k') | third_party/WebKit/LayoutTests/fast/text/midword-break-before-surrogate-pair.html » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <utility>	8 #include <utility>

9	9

10 #include "base/lazy_instance.h"	10 #include "base/lazy_instance.h"

(...skipping 232 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
243	243

244 // Returns true if \|label\| is safe to display as Unicode. In the event of	244 // Returns true if \|label\| is safe to display as Unicode. In the event of

245 // library failure, all IDN inputs will be treated as unsafe.	245 // library failure, all IDN inputs will be treated as unsafe.

246 bool Check(base::StringPiece16 label);	246 bool Check(base::StringPiece16 label);

247	247

248 private:	248 private:

249 void SetAllowedUnicodeSet(UErrorCode* status);	249 void SetAllowedUnicodeSet(UErrorCode* status);

250	250

251 USpoofChecker* checker_;	251 USpoofChecker* checker_;

252 icu::UnicodeSet deviation_characters_;	252 icu::UnicodeSet deviation_characters_;

253 icu::UnicodeSet latin_letters_;

254 icu::UnicodeSet non_ascii_latin_letters_;	253 icu::UnicodeSet non_ascii_latin_letters_;

	254 icu::UnicodeSet kana_letters_exceptions_;

255	255

256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);	256 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

257 };	257 };

258	258

259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =	259 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

260 LAZY_INSTANCE_INITIALIZER;	260 LAZY_INSTANCE_INITIALIZER;

261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;	261 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

262	262

263 void OnThreadTermination(void* regex_matcher) {	263 void OnThreadTermination(void* regex_matcher) {

264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);	264 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

(...skipping 17 matching lines...) Expand all Loading...
282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one	282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one

283 // script other than Common and Inherited can be mixed with Latin. Cyrillic	283 // script other than Common and Inherited can be mixed with Latin. Cyrillic

284 // and Greek are not allowed to mix with Latin.	284 // and Greek are not allowed to mix with Latin.

285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection	285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection

286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);	286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);

287	287

288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.	288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.

289 SetAllowedUnicodeSet(&status);	289 SetAllowedUnicodeSet(&status);

290	290

291 // Enable the return of auxiliary (non-error) information.	291 // Enable the return of auxiliary (non-error) information.

	292 // We used to disable WHOLE_SCRIPT_CONFUSABLE check explicitly, but as of

	293 // ICU 58.1, WSC is a no-op in a single string check API.

292 int32_t checks = uspoof_getChecks(checker_, &status) \| USPOOF_AUX_INFO;	294 int32_t checks = uspoof_getChecks(checker_, &status) \| USPOOF_AUX_INFO;

293

294 // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when

295 // used against a single string as opposed to comparing a pair of strings. In

296 // addition, it would also flag a number of common labels including the IDN

297 // TLD for Russian.

298 // A possible alternative would be to turn on the check and block a label

299 // only under the following conditions, but it'd better be done on the

300 // server-side (e.g. SafeBrowsing):

301 // 1. The label is whole-script confusable.

302 // 2. And the skeleton of the label matches the skeleton of one of top

303 // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection

304 // for the definition of skeleton.

305 // 3. And the label is different from the matched top domain label in #2.

306 checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;

307

308 uspoof_setChecks(checker_, checks, &status);	295 uspoof_setChecks(checker_, checks, &status);

309	296

310 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46	297 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46

311 // transitional processing treats them as IDNA 2003 does; maps U+00DF and	298 // transitional processing treats them as IDNA 2003 does; maps U+00DF and

312 // U+03C2 and drops U+200[CD].	299 // U+03C2 and drops U+200[CD].

313 deviation_characters_ =	300 deviation_characters_ =

314 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"),	301 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"),

315 status);	302 status);

316 deviation_characters_.freeze();	303 deviation_characters_.freeze();

317	304

318 latin_letters_ =

319 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status);

320 latin_letters_.freeze();

321

322 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary	305 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

323 // because additional characters pulled in with scx=Latn are not included in	306 // because additional characters pulled in with scx=Latn are not included in

324 // the allowed set.	307 // the allowed set.

325 non_ascii_latin_letters_ = icu::UnicodeSet(	308 non_ascii_latin_letters_ = icu::UnicodeSet(

326 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);	309 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

327 non_ascii_latin_letters_.freeze();	310 non_ascii_latin_letters_.freeze();

328	311

	312 // These letters are parts of \|dangerous_patterns_\|.

	313 kana_letters_exceptions_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(

	314 "[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb\\u30fc]"), status);

	315 kana_letters_exceptions_.freeze();

	316

329 DCHECK(U_SUCCESS(status));	317 DCHECK(U_SUCCESS(status));

330 }	318 }

331	319

332 bool IDNSpoofChecker::Check(base::StringPiece16 label) {	320 bool IDNSpoofChecker::Check(base::StringPiece16 label) {

333 UErrorCode status = U_ZERO_ERROR;	321 UErrorCode status = U_ZERO_ERROR;

334 int32_t result = uspoof_check(checker_, label.data(),	322 int32_t result = uspoof_check(checker_, label.data(),

335 base::checked_cast<int32_t>(label.size()),	323 base::checked_cast<int32_t>(label.size()),

336 NULL, &status);	324 NULL, &status);

337 // If uspoof_check fails (due to library failure), or if any of the checks	325 // If uspoof_check fails (due to library failure), or if any of the checks

338 // fail, treat the IDN as unsafe.	326 // fail, treat the IDN as unsafe.

(...skipping 11 matching lines...) Expand all Loading...
350 // "UTS 46 section 4 Processing step 4" applies validity criteria for	338 // "UTS 46 section 4 Processing step 4" applies validity criteria for

351 // non-transitional processing (i.e. do not map deviation characters) to any	339 // non-transitional processing (i.e. do not map deviation characters) to any

352 // punycode labels regardless of whether transitional or non-transitional is	340 // punycode labels regardless of whether transitional or non-transitional is

353 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted	341 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

354 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as	342 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

355 // such. See http://crbug.com/595263 .	343 // such. See http://crbug.com/595263 .

356 if (deviation_characters_.containsSome(label_string))	344 if (deviation_characters_.containsSome(label_string))

357 return false;	345 return false;

358	346

359 // If there's no script mixing, the input is regarded as safe without any	347 // If there's no script mixing, the input is regarded as safe without any

360 // extra check.	348 // extra check unless it contains Kana letter exceptions. Note that

361 result &= USPOOF_RESTRICTION_LEVEL_MASK;	349 // the following combinations of scripts are treated as a 'logical' single

362 if (result == USPOOF_ASCII \|\| result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE)	350 // script.

363 return true;

364

365 // When check is passed at 'highly restrictive' level, \|label\| is

366 // made up of one of the following script sets optionally mixed with Latin.

367 // - Chinese: Han, Bopomofo, Common	351 // - Chinese: Han, Bopomofo, Common

368 // - Japanese: Han, Hiragana, Katakana, Common	352 // - Japanese: Han, Hiragana, Katakana, Common

369 // - Korean: Hangul, Han, Common	353 // - Korean: Hangul, Han, Common

370 // Treat this case as a 'logical' single script unless Latin is mixed.	354 result &= USPOOF_RESTRICTION_LEVEL_MASK;

371 if (result == USPOOF_HIGHLY_RESTRICTIVE &&	355 if (result == USPOOF_ASCII \|\|

372 latin_letters_.containsNone(label_string))	356 (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

	357 kana_letters_exceptions_.containsNone(label_string)))

373 return true;	358 return true;

374	359

375 // Additional checks for \|label\| with multiple scripts, one of which is Latin.	360 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

376 // Disallow non-ASCII Latin letters to mix with a non-Latin script.	361 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

377 if (non_ascii_latin_letters_.containsSome(label_string))	362 if (non_ascii_latin_letters_.containsSome(label_string))

378 return false;	363 return false;

379	364

380 if (!tls_index.initialized())	365 if (!tls_index.initialized())

381 tls_index.Initialize(&OnThreadTermination);	366 tls_index.Initialize(&OnThreadTermination);

382 icu::RegexMatcher* dangerous_pattern =	367 icu::RegexMatcher* dangerous_pattern =

383 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());	368 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

384 if (!dangerous_pattern) {	369 if (!dangerous_pattern) {

385 // Disallow the katakana no, so, zo, or n, as they may be mistaken for	370 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

386 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts	371 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

387 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a	372 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

388 // non-Japanese script on either side is disallowed, legitimate cases like	373 // non-Japanese script on either side is disallowed, legitimate cases like

389 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those	374 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

390 // characters when used alone as a label is futile because those cases	375 // characters when used alone as a label is futile because those cases

391 // would not reach here.	376 // would not reach here.

	377 // Besides, disallow what used to be blocked by mixed-script-confusable(MSC)
	Peter Kasting 2016/10/28 22:46:18 Nit "Besides," -> "Also"; space before '('
	378 // detection. ICU 58 does not detect MSC any more for a single input string.

	379 // See http://bugs.icu-project.org/trac/ticket/12823 .

	380 // - Disallow U+30FB(Katakana Middle Dot) and U+30FC(Hiragana-Katakana

	381 // Prolonged Sound) used out-of-context.

	382 // - Disallow three Hiragana letters(U+307[8-A]) or Katakana letters

	383 // (U+30D[8-A]) that look exactly like each other when they're used in a

	384 // label otherwise entirely in Katakana or Hiragana.

	385 // - Disallow U+0585 (Armenian Small Letter Oh) to be mixed with Latin.

392 dangerous_pattern = new icu::RegexMatcher(	386 dangerous_pattern = new icu::RegexMatcher(

393 icu::UnicodeString(	387 icu::UnicodeString(

394 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"	388 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

395 "[\\u30ce\\u30f3\\u30bd\\u30be]"	389 "[\\u30ce\\u30f3\\u30bd\\u30be]"

396 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV),	390 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"

	391 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|"

	392 "\\u30fc[^\\p{scx=kana}\\p{scx=hira}]\|"

	393 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"

	394 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"

	395 "[a-z]\\u30fb\|\\u30fb[a-z]\|"

	396 "\\u0585.*[a-z]+\|[a-z]+\\u0585", -1, US_INV),
	Peter Kasting 2016/10/28 22:46:18 Note, I do not speak regex enough to really review this, assuming you did it right + smartly
397 0, status);	397 0, status);

398 tls_index.Set(dangerous_pattern);	398 tls_index.Set(dangerous_pattern);

399 }	399 }

400 dangerous_pattern->reset(label_string);	400 dangerous_pattern->reset(label_string);

401 return !dangerous_pattern->find();	401 return !dangerous_pattern->find();

402 }	402 }

403	403

404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {	404 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {

405 if (U_FAILURE(*status))	405 if (U_FAILURE(*status))

406 return;	406 return;

(...skipping 383 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

791 ? text.substr(www.length()) : text;	791 ? text.substr(www.length()) : text;

792 }	792 }

793	793

794 base::string16 StripWWWFromHost(const GURL& url) {	794 base::string16 StripWWWFromHost(const GURL& url) {

795 DCHECK(url.is_valid());	795 DCHECK(url.is_valid());

796 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	796 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

797 }	797 }

798	798

799 } // namespace url_formatter	799 } // namespace url_formatter

OLD	NEW