components/url_formatter/url_formatter.cc - Issue 2784933002: Mitigate spoofing attempt using Latin letters.

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)

Patch Set: add back U+04CF (ӏ) -> 'l' map Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/url_formatter/top_domains/make_top_domain_gperf.cc ('K') | « components/url_formatter/top_domains/make_top_domain_gperf.cc ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | components/url_formatter/url_formatter_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <utility>	8 #include <utility>

	9 #include <vector>

9	10

10 #include "base/lazy_instance.h"	11 #include "base/lazy_instance.h"

11 #include "base/macros.h"	12 #include "base/macros.h"

12 #include "base/numerics/safe_conversions.h"	13 #include "base/numerics/safe_conversions.h"

13 #include "base/strings/string_piece.h"	14 #include "base/strings/string_piece.h"

	15 #include "base/strings/string_split.h"

14 #include "base/strings/string_util.h"	16 #include "base/strings/string_util.h"

15 #include "base/strings/utf_offset_string_conversions.h"	17 #include "base/strings/utf_offset_string_conversions.h"

16 #include "base/strings/utf_string_conversions.h"	18 #include "base/strings/utf_string_conversions.h"

17 #include "base/threading/thread_local_storage.h"	19 #include "base/threading/thread_local_storage.h"

	20 #include "net/base/lookup_string_in_fixed_set.h"

18 #include "third_party/icu/source/common/unicode/schriter.h"	21 #include "third_party/icu/source/common/unicode/schriter.h"

19 #include "third_party/icu/source/common/unicode/uidna.h"	22 #include "third_party/icu/source/common/unicode/uidna.h"

20 #include "third_party/icu/source/common/unicode/uniset.h"	23 #include "third_party/icu/source/common/unicode/uniset.h"

21 #include "third_party/icu/source/common/unicode/uscript.h"	24 #include "third_party/icu/source/common/unicode/uscript.h"

	25 #include "third_party/icu/source/common/unicode/utypes.h"

22 #include "third_party/icu/source/common/unicode/uvernum.h"	26 #include "third_party/icu/source/common/unicode/uvernum.h"

23 #include "third_party/icu/source/i18n/unicode/regex.h"	27 #include "third_party/icu/source/i18n/unicode/regex.h"

	28 #include "third_party/icu/source/i18n/unicode/translit.h"

24 #include "third_party/icu/source/i18n/unicode/uspoof.h"	29 #include "third_party/icu/source/i18n/unicode/uspoof.h"

25 #include "url/gurl.h"	30 #include "url/gurl.h"

26 #include "url/third_party/mozilla/url_parse.h"	31 #include "url/third_party/mozilla/url_parse.h"

27	32

28 namespace url_formatter {	33 namespace url_formatter {

29	34

30 namespace {	35 namespace {

31	36

32 base::string16 IDNToUnicodeWithAdjustments(	37 base::string16 IDNToUnicodeWithAdjustments(

33 base::StringPiece host,	38 base::StringPiece host,

(...skipping 150 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
184 new_parsed->scheme.len = kViewSourceLength - 1;	189 new_parsed->scheme.len = kViewSourceLength - 1;

185 }	190 }

186 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);	191 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);

187	192

188 if (prefix_end)	193 if (prefix_end)

189 *prefix_end += kViewSourceLength;	194 *prefix_end += kViewSourceLength;

190	195

191 return result;	196 return result;

192 }	197 }

193	198

	199 // A helper class for IDN Spoof checking, used to ensure that no IDN input is

	200 // spoofable per Chromium's standard of spoofability. For a more thorough

	201 // explanation of how spoof checking works in Chromium, see

	202 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .

	203 class IDNSpoofChecker {
	Peter Kasting 2017/05/09 01:37:03 Nit: It might be nice to pull this class out to its own .h/.cc for maximum readability. jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: It might be nice to pull this class out to its own .h/.cc for maximum > readability. Ok. pulled it out.
	204 public:

	205 IDNSpoofChecker();

	206

	207 // Returns true if \|label\| is safe to display as Unicode. When the TLD is
	Peter Kasting 2017/05/09 01:37:03 Nit: Does the second sentence here really need to be here? It seems like it only describes a portion of the functionality of the function. Maybe we should just say "See the function body for details on the specific safety checks performed"? jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: Does the second sentence here really need to be here? It seems like it > only describes a portion of the functionality of the function. Maybe we should > just say "See the function body for details on the specific safety checks > performed"? Yeah, that's better.
	208 // ASCII, check if a label is entirely made of Cyrillic letters that look like

	209 // Latin letters. In the event of library failure, all IDN inputs will be

	210 // treated as unsafe.

	211 bool Check(base::StringPiece16 label, bool is_tld_ascii);
	Peter Kasting 2017/05/09 01:37:03 Nit: This is a poor function name; how about something like SafeToDisplayAsUnicode()? jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: This is a poor function name; how about something like > SafeToDisplayAsUnicode()? Done.
	212

	213 // Returns true if \|hostname\| or the last few components of \|hostname\| looks

	214 // similar to one of top domains listed in top_domains/alexa_domains.list. Two

	215 // checks are done:

	216 // 1. Calculate the skeleton of \|hostname\| based on the Unicode confusable

	217 // character list and look it up in the pre-calculated skeleton list of

	218 // top domains.

	219 // 2. Look up the diacritic-free version of \|hostname\| in the list of

	220 // top domains. Note that non-IDN hostnames will not get here.

	221 bool SimilarToTopDomains(base::StringPiece16 hostname);

	222

	223 private:

	224 void SetAllowedUnicodeSet(UErrorCode* status);
	Peter Kasting 2017/05/09 01:37:03 Nit: I suggest adding comments for these even though they're private. jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: I suggest adding comments for these even though they're private. Done.
	225 bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);

	226 bool GetSkeleton(base::StringPiece16 hostname, std::string* skeleton);

	227

	228 USpoofChecker* checker_;

	229 icu::UnicodeSet deviation_characters_;

	230 icu::UnicodeSet non_ascii_latin_letters_;

	231 icu::UnicodeSet kana_letters_exceptions_;

	232 icu::UnicodeSet combining_diacritics_exceptions_;

	233 icu::UnicodeSet cyrillic_letters_;

	234 icu::UnicodeSet cyrillic_letters_latin_alike_;

	235 icu::UnicodeSet lgc_letters_n_ascii_;

	236 icu::Transliterator* transliterator_;

	237

	238 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

	239 };

	240

	241 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

	242 LAZY_INSTANCE_INITIALIZER;

	243 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

	244

	245 void OnThreadTermination(void* regex_matcher) {
	Peter Kasting 2017/05/09 01:37:03 Let me guess: the RegexMatcher uses internal state, so it's not possible to simultaneously use it from multiple threads, hence the need to stick it in TLS.
	246 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

	247 }

	248

194 // TODO(brettw): We may want to skip this step in the case of file URLs to	249 // TODO(brettw): We may want to skip this step in the case of file URLs to

195 // allow unicode UNC hostnames regardless of encodings.	250 // allow unicode UNC hostnames regardless of encodings.

196 base::string16 IDNToUnicodeWithAdjustments(	251 base::string16 IDNToUnicodeWithAdjustments(

197 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {	252 base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments) {

198 if (adjustments)	253 if (adjustments)

199 adjustments->clear();	254 adjustments->clear();

200 // Convert the ASCII input to a base::string16 for ICU.	255 // Convert the ASCII input to a base::string16 for ICU.

201 base::string16 input16;	256 base::string16 input16;

202 input16.reserve(host.length());	257 input16.reserve(host.length());

203 input16.insert(input16.end(), host.begin(), host.end());	258 input16.insert(input16.end(), host.begin(), host.end());

204	259

205 bool is_tld_ascii = true;	260 bool is_tld_ascii = true;

206 size_t last_dot = host.rfind('.');	261 size_t last_dot = host.rfind('.');

207 if (last_dot != base::StringPiece::npos &&	262 if (last_dot != base::StringPiece::npos &&

208 host.substr(last_dot).starts_with(".xn--")) {	263 host.substr(last_dot).starts_with(".xn--")) {

209 is_tld_ascii = false;	264 is_tld_ascii = false;

210 }	265 }

211	266

212 // Do each component of the host separately, since we enforce script matching	267 // Do each component of the host separately, since we enforce script matching

213 // on a per-component basis.	268 // on a per-component basis.

214 base::string16 out16;	269 base::string16 out16;

	270 bool has_idn_component = false;
	Peter Kasting 2017/05/09 01:37:03 Can we reach this function with an input that doesn't cause the loop below to set this to true? It seems unlikely. If not, we could eliminate this variable. jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Can we reach this function with an input that doesn't cause the loop below to > set this to true? It seems unlikely. If not, we could eliminate this variable. IDNToUnicode (which calls this function) is called with any host name out of GURL. So, has_idn_component can be false for all labels/components. Then, has_idn_component would be false, too.
215 for (size_t component_start = 0, component_end;	271 for (size_t component_start = 0, component_end;

216 component_start < input16.length();	272 component_start < input16.length();

217 component_start = component_end + 1) {	273 component_start = component_end + 1) {

218 // Find the end of the component.	274 // Find the end of the component.

219 component_end = input16.find('.', component_start);	275 component_end = input16.find('.', component_start);

220 if (component_end == base::string16::npos)	276 if (component_end == base::string16::npos)

221 component_end = input16.length(); // For getting the last component.	277 component_end = input16.length(); // For getting the last component.

222 size_t component_length = component_end - component_start;	278 size_t component_length = component_end - component_start;

223 size_t new_component_start = out16.length();	279 size_t new_component_start = out16.length();

224 bool converted_idn = false;	280 bool converted_idn = false;

225 if (component_end > component_start) {	281 if (component_end > component_start) {

226 // Add the substring that we just found.	282 // Add the substring that we just found.

227 converted_idn =	283 converted_idn =

228 IDNToUnicodeOneComponent(input16.data() + component_start,	284 IDNToUnicodeOneComponent(input16.data() + component_start,

229 component_length, is_tld_ascii, &out16);	285 component_length, is_tld_ascii, &out16);

	286 has_idn_component = has_idn_component \|\| converted_idn;
	Peter Kasting 2017/05/09 01:37:03 Nit: Or use \|= jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: Or use \|= Changed.
230 }	287 }

231 size_t new_component_length = out16.length() - new_component_start;	288 size_t new_component_length = out16.length() - new_component_start;

232	289

233 if (converted_idn && adjustments) {	290 if (converted_idn && adjustments) {

234 adjustments->push_back(base::OffsetAdjuster::Adjustment(	291 adjustments->push_back(base::OffsetAdjuster::Adjustment(

235 component_start, component_length, new_component_length));	292 component_start, component_length, new_component_length));

236 }	293 }

237	294

238 // Need to add the dot we just found (if we found one).	295 // Need to add the dot we just found (if we found one).

239 if (component_end < input16.length())	296 if (component_end < input16.length())

240 out16.push_back('.');	297 out16.push_back('.');

241 }	298 }

	299

	300 if (has_idn_component &&
	Peter Kasting 2017/05/09 01:37:03 Nit: Might want a comment above this block like "Leave as punycode any inputs that spoof top domains." jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: Might want a comment above this block like "Leave as punycode any inputs > that spoof top domains." Done.
	301 g_idn_spoof_checker.Get().SimilarToTopDomains(out16)) {

	302 if (adjustments)

	303 adjustments->clear();

	304 return input16;

	305 }
	Peter Kasting 2017/05/09 01:37:03 Nit: Blank line after this? jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: Blank line after this? Done.
242 return out16;	306 return out16;

243 }	307 }

244	308

245 // A helper class for IDN Spoof checking, used to ensure that no IDN input is

246 // spoofable per Chromium's standard of spoofability. For a more thorough

247 // explanation of how spoof checking works in Chromium, see

248 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .

249 class IDNSpoofChecker {

250 public:

251 IDNSpoofChecker();

252

253 // Returns true if \|label\| is safe to display as Unicode. When the TLD is

254 // ASCII, check if a label is entirely made of Cyrillic letters that look like

255 // Latin letters. In the event of library failure, all IDN inputs will be

256 // treated as unsafe.

257 bool Check(base::StringPiece16 label, bool is_tld_ascii);

258

259 private:

260 void SetAllowedUnicodeSet(UErrorCode* status);

261 bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label_string);

262

263 USpoofChecker* checker_;

264 icu::UnicodeSet deviation_characters_;

265 icu::UnicodeSet non_ascii_latin_letters_;

266 icu::UnicodeSet kana_letters_exceptions_;

267 icu::UnicodeSet cyrillic_letters_;

268 icu::UnicodeSet cyrillic_letters_latin_alike_;

269

270 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

271 };

272

273 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

274 LAZY_INSTANCE_INITIALIZER;

275 base::ThreadLocalStorage::StaticSlot tls_index = TLS_INITIALIZER;

276

277 void OnThreadTermination(void* regex_matcher) {

278 delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);

279 }

280

281 IDNSpoofChecker::IDNSpoofChecker() {	309 IDNSpoofChecker::IDNSpoofChecker() {

282 UErrorCode status = U_ZERO_ERROR;	310 UErrorCode status = U_ZERO_ERROR;

283 checker_ = uspoof_open(&status);	311 checker_ = uspoof_open(&status);

284 if (U_FAILURE(status)) {	312 if (U_FAILURE(status)) {

285 checker_ = nullptr;	313 checker_ = nullptr;

286 return;	314 return;

287 }	315 }

288	316

289 // At this point, USpoofChecker has all the checks enabled except	317 // At this point, USpoofChecker has all the checks enabled except

290 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE,	318 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE,

(...skipping 25 matching lines...) Expand all Loading...
316 status);	344 status);

317 deviation_characters_.freeze();	345 deviation_characters_.freeze();

318	346

319 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary	347 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

320 // because additional characters pulled in with scx=Latn are not included in	348 // because additional characters pulled in with scx=Latn are not included in

321 // the allowed set.	349 // the allowed set.

322 non_ascii_latin_letters_ = icu::UnicodeSet(	350 non_ascii_latin_letters_ = icu::UnicodeSet(

323 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);	351 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

324 non_ascii_latin_letters_.freeze();	352 non_ascii_latin_letters_.freeze();

325	353

326 // These letters are parts of \|dangerous_patterns_\|.	354 // The following two sets are parts of \|dangerous_patterns_\|.

327 kana_letters_exceptions_ = icu::UnicodeSet(	355 kana_letters_exceptions_ = icu::UnicodeSet(

328 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),	356 UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),

329 status);	357 status);

330 kana_letters_exceptions_.freeze();	358 kana_letters_exceptions_.freeze();

	359 combining_diacritics_exceptions_ = icu::UnicodeSet(

	360 UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);

	361 combining_diacritics_exceptions_.freeze();

331	362

332 // These Cyrillic letters look like Latin. A domain label entirely made of	363 // These Cyrillic letters look like Latin. A domain label entirely made of

333 // these letters is blocked as a simplified whole-script-spoofable.	364 // these letters is blocked as a simplified whole-script-spoofable.

334 cyrillic_letters_latin_alike_ =	365 cyrillic_letters_latin_alike_ =

335 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);	366 icu::UnicodeSet(icu::UnicodeString("[асԁеһіјӏорԛѕԝхуъЬҽпгѵѡ]"), status);

336 cyrillic_letters_latin_alike_.freeze();	367 cyrillic_letters_latin_alike_.freeze();

337	368

338 cyrillic_letters_ =	369 cyrillic_letters_ =

339 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);	370 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Cyrl:]]"), status);

340 cyrillic_letters_.freeze();	371 cyrillic_letters_.freeze();

341	372

342 DCHECK(U_SUCCESS(status));	373 // This set is used to determine whether or not to apply a slow

	374 // transliteration to remove diacritics to a given hostname before the

	375 // confusable skeleton calculation for comparison with top domain names. If

	376 // it has any character outside the set, the expensive step will be skipped

	377 // because it cannot match any of top domain names.

	378 // The last ([\u0300-\u0339] is a shorthand for "[:Identifier_Status=Allowed:]

	379 // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a

	380 // subset of the former but it does not matter because hostnames with

	381 // characters outside the latter set would be rejected in an earlier step.

	382 lgc_letters_n_ascii_ = icu::UnicodeSet(UNICODE_STRING_SIMPLE(

	383 "[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_\\u002d][\\u0300-\\u0339]]"),

	384 status);

	385 lgc_letters_n_ascii_.freeze();

	386

	387 // Used for diacritics-removal before the skeleton calculation. Add

	388 // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark

	389 // removal; NFC". On top of that, supplement the Unicode confusable list by

	390 // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by

	391 // 'k', 'l' and 'n', respectively.

	392 // TODO(jshin): Revisit "ł > l; ø > o" mapping.
	Peter Kasting 2017/05/09 01:37:03 Nit: Might want to link this TODO to a bug or otherwise expand on why/how you'd revisit, or this won't be actionable by others.
	393 UParseError parse_error;

	394 transliterator_ = icu::Transliterator::createFromRules(

	395 UNICODE_STRING_SIMPLE("DropAcc"),

	396 icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"

	397 " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),

	398 UTRANS_FORWARD, parse_error, status);

	399 DCHECK(U_SUCCESS(status))

	400 << "Spoofchecker initalization failed due to an error: "

	401 << u_errorName(status);

	402 if (U_FAILURE(status))
	Peter Kasting 2017/05/09 01:37:03 Do not handle DCHECK failure; assume DCHECKs cannot fail. If they can, they should be conditionals, not DCHECKs. jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Do not handle DCHECK failure; assume DCHECKs cannot fail. If they can, they > should be conditionals, not DCHECKs. Done.
	403 transliterator_ = nullptr;

343 }	404 }

344	405

345 bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {	406 bool IDNSpoofChecker::Check(base::StringPiece16 label, bool is_tld_ascii) {

346 UErrorCode status = U_ZERO_ERROR;	407 UErrorCode status = U_ZERO_ERROR;

347 int32_t result = uspoof_check(checker_, label.data(),	408 int32_t result = uspoof_check(checker_, label.data(),

348 base::checked_cast<int32_t>(label.size()),	409 base::checked_cast<int32_t>(label.size()),

349 NULL, &status);	410 NULL, &status);

350 // If uspoof_check fails (due to library failure), or if any of the checks	411 // If uspoof_check fails (due to library failure), or if any of the checks

351 // fail, treat the IDN as unsafe.	412 // fail, treat the IDN as unsafe.

352 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))	413 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))

(...skipping 10 matching lines...) Expand all Loading...
363 // "UTS 46 section 4 Processing step 4" applies validity criteria for	424 // "UTS 46 section 4 Processing step 4" applies validity criteria for

364 // non-transitional processing (i.e. do not map deviation characters) to any	425 // non-transitional processing (i.e. do not map deviation characters) to any

365 // punycode labels regardless of whether transitional or non-transitional is	426 // punycode labels regardless of whether transitional or non-transitional is

366 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted	427 // chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted

367 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as	428 // as Unicode would be canonicalized to 'fuss' by GURL and is displayed as

368 // such. See http://crbug.com/595263 .	429 // such. See http://crbug.com/595263 .

369 if (deviation_characters_.containsSome(label_string))	430 if (deviation_characters_.containsSome(label_string))

370 return false;	431 return false;

371	432

372 // If there's no script mixing, the input is regarded as safe without any	433 // If there's no script mixing, the input is regarded as safe without any

373 // extra check unless it contains Kana letter exceptions or it's made entirely	434 // extra check unless it falls into one of three categories:

374 // of Cyrillic letters that look like Latin letters. Note that the following	435 // - contains Kana letter exceptions

375 // combinations of scripts are treated as a 'logical' single script.	436 // - it's made entirely of Cyrillic letters that look like Latin letters.
	Peter Kasting 2017/05/09 01:37:03 Nit: it's -> the TLD is ASCII, and the input is ? jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: it's -> the TLD is ASCII, and the input is ? Done.
	437 // - it has combining diacritic marks.

	438 // Note that the following combinations of scripts are treated as a 'logical'

	439 // single script.

376 // - Chinese: Han, Bopomofo, Common	440 // - Chinese: Han, Bopomofo, Common

377 // - Japanese: Han, Hiragana, Katakana, Common	441 // - Japanese: Han, Hiragana, Katakana, Common

378 // - Korean: Hangul, Han, Common	442 // - Korean: Hangul, Han, Common

379 result &= USPOOF_RESTRICTION_LEVEL_MASK;	443 result &= USPOOF_RESTRICTION_LEVEL_MASK;

380 if (result == USPOOF_ASCII) return true;	444 if (result == USPOOF_ASCII) return true;

381 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&	445 if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&

382 kana_letters_exceptions_.containsNone(label_string)) {	446 kana_letters_exceptions_.containsNone(label_string) &&

	447 combining_diacritics_exceptions_.containsNone(label_string)) {

383 // Check Cyrillic confusable only for ASCII TLDs.	448 // Check Cyrillic confusable only for ASCII TLDs.

384 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);	449 return !is_tld_ascii \|\| !IsMadeOfLatinAlikeCyrillic(label_string);

385 }	450 }

386	451

387 // Additional checks for \|label\| with multiple scripts, one of which is Latin.	452 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

388 // Disallow non-ASCII Latin letters to mix with a non-Latin script.	453 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

389 if (non_ascii_latin_letters_.containsSome(label_string))	454 // Note that non-ASCII Latin check should not be applied when the entire label
	Peter Kasting 2017/05/09 01:37:03 Nit: that -> that the ? jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: that -> that the ? Done.
	455 // is made of Latin. Checking with lgc_letters set here should be fine because

	456 // script mixing of LGC is already rejected.

	457 if (non_ascii_latin_letters_.containsSome(label_string) &&

	458 !lgc_letters_n_ascii_.containsAll(label_string))

390 return false;	459 return false;

391	460

392 if (!tls_index.initialized())	461 if (!tls_index.initialized())

393 tls_index.Initialize(&OnThreadTermination);	462 tls_index.Initialize(&OnThreadTermination);

394 icu::RegexMatcher* dangerous_pattern =	463 icu::RegexMatcher* dangerous_pattern =

395 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());	464 reinterpret_cast<icu::RegexMatcher*>(tls_index.Get());

396 if (!dangerous_pattern) {	465 if (!dangerous_pattern) {

397 // Disallow the katakana no, so, zo, or n, as they may be mistaken for	466 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

398 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts	467 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

399 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a	468 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

400 // non-Japanese script on either side is disallowed, legitimate cases like	469 // non-Japanese script on either side is disallowed, legitimate cases like

401 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those	470 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

402 // characters when used alone as a label is futile because those cases	471 // characters when used alone as a label is futile because those cases

403 // would not reach here.	472 // would not reach here.

404 // Also disallow what used to be blocked by mixed-script-confusable (MSC)	473 // Also disallow what used to be blocked by mixed-script-confusable (MSC)

405 // detection. ICU 58 does not detect MSC any more for a single input string.	474 // detection. ICU 58 does not detect MSC any more for a single input string.

406 // See http://bugs.icu-project.org/trac/ticket/12823 .	475 // See http://bugs.icu-project.org/trac/ticket/12823 .

407 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.	476 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

408 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana	477 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

409 // Prolonged Sound) used out-of-context.	478 // Prolonged Sound) used out-of-context.

410 // - Dislallow U+30FD/E (Katakana iteration mark/voiced iteration mark)	479 // - Dislallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

411 // unless they're preceded by a Katakana.	480 // unless they're preceded by a Katakana.

412 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters	481 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

413 // (U+30D[8-A]) that look exactly like each other when they're used in a	482 // (U+30D[8-A]) that look exactly like each other when they're used in a

414 // label otherwise entirely in Katakna or Hiragana.	483 // label otherwise entirely in Katakna or Hiragana.

415 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small	484 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small

416 // Letter Co) to be next to Latin.	485 // Letter Co) to be next to Latin.

417 // - Disallow Latin 'o' and 'g' next to Armenian.	486 // - Disallow Latin 'o' and 'g' next to Armenian.

	487 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC

	488 // character. Other combining diacritical marks are not in the allowed

	489 // character set.

418 dangerous_pattern = new icu::RegexMatcher(	490 dangerous_pattern = new icu::RegexMatcher(

419 icu::UnicodeString(	491 icu::UnicodeString(

420 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"	492 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

421 "[\\u30ce\\u30f3\\u30bd\\u30be]"	493 "[\\u30ce\\u30f3\\u30bd\\u30be]"

422 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"	494 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]\|"

423 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|^\\u30fc\|"	495 "[^\\p{scx=kana}\\p{scx=hira}]\\u30fc\|^\\u30fc\|"

424 "[^\\p{scx=kana}][\\u30fd\\u30fe]\|^[\\u30fd\\u30fe]\|"	496 "[^\\p{scx=kana}][\\u30fd\\u30fe]\|^[\\u30fd\\u30fe]\|"

425 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"	497 "^[\\p{scx=kana}]+[\\u3078-\\u307a][\\p{scx=kana}]+$\|"

426 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"	498 "^[\\p{scx=hira}]+[\\u30d8-\\u30da][\\p{scx=hira}]+$\|"

427 "[a-z]\\u30fb\|\\u30fb[a-z]\|"	499 "[a-z]\\u30fb\|\\u30fb[a-z]\|"

428 "^[\\u0585\\u0581]+[a-z]\|[a-z][\\u0585\\u0581]+$\|"	500 "^[\\u0585\\u0581]+[a-z]\|[a-z][\\u0585\\u0581]+$\|"

429 "[a-z][\\u0585\\u0581]+[a-z]\|"	501 "[a-z][\\u0585\\u0581]+[a-z]\|"

430 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"	502 "^[og]+[\\p{scx=armn}]\|[\\p{scx=armn}][og]+$\|"

431 "[\\p{scx=armn}][og]+[\\p{scx=armn}]",	503 "[\\p{scx=armn}][og]+[\\p{scx=armn}]\|"

	504 "[^\\p{scx=latn}\\p{scx=grek}\\p{scx=cyrl}][\\u0300-\\u0339]",

432 -1, US_INV),	505 -1, US_INV),

433 0, status);	506 0, status);

434 tls_index.Set(dangerous_pattern);	507 tls_index.Set(dangerous_pattern);

435 }	508 }

436 dangerous_pattern->reset(label_string);	509 dangerous_pattern->reset(label_string);

437 return !dangerous_pattern->find();	510 return !dangerous_pattern->find();

438 }	511 }

439	512

	513 #include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"

	514 // All the domains in the above file have 3 or fewer labels.

	515 const size_t kNumberOfLabelsToCheck = 3;
	Peter Kasting 2017/05/09 01:37:03 Can we write this value into the file so we don't need to hardcode it here? jungshik at Google 2017/05/10 18:05:13 On 2017/05/09 01:37:03, Peter Kasting wrote: > Can we write this value into the file so we don't need to hardcode it here? make_top_domain_gperf can write that out to another file (the only line other than license boilerplate would be the above line). Do you like that? Peter Kasting 2017/05/10 22:38:46 On 2017/05/10 18:05:13, jungshik at Google wrote: > On 2017/05/09 01:37:03, Peter Kasting wrote: > > Can we write this value into the file so we don't need to hardcode it here? > > make_top_domain_gperf can write that out to another file (the only line other > than license boilerplate would be the above line). Do you like that? Can't we write it to the same file? Having a separate file for this works, I guess, it just feels inelegant. Dunno why I'm worrying. It'd still probably be better than hardcoding this.
	516

	517 bool LookupStringInSet(base::StringPiece needle,
	Peter Kasting 2017/05/09 01:37:04 Nit: If you're not going to use boring names for y Nit: If you're not going to use boring names for your params, I'd copy the ones from the underlying net:: declaration rather than using \|needle\|. That said, this wrapper is so short, and is called only once, that I'd just inline the body of this at the callsite below. jungshik at Google 2017/05/10 18:05:13 Inlined it. Show quoted text On 2017/05/09 01:37:04, Peter Kasting wrote: > Nit: If you're not going to use boring names for your params, I'd copy the ones > from the underlying net:: declaration rather than using \|needle\|. > > That said, this wrapper is so short, and is called only once, that I'd just > inline the body of this at the callsite below. Inlined it.
	518 const unsigned char* fixed_set,

	519 size_t set_len) {

	520 return net::LookupStringInFixedSet(fixed_set, set_len, needle.data(),

	521 needle.length()) != net::kDafsaNotFound;

	522 }

	523

	524 bool LookupMatchInTopDomains(base::StringPiece hostname) {

	525 // When 'hostname' is a skeleton instead of actual hostname, it's assumed

	526 // that no character other than '.' among those allowed in IDN will have

	527 // '.' as its skeleton.

	528 DCHECK(hostname[hostname.length() - 1] != '.');
	Peter Kasting 2017/05/09 01:37:03 Nit: hostname.back() Nit: hostname.back() jungshik at Google 2017/05/10 18:05:13 Done. Show quoted text On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: hostname.back() Done.
	529 auto labels = base::SplitStringPiece(hostname, ".", base::KEEP_WHITESPACE,

	530 base::SPLIT_WANT_ALL);

	531

	532 while (labels.size() > kNumberOfLabelsToCheck)

	533 labels.erase(labels.begin());
	Peter Kasting 2017/05/09 01:37:04 Nit: Seems like a single call to vector::erase cou Nit: Seems like a single call to vector::erase could be more efficient than a while loop. jungshik at Google 2017/05/10 18:05:13 Done. Show quoted text On 2017/05/09 01:37:04, Peter Kasting wrote: > Nit: Seems like a single call to vector::erase could be more efficient than a > while loop. Done.
	534

	535 while (labels.size() > 1) {
	Peter Kasting 2017/05/09 01:37:03 Is this naive loop faster than computing the actua Is this naive loop faster than computing the actual eTLD+1 length using the RCDS and then doing a single DAFSA lookup? jungshik at Google 2017/05/10 18:05:13 'hostname' is not a good name (at one point, it's Show quoted text On 2017/05/09 01:37:03, Peter Kasting wrote: > Is this naive loop faster than computing the actual eTLD+1 length using the RCDS > and then doing a single DAFSA lookup? 'hostname' is not a good name (at one point, it's either a hostname or its skeleton, but now it's always a skeleton). Changed it to \|skeleton\|. eTLD match cannot be done because even 'com' is turned to 'c o r n' (without spaces). We can try the eTLD match before the skeleton calculation. Hmm, it appears that RCDS canonicalizes 'hostname' before finding eTLD+1. The canonicalization would turn an IDN to punycode. That would not work here. Because # of labels is limited to 3 here, at most two lookups are done here. So, I'd expect little difference even if RCDS works without canonicalization. OTOH, thanks to your suggestion, it occurred to me that I can change 2-step (python + C++) into one step (C++ using RCDS that accepts URLs to extract eTLD+1).
	536 std::string partial_hostname = base::JoinString(labels, ".");

	537 if (LookupStringInSet(partial_hostname, kDafsa, arraysize(kDafsa)))

	538 return true;

	539 labels.erase(labels.begin());

	540 }

	541 return false;

	542 }

	543

	544 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {

	545 size_t hostname_length = hostname.length() -

	546 (*(hostname.rbegin()) == '.' ? 1 : 0);
	Peter Kasting 2017/05/09 01:37:03 Nit: Use .back() instead of *rbegin() Nit: Use .back() instead of *rbegin() jungshik at Google 2017/05/10 18:05:13 Done. Show quoted text On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: Use .back() instead of *rbegin() Done.
	547 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);

	548 // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],

	549 // there is no point in getting rid of diacritics because combining marks

	550 // attached to non-LGC characters are already blocked.

	551 if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==

	552 ustr_host.length() && transliterator_)
	Peter Kasting 2017/05/09 01:37:03 Note that if the DCHECK earlier is assumed not to Note that if the DCHECK earlier is assumed not to fail, this null-check can disappear. jungshik at Google 2017/05/10 18:05:13 removed it. Show quoted text On 2017/05/09 01:37:03, Peter Kasting wrote: > Note that if the DCHECK earlier is assumed not to fail, this null-check can > disappear. removed it.
	553 transliterator_->transliterate(ustr_host);

	554

	555 UErrorCode status = U_ZERO_ERROR;

	556 icu::UnicodeString ustr_skeleton;

	557 uspoof_getSkeletonUnicodeString(checker_, 0, /* not used. deprecated. */
	Peter Kasting 2017/05/09 01:37:03 Nit: If you're going to add /* */ (which I'm not s Nit: If you're going to add /* */ (which I'm not sure is necessary), do so before the comma to make it very clear which parameter this is on. "deprecated." is also probably unnecessary here. jungshik at Google 2017/05/10 18:05:13 ok. just removed it. Show quoted text On 2017/05/09 01:37:03, Peter Kasting wrote: > Nit: If you're going to add /* */ (which I'm not sure is necessary), do so > before the comma to make it very clear which parameter this is on. > > "deprecated." is also probably unnecessary here. ok. just removed it.
	558 ustr_host, ustr_skeleton, &status);

	559 if (U_FAILURE(status))

	560 return false;

	561 std::string skeleton;

	562 ustr_skeleton.toUTF8String(skeleton);

	563 return LookupMatchInTopDomains(skeleton);

	564 }

	565

440 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(	566 bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(

441 const icu::UnicodeString& label_string) {	567 const icu::UnicodeString& label_string) {

442 // Collect all the Cyrillic letters in \|label_string\| and see if they're	568 // Collect all the Cyrillic letters in \|label_string\| and see if they're

443 // a subset of \|cyrillic_letters_latin_alike_\|.	569 // a subset of \|cyrillic_letters_latin_alike_\|.

444 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and	570 // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and

445 // [_-] and checking if the set contains all letters of \|label_string\|	571 // [_-] and checking if the set contains all letters of \|label_string\|

446 // would work in most cases, but not if a label has non-letters outside	572 // would work in most cases, but not if a label has non-letters outside

447 // ASCII.	573 // ASCII.

448 icu::UnicodeSet cyrillic_in_label;	574 icu::UnicodeSet cyrillic_in_label;

449 icu::StringCharacterIterator it(label_string);	575 icu::StringCharacterIterator it(label_string);

(...skipping 400 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
850 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	976 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

851 ? text.substr(www.length()) : text;	977 ? text.substr(www.length()) : text;

852 }	978 }

853	979

854 base::string16 StripWWWFromHost(const GURL& url) {	980 base::string16 StripWWWFromHost(const GURL& url) {

855 DCHECK(url.is_valid());	981 DCHECK(url.is_valid());

856 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	982 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

857 }	983 }

858	984

859 } // namespace url_formatter	985 } // namespace url_formatter

OLD	NEW