Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(61)

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2895103003: Drop Mongolian from the IDN script list and tighten up the policy on Armenian-Latin mixing (Closed)
Patch Set: block Armenian + Latin mix Created 3 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/url_formatter/idn_spoof_checker.h" 5 #include "components/url_formatter/idn_spoof_checker.h"
6 6
7 #include "base/numerics/safe_conversions.h" 7 #include "base/numerics/safe_conversions.h"
8 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
10 #include "base/threading/thread_local_storage.h" 10 #include "base/threading/thread_local_storage.h"
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after
225 // detection. ICU 58 does not detect MSC any more for a single input string. 225 // detection. ICU 58 does not detect MSC any more for a single input string.
226 // See http://bugs.icu-project.org/trac/ticket/12823 . 226 // See http://bugs.icu-project.org/trac/ticket/12823 .
227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. 227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana 228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana
229 // Prolonged Sound) used out-of-context. 229 // Prolonged Sound) used out-of-context.
230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) 230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)
231 // unless they're preceded by a Katakana. 231 // unless they're preceded by a Katakana.
232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters 232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
233 // (U+30D[8-A]) that look exactly like each other when they're used in a 233 // (U+30D[8-A]) that look exactly like each other when they're used in a
234 // label otherwise entirely in Katakana or Hiragana. 234 // label otherwise entirely in Katakana or Hiragana.
235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small 235 // - Disallow mixing of Latin and Armenian
Peter Kasting 2017/08/29 04:08:30 Nit: Period at end (2 places)
236 // Letter Co) to be next to Latin.
237 // - Disallow Latin 'o' and 'g' next to Armenian.
238 // - Disallow mixing of Latin and Canadian Syllabary. 236 // - Disallow mixing of Latin and Canadian Syllabary.
239 // - Disallow mixing of Latin and Tifinagh. 237 // - Disallow mixing of Latin and Tifinagh.
238 // - Disallow mixing of Latin and Miao
240 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC 239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC
241 // character. Other combining diacritical marks are not in the allowed 240 // character. Other combining diacritical marks are not in the allowed
242 // character set. 241 // character set.
243 dangerous_pattern = new icu::RegexMatcher( 242 dangerous_pattern = new icu::RegexMatcher(
244 icu::UnicodeString( 243 icu::UnicodeString(
245 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])" 244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
246 R"([\u30ce\u30f3\u30bd\u30be])" 245 R"([\u30ce\u30f3\u30bd\u30be])"
247 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)" 246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"
248 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)" 247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"
249 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)" 248 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"
250 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)" 249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"
251 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)" 250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"
252 R"([a-z]\u30fb|\u30fb[a-z]|)" 251 R"([a-z]\u30fb|\u30fb[a-z]|)"
253 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)" 252 R"([\p{sc=armn}].*[a-z]|[a-z].*[\p{sc=armn}]|)"
254 R"([a-z][\u0585\u0581]+[a-z]|)"
255 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"
256 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"
257 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)" 253 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"
258 R"([\p{sc=tfng}].*[a-z]|[a-z].*[\p{sc=tfng}]|)" 254 R"([\p{sc=tfng}].*[a-z]|[a-z].*[\p{sc=tfng}]|)"
259 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])", 255 R"([\p{sc=miao}].*[a-z]|[a-z].*[\p{sc=miao}]|)"
260 -1, US_INV), 256 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])"),
Peter Kasting 2017/08/29 04:08:29 Were these just default values?
261 0, status); 257 0, status);
262 tls_index.Set(dangerous_pattern); 258 tls_index.Set(dangerous_pattern);
263 } 259 }
264 dangerous_pattern->reset(label_string); 260 dangerous_pattern->reset(label_string);
265 return !dangerous_pattern->find(); 261 return !dangerous_pattern->find();
266 } 262 }
267 263
268 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) { 264 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
269 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0); 265 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
270 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length); 266 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
323 allowed_set.addAll(*recommended_set); 319 allowed_set.addAll(*recommended_set);
324 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status); 320 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status);
325 allowed_set.addAll(*inclusion_set); 321 allowed_set.addAll(*inclusion_set);
326 322
327 // Five aspirational scripts are taken from UTR 31 Table 6 at 323 // Five aspirational scripts are taken from UTR 31 Table 6 at
328 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts . 324 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts .
329 // Not all the characters of aspirational scripts are suitable for 325 // Not all the characters of aspirational scripts are suitable for
330 // identifiers. Therefore, only characters belonging to 326 // identifiers. Therefore, only characters belonging to
331 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational' 327 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational'
332 // section at 328 // section at
333 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are 329 // http://www.unicode.org/Public/security/9.0.0/IdentifierType.txt) are
334 // are added to the allowed set. The list has to be updated when a new 330 // added to the allowed set. The list has to be updated when a new
335 // version of Unicode is released. The current version is 9.0.0 and ICU 60 331 // version of Unicode is released. The current version is 9.0.0 and ICU 60
336 // will have Unicode 10.0 data. 332 // will have Unicode 10.0 data.
333 // Note that Mongolian is dropped because it's written vertically.
337 #if U_ICU_VERSION_MAJOR_NUM < 60 334 #if U_ICU_VERSION_MAJOR_NUM < 60
338 const icu::UnicodeSet aspirational_scripts( 335 const icu::UnicodeSet aspirational_scripts(
339 icu::UnicodeString( 336 icu::UnicodeString(
340 // Unified Canadian Syllabics 337 // Unified Canadian Syllabics
341 "[\\u1401-\\u166C\\u166F-\\u167F" 338 "[\\u1401-\\u166C\\u166F-\\u167F"
342 // Mongolian
343 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA"
344 // Unified Canadian Syllabics 339 // Unified Canadian Syllabics
345 "\\u18B0-\\u18F5" 340 "\\u18B0-\\u18F5"
346 // Tifinagh 341 // Tifinagh
347 "\\u2D30-\\u2D67\\u2D7F" 342 "\\u2D30-\\u2D67\\u2D7F"
348 // Yi 343 // Yi
349 "\\uA000-\\uA48C" 344 "\\uA000-\\uA48C"
350 // Miao 345 // Miao
351 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E" 346 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E"
352 "\\U00016F8F-\\U00016F9F]", 347 "\\U00016F8F-\\U00016F9F]",
353 -1, US_INV), 348 -1, US_INV),
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
385 allowed_set.remove(0x0F8Cu); 380 allowed_set.remove(0x0F8Cu);
386 allowed_set.remove(0x0F8Du); 381 allowed_set.remove(0x0F8Du);
387 allowed_set.remove(0x0F8Eu); 382 allowed_set.remove(0x0F8Eu);
388 allowed_set.remove(0x0F8Fu); 383 allowed_set.remove(0x0F8Fu);
389 #endif 384 #endif
390 385
391 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status); 386 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);
392 } 387 }
393 388
394 } // namespace url_formatter 389 } // namespace url_formatter
OLDNEW
« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698