| Index: components/url_formatter/idn_spoof_checker.cc
 | 
| diff --git a/components/url_formatter/idn_spoof_checker.cc b/components/url_formatter/idn_spoof_checker.cc
 | 
| index dbed171b9ea309fa7d9450f1c7fe07e597df6225..02dafe39c3b8202981969f265bacdc9fa00327a8 100644
 | 
| --- a/components/url_formatter/idn_spoof_checker.cc
 | 
| +++ b/components/url_formatter/idn_spoof_checker.cc
 | 
| @@ -8,9 +8,11 @@
 | 
|  #include "base/strings/string_split.h"
 | 
|  #include "base/strings/string_util.h"
 | 
|  #include "base/threading/thread_local_storage.h"
 | 
| +#include "net/base/lookup_string_in_fixed_set.h"
 | 
|  #include "third_party/icu/source/common/unicode/schriter.h"
 | 
|  #include "third_party/icu/source/common/unicode/unistr.h"
 | 
|  #include "third_party/icu/source/i18n/unicode/regex.h"
 | 
| +#include "third_party/icu/source/i18n/unicode/translit.h"
 | 
|  #include "third_party/icu/source/i18n/unicode/uspoof.h"
 | 
|  
 | 
|  namespace url_formatter {
 | 
| @@ -22,6 +24,31 @@ void OnThreadTermination(void* regex_matcher) {
 | 
|    delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
 | 
|  }
 | 
|  
 | 
| +#include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"
 | 
| +// All the domains in the above file have 3 or fewer labels.
 | 
| +const size_t kNumberOfLabelsToCheck = 3;
 | 
| +
 | 
| +// Returns true if |skeleton|, or one of its trailing sub-domains made of at
 | 
| +// least two labels, is found in |kDafsa|. Only the last
 | 
| +// |kNumberOfLabelsToCheck| labels of |skeleton| are considered, and longer
 | 
| +// candidates are tried before shorter ones. |skeleton| must not end with a
 | 
| +// dot.
 | 
| +bool LookupMatchInTopDomains(base::StringPiece skeleton) {
 | 
| +  // An empty skeleton cannot match anything. Return early so that back() is
 | 
| +  // never called on an empty StringPiece below.
 | 
| +  if (skeleton.empty())
 | 
| +    return false;
 | 
| +  DCHECK_NE(skeleton.back(), '.');
 | 
| +  auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,
 | 
| +                                       base::SPLIT_WANT_ALL);
 | 
| +
 | 
| +  // Drop leading labels so that at most |kNumberOfLabelsToCheck| remain.
 | 
| +  if (labels.size() > kNumberOfLabelsToCheck) {
 | 
| +    labels.erase(labels.begin(),
 | 
| +                 labels.begin() + labels.size() - kNumberOfLabelsToCheck);
 | 
| +  }
 | 
| +
 | 
| +  // Try progressively shorter suffixes; a single remaining label is never
 | 
| +  // looked up.
 | 
| +  while (labels.size() > 1) {
 | 
| +    std::string partial_skeleton = base::JoinString(labels, ".");
 | 
| +    if (net::LookupStringInFixedSet(
 | 
| +            kDafsa, arraysize(kDafsa), partial_skeleton.data(),
 | 
| +            partial_skeleton.length()) != net::kDafsaNotFound)
 | 
| +      return true;
 | 
| +    labels.erase(labels.begin());
 | 
| +  }
 | 
| +  return false;
 | 
| +}
 | 
| +
 | 
|  }  // namespace
 | 
|  
 | 
|  IDNSpoofChecker::IDNSpoofChecker() {
 | 
| @@ -68,11 +95,14 @@ IDNSpoofChecker::IDNSpoofChecker() {
 | 
|        icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
 | 
|    non_ascii_latin_letters_.freeze();
 | 
|  
 | 
| -  // These letters are parts of |dangerous_patterns_|.
 | 
| +  // The following two sets are parts of |dangerous_patterns_|.
 | 
|    kana_letters_exceptions_ = icu::UnicodeSet(
 | 
|        UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),
 | 
|        status);
 | 
|    kana_letters_exceptions_.freeze();
 | 
| +  combining_diacritics_exceptions_ =
 | 
| +      icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);
 | 
| +  combining_diacritics_exceptions_.freeze();
 | 
|  
 | 
|    // These Cyrillic letters look like Latin. A domain label entirely made of
 | 
|    // these letters is blocked as a simplified whole-script-spoofable.
 | 
| @@ -85,6 +115,36 @@ IDNSpoofChecker::IDNSpoofChecker() {
 | 
|    cyrillic_letters_.freeze();
 | 
|  
 | 
|    DCHECK(U_SUCCESS(status));
 | 
| +  // This set is used to determine whether or not to apply a slow
 | 
| +  // transliteration to remove diacritics to a given hostname before the
 | 
| +  // confusable skeleton calculation for comparison with top domain names. If
 | 
| +  // it has any character outside the set, the expensive step will be skipped
 | 
| +  // because it cannot match any of the top domain names.
 | 
| +  // The last [\u0300-\u0339] is a shorthand for "[:Identifier_Status=Allowed:]
 | 
| +  // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a
 | 
| +  // subset of the former but it does not matter because hostnames with
 | 
| +  // characters outside the latter set would be rejected in an earlier step.
 | 
| +  lgc_letters_n_ascii_ = icu::UnicodeSet(
 | 
| +      UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"
 | 
| +                            "\\u002d][\\u0300-\\u0339]]"),
 | 
| +      status);
 | 
| +  lgc_letters_n_ascii_.freeze();
 | 
| +
 | 
| +  // Used for diacritics-removal before the skeleton calculation. Add
 | 
| +  // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
 | 
| +  // removal; NFC". On top of that, supplement the Unicode confusable list by
 | 
| +  // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by
 | 
| +  // 'k', 'l' and 'n', respectively.
 | 
| +  // TODO(jshin): Revisit "ł > l; ø > o" mapping.
 | 
| +  UParseError parse_error;
 | 
| +  transliterator_.reset(icu::Transliterator::createFromRules(
 | 
| +      UNICODE_STRING_SIMPLE("DropAcc"),
 | 
| +      icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
 | 
| +                         " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),
 | 
| +      UTRANS_FORWARD, parse_error, status));
 | 
| +  DCHECK(U_SUCCESS(status))
 | 
| +      << "Spoofchecker initalization failed due to an error: "
 | 
| +      << u_errorName(status);
 | 
|  }
 | 
|  
 | 
|  IDNSpoofChecker::~IDNSpoofChecker() {
 | 
| @@ -120,9 +180,13 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
 | 
|      return false;
 | 
|  
 | 
|    // If there's no script mixing, the input is regarded as safe without any
 | 
| -  // extra check unless it contains Kana letter exceptions or it's made entirely
 | 
| -  // of Cyrillic letters that look like Latin letters. Note that the following
 | 
| -  // combinations of scripts are treated as a 'logical' single script.
 | 
| +  // extra check unless it falls into one of three categories:
 | 
| +  //   - contains Kana letter exceptions
 | 
| +  //   - the TLD is ASCII and the input is made entirely of Cyrillic letters
 | 
| +  //     that look like Latin letters.
 | 
| +  //   - it has combining diacritic marks.
 | 
| +  // Note that the following combinations of scripts are treated as a 'logical'
 | 
| +  // single script.
 | 
|    //  - Chinese: Han, Bopomofo, Common
 | 
|    //  - Japanese: Han, Hiragana, Katakana, Common
 | 
|    //  - Korean: Hangul, Han, Common
 | 
| @@ -130,14 +194,19 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
 | 
|    if (result == USPOOF_ASCII)
 | 
|      return true;
 | 
|    if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
 | 
| -      kana_letters_exceptions_.containsNone(label_string)) {
 | 
| +      kana_letters_exceptions_.containsNone(label_string) &&
 | 
| +      combining_diacritics_exceptions_.containsNone(label_string)) {
 | 
|      // Check Cyrillic confusable only for ASCII TLDs.
 | 
|      return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
 | 
|    }
 | 
|  
 | 
|    // Additional checks for |label| with multiple scripts, one of which is Latin.
 | 
|    // Disallow non-ASCII Latin letters to mix with a non-Latin script.
 | 
| -  if (non_ascii_latin_letters_.containsSome(label_string))
 | 
| +  // Note that the non-ASCII Latin check should not be applied when the entire
 | 
| +  // label is made of Latin. Checking with lgc_letters set here should be fine
 | 
| +  // because script mixing of LGC is already rejected.
 | 
| +  if (non_ascii_latin_letters_.containsSome(label_string) &&
 | 
| +      !lgc_letters_n_ascii_.containsAll(label_string))
 | 
|      return false;
 | 
|  
 | 
|    if (!tls_index.initialized())
 | 
| @@ -167,6 +236,9 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
 | 
|      //   Letter Co) to be next to Latin.
 | 
|      // - Disallow Latin 'o' and 'g' next to Armenian.
 | 
|      // - Disalow mixing of Latin and Canadian Syllabary.
 | 
| +    // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC
 | 
| +    //   character. Other combining diacritical marks are not in the allowed
 | 
| +    //   character set.
 | 
|      dangerous_pattern = new icu::RegexMatcher(
 | 
|          icu::UnicodeString(
 | 
|              R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
 | 
| @@ -181,7 +253,8 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
 | 
|              R"([a-z][\u0585\u0581]+[a-z]|)"
 | 
|              R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"
 | 
|              R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"
 | 
| -            R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}])",
 | 
| +            R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"
 | 
| +            R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",
 | 
|              -1, US_INV),
 | 
|          0, status);
 | 
|      tls_index.Set(dangerous_pattern);
 | 
| @@ -190,10 +263,33 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
 | 
|    return !dangerous_pattern->find();
 | 
|  }
 | 
|  
 | 
| +// Returns true if the confusable skeleton of |hostname| (ignoring a single
 | 
| +// trailing dot) matches an entry looked up by LookupMatchInTopDomains().
 | 
| +// Returns false for an empty |hostname| or if the skeleton cannot be
 | 
| +// computed.
 | 
| +bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
 | 
| +  // An empty hostname cannot match anything. Return early so that back() is
 | 
| +  // never called on an empty StringPiece16 below.
 | 
| +  if (hostname.empty())
 | 
| +    return false;
 | 
| +  size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
 | 
| +  icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
 | 
| +  // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
 | 
| +  // there is no point in getting rid of diacritics because combining marks
 | 
| +  // attached to non-LGC characters are already blocked.
 | 
| +  if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
 | 
| +      ustr_host.length()) {
 | 
| +    transliterator_->transliterate(ustr_host);
 | 
| +  }
 | 
| +
 | 
| +  UErrorCode status = U_ZERO_ERROR;
 | 
| +  icu::UnicodeString ustr_skeleton;
 | 
| +  uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,
 | 
| +                                  &status);
 | 
| +  if (U_FAILURE(status))
 | 
| +    return false;
 | 
| +  // The top-domain lookup operates on a UTF-8 std::string.
 | 
| +  std::string skeleton;
 | 
| +  ustr_skeleton.toUTF8String(skeleton);
 | 
| +  return LookupMatchInTopDomains(skeleton);
 | 
| +}
 | 
| +
 | 
|  bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
 | 
|      const icu::UnicodeString& label) {
 | 
| +  // Collect all the Cyrillic letters in |label| and see if they're
 | 
| +  // a subset of |cyrillic_letters_latin_alike_|.
 | 
|    // A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and
 | 
| -  // [_-] and checking if the set contains all letters of |label_string|
 | 
| +  // [_-] and checking if the set contains all letters of |label|
 | 
|    // would work in most cases, but not if a label has non-letters outside
 | 
|    // ASCII.
 | 
|    icu::UnicodeSet cyrillic_in_label;
 | 
| 
 |