Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(206)

Unified Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2889303003: Revert of Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/url_formatter/idn_spoof_checker.cc
diff --git a/components/url_formatter/idn_spoof_checker.cc b/components/url_formatter/idn_spoof_checker.cc
index d90442c84cf64b6f052a919984042e6151367796..156f0cd65ac81acc0907a01936837326926ef061 100644
--- a/components/url_formatter/idn_spoof_checker.cc
+++ b/components/url_formatter/idn_spoof_checker.cc
@@ -8,11 +8,9 @@
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/threading/thread_local_storage.h"
-#include "net/base/lookup_string_in_fixed_set.h"
#include "third_party/icu/source/common/unicode/schriter.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/i18n/unicode/regex.h"
-#include "third_party/icu/source/i18n/unicode/translit.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
namespace url_formatter {
@@ -22,31 +20,6 @@
void OnThreadTermination(void* regex_matcher) {
delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
-}
-
-#include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"
-// All the domains in the above file have 3 or fewer labels.
-const size_t kNumberOfLabelsToCheck = 3;
-
-bool LookupMatchInTopDomains(base::StringPiece skeleton) {
- DCHECK_NE(skeleton.back(), '.');
- auto labels = base::SplitStringPiece(skeleton, ".", base::KEEP_WHITESPACE,
- base::SPLIT_WANT_ALL);
-
- if (labels.size() > kNumberOfLabelsToCheck) {
- labels.erase(labels.begin(),
- labels.begin() + labels.size() - kNumberOfLabelsToCheck);
- }
-
- while (labels.size() > 1) {
- std::string partial_skeleton = base::JoinString(labels, ".");
- if (net::LookupStringInFixedSet(
- kDafsa, arraysize(kDafsa), partial_skeleton.data(),
- partial_skeleton.length()) != net::kDafsaNotFound)
- return true;
- labels.erase(labels.begin());
- }
- return false;
}
} // namespace
@@ -95,14 +68,11 @@
icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);
non_ascii_latin_letters_.freeze();
- // The following two sets are parts of |dangerous_patterns_|.
+ // These letters are parts of |dangerous_patterns_|.
kana_letters_exceptions_ = icu::UnicodeSet(
UNICODE_STRING_SIMPLE("[\\u3078-\\u307a\\u30d8-\\u30da\\u30fb-\\u30fe]"),
status);
kana_letters_exceptions_.freeze();
- combining_diacritics_exceptions_ =
- icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0300-\\u0339]"), status);
- combining_diacritics_exceptions_.freeze();
// These Cyrillic letters look like Latin. A domain label entirely made of
// these letters is blocked as a simplified whole-script-spoofable.
@@ -115,36 +85,6 @@
cyrillic_letters_.freeze();
DCHECK(U_SUCCESS(status));
- // This set is used to determine whether or not to apply a slow
- // transliteration to remove diacritics to a given hostname before the
- // confusable skeleton calculation for comparison with top domain names. If
- // it has any character outside the set, the expensive step will be skipped
- // because it cannot match any of top domain names.
- // The last ([\u0300-\u0339] is a shorthand for "[:Identifier_Status=Allowed:]
- // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a
- // subset of the former but it does not matter because hostnames with
- // characters outside the latter set would be rejected in an earlier step.
- lgc_letters_n_ascii_ = icu::UnicodeSet(
- UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"
- "\\u002d][\\u0300-\\u0339]]"),
- status);
- lgc_letters_n_ascii_.freeze();
-
- // Used for diacritics-removal before the skeleton calculation. Add
- // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
- // removal; NFC". On top of that, supplement the Unicode confusable list by
- // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by
- // 'k', 'l' and 'n', respectively.
- // TODO(jshin): Revisit "ł > l; ø > o" mapping.
- UParseError parse_error;
- transliterator_.reset(icu::Transliterator::createFromRules(
- UNICODE_STRING_SIMPLE("DropAcc"),
- icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
- " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),
- UTRANS_FORWARD, parse_error, status));
- DCHECK(U_SUCCESS(status))
- << "Spoofchecker initalization failed due to an error: "
- << u_errorName(status);
}
IDNSpoofChecker::~IDNSpoofChecker() {
@@ -180,13 +120,9 @@
return false;
// If there's no script mixing, the input is regarded as safe without any
- // extra check unless it falls into one of three categories:
- // - contains Kana letter exceptions
- // - the TLD is ASCII and the input is made entirely of Cyrillic letters
- // that look like Latin letters.
- // - it has combining diacritic marks.
- // Note that the following combinations of scripts are treated as a 'logical'
- // single script.
+ // extra check unless it contains Kana letter exceptions or it's made entirely
+ // of Cyrillic letters that look like Latin letters. Note that the following
+ // combinations of scripts are treated as a 'logical' single script.
// - Chinese: Han, Bopomofo, Common
// - Japanese: Han, Hiragana, Katakana, Common
// - Korean: Hangul, Han, Common
@@ -194,19 +130,14 @@
if (result == USPOOF_ASCII)
return true;
if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
- kana_letters_exceptions_.containsNone(label_string) &&
- combining_diacritics_exceptions_.containsNone(label_string)) {
+ kana_letters_exceptions_.containsNone(label_string)) {
// Check Cyrillic confusable only for ASCII TLDs.
return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
}
// Additional checks for |label| with multiple scripts, one of which is Latin.
// Disallow non-ASCII Latin letters to mix with a non-Latin script.
- // Note that the non-ASCII Latin check should not be applied when the entire
- // label is made of Latin. Checking with lgc_letters set here should be fine
- // because script mixing of LGC is already rejected.
- if (non_ascii_latin_letters_.containsSome(label_string) &&
- !lgc_letters_n_ascii_.containsAll(label_string))
+ if (non_ascii_latin_letters_.containsSome(label_string))
return false;
if (!tls_index.initialized())
@@ -236,9 +167,6 @@
// Letter Co) to be next to Latin.
// - Disallow Latin 'o' and 'g' next to Armenian.
// - Disalow mixing of Latin and Canadian Syllabary.
- // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC
- // character. Other combining diacritical marks are not in the allowed
- // character set.
dangerous_pattern = new icu::RegexMatcher(
icu::UnicodeString(
R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
@@ -253,8 +181,7 @@
R"([a-z][\u0585\u0581]+[a-z]|)"
R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"
R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"
- R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"
- R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",
+ R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}])",
-1, US_INV),
0, status);
tls_index.Set(dangerous_pattern);
@@ -263,33 +190,10 @@
return !dangerous_pattern->find();
}
-bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
- size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
- icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);
- // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
- // there is no point in getting rid of diacritics because combining marks
- // attached to non-LGC characters are already blocked.
- if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
- ustr_host.length())
- transliterator_.get()->transliterate(ustr_host);
-
- UErrorCode status = U_ZERO_ERROR;
- icu::UnicodeString ustr_skeleton;
- uspoof_getSkeletonUnicodeString(checker_, 0, ustr_host, ustr_skeleton,
- &status);
- if (U_FAILURE(status))
- return false;
- std::string skeleton;
- ustr_skeleton.toUTF8String(skeleton);
- return LookupMatchInTopDomains(skeleton);
-}
-
bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
const icu::UnicodeString& label) {
- // Collect all the Cyrillic letters in |label_string| and see if they're
- // a subset of |cyrillic_letters_latin_alike_|.
// A shortcut of defining cyrillic_letters_latin_alike_ to include [0-9] and
- // [_-] and checking if the set contains all letters of |label|
+ // [_-] and checking if the set contains all letters of |label_string|
// would work in most cases, but not if a label has non-letters outside
// ASCII.
icu::UnicodeSet cyrillic_in_label;
« no previous file with comments | « components/url_formatter/idn_spoof_checker.h ('k') | components/url_formatter/top_domains/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698