| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 #include <string> | |
| 9 #include <vector> | |
| 10 | |
| 11 #include "base/logging.h" | |
| 12 #include "base/metrics/histogram_macros.h" | |
| 13 #include "base/strings/string_split.h" | |
| 14 #include "base/strings/string_util.h" | |
| 15 #include "base/timer/elapsed_timer.h" | |
| 16 #include "chrome/renderer/safe_browsing/features.h" | |
| 17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" | |
| 18 #include "url/gurl.h" | |
| 19 | |
| 20 namespace safe_browsing { | |
| 21 | |
| 22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {} | |
| 23 | |
| 24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} | |
| 25 | |
| 26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, | |
| 27 FeatureMap* features) { | |
| 28 base::ElapsedTimer timer; | |
| 29 if (url.HostIsIPAddress()) { | |
| 30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) | |
| 31 return false; | |
| 32 } else { | |
| 33 // Remove any leading/trailing dots. | |
| 34 std::string host; | |
| 35 base::TrimString(url.host(), ".", &host); | |
| 36 | |
| 37 // TODO(bryner): Ensure that the url encoding is consistent with | |
| 38 // the features in the model. | |
| 39 | |
| 40 // Disallow unknown registries so that we don't classify | |
| 41 // partial hostnames (e.g. "www.subdomain"). | |
| 42 size_t registry_length = | |
| 43 net::registry_controlled_domains::GetCanonicalHostRegistryLength( | |
| 44 host, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, | |
| 45 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); | |
| 46 | |
| 47 if (registry_length == 0 || registry_length == std::string::npos) { | |
| 48 DVLOG(1) << "Could not find TLD for host: " << host; | |
| 49 return false; | |
| 50 } | |
| 51 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but " | |
| 52 "host is only a TLD: " << host; | |
| 53 size_t tld_start = host.size() - registry_length; | |
| 54 if (!features->AddBooleanFeature(features::kUrlTldToken + | |
| 55 host.substr(tld_start))) | |
| 56 return false; | |
| 57 | |
| 58 // Pull off the TLD and the preceeding dot. | |
| 59 host.erase(tld_start - 1); | |
| 60 std::vector<std::string> host_tokens = base::SplitString( | |
| 61 host, ".", base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY); | |
| 62 if (host_tokens.empty()) { | |
| 63 DVLOG(1) << "Could not find domain for host: " << host; | |
| 64 return false; | |
| 65 } | |
| 66 if (!features->AddBooleanFeature(features::kUrlDomainToken + | |
| 67 host_tokens.back())) | |
| 68 return false; | |
| 69 host_tokens.pop_back(); | |
| 70 | |
| 71 // Now we're just left with the "other" host tokens. | |
| 72 for (std::vector<std::string>::iterator it = host_tokens.begin(); | |
| 73 it != host_tokens.end(); ++it) { | |
| 74 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) | |
| 75 return false; | |
| 76 } | |
| 77 | |
| 78 if (host_tokens.size() > 1) { | |
| 79 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne)) | |
| 80 return false; | |
| 81 if (host_tokens.size() > 3) { | |
| 82 if (!features->AddBooleanFeature( | |
| 83 features::kUrlNumOtherHostTokensGTThree)) | |
| 84 return false; | |
| 85 } | |
| 86 } | |
| 87 } | |
| 88 | |
| 89 std::vector<std::string> long_tokens; | |
| 90 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens); | |
| 91 for (const std::string& token : long_tokens) { | |
| 92 if (!features->AddBooleanFeature(features::kUrlPathToken + token)) | |
| 93 return false; | |
| 94 } | |
| 95 | |
| 96 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed()); | |
| 97 return true; | |
| 98 } | |
| 99 | |
| 100 // static | |
| 101 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens( | |
| 102 const std::string& full, | |
| 103 std::vector<std::string>* tokens) { | |
| 104 // Split on common non-alphanumerics. | |
| 105 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly. | |
| 106 static const char kTokenSeparators[] = ".,\\/_-|=%:!&"; | |
| 107 for (const base::StringPiece& token : | |
| 108 base::SplitStringPiece(full, kTokenSeparators, base::KEEP_WHITESPACE, | |
| 109 base::SPLIT_WANT_NONEMPTY)) { | |
| 110 // Copy over only the splits that are 3 or more chars long. | |
| 111 // TODO(bryner): Determine a meaningful min size. | |
| 112 if (token.length() >= kMinPathComponentLength) | |
| 113 tokens->push_back(token.as_string()); | |
| 114 } | |
| 115 } | |
| 116 | |
| 117 } // namespace safe_browsing | |
| OLD | NEW |