| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <string> | 8 #include <string> |
| 9 #include <vector> | 9 #include <vector> |
| 10 | 10 |
| (...skipping 12 matching lines...) |
| 23 | 23 |
| 24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} | 24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} |
| 25 | 25 |
| 26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, | 26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, |
| 27 FeatureMap* features) { | 27 FeatureMap* features) { |
| 28 base::ElapsedTimer timer; | 28 base::ElapsedTimer timer; |
| 29 if (url.HostIsIPAddress()) { | 29 if (url.HostIsIPAddress()) { |
| 30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) | 30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) |
| 31 return false; | 31 return false; |
| 32 } else { | 32 } else { |
| 33 // Remove any leading/trailing dots. |
| 33 std::string host; | 34 std::string host; |
| 34 TrimString(url.host(), ".", &host); // Remove any leading/trailing dots. | 35 base::TrimString(url.host(), ".", &host); |
| 35 | 36 |
| 36 // TODO(bryner): Ensure that the url encoding is consistent with | 37 // TODO(bryner): Ensure that the url encoding is consistent with |
| 37 // the features in the model. | 38 // the features in the model. |
| 38 | 39 |
| 39 // Disallow unknown registries so that we don't classify | 40 // Disallow unknown registries so that we don't classify |
| 40 // partial hostnames (e.g. "www.subdomain"). | 41 // partial hostnames (e.g. "www.subdomain"). |
| 41 size_t registry_length = | 42 size_t registry_length = |
| 42 net::registry_controlled_domains::GetRegistryLength( | 43 net::registry_controlled_domains::GetRegistryLength( |
| 43 host, | 44 host, |
| 44 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, | 45 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, |
| (...skipping 70 matching lines...) |
| 115 // Copy over only the splits that are 3 or more chars long. | 116 // Copy over only the splits that are 3 or more chars long. |
| 116 // TODO(bryner): Determine a meaningful min size. | 117 // TODO(bryner): Determine a meaningful min size. |
| 117 for (std::vector<std::string>::iterator it = raw_splits.begin(); | 118 for (std::vector<std::string>::iterator it = raw_splits.begin(); |
| 118 it != raw_splits.end(); ++it) { | 119 it != raw_splits.end(); ++it) { |
| 119 if (it->length() >= kMinPathComponentLength) | 120 if (it->length() >= kMinPathComponentLength) |
| 120 tokens->push_back(*it); | 121 tokens->push_back(*it); |
| 121 } | 122 } |
| 122 } | 123 } |
| 123 | 124 |
| 124 } // namespace safe_browsing | 125 } // namespace safe_browsing |
| OLD | NEW |