chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc - Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.

Side by Side Diff: chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc

Issue 2667343006: Componentize safe_browsing [X+1] : move the renderer part to component.

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « chrome/renderer/safe_browsing/phishing_url_feature_extractor.h ('k') | chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"

6

7 #include <algorithm>

8 #include <string>

9 #include <vector>

10

11 #include "base/logging.h"

12 #include "base/metrics/histogram_macros.h"

13 #include "base/strings/string_split.h"

14 #include "base/strings/string_util.h"

15 #include "base/timer/elapsed_timer.h"

16 #include "chrome/renderer/safe_browsing/features.h"

17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"

18 #include "url/gurl.h"

19

20 namespace safe_browsing {

21

22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}

23

24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}

25

26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,

27 FeatureMap* features) {

28 base::ElapsedTimer timer;

29 if (url.HostIsIPAddress()) {

30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress))

31 return false;

32 } else {

33 // Remove any leading/trailing dots.

34 std::string host;

35 base::TrimString(url.host(), ".", &host);

36

37 // TODO(bryner): Ensure that the url encoding is consistent with

38 // the features in the model.

39

40 // Disallow unknown registries so that we don't classify

41 // partial hostnames (e.g. "www.subdomain").

42 size_t registry_length =

43 net::registry_controlled_domains::GetCanonicalHostRegistryLength(

44 host, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,

45 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);

46

47 if (registry_length == 0 \|\| registry_length == std::string::npos) {

48 DVLOG(1) << "Could not find TLD for host: " << host;

49 return false;

50 }

51 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but "

52 "host is only a TLD: " << host;

53 size_t tld_start = host.size() - registry_length;

54 if (!features->AddBooleanFeature(features::kUrlTldToken +

55 host.substr(tld_start)))

56 return false;

57

58 // Pull off the TLD and the preceeding dot.

59 host.erase(tld_start - 1);

60 std::vector<std::string> host_tokens = base::SplitString(

61 host, ".", base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

62 if (host_tokens.empty()) {

63 DVLOG(1) << "Could not find domain for host: " << host;

64 return false;

65 }

66 if (!features->AddBooleanFeature(features::kUrlDomainToken +

67 host_tokens.back()))

68 return false;

69 host_tokens.pop_back();

70

71 // Now we're just left with the "other" host tokens.

72 for (std::vector<std::string>::iterator it = host_tokens.begin();

73 it != host_tokens.end(); ++it) {

74 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it))

75 return false;

76 }

77

78 if (host_tokens.size() > 1) {

79 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne))

80 return false;

81 if (host_tokens.size() > 3) {

82 if (!features->AddBooleanFeature(

83 features::kUrlNumOtherHostTokensGTThree))

84 return false;

85 }

86 }

87 }

88

89 std::vector<std::string> long_tokens;

90 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);

91 for (const std::string& token : long_tokens) {

92 if (!features->AddBooleanFeature(features::kUrlPathToken + token))

93 return false;

94 }

95

96 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());

97 return true;

98 }

99

100 // static

101 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(

102 const std::string& full,

103 std::vector<std::string>* tokens) {

104 // Split on common non-alphanumerics.

105 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.

106 static const char kTokenSeparators[] = ".,\\/_-\|=%:!&";

107 for (const base::StringPiece& token :

108 base::SplitStringPiece(full, kTokenSeparators, base::KEEP_WHITESPACE,

109 base::SPLIT_WANT_NONEMPTY)) {

110 // Copy over only the splits that are 3 or more chars long.

111 // TODO(bryner): Determine a meaningful min size.

112 if (token.length() >= kMinPathComponentLength)

113 tokens->push_back(token.as_string());

114 }

115 }

116

117 } // namespace safe_browsing

OLD	NEW