OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "components/feedback/anonymizer_tool.h" | |
6 | |
7 #include <base/strings/string_number_conversions.h> | |
8 #include <base/strings/string_util.h> | |
9 #include <base/strings/stringprintf.h> | |
10 | |
11 #include "third_party/re2/re2/re2.h" | |
12 | |
13 using re2::RE2; | |
14 | |
15 namespace feedback { | |
16 | |
17 namespace { | |
18 | |
// The |kCustomPatterns| array defines patterns to match and anonymize. Each
// pattern needs to define three capturing parentheses groups:
//
// - a group for the pattern before the identifier to be anonymized;
// - a group for the identifier to be anonymized;
// - a group for the pattern after the identifier to be anonymized.
//
// Every matched identifier (in the context of the whole pattern) is anonymized
// by replacing it with an incremental instance identifier. Every different
// pattern defines a separate instance identifier space. See the unit test for
// AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.
//
// Useful regular expression syntax:
//
// +? is a non-greedy (lazy) +.
// \b matches a word boundary.
// (?i) turns on case insensitivity for the remainder of the regex.
// (?-s) turns off "dot matches newline" for the remainder of the regex.
// (?:regex) denotes non-capturing parentheses group.
//
// The array elements are declared const (in addition to the characters they
// point at) so that the table cannot be re-pointed at runtime.
const char* const kCustomPatterns[] = {
    "(\\bCell ID: ')([0-9a-fA-F]+)(')",                  // ModemManager
    "(\\bLocation area code: ')([0-9a-fA-F]+)(')",       // ModemManager
    "(?i-s)(\\bssid[= ]')(.+)(')",                       // wpa_supplicant
    "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()",  // wpa_supplicant
    "(?-s)(\\[SSID=)(.+?)(\\])",                         // shill
};
45 | |
46 } // namespace | |
47 | |
48 AnonymizerTool::AnonymizerTool() | |
49 : custom_patterns_(arraysize(kCustomPatterns)) {} | |
50 | |
51 AnonymizerTool::~AnonymizerTool() {} | |
52 | |
53 std::string AnonymizerTool::Anonymize(const std::string& input) { | |
54 std::string anonymized = AnonymizeMACAddresses(input); | |
55 anonymized = AnonymizeCustomPatterns(anonymized); | |
vasilii
2015/12/17 12:40:36
Looking at the implementation you should take the
battre
2015/12/17 13:24:24
I don't see that this would save much.
vasilii
2015/12/17 13:39:05
You 100% save one copy of |anonymized|. It can be
battre
2015/12/17 14:34:55
Done.
| |
56 return anonymized; | |
57 } | |
58 | |
59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { | |
60 // This regular expression finds the next MAC address. It splits the data into | |
61 // a section preceding the MAC address, an OUI (Organizationally Unique | |
62 // Identifier) part and a NIC (Network Interface Controller) specific part. | |
63 | |
64 RE2::Options options; | |
65 // set_multiline of pcre is not supported by RE2, yet. | |
66 options.set_dot_nl(true); // Dot matches a new line. | |
67 RE2 mac_re( | |
68 "(.*?)(" | |
69 "[0-9a-fA-F][0-9a-fA-F]:" | |
70 "[0-9a-fA-F][0-9a-fA-F]:" | |
71 "[0-9a-fA-F][0-9a-fA-F]):(" | |
72 "[0-9a-fA-F][0-9a-fA-F]:" | |
73 "[0-9a-fA-F][0-9a-fA-F]:" | |
74 "[0-9a-fA-F][0-9a-fA-F])", | |
75 options); | |
76 | |
77 std::string result; | |
78 result.reserve(input.size()); | |
79 | |
80 // Keep consuming, building up a result string as we go. | |
81 re2::StringPiece text(input); | |
82 std::string pre_mac, oui, nic; | |
83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui), | |
84 RE2::Arg(&nic))) { | |
85 // Look up the MAC address in the hash. | |
86 oui = base::ToLowerASCII(oui); | |
87 nic = base::ToLowerASCII(nic); | |
88 std::string mac = oui + ":" + nic; | |
89 std::string replacement_mac = mac_addresses_[mac]; | |
vasilii
2015/12/17 12:40:37
I'd use a reference here.
battre
2015/12/17 13:24:24
Are you suggesting to the following?
std::string&
| |
90 if (replacement_mac.empty()) { | |
91 // If not found, build up a replacement MAC address by generating a new | |
92 // NIC part. | |
93 int mac_id = mac_addresses_.size(); | |
94 replacement_mac = base::StringPrintf( | |
95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16, | |
96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); | |
97 mac_addresses_[mac] = replacement_mac; | |
98 } | |
99 | |
100 result += pre_mac; | |
101 result += replacement_mac; | |
102 } | |
103 | |
104 return result + text.as_string(); | |
vasilii
2015/12/17 12:40:37
You should make it more effective by using operato
battre
2015/12/17 13:24:24
I have followed the StringPiece::AppendToString id
| |
105 } | |
106 | |
107 std::string AnonymizerTool::AnonymizeCustomPatterns(const std::string& input) { | |
108 std::string anonymized = input; | |
109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { | |
110 anonymized = AnonymizeCustomPattern(anonymized, kCustomPatterns[i], | |
111 &custom_patterns_[i]); | |
112 } | |
113 return anonymized; | |
114 } | |
115 | |
116 // static | |
117 std::string AnonymizerTool::AnonymizeCustomPattern( | |
118 const std::string& input, | |
119 const std::string& pattern, | |
120 std::map<std::string, std::string>* identifier_space) { | |
121 RE2::Options options; | |
122 // set_multiline of pcre is not supported by RE2, yet. | |
123 options.set_dot_nl(true); // Dot matches a new line. | |
124 RE2 re("(.*?)" + pattern, options); | |
125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); | |
126 | |
127 std::string result; | |
128 result.reserve(input.size()); | |
129 | |
130 // Keep consuming, building up a result string as we go. | |
131 re2::StringPiece text(input); | |
132 std::string pre_match, pre_matched_id, matched_id, post_matched_id; | |
133 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), | |
134 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id), | |
135 RE2::Arg(&post_matched_id))) { | |
136 std::string replacement_id = (*identifier_space)[matched_id]; | |
vasilii
2015/12/17 12:40:37
I'd use reference here too.
battre
2015/12/17 13:24:23
see above.
| |
137 if (replacement_id.empty()) { | |
138 replacement_id = base::IntToString(identifier_space->size()); | |
139 (*identifier_space)[matched_id] = replacement_id; | |
140 } | |
141 | |
142 result += pre_match; | |
143 result += pre_matched_id; | |
144 result += replacement_id; | |
145 result += post_matched_id; | |
146 } | |
147 result += text.as_string(); | |
vasilii
2015/12/17 12:40:37
Use StringPiece::AppendToString.
battre
2015/12/17 13:24:24
Done.
| |
148 return result; | |
149 } | |
150 | |
151 } // namespace feedback | |
OLD | NEW |