OLD | NEW |
---|---|
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/feedback/anonymizer_tool.h" | 5 #include "components/feedback/anonymizer_tool.h" |
6 | 6 |
7 #include <base/strings/string_number_conversions.h> | 7 #include <utility> |
8 #include <base/strings/string_util.h> | 8 |
9 #include <base/strings/stringprintf.h> | 9 #include "base/strings/string_number_conversions.h" |
10 #include "base/strings/string_util.h" | |
11 #include "base/strings/stringprintf.h" | |
10 | 12 |
11 #include "third_party/re2/src/re2/re2.h" | 13 #include "third_party/re2/src/re2/re2.h" |
12 | 14 |
13 using re2::RE2; | 15 using re2::RE2; |
14 | 16 |
15 namespace feedback { | 17 namespace feedback { |
16 | 18 |
17 namespace { | 19 namespace { |
18 | 20 |
19 // The |kCustomPatterns| array defines patterns to match and anonymize. Each | 21 // The |kCustomPatternsWithContext| array defines patterns to match and |
20 // pattern needs to define three capturing parentheses groups: | 22 // anonymize. Each pattern needs to define three capturing parentheses groups: |
21 // | 23 // |
22 // - a group for the pattern before the identifier to be anonymized; | 24 // - a group for the pattern before the identifier to be anonymized; |
23 // - a group for the identifier to be anonymized; | 25 // - a group for the identifier to be anonymized; |
24 // - a group for the pattern after the identifier to be anonymized. | 26 // - a group for the pattern after the identifier to be anonymized. |
25 // | 27 // |
28 // The first and the last capture group are the origin of the "WithContext" | |
29 // suffix in the name of this constant. | |
30 // | |
26 // Every matched identifier (in the context of the whole pattern) is anonymized | 31 // Every matched identifier (in the context of the whole pattern) is anonymized |
27 // by replacing it with an incremental instance identifier. Every different | 32 // by replacing it with an incremental instance identifier. Every different |
28 // pattern defines a separate instance identifier space. See the unit test for | 33 // pattern defines a separate instance identifier space. See the unit test for |
29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. | 34 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. |
30 // | 35 // |
31 // Useful regular expression syntax: | 36 // Useful regular expression syntax: |
32 // | 37 // |
33 // +? is a non-greedy (lazy) +. | 38 // +? is a non-greedy (lazy) +. |
34 // \b matches a word boundary. | 39 // \b matches a word boundary. |
35 // (?i) turns on case insensitivity for the remainder of the regex. | 40 // (?i) turns on case insensitivity for the remainder of the regex. |
36 // (?-s) turns off "dot matches newline" for the remainder of the regex. | 41 // (?-s) turns off "dot matches newline" for the remainder of the regex. |
37 // (?:regex) denotes non-capturing parentheses group. | 42 // (?:regex) denotes non-capturing parentheses group. |
38 const char* kCustomPatterns[] = { | 43 const char* kCustomPatternsWithContext[] = { |
39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager | 44 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager |
40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager | 45 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager |
41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant | 46 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant |
42 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant | 47 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant |
43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill | 48 "(?-s)(\\[SSID=)(.+?)(\\])", // shill |
44 }; | 49 }; |
45 | 50 |
51 // Helper macro: Non capturing group | |
52 #define NCG(x) "(?:" x ")" | |
53 // Helper macro: Optional non capturing group | |
54 #define OPT_NCG(x) NCG(x) "?" | |
55 | |
56 ////////////////////////////////////////////////////////////////////////// | |
57 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial | |
58 // limitation on the scheme to increase precision. Otherwise anything | |
59 // like "ID:" would be considered an IRI. | |
60 | |
61 #define UNRESERVED "[-a-z0-9._~]" | |
62 #define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS) | |
63 #define SUB_DELIMS "[!$&'()*+,;=]" | |
64 #define GEN_DELIMS "[:/?#[\\]@]" | |
65 | |
66 #define DIGIT "[0-9]" | |
67 #define HEXDIG "[0-9a-f]" | |
68 | |
69 #define PCT_ENCODED "%" HEXDIG HEXDIG | |
70 | |
// Matches one decimal octet (0-255). Longer alternatives come first because | |
// RE2 prefers the leftmost alternative; with "[0-9]" first, a trailing octet | |
// such as "100" would only match its first digit. | |
71 #define DEC_OCTET NCG("1[0-9][0-9]|2[0-4][0-9]|25[0-5]|[1-9][0-9]|[0-9]") | |
72 | |
73 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET | |
74 | |
75 #define H16 NCG(HEXDIG) "{1,4}" | |
76 #define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS) | |
77 | |
78 #define IPV6ADDRESS NCG( \ | |
79 NCG(H16 ":") "{6}" LS32 "|" \ | |
80 "::" NCG(H16 ":") "{5}" LS32 "|" \ | |
81 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \ | |
82 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \ | |
83 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \ | |
84 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \ | |
85 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \ | |
86 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \ | |
87 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::") | |
88 | |
89 #define IPVFUTURE \ | |
90 "v" HEXDIG \ | |
91 "+" \ | |
92 "\\." NCG(UNRESERVED "|" SUB_DELIMS \ | |
93 "|" \ | |
94 ":") "+" | |
95 | |
96 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]" | |
97 | |
98 #define PORT DIGIT "*" | |
99 | |
100 // This deviates from RFC 3987: only a fixed set of schemes is accepted. | |
101 #define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android") | |
102 | |
103 #define IPRIVATE \ | |
104 "[" \ | |
105 "\\x{E000}-\\x{F8FF}" \ | |
106 "\\x{F0000}-\\x{FFFFD}" \ | |
107 "\\x{100000}-\\x{10FFFD}" \ | |
108 "]" | |
109 | |
110 #define UCSCHAR \ | |
111 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \ | |
112 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \ | |
113 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \ | |
114 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \ | |
115 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \ | |
116 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]" | |
117 | |
118 #define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR) | |
119 | |
120 #define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]") | |
121 #define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*" | |
122 #define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*" | |
123 | |
124 #define ISEGMENT IPCHAR "*" | |
125 #define ISEGMENT_NZ IPCHAR "+" | |
126 #define ISEGMENT_NZ_NC \ | |
127 NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ | |
128 "|" "@") "+" | |
129 | |
130 #define IPATH_EMPTY "" | |
131 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*" | |
132 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*" | |
133 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*") | |
134 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*" | |
135 | |
136 #define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \ | |
137 IPATH_ROOTLESS "|" IPATH_EMPTY) | |
138 | |
139 #define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*" | |
140 | |
141 #define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME) | |
142 #define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*" | |
143 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT) | |
144 | |
145 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
146 "|" IPATH_NOSCHEME "|" IPATH_EMPTY) | |
147 | |
148 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
149 | |
150 // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements | |
151 // that end with "Android:" for example are not considered a URL. | |
152 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
153 "|" IPATH_ROOTLESS) | |
154 | |
155 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) | |
156 | |
157 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
158 | |
159 #define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF) | |
160 | |
161 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email | |
162 // addresses. Capture names as well ("First Lastname" <foo@bar.com>). | |
163 | |
164 // The |kCustomPatternsWithoutContext| array defines further patterns to match | |
165 // and anonymize. Each pattern consists of a single capturing group. | |
166 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = { | |
167 {"URL", "(?i)(" IRI ")"}, | |
168 // Email Addresses need to come after URLs because they can be part | |
169 // of a query parameter. | |
170 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"}, | |
171 // IP filter rules need to come after URLs so that they don't disturb the | |
172 // URL pattern in case the IP address is part of a URL. | |
173 {"IPv4", "(?i)(" IPV4ADDRESS ")"}, | |
174 {"IPv6", "(?i)(" IPV6ADDRESS ")"}, | |
175 }; | |
176 | |
177 // Functor template that allows calling a function with a set of parameters | |
178 // that get wrapped into an array. This is a variation of RE2's | |
179 // VariadicFunction2 with an extra parameter. | |
180 template <typename Result, typename Param0, typename Param1, typename Param2, | |
181 typename Arg, | |
182 Result (*Func)(Param0, Param1, Param2, Arg*[], int count)> | |
183 class VariadicFunction3 { | |
184 public: | |
185 Result operator()(Param0 p0, Param1 p1, Param2 p2) const { | |
186 return Func(p0, p1, p2, nullptr, 0); | |
187 } | |
188 | |
189 Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0) const { | |
190 Arg* args[] = {a0}; | |
191 return Func(p0, p1, p2, args, 1); | |
192 } | |
193 | |
194 Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0, Arg* a1) const { | |
195 Arg* args[] = {a0, a1}; | |
196 return Func(p0, p1, p2, args, 2); | |
197 } | |
198 | |
199 Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0, Arg* a1, Arg* a2) | |
200 const { | |
201 Arg* args[] = {a0, a1, a2}; | |
202 return Func(p0, p1, p2, args, 3); | |
vasilii
2016/01/08 17:02:26
I think you can collapse these definition using a
battre
2016/01/11 09:02:20
I don't even need this anymore once I use a variad
| |
203 } | |
204 }; | |
205 | |
206 // Like RE2's FindAndConsume, searches for the first occurrence of |pattern| in | |
207 // |input| and consumes the bytes until the end of the pattern matching. Unlike | |
208 // FindAndConsume, the bytes skipped before the match of |pattern| are stored | |
209 // in |skipped_input|. | |
210 bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input, | |
211 const re2::RE2& pattern, | |
212 re2::StringPiece* skipped_input, | |
213 re2::StringPiece* args[], | |
214 int argc) { | |
215 re2::StringPiece old_input = *input; | |
216 | |
217 re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr); | |
218 re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr); | |
219 re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr); | |
220 const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2}; | |
221 CHECK_LE(argc, 3); | |
222 | |
223 bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc); | |
224 | |
225 if (skipped_input && result && argc > 0) { | |
226 size_t bytes_skipped = args[0]->data() - old_input.data(); | |
227 *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped); | |
228 } | |
229 return result; | |
230 } | |
231 | |
232 const VariadicFunction3<bool, | |
233 re2::StringPiece*, | |
234 const re2::RE2&, | |
235 re2::StringPiece*, | |
236 re2::StringPiece, | |
237 &FindAndConsumeAndGetSkippedN> | |
238 FindAndConsumeAndGetSkipped = {}; | |
239 | |
46 } // namespace | 240 } // namespace |
47 | 241 |
48 AnonymizerTool::AnonymizerTool() | 242 AnonymizerTool::AnonymizerTool() |
49 : custom_patterns_(arraysize(kCustomPatterns)) {} | 243 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)), |
244 custom_patterns_without_context_( | |
245 arraysize(kCustomPatternsWithoutContext)) {} | |
50 | 246 |
51 AnonymizerTool::~AnonymizerTool() {} | 247 AnonymizerTool::~AnonymizerTool() {} |
52 | 248 |
53 std::string AnonymizerTool::Anonymize(const std::string& input) { | 249 std::string AnonymizerTool::Anonymize(const std::string& input) { |
54 std::string anonymized = AnonymizeMACAddresses(input); | 250 std::string anonymized = AnonymizeMACAddresses(input); |
55 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); | 251 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); |
56 return anonymized; | 252 return anonymized; |
57 } | 253 } |
58 | 254 |
255 RE2* AnonymizerTool::GetRegExp(const std::string& pattern) { | |
256 if (regexp_cache_.find(pattern) == regexp_cache_.end()) { | |
257 RE2::Options options; | |
258 // set_multiline of pcre is not supported by RE2, yet. | |
259 options.set_dot_nl(true); // Dot matches a new line. | |
260 scoped_ptr<RE2> re = make_scoped_ptr(new RE2(pattern, options)); | |
261 DCHECK_EQ(re2::RE2::NoError, re->error_code()) | |
262 << "Failed to parse:\n" << pattern << "\n" << re->error(); | |
263 regexp_cache_[pattern] = std::move(re); | |
264 } | |
265 return regexp_cache_[pattern].get(); | |
266 } | |
267 | |
59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { | 268 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { |
60 // This regular expression finds the next MAC address. It splits the data into | 269 // This regular expression finds the next MAC address. It splits the data into |
61 // a section preceding the MAC address, an OUI (Organizationally Unique | 270 // an OUI (Organizationally Unique Identifier) part and a NIC (Network |
62 // Identifier) part and a NIC (Network Interface Controller) specific part. | 271 // Interface Controller) specific part. |
63 | 272 |
64 RE2::Options options; | 273 RE2* mac_re = GetRegExp( |
65 // set_multiline of pcre is not supported by RE2, yet. | 274 "([0-9a-fA-F][0-9a-fA-F]:" |
66 options.set_dot_nl(true); // Dot matches a new line. | |
67 RE2 mac_re( | |
68 "(.*?)(" | |
69 "[0-9a-fA-F][0-9a-fA-F]:" | |
70 "[0-9a-fA-F][0-9a-fA-F]:" | 275 "[0-9a-fA-F][0-9a-fA-F]:" |
71 "[0-9a-fA-F][0-9a-fA-F]):(" | 276 "[0-9a-fA-F][0-9a-fA-F]):(" |
72 "[0-9a-fA-F][0-9a-fA-F]:" | 277 "[0-9a-fA-F][0-9a-fA-F]:" |
73 "[0-9a-fA-F][0-9a-fA-F]:" | 278 "[0-9a-fA-F][0-9a-fA-F]:" |
74 "[0-9a-fA-F][0-9a-fA-F])", | 279 "[0-9a-fA-F][0-9a-fA-F])"); |
75 options); | |
76 | 280 |
77 std::string result; | 281 std::string result; |
78 result.reserve(input.size()); | 282 result.reserve(input.size()); |
79 | 283 |
80 // Keep consuming, building up a result string as we go. | 284 // Keep consuming, building up a result string as we go. |
81 re2::StringPiece text(input); | 285 re2::StringPiece text(input); |
82 std::string pre_mac, oui, nic; | 286 re2::StringPiece skipped; |
83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui), | 287 re2::StringPiece pre_mac, oui, nic; |
84 RE2::Arg(&nic))) { | 288 while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) { |
85 // Look up the MAC address in the hash. | 289 // Look up the MAC address in the hash. |
86 oui = base::ToLowerASCII(oui); | 290 std::string oui_string = base::ToLowerASCII(oui.as_string()); |
87 nic = base::ToLowerASCII(nic); | 291 std::string nic_string = base::ToLowerASCII(nic.as_string()); |
88 std::string mac = oui + ":" + nic; | 292 std::string mac = oui_string + ":" + nic_string; |
89 std::string replacement_mac = mac_addresses_[mac]; | 293 std::string replacement_mac = mac_addresses_[mac]; |
90 if (replacement_mac.empty()) { | 294 if (replacement_mac.empty()) { |
91 // If not found, build up a replacement MAC address by generating a new | 295 // If not found, build up a replacement MAC address by generating a new |
92 // NIC part. | 296 // NIC part. |
93 int mac_id = mac_addresses_.size(); | 297 int mac_id = mac_addresses_.size(); |
94 replacement_mac = base::StringPrintf( | 298 replacement_mac = base::StringPrintf( |
95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16, | 299 "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16, |
96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); | 300 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); |
97 mac_addresses_[mac] = replacement_mac; | 301 mac_addresses_[mac] = replacement_mac; |
98 } | 302 } |
99 | 303 |
100 result += pre_mac; | 304 skipped.AppendToString(&result); |
101 result += replacement_mac; | 305 result += replacement_mac; |
102 } | 306 } |
103 | 307 |
104 text.AppendToString(&result); | 308 text.AppendToString(&result); |
105 return result; | 309 return result; |
106 } | 310 } |
107 | 311 |
312 | |
108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { | 313 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { |
109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { | 314 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) { |
110 input = | 315 input = |
111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); | 316 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i], |
317 &custom_patterns_with_context_[i]); | |
318 } | |
319 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) { | |
320 input = AnonymizeCustomPatternWithoutContext( | |
321 input, kCustomPatternsWithoutContext[i], | |
322 &custom_patterns_without_context_[i]); | |
112 } | 323 } |
113 return input; | 324 return input; |
114 } | 325 } |
115 | 326 |
116 // static | 327 std::string AnonymizerTool::AnonymizeCustomPatternWithContext( |
117 std::string AnonymizerTool::AnonymizeCustomPattern( | |
118 const std::string& input, | 328 const std::string& input, |
119 const std::string& pattern, | 329 const std::string& pattern, |
120 std::map<std::string, std::string>* identifier_space) { | 330 std::map<std::string, std::string>* identifier_space) { |
121 RE2::Options options; | 331 RE2* re = GetRegExp(pattern); |
122 // set_multiline of pcre is not supported by RE2, yet. | 332 DCHECK_EQ(3, re->NumberOfCapturingGroups()); |
123 options.set_dot_nl(true); // Dot matches a new line. | |
124 RE2 re("(.*?)" + pattern, options); | |
125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); | |
126 | 333 |
127 std::string result; | 334 std::string result; |
128 result.reserve(input.size()); | 335 result.reserve(input.size()); |
129 | 336 |
130 // Keep consuming, building up a result string as we go. | 337 // Keep consuming, building up a result string as we go. |
131 re2::StringPiece text(input); | 338 re2::StringPiece text(input); |
132 std::string pre_match, pre_matched_id, matched_id, post_matched_id; | 339 re2::StringPiece skipped; |
133 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), | 340 re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id; |
134 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id), | 341 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id, |
135 RE2::Arg(&post_matched_id))) { | 342 &matched_id, &post_matched_id)) { |
136 std::string replacement_id = (*identifier_space)[matched_id]; | 343 std::string matched_id_as_string = matched_id.as_string(); |
344 std::string replacement_id = (*identifier_space)[matched_id_as_string]; | |
137 if (replacement_id.empty()) { | 345 if (replacement_id.empty()) { |
138 replacement_id = base::IntToString(identifier_space->size()); | 346 replacement_id = base::IntToString(identifier_space->size()); |
139 (*identifier_space)[matched_id] = replacement_id; | 347 (*identifier_space)[matched_id_as_string] = replacement_id; |
140 } | 348 } |
141 | 349 |
142 result += pre_match; | 350 skipped.AppendToString(&result); |
143 result += pre_matched_id; | 351 pre_matched_id.AppendToString(&result); |
144 result += replacement_id; | 352 result += replacement_id; |
145 result += post_matched_id; | 353 post_matched_id.AppendToString(&result); |
146 } | 354 } |
147 text.AppendToString(&result); | 355 text.AppendToString(&result); |
148 return result; | 356 return result; |
357 } | |
358 | |
359 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext( | |
360 const std::string& input, | |
361 const CustomPatternWithoutContext& pattern, | |
362 std::map<std::string, std::string>* identifier_space) { | |
363 RE2* re = GetRegExp(pattern.pattern); | |
364 DCHECK_EQ(1, re->NumberOfCapturingGroups()); | |
365 | |
366 std::string result; | |
367 result.reserve(input.size()); | |
368 | |
369 // Keep consuming, building up a result string as we go. | |
370 re2::StringPiece text(input); | |
371 re2::StringPiece skipped; | |
372 re2::StringPiece matched_id; | |
373 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) { | |
374 std::string matched_id_as_string = matched_id.as_string(); | |
375 std::string replacement_id = (*identifier_space)[matched_id_as_string]; | |
376 if (replacement_id.empty()) { | |
377 // The weird Uint64toString trick is because Windows does not like to deal | |
378 // with %zu and a size_t in printf, nor does it support %llu. | |
379 replacement_id = base::StringPrintf( | |
380 "<%s: %s>", pattern.alias, | |
381 base::Uint64ToString(identifier_space->size()).c_str()); | |
382 (*identifier_space)[matched_id_as_string] = replacement_id; | |
383 } | |
384 | |
385 skipped.AppendToString(&result); | |
386 result += replacement_id; | |
387 } | |
388 text.AppendToString(&result); | |
389 return result; | |
149 } | 390 } |
150 | 391 |
151 } // namespace feedback | 392 } // namespace feedback |
OLD | NEW |