OLD | NEW |
---|---|
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/feedback/anonymizer_tool.h" | 5 #include "components/feedback/anonymizer_tool.h" |
6 | 6 |
7 #include <base/strings/string_number_conversions.h> | 7 #include <base/strings/string_number_conversions.h> |
8 #include <base/strings/string_util.h> | 8 #include <base/strings/string_util.h> |
9 #include <base/strings/stringprintf.h> | 9 #include <base/strings/stringprintf.h> |
10 | 10 |
11 #include "third_party/re2/re2/re2.h" | 11 #include "third_party/re2/re2/re2.h" |
12 | 12 |
13 using re2::RE2; | 13 using re2::RE2; |
14 | 14 |
15 namespace feedback { | 15 namespace feedback { |
16 | 16 |
17 namespace { | 17 namespace { |
18 | 18 |
19 // The |kCustomPatterns| array defines patterns to match and anonymize. Each | 19 // The |kCustomPatternsWithContext| array defines patterns to match and |
20 // pattern needs to define three capturing parentheses groups: | 20 // anonymize. Each pattern needs to define three capturing parentheses groups: |
21 // | 21 // |
22 // - a group for the pattern before the identifier to be anonymized; | 22 // - a group for the pattern before the identifier to be anonymized; |
23 // - a group for the identifier to be anonymized; | 23 // - a group for the identifier to be anonymized; |
24 // - a group for the pattern after the identifier to be anonymized. | 24 // - a group for the pattern after the identifier to be anonymized. |
25 // | 25 // |
26 // The first and the last capture group are the origin of the "WithContext" | |
27 // suffix in the name of this constant. | |
28 // | |
26 // Every matched identifier (in the context of the whole pattern) is anonymized | 29 // Every matched identifier (in the context of the whole pattern) is anonymized |
27 // by replacing it with an incremental instance identifier. Every different | 30 // by replacing it with an incremental instance identifier. Every different |
28 // pattern defines a separate instance identifier space. See the unit test for | 31 // pattern defines a separate instance identifier space. See the unit test for |
29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. | 32 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. |
30 // | 33 // |
31 // Useful regular expression syntax: | 34 // Useful regular expression syntax: |
32 // | 35 // |
33 // +? is a non-greedy (lazy) +. | 36 // +? is a non-greedy (lazy) +. |
34 // \b matches a word boundary. | 37 // \b matches a word boundary. |
35 // (?i) turns on case insensitivity for the remainder of the regex. | 38 // (?i) turns on case insensitivity for the remainder of the regex. |
36 // (?-s) turns off "dot matches newline" for the remainder of the regex. | 39 // (?-s) turns off "dot matches newline" for the remainder of the regex. |
37 // (?:regex) denotes non-capturing parentheses group. | 40 // (?:regex) denotes non-capturing parentheses group. |
38 const char* kCustomPatterns[] = { | 41 const char* kCustomPatternsWithContext[] = { |
39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager | 42 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager |
40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager | 43 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager |
41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant | 44 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant |
42 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant | 45 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant |
43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill | 46 "(?-s)(\\[SSID=)(.+?)(\\])", // shill |
44 }; | 47 }; |
45 | 48 |
49 // Helper macro: non-capturing group | |
50 #define NCG(x) "(?:" x ")" | |
51 // Helper macro: optional non-capturing group | |
52 #define OPT_NCG(x) NCG(x) "?" | |
53 | |
54 ////////////////////////////////////////////////////////////////////////// | |
55 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial | |
56 // limitation on the scheme to increase precision. Otherwise anything | |
57 // like "ID:" would be considered an IRI. | |
58 | |
59 #define UNRESERVED "[-a-z0-9._~]" | |
60 #define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS) | |
vasilii
2015/12/22 12:31:42
I never used a macros in a macros. Does it work as
battre
2016/01/08 14:14:32
Yes, this is just string replacements.
| |
61 #define SUB_DELIMS "[!$&'()*+,;=]" | |
62 #define GEN_DELIMS "[:/?#[\\]@]" | |
63 | |
64 #define DIGIT "[0-9]" | |
65 #define HEXDIG "[0-9a-f]" | |
66 | |
67 #define PCT_ENCODED "%" HEXDIG HEXDIG | |
68 // Longest alternatives first: RE2 prefers the leftmost alternative that matches. | |
69 #define DEC_OCTET NCG("25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]") | |
70 | |
71 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET | |
72 | |
73 #define H16 NCG(HEXDIG) "{1,4}" | |
74 #define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS) | |
75 | |
76 #define IPV6ADDRESS NCG( \ | |
77 NCG(H16 ":") "{6}" LS32 "|" \ | |
78 "::" NCG(H16 ":") "{5}" LS32 "|" \ | |
79 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \ | |
80 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \ | |
81 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \ | |
82 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \ | |
83 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \ | |
84 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \ | |
85 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::") | |
86 | |
87 #define IPVFUTURE \ | |
88 "v" HEXDIG \ | |
89 "+" \ | |
90 "\\." NCG(UNRESERVED "|" SUB_DELIMS \ | |
91 "|" \ | |
92 ":") "+" | |
93 | |
94 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]" | |
95 | |
96 #define PORT DIGIT "*" | |
97 | |
98 // This is a deviation from RFC 3987: only a fixed set of schemes is accepted. | |
99 #define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android") | |
100 | |
101 #define IPRIVATE \ | |
102 "[" \ | |
103 "\\x{E000}-\\x{F8FF}" \ | |
104 "\\x{F0000}-\\x{FFFFD}" \ | |
105 "\\x{100000}-\\x{10FFFD}" \ | |
106 "]" | |
107 | |
108 #define UCSCHAR \ | |
109 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \ | |
110 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \ | |
111 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \ | |
112 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \ | |
113 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \ | |
114 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]" | |
115 | |
116 #define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR) | |
117 | |
118 #define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]") | |
119 #define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*" | |
120 #define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*" | |
121 | |
122 #define ISEGMENT IPCHAR "*" | |
123 #define ISEGMENT_NZ IPCHAR "+" | |
124 #define ISEGMENT_NZ_NC \ | |
125 NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ | |
126 "|" "@") "+" | |
127 | |
128 #define IPATH_EMPTY "" | |
129 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*" | |
130 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*" | |
131 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*") | |
132 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*" | |
133 | |
134 #define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \ | |
135 IPATH_ROOTLESS "|" IPATH_EMPTY) | |
136 | |
137 #define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*" | |
138 | |
139 #define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME) | |
140 #define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*" | |
141 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT) | |
142 | |
143 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
144 "|" IPATH_NOSCHEME "|" IPATH_EMPTY) | |
145 | |
146 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
147 | |
148 // RFC 3987 also allows IPATH_EMPTY here, but it is omitted so that statements | |
149 // that end with "Android:" for example are not considered a URL. | |
150 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
151 "|" IPATH_ROOTLESS) | |
152 | |
153 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) | |
154 | |
155 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
156 | |
157 #define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF) | |
158 | |
159 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email | |
160 // addresses. Capture names as well ("First Lastname" <foo@bar.com>). | |
161 | |
162 // The |kCustomPatternsWithoutContext| array defines further patterns to match | |
163 // and anonymize. Each pattern consists of a single capturing group. | |
164 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = { | |
165 {"URL", "(?i)(" IRI ")"}, | |
166 // Email Addresses need to come after URLs because they can be part | |
167 // of a query parameter. | |
168 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"}, | |
169 // IP filter rules need to come after URLs so that they don't disturb the | |
170 // URL pattern in case the IP address is part of a URL. | |
171 {"IPv4", "(?i)(" IPV4ADDRESS ")"}, | |
172 {"IPv6", "(?i)(" IPV6ADDRESS ")"}, | |
173 }; | |
174 | |
46 } // namespace | 175 } // namespace |
47 | 176 |
48 AnonymizerTool::AnonymizerTool() | 177 AnonymizerTool::AnonymizerTool() |
49 : custom_patterns_(arraysize(kCustomPatterns)) {} | 178 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)), |
179 custom_patterns_without_context_( | |
180 arraysize(kCustomPatternsWithoutContext)) {} | |
50 | 181 |
51 AnonymizerTool::~AnonymizerTool() {} | 182 AnonymizerTool::~AnonymizerTool() {} |
52 | 183 |
53 std::string AnonymizerTool::Anonymize(const std::string& input) { | 184 std::string AnonymizerTool::Anonymize(const std::string& input) { |
54 std::string anonymized = AnonymizeMACAddresses(input); | 185 std::string anonymized = AnonymizeMACAddresses(input); |
55 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); | 186 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); |
56 return anonymized; | 187 return anonymized; |
57 } | 188 } |
58 | 189 |
59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { | 190 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
99 | 230 |
100 result += pre_mac; | 231 result += pre_mac; |
101 result += replacement_mac; | 232 result += replacement_mac; |
102 } | 233 } |
103 | 234 |
104 text.AppendToString(&result); | 235 text.AppendToString(&result); |
105 return result; | 236 return result; |
106 } | 237 } |
107 | 238 |
108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { | 239 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { |
109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { | 240 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) { |
110 input = | 241 input = |
111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); | 242 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i], |
243 &custom_patterns_with_context_[i]); | |
244 } | |
245 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) { | |
246 input = AnonymizeCustomPatternWithoutContext( | |
247 input, kCustomPatternsWithoutContext[i], | |
248 &custom_patterns_without_context_[i]); | |
112 } | 249 } |
113 return input; | 250 return input; |
114 } | 251 } |
115 | 252 |
116 // static | 253 // static |
117 std::string AnonymizerTool::AnonymizeCustomPattern( | 254 std::string AnonymizerTool::AnonymizeCustomPatternWithContext( |
118 const std::string& input, | 255 const std::string& input, |
119 const std::string& pattern, | 256 const std::string& pattern, |
120 std::map<std::string, std::string>* identifier_space) { | 257 std::map<std::string, std::string>* identifier_space) { |
121 RE2::Options options; | 258 RE2::Options options; |
122 // set_multiline of pcre is not supported by RE2, yet. | 259 // set_multiline of pcre is not supported by RE2, yet. |
123 options.set_dot_nl(true); // Dot matches a new line. | 260 options.set_dot_nl(true); // Dot matches a new line. |
124 RE2 re("(.*?)" + pattern, options); | 261 RE2 re("(.*?)" + pattern, options); |
125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); | 262 DCHECK_EQ(4, re.NumberOfCapturingGroups()); |
126 | 263 |
127 std::string result; | 264 std::string result; |
(...skipping 13 matching lines...) Expand all Loading... | |
141 | 278 |
142 result += pre_match; | 279 result += pre_match; |
143 result += pre_matched_id; | 280 result += pre_matched_id; |
144 result += replacement_id; | 281 result += replacement_id; |
145 result += post_matched_id; | 282 result += post_matched_id; |
146 } | 283 } |
147 text.AppendToString(&result); | 284 text.AppendToString(&result); |
148 return result; | 285 return result; |
149 } | 286 } |
150 | 287 |
288 // static | |
289 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext( | |
290 const std::string& input, | |
291 const CustomPatternWithoutContext& pattern, | |
292 std::map<std::string, std::string>* identifier_space) { | |
293 RE2::Options options; | |
294 // set_multiline of pcre is not supported by RE2, yet. | |
295 options.set_dot_nl(true); // Dot matches a new line. | |
296 RE2 re(std::string("(.*?)") + pattern.pattern, options); | |
297 DCHECK_EQ(re2::RE2::NoError, re.error_code()) | |
298 << "Failed to parse:\n" << pattern.pattern << "\n" << re.error(); | |
299 DCHECK_EQ(2, re.NumberOfCapturingGroups()); | |
300 | |
301 std::string result; | |
302 result.reserve(input.size()); | |
303 | |
304 // Keep consuming, building up a result string as we go. | |
305 re2::StringPiece text(input); | |
306 std::string pre_match, matched_id; | |
307 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), RE2::Arg(&matched_id))) { | |
308 std::string replacement_id = (*identifier_space)[matched_id]; | |
309 if (replacement_id.empty()) { | |
310 replacement_id = base::StringPrintf("<%s: %zu>", pattern.alias, | |
311 identifier_space->size()); | |
312 (*identifier_space)[matched_id] = replacement_id; | |
313 } | |
314 | |
315 result += pre_match; | |
316 result += replacement_id; | |
317 } | |
318 text.AppendToString(&result); | |
319 return result; | |
320 } | |
321 | |
151 } // namespace feedback | 322 } // namespace feedback |
OLD | NEW |