Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(212)

Side by Side Diff: components/feedback/anonymizer_tool.cc

Issue 1543633003: Added anonymization patterns for URLs and email addresses (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@bug-567870-introduce-anonymizer
Patch Set: Some more tests Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/feedback/anonymizer_tool.h" 5 #include "components/feedback/anonymizer_tool.h"
6 6
7 #include <base/strings/string_number_conversions.h> 7 #include <base/strings/string_number_conversions.h>
8 #include <base/strings/string_util.h> 8 #include <base/strings/string_util.h>
9 #include <base/strings/stringprintf.h> 9 #include <base/strings/stringprintf.h>
10 10
11 #include "third_party/re2/re2/re2.h" 11 #include "third_party/re2/re2/re2.h"
12 12
13 using re2::RE2; 13 using re2::RE2;
14 14
15 namespace feedback { 15 namespace feedback {
16 16
17 namespace { 17 namespace {
18 18
19 // The |kCustomPatterns| array defines patterns to match and anonymize. Each 19 // The |kCustomPatternsWithContext| array defines patterns to match and
20 // pattern needs to define three capturing parentheses groups: 20 // anonymize. Each pattern needs to define three capturing parentheses groups:
21 // 21 //
22 // - a group for the pattern before the identifier to be anonymized; 22 // - a group for the pattern before the identifier to be anonymized;
23 // - a group for the identifier to be anonymized; 23 // - a group for the identifier to be anonymized;
24 // - a group for the pattern after the identifier to be anonymized. 24 // - a group for the pattern after the identifier to be anonymized.
25 // 25 //
26 // The first and the last capture group are the origin of the "WithContext"
27 // suffix in the name of this constant.
28 //
26 // Every matched identifier (in the context of the whole pattern) is anonymized 29 // Every matched identifier (in the context of the whole pattern) is anonymized
27 // by replacing it with an incremental instance identifier. Every different 30 // by replacing it with an incremental instance identifier. Every different
28 // pattern defines a separate instance identifier space. See the unit test for 31 // pattern defines a separate instance identifier space. See the unit test for
29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. 32 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.
30 // 33 //
31 // Useful regular expression syntax: 34 // Useful regular expression syntax:
32 // 35 //
33 // +? is a non-greedy (lazy) +. 36 // +? is a non-greedy (lazy) +.
34 // \b matches a word boundary. 37 // \b matches a word boundary.
35 // (?i) turns on case insensitivity for the remainder of the regex. 38 // (?i) turns on case insensitivity for the remainder of the regex.
36 // (?-s) turns off "dot matches newline" for the remainder of the regex. 39 // (?-s) turns off "dot matches newline" for the remainder of the regex.
37 // (?:regex) denotes non-capturing parentheses group. 40 // (?:regex) denotes non-capturing parentheses group.
38 const char* kCustomPatterns[] = { 41 const char* kCustomPatternsWithContext[] = {
39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager 42 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager
40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager 43 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager
41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant 44 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant
42 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant 45 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant
43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill 46 "(?-s)(\\[SSID=)(.+?)(\\])", // shill
44 }; 47 };
45 48
49 // Helper macro: Non capturing group
50 #define NCG(x) "(?:" x ")"
51 // Helper macro: Optional non capturing group
52 #define OPT_NCG(x) NCG(x) "?"
53
54 //////////////////////////////////////////////////////////////////////////
55 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial
56 // limitation on the scheme to increase precision. Otherwise anything
57 // like "ID:" would be considered an IRI.
58
59 #define UNRESERVED "[-a-z0-9._~]"
60 #define RESERVED NGC(GEN_DELIMS "|" SUB_DELIMS)
vasilii 2015/12/22 12:31:42 I never used a macro inside a macro. Does it work as…
battre 2016/01/08 14:14:32 Yes, this is just string replacements.
61 #define SUB_DELIMS "[!$&'()*+,;=]"
62 #define GEN_DELIMS "[:/?#[\\]@]"
63
64 #define DIGIT "[0-9]"
65 #define HEXDIG "[0-9a-f]"
66
67 #define PCT_ENCODED "%" HEXDIG HEXDIG
68
69 #define DEC_OCTET NCG("[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-9]")
70
71 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET
72
73 #define H16 NCG(HEXDIG) "{1,4}"
74 #define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS)
75
76 #define IPV6ADDRESS NCG( \
77 NCG(H16 ":") "{6}" LS32 "|" \
78 "::" NCG(H16 ":") "{5}" LS32 "|" \
79 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \
80 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \
81 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \
82 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \
83 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \
84 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \
85 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")
86
87 #define IPVFUTURE \
88 "v" HEXDIG \
89 "+" \
90 "\\." NCG(UNRESERVED "|" SUB_DELIMS \
91 "|" \
92 ":") "+"
93
94 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]"
95
96 #define PORT DIGIT "*"
97
98 // This is a deviation from RFC 3987
99 #define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android")
100
101 #define IPRIVATE \
102 "[" \
103 "\\x{E000}-\\x{F8FF}" \
104 "\\x{F0000}-\\x{FFFFD}" \
105 "\\x{100000}-\\x{10FFFD}" \
106 "]"
107
108 #define UCSCHAR \
109 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \
110 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \
111 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \
112 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \
113 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \
114 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"
115
116 #define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR)
117
118 #define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]")
119 #define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*"
120 #define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*"
121
122 #define ISEGMENT IPCHAR "*"
123 #define ISEGMENT_NZ IPCHAR "+"
124 #define ISEGMENT_NZ_NC \
125 NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \
126 "|" "@") "+"
127
128 #define IPATH_EMPTY ""
129 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"
130 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"
131 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")
132 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"
133
134 #define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \
135 IPATH_ROOTLESS "|" IPATH_EMPTY)
136
137 #define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*"
138
139 #define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME)
140 #define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*"
141 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)
142
143 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
144 "|" IPATH_NOSCHEME "|" IPATH_EMPTY)
145
146 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("?" IQUERY) OPT_NCG("#" IFRAGMENT)
147
148 // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements
149 // that end with "Android:" for example are not considered a URL.
150 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
151 "|" IPATH_ROOTLESS)
152
153 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("?" IQUERY)
154
155 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)
156
157 #define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF)
158
159 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email
160 // addresses. Capture names as well ("First Lastname" <foo@bar.com>).
161
162 // The |kCustomPatternsWithoutContext| array defines further patterns to
163 // match and anonymize. Each pattern consists of a single capturing group.
164 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {
165 {"URL", "(?i)(" IRI ")"},
166 // Email Addresses need to come after URLs because they can be part
167 // of a query parameter.
168 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},
169 // IP filter rules need to come after URLs so that they don't disturb the
170 // URL pattern in case the IP address is part of a URL.
171 {"IPv4", "(?i)(" IPV4ADDRESS ")"},
172 {"IPv6", "(?i)(" IPV6ADDRESS ")"},
173 };
174
46 } // namespace 175 } // namespace
47 176
48 AnonymizerTool::AnonymizerTool() 177 AnonymizerTool::AnonymizerTool()
49 : custom_patterns_(arraysize(kCustomPatterns)) {} 178 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),
179 custom_patterns_without_context_(
180 arraysize(kCustomPatternsWithoutContext)) {}
50 181
51 AnonymizerTool::~AnonymizerTool() {} 182 AnonymizerTool::~AnonymizerTool() {}
52 183
53 std::string AnonymizerTool::Anonymize(const std::string& input) { 184 std::string AnonymizerTool::Anonymize(const std::string& input) {
54 std::string anonymized = AnonymizeMACAddresses(input); 185 std::string anonymized = AnonymizeMACAddresses(input);
55 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); 186 anonymized = AnonymizeCustomPatterns(std::move(anonymized));
56 return anonymized; 187 return anonymized;
57 } 188 }
58 189
59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { 190 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
99 230
100 result += pre_mac; 231 result += pre_mac;
101 result += replacement_mac; 232 result += replacement_mac;
102 } 233 }
103 234
104 text.AppendToString(&result); 235 text.AppendToString(&result);
105 return result; 236 return result;
106 } 237 }
107 238
108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { 239 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {
109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { 240 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {
110 input = 241 input =
111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); 242 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],
243 &custom_patterns_with_context_[i]);
244 }
245 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {
246 input = AnonymizeCustomPatternWithoutContext(
247 input, kCustomPatternsWithoutContext[i],
248 &custom_patterns_without_context_[i]);
112 } 249 }
113 return input; 250 return input;
114 } 251 }
115 252
116 // static 253 // static
117 std::string AnonymizerTool::AnonymizeCustomPattern( 254 std::string AnonymizerTool::AnonymizeCustomPatternWithContext(
118 const std::string& input, 255 const std::string& input,
119 const std::string& pattern, 256 const std::string& pattern,
120 std::map<std::string, std::string>* identifier_space) { 257 std::map<std::string, std::string>* identifier_space) {
121 RE2::Options options; 258 RE2::Options options;
122 // set_multiline of pcre is not supported by RE2, yet. 259 // set_multiline of pcre is not supported by RE2, yet.
123 options.set_dot_nl(true); // Dot matches a new line. 260 options.set_dot_nl(true); // Dot matches a new line.
124 RE2 re("(.*?)" + pattern, options); 261 RE2 re("(.*?)" + pattern, options);
125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); 262 DCHECK_EQ(4, re.NumberOfCapturingGroups());
126 263
127 std::string result; 264 std::string result;
(...skipping 13 matching lines...) Expand all
141 278
142 result += pre_match; 279 result += pre_match;
143 result += pre_matched_id; 280 result += pre_matched_id;
144 result += replacement_id; 281 result += replacement_id;
145 result += post_matched_id; 282 result += post_matched_id;
146 } 283 }
147 text.AppendToString(&result); 284 text.AppendToString(&result);
148 return result; 285 return result;
149 } 286 }
150 287
288 // static
289 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(
290 const std::string& input,
291 const CustomPatternWithoutContext& pattern,
292 std::map<std::string, std::string>* identifier_space) {
293 RE2::Options options;
294 // set_multiline of pcre is not supported by RE2, yet.
295 options.set_dot_nl(true); // Dot matches a new line.
296 RE2 re(std::string("(.*?)") + pattern.pattern, options);
297 DCHECK_EQ(re2::RE2::NoError, re.error_code())
298 << "Failed to parse:\n" << pattern.pattern << "\n" << re.error();
299 DCHECK_EQ(2, re.NumberOfCapturingGroups());
300
301 std::string result;
302 result.reserve(input.size());
303
304 // Keep consuming, building up a result string as we go.
305 re2::StringPiece text(input);
306 std::string pre_match, matched_id;
307 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), RE2::Arg(&matched_id))) {
308 std::string replacement_id = (*identifier_space)[matched_id];
309 if (replacement_id.empty()) {
310 replacement_id = base::StringPrintf("<%s: %zu>", pattern.alias,
311 identifier_space->size());
312 (*identifier_space)[matched_id] = replacement_id;
313 }
314
315 result += pre_match;
316 result += replacement_id;
317 }
318 text.AppendToString(&result);
319 return result;
320 }
321
151 } // namespace feedback 322 } // namespace feedback
OLD NEW

Powered by Google App Engine
This is Rietveld 408576698