Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/feedback/anonymizer_tool.h" | 5 #include "components/feedback/anonymizer_tool.h" |
| 6 | 6 |
| 7 #include <base/strings/string_number_conversions.h> | 7 #include <base/strings/string_number_conversions.h> |
| 8 #include <base/strings/string_util.h> | 8 #include <base/strings/string_util.h> |
| 9 #include <base/strings/stringprintf.h> | 9 #include <base/strings/stringprintf.h> |
| 10 | 10 |
| 11 #include "third_party/re2/re2/re2.h" | 11 #include "third_party/re2/re2/re2.h" |
| 12 | 12 |
| 13 using re2::RE2; | 13 using re2::RE2; |
| 14 | 14 |
| 15 namespace feedback { | 15 namespace feedback { |
| 16 | 16 |
| 17 namespace { | 17 namespace { |
| 18 | 18 |
| 19 // The |kCustomPatterns| array defines patterns to match and anonymize. Each | 19 // The |kCustomPatternsWithContext| array defines patterns to match and |
| 20 // pattern needs to define three capturing parentheses groups: | 20 // anonymize. Each pattern needs to define three capturing parentheses groups: |
| 21 // | 21 // |
| 22 // - a group for the pattern before the identifier to be anonymized; | 22 // - a group for the pattern before the identifier to be anonymized; |
| 23 // - a group for the identifier to be anonymized; | 23 // - a group for the identifier to be anonymized; |
| 24 // - a group for the pattern after the identifier to be anonymized. | 24 // - a group for the pattern after the identifier to be anonymized. |
| 25 // | 25 // |
| 26 // The first and the last capture group are the origin of the "WithContext" | |
| 27 // suffix in the name of this constant. | |
| 28 // | |
| 26 // Every matched identifier (in the context of the whole pattern) is anonymized | 29 // Every matched identifier (in the context of the whole pattern) is anonymized |
| 27 // by replacing it with an incremental instance identifier. Every different | 30 // by replacing it with an incremental instance identifier. Every different |
| 28 // pattern defines a separate instance identifier space. See the unit test for | 31 // pattern defines a separate instance identifier space. See the unit test for |
| 29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. | 32 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. |
| 30 // | 33 // |
| 31 // Useful regular expression syntax: | 34 // Useful regular expression syntax: |
| 32 // | 35 // |
| 33 // +? is a non-greedy (lazy) +. | 36 // +? is a non-greedy (lazy) +. |
| 34 // \b matches a word boundary. | 37 // \b matches a word boundary. |
| 35 // (?i) turns on case insensitivity for the remainder of the regex. | 38 // (?i) turns on case insensitivity for the remainder of the regex. |
| 36 // (?-s) turns off "dot matches newline" for the remainder of the regex. | 39 // (?-s) turns off "dot matches newline" for the remainder of the regex. |
| 37 // (?:regex) denotes non-capturing parentheses group. | 40 // (?:regex) denotes non-capturing parentheses group. |
| 38 const char* kCustomPatterns[] = { | 41 const char* kCustomPatternsWithContext[] = { |
| 39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager | 42 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager |
| 40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager | 43 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager |
| 41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant | 44 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant |
| 42 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant | 45 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant |
| 43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill | 46 "(?-s)(\\[SSID=)(.+?)(\\])", // shill |
| 44 }; | 47 }; |
| 45 | 48 |
| 49 // Helper macro: Non-capturing group | |
| 50 #define NCG(x) "(?:" x ")" | |
| 51 // Helper macro: Optional non-capturing group | |
| 52 #define OPT_NCG(x) NCG(x) "?" | |
| 53 | |
| 54 ////////////////////////////////////////////////////////////////////////// | |
| 55 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial | |
| 56 // limitation on the scheme to increase precision. Otherwise anything | |
| 57 // like "ID:" would be considered an IRI. | |
| 58 | |
| 59 #define UNRESERVED "[-a-z0-9._~]" | |
| 60 #define RESERVED NGC(GEN_DELIMS "|" SUB_DELIMS) | |
|
vasilii
2015/12/22 12:31:42
I have never used a macro inside a macro. Does it work as
battre
2016/01/08 14:14:32
Yes, this is just string replacements.
| |
| 61 #define SUB_DELIMS "[!$&'()*+,;=]" | |
| 62 #define GEN_DELIMS "[:/?#[\\]@]" | |
| 63 | |
| 64 #define DIGIT "[0-9]" | |
| 65 #define HEXDIG "[0-9a-f]" | |
| 66 | |
| 67 #define PCT_ENCODED "%" HEXDIG HEXDIG | |
| 68 | |
| 69 #define DEC_OCTET NCG("[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-9]") | |
| 70 | |
| 71 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET | |
| 72 | |
| 73 #define H16 NCG(HEXDIG) "{1,4}" | |
| 74 #define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS) | |
| 75 | |
| 76 #define IPV6ADDRESS NCG( \ | |
| 77 NCG(H16 ":") "{6}" LS32 "|" \ | |
| 78 "::" NCG(H16 ":") "{5}" LS32 "|" \ | |
| 79 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \ | |
| 80 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \ | |
| 81 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \ | |
| 82 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \ | |
| 83 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \ | |
| 84 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \ | |
| 85 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::") | |
| 86 | |
| 87 #define IPVFUTURE \ | |
| 88 "v" HEXDIG \ | |
| 89 "+" \ | |
| 90 "\\." NCG(UNRESERVED "|" SUB_DELIMS \ | |
| 91 "|" \ | |
| 92 ":") "+" | |
| 93 | |
| 94 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]" | |
| 95 | |
| 96 #define PORT DIGIT "*" | |
| 97 | |
| 98 // This is a deviation from RFC 3987: the scheme is restricted to a fixed list. | |
| 99 #define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android") | |
| 100 | |
| 101 #define IPRIVATE \ | |
| 102 "[" \ | |
| 103 "\\x{E000}-\\x{F8FF}" \ | |
| 104 "\\x{F0000}-\\x{FFFFD}" \ | |
| 105 "\\x{100000}-\\x{10FFFD}" \ | |
| 106 "]" | |
| 107 | |
| 108 #define UCSCHAR \ | |
| 109 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \ | |
| 110 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \ | |
| 111 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \ | |
| 112 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \ | |
| 113 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \ | |
| 114 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]" | |
| 115 | |
| 116 #define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR) | |
| 117 | |
| 118 #define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]") | |
| 119 #define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*" | |
| 120 #define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*" | |
| 121 | |
| 122 #define ISEGMENT IPCHAR "*" | |
| 123 #define ISEGMENT_NZ IPCHAR "+" | |
| 124 #define ISEGMENT_NZ_NC \ | |
| 125 NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ | |
| 126 "|" "@") "+" | |
| 127 | |
| 128 #define IPATH_EMPTY "" | |
| 129 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*" | |
| 130 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*" | |
| 131 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*") | |
| 132 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*" | |
| 133 | |
| 134 #define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \ | |
| 135 IPATH_ROOTLESS "|" IPATH_EMPTY) | |
| 136 | |
| 137 #define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*" | |
| 138 | |
| 139 #define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME) | |
| 140 #define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*" | |
| 141 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT) | |
| 142 | |
| 143 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
| 144 "|" IPATH_NOSCHEME "|" IPATH_EMPTY) | |
| 145 | |
| 146 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
| 147 | |
| 148 // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements | |
| 149 // that end with "Android:" for example are not considered a URL. | |
| 150 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
| 151 "|" IPATH_ROOTLESS) | |
| 152 | |
| 153 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("?" IQUERY) | |
| 154 | |
| 155 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
| 156 | |
| 157 #define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF) | |
| 158 | |
| 159 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email | |
| 160 // addresses. Capture names as well ("First Lastname" <foo@bar.com>). | |
| 161 | |
| 162 // The |kCustomPatternsWithoutContext| array defines further patterns to match | |
| 163 // and anonymize. Each pattern consists of a single capturing group. | |
| 164 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = { | |
| 165 {"URL", "(?i)(" IRI ")"}, | |
| 166 // Email Addresses need to come after URLs because they can be part | |
| 167 // of a query parameter. | |
| 168 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"}, | |
| 169 // IP filter rules need to come after URLs so that they don't disturb the | |
| 170 // URL pattern in case the IP address is part of a URL. | |
| 171 {"IPv4", "(?i)(" IPV4ADDRESS ")"}, | |
| 172 {"IPv6", "(?i)(" IPV6ADDRESS ")"}, | |
| 173 }; | |
| 174 | |
| 46 } // namespace | 175 } // namespace |
| 47 | 176 |
| 48 AnonymizerTool::AnonymizerTool() | 177 AnonymizerTool::AnonymizerTool() |
| 49 : custom_patterns_(arraysize(kCustomPatterns)) {} | 178 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)), |
| 179 custom_patterns_without_context_( | |
| 180 arraysize(kCustomPatternsWithoutContext)) {} | |
| 50 | 181 |
| 51 AnonymizerTool::~AnonymizerTool() {} | 182 AnonymizerTool::~AnonymizerTool() {} |
| 52 | 183 |
| 53 std::string AnonymizerTool::Anonymize(const std::string& input) { | 184 std::string AnonymizerTool::Anonymize(const std::string& input) { |
| 54 std::string anonymized = AnonymizeMACAddresses(input); | 185 std::string anonymized = AnonymizeMACAddresses(input); |
| 55 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); | 186 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); |
| 56 return anonymized; | 187 return anonymized; |
| 57 } | 188 } |
| 58 | 189 |
| 59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { | 190 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 99 | 230 |
| 100 result += pre_mac; | 231 result += pre_mac; |
| 101 result += replacement_mac; | 232 result += replacement_mac; |
| 102 } | 233 } |
| 103 | 234 |
| 104 text.AppendToString(&result); | 235 text.AppendToString(&result); |
| 105 return result; | 236 return result; |
| 106 } | 237 } |
| 107 | 238 |
| 108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { | 239 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { |
| 109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { | 240 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) { |
| 110 input = | 241 input = |
| 111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); | 242 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i], |
| 243 &custom_patterns_with_context_[i]); | |
| 244 } | |
| 245 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) { | |
| 246 input = AnonymizeCustomPatternWithoutContext( | |
| 247 input, kCustomPatternsWithoutContext[i], | |
| 248 &custom_patterns_without_context_[i]); | |
| 112 } | 249 } |
| 113 return input; | 250 return input; |
| 114 } | 251 } |
| 115 | 252 |
| 116 // static | 253 // static |
| 117 std::string AnonymizerTool::AnonymizeCustomPattern( | 254 std::string AnonymizerTool::AnonymizeCustomPatternWithContext( |
| 118 const std::string& input, | 255 const std::string& input, |
| 119 const std::string& pattern, | 256 const std::string& pattern, |
| 120 std::map<std::string, std::string>* identifier_space) { | 257 std::map<std::string, std::string>* identifier_space) { |
| 121 RE2::Options options; | 258 RE2::Options options; |
| 122 // set_multiline of pcre is not supported by RE2, yet. | 259 // set_multiline of pcre is not supported by RE2, yet. |
| 123 options.set_dot_nl(true); // Dot matches a new line. | 260 options.set_dot_nl(true); // Dot matches a new line. |
| 124 RE2 re("(.*?)" + pattern, options); | 261 RE2 re("(.*?)" + pattern, options); |
| 125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); | 262 DCHECK_EQ(4, re.NumberOfCapturingGroups()); |
| 126 | 263 |
| 127 std::string result; | 264 std::string result; |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 141 | 278 |
| 142 result += pre_match; | 279 result += pre_match; |
| 143 result += pre_matched_id; | 280 result += pre_matched_id; |
| 144 result += replacement_id; | 281 result += replacement_id; |
| 145 result += post_matched_id; | 282 result += post_matched_id; |
| 146 } | 283 } |
| 147 text.AppendToString(&result); | 284 text.AppendToString(&result); |
| 148 return result; | 285 return result; |
| 149 } | 286 } |
| 150 | 287 |
| 288 // static | |
| 289 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext( | |
| 290 const std::string& input, | |
| 291 const CustomPatternWithoutContext& pattern, | |
| 292 std::map<std::string, std::string>* identifier_space) { | |
| 293 RE2::Options options; | |
| 294 // set_multiline of pcre is not supported by RE2, yet. | |
| 295 options.set_dot_nl(true); // Dot matches a new line. | |
| 296 RE2 re(std::string("(.*?)") + pattern.pattern, options); | |
| 297 DCHECK_EQ(re2::RE2::NoError, re.error_code()) | |
| 298 << "Failed to parse:\n" << pattern.pattern << "\n" << re.error(); | |
| 299 DCHECK_EQ(2, re.NumberOfCapturingGroups()); | |
| 300 | |
| 301 std::string result; | |
| 302 result.reserve(input.size()); | |
| 303 | |
| 304 // Keep consuming, building up a result string as we go. | |
| 305 re2::StringPiece text(input); | |
| 306 std::string pre_match, matched_id; | |
| 307 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), RE2::Arg(&matched_id))) { | |
| 308 std::string replacement_id = (*identifier_space)[matched_id]; | |
| 309 if (replacement_id.empty()) { | |
| 310 replacement_id = base::StringPrintf("<%s: %zu>", pattern.alias, | |
| 311 identifier_space->size()); | |
| 312 (*identifier_space)[matched_id] = replacement_id; | |
| 313 } | |
| 314 | |
| 315 result += pre_match; | |
| 316 result += replacement_id; | |
| 317 } | |
| 318 text.AppendToString(&result); | |
| 319 return result; | |
| 320 } | |
| 321 | |
| 151 } // namespace feedback | 322 } // namespace feedback |
| OLD | NEW |