Chromium Code Reviews| OLD | NEW | 
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be | 
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. | 
| 4 | 4 | 
| 5 #include "components/feedback/anonymizer_tool.h" | 5 #include "components/feedback/anonymizer_tool.h" | 
| 6 | 6 | 
| 7 #include <base/strings/string_number_conversions.h> | 7 #include <utility> | 
| 8 #include <base/strings/string_util.h> | 8 | 
| 9 #include <base/strings/stringprintf.h> | 9 #include "base/strings/string_number_conversions.h" | 
| 10 #include "base/strings/string_util.h" | |
| 11 #include "base/strings/stringprintf.h" | |
| 10 | 12 | 
| 
 
tfarina
2016/01/11 15:25:31
forgot to remove this blank line.
 
 | |
| 11 #include "third_party/re2/src/re2/re2.h" | 13 #include "third_party/re2/src/re2/re2.h" | 
| 12 | 14 | 
| 13 using re2::RE2; | 15 using re2::RE2; | 
| 14 | 16 | 
| 15 namespace feedback { | 17 namespace feedback { | 
| 16 | 18 | 
| 17 namespace { | 19 namespace { | 
| 18 | 20 | 
| 19 // The |kCustomPatterns| array defines patterns to match and anonymize. Each | 21 // The |kCustomPatternsWithContext| array defines patterns to match and | 
| 20 // pattern needs to define three capturing parentheses groups: | 22 // anonymize. Each pattern needs to define three capturing parentheses groups: | 
| 21 // | 23 // | 
| 22 // - a group for the pattern before the identifier to be anonymized; | 24 // - a group for the pattern before the identifier to be anonymized; | 
| 23 // - a group for the identifier to be anonymized; | 25 // - a group for the identifier to be anonymized; | 
| 24 // - a group for the pattern after the identifier to be anonymized. | 26 // - a group for the pattern after the identifier to be anonymized. | 
| 25 // | 27 // | 
| 28 // The first and the last capture group are the origin of the "WithContext" | |
| 29 // suffix in the name of this constant. | |
| 30 // | |
| 26 // Every matched identifier (in the context of the whole pattern) is anonymized | 31 // Every matched identifier (in the context of the whole pattern) is anonymized | 
| 27 // by replacing it with an incremental instance identifier. Every different | 32 // by replacing it with an incremental instance identifier. Every different | 
| 28 // pattern defines a separate instance identifier space. See the unit test for | 33 // pattern defines a separate instance identifier space. See the unit test for | 
| 29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. | 34 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. | 
| 30 // | 35 // | 
| 31 // Useful regular expression syntax: | 36 // Useful regular expression syntax: | 
| 32 // | 37 // | 
| 33 // +? is a non-greedy (lazy) +. | 38 // +? is a non-greedy (lazy) +. | 
| 34 // \b matches a word boundary. | 39 // \b matches a word boundary. | 
| 35 // (?i) turns on case insensitivity for the remainder of the regex. | 40 // (?i) turns on case insensitivity for the remainder of the regex. | 
| 36 // (?-s) turns off "dot matches newline" for the remainder of the regex. | 41 // (?-s) turns off "dot matches newline" for the remainder of the regex. | 
| 37 // (?:regex) denotes non-capturing parentheses group. | 42 // (?:regex) denotes non-capturing parentheses group. | 
| 38 const char* kCustomPatterns[] = { | 43 const char* kCustomPatternsWithContext[] = { | 
| 39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager | 44 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager | 
| 40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager | 45 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager | 
| 41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant | 46 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant | 
| 42 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant | 47 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant | 
| 43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill | 48 "(?-s)(\\[SSID=)(.+?)(\\])", // shill | 
| 44 }; | 49 }; | 
| 45 | 50 | 
| 51 // Helper macro: Non capturing group | |
| 52 #define NCG(x) "(?:" x ")" | |
| 53 // Helper macro: Optional non capturing group | |
| 54 #define OPT_NCG(x) NCG(x) "?" | |
| 55 | |
| 56 ////////////////////////////////////////////////////////////////////////// | |
| 57 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial | |
| 58 // limitation on the scheme to increase precision. Otherwise anything | |
| 59 // like "ID:" would be considered an IRI. | |
| 60 | |
| 61 #define UNRESERVED "[-a-z0-9._~]" | |
| 62 #define RESERVED NGC(GEN_DELIMS "|" SUB_DELIMS) | |
| 63 #define SUB_DELIMS "[!$&'()*+,;=]" | |
| 64 #define GEN_DELIMS "[:/?#[\\]@]" | |
| 65 | |
| 66 #define DIGIT "[0-9]" | |
| 67 #define HEXDIG "[0-9a-f]" | |
| 68 | |
| 69 #define PCT_ENCODED "%" HEXDIG HEXDIG | |
| 70 | |
| 71 #define DEC_OCTET NCG("[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-9]") | |
| 72 | |
| 73 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET | |
| 74 | |
| 75 #define H16 NCG(HEXDIG) "{1,4}" | |
| 76 #define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS) | |
| 77 | |
| 78 #define IPV6ADDRESS NCG( \ | |
| 79 NCG(H16 ":") "{6}" LS32 "|" \ | |
| 80 "::" NCG(H16 ":") "{5}" LS32 "|" \ | |
| 81 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \ | |
| 82 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \ | |
| 83 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \ | |
| 84 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \ | |
| 85 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \ | |
| 86 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \ | |
| 87 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::") | |
| 88 | |
| 89 #define IPVFUTURE \ | |
| 90 "v" HEXDIG \ | |
| 91 "+" \ | |
| 92 "\\." NCG(UNRESERVED "|" SUB_DELIMS \ | |
| 93 "|" \ | |
| 94 ":") "+" | |
| 95 | |
| 96 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]" | |
| 97 | |
| 98 #define PORT DIGIT "*" | |
| 99 | |
| 100 // This is a deviation from RFC 3987: the scheme is limited to a fixed list. | |
| 101 #define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android") | |
| 102 | |
| 103 #define IPRIVATE \ | |
| 104 "[" \ | |
| 105 "\\x{E000}-\\x{F8FF}" \ | |
| 106 "\\x{F0000}-\\x{FFFFD}" \ | |
| 107 "\\x{100000}-\\x{10FFFD}" \ | |
| 108 "]" | |
| 109 | |
| 110 #define UCSCHAR \ | |
| 111 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \ | |
| 112 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \ | |
| 113 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \ | |
| 114 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \ | |
| 115 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \ | |
| 116 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]" | |
| 117 | |
| 118 #define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR) | |
| 119 | |
| 120 #define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]") | |
| 121 #define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*" | |
| 122 #define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*" | |
| 123 | |
| 124 #define ISEGMENT IPCHAR "*" | |
| 125 #define ISEGMENT_NZ IPCHAR "+" | |
| 126 #define ISEGMENT_NZ_NC \ | |
| 127 NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ | |
| 128 "|" "@") "+" | |
| 129 | |
| 130 #define IPATH_EMPTY "" | |
| 131 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*" | |
| 132 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*" | |
| 133 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*") | |
| 134 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*" | |
| 135 | |
| 136 #define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \ | |
| 137 IPATH_ROOTLESS "|" IPATH_EMPTY) | |
| 138 | |
| 139 #define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*" | |
| 140 | |
| 141 #define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME) | |
| 142 #define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*" | |
| 143 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT) | |
| 144 | |
| 145 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
| 146 "|" IPATH_NOSCHEME "|" IPATH_EMPTY) | |
| 147 | |
| 148 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
| 149 | |
| 150 // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements | |
| 151 // that end with "Android:" for example are not considered a URL. | |
| 152 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ | |
| 153 "|" IPATH_ROOTLESS) | |
| 154 | |
| 155 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("?" IQUERY) | |
| 156 | |
| 157 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) | |
| 158 | |
| 159 #define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF) | |
| 160 | |
| 161 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email | |
| 162 // addresses. Capture names as well ("First Lastname" <foo@bar.com>). | |
| 163 | |
| 164 // The |kCustomPatternsWithoutContext| array defines further patterns to match | |
| 165 // and anonymize. Each pattern consists of a single capturing group. | |
| 166 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = { | |
| 167 {"URL", "(?i)(" IRI ")"}, | |
| 168 // Email Addresses need to come after URLs because they can be part | |
| 169 // of a query parameter. | |
| 170 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"}, | |
| 171 // IP filter rules need to come after URLs so that they don't disturb the | |
| 172 // URL pattern in case the IP address is part of a URL. | |
| 173 {"IPv4", "(?i)(" IPV4ADDRESS ")"}, | |
| 174 {"IPv6", "(?i)(" IPV6ADDRESS ")"}, | |
| 175 }; | |
| 176 | |
| 177 // Like RE2's FindAndConsume, searches for the first occurrence of |pattern| in | |
| 178 // |input| and consumes the bytes until the end of the pattern matching. Unlike | |
| 179 // FindAndConsume, the bytes skipped before the match of |pattern| are stored | |
| 180 // in |skipped_input|. |args| needs to contain at least one element. | |
| 181 // Returns whether a match was found. | |
| 182 // | |
| 183 // Example: input = "aaabbbc", pattern = "(b+)" leads to skipped_input = "aaa", | |
| 184 // args[0] = "bbb", and the beginning of input is moved to the right so that it | |
| 185 // only contains "c". | |
| 186 // Example: input = "aaabbbc", pattern = "(z+)" leads to input = "aaabbbc", | |
| 187 // the args values are not modified and skipped_input is not modified. | |
| 188 bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input, | |
| 189 const re2::RE2& pattern, | |
| 190 re2::StringPiece* skipped_input, | |
| 191 re2::StringPiece* args[], | |
| 192 int argc) { | |
| 193 re2::StringPiece old_input = *input; | |
| 194 | |
| 195 CHECK_GE(argc, 1); | |
| 196 re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr); | |
| 197 re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr); | |
| 198 re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr); | |
| 199 const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2}; | |
| 200 CHECK_LE(argc, 3); | |
| 201 | |
| 202 bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc); | |
| 203 | |
| 204 if (skipped_input && result) { | |
| 205 size_t bytes_skipped = args[0]->data() - old_input.data(); | |
| 206 *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped); | |
| 207 } | |
| 208 return result; | |
| 209 } | |
| 210 | |
| 211 // All |match_groups| need to be of type re2::StringPiece*. | |
| 212 template <typename... Arg> | |
| 213 bool FindAndConsumeAndGetSkipped(re2::StringPiece* input, | |
| 214 const re2::RE2& pattern, | |
| 215 re2::StringPiece* skipped_input, | |
| 216 Arg*... match_groups) { | |
| 217 re2::StringPiece* args[] = {match_groups...}; | |
| 218 return FindAndConsumeAndGetSkippedN(input, pattern, skipped_input, args, | |
| 219 arraysize(args)); | |
| 220 } | |
| 221 | |
| 46 } // namespace | 222 } // namespace | 
| 47 | 223 | 
| 48 AnonymizerTool::AnonymizerTool() | 224 AnonymizerTool::AnonymizerTool() | 
| 49 : custom_patterns_(arraysize(kCustomPatterns)) {} | 225 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)), | 
| 226 custom_patterns_without_context_( | |
| 227 arraysize(kCustomPatternsWithoutContext)) {} | |
| 50 | 228 | 
| 51 AnonymizerTool::~AnonymizerTool() {} | 229 AnonymizerTool::~AnonymizerTool() {} | 
| 52 | 230 | 
| 53 std::string AnonymizerTool::Anonymize(const std::string& input) { | 231 std::string AnonymizerTool::Anonymize(const std::string& input) { | 
| 54 std::string anonymized = AnonymizeMACAddresses(input); | 232 std::string anonymized = AnonymizeMACAddresses(input); | 
| 55 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); | 233 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); | 
| 56 return anonymized; | 234 return anonymized; | 
| 57 } | 235 } | 
| 58 | 236 | 
| 237 RE2* AnonymizerTool::GetRegExp(const std::string& pattern) { | |
| 238 if (regexp_cache_.find(pattern) == regexp_cache_.end()) { | |
| 239 RE2::Options options; | |
| 240 // set_multiline of pcre is not supported by RE2, yet. | |
| 241 options.set_dot_nl(true); // Dot matches a new line. | |
| 242 scoped_ptr<RE2> re = make_scoped_ptr(new RE2(pattern, options)); | |
| 243 DCHECK_EQ(re2::RE2::NoError, re->error_code()) | |
| 244 << "Failed to parse:\n" << pattern << "\n" << re->error(); | |
| 245 regexp_cache_[pattern] = std::move(re); | |
| 246 } | |
| 247 return regexp_cache_[pattern].get(); | |
| 248 } | |
| 249 | |
| 59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { | 250 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { | 
| 60 // This regular expression finds the next MAC address. It splits the data into | 251 // This regular expression finds the next MAC address. It splits the data into | 
| 61 // a section preceding the MAC address, an OUI (Organizationally Unique | 252 // an OUI (Organizationally Unique Identifier) part and a NIC (Network | 
| 62 // Identifier) part and a NIC (Network Interface Controller) specific part. | 253 // Interface Controller) specific part. | 
| 63 | 254 | 
| 64 RE2::Options options; | 255 RE2* mac_re = GetRegExp( | 
| 65 // set_multiline of pcre is not supported by RE2, yet. | 256 "([0-9a-fA-F][0-9a-fA-F]:" | 
| 66 options.set_dot_nl(true); // Dot matches a new line. | |
| 67 RE2 mac_re( | |
| 68 "(.*?)(" | |
| 69 "[0-9a-fA-F][0-9a-fA-F]:" | |
| 70 "[0-9a-fA-F][0-9a-fA-F]:" | 257 "[0-9a-fA-F][0-9a-fA-F]:" | 
| 71 "[0-9a-fA-F][0-9a-fA-F]):(" | 258 "[0-9a-fA-F][0-9a-fA-F]):(" | 
| 72 "[0-9a-fA-F][0-9a-fA-F]:" | 259 "[0-9a-fA-F][0-9a-fA-F]:" | 
| 73 "[0-9a-fA-F][0-9a-fA-F]:" | 260 "[0-9a-fA-F][0-9a-fA-F]:" | 
| 74 "[0-9a-fA-F][0-9a-fA-F])", | 261 "[0-9a-fA-F][0-9a-fA-F])"); | 
| 75 options); | |
| 76 | 262 | 
| 77 std::string result; | 263 std::string result; | 
| 78 result.reserve(input.size()); | 264 result.reserve(input.size()); | 
| 79 | 265 | 
| 80 // Keep consuming, building up a result string as we go. | 266 // Keep consuming, building up a result string as we go. | 
| 81 re2::StringPiece text(input); | 267 re2::StringPiece text(input); | 
| 82 std::string pre_mac, oui, nic; | 268 re2::StringPiece skipped; | 
| 83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui), | 269 re2::StringPiece pre_mac, oui, nic; | 
| 84 RE2::Arg(&nic))) { | 270 while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) { | 
| 85 // Look up the MAC address in the hash. | 271 // Look up the MAC address in the hash. | 
| 86 oui = base::ToLowerASCII(oui); | 272 std::string oui_string = base::ToLowerASCII(oui.as_string()); | 
| 87 nic = base::ToLowerASCII(nic); | 273 std::string nic_string = base::ToLowerASCII(nic.as_string()); | 
| 88 std::string mac = oui + ":" + nic; | 274 std::string mac = oui_string + ":" + nic_string; | 
| 89 std::string replacement_mac = mac_addresses_[mac]; | 275 std::string replacement_mac = mac_addresses_[mac]; | 
| 90 if (replacement_mac.empty()) { | 276 if (replacement_mac.empty()) { | 
| 91 // If not found, build up a replacement MAC address by generating a new | 277 // If not found, build up a replacement MAC address by generating a new | 
| 92 // NIC part. | 278 // NIC part. | 
| 93 int mac_id = mac_addresses_.size(); | 279 int mac_id = mac_addresses_.size(); | 
| 94 replacement_mac = base::StringPrintf( | 280 replacement_mac = base::StringPrintf( | 
| 95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16, | 281 "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16, | 
| 96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); | 282 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); | 
| 97 mac_addresses_[mac] = replacement_mac; | 283 mac_addresses_[mac] = replacement_mac; | 
| 98 } | 284 } | 
| 99 | 285 | 
| 100 result += pre_mac; | 286 skipped.AppendToString(&result); | 
| 101 result += replacement_mac; | 287 result += replacement_mac; | 
| 102 } | 288 } | 
| 103 | 289 | 
| 104 text.AppendToString(&result); | 290 text.AppendToString(&result); | 
| 105 return result; | 291 return result; | 
| 106 } | 292 } | 
| 107 | 293 | 
| 108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { | 294 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { | 
| 109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { | 295 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) { | 
| 110 input = | 296 input = | 
| 111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); | 297 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i], | 
| 298 &custom_patterns_with_context_[i]); | |
| 299 } | |
| 300 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) { | |
| 301 input = AnonymizeCustomPatternWithoutContext( | |
| 302 input, kCustomPatternsWithoutContext[i], | |
| 303 &custom_patterns_without_context_[i]); | |
| 112 } | 304 } | 
| 113 return input; | 305 return input; | 
| 114 } | 306 } | 
| 115 | 307 | 
| 116 // static | 308 std::string AnonymizerTool::AnonymizeCustomPatternWithContext( | 
| 117 std::string AnonymizerTool::AnonymizeCustomPattern( | |
| 118 const std::string& input, | 309 const std::string& input, | 
| 119 const std::string& pattern, | 310 const std::string& pattern, | 
| 120 std::map<std::string, std::string>* identifier_space) { | 311 std::map<std::string, std::string>* identifier_space) { | 
| 121 RE2::Options options; | 312 RE2* re = GetRegExp(pattern); | 
| 122 // set_multiline of pcre is not supported by RE2, yet. | 313 DCHECK_EQ(3, re->NumberOfCapturingGroups()); | 
| 123 options.set_dot_nl(true); // Dot matches a new line. | |
| 124 RE2 re("(.*?)" + pattern, options); | |
| 125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); | |
| 126 | 314 | 
| 127 std::string result; | 315 std::string result; | 
| 128 result.reserve(input.size()); | 316 result.reserve(input.size()); | 
| 129 | 317 | 
| 130 // Keep consuming, building up a result string as we go. | 318 // Keep consuming, building up a result string as we go. | 
| 131 re2::StringPiece text(input); | 319 re2::StringPiece text(input); | 
| 132 std::string pre_match, pre_matched_id, matched_id, post_matched_id; | 320 re2::StringPiece skipped; | 
| 133 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), | 321 re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id; | 
| 134 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id), | 322 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id, | 
| 135 RE2::Arg(&post_matched_id))) { | 323 &matched_id, &post_matched_id)) { | 
| 136 std::string replacement_id = (*identifier_space)[matched_id]; | 324 std::string matched_id_as_string = matched_id.as_string(); | 
| 325 std::string replacement_id = (*identifier_space)[matched_id_as_string]; | |
| 137 if (replacement_id.empty()) { | 326 if (replacement_id.empty()) { | 
| 138 replacement_id = base::IntToString(identifier_space->size()); | 327 replacement_id = base::IntToString(identifier_space->size()); | 
| 139 (*identifier_space)[matched_id] = replacement_id; | 328 (*identifier_space)[matched_id_as_string] = replacement_id; | 
| 140 } | 329 } | 
| 141 | 330 | 
| 142 result += pre_match; | 331 skipped.AppendToString(&result); | 
| 143 result += pre_matched_id; | 332 pre_matched_id.AppendToString(&result); | 
| 144 result += replacement_id; | 333 result += replacement_id; | 
| 145 result += post_matched_id; | 334 post_matched_id.AppendToString(&result); | 
| 146 } | 335 } | 
| 147 text.AppendToString(&result); | 336 text.AppendToString(&result); | 
| 148 return result; | 337 return result; | 
| 338 } | |
| 339 | |
| 340 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext( | |
| 341 const std::string& input, | |
| 342 const CustomPatternWithoutContext& pattern, | |
| 343 std::map<std::string, std::string>* identifier_space) { | |
| 344 RE2* re = GetRegExp(pattern.pattern); | |
| 345 DCHECK_EQ(1, re->NumberOfCapturingGroups()); | |
| 346 | |
| 347 std::string result; | |
| 348 result.reserve(input.size()); | |
| 349 | |
| 350 // Keep consuming, building up a result string as we go. | |
| 351 re2::StringPiece text(input); | |
| 352 re2::StringPiece skipped; | |
| 353 re2::StringPiece matched_id; | |
| 354 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) { | |
| 355 std::string matched_id_as_string = matched_id.as_string(); | |
| 356 std::string replacement_id = (*identifier_space)[matched_id_as_string]; | |
| 357 if (replacement_id.empty()) { | |
| 358 // The weird Uint64ToString trick is because Windows does not like to deal | |
| 359 // with %zu and a size_t in printf, nor does it support %llu. | |
| 360 replacement_id = base::StringPrintf( | |
| 361 "<%s: %s>", pattern.alias, | |
| 362 base::Uint64ToString(identifier_space->size()).c_str()); | |
| 363 (*identifier_space)[matched_id_as_string] = replacement_id; | |
| 364 } | |
| 365 | |
| 366 skipped.AppendToString(&result); | |
| 367 result += replacement_id; | |
| 368 } | |
| 369 text.AppendToString(&result); | |
| 370 return result; | |
| 149 } | 371 } | 
| 150 | 372 | 
| 151 } // namespace feedback | 373 } // namespace feedback | 
| OLD | NEW |