Chromium Code Reviews| Index: components/feedback/anonymizer_tool.cc |
| diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..21d1230fefba8cff28d72092f473657c485e67a4 |
| --- /dev/null |
| +++ b/components/feedback/anonymizer_tool.cc |
| @@ -0,0 +1,151 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "components/feedback/anonymizer_tool.h" |
| + |
| +#include <base/strings/string_number_conversions.h> |
| +#include <base/strings/string_util.h> |
| +#include <base/strings/stringprintf.h> |
| + |
| +#include "third_party/re2/re2/re2.h" |
| + |
| +using re2::RE2; |
| + |
| +namespace feedback { |
| + |
| +namespace { |
| + |
| +// The |kCustomPatterns| array defines patterns to match and anonymize. Each |
| +// pattern needs to define three capturing parentheses groups: |
| +// |
| +// - a group for the pattern before the identifier to be anonymized; |
| +// - a group for the identifier to be anonymized; |
| +// - a group for the pattern after the identifier to be anonymized. |
| +// |
| +// Every matched identifier (in the context of the whole pattern) is anonymized |
| +// by replacing it with an incremental instance identifier. Every different |
| +// pattern defines a separate instance identifier space. See the unit test for |
| +// AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. |
| +// |
| +// Useful regular expression syntax: |
| +// |
| +// +? is a non-greedy (lazy) +. |
| +// \b matches a word boundary. |
| +// (?i) turns on case insensitivity for the remainder of the regex. |
| +// (?-s) turns off "dot matches newline" for the remainder of the regex. |
| +// (?:regex) denotes a non-capturing parentheses group. |
| +const char* kCustomPatterns[] = { |
| + "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager |
| + "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager |
| + "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant |
| + "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant |
| + "(?-s)(\\[SSID=)(.+?)(\\])", // shill |
| +}; |
| + |
| +} // namespace |
| + |
| +AnonymizerTool::AnonymizerTool() |
| + : custom_patterns_(arraysize(kCustomPatterns)) {} |
| + |
| +AnonymizerTool::~AnonymizerTool() {} |
| + |
| +std::string AnonymizerTool::Anonymize(const std::string& input) { |
| + std::string anonymized = AnonymizeMACAddresses(input); |
| + anonymized = AnonymizeCustomPatterns(anonymized); |
|
vasilii
2015/12/17 12:40:36
Looking at the implementation you should take the
battre
2015/12/17 13:24:24
I don't see that this would save much.
vasilii
2015/12/17 13:39:05
You 100% save one copy of |anonymized|. It can be
battre
2015/12/17 14:34:55
Done.
|
| + return anonymized; |
| +} |
| + |
| +std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { |
| + // This regular expression finds the next MAC address. It splits the data into |
| + // a section preceding the MAC address, an OUI (Organizationally Unique |
| + // Identifier) part and a NIC (Network Interface Controller) specific part. |
| + |
| + RE2::Options options; |
| + // set_multiline of pcre is not supported by RE2, yet. |
| + options.set_dot_nl(true); // Dot matches a new line. |
| + RE2 mac_re( |
| + "(.*?)(" |
| + "[0-9a-fA-F][0-9a-fA-F]:" |
| + "[0-9a-fA-F][0-9a-fA-F]:" |
| + "[0-9a-fA-F][0-9a-fA-F]):(" |
| + "[0-9a-fA-F][0-9a-fA-F]:" |
| + "[0-9a-fA-F][0-9a-fA-F]:" |
| + "[0-9a-fA-F][0-9a-fA-F])", |
| + options); |
| + |
| + std::string result; |
| + result.reserve(input.size()); |
| + |
| + // Keep consuming, building up a result string as we go. |
| + re2::StringPiece text(input); |
| + std::string pre_mac, oui, nic; |
| + while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui), |
| + RE2::Arg(&nic))) { |
| + // Look up the MAC address in the hash. |
| + oui = base::ToLowerASCII(oui); |
| + nic = base::ToLowerASCII(nic); |
| + std::string mac = oui + ":" + nic; |
| + std::string replacement_mac = mac_addresses_[mac]; |
|
vasilii
2015/12/17 12:40:37
I'd use a reference here.
battre
2015/12/17 13:24:24
Are you suggesting the following?
std::string&
|
| + if (replacement_mac.empty()) { |
| + // If not found, build up a replacement MAC address by generating a new |
| + // NIC part. |
| + int mac_id = mac_addresses_.size(); |
| + replacement_mac = base::StringPrintf( |
| + "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16, |
| + (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); |
| + mac_addresses_[mac] = replacement_mac; |
| + } |
| + |
| + result += pre_mac; |
| + result += replacement_mac; |
| + } |
| + |
| + return result + text.as_string(); |
|
vasilii
2015/12/17 12:40:37
You should make it more effective by using operato
battre
2015/12/17 13:24:24
I have followed the StringPiece::AppendToString id
|
| +} |
| + |
| +std::string AnonymizerTool::AnonymizeCustomPatterns(const std::string& input) { |
| + std::string anonymized = input; |
| + for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { |
| + anonymized = AnonymizeCustomPattern(anonymized, kCustomPatterns[i], |
| + &custom_patterns_[i]); |
| + } |
| + return anonymized; |
| +} |
| + |
| +// static |
| +std::string AnonymizerTool::AnonymizeCustomPattern( |
| + const std::string& input, |
| + const std::string& pattern, |
| + std::map<std::string, std::string>* identifier_space) { |
| + RE2::Options options; |
| + // set_multiline of pcre is not supported by RE2, yet. |
| + options.set_dot_nl(true); // Dot matches a new line. |
| + RE2 re("(.*?)" + pattern, options); |
| + DCHECK_EQ(4, re.NumberOfCapturingGroups()); |
| + |
| + std::string result; |
| + result.reserve(input.size()); |
| + |
| + // Keep consuming, building up a result string as we go. |
| + re2::StringPiece text(input); |
| + std::string pre_match, pre_matched_id, matched_id, post_matched_id; |
| + while (RE2::Consume(&text, re, RE2::Arg(&pre_match), |
| + RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id), |
| + RE2::Arg(&post_matched_id))) { |
| + std::string replacement_id = (*identifier_space)[matched_id]; |
|
vasilii
2015/12/17 12:40:37
I'd use reference here too.
battre
2015/12/17 13:24:23
see above.
|
| + if (replacement_id.empty()) { |
| + replacement_id = base::IntToString(identifier_space->size()); |
| + (*identifier_space)[matched_id] = replacement_id; |
| + } |
| + |
| + result += pre_match; |
| + result += pre_matched_id; |
| + result += replacement_id; |
| + result += post_matched_id; |
| + } |
| + result += text.as_string(); |
|
vasilii
2015/12/17 12:40:37
Use StringPiece::AppendToString.
battre
2015/12/17 13:24:24
Done.
|
| + return result; |
| +} |
| + |
| +} // namespace feedback |