Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(234)

Unified Diff: components/feedback/anonymizer_tool.cc

Issue 1530403003: Add anonymizer tool (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/feedback/anonymizer_tool.cc
diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21d1230fefba8cff28d72092f473657c485e67a4
--- /dev/null
+++ b/components/feedback/anonymizer_tool.cc
@@ -0,0 +1,151 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/feedback/anonymizer_tool.h"
+
+#include <base/strings/string_number_conversions.h>
+#include <base/strings/string_util.h>
+#include <base/strings/stringprintf.h>
+
+#include "third_party/re2/re2/re2.h"
+
+using re2::RE2;
+
+namespace feedback {
+
+namespace {
+
+// The |kCustomPatterns| array defines patterns to match and anonymize. Each
+// pattern needs to define three capturing parentheses groups:
+//
+// - a group for the pattern before the identifier to be anonymized;
+// - a group for the identifier to be anonymized;
+// - a group for the pattern after the identifier to be anonymized.
+//
+// Every matched identifier (in the context of the whole pattern) is anonymized
+// by replacing it with an incremental instance identifier. Every different
+// pattern defines a separate instance identifier space. See the unit test for
+// AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.
+//
+// Useful regular expression syntax:
+//
+// +? is a non-greedy (lazy) +.
+// \b matches a word boundary.
+// (?i) turns on case insensitivy for the remainder of the regex.
+// (?-s) turns off "dot matches newline" for the remainder of the regex.
+// (?:regex) denotes non-capturing parentheses group.
+const char* kCustomPatterns[] = {
+ "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager
+ "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager
+ "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant
+ "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant
+ "(?-s)(\\[SSID=)(.+?)(\\])", // shill
+};
+
+} // namespace
+
+AnonymizerTool::AnonymizerTool()
+ : custom_patterns_(arraysize(kCustomPatterns)) {}
+
+AnonymizerTool::~AnonymizerTool() {}
+
+std::string AnonymizerTool::Anonymize(const std::string& input) {
+ std::string anonymized = AnonymizeMACAddresses(input);
+ anonymized = AnonymizeCustomPatterns(anonymized);
vasilii 2015/12/17 12:40:36 Looking at the implementation you should take the
battre 2015/12/17 13:24:24 I don't see that this would save much.
vasilii 2015/12/17 13:39:05 You 100% save one copy of |anonymized|. It can be
battre 2015/12/17 14:34:55 Done.
+ return anonymized;
+}
+
+std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
+ // This regular expression finds the next MAC address. It splits the data into
+ // a section preceding the MAC address, an OUI (Organizationally Unique
+ // Identifier) part and a NIC (Network Interface Controller) specific part.
+
+ RE2::Options options;
+ // set_multiline of pcre is not supported by RE2, yet.
+ options.set_dot_nl(true); // Dot matches a new line.
+ RE2 mac_re(
+ "(.*?)("
+ "[0-9a-fA-F][0-9a-fA-F]:"
+ "[0-9a-fA-F][0-9a-fA-F]:"
+ "[0-9a-fA-F][0-9a-fA-F]):("
+ "[0-9a-fA-F][0-9a-fA-F]:"
+ "[0-9a-fA-F][0-9a-fA-F]:"
+ "[0-9a-fA-F][0-9a-fA-F])",
+ options);
+
+ std::string result;
+ result.reserve(input.size());
+
+ // Keep consuming, building up a result string as we go.
+ re2::StringPiece text(input);
+ std::string pre_mac, oui, nic;
+ while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),
+ RE2::Arg(&nic))) {
+ // Look up the MAC address in the hash.
+ oui = base::ToLowerASCII(oui);
+ nic = base::ToLowerASCII(nic);
+ std::string mac = oui + ":" + nic;
+ std::string replacement_mac = mac_addresses_[mac];
vasilii 2015/12/17 12:40:37 I'd use a reference here.
battre 2015/12/17 13:24:24 Are you suggesting to the following? std::string&
+ if (replacement_mac.empty()) {
+ // If not found, build up a replacement MAC address by generating a new
+ // NIC part.
+ int mac_id = mac_addresses_.size();
+ replacement_mac = base::StringPrintf(
+ "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,
+ (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));
+ mac_addresses_[mac] = replacement_mac;
+ }
+
+ result += pre_mac;
+ result += replacement_mac;
+ }
+
+ return result + text.as_string();
vasilii 2015/12/17 12:40:37 You should make it more effective by using operato
battre 2015/12/17 13:24:24 I have followed the StringPiece::AppendToString id
+}
+
+std::string AnonymizerTool::AnonymizeCustomPatterns(const std::string& input) {
+ std::string anonymized = input;
+ for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {
+ anonymized = AnonymizeCustomPattern(anonymized, kCustomPatterns[i],
+ &custom_patterns_[i]);
+ }
+ return anonymized;
+}
+
+// static
+std::string AnonymizerTool::AnonymizeCustomPattern(
+ const std::string& input,
+ const std::string& pattern,
+ std::map<std::string, std::string>* identifier_space) {
+ RE2::Options options;
+ // set_multiline of pcre is not supported by RE2, yet.
+ options.set_dot_nl(true); // Dot matches a new line.
+ RE2 re("(.*?)" + pattern, options);
+ DCHECK_EQ(4, re.NumberOfCapturingGroups());
+
+ std::string result;
+ result.reserve(input.size());
+
+ // Keep consuming, building up a result string as we go.
+ re2::StringPiece text(input);
+ std::string pre_match, pre_matched_id, matched_id, post_matched_id;
+ while (RE2::Consume(&text, re, RE2::Arg(&pre_match),
+ RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),
+ RE2::Arg(&post_matched_id))) {
+ std::string replacement_id = (*identifier_space)[matched_id];
vasilii 2015/12/17 12:40:37 I'd use reference here too.
battre 2015/12/17 13:24:23 see above.
+ if (replacement_id.empty()) {
+ replacement_id = base::IntToString(identifier_space->size());
+ (*identifier_space)[matched_id] = replacement_id;
+ }
+
+ result += pre_match;
+ result += pre_matched_id;
+ result += replacement_id;
+ result += post_matched_id;
+ }
+ result += text.as_string();
vasilii 2015/12/17 12:40:37 Use StringPiece::AppendToString.
battre 2015/12/17 13:24:24 Done.
+ return result;
+}
+
+} // namespace feedback

Powered by Google App Engine
This is Rietveld 408576698