Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(200)

Side by Side Diff: components/feedback/anonymizer_tool.cc

Issue 1530403003: Add anonymizer tool (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed Vasilii's comments Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/feedback/anonymizer_tool.h"
6
7 #include <base/strings/string_number_conversions.h>
8 #include <base/strings/string_util.h>
9 #include <base/strings/stringprintf.h>
10
11 #include "third_party/re2/re2/re2.h"
12
13 using re2::RE2;
14
15 namespace feedback {
16
17 namespace {
18
// The |kCustomPatterns| array defines patterns to match and anonymize. Each
// pattern needs to define three capturing parentheses groups:
//
// - a group for the pattern before the identifier to be anonymized;
// - a group for the identifier to be anonymized;
// - a group for the pattern after the identifier to be anonymized.
//
// Every matched identifier (in the context of the whole pattern) is anonymized
// by replacing it with an incremental instance identifier. Every different
// pattern defines a separate instance identifier space. See the unit test for
// AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.
//
// Useful regular expression syntax:
//
// +? is a non-greedy (lazy) +.
// \b matches a word boundary.
// (?i) turns on case insensitivity for the remainder of the regex.
// (?-s) turns off "dot matches newline" for the remainder of the regex.
// (?:regex) denotes non-capturing parentheses group.
//
// The pointers themselves are const as well as the characters they point to,
// so no entry of this table can be reassigned at run time.
const char* const kCustomPatterns[] = {
    "(\\bCell ID: ')([0-9a-fA-F]+)(')",                  // ModemManager
    "(\\bLocation area code: ')([0-9a-fA-F]+)(')",       // ModemManager
    "(?i-s)(\\bssid[= ]')(.+)(')",                       // wpa_supplicant
    "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()",  // wpa_supplicant
    "(?-s)(\\[SSID=)(.+?)(\\])",                         // shill
};
45
46 } // namespace
47
48 AnonymizerTool::AnonymizerTool()
49 : custom_patterns_(arraysize(kCustomPatterns)) {}
50
51 AnonymizerTool::~AnonymizerTool() {}
52
53 std::string AnonymizerTool::Anonymize(const std::string& input) {
54 std::string anonymized = AnonymizeMACAddresses(input);
55 anonymized = AnonymizeCustomPatterns(anonymized);
56 return anonymized;
57 }
58
59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
60 // This regular expression finds the next MAC address. It splits the data into
61 // a section preceding the MAC address, an OUI (Organizationally Unique
62 // Identifier) part and a NIC (Network Interface Controller) specific part.
63
64 RE2::Options options;
65 // set_multiline of pcre is not supported by RE2, yet.
66 options.set_dot_nl(true); // Dot matches a new line.
67 RE2 mac_re(
68 "(.*?)("
69 "[0-9a-fA-F][0-9a-fA-F]:"
70 "[0-9a-fA-F][0-9a-fA-F]:"
71 "[0-9a-fA-F][0-9a-fA-F]):("
72 "[0-9a-fA-F][0-9a-fA-F]:"
73 "[0-9a-fA-F][0-9a-fA-F]:"
74 "[0-9a-fA-F][0-9a-fA-F])",
75 options);
76
77 std::string result;
78 result.reserve(input.size());
79
80 // Keep consuming, building up a result string as we go.
81 re2::StringPiece text(input);
82 std::string pre_mac, oui, nic;
83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),
84 RE2::Arg(&nic))) {
85 // Look up the MAC address in the hash.
86 oui = base::ToLowerASCII(oui);
87 nic = base::ToLowerASCII(nic);
88 std::string mac = oui + ":" + nic;
89 std::string replacement_mac = mac_addresses_[mac];
90 if (replacement_mac.empty()) {
91 // If not found, build up a replacement MAC address by generating a new
92 // NIC part.
93 int mac_id = mac_addresses_.size();
94 replacement_mac = base::StringPrintf(
95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,
96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));
97 mac_addresses_[mac] = replacement_mac;
98 }
99
100 result += pre_mac;
101 result += replacement_mac;
102 }
103
104 text.AppendToString(&result);
105 return result;
106 }
107
108 std::string AnonymizerTool::AnonymizeCustomPatterns(const std::string& input) {
109 std::string anonymized = input;
110 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {
111 anonymized = AnonymizeCustomPattern(anonymized, kCustomPatterns[i],
112 &custom_patterns_[i]);
113 }
114 return anonymized;
115 }
116
117 // static
118 std::string AnonymizerTool::AnonymizeCustomPattern(
119 const std::string& input,
120 const std::string& pattern,
121 std::map<std::string, std::string>* identifier_space) {
122 RE2::Options options;
123 // set_multiline of pcre is not supported by RE2, yet.
124 options.set_dot_nl(true); // Dot matches a new line.
125 RE2 re("(.*?)" + pattern, options);
126 DCHECK_EQ(4, re.NumberOfCapturingGroups());
127
128 std::string result;
129 result.reserve(input.size());
130
131 // Keep consuming, building up a result string as we go.
132 re2::StringPiece text(input);
133 std::string pre_match, pre_matched_id, matched_id, post_matched_id;
134 while (RE2::Consume(&text, re, RE2::Arg(&pre_match),
135 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),
136 RE2::Arg(&post_matched_id))) {
137 std::string replacement_id = (*identifier_space)[matched_id];
138 if (replacement_id.empty()) {
139 replacement_id = base::IntToString(identifier_space->size());
140 (*identifier_space)[matched_id] = replacement_id;
141 }
142
143 result += pre_match;
144 result += pre_matched_id;
145 result += replacement_id;
146 result += post_matched_id;
147 }
148 text.AppendToString(&result);
149 return result;
150 }
151
152 } // namespace feedback
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698