OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "components/feedback/anonymizer_tool.h" |
| 6 |
| 7 #include <base/strings/string_number_conversions.h> |
| 8 #include <base/strings/string_util.h> |
| 9 #include <base/strings/stringprintf.h> |
| 10 |
| 11 #include "third_party/re2/src/re2/re2.h" |
| 12 |
| 13 using re2::RE2; |
| 14 |
| 15 namespace feedback { |
| 16 |
| 17 namespace { |
| 18 |
| 19 // The |kCustomPatterns| array defines patterns to match and anonymize. Each |
| 20 // pattern needs to define three capturing parentheses groups: |
| 21 // |
| 22 // - a group for the pattern before the identifier to be anonymized; |
| 23 // - a group for the identifier to be anonymized; |
| 24 // - a group for the pattern after the identifier to be anonymized. |
| 25 // |
| 26 // Every matched identifier (in the context of the whole pattern) is anonymized |
| 27 // by replacing it with an incremental instance identifier. Every different |
| 28 // pattern defines a separate instance identifier space. See the unit test for |
| 29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. |
| 30 // |
| 31 // Useful regular expression syntax: |
| 32 // |
| 33 // +? is a non-greedy (lazy) +. |
| 34 // \b matches a word boundary. |
| 35 // (?i) turns on case insensitivity for the remainder of the regex. |
| 36 // (?-s) turns off "dot matches newline" for the remainder of the regex. |
| 37 // (?:regex) denotes non-capturing parentheses group. |
// Both the characters and the pointers are const: the table is read-only data
// and no entry may be re-pointed at run time.
const char* const kCustomPatterns[] = {
    "(\\bCell ID: ')([0-9a-fA-F]+)(')",                  // ModemManager
    "(\\bLocation area code: ')([0-9a-fA-F]+)(')",       // ModemManager
    "(?i-s)(\\bssid[= ]')(.+)(')",                       // wpa_supplicant
    "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()",  // wpa_supplicant
    "(?-s)(\\[SSID=)(.+?)(\\])",                         // shill
};
| 45 |
| 46 } // namespace |
| 47 |
| 48 AnonymizerTool::AnonymizerTool() |
| 49 : custom_patterns_(arraysize(kCustomPatterns)) {} |
| 50 |
| 51 AnonymizerTool::~AnonymizerTool() {} |
| 52 |
| 53 std::string AnonymizerTool::Anonymize(const std::string& input) { |
| 54 std::string anonymized = AnonymizeMACAddresses(input); |
| 55 anonymized = AnonymizeCustomPatterns(std::move(anonymized)); |
| 56 return anonymized; |
| 57 } |
| 58 |
| 59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { |
| 60 // This regular expression finds the next MAC address. It splits the data into |
| 61 // a section preceding the MAC address, an OUI (Organizationally Unique |
| 62 // Identifier) part and a NIC (Network Interface Controller) specific part. |
| 63 |
| 64 RE2::Options options; |
| 65 // set_multiline of pcre is not supported by RE2, yet. |
| 66 options.set_dot_nl(true); // Dot matches a new line. |
| 67 RE2 mac_re( |
| 68 "(.*?)(" |
| 69 "[0-9a-fA-F][0-9a-fA-F]:" |
| 70 "[0-9a-fA-F][0-9a-fA-F]:" |
| 71 "[0-9a-fA-F][0-9a-fA-F]):(" |
| 72 "[0-9a-fA-F][0-9a-fA-F]:" |
| 73 "[0-9a-fA-F][0-9a-fA-F]:" |
| 74 "[0-9a-fA-F][0-9a-fA-F])", |
| 75 options); |
| 76 |
| 77 std::string result; |
| 78 result.reserve(input.size()); |
| 79 |
| 80 // Keep consuming, building up a result string as we go. |
| 81 re2::StringPiece text(input); |
| 82 std::string pre_mac, oui, nic; |
| 83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui), |
| 84 RE2::Arg(&nic))) { |
| 85 // Look up the MAC address in the hash. |
| 86 oui = base::ToLowerASCII(oui); |
| 87 nic = base::ToLowerASCII(nic); |
| 88 std::string mac = oui + ":" + nic; |
| 89 std::string replacement_mac = mac_addresses_[mac]; |
| 90 if (replacement_mac.empty()) { |
| 91 // If not found, build up a replacement MAC address by generating a new |
| 92 // NIC part. |
| 93 int mac_id = mac_addresses_.size(); |
| 94 replacement_mac = base::StringPrintf( |
| 95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16, |
| 96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); |
| 97 mac_addresses_[mac] = replacement_mac; |
| 98 } |
| 99 |
| 100 result += pre_mac; |
| 101 result += replacement_mac; |
| 102 } |
| 103 |
| 104 text.AppendToString(&result); |
| 105 return result; |
| 106 } |
| 107 |
| 108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { |
| 109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { |
| 110 input = |
| 111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); |
| 112 } |
| 113 return input; |
| 114 } |
| 115 |
| 116 // static |
| 117 std::string AnonymizerTool::AnonymizeCustomPattern( |
| 118 const std::string& input, |
| 119 const std::string& pattern, |
| 120 std::map<std::string, std::string>* identifier_space) { |
| 121 RE2::Options options; |
| 122 // set_multiline of pcre is not supported by RE2, yet. |
| 123 options.set_dot_nl(true); // Dot matches a new line. |
| 124 RE2 re("(.*?)" + pattern, options); |
| 125 DCHECK_EQ(4, re.NumberOfCapturingGroups()); |
| 126 |
| 127 std::string result; |
| 128 result.reserve(input.size()); |
| 129 |
| 130 // Keep consuming, building up a result string as we go. |
| 131 re2::StringPiece text(input); |
| 132 std::string pre_match, pre_matched_id, matched_id, post_matched_id; |
| 133 while (RE2::Consume(&text, re, RE2::Arg(&pre_match), |
| 134 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id), |
| 135 RE2::Arg(&post_matched_id))) { |
| 136 std::string replacement_id = (*identifier_space)[matched_id]; |
| 137 if (replacement_id.empty()) { |
| 138 replacement_id = base::IntToString(identifier_space->size()); |
| 139 (*identifier_space)[matched_id] = replacement_id; |
| 140 } |
| 141 |
| 142 result += pre_match; |
| 143 result += pre_matched_id; |
| 144 result += replacement_id; |
| 145 result += post_matched_id; |
| 146 } |
| 147 text.AppendToString(&result); |
| 148 return result; |
| 149 } |
| 150 |
| 151 } // namespace feedback |
OLD | NEW |