components/feedback/anonymizer_tool.cc - Issue 1543633003: Added anonymization patterns for URLs and email addresses

Unified Diff: components/feedback/anonymizer_tool.cc

Issue 1543633003: Added anonymization patterns for URLs and email addresses (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@bug-567870-introduce-anonymizer

Patch Set: Fixed includes Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: components/feedback/anonymizer_tool.cc

diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc

index 713ceb6a207e475e357dcdc34324708c0c8bb075..c0911d9936012c9a27a07b57e2f72b61deface23 100644

--- a/components/feedback/anonymizer_tool.cc

+++ b/components/feedback/anonymizer_tool.cc

@@ -4,9 +4,11 @@

#include "components/feedback/anonymizer_tool.h"

-#include <base/strings/string_number_conversions.h>

-#include <base/strings/string_util.h>

-#include <base/strings/stringprintf.h>

+#include <utility>

+#include "base/strings/string_number_conversions.h"

+#include "base/strings/string_util.h"

+#include "base/strings/stringprintf.h"

#include "third_party/re2/src/re2/re2.h"

@@ -16,13 +18,16 @@ namespace feedback {

namespace {

-// The |kCustomPatterns| array defines patterns to match and anonymize. Each

-// pattern needs to define three capturing parentheses groups:

+// The |kCustomPatternsWithContext| array defines patterns to match and

+// anonymize. Each pattern needs to define three capturing parentheses groups:

// - a group for the pattern before the identifier to be anonymized;

// - a group for the identifier to be anonymized;

// - a group for the pattern after the identifier to be anonymized.

+// The first and the last capture group are the origin of the "WithContext"

+// suffix in the name of this constant.

+//

// Every matched identifier (in the context of the whole pattern) is anonymized

// by replacing it with an incremental instance identifier. Every different

// pattern defines a separate instance identifier space. See the unit test for

@@ -35,7 +40,7 @@ namespace {

// (?i) turns on case insensitivity for the remainder of the regex.

// (?-s) turns off "dot matches newline" for the remainder of the regex.

// (?:regex) denotes non-capturing parentheses group.

-const char* kCustomPatterns[] = {

+const char* kCustomPatternsWithContext[] = {

"(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager

"(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager

"(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant

@@ -43,10 +48,175 @@ const char* kCustomPatterns[] = {

"(?-s)(\\[SSID=)(.+?)(\\])", // shill

};

+// Helper macro: Non-capturing group

+#define NCG(x) "(?:" x ")"

+// Helper macro: Optional non-capturing group

+#define OPT_NCG(x) NCG(x) "?"

+//////////////////////////////////////////////////////////////////////////

+// Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial

+// limitation on the scheme to increase precision. Otherwise anything

+// like "ID:" would be considered an IRI.

+#define UNRESERVED "[-a-z0-9._~]"

+// Reserved characters: either a general delimiter or a sub-delimiter.

+#define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS)

+#define SUB_DELIMS "[!$&'()*+,;=]"

+#define GEN_DELIMS "[:/?#[\\]@]"

+#define DIGIT "[0-9]"

+#define HEXDIG "[0-9a-f]"

+#define PCT_ENCODED "%" HEXDIG HEXDIG

+#define DEC_OCTET NCG("[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-9]")

+#define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET

+#define H16 NCG(HEXDIG) "{1,4}"

+#define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS)

+#define IPV6ADDRESS NCG( \

+ NCG(H16 ":") "{6}" LS32 "|" \

+ "::" NCG(H16 ":") "{5}" LS32 "|" \

+ OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \

+ OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \

+ OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \

+ OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \

+ OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \

+ OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \

+ OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")

+#define IPVFUTURE \

+ "v" HEXDIG \

+ "+" \

+ "\\." NCG(UNRESERVED "|" SUB_DELIMS \

+ "|" \

+ ":") "+"

+#define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]"

+#define PORT DIGIT "*"

+// This is a deviation from RFC 3987

+#define IPRIVATE \

+ "[" \

+ "\\x{E000}-\\x{F8FF}" \

+ "\\x{F0000}-\\x{FFFFD}" \

+ "\\x{100000}-\\x{10FFFD}" \

+ "]"

+#define UCSCHAR \

+ "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \

+ "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \

+ "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \

+ "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \

+ "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \

+ "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"

+#define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR)

+#define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]")

+#define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*"

+#define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*"

+#define ISEGMENT IPCHAR "*"

+#define ISEGMENT_NZ IPCHAR "+"

+#define ISEGMENT_NZ_NC \

+ NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \

+ "|" "@") "+"

+#define IPATH_EMPTY ""

+#define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"

+#define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"

+#define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")

+#define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"

+#define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \

+ IPATH_ROOTLESS "|" IPATH_EMPTY)

+#define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*"

+#define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME)

+#define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*"

+#define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)

+#define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \

+ "|" IPATH_NOSCHEME "|" IPATH_EMPTY)

+// The query delimiter is escaped: a bare "?" right after "(?:" is a regex

+// repetition operator without an argument, not a literal question mark.

+#define IRELATIVE_REF IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)

+// RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements

+// that end with "Android:" for example are not considered a URL.

+#define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \

+ "|" IPATH_ROOTLESS)

+// The query delimiter is escaped: a bare "?" right after "(?:" is a regex

+// repetition operator without an argument, not a literal question mark.

+#define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY)

+#define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)

+#define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF)

+// TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email

+// addresses. Capture names as well ("First Lastname" <foo@bar.com>).

+// The |kCustomPatternsWithoutContext| array defines further patterns to match

+// and anonymize. Each pattern consists of a single capturing group.

+CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {

+ {"URL", "(?i)(" IRI ")"},

+ // Email Addresses need to come after URLs because they can be part

+ // of a query parameter.

+ {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},

+ // IP filter rules need to come after URLs so that they don't disturb the

+ // URL pattern in case the IP address is part of a URL.

+ {"IPv4", "(?i)(" IPV4ADDRESS ")"},

+ {"IPv6", "(?i)(" IPV6ADDRESS ")"},

+};

+// Like RE2's FindAndConsume, searches for the first occurrence of |pattern| in

+// |input| and consumes the bytes until the end of the pattern matching. Unlike

+// FindAndConsume, the bytes skipped before the match of |pattern| are stored

+// in |skipped_input| (only if a match is found and |argc| > 0).

vasilii 2016/01/11 10:54:29 I think it's misleading because if you pass no arg

battre 2016/01/11 10:59:23 I have added an example. Is it clearer now?

vasilii 2016/01/11 11:58:32 The comment is false. Something meaningful is stor

battre 2016/01/11 12:33:08 Done.

+bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input,

+ const re2::RE2& pattern,

+ re2::StringPiece* skipped_input,

+ re2::StringPiece* args[],

+ int argc) {

+ re2::StringPiece old_input = *input;

+ re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr);

+ re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr);

+ re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr);

+ const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2};

+ CHECK_LE(argc, 3);

+ bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc);

+ if (skipped_input && result && argc > 0) {

+ size_t bytes_skipped = args[0]->data() - old_input.data();

+ *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped);

+ }

+ return result;

+// All |match_groups| need to be of type re2::StringPiece*.

+template <typename... Arg>

+bool FindAndConsumeAndGetSkipped(re2::StringPiece* input,

+ const re2::RE2& pattern,

+ re2::StringPiece* skipped_input,

+ Arg*... match_groups) {

+ re2::StringPiece* args[] = {match_groups...};

+ return FindAndConsumeAndGetSkippedN(input, pattern, skipped_input, args,

+ arraysize(args));

} // namespace

AnonymizerTool::AnonymizerTool()

- : custom_patterns_(arraysize(kCustomPatterns)) {}

+ : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),

+ custom_patterns_without_context_(

+ arraysize(kCustomPatternsWithoutContext)) {}

AnonymizerTool::~AnonymizerTool() {}

@@ -56,48 +226,56 @@ std::string AnonymizerTool::Anonymize(const std::string& input) {

return anonymized;

}

+RE2* AnonymizerTool::GetRegExp(const std::string& pattern) {

+ if (regexp_cache_.find(pattern) == regexp_cache_.end()) {

+ RE2::Options options;

+ // set_multiline of pcre is not supported by RE2, yet.

+ options.set_dot_nl(true); // Dot matches a new line.

+ scoped_ptr<RE2> re = make_scoped_ptr(new RE2(pattern, options));

+ DCHECK_EQ(re2::RE2::NoError, re->error_code())

+ << "Failed to parse:\n" << pattern << "\n" << re->error();

+ regexp_cache_[pattern] = std::move(re);

+ }

+ return regexp_cache_[pattern].get();

std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {

// This regular expression finds the next MAC address. It splits the data into

- // a section preceding the MAC address, an OUI (Organizationally Unique

- // Identifier) part and a NIC (Network Interface Controller) specific part.

- RE2::Options options;

- // set_multiline of pcre is not supported by RE2, yet.

- options.set_dot_nl(true); // Dot matches a new line.

- RE2 mac_re(

- "(.*?)("

- "[0-9a-fA-F][0-9a-fA-F]:"

+ // an OUI (Organizationally Unique Identifier) part and a NIC (Network

+ // Interface Controller) specific part.

+ RE2* mac_re = GetRegExp(

+ "([0-9a-fA-F][0-9a-fA-F]:"

"[0-9a-fA-F][0-9a-fA-F]:"

"[0-9a-fA-F][0-9a-fA-F]):("

"[0-9a-fA-F][0-9a-fA-F]:"

- "[0-9a-fA-F][0-9a-fA-F])",

- options);

+ "[0-9a-fA-F][0-9a-fA-F])");

std::string result;

result.reserve(input.size());

// Keep consuming, building up a result string as we go.

re2::StringPiece text(input);

- std::string pre_mac, oui, nic;

- while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),

- RE2::Arg(&nic))) {

+ re2::StringPiece skipped;

+ re2::StringPiece pre_mac, oui, nic;

+ while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) {

// Look up the MAC address in the hash.

- oui = base::ToLowerASCII(oui);

- nic = base::ToLowerASCII(nic);

- std::string mac = oui + ":" + nic;

+ std::string oui_string = base::ToLowerASCII(oui.as_string());

+ std::string nic_string = base::ToLowerASCII(nic.as_string());

+ std::string mac = oui_string + ":" + nic_string;

std::string replacement_mac = mac_addresses_[mac];

if (replacement_mac.empty()) {

// If not found, build up a replacement MAC address by generating a new

// NIC part.

int mac_id = mac_addresses_.size();

replacement_mac = base::StringPrintf(

- "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,

+ "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16,

(mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));

mac_addresses_[mac] = replacement_mac;

}

- result += pre_mac;

+ skipped.AppendToString(&result);

result += replacement_mac;

}

@@ -106,43 +284,79 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {

}

std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {

- for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {

+ for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {

input =

- AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]);

+ AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],

+ &custom_patterns_with_context_[i]);

+ }

+ for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {

+ input = AnonymizeCustomPatternWithoutContext(

+ input, kCustomPatternsWithoutContext[i],

+ &custom_patterns_without_context_[i]);

}

return input;

}

-// static

-std::string AnonymizerTool::AnonymizeCustomPattern(

+std::string AnonymizerTool::AnonymizeCustomPatternWithContext(

const std::string& input,

const std::string& pattern,

std::map<std::string, std::string>* identifier_space) {

- RE2::Options options;

- // set_multiline of pcre is not supported by RE2, yet.

- options.set_dot_nl(true); // Dot matches a new line.

- RE2 re("(.*?)" + pattern, options);

- DCHECK_EQ(4, re.NumberOfCapturingGroups());

+ RE2* re = GetRegExp(pattern);

+ DCHECK_EQ(3, re->NumberOfCapturingGroups());

std::string result;

result.reserve(input.size());

// Keep consuming, building up a result string as we go.

re2::StringPiece text(input);

- std::string pre_match, pre_matched_id, matched_id, post_matched_id;

- while (RE2::Consume(&text, re, RE2::Arg(&pre_match),

- RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),

- RE2::Arg(&post_matched_id))) {

- std::string replacement_id = (*identifier_space)[matched_id];

+ re2::StringPiece skipped;

+ re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id;

+ while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id,

+ &matched_id, &post_matched_id)) {

+ std::string matched_id_as_string = matched_id.as_string();

+ std::string replacement_id = (*identifier_space)[matched_id_as_string];

if (replacement_id.empty()) {

replacement_id = base::IntToString(identifier_space->size());

- (*identifier_space)[matched_id] = replacement_id;

+ (*identifier_space)[matched_id_as_string] = replacement_id;

+ }

+ skipped.AppendToString(&result);

+ pre_matched_id.AppendToString(&result);

+ result += replacement_id;

+ post_matched_id.AppendToString(&result);

+ }

+ text.AppendToString(&result);

+ return result;

+std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(

+ const std::string& input,

+ const CustomPatternWithoutContext& pattern,

+ std::map<std::string, std::string>* identifier_space) {

+ RE2* re = GetRegExp(pattern.pattern);

+ DCHECK_EQ(1, re->NumberOfCapturingGroups());

+ std::string result;

+ result.reserve(input.size());

+ // Keep consuming, building up a result string as we go.

+ re2::StringPiece text(input);

+ re2::StringPiece skipped;

+ re2::StringPiece matched_id;

+ while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) {

+ std::string matched_id_as_string = matched_id.as_string();

+ std::string replacement_id = (*identifier_space)[matched_id_as_string];

+ if (replacement_id.empty()) {

+ // The weird Uint64ToString trick is because Windows does not like to deal

+ // with %zu and a size_t in printf, nor does it support %llu.

+ replacement_id = base::StringPrintf(

+ "<%s: %s>", pattern.alias,

+ base::Uint64ToString(identifier_space->size()).c_str());

+ (*identifier_space)[matched_id_as_string] = replacement_id;

}

- result += pre_match;

- result += pre_matched_id;

+ skipped.AppendToString(&result);

result += replacement_id;

- result += post_matched_id;

}

text.AppendToString(&result);

return result;

« no previous file with comments | « components/feedback/anonymizer_tool.h ('k') | components/feedback/anonymizer_tool_unittest.cc » ('j') | no next file with comments »