| Index: components/feedback/anonymizer_tool.cc
|
| diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc
|
| index 713ceb6a207e475e357dcdc34324708c0c8bb075..a2988f81f63cf883b3a6a6fdca2569961c4b4261 100644
|
| --- a/components/feedback/anonymizer_tool.cc
|
| +++ b/components/feedback/anonymizer_tool.cc
|
| @@ -4,9 +4,11 @@
|
|
|
| #include "components/feedback/anonymizer_tool.h"
|
|
|
| -#include <base/strings/string_number_conversions.h>
|
| -#include <base/strings/string_util.h>
|
| -#include <base/strings/stringprintf.h>
|
| +#include <utility>
|
| +
|
| +#include "base/strings/string_number_conversions.h"
|
| +#include "base/strings/string_util.h"
|
| +#include "base/strings/stringprintf.h"
|
|
|
| #include "third_party/re2/src/re2/re2.h"
|
|
|
| @@ -16,13 +18,16 @@ namespace feedback {
|
|
|
| namespace {
|
|
|
| -// The |kCustomPatterns| array defines patterns to match and anonymize. Each
|
| -// pattern needs to define three capturing parentheses groups:
|
| +// The |kCustomPatternsWithContext| array defines patterns to match and
|
| +// anonymize. Each pattern needs to define three capturing parentheses groups:
|
| //
|
| // - a group for the pattern before the identifier to be anonymized;
|
| // - a group for the identifier to be anonymized;
|
| // - a group for the pattern after the identifier to be anonymized.
|
| //
|
| +// The first and the last capture group are the origin of the "WithContext"
|
| +// suffix in the name of this constant.
|
| +//
|
| // Every matched identifier (in the context of the whole pattern) is anonymized
|
| // by replacing it with an incremental instance identifier. Every different
|
| // pattern defines a separate instance identifier space. See the unit test for
|
| @@ -35,7 +40,7 @@ namespace {
|
| // (?i) turns on case insensitivy for the remainder of the regex.
|
| // (?-s) turns off "dot matches newline" for the remainder of the regex.
|
| // (?:regex) denotes non-capturing parentheses group.
|
| -const char* kCustomPatterns[] = {
|
| +const char* kCustomPatternsWithContext[] = {
|
| "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager
|
| "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager
|
| "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant
|
| @@ -43,10 +48,201 @@ const char* kCustomPatterns[] = {
|
| "(?-s)(\\[SSID=)(.+?)(\\])", // shill
|
| };
|
|
|
| +// Helper macro: Non capturing group
|
| +#define NCG(x) "(?:" x ")"
|
| +// Helper macro: Optional non capturing group
|
| +#define OPT_NCG(x) NCG(x) "?"
|
| +
|
| +//////////////////////////////////////////////////////////////////////////
|
| +// Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial
|
| +// limitation on the scheme to increase precision. Otherwise anything
|
| +// like "ID:" would be considered an IRI.
|
| +
|
| +#define UNRESERVED "[-a-z0-9._~]"
|
| +#define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS)
|
| +#define SUB_DELIMS "[!$&'()*+,;=]"
|
| +#define GEN_DELIMS "[:/?#[\\]@]"
|
| +
|
| +#define DIGIT "[0-9]"
|
| +#define HEXDIG "[0-9a-f]"
|
| +
|
| +#define PCT_ENCODED "%" HEXDIG HEXDIG
|
| +// Longer alternatives first: RE2 prefers the earliest matching alternative.
|
| +#define DEC_OCTET NCG("1[0-9][0-9]|2[0-4][0-9]|25[0-5]|[1-9][0-9]|[0-9]")
|
| +
|
| +#define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET
|
| +
|
| +#define H16 NCG(HEXDIG) "{1,4}"
|
| +#define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS)
|
| +
|
| +#define IPV6ADDRESS NCG( \
|
| + NCG(H16 ":") "{6}" LS32 "|" \
|
| + "::" NCG(H16 ":") "{5}" LS32 "|" \
|
| + OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \
|
| + OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \
|
| + OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \
|
| + OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \
|
| + OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \
|
| + OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \
|
| + OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")
|
| +
|
| +#define IPVFUTURE \
|
| + "v" HEXDIG \
|
| + "+" \
|
| + "\\." NCG(UNRESERVED "|" SUB_DELIMS \
|
| + "|" \
|
| + ":") "+"
|
| +
|
| +#define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]"
|
| +
|
| +#define PORT DIGIT "*"
|
| +
|
| +// This is a deviation from RFC 3987: only a fixed set of schemes is accepted.
|
| +#define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android")
|
| +
|
| +#define IPRIVATE \
|
| + "[" \
|
| + "\\x{E000}-\\x{F8FF}" \
|
| + "\\x{F0000}-\\x{FFFFD}" \
|
| + "\\x{100000}-\\x{10FFFD}" \
|
| + "]"
|
| +
|
| +#define UCSCHAR \
|
| + "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \
|
| + "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \
|
| + "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \
|
| + "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \
|
| + "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \
|
| + "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"
|
| +
|
| +#define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR)
|
| +
|
| +#define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]")
|
| +#define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*"
|
| +#define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*"
|
| +
|
| +#define ISEGMENT IPCHAR "*"
|
| +#define ISEGMENT_NZ IPCHAR "+"
|
| +#define ISEGMENT_NZ_NC \
|
| + NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \
|
| + "|" "@") "+"
|
| +
|
| +#define IPATH_EMPTY ""
|
| +#define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"
|
| +#define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"
|
| +#define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")
|
| +#define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"
|
| +
|
| +#define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \
|
| + IPATH_ROOTLESS "|" IPATH_EMPTY)
|
| +
|
| +#define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*"
|
| +
|
| +#define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME)
|
| +#define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*"
|
| +#define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)
|
| +
|
| +#define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
|
| + "|" IPATH_NOSCHEME "|" IPATH_EMPTY)
|
| +
|
| +#define IRELATIVE_REF IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)
|
| +
|
| +// RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements
|
| +// that end with "Android:" for example are not considered a URL.
|
| +#define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
|
| + "|" IPATH_ROOTLESS)
|
| +
|
| +#define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY)
|
| +
|
| +#define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)
|
| +
|
| +#define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF)
|
| +
|
| +// TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email
|
| +// addresses. Capture names as well ("First Lastname" <foo@bar.com>).
|
| +
|
| +// The |kCustomPatternsWithoutContext| array defines further patterns to match
|
| +// and anonymize. Each pattern consists of a single capturing group.
|
| +CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {
|
| + {"URL", "(?i)(" IRI ")"},
|
| + // Email Addresses need to come after URLs because they can be part
|
| + // of a query parameter.
|
| + {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},
|
| + // IP filter rules need to come after URLs so that they don't disturb the
|
| + // URL pattern in case the IP address is part of a URL.
|
| + {"IPv4", "(?i)(" IPV4ADDRESS ")"},
|
| + {"IPv6", "(?i)(" IPV6ADDRESS ")"},
|
| +};
|
| +
|
| +// Functor template that allows calling a function with a set of parameters
|
| +// that get wrapped into an array. This is a variation of RE2's
|
| +// VariadicFunction2 with an extra parameter.
|
| +template <typename Result, typename Param0, typename Param1, typename Param2,
|
| + typename Arg,
|
| + Result (*Func)(Param0, Param1, Param2, Arg*[], int count)>
|
| +class VariadicFunction3 {
|
| + public:
|
| + Result operator()(Param0 p0, Param1 p1, Param2 p2) const {
|
| + return Func(p0, p1, p2, nullptr, 0);
|
| + }
|
| +
|
| + Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0) const {
|
| + Arg* args[] = {a0};
|
| + return Func(p0, p1, p2, args, 1);
|
| + }
|
| +
|
| + Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0, Arg* a1) const {
|
| + Arg* args[] = {a0, a1};
|
| + return Func(p0, p1, p2, args, 2);
|
| + }
|
| +
|
| + Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0, Arg* a1, Arg* a2)
|
| + const {
|
| + Arg* args[] = {a0, a1, a2};
|
| + return Func(p0, p1, p2, args, 3);
|
| + }
|
| +};
|
| +
|
| +// Like RE2's FindAndConsumeN, searches for the first occurrence of |pattern|
|
| +// in |input| and consumes the bytes up to the end of that match; |args| holds
|
| +// |argc| (at most 3) capture outputs. If the search succeeds, argc > 0 and
|
| +// |skipped_input| is non-null, it receives the bytes preceding args[0].
|
| +bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input,
|
| + const re2::RE2& pattern,
|
| + re2::StringPiece* skipped_input,
|
| + re2::StringPiece* args[],
|
| + int argc) {
|
| + re2::StringPiece old_input = *input;
|
| +
|
| + re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr);
|
| + re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr);
|
| + re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr);
|
| + const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2};
|
| + CHECK_LE(argc, 3);
|
| +
|
| + bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc);
|
| +
|
| + if (skipped_input && result && argc > 0) {
|
| + size_t bytes_skipped = args[0]->data() - old_input.data();
|
| + *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped);
|
| + }
|
| + return result;
|
| +}
|
| +
|
| +const VariadicFunction3<bool,
|
| + re2::StringPiece*,
|
| + const re2::RE2&,
|
| + re2::StringPiece*,
|
| + re2::StringPiece,
|
| + &FindAndConsumeAndGetSkippedN>
|
| + FindAndConsumeAndGetSkipped = {};
|
| +
|
| } // namespace
|
|
|
| AnonymizerTool::AnonymizerTool()
|
| - : custom_patterns_(arraysize(kCustomPatterns)) {}
|
| + : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),
|
| + custom_patterns_without_context_(
|
| + arraysize(kCustomPatternsWithoutContext)) {}
|
|
|
| AnonymizerTool::~AnonymizerTool() {}
|
|
|
| @@ -56,48 +252,56 @@ std::string AnonymizerTool::Anonymize(const std::string& input) {
|
| return anonymized;
|
| }
|
|
|
| +RE2* AnonymizerTool::GetRegExp(const std::string& pattern) {
|
| + if (regexp_cache_.find(pattern) == regexp_cache_.end()) {
|
| + RE2::Options options;
|
| + // set_multiline of pcre is not supported by RE2, yet.
|
| + options.set_dot_nl(true); // Dot matches a new line.
|
| + scoped_ptr<RE2> re = make_scoped_ptr(new RE2(pattern, options));
|
| + DCHECK_EQ(re2::RE2::NoError, re->error_code())
|
| + << "Failed to parse:\n" << pattern << "\n" << re->error();
|
| + regexp_cache_[pattern] = std::move(re);
|
| + }
|
| + return regexp_cache_[pattern].get();
|
| +}
|
| +
|
| std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
|
| // This regular expression finds the next MAC address. It splits the data into
|
| - // a section preceding the MAC address, an OUI (Organizationally Unique
|
| - // Identifier) part and a NIC (Network Interface Controller) specific part.
|
| -
|
| - RE2::Options options;
|
| - // set_multiline of pcre is not supported by RE2, yet.
|
| - options.set_dot_nl(true); // Dot matches a new line.
|
| - RE2 mac_re(
|
| - "(.*?)("
|
| - "[0-9a-fA-F][0-9a-fA-F]:"
|
| + // an OUI (Organizationally Unique Identifier) part and a NIC (Network
|
| + // Interface Controller) specific part.
|
| +
|
| + RE2* mac_re = GetRegExp(
|
| + "([0-9a-fA-F][0-9a-fA-F]:"
|
| "[0-9a-fA-F][0-9a-fA-F]:"
|
| "[0-9a-fA-F][0-9a-fA-F]):("
|
| "[0-9a-fA-F][0-9a-fA-F]:"
|
| "[0-9a-fA-F][0-9a-fA-F]:"
|
| - "[0-9a-fA-F][0-9a-fA-F])",
|
| - options);
|
| + "[0-9a-fA-F][0-9a-fA-F])");
|
|
|
| std::string result;
|
| result.reserve(input.size());
|
|
|
| // Keep consuming, building up a result string as we go.
|
| re2::StringPiece text(input);
|
| - std::string pre_mac, oui, nic;
|
| - while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),
|
| - RE2::Arg(&nic))) {
|
| + re2::StringPiece skipped;
|
| + re2::StringPiece pre_mac, oui, nic;
|
| + while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) {
|
| // Look up the MAC address in the hash.
|
| - oui = base::ToLowerASCII(oui);
|
| - nic = base::ToLowerASCII(nic);
|
| - std::string mac = oui + ":" + nic;
|
| + std::string oui_string = base::ToLowerASCII(oui.as_string());
|
| + std::string nic_string = base::ToLowerASCII(nic.as_string());
|
| + std::string mac = oui_string + ":" + nic_string;
|
| std::string replacement_mac = mac_addresses_[mac];
|
| if (replacement_mac.empty()) {
|
| // If not found, build up a replacement MAC address by generating a new
|
| // NIC part.
|
| int mac_id = mac_addresses_.size();
|
| replacement_mac = base::StringPrintf(
|
| - "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,
|
| + "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16,
|
| (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));
|
| mac_addresses_[mac] = replacement_mac;
|
| }
|
|
|
| - result += pre_mac;
|
| + skipped.AppendToString(&result);
|
| result += replacement_mac;
|
| }
|
|
|
| @@ -105,44 +309,81 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
|
| return result;
|
| }
|
|
|
| +
|
| std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {
|
| - for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {
|
| + for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {
|
| input =
|
| - AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]);
|
| + AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],
|
| + &custom_patterns_with_context_[i]);
|
| + }
|
| + for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {
|
| + input = AnonymizeCustomPatternWithoutContext(
|
| + input, kCustomPatternsWithoutContext[i],
|
| + &custom_patterns_without_context_[i]);
|
| }
|
| return input;
|
| }
|
|
|
| -// static
|
| -std::string AnonymizerTool::AnonymizeCustomPattern(
|
| +std::string AnonymizerTool::AnonymizeCustomPatternWithContext(
|
| const std::string& input,
|
| const std::string& pattern,
|
| std::map<std::string, std::string>* identifier_space) {
|
| - RE2::Options options;
|
| - // set_multiline of pcre is not supported by RE2, yet.
|
| - options.set_dot_nl(true); // Dot matches a new line.
|
| - RE2 re("(.*?)" + pattern, options);
|
| - DCHECK_EQ(4, re.NumberOfCapturingGroups());
|
| + RE2* re = GetRegExp(pattern);
|
| + DCHECK_EQ(3, re->NumberOfCapturingGroups());
|
|
|
| std::string result;
|
| result.reserve(input.size());
|
|
|
| // Keep consuming, building up a result string as we go.
|
| re2::StringPiece text(input);
|
| - std::string pre_match, pre_matched_id, matched_id, post_matched_id;
|
| - while (RE2::Consume(&text, re, RE2::Arg(&pre_match),
|
| - RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),
|
| - RE2::Arg(&post_matched_id))) {
|
| - std::string replacement_id = (*identifier_space)[matched_id];
|
| + re2::StringPiece skipped;
|
| + re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id;
|
| + while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id,
|
| + &matched_id, &post_matched_id)) {
|
| + std::string matched_id_as_string = matched_id.as_string();
|
| + std::string replacement_id = (*identifier_space)[matched_id_as_string];
|
| if (replacement_id.empty()) {
|
| replacement_id = base::IntToString(identifier_space->size());
|
| - (*identifier_space)[matched_id] = replacement_id;
|
| + (*identifier_space)[matched_id_as_string] = replacement_id;
|
| + }
|
| +
|
| + skipped.AppendToString(&result);
|
| + pre_matched_id.AppendToString(&result);
|
| + result += replacement_id;
|
| + post_matched_id.AppendToString(&result);
|
| + }
|
| + text.AppendToString(&result);
|
| + return result;
|
| +}
|
| +
|
| +std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(
|
| + const std::string& input,
|
| + const CustomPatternWithoutContext& pattern,
|
| + std::map<std::string, std::string>* identifier_space) {
|
| + RE2* re = GetRegExp(pattern.pattern);
|
| + DCHECK_EQ(1, re->NumberOfCapturingGroups());
|
| +
|
| + std::string result;
|
| + result.reserve(input.size());
|
| +
|
| + // Keep consuming, building up a result string as we go.
|
| + re2::StringPiece text(input);
|
| + re2::StringPiece skipped;
|
| + re2::StringPiece matched_id;
|
| + while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) {
|
| + std::string matched_id_as_string = matched_id.as_string();
|
| + std::string replacement_id = (*identifier_space)[matched_id_as_string];
|
| + if (replacement_id.empty()) {
|
| + // The weird Uint64ToString trick is because Windows does not like to deal
|
| + // with %zu and a size_t in printf, nor does it support %llu.
|
| + replacement_id = base::StringPrintf(
|
| + "<%s: %s>", pattern.alias,
|
| + base::Uint64ToString(identifier_space->size()).c_str());
|
| + (*identifier_space)[matched_id_as_string] = replacement_id;
|
| }
|
|
|
| - result += pre_match;
|
| - result += pre_matched_id;
|
| + skipped.AppendToString(&result);
|
| result += replacement_id;
|
| - result += post_matched_id;
|
| }
|
| text.AppendToString(&result);
|
| return result;
|
|
|