Chromium Code Reviews| Index: components/feedback/anonymizer_tool.cc |
| diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc |
| index 13b3758f1584680d7a6da30fb87e4c864cc7ca0d..1d6123e5d6fbe9eed4517052a25ce7e33ce0c247 100644 |
| --- a/components/feedback/anonymizer_tool.cc |
| +++ b/components/feedback/anonymizer_tool.cc |
| @@ -16,13 +16,16 @@ namespace feedback { |
| namespace { |
| -// The |kCustomPatterns| array defines patterns to match and anonymize. Each |
| -// pattern needs to define three capturing parentheses groups: |
| +// The |kCustomPatternsWithContext| array defines patterns to match and |
| +// anonymize. Each pattern needs to define three capturing parentheses groups: |
| // |
| // - a group for the pattern before the identifier to be anonymized; |
| // - a group for the identifier to be anonymized; |
| // - a group for the pattern after the identifier to be anonymized. |
| // |
| +// The first and the last capture group are the origin of the "WithContext" |
| +// suffix in the name of this constant. |
| +// |
| // Every matched identifier (in the context of the whole pattern) is anonymized |
| // by replacing it with an incremental instance identifier. Every different |
| // pattern defines a separate instance identifier space. See the unit test for |
| @@ -35,7 +38,7 @@ namespace { |
| // (?i) turns on case insensitivity for the remainder of the regex. |
| // (?-s) turns off "dot matches newline" for the remainder of the regex. |
| // (?:regex) denotes non-capturing parentheses group. |
| -const char* kCustomPatterns[] = { |
| +const char* kCustomPatternsWithContext[] = { |
| "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager |
| "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager |
| "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant |
| @@ -43,10 +46,138 @@ const char* kCustomPatterns[] = { |
| "(?-s)(\\[SSID=)(.+?)(\\])", // shill |
| }; |
| +// Helper macro: Non-capturing group |
| +#define NCG(x) "(?:" x ")" |
| +// Helper macro: Optional non-capturing group |
| +#define OPT_NCG(x) NCG(x) "?" |
| + |
| +////////////////////////////////////////////////////////////////////////// |
| +// Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial |
| +// limitation on the scheme to increase precision. Otherwise anything |
| +// like "ID:" would be considered an IRI. |
| + |
| +#define UNRESERVED "[-a-z0-9._~]" |
| +// Matches one reserved character: either a GEN_DELIMS or a SUB_DELIMS |
| +// character. Both macros are defined below; that is fine because macros are |
| +// only expanded where RESERVED is used. |
| +#define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS) |
|
vasilii
2015/12/22 12:31:42
I have never used a macro inside a macro. Does it work as
battre
2016/01/08 14:14:32
Yes, this is just string replacements.
|
| +#define SUB_DELIMS "[!$&'()*+,;=]" |
| +#define GEN_DELIMS "[:/?#[\\]@]" |
| + |
| +#define DIGIT "[0-9]" |
| +#define HEXDIG "[0-9a-f]" |
| + |
| +#define PCT_ENCODED "%" HEXDIG HEXDIG |
| + |
| +#define DEC_OCTET NCG("[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-9]") |
| + |
| +#define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET |
| + |
| +#define H16 NCG(HEXDIG) "{1,4}" |
| +#define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS) |
| + |
| +#define IPV6ADDRESS NCG( \ |
| + NCG(H16 ":") "{6}" LS32 "|" \ |
| + "::" NCG(H16 ":") "{5}" LS32 "|" \ |
| + OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \ |
| + OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \ |
| + OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \ |
| + OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \ |
| + OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \ |
| + OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \ |
| + OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::") |
| + |
| +#define IPVFUTURE \ |
| + "v" HEXDIG \ |
| + "+" \ |
| + "\\." NCG(UNRESERVED "|" SUB_DELIMS \ |
| + "|" \ |
| + ":") "+" |
| + |
| +#define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]" |
| + |
| +#define PORT DIGIT "*" |
| + |
| +// This is a deviation from RFC 3987: only a fixed set of schemes is accepted. |
| +#define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android") |
| + |
| +#define IPRIVATE \ |
| + "[" \ |
| + "\\x{E000}-\\x{F8FF}" \ |
| + "\\x{F0000}-\\x{FFFFD}" \ |
| + "\\x{100000}-\\x{10FFFD}" \ |
| + "]" |
| + |
| +#define UCSCHAR \ |
| + "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \ |
| + "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \ |
| + "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \ |
| + "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \ |
| + "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \ |
| + "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]" |
| + |
| +#define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR) |
| + |
| +#define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]") |
| +#define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*" |
| +#define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*" |
| + |
| +#define ISEGMENT IPCHAR "*" |
| +#define ISEGMENT_NZ IPCHAR "+" |
| +#define ISEGMENT_NZ_NC \ |
| + NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \ |
| + "|" "@") "+" |
| + |
| +#define IPATH_EMPTY "" |
| +#define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*" |
| +#define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*" |
| +#define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*") |
| +#define IPATH_ABEMPTY NCG("/" ISEGMENT) "*" |
| + |
| +#define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \ |
| + IPATH_ROOTLESS "|" IPATH_EMPTY) |
| + |
| +#define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*" |
| + |
| +#define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME) |
| +#define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*" |
| +#define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT) |
| + |
| +#define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ |
| + "|" IPATH_NOSCHEME "|" IPATH_EMPTY) |
| + |
| +// The query delimiter is written "\\?" because a bare "?" is a regex |
| +// repetition operator and would make the optional group unparsable. |
| +#define IRELATIVE_REF \ |
| + IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) |
| + |
| +// RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements |
| +// that end with "Android:" for example are not considered a URL. |
| +#define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \ |
| + "|" IPATH_ROOTLESS) |
| + |
| +// The query delimiter is written "\\?" because a bare "?" is a regex |
| +// repetition operator and would make the optional group unparsable. |
| +#define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) |
| + |
| +#define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT) |
| + |
| +#define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF) |
| + |
| +// TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email |
| +// addresses. Capture names as well ("First Lastname" <foo@bar.com>). |
| + |
| +// The |kCustomPatternsWithoutContext| array defines further patterns to match |
| +// and anonymize. Each pattern consists of a single capturing group. |
| +CustomPatternWithoutContext kCustomPatternsWithoutContext[] = { |
| + {"URL", "(?i)(" IRI ")"}, |
| + // Email Addresses need to come after URLs because they can be part |
| + // of a query parameter. |
| + {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"}, |
| + // IP filter rules need to come after URLs so that they don't disturb the |
| + // URL pattern in case the IP address is part of a URL. |
| + {"IPv4", "(?i)(" IPV4ADDRESS ")"}, |
| + {"IPv6", "(?i)(" IPV6ADDRESS ")"}, |
| +}; |
| + |
| } // namespace |
| AnonymizerTool::AnonymizerTool() |
| - : custom_patterns_(arraysize(kCustomPatterns)) {} |
| + : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)), |
| + custom_patterns_without_context_( |
| + arraysize(kCustomPatternsWithoutContext)) {} |
| AnonymizerTool::~AnonymizerTool() {} |
| @@ -106,15 +237,21 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { |
| } |
| std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { |
| - for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { |
| + for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) { |
| input = |
| - AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); |
| + AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i], |
| + &custom_patterns_with_context_[i]); |
| + } |
| + for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) { |
| + input = AnonymizeCustomPatternWithoutContext( |
| + input, kCustomPatternsWithoutContext[i], |
| + &custom_patterns_without_context_[i]); |
| } |
| return input; |
| } |
| // static |
| -std::string AnonymizerTool::AnonymizeCustomPattern( |
| +std::string AnonymizerTool::AnonymizeCustomPatternWithContext( |
| const std::string& input, |
| const std::string& pattern, |
| std::map<std::string, std::string>* identifier_space) { |
| @@ -148,4 +285,38 @@ std::string AnonymizerTool::AnonymizeCustomPattern( |
| return result; |
| } |
| +// static |
| +// Returns a copy of |input| in which every match of |pattern.pattern| is |
| +// replaced by a placeholder of the form "<alias: N>", where alias is |
| +// |pattern.alias|. |identifier_space| maps each matched string to its |
| +// placeholder, so a string that is matched again gets the placeholder that |
| +// was recorded for it earlier. New placeholders are added to the map. |
| +std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext( |
| + const std::string& input, |
| + const CustomPatternWithoutContext& pattern, |
| + std::map<std::string, std::string>* identifier_space) { |
| + RE2::Options options; |
| + // set_multiline of pcre is not supported by RE2, yet. |
| + options.set_dot_nl(true); // Dot matches a new line. |
| + // Prepend a lazy "(.*?)" group that captures the text in front of each |
| + // match. Together with the pattern's own group this gives the two capturing |
| + // groups checked below. |
| + RE2 re(std::string("(.*?)") + pattern.pattern, options); |
| + DCHECK_EQ(re2::RE2::NoError, re.error_code()) |
| + << "Failed to parse:\n" << pattern.pattern << "\n" << re.error(); |
| + DCHECK_EQ(2, re.NumberOfCapturingGroups()); |
| + |
| + std::string result; |
| + result.reserve(input.size()); |
| + |
| + // Keep consuming, building up a result string as we go. |
| + re2::StringPiece text(input); |
| + std::string pre_match, matched_id; |
| + while (RE2::Consume(&text, re, RE2::Arg(&pre_match), RE2::Arg(&matched_id))) { |
| + // operator[] inserts an empty entry when |matched_id| is not known yet, so |
| + // an empty value means this identifier has not been seen before. |
| + std::string replacement_id = (*identifier_space)[matched_id]; |
| + if (replacement_id.empty()) { |
| + // size() already counts the entry inserted by the lookup above, so the |
| + // first identifier stored in an empty map is numbered 1. |
| + replacement_id = base::StringPrintf("<%s: %zu>", pattern.alias, |
| + identifier_space->size()); |
| + (*identifier_space)[matched_id] = replacement_id; |
| + } |
| + |
| + result += pre_match; |
| + result += replacement_id; |
| + } |
| + // Whatever is left in |text| after the last match is copied through as is. |
| + text.AppendToString(&result); |
| + return result; |
| +} |
| + |
| } // namespace feedback |