Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(86)

Unified Diff: components/feedback/anonymizer_tool.cc

Issue 1543633003: Added anonymization patterns for URLs and email addresses (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@bug-567870-introduce-anonymizer
Patch Set: Some more tests Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/feedback/anonymizer_tool.cc
diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc
index 13b3758f1584680d7a6da30fb87e4c864cc7ca0d..1d6123e5d6fbe9eed4517052a25ce7e33ce0c247 100644
--- a/components/feedback/anonymizer_tool.cc
+++ b/components/feedback/anonymizer_tool.cc
@@ -16,13 +16,16 @@ namespace feedback {
namespace {
-// The |kCustomPatterns| array defines patterns to match and anonymize. Each
-// pattern needs to define three capturing parentheses groups:
+// The |kCustomPatternsWithContext| array defines patterns to match and
+// anonymize. Each pattern needs to define three capturing parentheses groups:
//
// - a group for the pattern before the identifier to be anonymized;
// - a group for the identifier to be anonymized;
// - a group for the pattern after the identifier to be anonymized.
//
+// The first and the last capture group are the origin of the "WithContext"
+// suffix in the name of this constant.
+//
// Every matched identifier (in the context of the whole pattern) is anonymized
// by replacing it with an incremental instance identifier. Every different
// pattern defines a separate instance identifier space. See the unit test for
@@ -35,7 +38,7 @@ namespace {
// (?i) turns on case insensitivity for the remainder of the regex.
// (?-s) turns off "dot matches newline" for the remainder of the regex.
// (?:regex) denotes non-capturing parentheses group.
-const char* kCustomPatterns[] = {
+const char* kCustomPatternsWithContext[] = {
"(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager
"(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager
"(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant
@@ -43,10 +46,138 @@ const char* kCustomPatterns[] = {
"(?-s)(\\[SSID=)(.+?)(\\])", // shill
};
+// Helper macro: Non capturing group
+#define NCG(x) "(?:" x ")"
+// Helper macro: Optional non capturing group
+#define OPT_NCG(x) NCG(x) "?"
+
+//////////////////////////////////////////////////////////////////////////
+// Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial
+// limitation on the scheme to increase precision. Otherwise anything
+// like "ID:" would be considered an IRI.
+
+#define UNRESERVED "[-a-z0-9._~]"
+// Matches one reserved character: either a GEN_DELIMS or a SUB_DELIMS
+// character. Both are defined further down, which is fine because macros are
+// only expanded where they are used.
+#define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS)
vasilii 2015/12/22 12:31:42 I never used a macros in a macros. Does it work as
battre 2016/01/08 14:14:32 Yes, this is just string replacements.
+#define SUB_DELIMS "[!$&'()*+,;=]"
+#define GEN_DELIMS "[:/?#[\\]@]"
+
+#define DIGIT "[0-9]"
+#define HEXDIG "[0-9a-f]"
+
+#define PCT_ENCODED "%" HEXDIG HEXDIG
+
+// One decimal octet in the range 0-255. The longer alternatives are listed
+// first because RE2 prefers the leftmost alternative that matches: with
+// "[0-9]" first, an octet at the very end of a pattern would consume only a
+// single digit (e.g. "10.0.0.255" would match as "10.0.0.2").
+#define DEC_OCTET NCG("25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]")
+
+#define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET
+
+#define H16 NCG(HEXDIG) "{1,4}"
+#define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS)
+
+#define IPV6ADDRESS NCG( \
+ NCG(H16 ":") "{6}" LS32 "|" \
+ "::" NCG(H16 ":") "{5}" LS32 "|" \
+ OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "|" \
+ OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \
+ OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \
+ OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "|" \
+ OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "|" \
+ OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "|" \
+ OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")
+
+#define IPVFUTURE \
+ "v" HEXDIG \
+ "+" \
+ "\\." NCG(UNRESERVED "|" SUB_DELIMS \
+ "|" \
+ ":") "+"
+
+#define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]"
+
+#define PORT DIGIT "*"
+
+// This is a deviation from RFC 3987: only a fixed set of schemes is accepted.
+#define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android")
+
+#define IPRIVATE \
+ "[" \
+ "\\x{E000}-\\x{F8FF}" \
+ "\\x{F0000}-\\x{FFFFD}" \
+ "\\x{100000}-\\x{10FFFD}" \
+ "]"
+
+#define UCSCHAR \
+ "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \
+ "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \
+ "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \
+ "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \
+ "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \
+ "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"
+
+#define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR)
+
+#define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]")
+#define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*"
+#define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*"
+
+#define ISEGMENT IPCHAR "*"
+#define ISEGMENT_NZ IPCHAR "+"
+#define ISEGMENT_NZ_NC \
+ NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \
+ "|" "@") "+"
+
+#define IPATH_EMPTY ""
+#define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"
+#define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"
+#define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")
+#define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"
+
+#define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \
+ IPATH_ROOTLESS "|" IPATH_EMPTY)
+
+#define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*"
+
+#define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME)
+#define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*"
+#define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)
+
+#define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
+ "|" IPATH_NOSCHEME "|" IPATH_EMPTY)
+
+// A relative reference: relative part, optional "?query", optional
+// "#fragment". The '?' that introduces the query is escaped because an
+// unescaped "?" directly after "(?:" is a repetition operator with nothing to
+// repeat.
+#define IRELATIVE_REF \
+  IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)
+
+// RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements
+// that end with "Android:" for example are not considered a URL.
+#define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
+ "|" IPATH_ROOTLESS)
+
+// An IRI without a fragment: scheme, hierarchical part, optional "?query".
+// The '?' that introduces the query is escaped so that it is matched literally
+// instead of being parsed as a repetition operator.
+#define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY)
+
+#define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)
+
+#define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF)
+
+// TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email
+// addresses. Capture names as well ("First Lastname" <foo@bar.com>).
+
+// The |kCustomPatternsWithoutContext| array defines further patterns to match
+// and anonymize. Each pattern consists of a single capturing group.
+CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {
+ {"URL", "(?i)(" IRI ")"},
+ // Email Addresses need to come after URLs because they can be part
+ // of a query parameter.
+ {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},
+ // IP filter rules need to come after URLs so that they don't disturb the
+ // URL pattern in case the IP address is part of a URL.
+ {"IPv4", "(?i)(" IPV4ADDRESS ")"},
+ {"IPv6", "(?i)(" IPV6ADDRESS ")"},
+};
+
} // namespace
AnonymizerTool::AnonymizerTool()
- : custom_patterns_(arraysize(kCustomPatterns)) {}
+ : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),
+ custom_patterns_without_context_(
+ arraysize(kCustomPatternsWithoutContext)) {}
AnonymizerTool::~AnonymizerTool() {}
@@ -106,15 +237,21 @@ std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
}
std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {
- for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {
+ for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {
input =
- AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]);
+ AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],
+ &custom_patterns_with_context_[i]);
+ }
+ for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {
+ input = AnonymizeCustomPatternWithoutContext(
+ input, kCustomPatternsWithoutContext[i],
+ &custom_patterns_without_context_[i]);
}
return input;
}
// static
-std::string AnonymizerTool::AnonymizeCustomPattern(
+std::string AnonymizerTool::AnonymizeCustomPatternWithContext(
const std::string& input,
const std::string& pattern,
std::map<std::string, std::string>* identifier_space) {
@@ -148,4 +285,38 @@ std::string AnonymizerTool::AnonymizeCustomPattern(
return result;
}
+// Replaces every match of |pattern.pattern| in |input| with a placeholder of
+// the form "<alias: N>" and returns the rewritten text.
+//
+// |pattern.pattern| must contain exactly one capturing group, which is the
+// text to be anonymized. |identifier_space| maps each distinct matched string
+// to its placeholder, so the same matched string always receives the same
+// placeholder for as long as the same map is passed in.
+// static
+std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(
+    const std::string& input,
+    const CustomPatternWithoutContext& pattern,
+    std::map<std::string, std::string>* identifier_space) {
+  RE2::Options options;
+  // set_multiline of pcre is not supported by RE2, yet.
+  options.set_dot_nl(true);  // Dot matches a new line.
+  // The non-greedy "(.*?)" prefix captures the text that precedes each match
+  // so that it can be copied to the output unchanged.
+  RE2 re(std::string("(.*?)") + pattern.pattern, options);
+  DCHECK_EQ(re2::RE2::NoError, re.error_code())
+      << "Failed to parse:\n" << pattern.pattern << "\n" << re.error();
+  // One group for the prefix added above plus one from the pattern itself.
+  DCHECK_EQ(2, re.NumberOfCapturingGroups());
+
+  std::string result;
+  result.reserve(input.size());
+
+  // Keep consuming, building up a result string as we go.
+  re2::StringPiece text(input);
+  std::string pre_match, matched_id;
+  while (RE2::Consume(&text, re, RE2::Arg(&pre_match), RE2::Arg(&matched_id))) {
+    // operator[] inserts an empty entry for an identifier that has not been
+    // seen before, so size() already counts the new entry and numbering starts
+    // at 1. Binding a reference lets the placeholder be stored in place,
+    // without a second lookup or a copy of the mapped string.
+    std::string& replacement_id = (*identifier_space)[matched_id];
+    if (replacement_id.empty()) {
+      replacement_id = base::StringPrintf("<%s: %zu>", pattern.alias,
+                                          identifier_space->size());
+    }
+
+    result += pre_match;
+    result += replacement_id;
+  }
+  // Whatever follows the last match is copied verbatim.
+  text.AppendToString(&result);
+  return result;
+}
+
} // namespace feedback

Powered by Google App Engine
This is Rietveld 408576698