components/feedback/anonymizer_tool.cc - Issue 1543633003: Added anonymization patterns for URLs and email addresses

Side by Side Diff: components/feedback/anonymizer_tool.cc

Issue 1543633003: Added anonymization patterns for URLs and email addresses (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@bug-567870-introduce-anonymizer

Patch Set: Clarification Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/feedback/anonymizer_tool.h"	5 #include "components/feedback/anonymizer_tool.h"

6	6

7 #include <base/strings/string_number_conversions.h>	7 #include <utility>

8 #include <base/strings/string_util.h>	8

9 #include <base/strings/stringprintf.h>	9 #include "base/strings/string_number_conversions.h"

	10 #include "base/strings/string_util.h"

	11 #include "base/strings/stringprintf.h"

10	12

11 #include "third_party/re2/src/re2/re2.h"	13 #include "third_party/re2/src/re2/re2.h"

12	14

13 using re2::RE2;	15 using re2::RE2;

14	16

15 namespace feedback {	17 namespace feedback {

16	18

17 namespace {	19 namespace {

18	20

19 // The \|kCustomPatterns\| array defines patterns to match and anonymize. Each	21 // The \|kCustomPatternsWithContext\| array defines patterns to match and

20 // pattern needs to define three capturing parentheses groups:	22 // anonymize. Each pattern needs to define three capturing parentheses groups:

21 //	23 //

22 // - a group for the pattern before the identifier to be anonymized;	24 // - a group for the pattern before the identifier to be anonymized;

23 // - a group for the identifier to be anonymized;	25 // - a group for the identifier to be anonymized;

24 // - a group for the pattern after the identifier to be anonymized.	26 // - a group for the pattern after the identifier to be anonymized.

25 //	27 //

	28 // The first and the last capture group are the origin of the "WithContext"

	29 // suffix in the name of this constant.

	30 //

26 // Every matched identifier (in the context of the whole pattern) is anonymized	31 // Every matched identifier (in the context of the whole pattern) is anonymized

27 // by replacing it with an incremental instance identifier. Every different	32 // by replacing it with an incremental instance identifier. Every different

28 // pattern defines a separate instance identifier space. See the unit test for	33 // pattern defines a separate instance identifier space. See the unit test for

29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.	34 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.

30 //	35 //

31 // Useful regular expression syntax:	36 // Useful regular expression syntax:

32 //	37 //

33 // +? is a non-greedy (lazy) +.	38 // +? is a non-greedy (lazy) +.

34 // \b matches a word boundary.	39 // \b matches a word boundary.

35 // (?i) turns on case insensitivity for the remainder of the regex.	40 // (?i) turns on case insensitivity for the remainder of the regex.

36 // (?-s) turns off "dot matches newline" for the remainder of the regex.	41 // (?-s) turns off "dot matches newline" for the remainder of the regex.

37 // (?:regex) denotes non-capturing parentheses group.	42 // (?:regex) denotes non-capturing parentheses group.

38 const char* kCustomPatterns[] = {	43 const char* kCustomPatternsWithContext[] = {

39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager	44 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager

40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager	45 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager

41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant	46 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant

42 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant	47 "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant

43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill	48 "(?-s)(\\[SSID=)(.+?)(\\])", // shill

44 };	49 };

45	50

	51 // Helper macro: Non capturing group

	52 #define NCG(x) "(?:" x ")"

	53 // Helper macro: Optional non capturing group

	54 #define OPT_NCG(x) NCG(x) "?"

	55

	56 //////////////////////////////////////////////////////////////////////////

	57 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial

	58 // limitation on the scheme to increase precision. Otherwise anything

	59 // like "ID:" would be considered an IRI.

	60

	61 #define UNRESERVED "[-a-z0-9._~]"

	62 #define RESERVED NGC(GEN_DELIMS "\|" SUB_DELIMS)

	63 #define SUB_DELIMS "[!$&'()*+,;=]"

	64 #define GEN_DELIMS "[:/?#[\\]@]"

	65

	66 #define DIGIT "[0-9]"

	67 #define HEXDIG "[0-9a-f]"

	68

	69 #define PCT_ENCODED "%" HEXDIG HEXDIG

	70

	71 #define DEC_OCTET NCG("[0-9]\|[1-9][0-9]\|1[0-9][0-9]\|2[0-4][0-9]\|25[0-9]")

	72

	73 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET

	74

	75 #define H16 NCG(HEXDIG) "{1,4}"

	76 #define LS32 NCG(H16 ":" H16 "\|" IPV4ADDRESS)

	77

	78 #define IPV6ADDRESS NCG( \

	79 NCG(H16 ":") "{6}" LS32 "\|" \

	80 "::" NCG(H16 ":") "{5}" LS32 "\|" \

	81 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "\|" \

	82 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "\|" \

	83 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "\|" \

	84 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "\|" \

	85 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "\|" \

	86 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "\|" \

	87 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")

	88

	89 #define IPVFUTURE \

	90 "v" HEXDIG \

	91 "+" \

	92 "\\." NCG(UNRESERVED "\|" SUB_DELIMS \

	93 "\|" \

	94 ":") "+"

	95

	96 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "\|" IPVFUTURE) "\\]"

	97

	98 #define PORT DIGIT "*"

	99

	100 // This is a deviation from RFC 3987

	101 #define SCHEME NCG("http\|https\|ftp\|chrome\|chrome-extension\|android")

	102

	103 #define IPRIVATE \

	104 "[" \

	105 "\\x{E000}-\\x{F8FF}" \

	106 "\\x{F0000}-\\x{FFFFD}" \

	107 "\\x{100000}-\\x{10FFFD}" \

	108 "]"

	109

	110 #define UCSCHAR \

	111 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \

	112 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \

	113 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \

	114 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \

	115 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \

	116 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"

	117

	118 #define IUNRESERVED NCG("[-a-z0-9._~]" "\|" UCSCHAR)

	119

	120 #define IPCHAR NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS "\|" "[:@]")

	121 #define IFRAGMENT NCG(IPCHAR "\|" "[/?]") "*"

	122 #define IQUERY NCG(IPCHAR "\|" IPRIVATE "\|" "[/?]") "*"

	123

	124 #define ISEGMENT IPCHAR "*"

	125 #define ISEGMENT_NZ IPCHAR "+"

	126 #define ISEGMENT_NZ_NC \

	127 NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS \

	128 "\|" "@") "+"

	129

	130 #define IPATH_EMPTY ""

	131 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"

	132 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"

	133 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")

	134 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"

	135

	136 #define IPATH NCG(IPATH_ABEMPTY "\|" IPATH_ABSOLUTE "\|" IPATH_NOSCHEME "\|" \

	137 IPATH_ROOTLESS "\|" IPATH_EMPTY)

	138

	139 #define IREG_NAME NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS) "*"

	140

	141 #define IHOST NCG(IP_LITERAL "\|" IPV4ADDRESS "\|" IREG_NAME)

	142 #define IUSERINFO NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS "\|" ":") "*"

	143 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)

	144

	145 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "\|" IPATH_ABSOLUTE \

	146 "\|" IPATH_NOSCHEME "\|" IPATH_EMPTY)

	147

	148 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("?" IQUERY) OPT_NCG("#" IFRAGMENT)

	149

	150 // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements

	151 // that end with "Android:" for example are not considered a URL.

	152 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "\|" IPATH_ABSOLUTE \

	153 "\|" IPATH_ROOTLESS)

	154

	155 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("?" IQUERY)

	156

	157 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)

	158

	159 #define IRI_REFERENCE NCG(IRI "\|" IRELATIVE_REF)

	160

	161 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email

	162 // addresses. Capture names as well ("First Lastname" <foo@bar.com>).

	163

	164 // The \|kCustomPatternsWithoutContext\| array defines further patterns to match

	165 // and anonymize. Each pattern consists of a single capturing group.

	166 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {

	167 {"URL", "(?i)(" IRI ")"},

	168 // Email Addresses need to come after URLs because they can be part

	169 // of a query parameter.

	170 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},

	171 // IP filter rules need to come after URLs so that they don't disturb the

	172 // URL pattern in case the IP address is part of a URL.

	173 {"IPv4", "(?i)(" IPV4ADDRESS ")"},

	174 {"IPv6", "(?i)(" IPV6ADDRESS ")"},

	175 };

	176

	177 // Like RE2's FindAndConsume, searches for the first occurrence of \|pattern\| in

	178 // \|input\| and consumes the bytes until the end of the pattern matching. Unlike

	179 // FindAndConsume, the bytes skipped before the match of \|pattern\| are stored

	180 // in \|skipped_input\|.

	181 // Example: input = "aaabbb", pattern = "(b+)" leads to skipped_input = "aaa"

	182 // and args[0] = "bbb".

	183 bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input,

	184 const re2::RE2& pattern,

	185 re2::StringPiece* skipped_input,

	186 re2::StringPiece* args[],

	187 int argc) {

	188 re2::StringPiece old_input = *input;

	189

	190 re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr);

	191 re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr);

	192 re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr);

	193 const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2};

	194 CHECK_LE(argc, 3);

	195

	196 if (skipped_input)

	197 *skipped_input = *input;
	vasilii 2016/01/11 11:58:32 This worsens my concern. Now if \|args\| is empty then \|skipped_input\| contains wrong value upon exit.
	battre 2016/01/11 12:33:08 Done.
	198

	199 bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc);

	200

	201 if (skipped_input && result && argc > 0) {

	202 size_t bytes_skipped = args[0]->data() - old_input.data();

	203 *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped);

	204 }

	205 return result;

	206 }

	207

	208 // All \|match_groups\| need to be of type re2::StringPiece*.

	209 template <typename... Arg>

	210 bool FindAndConsumeAndGetSkipped(re2::StringPiece* input,

	211 const re2::RE2& pattern,

	212 re2::StringPiece* skipped_input,

	213 Arg*... match_groups) {

	214 re2::StringPiece* args[] = {match_groups...};

	215 return FindAndConsumeAndGetSkippedN(input, pattern, skipped_input, args,

	216 arraysize(args));

	217 }

	218

46 } // namespace	219 } // namespace

47	220

48 AnonymizerTool::AnonymizerTool()	221 AnonymizerTool::AnonymizerTool()

49 : custom_patterns_(arraysize(kCustomPatterns)) {}	222 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),

	223 custom_patterns_without_context_(

	224 arraysize(kCustomPatternsWithoutContext)) {}

50	225

51 AnonymizerTool::~AnonymizerTool() {}	226 AnonymizerTool::~AnonymizerTool() {}

52	227

53 std::string AnonymizerTool::Anonymize(const std::string& input) {	228 std::string AnonymizerTool::Anonymize(const std::string& input) {

54 std::string anonymized = AnonymizeMACAddresses(input);	229 std::string anonymized = AnonymizeMACAddresses(input);

55 anonymized = AnonymizeCustomPatterns(std::move(anonymized));	230 anonymized = AnonymizeCustomPatterns(std::move(anonymized));

56 return anonymized;	231 return anonymized;

57 }	232 }

58	233

	234 RE2* AnonymizerTool::GetRegExp(const std::string& pattern) {

	235 if (regexp_cache_.find(pattern) == regexp_cache_.end()) {

	236 RE2::Options options;

	237 // set_multiline of pcre is not supported by RE2, yet.

	238 options.set_dot_nl(true); // Dot matches a new line.

	239 scoped_ptr<RE2> re = make_scoped_ptr(new RE2(pattern, options));

	240 DCHECK_EQ(re2::RE2::NoError, re->error_code())

	241 << "Failed to parse:\n" << pattern << "\n" << re->error();

	242 regexp_cache_[pattern] = std::move(re);

	243 }

	244 return regexp_cache_[pattern].get();

	245 }

	246

59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {	247 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {

60 // This regular expression finds the next MAC address. It splits the data into	248 // This regular expression finds the next MAC address. It splits the data into

61 // a section preceding the MAC address, an OUI (Organizationally Unique	249 // an OUI (Organizationally Unique Identifier) part and a NIC (Network

62 // Identifier) part and a NIC (Network Interface Controller) specific part.	250 // Interface Controller) specific part.

63	251

64 RE2::Options options;	252 RE2* mac_re = GetRegExp(

65 // set_multiline of pcre is not supported by RE2, yet.	253 "([0-9a-fA-F][0-9a-fA-F]:"

66 options.set_dot_nl(true); // Dot matches a new line.

67 RE2 mac_re(

68 "(.*?)("

69 "[0-9a-fA-F][0-9a-fA-F]:"

70 "[0-9a-fA-F][0-9a-fA-F]:"	254 "[0-9a-fA-F][0-9a-fA-F]:"

71 "[0-9a-fA-F][0-9a-fA-F]):("	255 "[0-9a-fA-F][0-9a-fA-F]):("

72 "[0-9a-fA-F][0-9a-fA-F]:"	256 "[0-9a-fA-F][0-9a-fA-F]:"

73 "[0-9a-fA-F][0-9a-fA-F]:"	257 "[0-9a-fA-F][0-9a-fA-F]:"

74 "[0-9a-fA-F][0-9a-fA-F])",	258 "[0-9a-fA-F][0-9a-fA-F])");

75 options);

76	259

77 std::string result;	260 std::string result;

78 result.reserve(input.size());	261 result.reserve(input.size());

79	262

80 // Keep consuming, building up a result string as we go.	263 // Keep consuming, building up a result string as we go.

81 re2::StringPiece text(input);	264 re2::StringPiece text(input);

82 std::string pre_mac, oui, nic;	265 re2::StringPiece skipped;

83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),	266 re2::StringPiece pre_mac, oui, nic;

84 RE2::Arg(&nic))) {	267 while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) {

85 // Look up the MAC address in the hash.	268 // Look up the MAC address in the hash.

86 oui = base::ToLowerASCII(oui);	269 std::string oui_string = base::ToLowerASCII(oui.as_string());

87 nic = base::ToLowerASCII(nic);	270 std::string nic_string = base::ToLowerASCII(nic.as_string());

88 std::string mac = oui + ":" + nic;	271 std::string mac = oui_string + ":" + nic_string;

89 std::string replacement_mac = mac_addresses_[mac];	272 std::string replacement_mac = mac_addresses_[mac];

90 if (replacement_mac.empty()) {	273 if (replacement_mac.empty()) {

91 // If not found, build up a replacement MAC address by generating a new	274 // If not found, build up a replacement MAC address by generating a new

92 // NIC part.	275 // NIC part.

93 int mac_id = mac_addresses_.size();	276 int mac_id = mac_addresses_.size();

94 replacement_mac = base::StringPrintf(	277 replacement_mac = base::StringPrintf(

95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,	278 "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16,

96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));	279 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));

97 mac_addresses_[mac] = replacement_mac;	280 mac_addresses_[mac] = replacement_mac;

98 }	281 }

99	282

100 result += pre_mac;	283 skipped.AppendToString(&result);

101 result += replacement_mac;	284 result += replacement_mac;

102 }	285 }

103	286

104 text.AppendToString(&result);	287 text.AppendToString(&result);

105 return result;	288 return result;

106 }	289 }

107	290

108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {	291 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {

109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {	292 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {

110 input =	293 input =

111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]);	294 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],

	295 &custom_patterns_with_context_[i]);

	296 }

	297 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {

	298 input = AnonymizeCustomPatternWithoutContext(

	299 input, kCustomPatternsWithoutContext[i],

	300 &custom_patterns_without_context_[i]);

112 }	301 }

113 return input;	302 return input;

114 }	303 }

115	304

116 // static	305 std::string AnonymizerTool::AnonymizeCustomPatternWithContext(

117 std::string AnonymizerTool::AnonymizeCustomPattern(

118 const std::string& input,	306 const std::string& input,

119 const std::string& pattern,	307 const std::string& pattern,

120 std::map<std::string, std::string>* identifier_space) {	308 std::map<std::string, std::string>* identifier_space) {

121 RE2::Options options;	309 RE2* re = GetRegExp(pattern);

122 // set_multiline of pcre is not supported by RE2, yet.	310 DCHECK_EQ(3, re->NumberOfCapturingGroups());

123 options.set_dot_nl(true); // Dot matches a new line.

124 RE2 re("(.*?)" + pattern, options);

125 DCHECK_EQ(4, re.NumberOfCapturingGroups());

126	311

127 std::string result;	312 std::string result;

128 result.reserve(input.size());	313 result.reserve(input.size());

129	314

130 // Keep consuming, building up a result string as we go.	315 // Keep consuming, building up a result string as we go.

131 re2::StringPiece text(input);	316 re2::StringPiece text(input);

132 std::string pre_match, pre_matched_id, matched_id, post_matched_id;	317 re2::StringPiece skipped;

133 while (RE2::Consume(&text, re, RE2::Arg(&pre_match),	318 re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id;

134 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),	319 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id,

135 RE2::Arg(&post_matched_id))) {	320 &matched_id, &post_matched_id)) {

136 std::string replacement_id = (*identifier_space)[matched_id];	321 std::string matched_id_as_string = matched_id.as_string();

	322 std::string replacement_id = (*identifier_space)[matched_id_as_string];

137 if (replacement_id.empty()) {	323 if (replacement_id.empty()) {

138 replacement_id = base::IntToString(identifier_space->size());	324 replacement_id = base::IntToString(identifier_space->size());

139 (*identifier_space)[matched_id] = replacement_id;	325 (*identifier_space)[matched_id_as_string] = replacement_id;

140 }	326 }

141	327

142 result += pre_match;	328 skipped.AppendToString(&result);

143 result += pre_matched_id;	329 pre_matched_id.AppendToString(&result);

144 result += replacement_id;	330 result += replacement_id;

145 result += post_matched_id;	331 post_matched_id.AppendToString(&result);

146 }	332 }

147 text.AppendToString(&result);	333 text.AppendToString(&result);

148 return result;	334 return result;

	335 }

	336

	337 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(

	338 const std::string& input,

	339 const CustomPatternWithoutContext& pattern,

	340 std::map<std::string, std::string>* identifier_space) {

	341 RE2* re = GetRegExp(pattern.pattern);

	342 DCHECK_EQ(1, re->NumberOfCapturingGroups());

	343

	344 std::string result;

	345 result.reserve(input.size());

	346

	347 // Keep consuming, building up a result string as we go.

	348 re2::StringPiece text(input);

	349 re2::StringPiece skipped;

	350 re2::StringPiece matched_id;

	351 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) {

	352 std::string matched_id_as_string = matched_id.as_string();

	353 std::string replacement_id = (*identifier_space)[matched_id_as_string];

	354 if (replacement_id.empty()) {

	355 // The weird Uint64toString trick is because Windows does not like to deal

	356 // with %zu and a size_t in printf, nor does it support %llu.

	357 replacement_id = base::StringPrintf(

	358 "<%s: %s>", pattern.alias,

	359 base::Uint64ToString(identifier_space->size()).c_str());

	360 (*identifier_space)[matched_id_as_string] = replacement_id;

	361 }

	362

	363 skipped.AppendToString(&result);

	364 result += replacement_id;

	365 }

	366 text.AppendToString(&result);

	367 return result;

149 }	368 }

150	369

151 } // namespace feedback	370 } // namespace feedback

OLD	NEW

« no previous file with comments | « components/feedback/anonymizer_tool.h ('k') | components/feedback/anonymizer_tool_unittest.cc » ('j') | no next file with comments »