components/feedback/anonymizer_tool.cc - Issue 1543633003: Added anonymization patterns for URLs and email addresses

Side by Side Diff: components/feedback/anonymizer_tool.cc

Issue 1543633003: Added anonymization patterns for URLs and email addresses (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@bug-567870-introduce-anonymizer

Patch Set: Fixed renaming of one variable and resulting compiler error Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/feedback/anonymizer_tool.h"	5 #include "components/feedback/anonymizer_tool.h"

6	6

7 #include <base/strings/string_number_conversions.h>	7 #include <utility>

8 #include <base/strings/string_util.h>	8

9 #include <base/strings/stringprintf.h>	9 #include "base/strings/string_number_conversions.h"

	10 #include "base/strings/string_util.h"

	11 #include "base/strings/stringprintf.h"

10	12

11 #include "third_party/re2/src/re2/re2.h"	13 #include "third_party/re2/src/re2/re2.h"

12	14

13 using re2::RE2;	15 using re2::RE2;

14	16

15 namespace feedback {	17 namespace feedback {

16	18

17 namespace {	19 namespace {

18	20

19 // The \|kCustomPatterns\| array defines patterns to match and anonymize. Each	21 // The \|kCustomPatternsWithContext\| array defines patterns to match and

20 // pattern needs to define three capturing parentheses groups:	22 // anonymize. Each pattern needs to define three capturing parentheses groups:

21 //	23 //

22 // - a group for the pattern before the identifier to be anonymized;	24 // - a group for the pattern before the identifier to be anonymized;

23 // - a group for the identifier to be anonymized;	25 // - a group for the identifier to be anonymized;

24 // - a group for the pattern after the identifier to be anonymized.	26 // - a group for the pattern after the identifier to be anonymized.

25 //	27 //

	28 // The first and the last capture group are the origin of the "WithContext"

	29 // suffix in the name of this constant.

	30 //

26 // Every matched identifier (in the context of the whole pattern) is anonymized	31 // Every matched identifier (in the context of the whole pattern) is anonymized

27 // by replacing it with an incremental instance identifier. Every different	32 // by replacing it with an incremental instance identifier. Every different

28 // pattern defines a separate instance identifier space. See the unit test for	33 // pattern defines a separate instance identifier space. See the unit test for

29 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.	34 // AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.

30 //	35 //

31 // Useful regular expression syntax:	36 // Useful regular expression syntax:

32 //	37 //

33 // +? is a non-greedy (lazy) +.	38 // +? is a non-greedy (lazy) +.

34 // \b matches a word boundary.	39 // \b matches a word boundary.

35 // (?i) turns on case insensitivity for the remainder of the regex.	40 // (?i) turns on case insensitivity for the remainder of the regex.

36 // (?-s) turns off "dot matches newline" for the remainder of the regex.	41 // (?-s) turns off "dot matches newline" for the remainder of the regex.

37 // (?:regex) denotes non-capturing parentheses group.	42 // (?:regex) denotes non-capturing parentheses group.

38 const char* kCustomPatterns[] = {	43 const char* kCustomPatternsWithContext[] = {

39 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager	44 "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager

40 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager	45 "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager

41 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant	46 "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant

42 "(?-s)(\\bSSID - hexdump\$len=[0-9]+\$: )(.+)()", // wpa_supplicant	47 "(?-s)(\\bSSID - hexdump\$len=[0-9]+\$: )(.+)()", // wpa_supplicant

43 "(?-s)(\\[SSID=)(.+?)(\\])", // shill	48 "(?-s)(\\[SSID=)(.+?)(\\])", // shill

44 };	49 };

45	50

	51 // Helper macro: Non capturing group

	52 #define NCG(x) "(?:" x ")"

	53 // Helper macro: Optional non capturing group

	54 #define OPT_NCG(x) NCG(x) "?"

	55

	56 //////////////////////////////////////////////////////////////////////////

	57 // Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial

	58 // limitation on the scheme to increase precision. Otherwise anything

	59 // like "ID:" would be considered an IRI.

	60

	61 #define UNRESERVED "[-a-z0-9._~]"

	62 #define RESERVED NGC(GEN_DELIMS "\|" SUB_DELIMS)

	63 #define SUB_DELIMS "[!$&'()*+,;=]"

	64 #define GEN_DELIMS "[:/?#[\\]@]"

	65

	66 #define DIGIT "[0-9]"

	67 #define HEXDIG "[0-9a-f]"

	68

	69 #define PCT_ENCODED "%" HEXDIG HEXDIG

	70

	71 #define DEC_OCTET NCG("[0-9]\|[1-9][0-9]\|1[0-9][0-9]\|2[0-4][0-9]\|25[0-9]")

	72

	73 #define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET

	74

	75 #define H16 NCG(HEXDIG) "{1,4}"

	76 #define LS32 NCG(H16 ":" H16 "\|" IPV4ADDRESS)

	77

	78 #define IPV6ADDRESS NCG( \

	79 NCG(H16 ":") "{6}" LS32 "\|" \

	80 "::" NCG(H16 ":") "{5}" LS32 "\|" \

	81 OPT_NCG( H16) "::" NCG(H16 ":") "{4}" LS32 "\|" \

	82 OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "\|" \

	83 OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "\|" \

	84 OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":") LS32 "\|" \

	85 OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::" LS32 "\|" \

	86 OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::" H16 "\|" \

	87 OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")

	88

	89 #define IPVFUTURE \

	90 "v" HEXDIG \

	91 "+" \

	92 "\\." NCG(UNRESERVED "\|" SUB_DELIMS \

	93 "\|" \

	94 ":") "+"

	95

	96 #define IP_LITERAL "\\[" NCG(IPV6ADDRESS "\|" IPVFUTURE) "\\]"

	97

	98 #define PORT DIGIT "*"

	99

	100 // This is a deviation from RFC 3987

	101 #define SCHEME NCG("http\|https\|ftp\|chrome\|chrome-extension\|android")

	102

	103 #define IPRIVATE \

	104 "[" \

	105 "\\x{E000}-\\x{F8FF}" \

	106 "\\x{F0000}-\\x{FFFFD}" \

	107 "\\x{100000}-\\x{10FFFD}" \

	108 "]"

	109

	110 #define UCSCHAR \

	111 "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \

	112 "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \

	113 "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \

	114 "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \

	115 "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \

	116 "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"

	117

	118 #define IUNRESERVED NCG("[-a-z0-9._~]" "\|" UCSCHAR)

	119

	120 #define IPCHAR NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS "\|" "[:@]")

	121 #define IFRAGMENT NCG(IPCHAR "\|" "[/?]") "*"

	122 #define IQUERY NCG(IPCHAR "\|" IPRIVATE "\|" "[/?]") "*"

	123

	124 #define ISEGMENT IPCHAR "*"

	125 #define ISEGMENT_NZ IPCHAR "+"

	126 #define ISEGMENT_NZ_NC \

	127 NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS \

	128 "\|" "@") "+"

	129

	130 #define IPATH_EMPTY ""

	131 #define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"

	132 #define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"

	133 #define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")

	134 #define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"

	135

	136 #define IPATH NCG(IPATH_ABEMPTY "\|" IPATH_ABSOLUTE "\|" IPATH_NOSCHEME "\|" \

	137 IPATH_ROOTLESS "\|" IPATH_EMPTY)

	138

	139 #define IREG_NAME NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS) "*"

	140

	141 #define IHOST NCG(IP_LITERAL "\|" IPV4ADDRESS "\|" IREG_NAME)

	142 #define IUSERINFO NCG(IUNRESERVED "\|" PCT_ENCODED "\|" SUB_DELIMS "\|" ":") "*"

	143 #define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)

	144

	145 #define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "\|" IPATH_ABSOLUTE \

	146 "\|" IPATH_NOSCHEME "\|" IPATH_EMPTY)

	147

	148 #define IRELATIVE_REF IRELATIVE_PART OPT_NCG("?" IQUERY) OPT_NCG("#" IFRAGMENT)

	149

	150 // RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements

	151 // that end with "Android:" for example are not considered a URL.

	152 #define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "\|" IPATH_ABSOLUTE \

	153 "\|" IPATH_ROOTLESS)

	154

	155 #define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("?" IQUERY)

	156

	157 #define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)

	158

	159 #define IRI_REFERENCE NCG(IRI "\|" IRELATIVE_REF)

	160

	161 // TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email

	162 // addresses. Capture names as well ("First Lastname" <foo@bar.com>).

	163

	164 // The \|kCustomPatternsWithoutContext\| array defines further patterns to match

	165 // and anonymize. Each pattern consists of a single capturing group.

	166 CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {

	167 {"URL", "(?i)(" IRI ")"},

	168 // Email Addresses need to come after URLs because they can be part

	169 // of a query parameter.

	170 {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},

	171 // IP filter rules need to come after URLs so that they don't disturb the

	172 // URL pattern in case the IP address is part of a URL.

	173 {"IPv4", "(?i)(" IPV4ADDRESS ")"},

	174 {"IPv6", "(?i)(" IPV6ADDRESS ")"},

	175 };

	176

	177 // Functor template that allows calling a function with a set of parameters

	178 // that get wrapped into an array. This is a variation of RE2's

	179 // VariadicFunction2 with an extra parameter.

	180 template <typename Result, typename Param0, typename Param1, typename Param2,

	181 typename Arg,

	182 Result (Func)(Param0, Param1, Param2, Arg[], int count)>

	183 class VariadicFunction3 {

	184 public:

	185 Result operator()(Param0 p0, Param1 p1, Param2 p2) const {

	186 return Func(p0, p1, p2, nullptr, 0);

	187 }

	188

	189 Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0) const {

	190 Arg* args[] = {a0};

	191 return Func(p0, p1, p2, args, 1);

	192 }

	193

	194 Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0, Arg* a1) const {

	195 Arg* args[] = {a0, a1};

	196 return Func(p0, p1, p2, args, 2);

	197 }

	198

	199 Result operator()(Param0 p0, Param1 p1, Param2 p2, Arg* a0, Arg* a1, Arg* a2)

	200 const {

	201 Arg* args[] = {a0, a1, a2};

	202 return Func(p0, p1, p2, args, 3);
	vasilii 2016/01/08 17:02:26
	I think you can collapse these definitions using a variadic template.

	template <typename Result, typename Param0, typename Param1, typename Param2,
	typename ...Arg>
	Result FindAndConsumeAndGetSkippedInternal(Param0 p0, Param1 p1, Param2 p2,
	Arg... args) {
	auto* args_array = {args...};
	return FindAndConsumeAndGetSkippedN(p0, p1, p2, args, arraysize(args));
	}

	using FindAndConsumeAndGetSkipped = FindAndConsumeAndGetSkippedInternal<your
	parameters>;

	battre 2016/01/11 09:02:20
	On 2016/01/08 17:02:26, vasilii wrote:
	> I think you can collapse these definitions using a variadic template.
	>
	> template <typename Result, typename Param0, typename Param1, typename Param2,
	> typename ...Arg>
	> Result FindAndConsumeAndGetSkippedInternal(Param0 p0, Param1 p1, Param2 p2,
	> Arg... args) {
	> auto* args_array = {args...};
	> return FindAndConsumeAndGetSkippedN(p0, p1, p2, args, arraysize(args));
	> }
	>
	> using FindAndConsumeAndGetSkipped = FindAndConsumeAndGetSkippedInternal<your
	> parameters>;

	I don't even need this anymore once I use a variadic template. Done.
	203 }

	204 };

	205

	206 // Like RE2's FindAndConsume, searches for the first occurrence of \|pattern\| in

	207 // \|input\| and consumes the bytes until the end of the pattern matching. Unlike

	208 // FindAndConsume, the bytes skipped before the match of \|pattern\| are stored

	209 // in \|skipped_input\|.

	210 bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input,

	211 const re2::RE2& pattern,

	212 re2::StringPiece* skipped_input,

	213 re2::StringPiece* args[],

	214 int argc) {

	215 re2::StringPiece old_input = *input;

	216

	217 re2::RE2::Arg a0(argc > 0 ? args[0] : nullptr);

	218 re2::RE2::Arg a1(argc > 1 ? args[1] : nullptr);

	219 re2::RE2::Arg a2(argc > 2 ? args[2] : nullptr);

	220 const re2::RE2::Arg* const wrapped_args[] = {&a0, &a1, &a2};

	221 CHECK_LE(argc, 3);

	222

	223 bool result = re2::RE2::FindAndConsumeN(input, pattern, wrapped_args, argc);

	224

	225 if (skipped_input && result && argc > 0) {

	226 size_t bytes_skipped = args[0]->data() - old_input.data();

	227 *skipped_input = re2::StringPiece(old_input.data(), bytes_skipped);

	228 }

	229 return result;

	230 }

	231

	232 const VariadicFunction3<bool,

	233 re2::StringPiece*,

	234 const re2::RE2&,

	235 re2::StringPiece*,

	236 re2::StringPiece,

	237 &FindAndConsumeAndGetSkippedN>

	238 FindAndConsumeAndGetSkipped = {};

	239

46 } // namespace	240 } // namespace

47	241

48 AnonymizerTool::AnonymizerTool()	242 AnonymizerTool::AnonymizerTool()

49 : custom_patterns_(arraysize(kCustomPatterns)) {}	243 : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),

	244 custom_patterns_without_context_(

	245 arraysize(kCustomPatternsWithoutContext)) {}

50	246

51 AnonymizerTool::~AnonymizerTool() {}	247 AnonymizerTool::~AnonymizerTool() {}

52	248

53 std::string AnonymizerTool::Anonymize(const std::string& input) {	249 std::string AnonymizerTool::Anonymize(const std::string& input) {

54 std::string anonymized = AnonymizeMACAddresses(input);	250 std::string anonymized = AnonymizeMACAddresses(input);

55 anonymized = AnonymizeCustomPatterns(std::move(anonymized));	251 anonymized = AnonymizeCustomPatterns(std::move(anonymized));

56 return anonymized;	252 return anonymized;

57 }	253 }

58	254

	255 RE2* AnonymizerTool::GetRegExp(const std::string& pattern) {

	256 if (regexp_cache_.find(pattern) == regexp_cache_.end()) {

	257 RE2::Options options;

	258 // set_multiline of pcre is not supported by RE2, yet.

	259 options.set_dot_nl(true); // Dot matches a new line.

	260 scoped_ptr<RE2> re = make_scoped_ptr(new RE2(pattern, options));

	261 DCHECK_EQ(re2::RE2::NoError, re->error_code())

	262 << "Failed to parse:\n" << pattern << "\n" << re->error();

	263 regexp_cache_[pattern] = std::move(re);

	264 }

	265 return regexp_cache_[pattern].get();

	266 }

	267

59 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {	268 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {

60 // This regular expression finds the next MAC address. It splits the data into	269 // This regular expression finds the next MAC address. It splits the data into

61 // a section preceding the MAC address, an OUI (Organizationally Unique	270 // an OUI (Organizationally Unique Identifier) part and a NIC (Network

62 // Identifier) part and a NIC (Network Interface Controller) specific part.	271 // Interface Controller) specific part.

63	272

64 RE2::Options options;	273 RE2* mac_re = GetRegExp(

65 // set_multiline of pcre is not supported by RE2, yet.	274 "([0-9a-fA-F][0-9a-fA-F]:"

66 options.set_dot_nl(true); // Dot matches a new line.

67 RE2 mac_re(

68 "(.*?)("

69 "[0-9a-fA-F][0-9a-fA-F]:"

70 "[0-9a-fA-F][0-9a-fA-F]:"	275 "[0-9a-fA-F][0-9a-fA-F]:"

71 "[0-9a-fA-F][0-9a-fA-F]):("	276 "[0-9a-fA-F][0-9a-fA-F]):("

72 "[0-9a-fA-F][0-9a-fA-F]:"	277 "[0-9a-fA-F][0-9a-fA-F]:"

73 "[0-9a-fA-F][0-9a-fA-F]:"	278 "[0-9a-fA-F][0-9a-fA-F]:"

74 "[0-9a-fA-F][0-9a-fA-F])",	279 "[0-9a-fA-F][0-9a-fA-F])");

75 options);

76	280

77 std::string result;	281 std::string result;

78 result.reserve(input.size());	282 result.reserve(input.size());

79	283

80 // Keep consuming, building up a result string as we go.	284 // Keep consuming, building up a result string as we go.

81 re2::StringPiece text(input);	285 re2::StringPiece text(input);

82 std::string pre_mac, oui, nic;	286 re2::StringPiece skipped;

83 while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),	287 re2::StringPiece pre_mac, oui, nic;

84 RE2::Arg(&nic))) {	288 while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) {

85 // Look up the MAC address in the hash.	289 // Look up the MAC address in the hash.

86 oui = base::ToLowerASCII(oui);	290 std::string oui_string = base::ToLowerASCII(oui.as_string());

87 nic = base::ToLowerASCII(nic);	291 std::string nic_string = base::ToLowerASCII(nic.as_string());

88 std::string mac = oui + ":" + nic;	292 std::string mac = oui_string + ":" + nic_string;

89 std::string replacement_mac = mac_addresses_[mac];	293 std::string replacement_mac = mac_addresses_[mac];

90 if (replacement_mac.empty()) {	294 if (replacement_mac.empty()) {

91 // If not found, build up a replacement MAC address by generating a new	295 // If not found, build up a replacement MAC address by generating a new

92 // NIC part.	296 // NIC part.

93 int mac_id = mac_addresses_.size();	297 int mac_id = mac_addresses_.size();

94 replacement_mac = base::StringPrintf(	298 replacement_mac = base::StringPrintf(

95 "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,	299 "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16,

96 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));	300 (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));

97 mac_addresses_[mac] = replacement_mac;	301 mac_addresses_[mac] = replacement_mac;

98 }	302 }

99	303

100 result += pre_mac;	304 skipped.AppendToString(&result);

101 result += replacement_mac;	305 result += replacement_mac;

102 }	306 }

103	307

104 text.AppendToString(&result);	308 text.AppendToString(&result);

105 return result;	309 return result;

106 }	310 }

107	311

	312

108 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {	313 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {

109 for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {	314 for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {

110 input =	315 input =

111 AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]);	316 AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],

	317 &custom_patterns_with_context_[i]);

	318 }

	319 for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {

	320 input = AnonymizeCustomPatternWithoutContext(

	321 input, kCustomPatternsWithoutContext[i],

	322 &custom_patterns_without_context_[i]);

112 }	323 }

113 return input;	324 return input;

114 }	325 }

115	326

116 // static	327 std::string AnonymizerTool::AnonymizeCustomPatternWithContext(

117 std::string AnonymizerTool::AnonymizeCustomPattern(

118 const std::string& input,	328 const std::string& input,

119 const std::string& pattern,	329 const std::string& pattern,

120 std::map<std::string, std::string>* identifier_space) {	330 std::map<std::string, std::string>* identifier_space) {

121 RE2::Options options;	331 RE2* re = GetRegExp(pattern);

122 // set_multiline of pcre is not supported by RE2, yet.	332 DCHECK_EQ(3, re->NumberOfCapturingGroups());

123 options.set_dot_nl(true); // Dot matches a new line.

124 RE2 re("(.*?)" + pattern, options);

125 DCHECK_EQ(4, re.NumberOfCapturingGroups());

126	333

127 std::string result;	334 std::string result;

128 result.reserve(input.size());	335 result.reserve(input.size());

129	336

130 // Keep consuming, building up a result string as we go.	337 // Keep consuming, building up a result string as we go.

131 re2::StringPiece text(input);	338 re2::StringPiece text(input);

132 std::string pre_match, pre_matched_id, matched_id, post_matched_id;	339 re2::StringPiece skipped;

133 while (RE2::Consume(&text, re, RE2::Arg(&pre_match),	340 re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id;

134 RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),	341 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id,

135 RE2::Arg(&post_matched_id))) {	342 &matched_id, &post_matched_id)) {

136 std::string replacement_id = (*identifier_space)[matched_id];	343 std::string matched_id_as_string = matched_id.as_string();

	344 std::string replacement_id = (*identifier_space)[matched_id_as_string];

137 if (replacement_id.empty()) {	345 if (replacement_id.empty()) {

138 replacement_id = base::IntToString(identifier_space->size());	346 replacement_id = base::IntToString(identifier_space->size());

139 (*identifier_space)[matched_id] = replacement_id;	347 (*identifier_space)[matched_id_as_string] = replacement_id;

140 }	348 }

141	349

142 result += pre_match;	350 skipped.AppendToString(&result);

143 result += pre_matched_id;	351 pre_matched_id.AppendToString(&result);

144 result += replacement_id;	352 result += replacement_id;

145 result += post_matched_id;	353 post_matched_id.AppendToString(&result);

146 }	354 }

147 text.AppendToString(&result);	355 text.AppendToString(&result);

148 return result;	356 return result;

	357 }

	358

	359 std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(

	360 const std::string& input,

	361 const CustomPatternWithoutContext& pattern,

	362 std::map<std::string, std::string>* identifier_space) {

	363 RE2* re = GetRegExp(pattern.pattern);

	364 DCHECK_EQ(1, re->NumberOfCapturingGroups());

	365

	366 std::string result;

	367 result.reserve(input.size());

	368

	369 // Keep consuming, building up a result string as we go.

	370 re2::StringPiece text(input);

	371 re2::StringPiece skipped;

	372 re2::StringPiece matched_id;

	373 while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) {

	374 std::string matched_id_as_string = matched_id.as_string();

	375 std::string replacement_id = (*identifier_space)[matched_id_as_string];

	376 if (replacement_id.empty()) {

	377 // The weird Uint64toString trick is because Windows does not like to deal

	378 // with %zu and a size_t in printf, nor does it support %llu.

	379 replacement_id = base::StringPrintf(

	380 "<%s: %s>", pattern.alias,

	381 base::Uint64ToString(identifier_space->size()).c_str());

	382 (*identifier_space)[matched_id_as_string] = replacement_id;

	383 }

	384

	385 skipped.AppendToString(&result);

	386 result += replacement_id;

	387 }

	388 text.AppendToString(&result);

	389 return result;

149 }	390 }

150	391

151 } // namespace feedback	392 } // namespace feedback

OLD	NEW

« no previous file with comments | « components/feedback/anonymizer_tool.h ('k') | components/feedback/anonymizer_tool_unittest.cc » ('j') | no next file with comments »