OLD | NEW |
| (Empty) |
1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Regular expression generator: generates all possible | |
6 // regular expressions within parameters (see regexp_generator.h for details). | |
7 | |
8 // The regexp generator first generates a sequence of commands in a simple | |
9 // postfix language. Each command in the language is a string, | |
10 // like "a" or "%s*" or "%s|%s". | |
11 // | |
12 // To evaluate a command, enough arguments are popped from the value stack to | |
13 // plug into the %s slots. Then the result is pushed onto the stack. | |
14 // For example, the command sequence | |
15 // a b %s%s c | |
16 // results in the stack | |
17 // ab c | |
18 // | |
19 // GeneratePostfix generates all possible command sequences. | |
20 // Then RunPostfix turns each sequence into a regular expression | |
21 // and passes the regexp to HandleRegexp. | |
22 | |
23 #include <string.h> | |
24 #include <string> | |
25 #include <stack> | |
26 #include <vector> | |
27 #include "util/test.h" | |
28 #include "re2/testing/regexp_generator.h" | |
29 | |
30 namespace re2 { | |
31 | |
32 // Returns a vector of the egrep regexp operators. | |
33 const vector<string>& RegexpGenerator::EgrepOps() { | |
34 static const char *ops[] = { | |
35 "%s%s", | |
36 "%s|%s", | |
37 "%s*", | |
38 "%s+", | |
39 "%s?", | |
40 "%s\\C*", | |
41 }; | |
42 static vector<string> v(ops, ops + arraysize(ops)); | |
43 return v; | |
44 } | |
45 | |
46 RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, | |
47 const vector<string>& atoms, | |
48 const vector<string>& ops) | |
49 : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { | |
50 // Degenerate case. | |
51 if (atoms_.size() == 0) | |
52 maxatoms_ = 0; | |
53 if (ops_.size() == 0) | |
54 maxops_ = 0; | |
55 } | |
56 | |
57 // Generates all possible regular expressions (within the parameters), | |
58 // calling HandleRegexp for each one. | |
59 void RegexpGenerator::Generate() { | |
60 vector<string> postfix; | |
61 GeneratePostfix(&postfix, 0, 0, 0); | |
62 } | |
63 | |
64 // Generates random regular expressions, calling HandleRegexp for each one. | |
65 void RegexpGenerator::GenerateRandom(int32 seed, int n) { | |
66 ACMRandom acm(seed); | |
67 acm_ = &acm; | |
68 | |
69 for (int i = 0; i < n; i++) { | |
70 vector<string> postfix; | |
71 GenerateRandomPostfix(&postfix, 0, 0, 0); | |
72 } | |
73 | |
74 acm_ = NULL; | |
75 } | |
76 | |
77 // Counts and returns the number of occurrences of "%s" in s. | |
78 static int CountArgs(const string& s) { | |
79 const char *p = s.c_str(); | |
80 int n = 0; | |
81 while ((p = strstr(p, "%s")) != NULL) { | |
82 p += 2; | |
83 n++; | |
84 } | |
85 return n; | |
86 } | |
87 | |
88 // Generates all possible postfix command sequences. | |
89 // Each sequence is handed off to RunPostfix to generate a regular expression. | |
90 // The arguments are: | |
91 // post: the current postfix sequence | |
92 // nstk: the number of elements that would be on the stack after executing | |
93 // the sequence | |
94 // ops: the number of operators used in the sequence | |
95 // atoms: the number of atoms used in the sequence | |
96 // For example, if post were ["a", "b", "%s%s", "c"], | |
97 // then nstk = 2, ops = 1, atoms = 3. | |
98 // | |
99 // The initial call should be GeneratePostfix([empty vector], 0, 0, 0). | |
100 // | |
101 void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk, | |
102 int ops, int atoms) { | |
103 if (nstk == 1) | |
104 RunPostfix(*post); | |
105 | |
106 // Early out: if used too many operators or can't | |
107 // get back down to a single expression on the stack | |
108 // using binary operators, give up. | |
109 if (ops + nstk - 1 > maxops_) | |
110 return; | |
111 | |
112 // Add atoms if there is room. | |
113 if (atoms < maxatoms_) { | |
114 for (size_t i = 0; i < atoms_.size(); i++) { | |
115 post->push_back(atoms_[i]); | |
116 GeneratePostfix(post, nstk + 1, ops, atoms + 1); | |
117 post->pop_back(); | |
118 } | |
119 } | |
120 | |
121 // Add operators if there are enough arguments. | |
122 if (ops < maxops_) { | |
123 for (size_t i = 0; i < ops_.size(); i++) { | |
124 const string& fmt = ops_[i]; | |
125 int nargs = CountArgs(fmt); | |
126 if (nargs <= nstk) { | |
127 post->push_back(fmt); | |
128 GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); | |
129 post->pop_back(); | |
130 } | |
131 } | |
132 } | |
133 } | |
134 | |
135 // Generates a random postfix command sequence. | |
136 // Stops and returns true once a single sequence has been generated. | |
137 bool RegexpGenerator::GenerateRandomPostfix(vector<string>* post, int nstk, | |
138 int ops, int atoms) { | |
139 for (;;) { | |
140 // Stop if we get to a single element, but only sometimes. | |
141 if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) { | |
142 RunPostfix(*post); | |
143 return true; | |
144 } | |
145 | |
146 // Early out: if used too many operators or can't | |
147 // get back down to a single expression on the stack | |
148 // using binary operators, give up. | |
149 if (ops + nstk - 1 > maxops_) | |
150 return false; | |
151 | |
152 // Add operators if there are enough arguments. | |
153 if (ops < maxops_ && acm_->Uniform(2) == 0) { | |
154 const string& fmt = ops_[acm_->Uniform(static_cast<int32>(ops_.size()))]; | |
155 int nargs = CountArgs(fmt); | |
156 if (nargs <= nstk) { | |
157 post->push_back(fmt); | |
158 bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, | |
159 ops + 1, atoms); | |
160 post->pop_back(); | |
161 if (ret) | |
162 return true; | |
163 } | |
164 } | |
165 | |
166 // Add atoms if there is room. | |
167 if (atoms < maxatoms_ && acm_->Uniform(2) == 0) { | |
168 post->push_back(atoms_[acm_->Uniform(static_cast<int32>(atoms_.size()))]); | |
169 bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); | |
170 post->pop_back(); | |
171 if (ret) | |
172 return true; | |
173 } | |
174 } | |
175 } | |
176 | |
177 // Interprets the postfix command sequence to create a regular expression | |
178 // passed to HandleRegexp. The results of operators like %s|%s are wrapped | |
179 // in (?: ) to avoid needing to maintain a precedence table. | |
180 void RegexpGenerator::RunPostfix(const vector<string>& post) { | |
181 stack<string> regexps; | |
182 for (size_t i = 0; i < post.size(); i++) { | |
183 switch (CountArgs(post[i])) { | |
184 default: | |
185 LOG(FATAL) << "Bad operator: " << post[i]; | |
186 case 0: | |
187 regexps.push(post[i]); | |
188 break; | |
189 case 1: { | |
190 string a = regexps.top(); | |
191 regexps.pop(); | |
192 regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); | |
193 break; | |
194 } | |
195 case 2: { | |
196 string b = regexps.top(); | |
197 regexps.pop(); | |
198 string a = regexps.top(); | |
199 regexps.pop(); | |
200 regexps.push("(?:" + | |
201 StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + | |
202 ")"); | |
203 break; | |
204 } | |
205 } | |
206 } | |
207 | |
208 if (regexps.size() != 1) { | |
209 // Internal error - should never happen. | |
210 printf("Bad regexp program:\n"); | |
211 for (size_t i = 0; i < post.size(); i++) { | |
212 printf(" %s\n", CEscape(post[i]).c_str()); | |
213 } | |
214 printf("Stack after running program:\n"); | |
215 while (!regexps.empty()) { | |
216 printf(" %s\n", CEscape(regexps.top()).c_str()); | |
217 regexps.pop(); | |
218 } | |
219 LOG(FATAL) << "Bad regexp program."; | |
220 } | |
221 | |
222 HandleRegexp(regexps.top()); | |
223 HandleRegexp("^(?:" + regexps.top() + ")$"); | |
224 HandleRegexp("^(?:" + regexps.top() + ")"); | |
225 HandleRegexp("(?:" + regexps.top() + ")$"); | |
226 } | |
227 | |
228 // Split s into an vector of strings, one for each UTF-8 character. | |
229 vector<string> Explode(const StringPiece& s) { | |
230 vector<string> v; | |
231 | |
232 for (const char *q = s.begin(); q < s.end(); ) { | |
233 const char* p = q; | |
234 Rune r; | |
235 q += chartorune(&r, q); | |
236 v.push_back(string(p, q - p)); | |
237 } | |
238 | |
239 return v; | |
240 } | |
241 | |
242 // Split string everywhere a substring is found, returning | |
243 // vector of pieces. | |
244 vector<string> Split(const StringPiece& sep, const StringPiece& s) { | |
245 vector<string> v; | |
246 | |
247 if (sep.size() == 0) | |
248 return Explode(s); | |
249 | |
250 const char *p = s.begin(); | |
251 for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) { | |
252 if (StringPiece(q, sep.size()) == sep) { | |
253 v.push_back(string(p, q - p)); | |
254 p = q + sep.size(); | |
255 q = p - 1; // -1 for ++ in loop | |
256 continue; | |
257 } | |
258 } | |
259 if (p < s.end()) | |
260 v.push_back(string(p, s.end() - p)); | |
261 return v; | |
262 } | |
263 | |
264 } // namespace re2 | |
OLD | NEW |