Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(13)

Side by Side Diff: third_party/re2/re2/testing/dfa_test.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/testing/compile_test.cc ('k') | third_party/re2/re2/testing/dump.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/thread.h"
6 #include "util/test.h"
7 #include "re2/prog.h"
8 #include "re2/re2.h"
9 #include "re2/regexp.h"
10 #include "re2/testing/regexp_generator.h"
11 #include "re2/testing/string_generator.h"
12
13 static const bool UsingMallocCounter = false;
14
15 DECLARE_bool(re2_dfa_bail_when_slow);
16
17 DEFINE_int32(size, 8, "log2(number of DFA nodes)");
18 DEFINE_int32(repeat, 2, "Repetition count.");
19 DEFINE_int32(threads, 4, "number of threads");
20
21 namespace re2 {
22
23 // Check that multithreaded access to DFA class works.
24
25 // Helper thread: builds entire DFA for prog.
26 class BuildThread : public Thread {
27 public:
28 BuildThread(Prog* prog) : prog_(prog) {}
29 virtual void Run() {
30 CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
31 }
32
33 private:
34 Prog* prog_;
35 };
36
37 TEST(Multithreaded, BuildEntireDFA) {
38 // Create regexp with 2^FLAGS_size states in DFA.
39 string s = "a";
40 for (int i = 0; i < FLAGS_size; i++)
41 s += "[ab]";
42 s += "b";
43
44 // Check that single-threaded code works.
45 {
46 //LOG(INFO) << s;
47 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
48 CHECK(re);
49 Prog* prog = re->CompileToProg(0);
50 CHECK(prog);
51 BuildThread* t = new BuildThread(prog);
52 t->SetJoinable(true);
53 t->Start();
54 t->Join();
55 delete t;
56 delete prog;
57 re->Decref();
58 }
59
60 // Build the DFA simultaneously in a bunch of threads.
61 for (int i = 0; i < FLAGS_repeat; i++) {
62 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
63 CHECK(re);
64 Prog* prog = re->CompileToProg(0);
65 CHECK(prog);
66
67 vector<BuildThread*> threads;
68 for (int j = 0; j < FLAGS_threads; j++) {
69 BuildThread *t = new BuildThread(prog);
70 t->SetJoinable(true);
71 threads.push_back(t);
72 }
73 for (int j = 0; j < FLAGS_threads; j++)
74 threads[j]->Start();
75 for (int j = 0; j < FLAGS_threads; j++) {
76 threads[j]->Join();
77 delete threads[j];
78 }
79
80 // One more compile, to make sure everything is okay.
81 prog->BuildEntireDFA(Prog::kFirstMatch);
82 delete prog;
83 re->Decref();
84 }
85 }
86
87 // Check that DFA size requirements are followed.
88 // BuildEntireDFA will, like SearchDFA, stop building out
89 // the DFA once the memory limits are reached.
90 TEST(SingleThreaded, BuildEntireDFA) {
91 // Create regexp with 2^30 states in DFA.
92 string s = "a";
93 for (int i = 0; i < 30; i++)
94 s += "[ab]";
95 s += "b";
96
97 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
98 CHECK(re);
99 int max = 24;
100 for (int i = 17; i < max; i++) {
101 int64 limit = 1<<i;
102 int64 usage;
103 //int64 progusage, dfamem;
104 {
105 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
106 Prog* prog = re->CompileToProg(limit);
107 CHECK(prog);
108 //progusage = m.HeapGrowth();
109 //dfamem = prog->dfa_mem();
110 prog->BuildEntireDFA(Prog::kFirstMatch);
111 prog->BuildEntireDFA(Prog::kLongestMatch);
112 usage = m.HeapGrowth();
113 delete prog;
114 }
115 if (!UsingMallocCounter)
116 continue;
117 //LOG(INFO) << "limit " << limit << ", "
118 // << "prog usage " << progusage << ", "
119 // << "DFA budget " << dfamem << ", "
120 // << "total " << usage;
121 // Tolerate +/- 10%.
122 CHECK_GT(usage, limit*9/10);
123 CHECK_LT(usage, limit*11/10);
124 }
125 re->Decref();
126 }
127
128 // Generates and returns a string over binary alphabet {0,1} that contains
129 // all possible binary sequences of length n as subsequences. The obvious
130 // brute force method would generate a string of length n * 2^n, but this
131 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
132 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
133 // Such a string is useful for testing a DFA. If you have a DFA
134 // where distinct last n bytes implies distinct states, then running on a
135 // DeBruijn string causes the DFA to need to create a new state at every
136 // position in the input, never reusing any states until it gets to the
137 // end of the string. This is the worst possible case for DFA execution.
138 static string DeBruijnString(int n) {
139 CHECK_LT(n, static_cast<int>(8*sizeof(int)));
140 CHECK_GT(n, 0);
141
142 vector<bool> did(1<<n);
143 for (int i = 0; i < 1<<n; i++)
144 did[i] = false;
145
146 string s;
147 for (int i = 0; i < n-1; i++)
148 s.append("0");
149 int bits = 0;
150 int mask = (1<<n) - 1;
151 for (int i = 0; i < (1<<n); i++) {
152 bits <<= 1;
153 bits &= mask;
154 if (!did[bits|1]) {
155 bits |= 1;
156 s.append("1");
157 } else {
158 s.append("0");
159 }
160 CHECK(!did[bits]);
161 did[bits] = true;
162 }
163 return s;
164 }
165
166 // Test that the DFA gets the right result even if it runs
167 // out of memory during a search. The regular expression
168 // 0[01]{n}$ matches a binary string of 0s and 1s only if
169 // the (n+1)th-to-last character is a 0. Matching this in
170 // a single forward pass (as done by the DFA) requires
171 // keeping one bit for each of the last n+1 characters
172 // (whether each was a 0), or 2^(n+1) possible states.
173 // If we run this regexp to search in a string that contains
174 // every possible n-character binary string as a substring,
175 // then it will have to run through at least 2^n states.
176 // States are big data structures -- certainly more than 1 byte --
177 // so if the DFA can search correctly while staying within a
178 // 2^n byte limit, it must be handling out-of-memory conditions
179 // gracefully.
180 TEST(SingleThreaded, SearchDFA) {
181 // Choice of n is mostly arbitrary, except that:
182 // * making n too big makes the test run for too long.
183 // * making n too small makes the DFA refuse to run,
184 // because it has so little memory compared to the program size.
185 // Empirically, n = 18 is a good compromise between the two.
186 const int n = 18;
187
188 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
189 Regexp::LikePerl, NULL);
190 CHECK(re);
191
192 // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
193 // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
194 string no_match = DeBruijnString(n);
195 string match = no_match + "0";
196
197 // The De Bruijn string is the worst case input for this regexp.
198 // By default, the DFA will notice that it is flushing its cache
199 // too frequently and will bail out early, so that RE2 can use the
200 // NFA implementation instead. (The DFA loses its speed advantage
201 // if it can't get a good cache hit rate.)
202 // Tell the DFA to trudge along instead.
203 FLAGS_re2_dfa_bail_when_slow = false;
204
205 int64 usage;
206 int64 peak_usage;
207 {
208 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
209 Prog* prog = re->CompileToProg(1<<n);
210 CHECK(prog);
211 for (int i = 0; i < 10; i++) {
212 bool matched, failed = false;
213 matched = prog->SearchDFA(match, NULL,
214 Prog::kUnanchored, Prog::kFirstMatch,
215 NULL, &failed, NULL);
216 CHECK(!failed);
217 CHECK(matched);
218 matched = prog->SearchDFA(no_match, NULL,
219 Prog::kUnanchored, Prog::kFirstMatch,
220 NULL, &failed, NULL);
221 CHECK(!failed);
222 CHECK(!matched);
223 }
224 usage = m.HeapGrowth();
225 peak_usage = m.PeakHeapGrowth();
226 delete prog;
227 }
228 if (!UsingMallocCounter)
229 return;
230 //LOG(INFO) << "usage " << usage << ", "
231 // << "peak usage " << peak_usage;
232 CHECK_LT(usage, 1<<n);
233 CHECK_LT(peak_usage, 1<<n);
234 re->Decref();
235 }
236
237 // Helper thread: searches for match, which should match,
238 // and no_match, which should not.
239 class SearchThread : public Thread {
240 public:
241 SearchThread(Prog* prog, const StringPiece& match,
242 const StringPiece& no_match)
243 : prog_(prog), match_(match), no_match_(no_match) {}
244
245 virtual void Run() {
246 for (int i = 0; i < 2; i++) {
247 bool matched, failed = false;
248 matched = prog_->SearchDFA(match_, NULL,
249 Prog::kUnanchored, Prog::kFirstMatch,
250 NULL, &failed, NULL);
251 CHECK(!failed);
252 CHECK(matched);
253 matched = prog_->SearchDFA(no_match_, NULL,
254 Prog::kUnanchored, Prog::kFirstMatch,
255 NULL, &failed, NULL);
256 CHECK(!failed);
257 CHECK(!matched);
258 }
259 }
260
261 private:
262 Prog* prog_;
263 StringPiece match_;
264 StringPiece no_match_;
265 };
266
267 TEST(Multithreaded, SearchDFA) {
268 // Same as single-threaded test above.
269 const int n = 18;
270 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
271 Regexp::LikePerl, NULL);
272 CHECK(re);
273 string no_match = DeBruijnString(n);
274 string match = no_match + "0";
275 FLAGS_re2_dfa_bail_when_slow = false;
276
277 // Check that single-threaded code works.
278 {
279 Prog* prog = re->CompileToProg(1<<n);
280 CHECK(prog);
281 SearchThread* t = new SearchThread(prog, match, no_match);
282 t->SetJoinable(true);
283 t->Start();
284 t->Join();
285 delete t;
286 delete prog;
287 }
288
289 // Run the search simultaneously in a bunch of threads.
290 // Reuse same flags for Multithreaded.BuildDFA above.
291 for (int i = 0; i < FLAGS_repeat; i++) {
292 //LOG(INFO) << "Search " << i;
293 Prog* prog = re->CompileToProg(1<<n);
294 CHECK(prog);
295
296 vector<SearchThread*> threads;
297 for (int j = 0; j < FLAGS_threads; j++) {
298 SearchThread *t = new SearchThread(prog, match, no_match);
299 t->SetJoinable(true);
300 threads.push_back(t);
301 }
302 for (int j = 0; j < FLAGS_threads; j++)
303 threads[j]->Start();
304 for (int j = 0; j < FLAGS_threads; j++) {
305 threads[j]->Join();
306 delete threads[j];
307 }
308 delete prog;
309 }
310 re->Decref();
311 }
312
313 struct ReverseTest {
314 const char *regexp;
315 const char *text;
316 bool match;
317 };
318
319 // Test that reverse DFA handles anchored/unanchored correctly.
320 // It's in the DFA interface but not used by RE2.
321 ReverseTest reverse_tests[] = {
322 { "\\A(a|b)", "abc", true },
323 { "(a|b)\\z", "cba", true },
324 { "\\A(a|b)", "cba", false },
325 { "(a|b)\\z", "abc", false },
326 };
327
328 TEST(DFA, ReverseMatch) {
329 int nfail = 0;
330 for (int i = 0; i < arraysize(reverse_tests); i++) {
331 const ReverseTest& t = reverse_tests[i];
332 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
333 CHECK(re);
334 Prog *prog = re->CompileToReverseProg(0);
335 CHECK(prog);
336 bool failed = false;
337 bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirst Match, NULL, &failed, NULL);
338 if (matched != t.match) {
339 LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
340 nfail++;
341 }
342 delete prog;
343 re->Decref();
344 }
345 EXPECT_EQ(nfail, 0);
346 }
347
348 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/testing/compile_test.cc ('k') | third_party/re2/re2/testing/dump.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698