| OLD | NEW |
| (Empty) |
| 1 // Copyright 2006-2008 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 #include "util/thread.h" | |
| 6 #include "util/test.h" | |
| 7 #include "re2/prog.h" | |
| 8 #include "re2/re2.h" | |
| 9 #include "re2/regexp.h" | |
| 10 #include "re2/testing/regexp_generator.h" | |
| 11 #include "re2/testing/string_generator.h" | |
| 12 | |
| 13 static const bool UsingMallocCounter = false; | |
| 14 | |
| 15 DECLARE_bool(re2_dfa_bail_when_slow); | |
| 16 | |
| 17 DEFINE_int32(size, 8, "log2(number of DFA nodes)"); | |
| 18 DEFINE_int32(repeat, 2, "Repetition count."); | |
| 19 DEFINE_int32(threads, 4, "number of threads"); | |
| 20 | |
| 21 namespace re2 { | |
| 22 | |
| 23 // Check that multithreaded access to DFA class works. | |
| 24 | |
| 25 // Helper thread: builds entire DFA for prog. | |
| 26 class BuildThread : public Thread { | |
| 27 public: | |
| 28 BuildThread(Prog* prog) : prog_(prog) {} | |
| 29 virtual void Run() { | |
| 30 CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch)); | |
| 31 } | |
| 32 | |
| 33 private: | |
| 34 Prog* prog_; | |
| 35 }; | |
| 36 | |
| 37 TEST(Multithreaded, BuildEntireDFA) { | |
| 38 // Create regexp with 2^FLAGS_size states in DFA. | |
| 39 string s = "a"; | |
| 40 for (int i = 0; i < FLAGS_size; i++) | |
| 41 s += "[ab]"; | |
| 42 s += "b"; | |
| 43 | |
| 44 // Check that single-threaded code works. | |
| 45 { | |
| 46 //LOG(INFO) << s; | |
| 47 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); | |
| 48 CHECK(re); | |
| 49 Prog* prog = re->CompileToProg(0); | |
| 50 CHECK(prog); | |
| 51 BuildThread* t = new BuildThread(prog); | |
| 52 t->SetJoinable(true); | |
| 53 t->Start(); | |
| 54 t->Join(); | |
| 55 delete t; | |
| 56 delete prog; | |
| 57 re->Decref(); | |
| 58 } | |
| 59 | |
| 60 // Build the DFA simultaneously in a bunch of threads. | |
| 61 for (int i = 0; i < FLAGS_repeat; i++) { | |
| 62 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); | |
| 63 CHECK(re); | |
| 64 Prog* prog = re->CompileToProg(0); | |
| 65 CHECK(prog); | |
| 66 | |
| 67 vector<BuildThread*> threads; | |
| 68 for (int j = 0; j < FLAGS_threads; j++) { | |
| 69 BuildThread *t = new BuildThread(prog); | |
| 70 t->SetJoinable(true); | |
| 71 threads.push_back(t); | |
| 72 } | |
| 73 for (int j = 0; j < FLAGS_threads; j++) | |
| 74 threads[j]->Start(); | |
| 75 for (int j = 0; j < FLAGS_threads; j++) { | |
| 76 threads[j]->Join(); | |
| 77 delete threads[j]; | |
| 78 } | |
| 79 | |
| 80 // One more compile, to make sure everything is okay. | |
| 81 prog->BuildEntireDFA(Prog::kFirstMatch); | |
| 82 delete prog; | |
| 83 re->Decref(); | |
| 84 } | |
| 85 } | |
| 86 | |
| 87 // Check that DFA size requirements are followed. | |
| 88 // BuildEntireDFA will, like SearchDFA, stop building out | |
| 89 // the DFA once the memory limits are reached. | |
| 90 TEST(SingleThreaded, BuildEntireDFA) { | |
| 91 // Create regexp with 2^30 states in DFA. | |
| 92 string s = "a"; | |
| 93 for (int i = 0; i < 30; i++) | |
| 94 s += "[ab]"; | |
| 95 s += "b"; | |
| 96 | |
| 97 Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); | |
| 98 CHECK(re); | |
| 99 int max = 24; | |
| 100 for (int i = 17; i < max; i++) { | |
| 101 int64 limit = 1<<i; | |
| 102 int64 usage; | |
| 103 //int64 progusage, dfamem; | |
| 104 { | |
| 105 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); | |
| 106 Prog* prog = re->CompileToProg(limit); | |
| 107 CHECK(prog); | |
| 108 //progusage = m.HeapGrowth(); | |
| 109 //dfamem = prog->dfa_mem(); | |
| 110 prog->BuildEntireDFA(Prog::kFirstMatch); | |
| 111 prog->BuildEntireDFA(Prog::kLongestMatch); | |
| 112 usage = m.HeapGrowth(); | |
| 113 delete prog; | |
| 114 } | |
| 115 if (!UsingMallocCounter) | |
| 116 continue; | |
| 117 //LOG(INFO) << "limit " << limit << ", " | |
| 118 // << "prog usage " << progusage << ", " | |
| 119 // << "DFA budget " << dfamem << ", " | |
| 120 // << "total " << usage; | |
| 121 // Tolerate +/- 10%. | |
| 122 CHECK_GT(usage, limit*9/10); | |
| 123 CHECK_LT(usage, limit*11/10); | |
| 124 } | |
| 125 re->Decref(); | |
| 126 } | |
| 127 | |
| 128 // Generates and returns a string over binary alphabet {0,1} that contains | |
| 129 // all possible binary sequences of length n as subsequences. The obvious | |
| 130 // brute force method would generate a string of length n * 2^n, but this | |
| 131 // generates a string of length n + 2^n - 1 called a De Bruijn cycle. | |
| 132 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. | |
| 133 // Such a string is useful for testing a DFA. If you have a DFA | |
| 134 // where distinct last n bytes implies distinct states, then running on a | |
| 135 // DeBruijn string causes the DFA to need to create a new state at every | |
| 136 // position in the input, never reusing any states until it gets to the | |
| 137 // end of the string. This is the worst possible case for DFA execution. | |
| 138 static string DeBruijnString(int n) { | |
| 139 CHECK_LT(n, static_cast<int>(8*sizeof(int))); | |
| 140 CHECK_GT(n, 0); | |
| 141 | |
| 142 vector<bool> did(1<<n); | |
| 143 for (int i = 0; i < 1<<n; i++) | |
| 144 did[i] = false; | |
| 145 | |
| 146 string s; | |
| 147 for (int i = 0; i < n-1; i++) | |
| 148 s.append("0"); | |
| 149 int bits = 0; | |
| 150 int mask = (1<<n) - 1; | |
| 151 for (int i = 0; i < (1<<n); i++) { | |
| 152 bits <<= 1; | |
| 153 bits &= mask; | |
| 154 if (!did[bits|1]) { | |
| 155 bits |= 1; | |
| 156 s.append("1"); | |
| 157 } else { | |
| 158 s.append("0"); | |
| 159 } | |
| 160 CHECK(!did[bits]); | |
| 161 did[bits] = true; | |
| 162 } | |
| 163 return s; | |
| 164 } | |
| 165 | |
| 166 // Test that the DFA gets the right result even if it runs | |
| 167 // out of memory during a search. The regular expression | |
| 168 // 0[01]{n}$ matches a binary string of 0s and 1s only if | |
| 169 // the (n+1)th-to-last character is a 0. Matching this in | |
| 170 // a single forward pass (as done by the DFA) requires | |
| 171 // keeping one bit for each of the last n+1 characters | |
| 172 // (whether each was a 0), or 2^(n+1) possible states. | |
| 173 // If we run this regexp to search in a string that contains | |
| 174 // every possible n-character binary string as a substring, | |
| 175 // then it will have to run through at least 2^n states. | |
| 176 // States are big data structures -- certainly more than 1 byte -- | |
| 177 // so if the DFA can search correctly while staying within a | |
| 178 // 2^n byte limit, it must be handling out-of-memory conditions | |
| 179 // gracefully. | |
| 180 TEST(SingleThreaded, SearchDFA) { | |
| 181 // Choice of n is mostly arbitrary, except that: | |
| 182 // * making n too big makes the test run for too long. | |
| 183 // * making n too small makes the DFA refuse to run, | |
| 184 // because it has so little memory compared to the program size. | |
| 185 // Empirically, n = 18 is a good compromise between the two. | |
| 186 const int n = 18; | |
| 187 | |
| 188 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), | |
| 189 Regexp::LikePerl, NULL); | |
| 190 CHECK(re); | |
| 191 | |
| 192 // The De Bruijn string for n ends with a 1 followed by n 0s in a row, | |
| 193 // which is not a match for 0[01]{n}$. Adding one more 0 is a match. | |
| 194 string no_match = DeBruijnString(n); | |
| 195 string match = no_match + "0"; | |
| 196 | |
| 197 // The De Bruijn string is the worst case input for this regexp. | |
| 198 // By default, the DFA will notice that it is flushing its cache | |
| 199 // too frequently and will bail out early, so that RE2 can use the | |
| 200 // NFA implementation instead. (The DFA loses its speed advantage | |
| 201 // if it can't get a good cache hit rate.) | |
| 202 // Tell the DFA to trudge along instead. | |
| 203 FLAGS_re2_dfa_bail_when_slow = false; | |
| 204 | |
| 205 int64 usage; | |
| 206 int64 peak_usage; | |
| 207 { | |
| 208 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); | |
| 209 Prog* prog = re->CompileToProg(1<<n); | |
| 210 CHECK(prog); | |
| 211 for (int i = 0; i < 10; i++) { | |
| 212 bool matched, failed = false; | |
| 213 matched = prog->SearchDFA(match, NULL, | |
| 214 Prog::kUnanchored, Prog::kFirstMatch, | |
| 215 NULL, &failed, NULL); | |
| 216 CHECK(!failed); | |
| 217 CHECK(matched); | |
| 218 matched = prog->SearchDFA(no_match, NULL, | |
| 219 Prog::kUnanchored, Prog::kFirstMatch, | |
| 220 NULL, &failed, NULL); | |
| 221 CHECK(!failed); | |
| 222 CHECK(!matched); | |
| 223 } | |
| 224 usage = m.HeapGrowth(); | |
| 225 peak_usage = m.PeakHeapGrowth(); | |
| 226 delete prog; | |
| 227 } | |
| 228 if (!UsingMallocCounter) | |
| 229 return; | |
| 230 //LOG(INFO) << "usage " << usage << ", " | |
| 231 // << "peak usage " << peak_usage; | |
| 232 CHECK_LT(usage, 1<<n); | |
| 233 CHECK_LT(peak_usage, 1<<n); | |
| 234 re->Decref(); | |
| 235 } | |
| 236 | |
| 237 // Helper thread: searches for match, which should match, | |
| 238 // and no_match, which should not. | |
| 239 class SearchThread : public Thread { | |
| 240 public: | |
| 241 SearchThread(Prog* prog, const StringPiece& match, | |
| 242 const StringPiece& no_match) | |
| 243 : prog_(prog), match_(match), no_match_(no_match) {} | |
| 244 | |
| 245 virtual void Run() { | |
| 246 for (int i = 0; i < 2; i++) { | |
| 247 bool matched, failed = false; | |
| 248 matched = prog_->SearchDFA(match_, NULL, | |
| 249 Prog::kUnanchored, Prog::kFirstMatch, | |
| 250 NULL, &failed, NULL); | |
| 251 CHECK(!failed); | |
| 252 CHECK(matched); | |
| 253 matched = prog_->SearchDFA(no_match_, NULL, | |
| 254 Prog::kUnanchored, Prog::kFirstMatch, | |
| 255 NULL, &failed, NULL); | |
| 256 CHECK(!failed); | |
| 257 CHECK(!matched); | |
| 258 } | |
| 259 } | |
| 260 | |
| 261 private: | |
| 262 Prog* prog_; | |
| 263 StringPiece match_; | |
| 264 StringPiece no_match_; | |
| 265 }; | |
| 266 | |
| 267 TEST(Multithreaded, SearchDFA) { | |
| 268 // Same as single-threaded test above. | |
| 269 const int n = 18; | |
| 270 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), | |
| 271 Regexp::LikePerl, NULL); | |
| 272 CHECK(re); | |
| 273 string no_match = DeBruijnString(n); | |
| 274 string match = no_match + "0"; | |
| 275 FLAGS_re2_dfa_bail_when_slow = false; | |
| 276 | |
| 277 // Check that single-threaded code works. | |
| 278 { | |
| 279 Prog* prog = re->CompileToProg(1<<n); | |
| 280 CHECK(prog); | |
| 281 SearchThread* t = new SearchThread(prog, match, no_match); | |
| 282 t->SetJoinable(true); | |
| 283 t->Start(); | |
| 284 t->Join(); | |
| 285 delete t; | |
| 286 delete prog; | |
| 287 } | |
| 288 | |
| 289 // Run the search simultaneously in a bunch of threads. | |
| 290 // Reuse same flags for Multithreaded.BuildDFA above. | |
| 291 for (int i = 0; i < FLAGS_repeat; i++) { | |
| 292 //LOG(INFO) << "Search " << i; | |
| 293 Prog* prog = re->CompileToProg(1<<n); | |
| 294 CHECK(prog); | |
| 295 | |
| 296 vector<SearchThread*> threads; | |
| 297 for (int j = 0; j < FLAGS_threads; j++) { | |
| 298 SearchThread *t = new SearchThread(prog, match, no_match); | |
| 299 t->SetJoinable(true); | |
| 300 threads.push_back(t); | |
| 301 } | |
| 302 for (int j = 0; j < FLAGS_threads; j++) | |
| 303 threads[j]->Start(); | |
| 304 for (int j = 0; j < FLAGS_threads; j++) { | |
| 305 threads[j]->Join(); | |
| 306 delete threads[j]; | |
| 307 } | |
| 308 delete prog; | |
| 309 } | |
| 310 re->Decref(); | |
| 311 } | |
| 312 | |
| 313 struct ReverseTest { | |
| 314 const char *regexp; | |
| 315 const char *text; | |
| 316 bool match; | |
| 317 }; | |
| 318 | |
| 319 // Test that reverse DFA handles anchored/unanchored correctly. | |
| 320 // It's in the DFA interface but not used by RE2. | |
| 321 ReverseTest reverse_tests[] = { | |
| 322 { "\\A(a|b)", "abc", true }, | |
| 323 { "(a|b)\\z", "cba", true }, | |
| 324 { "\\A(a|b)", "cba", false }, | |
| 325 { "(a|b)\\z", "abc", false }, | |
| 326 }; | |
| 327 | |
| 328 TEST(DFA, ReverseMatch) { | |
| 329 int nfail = 0; | |
| 330 for (int i = 0; i < arraysize(reverse_tests); i++) { | |
| 331 const ReverseTest& t = reverse_tests[i]; | |
| 332 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); | |
| 333 CHECK(re); | |
| 334 Prog *prog = re->CompileToReverseProg(0); | |
| 335 CHECK(prog); | |
| 336 bool failed = false; | |
| 337 bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirst
Match, NULL, &failed, NULL); | |
| 338 if (matched != t.match) { | |
| 339 LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match; | |
| 340 nfail++; | |
| 341 } | |
| 342 delete prog; | |
| 343 re->Decref(); | |
| 344 } | |
| 345 EXPECT_EQ(nfail, 0); | |
| 346 } | |
| 347 | |
| 348 } // namespace re2 | |
| OLD | NEW |