Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: third_party/re2/re2/testing/tester.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/tostring.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression engine tester -- test all the implementations against each other.
6
7 #include "util/util.h"
8 #include "util/flags.h"
9 #include "re2/testing/tester.h"
10 #include "re2/prog.h"
11 #include "re2/re2.h"
12 #include "re2/regexp.h"
13
14 DEFINE_bool(dump_prog, false, "dump regexp program");
15 DEFINE_bool(log_okay, false, "log successful runs");
16 DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
17
18 DEFINE_int32(max_regexp_failures, 100,
19 "maximum number of regexp test failures (-1 = unlimited)");
20
21 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
22
23 namespace re2 {
24
25 enum {
26 kMaxSubmatch = 1+16, // $0...$16
27 };
28
29 const char* engine_types[kEngineMax] = {
30 "Backtrack",
31 "NFA",
32 "DFA",
33 "DFA1",
34 "OnePass",
35 "BitState",
36 "RE2",
37 "RE2a",
38 "RE2b",
39 "PCRE",
40 };
41
42 // Returns the name string for the type t.
43 static string EngineString(Engine t) {
44 if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
45 return StringPrintf("type%d", static_cast<int>(t));
46 }
47 return engine_types[t];
48 }
49
50 // Returns bit mask of engines to use.
51 static uint32 Engines() {
52 static uint32 cached_engines;
53 static bool did_parse;
54
55 if (did_parse)
56 return cached_engines;
57
58 if (FLAGS_regexp_engines.empty()) {
59 cached_engines = ~0;
60 } else {
61 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
62 if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
63 cached_engines |= 1<<i;
64 }
65
66 if (cached_engines == 0)
67 LOG(INFO) << "Warning: no engines enabled.";
68 if (!UsingPCRE)
69 cached_engines &= ~(1<<kEnginePCRE);
70 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
71 if (cached_engines & (1<<i))
72 LOG(INFO) << EngineString(i) << " enabled";
73 }
74 did_parse = true;
75 return cached_engines;
76 }
77
78 // The result of running a match.
79 struct TestInstance::Result {
80 bool skipped; // test skipped: wasn't applicable
81 bool matched; // found a match
82 bool untrusted; // don't really trust the answer
83 bool have_submatch; // computed all submatch info
84 bool have_submatch0; // computed just submatch[0]
85 StringPiece submatch[kMaxSubmatch];
86 };
87
88 typedef TestInstance::Result Result;
89
90 // Formats a single capture range s in text in the form (a,b)
91 // where a and b are the starting and ending offsets of s in text.
92 static string FormatCapture(const StringPiece& text, const StringPiece& s) {
93 if (s.begin() == NULL)
94 return "(?,?)";
95 return StringPrintf("(%d,%d)",
96 static_cast<int>(s.begin() - text.begin()),
97 static_cast<int>(s.end() - text.begin()));
98 }
99
100 // Returns whether text contains non-ASCII (>= 0x80) bytes.
101 static bool NonASCII(const StringPiece& text) {
102 for (int i = 0; i < text.size(); i++)
103 if ((uint8)text[i] >= 0x80)
104 return true;
105 return false;
106 }
107
108 // Returns string representation of match kind.
109 static string FormatKind(Prog::MatchKind kind) {
110 switch (kind) {
111 case Prog::kFullMatch:
112 return "full match";
113 case Prog::kLongestMatch:
114 return "longest match";
115 case Prog::kFirstMatch:
116 return "first match";
117 case Prog::kManyMatch:
118 return "many match";
119 }
120 return "???";
121 }
122
123 // Returns string representation of anchor kind.
124 static string FormatAnchor(Prog::Anchor anchor) {
125 switch (anchor) {
126 case Prog::kAnchored:
127 return "anchored";
128 case Prog::kUnanchored:
129 return "unanchored";
130 }
131 return "???";
132 }
133
134 struct ParseMode {
135 Regexp::ParseFlags parse_flags;
136 string desc;
137 };
138
139 static const Regexp::ParseFlags single_line =
140 Regexp::LikePerl;
141 static const Regexp::ParseFlags multi_line =
142 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
143
144 static ParseMode parse_modes[] = {
145 { single_line, "single-line" },
146 { single_line|Regexp::Latin1, "single-line, latin1" },
147 { multi_line, "multiline" },
148 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
149 { multi_line|Regexp::Latin1, "multiline, latin1" },
150 };
151
152 static string FormatMode(Regexp::ParseFlags flags) {
153 for (int i = 0; i < arraysize(parse_modes); i++)
154 if (parse_modes[i].parse_flags == flags)
155 return parse_modes[i].desc;
156 return StringPrintf("%#x", static_cast<uint>(flags));
157 }
158
159 // Constructs and saves all the matching engines that
160 // will be required for the given tests.
161 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
162 Regexp::ParseFlags flags)
163 : regexp_str_(regexp_str),
164 kind_(kind),
165 flags_(flags),
166 error_(false),
167 regexp_(NULL),
168 num_captures_(0),
169 prog_(NULL),
170 rprog_(NULL),
171 re_(NULL),
172 re2_(NULL) {
173
174 VLOG(1) << CEscape(regexp_str);
175
176 // Compile regexp to prog.
177 // Always required - needed for backtracking (reference implementation).
178 RegexpStatus status;
179 regexp_ = Regexp::Parse(regexp_str, flags, &status);
180 if (regexp_ == NULL) {
181 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
182 << " mode: " << FormatMode(flags);
183 error_ = true;
184 return;
185 }
186 num_captures_ = regexp_->NumCaptures();
187 prog_ = regexp_->CompileToProg(0);
188 if (prog_ == NULL) {
189 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
190 error_ = true;
191 return;
192 }
193 if (FLAGS_dump_prog) {
194 LOG(INFO) << "Prog for "
195 << " regexp "
196 << CEscape(regexp_str_)
197 << " (" << FormatKind(kind_)
198 << ", " << FormatMode(flags_)
199 << ")\n"
200 << prog_->Dump();
201 }
202
203 // Compile regexp to reversed prog. Only needed for DFA engines.
204 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
205 rprog_ = regexp_->CompileToReverseProg(0);
206 if (rprog_ == NULL) {
207 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
208 error_ = true;
209 return;
210 }
211 if (FLAGS_dump_rprog)
212 LOG(INFO) << rprog_->Dump();
213 }
214
215 // Create re string that will be used for RE and RE2.
216 string re = regexp_str.as_string();
217 // Accomodate flags.
218 // Regexp::Latin1 will be accomodated below.
219 if (!(flags & Regexp::OneLine))
220 re = "(?m)" + re;
221 if (flags & Regexp::NonGreedy)
222 re = "(?U)" + re;
223 if (flags & Regexp::DotNL)
224 re = "(?s)" + re;
225
226 // Compile regexp to RE2.
227 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
228 RE2::Options options;
229 if (flags & Regexp::Latin1)
230 options.set_encoding(RE2::Options::EncodingLatin1);
231 if (kind_ == Prog::kLongestMatch)
232 options.set_longest_match(true);
233 re2_ = new RE2(re, options);
234 if (!re2_->error().empty()) {
235 LOG(INFO) << "Cannot RE2: " << CEscape(re);
236 error_ = true;
237 return;
238 }
239 }
240
241 // Compile regexp to RE.
242 // PCRE as exposed by the RE interface isn't always usable.
243 // 1. It disagrees about handling of empty-string reptitions
244 // like matching (a*)* against "b". PCRE treats the (a*) as
245 // occurring once, while we treat it as occurring not at all.
246 // 2. It treats $ as this weird thing meaning end of string
247 // or before the \n at the end of the string.
248 // 3. It doesn't implement POSIX leftmost-longest matching.
249 // 4. It lets \s match vertical tab.
250 // MimicsPCRE() detects 1 and 2.
251 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
252 kind_ != Prog::kLongestMatch) {
253 PCRE_Options o;
254 o.set_option(PCRE::UTF8);
255 if (flags & Regexp::Latin1)
256 o.set_option(PCRE::None);
257 // PCRE has interface bug keeping us from finding $0, so
258 // add one more layer of parens.
259 re_ = new PCRE("("+re+")", o);
260 if (!re_->error().empty()) {
261 LOG(INFO) << "Cannot PCRE: " << CEscape(re);
262 error_ = true;
263 return;
264 }
265 }
266 }
267
268 TestInstance::~TestInstance() {
269 if (regexp_)
270 regexp_->Decref();
271 delete prog_;
272 delete rprog_;
273 delete re_;
274 delete re2_;
275 }
276
277 // Runs a single search using the named engine type.
278 // This interface hides all the irregularities of the various
279 // engine interfaces from the rest of this file.
280 void TestInstance::RunSearch(Engine type,
281 const StringPiece& orig_text,
282 const StringPiece& orig_context,
283 Prog::Anchor anchor,
284 Result *result) {
285 memset(result, 0, sizeof *result);
286 if (regexp_ == NULL) {
287 result->skipped = true;
288 return;
289 }
290 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
291 if (nsubmatch > kMaxSubmatch)
292 nsubmatch = kMaxSubmatch;
293
294 StringPiece text = orig_text;
295 StringPiece context = orig_context;
296
297 switch (type) {
298 default:
299 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
300
301 case kEngineBacktrack:
302 if (prog_ == NULL) {
303 result->skipped = true;
304 break;
305 }
306 result->matched =
307 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
308 result->submatch, nsubmatch);
309 result->have_submatch = true;
310 break;
311
312 case kEngineNFA:
313 if (prog_ == NULL) {
314 result->skipped = true;
315 break;
316 }
317 result->matched =
318 prog_->SearchNFA(text, context, anchor, kind_,
319 result->submatch, nsubmatch);
320 result->have_submatch = true;
321 break;
322
323 case kEngineDFA:
324 if (prog_ == NULL) {
325 result->skipped = true;
326 break;
327 }
328 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
329 &result->skipped, NULL);
330 break;
331
332 case kEngineDFA1:
333 if (prog_ == NULL || rprog_ == NULL) {
334 result->skipped = true;
335 break;
336 }
337 result->matched =
338 prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
339 &result->skipped, NULL);
340 // If anchored, no need for second run,
341 // but do it anyway to find more bugs.
342 if (result->matched) {
343 if (!rprog_->SearchDFA(result->submatch[0], context,
344 Prog::kAnchored, Prog::kLongestMatch,
345 result->submatch,
346 &result->skipped, NULL)) {
347 LOG(ERROR) << "Reverse DFA inconsistency: "
348 << CEscape(regexp_str_)
349 << " on " << CEscape(text);
350 result->matched = false;
351 }
352 }
353 result->have_submatch0 = true;
354 break;
355
356 case kEngineOnePass:
357 if (prog_ == NULL ||
358 anchor == Prog::kUnanchored ||
359 !prog_->IsOnePass() ||
360 nsubmatch > Prog::kMaxOnePassCapture) {
361 result->skipped = true;
362 break;
363 }
364 result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
365 result->submatch, nsubmatch);
366 result->have_submatch = true;
367 break;
368
369 case kEngineBitState:
370 if (prog_ == NULL) {
371 result->skipped = true;
372 break;
373 }
374 result->matched = prog_->SearchBitState(text, context, anchor, kind_,
375 result->submatch, nsubmatch);
376 result->have_submatch = true;
377 break;
378
379 case kEngineRE2:
380 case kEngineRE2a:
381 case kEngineRE2b: {
382 if (!re2_ || text.end() != context.end()) {
383 result->skipped = true;
384 break;
385 }
386
387 RE2::Anchor re_anchor;
388 if (anchor == Prog::kAnchored)
389 re_anchor = RE2::ANCHOR_START;
390 else
391 re_anchor = RE2::UNANCHORED;
392 if (kind_ == Prog::kFullMatch)
393 re_anchor = RE2::ANCHOR_BOTH;
394
395 result->matched = re2_->Match(
396 context,
397 static_cast<int>(text.begin() - context.begin()),
398 static_cast<int>(text.end() - context.begin()),
399 re_anchor,
400 result->submatch,
401 nsubmatch);
402 result->have_submatch = nsubmatch > 0;
403 break;
404 }
405
406 case kEnginePCRE: {
407 if (!re_ || text.begin() != context.begin() ||
408 text.end() != context.end()) {
409 result->skipped = true;
410 break;
411 }
412
413 // PCRE 8.34 or so started allowing vertical tab to match \s,
414 // following a change made in Perl 5.18. RE2 does not.
415 if ((regexp_str_.contains("\\s") || regexp_str_.contains("\\S")) &&
416 text.contains("\v")) {
417 result->skipped = true;
418 break;
419 }
420
421 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
422 PCRE::Arg *a = new PCRE::Arg[nsubmatch];
423 for (int i = 0; i < nsubmatch; i++) {
424 a[i] = PCRE::Arg(&result->submatch[i]);
425 argptr[i] = &a[i];
426 }
427 int consumed;
428 PCRE::Anchor pcre_anchor;
429 if (anchor == Prog::kAnchored)
430 pcre_anchor = PCRE::ANCHOR_START;
431 else
432 pcre_anchor = PCRE::UNANCHORED;
433 if (kind_ == Prog::kFullMatch)
434 pcre_anchor = PCRE::ANCHOR_BOTH;
435 re_->ClearHitLimit();
436 result->matched =
437 re_->DoMatch(text,
438 pcre_anchor,
439 &consumed,
440 argptr, nsubmatch);
441 if (re_->HitLimit()) {
442 result->untrusted = true;
443 delete[] argptr;
444 delete[] a;
445 break;
446 }
447 result->have_submatch = true;
448
449 // Work around RE interface bug: PCRE returns -1 as the
450 // offsets for an unmatched subexpression, and RE should
451 // turn that into StringPiece(NULL) but in fact it uses
452 // StringPiece(text.begin() - 1, 0). Oops.
453 for (int i = 0; i < nsubmatch; i++)
454 if (result->submatch[i].begin() == text.begin() - 1)
455 result->submatch[i] = NULL;
456 delete[] argptr;
457 delete[] a;
458 break;
459 }
460 }
461
462 if (!result->matched)
463 memset(result->submatch, 0, sizeof result->submatch);
464 }
465
466 // Checks whether r is okay given that correct is the right answer.
467 // Specifically, r's answers have to match (but it doesn't have to
468 // claim to have all the answers).
469 static bool ResultOkay(const Result& r, const Result& correct) {
470 if (r.skipped)
471 return true;
472 if (r.matched != correct.matched)
473 return false;
474 if (r.have_submatch || r.have_submatch0) {
475 for (int i = 0; i < kMaxSubmatch; i++) {
476 if (correct.submatch[i].begin() != r.submatch[i].begin() ||
477 correct.submatch[i].size() != r.submatch[i].size())
478 return false;
479 if (!r.have_submatch)
480 break;
481 }
482 }
483 return true;
484 }
485
486 // Runs a single test.
487 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
488 Prog::Anchor anchor) {
489 // Backtracking is the gold standard.
490 Result correct;
491 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
492 if (correct.skipped) {
493 if (regexp_ == NULL)
494 return true;
495 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
496 << " " << FormatMode(flags_);
497 return false;
498 }
499 VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
500 << " text " << CEscape(text)
501 << " (" << FormatKind(kind_)
502 << ", " << FormatAnchor(anchor)
503 << ", " << FormatMode(flags_)
504 << ")";
505
506 // Compare the others.
507 bool all_okay = true;
508 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
509 if (!(Engines() & (1<<i)))
510 continue;
511
512 Result r;
513 RunSearch(i, text, context, anchor, &r);
514 if (ResultOkay(r, correct)) {
515 if (FLAGS_log_okay)
516 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
517 continue;
518 }
519
520 // We disagree with PCRE on the meaning of some Unicode matches.
521 // In particular, we treat non-ASCII UTF-8 as non-word characters.
522 // We also treat "empty" character sets like [^\w\W] as being
523 // impossible to match, while PCRE apparently excludes some code
524 // points (e.g., 0x0080) from both \w and \W.
525 if (i == kEnginePCRE && NonASCII(text))
526 continue;
527
528 if (!r.untrusted)
529 all_okay = false;
530
531 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
532 context, anchor);
533 if (r.matched != correct.matched) {
534 if (r.matched) {
535 LOG(INFO) << " Should not match (but does).";
536 } else {
537 LOG(INFO) << " Should match (but does not).";
538 continue;
539 }
540 }
541 for (int i = 0; i < 1+num_captures_; i++) {
542 if (r.submatch[i].begin() != correct.submatch[i].begin() ||
543 r.submatch[i].end() != correct.submatch[i].end()) {
544 LOG(INFO) <<
545 StringPrintf(" $%d: should be %s is %s",
546 i,
547 FormatCapture(text, correct.submatch[i]).c_str(),
548 FormatCapture(text, r.submatch[i]).c_str());
549 } else {
550 LOG(INFO) <<
551 StringPrintf(" $%d: %s ok", i,
552 FormatCapture(text, r.submatch[i]).c_str());
553 }
554 }
555 }
556
557 if (!all_okay) {
558 if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
559 LOG(QFATAL) << "Too many regexp failures.";
560 }
561
562 return all_okay;
563 }
564
565 void TestInstance::LogMatch(const char* prefix, Engine e,
566 const StringPiece& text, const StringPiece& context,
567 Prog::Anchor anchor) {
568 LOG(INFO) << prefix
569 << EngineString(e)
570 << " regexp "
571 << CEscape(regexp_str_)
572 << " "
573 << CEscape(regexp_->ToString())
574 << " text "
575 << CEscape(text)
576 << " ("
577 << text.begin() - context.begin()
578 << ","
579 << text.end() - context.begin()
580 << ") of context "
581 << CEscape(context)
582 << " (" << FormatKind(kind_)
583 << ", " << FormatAnchor(anchor)
584 << ", " << FormatMode(flags_)
585 << ")";
586 }
587
588 static Prog::MatchKind kinds[] = {
589 Prog::kFirstMatch,
590 Prog::kLongestMatch,
591 Prog::kFullMatch,
592 };
593
594 // Test all possible match kinds and parse modes.
595 Tester::Tester(const StringPiece& regexp) {
596 error_ = false;
597 for (int i = 0; i < arraysize(kinds); i++) {
598 for (int j = 0; j < arraysize(parse_modes); j++) {
599 TestInstance* t = new TestInstance(regexp, kinds[i],
600 parse_modes[j].parse_flags);
601 error_ |= t->error();
602 v_.push_back(t);
603 }
604 }
605 }
606
607 Tester::~Tester() {
608 for (size_t i = 0; i < v_.size(); i++)
609 delete v_[i];
610 }
611
612 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
613 Prog::Anchor anchor) {
614 bool okay = true;
615 for (size_t i = 0; i < v_.size(); i++)
616 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
617 return okay;
618 }
619
620 static Prog::Anchor anchors[] = {
621 Prog::kAnchored,
622 Prog::kUnanchored
623 };
624
625 bool Tester::TestInput(const StringPiece& text) {
626 bool okay = TestInputInContext(text, text);
627 if (text.size() > 0) {
628 StringPiece sp;
629 sp = text;
630 sp.remove_prefix(1);
631 okay &= TestInputInContext(sp, text);
632 sp = text;
633 sp.remove_suffix(1);
634 okay &= TestInputInContext(sp, text);
635 }
636 return okay;
637 }
638
639 bool Tester::TestInputInContext(const StringPiece& text,
640 const StringPiece& context) {
641 bool okay = true;
642 for (int i = 0; i < arraysize(anchors); i++)
643 okay &= TestCase(text, context, anchors[i]);
644 return okay;
645 }
646
647 bool TestRegexpOnText(const StringPiece& regexp,
648 const StringPiece& text) {
649 Tester t(regexp);
650 return t.TestInput(text);
651 }
652
653 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/tostring.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698