Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(654)

Side by Side Diff: third_party/re2/re2/simplify.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Rewrite POSIX and other features in re
6 // to use simple extended regular expression features.
7 // Also sort and simplify character classes.
8
9 #include "util/util.h"
10 #include "re2/regexp.h"
11 #include "re2/walker-inl.h"
12
13 namespace re2 {
14
15 // Parses the regexp src and then simplifies it and sets *dst to the
16 // string representation of the simplified form. Returns true on success.
17 // Returns false and sets *error (if error != NULL) on error.
18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
19 string* dst,
20 RegexpStatus* status) {
21 Regexp* re = Parse(src, flags, status);
22 if (re == NULL)
23 return false;
24 Regexp* sre = re->Simplify();
25 re->Decref();
26 if (sre == NULL) {
27 // Should not happen, since Simplify never fails.
28 LOG(ERROR) << "Simplify failed on " << src;
29 if (status) {
30 status->set_code(kRegexpInternalError);
31 status->set_error_arg(src);
32 }
33 return false;
34 }
35 *dst = sre->ToString();
36 sre->Decref();
37 return true;
38 }
39
40 // Assuming the simple_ flags on the children are accurate,
41 // is this Regexp* simple?
42 bool Regexp::ComputeSimple() {
43 Regexp** subs;
44 switch (op_) {
45 case kRegexpNoMatch:
46 case kRegexpEmptyMatch:
47 case kRegexpLiteral:
48 case kRegexpLiteralString:
49 case kRegexpBeginLine:
50 case kRegexpEndLine:
51 case kRegexpBeginText:
52 case kRegexpWordBoundary:
53 case kRegexpNoWordBoundary:
54 case kRegexpEndText:
55 case kRegexpAnyChar:
56 case kRegexpAnyByte:
57 case kRegexpHaveMatch:
58 return true;
59 case kRegexpConcat:
60 case kRegexpAlternate:
61 // These are simple as long as the subpieces are simple.
62 subs = sub();
63 for (int i = 0; i < nsub_; i++)
64 if (!subs[i]->simple())
65 return false;
66 return true;
67 case kRegexpCharClass:
68 // Simple as long as the char class is not empty, not full.
69 if (ccb_ != NULL)
70 return !ccb_->empty() && !ccb_->full();
71 return !cc_->empty() && !cc_->full();
72 case kRegexpCapture:
73 subs = sub();
74 return subs[0]->simple();
75 case kRegexpStar:
76 case kRegexpPlus:
77 case kRegexpQuest:
78 subs = sub();
79 if (!subs[0]->simple())
80 return false;
81 switch (subs[0]->op_) {
82 case kRegexpStar:
83 case kRegexpPlus:
84 case kRegexpQuest:
85 case kRegexpEmptyMatch:
86 case kRegexpNoMatch:
87 return false;
88 default:
89 break;
90 }
91 return true;
92 case kRegexpRepeat:
93 return false;
94 }
95 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
96 return false;
97 }
98
99 // Walker subclass used by Simplify.
100 // Coalesces runs of star/plus/quest/repeat of the same literal along with any
101 // occurrences of that literal into repeats of that literal. It also works for
102 // char classes, any char and any byte.
103 // PostVisit creates the coalesced result, which should then be simplified.
104 class CoalesceWalker : public Regexp::Walker<Regexp*> {
105 public:
106 CoalesceWalker() {}
107 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
108 Regexp** child_args, int nchild_args);
109 virtual Regexp* Copy(Regexp* re);
110 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
111
112 private:
113 // These functions are declared inside CoalesceWalker so that
114 // they can edit the private fields of the Regexps they construct.
115
116 // Returns true if r1 and r2 can be coalesced. In particular, ensures that
117 // the parse flags are consistent. (They will not be checked again later.)
118 static bool CanCoalesce(Regexp* r1, Regexp* r2);
119
120 // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
121 // will be empty match and the coalesced op. In other cases, where part of a
122 // literal string was removed to be coalesced, the array elements afterwards
123 // will be the coalesced op and the remainder of the literal string.
124 static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
125
126 DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
127 };
128
129 // Walker subclass used by Simplify.
130 // The simplify walk is purely post-recursive: given the simplified children,
131 // PostVisit creates the simplified result.
132 // The child_args are simplified Regexp*s.
133 class SimplifyWalker : public Regexp::Walker<Regexp*> {
134 public:
135 SimplifyWalker() {}
136 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
137 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
138 Regexp** child_args, int nchild_args);
139 virtual Regexp* Copy(Regexp* re);
140 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
141
142 private:
143 // These functions are declared inside SimplifyWalker so that
144 // they can edit the private fields of the Regexps they construct.
145
146 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
147 // Caller must Decref return value when done with it.
148 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
149
150 // Simplifies the expression re{min,max} in terms of *, +, and ?.
151 // Returns a new regexp. Does not edit re. Does not consume reference to re.
152 // Caller must Decref return value when done with it.
153 static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
154 Regexp::ParseFlags parse_flags);
155
156 // Simplifies a character class by expanding any named classes
157 // into rune ranges. Does not edit re. Does not consume ref to re.
158 // Caller must Decref return value when done with it.
159 static Regexp* SimplifyCharClass(Regexp* re);
160
161 DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);
162 };
163
164 // Simplifies a regular expression, returning a new regexp.
165 // The new regexp uses traditional Unix egrep features only,
166 // plus the Perl (?:) non-capturing parentheses.
167 // Otherwise, no POSIX or Perl additions. The new regexp
168 // captures exactly the same subexpressions (with the same indices)
169 // as the original.
170 // Does not edit current object.
171 // Caller must Decref() return value when done with it.
172
173 Regexp* Regexp::Simplify() {
174 CoalesceWalker cw;
175 Regexp* cre = cw.Walk(this, NULL);
176 if (cre == NULL)
177 return cre;
178 SimplifyWalker sw;
179 Regexp* sre = sw.Walk(cre, NULL);
180 cre->Decref();
181 return sre;
182 }
183
184 #define Simplify DontCallSimplify // Avoid accidental recursion
185
186 // Utility function for PostVisit implementations that compares re->sub() with
187 // child_args to determine whether any child_args changed. In the common case,
188 // where nothing changed, calls Decref() for all child_args and returns false,
189 // so PostVisit must return re->Incref(). Otherwise, returns true.
190 static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
191 for (int i = 0; i < re->nsub(); i++) {
192 Regexp* sub = re->sub()[i];
193 Regexp* newsub = child_args[i];
194 if (newsub != sub)
195 return true;
196 }
197 for (int i = 0; i < re->nsub(); i++) {
198 Regexp* newsub = child_args[i];
199 newsub->Decref();
200 }
201 return false;
202 }
203
204 Regexp* CoalesceWalker::Copy(Regexp* re) {
205 return re->Incref();
206 }
207
208 Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
209 // This should never be called, since we use Walk and not
210 // WalkExponential.
211 LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
212 return re->Incref();
213 }
214
215 Regexp* CoalesceWalker::PostVisit(Regexp* re,
216 Regexp* parent_arg,
217 Regexp* pre_arg,
218 Regexp** child_args,
219 int nchild_args) {
220 if (re->nsub() == 0)
221 return re->Incref();
222
223 if (re->op() != kRegexpConcat) {
224 if (!ChildArgsChanged(re, child_args))
225 return re->Incref();
226
227 // Something changed. Build a new op.
228 Regexp* nre = new Regexp(re->op(), re->parse_flags());
229 nre->AllocSub(re->nsub());
230 Regexp** nre_subs = nre->sub();
231 for (int i = 0; i < re->nsub(); i++)
232 nre_subs[i] = child_args[i];
233 // Repeats and Captures have additional data that must be copied.
234 if (re->op() == kRegexpRepeat) {
235 nre->min_ = re->min();
236 nre->max_ = re->max();
237 } else if (re->op() == kRegexpCapture) {
238 nre->cap_ = re->cap();
239 }
240 return nre;
241 }
242
243 bool can_coalesce = false;
244 for (int i = 0; i < re->nsub(); i++) {
245 if (i+1 < re->nsub() &&
246 CanCoalesce(child_args[i], child_args[i+1])) {
247 can_coalesce = true;
248 break;
249 }
250 }
251 if (!can_coalesce) {
252 if (!ChildArgsChanged(re, child_args))
253 return re->Incref();
254
255 // Something changed. Build a new op.
256 Regexp* nre = new Regexp(re->op(), re->parse_flags());
257 nre->AllocSub(re->nsub());
258 Regexp** nre_subs = nre->sub();
259 for (int i = 0; i < re->nsub(); i++)
260 nre_subs[i] = child_args[i];
261 return nre;
262 }
263
264 for (int i = 0; i < re->nsub(); i++) {
265 if (i+1 < re->nsub() &&
266 CanCoalesce(child_args[i], child_args[i+1]))
267 DoCoalesce(&child_args[i], &child_args[i+1]);
268 }
269 // Determine how many empty matches were left by DoCoalesce.
270 int n = 0;
271 for (int i = n; i < re->nsub(); i++) {
272 if (child_args[i]->op() == kRegexpEmptyMatch)
273 n++;
274 }
275 // Build a new op.
276 Regexp* nre = new Regexp(re->op(), re->parse_flags());
277 nre->AllocSub(re->nsub() - n);
278 Regexp** nre_subs = nre->sub();
279 for (int i = 0, j = 0; i < re->nsub(); i++) {
280 if (child_args[i]->op() == kRegexpEmptyMatch) {
281 child_args[i]->Decref();
282 continue;
283 }
284 nre_subs[j] = child_args[i];
285 j++;
286 }
287 return nre;
288 }
289
290 bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
291 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
292 // any byte.
293 if ((r1->op() == kRegexpStar ||
294 r1->op() == kRegexpPlus ||
295 r1->op() == kRegexpQuest ||
296 r1->op() == kRegexpRepeat) &&
297 (r1->sub()[0]->op() == kRegexpLiteral ||
298 r1->sub()[0]->op() == kRegexpCharClass ||
299 r1->sub()[0]->op() == kRegexpAnyChar ||
300 r1->sub()[0]->op() == kRegexpAnyByte)) {
301 // r2 must be a star/plus/quest/repeat of the same literal, char class,
302 // any char or any byte.
303 if ((r2->op() == kRegexpStar ||
304 r2->op() == kRegexpPlus ||
305 r2->op() == kRegexpQuest ||
306 r2->op() == kRegexpRepeat) &&
307 Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
308 // The parse flags must be consistent.
309 ((r1->parse_flags() & Regexp::NonGreedy) ==
310 (r2->parse_flags() & Regexp::NonGreedy))) {
311 return true;
312 }
313 // ... OR an occurrence of that literal, char class, any char or any byte
314 if (Regexp::Equal(r1->sub()[0], r2)) {
315 return true;
316 }
317 // ... OR a literal string that begins with that literal.
318 if (r1->sub()[0]->op() == kRegexpLiteral &&
319 r2->op() == kRegexpLiteralString &&
320 r2->runes()[0] == r1->sub()[0]->rune() &&
321 // The parse flags must be consistent.
322 ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
323 (r2->parse_flags() & Regexp::FoldCase))) {
324 return true;
325 }
326 }
327 return false;
328 }
329
330 void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
331 Regexp* r1 = *r1ptr;
332 Regexp* r2 = *r2ptr;
333
334 Regexp* nre = Regexp::Repeat(
335 r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
336
337 switch (r1->op()) {
338 case kRegexpStar:
339 nre->min_ = 0;
340 nre->max_ = -1;
341 break;
342
343 case kRegexpPlus:
344 nre->min_ = 1;
345 nre->max_ = -1;
346 break;
347
348 case kRegexpQuest:
349 nre->min_ = 0;
350 nre->max_ = 1;
351 break;
352
353 case kRegexpRepeat:
354 nre->min_ = r1->min();
355 nre->max_ = r1->max();
356 break;
357
358 default:
359 LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
360 nre->Decref();
361 return;
362 }
363
364 switch (r2->op()) {
365 case kRegexpStar:
366 nre->max_ = -1;
367 goto LeaveEmpty;
368
369 case kRegexpPlus:
370 nre->min_++;
371 nre->max_ = -1;
372 goto LeaveEmpty;
373
374 case kRegexpQuest:
375 if (nre->max() != -1)
376 nre->max_++;
377 goto LeaveEmpty;
378
379 case kRegexpRepeat:
380 nre->min_ += r2->min();
381 if (r2->max() == -1)
382 nre->max_ = -1;
383 else if (nre->max() != -1)
384 nre->max_ += r2->max();
385 goto LeaveEmpty;
386
387 case kRegexpLiteral:
388 case kRegexpCharClass:
389 case kRegexpAnyChar:
390 case kRegexpAnyByte:
391 nre->min_++;
392 if (nre->max() != -1)
393 nre->max_++;
394 goto LeaveEmpty;
395
396 LeaveEmpty:
397 *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
398 *r2ptr = nre;
399 break;
400
401 case kRegexpLiteralString: {
402 Rune r = r1->sub()[0]->rune();
403 // Determine how much of the literal string is removed.
404 // We know that we have at least one rune. :)
405 int n = 1;
406 while (n < r2->nrunes() && r2->runes()[n] == r)
407 n++;
408 nre->min_ += n;
409 if (nre->max() != -1)
410 nre->max_ += n;
411 if (n == r2->nrunes())
412 goto LeaveEmpty;
413 *r1ptr = nre;
414 *r2ptr = Regexp::LiteralString(
415 &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
416 break;
417 }
418
419 default:
420 LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
421 nre->Decref();
422 return;
423 }
424
425 r1->Decref();
426 r2->Decref();
427 }
428
429 Regexp* SimplifyWalker::Copy(Regexp* re) {
430 return re->Incref();
431 }
432
433 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
434 // This should never be called, since we use Walk and not
435 // WalkExponential.
436 LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
437 return re->Incref();
438 }
439
440 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
441 if (re->simple()) {
442 *stop = true;
443 return re->Incref();
444 }
445 return NULL;
446 }
447
448 Regexp* SimplifyWalker::PostVisit(Regexp* re,
449 Regexp* parent_arg,
450 Regexp* pre_arg,
451 Regexp** child_args,
452 int nchild_args) {
453 switch (re->op()) {
454 case kRegexpNoMatch:
455 case kRegexpEmptyMatch:
456 case kRegexpLiteral:
457 case kRegexpLiteralString:
458 case kRegexpBeginLine:
459 case kRegexpEndLine:
460 case kRegexpBeginText:
461 case kRegexpWordBoundary:
462 case kRegexpNoWordBoundary:
463 case kRegexpEndText:
464 case kRegexpAnyChar:
465 case kRegexpAnyByte:
466 case kRegexpHaveMatch:
467 // All these are always simple.
468 re->simple_ = true;
469 return re->Incref();
470
471 case kRegexpConcat:
472 case kRegexpAlternate: {
473 // These are simple as long as the subpieces are simple.
474 if (!ChildArgsChanged(re, child_args)) {
475 re->simple_ = true;
476 return re->Incref();
477 }
478 Regexp* nre = new Regexp(re->op(), re->parse_flags());
479 nre->AllocSub(re->nsub());
480 Regexp** nre_subs = nre->sub();
481 for (int i = 0; i < re->nsub(); i++)
482 nre_subs[i] = child_args[i];
483 nre->simple_ = true;
484 return nre;
485 }
486
487 case kRegexpCapture: {
488 Regexp* newsub = child_args[0];
489 if (newsub == re->sub()[0]) {
490 newsub->Decref();
491 re->simple_ = true;
492 return re->Incref();
493 }
494 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
495 nre->AllocSub(1);
496 nre->sub()[0] = newsub;
497 nre->cap_ = re->cap();
498 nre->simple_ = true;
499 return nre;
500 }
501
502 case kRegexpStar:
503 case kRegexpPlus:
504 case kRegexpQuest: {
505 Regexp* newsub = child_args[0];
506 // Special case: repeat the empty string as much as
507 // you want, but it's still the empty string.
508 if (newsub->op() == kRegexpEmptyMatch)
509 return newsub;
510
511 // These are simple as long as the subpiece is simple.
512 if (newsub == re->sub()[0]) {
513 newsub->Decref();
514 re->simple_ = true;
515 return re->Incref();
516 }
517
518 // These are also idempotent if flags are constant.
519 if (re->op() == newsub->op() &&
520 re->parse_flags() == newsub->parse_flags())
521 return newsub;
522
523 Regexp* nre = new Regexp(re->op(), re->parse_flags());
524 nre->AllocSub(1);
525 nre->sub()[0] = newsub;
526 nre->simple_ = true;
527 return nre;
528 }
529
530 case kRegexpRepeat: {
531 Regexp* newsub = child_args[0];
532 // Special case: repeat the empty string as much as
533 // you want, but it's still the empty string.
534 if (newsub->op() == kRegexpEmptyMatch)
535 return newsub;
536
537 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
538 re->parse_flags());
539 newsub->Decref();
540 nre->simple_ = true;
541 return nre;
542 }
543
544 case kRegexpCharClass: {
545 Regexp* nre = SimplifyCharClass(re);
546 nre->simple_ = true;
547 return nre;
548 }
549 }
550
551 LOG(ERROR) << "Simplify case not handled: " << re->op();
552 return re->Incref();
553 }
554
555 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
556 // Returns a new Regexp, handing the ref to the caller.
557 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
558 Regexp::ParseFlags parse_flags) {
559 Regexp* re = new Regexp(kRegexpConcat, parse_flags);
560 re->AllocSub(2);
561 Regexp** subs = re->sub();
562 subs[0] = re1;
563 subs[1] = re2;
564 return re;
565 }
566
567 // Simplifies the expression re{min,max} in terms of *, +, and ?.
568 // Returns a new regexp. Does not edit re. Does not consume reference to re.
569 // Caller must Decref return value when done with it.
570 // The result will *not* necessarily have the right capturing parens
571 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
572 // but in the Regexp* representation, both (x) are marked as $1.
573 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
574 Regexp::ParseFlags f) {
575 // x{n,} means at least n matches of x.
576 if (max == -1) {
577 // Special case: x{0,} is x*
578 if (min == 0)
579 return Regexp::Star(re->Incref(), f);
580
581 // Special case: x{1,} is x+
582 if (min == 1)
583 return Regexp::Plus(re->Incref(), f);
584
585 // General case: x{4,} is xxxx+
586 Regexp* nre = new Regexp(kRegexpConcat, f);
587 nre->AllocSub(min);
588 Regexp** nre_subs = nre->sub();
589 for (int i = 0; i < min-1; i++)
590 nre_subs[i] = re->Incref();
591 nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
592 return nre;
593 }
594
595 // Special case: (x){0} matches only empty string.
596 if (min == 0 && max == 0)
597 return new Regexp(kRegexpEmptyMatch, f);
598
599 // Special case: x{1} is just x.
600 if (min == 1 && max == 1)
601 return re->Incref();
602
603 // General case: x{n,m} means n copies of x and m copies of x?.
604 // The machine will do less work if we nest the final m copies,
605 // so that x{2,5} = xx(x(x(x)?)?)?
606
607 // Build leading prefix: xx. Capturing only on the last one.
608 Regexp* nre = NULL;
609 if (min > 0) {
610 nre = new Regexp(kRegexpConcat, f);
611 nre->AllocSub(min);
612 Regexp** nre_subs = nre->sub();
613 for (int i = 0; i < min; i++)
614 nre_subs[i] = re->Incref();
615 }
616
617 // Build and attach suffix: (x(x(x)?)?)?
618 if (max > min) {
619 Regexp* suf = Regexp::Quest(re->Incref(), f);
620 for (int i = min+1; i < max; i++)
621 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
622 if (nre == NULL)
623 nre = suf;
624 else
625 nre = Concat2(nre, suf, f);
626 }
627
628 if (nre == NULL) {
629 // Some degenerate case, like min > max, or min < max < 0.
630 // This shouldn't happen, because the parser rejects such regexps.
631 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
632 return new Regexp(kRegexpNoMatch, f);
633 }
634
635 return nre;
636 }
637
638 // Simplifies a character class.
639 // Caller must Decref return value when done with it.
640 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
641 CharClass* cc = re->cc();
642
643 // Special cases
644 if (cc->empty())
645 return new Regexp(kRegexpNoMatch, re->parse_flags());
646 if (cc->full())
647 return new Regexp(kRegexpAnyChar, re->parse_flags());
648
649 return re->Incref();
650 }
651
652 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698