Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(393)

Side by Side Diff: third_party/re2/re2/prefilter.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/prefilter.h ('k') | third_party/re2/re2/prefilter_tree.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/util.h"
6 #include "re2/prefilter.h"
7 #include "re2/re2.h"
8 #include "re2/unicode_casefold.h"
9 #include "re2/walker-inl.h"
10
11 namespace re2 {
12
13 static const int Trace = false;
14
15 typedef set<string>::iterator SSIter;
16 typedef set<string>::const_iterator ConstSSIter;
17
18 GLOBAL_MUTEX(alloc_id_mutex);
19 static int alloc_id = 100000; // Used for debugging.
20 // Initializes a Prefilter, allocating subs_ as necessary.
21 Prefilter::Prefilter(Op op) {
22 op_ = op;
23 subs_ = NULL;
24 if (op_ == AND || op_ == OR)
25 subs_ = new vector<Prefilter*>;
26
27 GLOBAL_MUTEX_LOCK(alloc_id_mutex);
28 alloc_id_ = alloc_id++;
29 GLOBAL_MUTEX_UNLOCK(alloc_id_mutex);
30 VLOG(10) << "alloc_id: " << alloc_id_;
31 }
32
33 // Destroys a Prefilter.
34 Prefilter::~Prefilter() {
35 VLOG(10) << "Deleted: " << alloc_id_;
36 if (subs_) {
37 for (size_t i = 0; i < subs_->size(); i++)
38 delete (*subs_)[i];
39 delete subs_;
40 subs_ = NULL;
41 }
42 }
43
44 // Simplify if the node is an empty Or or And.
45 Prefilter* Prefilter::Simplify() {
46 if (op_ != AND && op_ != OR) {
47 return this;
48 }
49
50 // Nothing left in the AND/OR.
51 if (subs_->size() == 0) {
52 if (op_ == AND)
53 op_ = ALL; // AND of nothing is true
54 else
55 op_ = NONE; // OR of nothing is false
56
57 return this;
58 }
59
60 // Just one subnode: throw away wrapper.
61 if (subs_->size() == 1) {
62 Prefilter* a = (*subs_)[0];
63 subs_->clear();
64 delete this;
65 return a->Simplify();
66 }
67
68 return this;
69 }
70
71 // Combines two Prefilters together to create an "op" (AND or OR).
72 // The passed Prefilters will be part of the returned Prefilter or deleted.
73 // Does lots of work to avoid creating unnecessarily complicated structures.
74 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
75 // If a, b can be rewritten as op, do so.
76 a = a->Simplify();
77 b = b->Simplify();
78
79 // Canonicalize: a->op <= b->op.
80 if (a->op() > b->op()) {
81 Prefilter* t = a;
82 a = b;
83 b = t;
84 }
85
86 // Trivial cases.
87 // ALL AND b = b
88 // NONE OR b = b
89 // ALL OR b = ALL
90 // NONE AND b = NONE
91 // Don't need to look at b, because of canonicalization above.
92 // ALL and NONE are smallest opcodes.
93 if (a->op() == ALL || a->op() == NONE) {
94 if ((a->op() == ALL && op == AND) ||
95 (a->op() == NONE && op == OR)) {
96 delete a;
97 return b;
98 } else {
99 delete b;
100 return a;
101 }
102 }
103
104 // If a and b match op, merge their contents.
105 if (a->op() == op && b->op() == op) {
106 for (size_t i = 0; i < b->subs()->size(); i++) {
107 Prefilter* bb = (*b->subs())[i];
108 a->subs()->push_back(bb);
109 }
110 b->subs()->clear();
111 delete b;
112 return a;
113 }
114
115 // If a already has the same op as the op that is under construction
116 // add in b (similarly if b already has the same op, add in a).
117 if (b->op() == op) {
118 Prefilter* t = a;
119 a = b;
120 b = t;
121 }
122 if (a->op() == op) {
123 a->subs()->push_back(b);
124 return a;
125 }
126
127 // Otherwise just return the op.
128 Prefilter* c = new Prefilter(op);
129 c->subs()->push_back(a);
130 c->subs()->push_back(b);
131 return c;
132 }
133
134 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
135 return AndOr(AND, a, b);
136 }
137
138 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
139 return AndOr(OR, a, b);
140 }
141
142 static void SimplifyStringSet(set<string> *ss) {
143 // Now make sure that the strings aren't redundant. For example, if
144 // we know "ab" is a required string, then it doesn't help at all to
145 // know that "abc" is also a required string, so delete "abc". This
146 // is because, when we are performing a string search to filter
147 // regexps, matching ab will already allow this regexp to be a
148 // candidate for match, so further matching abc is redundant.
149
150 for (SSIter i = ss->begin(); i != ss->end(); ++i) {
151 SSIter j = i;
152 ++j;
153 while (j != ss->end()) {
154 // Increment j early so that we can erase the element it points to.
155 SSIter old_j = j;
156 ++j;
157 if (old_j->find(*i) != string::npos)
158 ss->erase(old_j);
159 }
160 }
161 }
162
163 Prefilter* Prefilter::OrStrings(set<string>* ss) {
164 SimplifyStringSet(ss);
165 Prefilter* or_prefilter = NULL;
166 if (!ss->empty()) {
167 or_prefilter = new Prefilter(NONE);
168 for (SSIter i = ss->begin(); i != ss->end(); ++i)
169 or_prefilter = Or(or_prefilter, FromString(*i));
170 }
171 return or_prefilter;
172 }
173
174 static Rune ToLowerRune(Rune r) {
175 if (r < Runeself) {
176 if ('A' <= r && r <= 'Z')
177 r += 'a' - 'A';
178 return r;
179 }
180
181 const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
182 if (f == NULL || r < f->lo)
183 return r;
184 return ApplyFold(f, r);
185 }
186
187 static Rune ToLowerRuneLatin1(Rune r) {
188 if ('A' <= r && r <= 'Z')
189 r += 'a' - 'A';
190 return r;
191 }
192
193 Prefilter* Prefilter::FromString(const string& str) {
194 Prefilter* m = new Prefilter(Prefilter::ATOM);
195 m->atom_ = str;
196 return m;
197 }
198
199 // Information about a regexp used during computation of Prefilter.
200 // Can be thought of as information about the set of strings matching
201 // the given regular expression.
202 class Prefilter::Info {
203 public:
204 Info();
205 ~Info();
206
207 // More constructors. They delete their Info* arguments.
208 static Info* Alt(Info* a, Info* b);
209 static Info* Concat(Info* a, Info* b);
210 static Info* And(Info* a, Info* b);
211 static Info* Star(Info* a);
212 static Info* Plus(Info* a);
213 static Info* Quest(Info* a);
214 static Info* EmptyString();
215 static Info* NoMatch();
216 static Info* AnyChar();
217 static Info* CClass(CharClass* cc, bool latin1);
218 static Info* Literal(Rune r);
219 static Info* LiteralLatin1(Rune r);
220 static Info* AnyMatch();
221
222 // Format Info as a string.
223 string ToString();
224
225 // Caller takes ownership of the Prefilter.
226 Prefilter* TakeMatch();
227
228 set<string>& exact() { return exact_; }
229
230 bool is_exact() const { return is_exact_; }
231
232 class Walker;
233
234 private:
235 set<string> exact_;
236
237 // When is_exact_ is true, the strings that match
238 // are placed in exact_. When it is no longer an exact
239 // set of strings that match this RE, then is_exact_
240 // is false and the match_ contains the required match
241 // criteria.
242 bool is_exact_;
243
244 // Accumulated Prefilter query that any
245 // match for this regexp is guaranteed to match.
246 Prefilter* match_;
247 };
248
249
250 Prefilter::Info::Info()
251 : is_exact_(false),
252 match_(NULL) {
253 }
254
255 Prefilter::Info::~Info() {
256 delete match_;
257 }
258
259 Prefilter* Prefilter::Info::TakeMatch() {
260 if (is_exact_) {
261 match_ = Prefilter::OrStrings(&exact_);
262 is_exact_ = false;
263 }
264 Prefilter* m = match_;
265 match_ = NULL;
266 return m;
267 }
268
269 // Format a Info in string form.
270 string Prefilter::Info::ToString() {
271 if (is_exact_) {
272 int n = 0;
273 string s;
274 for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
275 if (n++ > 0)
276 s += ",";
277 s += *i;
278 }
279 return s;
280 }
281
282 if (match_)
283 return match_->DebugString();
284
285 return "";
286 }
287
288 // Add the strings from src to dst.
289 static void CopyIn(const set<string>& src, set<string>* dst) {
290 for (ConstSSIter i = src.begin(); i != src.end(); ++i)
291 dst->insert(*i);
292 }
293
294 // Add the cross-product of a and b to dst.
295 // (For each string i in a and j in b, add i+j.)
296 static void CrossProduct(const set<string>& a,
297 const set<string>& b,
298 set<string>* dst) {
299 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
300 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
301 dst->insert(*i + *j);
302 }
303
304 // Concats a and b. Requires that both are exact sets.
305 // Forms an exact set that is a crossproduct of a and b.
306 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
307 if (a == NULL)
308 return b;
309 DCHECK(a->is_exact_);
310 DCHECK(b && b->is_exact_);
311 Info *ab = new Info();
312
313 CrossProduct(a->exact_, b->exact_, &ab->exact_);
314 ab->is_exact_ = true;
315
316 delete a;
317 delete b;
318 return ab;
319 }
320
321 // Constructs an inexact Info for ab given a and b.
322 // Used only when a or b is not exact or when the
323 // exact cross product is likely to be too big.
324 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
325 if (a == NULL)
326 return b;
327 if (b == NULL)
328 return a;
329
330 Info *ab = new Info();
331
332 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
333 ab->is_exact_ = false;
334 delete a;
335 delete b;
336 return ab;
337 }
338
339 // Constructs Info for a|b given a and b.
340 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
341 Info *ab = new Info();
342
343 if (a->is_exact_ && b->is_exact_) {
344 CopyIn(a->exact_, &ab->exact_);
345 CopyIn(b->exact_, &ab->exact_);
346 ab->is_exact_ = true;
347 } else {
348 // Either a or b has is_exact_ = false. If the other
349 // one has is_exact_ = true, we move it to match_ and
350 // then create a OR of a,b. The resulting Info has
351 // is_exact_ = false.
352 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
353 ab->is_exact_ = false;
354 }
355
356 delete a;
357 delete b;
358 return ab;
359 }
360
361 // Constructs Info for a? given a.
362 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
363 Info *ab = new Info();
364
365 ab->is_exact_ = false;
366 ab->match_ = new Prefilter(ALL);
367 delete a;
368 return ab;
369 }
370
371 // Constructs Info for a* given a.
372 // Same as a? -- not much to do.
373 Prefilter::Info* Prefilter::Info::Star(Info *a) {
374 return Quest(a);
375 }
376
377 // Constructs Info for a+ given a. If a was exact set, it isn't
378 // anymore.
379 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
380 Info *ab = new Info();
381
382 ab->match_ = a->TakeMatch();
383 ab->is_exact_ = false;
384
385 delete a;
386 return ab;
387 }
388
389 static string RuneToString(Rune r) {
390 char buf[UTFmax];
391 int n = runetochar(buf, &r);
392 return string(buf, n);
393 }
394
395 static string RuneToStringLatin1(Rune r) {
396 char c = r & 0xff;
397 return string(&c, 1);
398 }
399
400 // Constructs Info for literal rune.
401 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
402 Info* info = new Info();
403 info->exact_.insert(RuneToString(ToLowerRune(r)));
404 info->is_exact_ = true;
405 return info;
406 }
407
408 // Constructs Info for literal rune for Latin1 encoded string.
409 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
410 Info* info = new Info();
411 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
412 info->is_exact_ = true;
413 return info;
414 }
415
416 // Constructs Info for dot (any character).
417 Prefilter::Info* Prefilter::Info::AnyChar() {
418 Prefilter::Info* info = new Prefilter::Info();
419 info->match_ = new Prefilter(ALL);
420 return info;
421 }
422
423 // Constructs Prefilter::Info for no possible match.
424 Prefilter::Info* Prefilter::Info::NoMatch() {
425 Prefilter::Info* info = new Prefilter::Info();
426 info->match_ = new Prefilter(NONE);
427 return info;
428 }
429
430 // Constructs Prefilter::Info for any possible match.
431 // This Prefilter::Info is valid for any regular expression,
432 // since it makes no assertions whatsoever about the
433 // strings being matched.
434 Prefilter::Info* Prefilter::Info::AnyMatch() {
435 Prefilter::Info *info = new Prefilter::Info();
436 info->match_ = new Prefilter(ALL);
437 return info;
438 }
439
440 // Constructs Prefilter::Info for just the empty string.
441 Prefilter::Info* Prefilter::Info::EmptyString() {
442 Prefilter::Info* info = new Prefilter::Info();
443 info->is_exact_ = true;
444 info->exact_.insert("");
445 return info;
446 }
447
448 // Constructs Prefilter::Info for a character class.
449 typedef CharClass::iterator CCIter;
450 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
451 bool latin1) {
452 if (Trace) {
453 VLOG(0) << "CharClassInfo:";
454 for (CCIter i = cc->begin(); i != cc->end(); ++i)
455 VLOG(0) << " " << i->lo << "-" << i->hi;
456 }
457
458 // If the class is too large, it's okay to overestimate.
459 if (cc->size() > 10)
460 return AnyChar();
461
462 Prefilter::Info *a = new Prefilter::Info();
463 for (CCIter i = cc->begin(); i != cc->end(); ++i)
464 for (Rune r = i->lo; r <= i->hi; r++) {
465 if (latin1) {
466 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
467 } else {
468 a->exact_.insert(RuneToString(ToLowerRune(r)));
469 }
470 }
471
472
473 a->is_exact_ = true;
474
475 if (Trace) {
476 VLOG(0) << " = " << a->ToString();
477 }
478
479 return a;
480 }
481
482 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
483 public:
484 Walker(bool latin1) : latin1_(latin1) {}
485
486 virtual Info* PostVisit(
487 Regexp* re, Info* parent_arg,
488 Info* pre_arg,
489 Info** child_args, int nchild_args);
490
491 virtual Info* ShortVisit(
492 Regexp* re,
493 Info* parent_arg);
494
495 bool latin1() { return latin1_; }
496 private:
497 bool latin1_;
498 DISALLOW_COPY_AND_ASSIGN(Walker);
499 };
500
501 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
502 if (Trace) {
503 LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
504 }
505
506 bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
507 Prefilter::Info::Walker w(latin1);
508 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
509
510 if (w.stopped_early()) {
511 delete info;
512 return NULL;
513 }
514
515 return info;
516 }
517
518 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
519 Regexp* re, Prefilter::Info* parent_arg) {
520 return AnyMatch();
521 }
522
523 // Constructs the Prefilter::Info for the given regular expression.
524 // Assumes re is simplified.
525 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
526 Regexp* re, Prefilter::Info* parent_arg,
527 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
528 int nchild_args) {
529 Prefilter::Info *info;
530 switch (re->op()) {
531 default:
532 case kRegexpRepeat:
533 LOG(DFATAL) << "Bad regexp op " << re->op();
534 info = EmptyString();
535 break;
536
537 case kRegexpNoMatch:
538 info = NoMatch();
539 break;
540
541 // These ops match the empty string:
542 case kRegexpEmptyMatch: // anywhere
543 case kRegexpBeginLine: // at beginning of line
544 case kRegexpEndLine: // at end of line
545 case kRegexpBeginText: // at beginning of text
546 case kRegexpEndText: // at end of text
547 case kRegexpWordBoundary: // at word boundary
548 case kRegexpNoWordBoundary: // not at word boundary
549 info = EmptyString();
550 break;
551
552 case kRegexpLiteral:
553 if (latin1()) {
554 info = LiteralLatin1(re->rune());
555 }
556 else {
557 info = Literal(re->rune());
558 }
559 break;
560
561 case kRegexpLiteralString:
562 if (re->nrunes() == 0) {
563 info = NoMatch();
564 break;
565 }
566 if (latin1()) {
567 info = LiteralLatin1(re->runes()[0]);
568 for (int i = 1; i < re->nrunes(); i++) {
569 info = Concat(info, LiteralLatin1(re->runes()[i]));
570 }
571 } else {
572 info = Literal(re->runes()[0]);
573 for (int i = 1; i < re->nrunes(); i++) {
574 info = Concat(info, Literal(re->runes()[i]));
575 }
576 }
577 break;
578
579 case kRegexpConcat: {
580 // Accumulate in info.
581 // Exact is concat of recent contiguous exact nodes.
582 info = NULL;
583 Info* exact = NULL;
584 for (int i = 0; i < nchild_args; i++) {
585 Info* ci = child_args[i]; // child info
586 if (!ci->is_exact() ||
587 (exact && ci->exact().size() * exact->exact().size() > 16)) {
588 // Exact run is over.
589 info = And(info, exact);
590 exact = NULL;
591 // Add this child's info.
592 info = And(info, ci);
593 } else {
594 // Append to exact run.
595 exact = Concat(exact, ci);
596 }
597 }
598 info = And(info, exact);
599 }
600 break;
601
602 case kRegexpAlternate:
603 info = child_args[0];
604 for (int i = 1; i < nchild_args; i++)
605 info = Alt(info, child_args[i]);
606 VLOG(10) << "Alt: " << info->ToString();
607 break;
608
609 case kRegexpStar:
610 info = Star(child_args[0]);
611 break;
612
613 case kRegexpQuest:
614 info = Quest(child_args[0]);
615 break;
616
617 case kRegexpPlus:
618 info = Plus(child_args[0]);
619 break;
620
621 case kRegexpAnyChar:
622 // Claim nothing, except that it's not empty.
623 info = AnyChar();
624 break;
625
626 case kRegexpCharClass:
627 info = CClass(re->cc(), latin1());
628 break;
629
630 case kRegexpCapture:
631 // These don't affect the set of matching strings.
632 info = child_args[0];
633 break;
634 }
635
636 if (Trace) {
637 VLOG(0) << "BuildInfo " << re->ToString()
638 << ": " << (info ? info->ToString() : "");
639 }
640
641 return info;
642 }
643
644
645 Prefilter* Prefilter::FromRegexp(Regexp* re) {
646 if (re == NULL)
647 return NULL;
648
649 Regexp* simple = re->Simplify();
650 Prefilter::Info *info = BuildInfo(simple);
651
652 simple->Decref();
653 if (info == NULL)
654 return NULL;
655
656 Prefilter* m = info->TakeMatch();
657
658 delete info;
659 return m;
660 }
661
662 string Prefilter::DebugString() const {
663 switch (op_) {
664 default:
665 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
666 return StringPrintf("op%d", op_);
667 case NONE:
668 return "*no-matches*";
669 case ATOM:
670 return atom_;
671 case ALL:
672 return "";
673 case AND: {
674 string s = "";
675 for (size_t i = 0; i < subs_->size(); i++) {
676 if (i > 0)
677 s += " ";
678 Prefilter* sub = (*subs_)[i];
679 s += sub ? sub->DebugString() : "<nil>";
680 }
681 return s;
682 }
683 case OR: {
684 string s = "(";
685 for (size_t i = 0; i < subs_->size(); i++) {
686 if (i > 0)
687 s += "|";
688 Prefilter* sub = (*subs_)[i];
689 s += sub ? sub->DebugString() : "<nil>";
690 }
691 s += ")";
692 return s;
693 }
694 }
695 }
696
697 Prefilter* Prefilter::FromRE2(const RE2* re2) {
698 if (re2 == NULL)
699 return NULL;
700
701 Regexp* regexp = re2->Regexp();
702 if (regexp == NULL)
703 return NULL;
704
705 return FromRegexp(regexp);
706 }
707
708
709 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/prefilter.h ('k') | third_party/re2/re2/prefilter_tree.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698