Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(90)

Side by Side Diff: third_party/re2/re2/compile.cc

Issue 1516543002: Update re2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Nits Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // Compile regular expression to Prog. 5 // Compile regular expression to Prog.
6 // 6 //
7 // Prog and Inst are defined in prog.h. 7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg. 8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private. 9 // The Compiler class defined in this file is private.
10 10
(...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after
223 int inst_len_; // Number of instructions used. 223 int inst_len_; // Number of instructions used.
224 int inst_cap_; // Number of instructions allocated. 224 int inst_cap_; // Number of instructions allocated.
225 225
226 int64 max_mem_; // Total memory budget. 226 int64 max_mem_; // Total memory budget.
227 227
228 map<uint64, int> rune_cache_; 228 map<uint64, int> rune_cache_;
229 Frag rune_range_; 229 Frag rune_range_;
230 230
231 RE2::Anchor anchor_; // anchor mode for RE2::Set 231 RE2::Anchor anchor_; // anchor mode for RE2::Set
232 232
233 DISALLOW_EVIL_CONSTRUCTORS(Compiler); 233 DISALLOW_COPY_AND_ASSIGN(Compiler);
234 }; 234 };
235 235
236 Compiler::Compiler() { 236 Compiler::Compiler() {
237 prog_ = new Prog(); 237 prog_ = new Prog();
238 failed_ = false; 238 failed_ = false;
239 encoding_ = kEncodingUTF8; 239 encoding_ = kEncodingUTF8;
240 reversed_ = false; 240 reversed_ = false;
241 inst_ = NULL; 241 inst_ = NULL;
242 inst_len_ = 0; 242 inst_len_ = 0;
243 inst_cap_ = 0; 243 inst_cap_ = 0;
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
364 364
365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) 365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
366 Frag Compiler::Plus(Frag a, bool nongreedy) { 366 Frag Compiler::Plus(Frag a, bool nongreedy) {
367 // a+ is just a* with a different entry point. 367 // a+ is just a* with a different entry point.
368 Frag f = Star(a, nongreedy); 368 Frag f = Star(a, nongreedy);
369 return Frag(a.begin, f.end); 369 return Frag(a.begin, f.end);
370 } 370 }
371 371
372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) 372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
373 Frag Compiler::Quest(Frag a, bool nongreedy) { 373 Frag Compiler::Quest(Frag a, bool nongreedy) {
374 if (IsNoMatch(a))
375 return Nop();
374 int id = AllocInst(1); 376 int id = AllocInst(1);
375 if (id < 0) 377 if (id < 0)
376 return NoMatch(); 378 return NoMatch();
377 PatchList pl; 379 PatchList pl;
378 if (nongreedy) { 380 if (nongreedy) {
379 inst_[id].InitAlt(0, a.begin); 381 inst_[id].InitAlt(0, a.begin);
380 pl = PatchList::Mk(id << 1); 382 pl = PatchList::Mk(id << 1);
381 } else { 383 } else {
382 inst_[id].InitAlt(a.begin, 0); 384 inst_[id].InitAlt(a.begin, 0);
383 pl = PatchList::Mk((id << 1) | 1); 385 pl = PatchList::Mk((id << 1) | 1);
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
426 Frag Compiler::EmptyWidth(EmptyOp empty) { 428 Frag Compiler::EmptyWidth(EmptyOp empty) {
427 int id = AllocInst(1); 429 int id = AllocInst(1);
428 if (id < 0) 430 if (id < 0)
429 return NoMatch(); 431 return NoMatch();
430 inst_[id].InitEmptyWidth(empty, 0); 432 inst_[id].InitEmptyWidth(empty, 0);
431 if (empty & (kEmptyBeginLine|kEmptyEndLine)) 433 if (empty & (kEmptyBeginLine|kEmptyEndLine))
432 prog_->MarkByteRange('\n', '\n'); 434 prog_->MarkByteRange('\n', '\n');
433 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { 435 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
434 int j; 436 int j;
435 for (int i = 0; i < 256; i = j) { 437 for (int i = 0; i < 256; i = j) {
436 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) 438 for (j = i + 1; j < 256 &&
439 Prog::IsWordChar(static_cast<uint8>(i)) ==
440 Prog::IsWordChar(static_cast<uint8>(j));
441 j++)
437 ; 442 ;
438 prog_->MarkByteRange(i, j-1); 443 prog_->MarkByteRange(i, j-1);
439 } 444 }
440 } 445 }
441 return Frag(id, PatchList::Mk(id << 1)); 446 return Frag(id, PatchList::Mk(id << 1));
442 } 447 }
443 448
444 // Given a fragment a, returns a fragment with capturing parens around a. 449 // Given a fragment a, returns a fragment with capturing parens around a.
445 Frag Compiler::Capture(Frag a, int n) { 450 Frag Compiler::Capture(Frag a, int n) {
451 if (IsNoMatch(a))
452 return NoMatch();
446 int id = AllocInst(2); 453 int id = AllocInst(2);
447 if (id < 0) 454 if (id < 0)
448 return NoMatch(); 455 return NoMatch();
449 inst_[id].InitCapture(2*n, a.begin); 456 inst_[id].InitCapture(2*n, a.begin);
450 inst_[id+1].InitCapture(2*n+1, 0); 457 inst_[id+1].InitCapture(2*n+1, 0);
451 PatchList::Patch(inst_, a.end, id+1); 458 PatchList::Patch(inst_, a.end, id+1);
452 459
453 return Frag(id, PatchList::Mk((id+1) << 1)); 460 return Frag(id, PatchList::Mk((id+1) << 1));
454 } 461 }
455 462
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
492 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 499 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
493 // In Latin1 mode, there's no point in caching. 500 // In Latin1 mode, there's no point in caching.
494 // In forward UTF-8 mode, only need to cache continuation bytes. 501 // In forward UTF-8 mode, only need to cache continuation bytes.
495 if (encoding_ == kEncodingLatin1 || 502 if (encoding_ == kEncodingLatin1 ||
496 (encoding_ == kEncodingUTF8 && 503 (encoding_ == kEncodingUTF8 &&
497 !reversed_ && 504 !reversed_ &&
498 !(0x80 <= lo && hi <= 0xbf))) { 505 !(0x80 <= lo && hi <= 0xbf))) {
499 return UncachedRuneByteSuffix(lo, hi, foldcase, next); 506 return UncachedRuneByteSuffix(lo, hi, foldcase, next);
500 } 507 }
501 508
502 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 1ULL : 0UL L); 509 uint64 key = (uint64)next << 17 |
510 (uint64)lo << 9 |
511 (uint64)hi << 1 |
512 (uint64)foldcase;
503 map<uint64, int>::iterator it = rune_cache_.find(key); 513 map<uint64, int>::iterator it = rune_cache_.find(key);
504 if (it != rune_cache_.end()) 514 if (it != rune_cache_.end())
505 return it->second; 515 return it->second;
506 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 516 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
507 rune_cache_[key] = id; 517 rune_cache_[key] = id;
508 return id; 518 return id;
509 } 519 }
510 520
511 void Compiler::AddSuffix(int id) { 521 void Compiler::AddSuffix(int id) {
512 if (rune_range_.begin == 0) { 522 if (rune_range_.begin == 0) {
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
544 break; 554 break;
545 } 555 }
546 } 556 }
547 557
548 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { 558 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
549 // Latin1 is easy: runes *are* bytes. 559 // Latin1 is easy: runes *are* bytes.
550 if (lo > hi || lo > 0xFF) 560 if (lo > hi || lo > 0xFF)
551 return; 561 return;
552 if (hi > 0xFF) 562 if (hi > 0xFF)
553 hi = 0xFF; 563 hi = 0xFF;
554 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 564 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
565 foldcase, 0));
555 } 566 }
556 567
557 // Table describing how to make a UTF-8 matching machine 568 // Table describing how to make a UTF-8 matching machine
558 // for the rune range 80-10FFFF (Runeself-Runemax). 569 // for the rune range 80-10FFFF (Runeself-Runemax).
559 // This range happens frequently enough (for example /./ and /[^a-z]/) 570 // This range happens frequently enough (for example /./ and /[^a-z]/)
560 // and the rune_cache_ map is slow enough that this is worth 571 // and the rune_cache_ map is slow enough that this is worth
561 // special handling. Makes compilation of a small expression 572 // special handling. Makes compilation of a small expression
562 // with a dot in it about 10% faster. 573 // with a dot in it about 10% faster.
563 // The * in the comments below mark whole sequences. 574 // The * in the comments below mark whole sequences.
564 static struct ByteRangeProg { 575 static struct ByteRangeProg {
(...skipping 20 matching lines...) Expand all
585 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* 596 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
586 }; 597 };
587 598
588 void Compiler::Add_80_10ffff() { 599 void Compiler::Add_80_10ffff() {
589 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize d; silences gcc warning 600 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize d; silences gcc warning
590 for (int i = 0; i < arraysize(prog_80_10ffff); i++) { 601 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
591 const ByteRangeProg& p = prog_80_10ffff[i]; 602 const ByteRangeProg& p = prog_80_10ffff[i];
592 int next = 0; 603 int next = 0;
593 if (p.next >= 0) 604 if (p.next >= 0)
594 next = inst[p.next]; 605 next = inst[p.next];
595 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); 606 inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo),
607 static_cast<uint8>(p.hi), false, next);
596 if ((p.lo & 0xC0) != 0x80) 608 if ((p.lo & 0xC0) != 0x80)
597 AddSuffix(inst[i]); 609 AddSuffix(inst[i]);
598 } 610 }
599 } 611 }
600 612
601 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { 613 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
602 if (lo > hi) 614 if (lo > hi)
603 return; 615 return;
604 616
605 // Pick off 80-10FFFF as a common special case 617 // Pick off 80-10FFFF as a common special case
606 // that can bypass the slow rune_cache_. 618 // that can bypass the slow rune_cache_.
607 if (lo == 0x80 && hi == 0x10ffff && !reversed_) { 619 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
608 Add_80_10ffff(); 620 Add_80_10ffff();
609 return; 621 return;
610 } 622 }
611 623
612 // Split range into same-length sized ranges. 624 // Split range into same-length sized ranges.
613 for (int i = 1; i < UTFmax; i++) { 625 for (int i = 1; i < UTFmax; i++) {
614 Rune max = MaxRune(i); 626 Rune max = MaxRune(i);
615 if (lo <= max && max < hi) { 627 if (lo <= max && max < hi) {
616 AddRuneRangeUTF8(lo, max, foldcase); 628 AddRuneRangeUTF8(lo, max, foldcase);
617 AddRuneRangeUTF8(max+1, hi, foldcase); 629 AddRuneRangeUTF8(max+1, hi, foldcase);
618 return; 630 return;
619 } 631 }
620 } 632 }
621 633
622 // ASCII range is always a special case. 634 // ASCII range is always a special case.
623 if (hi < Runeself) { 635 if (hi < Runeself) {
624 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 636 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
637 foldcase, 0));
625 return; 638 return;
626 } 639 }
627 640
628 // Split range into sections that agree on leading bytes. 641 // Split range into sections that agree on leading bytes.
629 for (int i = 1; i < UTFmax; i++) { 642 for (int i = 1; i < UTFmax; i++) {
630 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence 643 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
631 if ((lo & ~m) != (hi & ~m)) { 644 if ((lo & ~m) != (hi & ~m)) {
632 if ((lo & m) != 0) { 645 if ((lo & m) != 0) {
633 AddRuneRangeUTF8(lo, lo|m, foldcase); 646 AddRuneRangeUTF8(lo, lo|m, foldcase);
634 AddRuneRangeUTF8((lo|m)+1, hi, foldcase); 647 AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after
742 } 755 }
743 756
744 case kRegexpAlternate: { 757 case kRegexpAlternate: {
745 Frag f = child_frags[0]; 758 Frag f = child_frags[0];
746 for (int i = 1; i < nchild_frags; i++) 759 for (int i = 1; i < nchild_frags; i++)
747 f = Alt(f, child_frags[i]); 760 f = Alt(f, child_frags[i]);
748 return f; 761 return f;
749 } 762 }
750 763
751 case kRegexpStar: 764 case kRegexpStar:
752 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 765 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
753 766
754 case kRegexpPlus: 767 case kRegexpPlus:
755 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 768 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
756 769
757 case kRegexpQuest: 770 case kRegexpQuest:
758 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 771 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
759 772
760 case kRegexpLiteral: 773 case kRegexpLiteral:
761 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); 774 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
762 775
763 case kRegexpLiteralString: { 776 case kRegexpLiteralString: {
764 // Concatenation of literals. 777 // Concatenation of literals.
765 if (re->nrunes() == 0) 778 if (re->nrunes() == 0)
766 return Nop(); 779 return Nop();
767 Frag f; 780 Frag f;
768 for (int i = 0; i < re->nrunes(); i++) { 781 for (int i = 0; i < re->nrunes(); i++) {
769 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); 782 Frag f1 = Literal(re->runes()[i],
783 (re->parse_flags()&Regexp::FoldCase) != 0);
770 if (i == 0) 784 if (i == 0)
771 f = f1; 785 f = f1;
772 else 786 else
773 f = Cat(f, f1); 787 f = Cat(f, f1);
774 } 788 }
775 return f; 789 return f;
776 } 790 }
777 791
778 case kRegexpAnyChar: 792 case kRegexpAnyChar:
779 BeginRange(); 793 BeginRange();
(...skipping 24 matching lines...) Expand all
804 // character ranges in the class. 818 // character ranges in the class.
805 BeginRange(); 819 BeginRange();
806 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { 820 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
807 // ASCII case-folding optimization (see above). 821 // ASCII case-folding optimization (see above).
808 if (foldascii && 'A' <= i->lo && i->hi <= 'Z') 822 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
809 continue; 823 continue;
810 824
811 // If this range contains all of A-Za-z or none of it, 825 // If this range contains all of A-Za-z or none of it,
812 // the fold flag is unnecessary; don't bother. 826 // the fold flag is unnecessary; don't bother.
813 bool fold = foldascii; 827 bool fold = foldascii;
814 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) 828 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
829 ('Z' < i->lo && i->hi < 'a'))
815 fold = false; 830 fold = false;
816 831
817 AddRuneRange(i->lo, i->hi, fold); 832 AddRuneRange(i->lo, i->hi, fold);
818 } 833 }
819 return EndRange(); 834 return EndRange();
820 } 835 }
821 836
822 case kRegexpCapture: 837 case kRegexpCapture:
823 // If this is a non-capturing parenthesis -- (?:foo) -- 838 // If this is a non-capturing parenthesis -- (?:foo) --
824 // just use the inner expression. 839 // just use the inner expression.
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
947 962
948 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, 963 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
949 RE2::Anchor anchor) { 964 RE2::Anchor anchor) {
950 prog_->set_flags(flags); 965 prog_->set_flags(flags);
951 966
952 if (flags & Regexp::Latin1) 967 if (flags & Regexp::Latin1)
953 encoding_ = kEncodingLatin1; 968 encoding_ = kEncodingLatin1;
954 max_mem_ = max_mem; 969 max_mem_ = max_mem;
955 if (max_mem <= 0) { 970 if (max_mem <= 0) {
956 max_inst_ = 100000; // more than enough 971 max_inst_ = 100000; // more than enough
957 } else if (max_mem <= sizeof(Prog)) { 972 } else if (max_mem <= static_cast<int64>(sizeof(Prog))) {
958 // No room for anything. 973 // No room for anything.
959 max_inst_ = 0; 974 max_inst_ = 0;
960 } else { 975 } else {
961 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 976 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
962 // Limit instruction count so that inst->id() fits nicely in an int. 977 // Limit instruction count so that inst->id() fits nicely in an int.
963 // SparseArray also assumes that the indices (inst->id()) are ints. 978 // SparseArray also assumes that the indices (inst->id()) are ints.
964 // The call to WalkExponential uses 2*max_inst_ below, 979 // The call to WalkExponential uses 2*max_inst_ below,
965 // and other places in the code use 2 or 3 * prog->size(). 980 // and other places in the code use 2 or 3 * prog->size().
966 // Limiting to 2^24 should avoid overflow in those places. 981 // Limiting to 2^24 should avoid overflow in those places.
967 // (The point of allowing more than 32 bits of memory is to 982 // (The point of allowing more than 32 bits of memory is to
968 // have plenty of room for the DFA states, not to use it up 983 // have plenty of room for the DFA states, not to use it up
969 // on the program.) 984 // on the program.)
970 if (m >= 1<<24) 985 if (m >= 1<<24)
971 m = 1<<24; 986 m = 1<<24;
972 987
973 // Inst imposes its own limit (currently bigger than 2^24 but be safe). 988 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
974 if (m > Prog::Inst::kMaxInst) 989 if (m > Prog::Inst::kMaxInst)
975 m = Prog::Inst::kMaxInst; 990 m = Prog::Inst::kMaxInst;
976 991
977 max_inst_ = m; 992 max_inst_ = static_cast<int>(m);
978 } 993 }
979 994
980 anchor_ = anchor; 995 anchor_ = anchor;
981 } 996 }
982 997
983 // Compiles re, returning program. 998 // Compiles re, returning program.
984 // Caller is responsible for deleting prog_. 999 // Caller is responsible for deleting prog_.
985 // If reversed is true, compiles a program that expects 1000 // If reversed is true, compiles a program that expects
986 // to run over the input string backward (reverses all concatenations). 1001 // to run over the input string backward (reverses all concatenations).
987 // The reversed flag is also recorded in the returned program. 1002 // The reversed flag is also recorded in the returned program.
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after
1127 1142
1128 return prog; 1143 return prog;
1129 } 1144 }
1130 1145
1131 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1146 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1132 Regexp* re) { 1147 Regexp* re) {
1133 return Compiler::CompileSet(options, anchor, re); 1148 return Compiler::CompileSet(options, anchor, re);
1134 } 1149 }
1135 1150
1136 } // namespace re2 1151 } // namespace re2
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698