third_party/re2/re2/compile.cc - Issue 1516543002: Update re2

Side by Side Diff: third_party/re2/re2/compile.cc

Issue 1516543002: Update re2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Nits Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.	1 // Copyright 2007 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style	2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.	3 // license that can be found in the LICENSE file.

4	4

5 // Compile regular expression to Prog.	5 // Compile regular expression to Prog.

6 //	6 //

7 // Prog and Inst are defined in prog.h.	7 // Prog and Inst are defined in prog.h.

8 // This file's external interface is just Regexp::CompileToProg.	8 // This file's external interface is just Regexp::CompileToProg.

9 // The Compiler class defined in this file is private.	9 // The Compiler class defined in this file is private.

10	10

(...skipping 212 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
223 int inst_len_; // Number of instructions used.	223 int inst_len_; // Number of instructions used.

224 int inst_cap_; // Number of instructions allocated.	224 int inst_cap_; // Number of instructions allocated.

225	225

226 int64 max_mem_; // Total memory budget.	226 int64 max_mem_; // Total memory budget.

227	227

228 map<uint64, int> rune_cache_;	228 map<uint64, int> rune_cache_;

229 Frag rune_range_;	229 Frag rune_range_;

230	230

231 RE2::Anchor anchor_; // anchor mode for RE2::Set	231 RE2::Anchor anchor_; // anchor mode for RE2::Set

232	232

233 DISALLOW_EVIL_CONSTRUCTORS(Compiler);	233 DISALLOW_COPY_AND_ASSIGN(Compiler);

234 };	234 };

235	235

236 Compiler::Compiler() {	236 Compiler::Compiler() {

237 prog_ = new Prog();	237 prog_ = new Prog();

238 failed_ = false;	238 failed_ = false;

239 encoding_ = kEncodingUTF8;	239 encoding_ = kEncodingUTF8;

240 reversed_ = false;	240 reversed_ = false;

241 inst_ = NULL;	241 inst_ = NULL;

242 inst_len_ = 0;	242 inst_len_ = 0;

243 inst_cap_ = 0;	243 inst_cap_ = 0;

(...skipping 120 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
364	364

365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)	365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)

366 Frag Compiler::Plus(Frag a, bool nongreedy) {	366 Frag Compiler::Plus(Frag a, bool nongreedy) {

367 // a+ is just a* with a different entry point.	367 // a+ is just a* with a different entry point.

368 Frag f = Star(a, nongreedy);	368 Frag f = Star(a, nongreedy);

369 return Frag(a.begin, f.end);	369 return Frag(a.begin, f.end);

370 }	370 }

371	371

372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)	372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)

373 Frag Compiler::Quest(Frag a, bool nongreedy) {	373 Frag Compiler::Quest(Frag a, bool nongreedy) {

	374 if (IsNoMatch(a))

	375 return Nop();

374 int id = AllocInst(1);	376 int id = AllocInst(1);

375 if (id < 0)	377 if (id < 0)

376 return NoMatch();	378 return NoMatch();

377 PatchList pl;	379 PatchList pl;

378 if (nongreedy) {	380 if (nongreedy) {

379 inst_[id].InitAlt(0, a.begin);	381 inst_[id].InitAlt(0, a.begin);

380 pl = PatchList::Mk(id << 1);	382 pl = PatchList::Mk(id << 1);

381 } else {	383 } else {

382 inst_[id].InitAlt(a.begin, 0);	384 inst_[id].InitAlt(a.begin, 0);

383 pl = PatchList::Mk((id << 1) \| 1);	385 pl = PatchList::Mk((id << 1) \| 1);

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
426 Frag Compiler::EmptyWidth(EmptyOp empty) {	428 Frag Compiler::EmptyWidth(EmptyOp empty) {

427 int id = AllocInst(1);	429 int id = AllocInst(1);

428 if (id < 0)	430 if (id < 0)

429 return NoMatch();	431 return NoMatch();

430 inst_[id].InitEmptyWidth(empty, 0);	432 inst_[id].InitEmptyWidth(empty, 0);

431 if (empty & (kEmptyBeginLine\|kEmptyEndLine))	433 if (empty & (kEmptyBeginLine\|kEmptyEndLine))

432 prog_->MarkByteRange('\n', '\n');	434 prog_->MarkByteRange('\n', '\n');

433 if (empty & (kEmptyWordBoundary\|kEmptyNonWordBoundary)) {	435 if (empty & (kEmptyWordBoundary\|kEmptyNonWordBoundary)) {

434 int j;	436 int j;

435 for (int i = 0; i < 256; i = j) {	437 for (int i = 0; i < 256; i = j) {

436 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)	438 for (j = i + 1; j < 256 &&

	439 Prog::IsWordChar(static_cast<uint8>(i)) ==

	440 Prog::IsWordChar(static_cast<uint8>(j));

	441 j++)

437 ;	442 ;

438 prog_->MarkByteRange(i, j-1);	443 prog_->MarkByteRange(i, j-1);

439 }	444 }

440 }	445 }

441 return Frag(id, PatchList::Mk(id << 1));	446 return Frag(id, PatchList::Mk(id << 1));

442 }	447 }

443	448

444 // Given a fragment a, returns a fragment with capturing parens around a.	449 // Given a fragment a, returns a fragment with capturing parens around a.

445 Frag Compiler::Capture(Frag a, int n) {	450 Frag Compiler::Capture(Frag a, int n) {

	451 if (IsNoMatch(a))

	452 return NoMatch();

446 int id = AllocInst(2);	453 int id = AllocInst(2);

447 if (id < 0)	454 if (id < 0)

448 return NoMatch();	455 return NoMatch();

449 inst_[id].InitCapture(2*n, a.begin);	456 inst_[id].InitCapture(2*n, a.begin);

450 inst_[id+1].InitCapture(2*n+1, 0);	457 inst_[id+1].InitCapture(2*n+1, 0);

451 PatchList::Patch(inst_, a.end, id+1);	458 PatchList::Patch(inst_, a.end, id+1);

452	459

453 return Frag(id, PatchList::Mk((id+1) << 1));	460 return Frag(id, PatchList::Mk((id+1) << 1));

454 }	461 }

455	462

(...skipping 36 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
492 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {	499 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {

493 // In Latin1 mode, there's no point in caching.	500 // In Latin1 mode, there's no point in caching.

494 // In forward UTF-8 mode, only need to cache continuation bytes.	501 // In forward UTF-8 mode, only need to cache continuation bytes.

495 if (encoding_ == kEncodingLatin1 \|\|	502 if (encoding_ == kEncodingLatin1 \|\|

496 (encoding_ == kEncodingUTF8 &&	503 (encoding_ == kEncodingUTF8 &&

497 !reversed_ &&	504 !reversed_ &&

498 !(0x80 <= lo && hi <= 0xbf))) {	505 !(0x80 <= lo && hi <= 0xbf))) {

499 return UncachedRuneByteSuffix(lo, hi, foldcase, next);	506 return UncachedRuneByteSuffix(lo, hi, foldcase, next);

500 }	507 }

501	508

502 uint64 key = ((uint64)next << 17) \| (lo<<9) \| (hi<<1) \| (foldcase ? 1ULL : 0UL L);	509 uint64 key = (uint64)next << 17 \|

	510 (uint64)lo << 9 \|

	511 (uint64)hi << 1 \|

	512 (uint64)foldcase;

503 map<uint64, int>::iterator it = rune_cache_.find(key);	513 map<uint64, int>::iterator it = rune_cache_.find(key);

504 if (it != rune_cache_.end())	514 if (it != rune_cache_.end())

505 return it->second;	515 return it->second;

506 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);	516 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);

507 rune_cache_[key] = id;	517 rune_cache_[key] = id;

508 return id;	518 return id;

509 }	519 }

510	520

511 void Compiler::AddSuffix(int id) {	521 void Compiler::AddSuffix(int id) {

512 if (rune_range_.begin == 0) {	522 if (rune_range_.begin == 0) {

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
544 break;	554 break;

545 }	555 }

546 }	556 }

547	557

548 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {	558 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {

549 // Latin1 is easy: runes are bytes.	559 // Latin1 is easy: runes are bytes.

550 if (lo > hi \|\| lo > 0xFF)	560 if (lo > hi \|\| lo > 0xFF)

551 return;	561 return;

552 if (hi > 0xFF)	562 if (hi > 0xFF)

553 hi = 0xFF;	563 hi = 0xFF;

554 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));	564 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),

	565 foldcase, 0));

555 }	566 }

556	567

557 // Table describing how to make a UTF-8 matching machine	568 // Table describing how to make a UTF-8 matching machine

558 // for the rune range 80-10FFFF (Runeself-Runemax).	569 // for the rune range 80-10FFFF (Runeself-Runemax).

559 // This range happens frequently enough (for example /./ and /[^a-z]/)	570 // This range happens frequently enough (for example /./ and /[^a-z]/)

560 // and the rune_cache_ map is slow enough that this is worth	571 // and the rune_cache_ map is slow enough that this is worth

561 // special handling. Makes compilation of a small expression	572 // special handling. Makes compilation of a small expression

562 // with a dot in it about 10% faster.	573 // with a dot in it about 10% faster.

563 // The * in the comments below mark whole sequences.	574 // The * in the comments below mark whole sequences.

564 static struct ByteRangeProg {	575 static struct ByteRangeProg {

(...skipping 20 matching lines...) Expand all Loading...
585 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*	596 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*

586 };	597 };

587	598

588 void Compiler::Add_80_10ffff() {	599 void Compiler::Add_80_10ffff() {

589 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize d; silences gcc warning	600 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize d; silences gcc warning

590 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {	601 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {

591 const ByteRangeProg& p = prog_80_10ffff[i];	602 const ByteRangeProg& p = prog_80_10ffff[i];

592 int next = 0;	603 int next = 0;

593 if (p.next >= 0)	604 if (p.next >= 0)

594 next = inst[p.next];	605 next = inst[p.next];

595 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);	606 inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo),

	607 static_cast<uint8>(p.hi), false, next);

596 if ((p.lo & 0xC0) != 0x80)	608 if ((p.lo & 0xC0) != 0x80)

597 AddSuffix(inst[i]);	609 AddSuffix(inst[i]);

598 }	610 }

599 }	611 }

600	612

601 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {	613 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {

602 if (lo > hi)	614 if (lo > hi)

603 return;	615 return;

604	616

605 // Pick off 80-10FFFF as a common special case	617 // Pick off 80-10FFFF as a common special case

606 // that can bypass the slow rune_cache_.	618 // that can bypass the slow rune_cache_.

607 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {	619 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {

608 Add_80_10ffff();	620 Add_80_10ffff();

609 return;	621 return;

610 }	622 }

611	623

612 // Split range into same-length sized ranges.	624 // Split range into same-length sized ranges.

613 for (int i = 1; i < UTFmax; i++) {	625 for (int i = 1; i < UTFmax; i++) {

614 Rune max = MaxRune(i);	626 Rune max = MaxRune(i);

615 if (lo <= max && max < hi) {	627 if (lo <= max && max < hi) {

616 AddRuneRangeUTF8(lo, max, foldcase);	628 AddRuneRangeUTF8(lo, max, foldcase);

617 AddRuneRangeUTF8(max+1, hi, foldcase);	629 AddRuneRangeUTF8(max+1, hi, foldcase);

618 return;	630 return;

619 }	631 }

620 }	632 }

621	633

622 // ASCII range is always a special case.	634 // ASCII range is always a special case.

623 if (hi < Runeself) {	635 if (hi < Runeself) {

624 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));	636 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),

	637 foldcase, 0));

625 return;	638 return;

626 }	639 }

627	640

628 // Split range into sections that agree on leading bytes.	641 // Split range into sections that agree on leading bytes.

629 for (int i = 1; i < UTFmax; i++) {	642 for (int i = 1; i < UTFmax; i++) {

630 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence	643 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence

631 if ((lo & ~m) != (hi & ~m)) {	644 if ((lo & ~m) != (hi & ~m)) {

632 if ((lo & m) != 0) {	645 if ((lo & m) != 0) {

633 AddRuneRangeUTF8(lo, lo\|m, foldcase);	646 AddRuneRangeUTF8(lo, lo\|m, foldcase);

634 AddRuneRangeUTF8((lo\|m)+1, hi, foldcase);	647 AddRuneRangeUTF8((lo\|m)+1, hi, foldcase);

(...skipping 107 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
742 }	755 }

743	756

744 case kRegexpAlternate: {	757 case kRegexpAlternate: {

745 Frag f = child_frags[0];	758 Frag f = child_frags[0];

746 for (int i = 1; i < nchild_frags; i++)	759 for (int i = 1; i < nchild_frags; i++)

747 f = Alt(f, child_frags[i]);	760 f = Alt(f, child_frags[i]);

748 return f;	761 return f;

749 }	762 }

750	763

751 case kRegexpStar:	764 case kRegexpStar:

752 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);	765 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

753	766

754 case kRegexpPlus:	767 case kRegexpPlus:

755 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);	768 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

756	769

757 case kRegexpQuest:	770 case kRegexpQuest:

758 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);	771 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

759	772

760 case kRegexpLiteral:	773 case kRegexpLiteral:

761 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);	774 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

762	775

763 case kRegexpLiteralString: {	776 case kRegexpLiteralString: {

764 // Concatenation of literals.	777 // Concatenation of literals.

765 if (re->nrunes() == 0)	778 if (re->nrunes() == 0)

766 return Nop();	779 return Nop();

767 Frag f;	780 Frag f;

768 for (int i = 0; i < re->nrunes(); i++) {	781 for (int i = 0; i < re->nrunes(); i++) {

769 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);	782 Frag f1 = Literal(re->runes()[i],

	783 (re->parse_flags()&Regexp::FoldCase) != 0);

770 if (i == 0)	784 if (i == 0)

771 f = f1;	785 f = f1;

772 else	786 else

773 f = Cat(f, f1);	787 f = Cat(f, f1);

774 }	788 }

775 return f;	789 return f;

776 }	790 }

777	791

778 case kRegexpAnyChar:	792 case kRegexpAnyChar:

779 BeginRange();	793 BeginRange();

(...skipping 24 matching lines...) Expand all Loading...
804 // character ranges in the class.	818 // character ranges in the class.

805 BeginRange();	819 BeginRange();

806 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {	820 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {

807 // ASCII case-folding optimization (see above).	821 // ASCII case-folding optimization (see above).

808 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')	822 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')

809 continue;	823 continue;

810	824

811 // If this range contains all of A-Za-z or none of it,	825 // If this range contains all of A-Za-z or none of it,

812 // the fold flag is unnecessary; don't bother.	826 // the fold flag is unnecessary; don't bother.

813 bool fold = foldascii;	827 bool fold = foldascii;

814 if ((i->lo <= 'A' && 'z' <= i->hi) \|\| i->hi < 'A' \|\| 'z' < i->lo)	828 if ((i->lo <= 'A' && 'z' <= i->hi) \|\| i->hi < 'A' \|\| 'z' < i->lo \|\|

	829 ('Z' < i->lo && i->hi < 'a'))

815 fold = false;	830 fold = false;

816	831

817 AddRuneRange(i->lo, i->hi, fold);	832 AddRuneRange(i->lo, i->hi, fold);

818 }	833 }

819 return EndRange();	834 return EndRange();

820 }	835 }

821	836

822 case kRegexpCapture:	837 case kRegexpCapture:

823 // If this is a non-capturing parenthesis -- (?:foo) --	838 // If this is a non-capturing parenthesis -- (?:foo) --

824 // just use the inner expression.	839 // just use the inner expression.

(...skipping 122 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
947	962

948 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,	963 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,

949 RE2::Anchor anchor) {	964 RE2::Anchor anchor) {

950 prog_->set_flags(flags);	965 prog_->set_flags(flags);

951	966

952 if (flags & Regexp::Latin1)	967 if (flags & Regexp::Latin1)

953 encoding_ = kEncodingLatin1;	968 encoding_ = kEncodingLatin1;

954 max_mem_ = max_mem;	969 max_mem_ = max_mem;

955 if (max_mem <= 0) {	970 if (max_mem <= 0) {

956 max_inst_ = 100000; // more than enough	971 max_inst_ = 100000; // more than enough

957 } else if (max_mem <= sizeof(Prog)) {	972 } else if (max_mem <= static_cast<int64>(sizeof(Prog))) {

958 // No room for anything.	973 // No room for anything.

959 max_inst_ = 0;	974 max_inst_ = 0;

960 } else {	975 } else {

961 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);	976 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);

962 // Limit instruction count so that inst->id() fits nicely in an int.	977 // Limit instruction count so that inst->id() fits nicely in an int.

963 // SparseArray also assumes that the indices (inst->id()) are ints.	978 // SparseArray also assumes that the indices (inst->id()) are ints.

964 // The call to WalkExponential uses 2*max_inst_ below,	979 // The call to WalkExponential uses 2*max_inst_ below,

965 // and other places in the code use 2 or 3 * prog->size().	980 // and other places in the code use 2 or 3 * prog->size().

966 // Limiting to 2^24 should avoid overflow in those places.	981 // Limiting to 2^24 should avoid overflow in those places.

967 // (The point of allowing more than 32 bits of memory is to	982 // (The point of allowing more than 32 bits of memory is to

968 // have plenty of room for the DFA states, not to use it up	983 // have plenty of room for the DFA states, not to use it up

969 // on the program.)	984 // on the program.)

970 if (m >= 1<<24)	985 if (m >= 1<<24)

971 m = 1<<24;	986 m = 1<<24;

972	987

973 // Inst imposes its own limit (currently bigger than 2^24 but be safe).	988 // Inst imposes its own limit (currently bigger than 2^24 but be safe).

974 if (m > Prog::Inst::kMaxInst)	989 if (m > Prog::Inst::kMaxInst)

975 m = Prog::Inst::kMaxInst;	990 m = Prog::Inst::kMaxInst;

976	991

977 max_inst_ = m;	992 max_inst_ = static_cast<int>(m);

978 }	993 }

979	994

980 anchor_ = anchor;	995 anchor_ = anchor;

981 }	996 }

982	997

983 // Compiles re, returning program.	998 // Compiles re, returning program.

984 // Caller is responsible for deleting prog_.	999 // Caller is responsible for deleting prog_.

985 // If reversed is true, compiles a program that expects	1000 // If reversed is true, compiles a program that expects

986 // to run over the input string backward (reverses all concatenations).	1001 // to run over the input string backward (reverses all concatenations).

987 // The reversed flag is also recorded in the returned program.	1002 // The reversed flag is also recorded in the returned program.

(...skipping 139 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1127	1142

1128 return prog;	1143 return prog;

1129 }	1144 }

1130	1145

1131 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,	1146 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,

1132 Regexp* re) {	1147 Regexp* re) {

1133 return Compiler::CompileSet(options, anchor, re);	1148 return Compiler::CompileSet(options, anchor, re);

1134 }	1149 }

1135	1150

1136 } // namespace re2	1151 } // namespace re2

OLD	NEW

« third_party/re2/patches/remove-valgrind-code.patch ('K') | « third_party/re2/re2/bitstate.cc ('k') | third_party/re2/re2/dfa.cc » ('j') | no next file with comments »