third_party/re2/re2/compile.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/compile.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // Compile regular expression to Prog.

6 //

7 // Prog and Inst are defined in prog.h.

8 // This file's external interface is just Regexp::CompileToProg.

9 // The Compiler class defined in this file is private.

10

11 #include "re2/prog.h"

12 #include "re2/re2.h"

13 #include "re2/regexp.h"

14 #include "re2/walker-inl.h"

15

16 namespace re2 {

17

18 // List of pointers to Inst* that need to be filled in (patched).

19 // Because the Inst* haven't been filled in yet,

20 // we can use the Inst* word to hold the list's "next" pointer.

21 // It's kind of sleazy, but it works well in practice.

22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.

23 //

24 // Because the out and out1 fields in Inst are no longer pointers,

25 // we can't use pointers directly here either. Instead, p refers

26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).

27 // p == 0 represents the NULL list. This is okay because instruction #0

28 // is always the fail instruction, which never appears on a list.

29

30 struct PatchList {

31 uint32 p;

32

33 // Returns patch list containing just p.

34 static PatchList Mk(uint32 p);

35

36 // Patches all the entries on l to have value v.

37 // Caller must not ever use patch list again.

38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);

39

40 // Deref returns the next pointer pointed at by p.

41 static PatchList Deref(Prog::Inst *inst0, PatchList l);

42

43 // Appends two patch lists and returns result.

44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);

45 };

46

47 static PatchList nullPatchList = { 0 };

48

49 // Returns patch list containing just p.

50 PatchList PatchList::Mk(uint32 p) {

51 PatchList l;

52 l.p = p;

53 return l;

54 }

55

56 // Returns the next pointer pointed at by l.

57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {

58 Prog::Inst* ip = &inst0[l.p>>1];

59 if (l.p&1)

60 l.p = ip->out1();

61 else

62 l.p = ip->out();

63 return l;

64 }

65

66 // Patches all the entries on l to have value v.

67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {

68 while (l.p != 0) {

69 Prog::Inst* ip = &inst0[l.p>>1];

70 if (l.p&1) {

71 l.p = ip->out1();

72 ip->out1_ = val;

73 } else {

74 l.p = ip->out();

75 ip->set_out(val);

76 }

77 }

78 }

79

80 // Appends two patch lists and returns result.

81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {

82 if (l1.p == 0)

83 return l2;

84 if (l2.p == 0)

85 return l1;

86

87 PatchList l = l1;

88 for (;;) {

89 PatchList next = PatchList::Deref(inst0, l);

90 if (next.p == 0)

91 break;

92 l = next;

93 }

94

95 Prog::Inst* ip = &inst0[l.p>>1];

96 if (l.p&1)

97 ip->out1_ = l2.p;

98 else

99 ip->set_out(l2.p);

100

101 return l1;

102 }

103

104 // Compiled program fragment.

105 struct Frag {

106 uint32 begin;

107 PatchList end;

108

109 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector

110 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}

111 };

112

113 // Input encodings.

114 enum Encoding {

115 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)

116 kEncodingLatin1, // Latin1 (0-FF)

117 };

118

119 class Compiler : public Regexp::Walker<Frag> {

120 public:

121 explicit Compiler();

122 ~Compiler();

123

124 // Compiles Regexp to a new Prog.

125 // Caller is responsible for deleting Prog when finished with it.

126 // If reversed is true, compiles for walking over the input

127 // string backward (reverses all concatenations).

128 static Prog Compile(Regexp re, bool reversed, int64 max_mem);

129

130 // Compiles alternation of all the re to a new Prog.

131 // Each re has a match with an id equal to its index in the vector.

132 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,

133 Regexp* re);

134

135 // Interface for Regexp::Walker, which helps traverse the Regexp.

136 // The walk is purely post-recursive: given the machines for the

137 // children, PostVisit combines them to create the machine for

138 // the current node. The child_args are Frags.

139 // The Compiler traverses the Regexp parse tree, visiting

140 // each node in depth-first order. It invokes PreVisit before

141 // visiting the node's children and PostVisit after visiting

142 // the children.

143 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);

144 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,

145 int nchild_args);

146 Frag ShortVisit(Regexp* re, Frag parent_arg);

147 Frag Copy(Frag arg);

148

149 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??

150 Frag Plus(Frag a, bool nongreedy);

151 Frag Star(Frag a, bool nongreedy);

152 Frag Quest(Frag a, bool nongreedy);

153

154 // Given fragment a, returns (a) capturing as \n.

155 Frag Capture(Frag a, int n);

156

157 // Given fragments a and b, returns ab; a\|b

158 Frag Cat(Frag a, Frag b);

159 Frag Alt(Frag a, Frag b);

160

161 // Returns a fragment that can't match anything.

162 Frag NoMatch();

163

164 // Returns a fragment that matches the empty string.

165 Frag Match(int32 id);

166

167 // Returns a no-op fragment.

168 Frag Nop();

169

170 // Returns a fragment matching the byte range lo-hi.

171 Frag ByteRange(int lo, int hi, bool foldcase);

172

173 // Returns a fragment matching an empty-width special op.

174 Frag EmptyWidth(EmptyOp op);

175

176 // Adds n instructions to the program.

177 // Returns the index of the first one.

178 // Returns -1 if no more instructions are available.

179 int AllocInst(int n);

180

181 // Deletes unused instructions.

182 void Trim();

183

184 // Rune range compiler.

185

186 // Begins a new alternation.

187 void BeginRange();

188

189 // Adds a fragment matching the rune range lo-hi.

190 void AddRuneRange(Rune lo, Rune hi, bool foldcase);

191 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);

192 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);

193 void Add_80_10ffff();

194

195 // New suffix that matches the byte range lo-hi, then goes to next.

196 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);

197 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);

198

199 // Adds a suffix to alternation.

200 void AddSuffix(int id);

201

202 // Returns the alternation of all the added suffixes.

203 Frag EndRange();

204

205 // Single rune.

206 Frag Literal(Rune r, bool foldcase);

207

208 void Setup(Regexp::ParseFlags, int64, RE2::Anchor);

209 Prog* Finish();

210

211 // Returns .* where dot = any byte

212 Frag DotStar();

213

214 private:

215 Prog* prog_; // Program being built.

216 bool failed_; // Did we give up compiling?

217 Encoding encoding_; // Input encoding

218 bool reversed_; // Should program run backward over text?

219

220 int max_inst_; // Maximum number of instructions.

221

222 Prog::Inst* inst_; // Pointer to first instruction.

223 int inst_len_; // Number of instructions used.

224 int inst_cap_; // Number of instructions allocated.

225

226 int64 max_mem_; // Total memory budget.

227

228 map<uint64, int> rune_cache_;

229 Frag rune_range_;

230

231 RE2::Anchor anchor_; // anchor mode for RE2::Set

232

233 DISALLOW_COPY_AND_ASSIGN(Compiler);

234 };

235

236 Compiler::Compiler() {

237 prog_ = new Prog();

238 failed_ = false;

239 encoding_ = kEncodingUTF8;

240 reversed_ = false;

241 inst_ = NULL;

242 inst_len_ = 0;

243 inst_cap_ = 0;

244 max_inst_ = 1; // make AllocInst for fail instruction okay

245 max_mem_ = 0;

246 int fail = AllocInst(1);

247 inst_[fail].InitFail();

248 max_inst_ = 0; // Caller must change

249 }

250

251 Compiler::~Compiler() {

252 delete prog_;

253 delete[] inst_;

254 }

255

256 int Compiler::AllocInst(int n) {

257 if (failed_ \|\| inst_len_ + n > max_inst_) {

258 failed_ = true;

259 return -1;

260 }

261

262 if (inst_len_ + n > inst_cap_) {

263 if (inst_cap_ == 0)

264 inst_cap_ = 8;

265 while (inst_len_ + n > inst_cap_)

266 inst_cap_ *= 2;

267 Prog::Inst* ip = new Prog::Inst[inst_cap_];

268 memmove(ip, inst_, inst_len_ * sizeof ip[0]);

269 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);

270 delete[] inst_;

271 inst_ = ip;

272 }

273 int id = inst_len_;

274 inst_len_ += n;

275 return id;

276 }

277

278 void Compiler::Trim() {

279 if (inst_len_ < inst_cap_) {

280 Prog::Inst* ip = new Prog::Inst[inst_len_];

281 memmove(ip, inst_, inst_len_ * sizeof ip[0]);

282 delete[] inst_;

283 inst_ = ip;

284 inst_cap_ = inst_len_;

285 }

286 }

287

288 // These routines are somewhat hard to visualize in text --

289 // see http://swtch.com/~rsc/regexp/regexp1.html for

290 // pictures explaining what is going on here.

291

292 // Returns an unmatchable fragment.

293 Frag Compiler::NoMatch() {

294 return Frag(0, nullPatchList);

295 }

296

297 // Is a an unmatchable fragment?

298 static bool IsNoMatch(Frag a) {

299 return a.begin == 0;

300 }

301

302 // Given fragments a and b, returns fragment for ab.

303 Frag Compiler::Cat(Frag a, Frag b) {

304 if (IsNoMatch(a) \|\| IsNoMatch(b))

305 return NoMatch();

306

307 // Elide no-op.

308 Prog::Inst* begin = &inst_[a.begin];

309 if (begin->opcode() == kInstNop &&

310 a.end.p == (a.begin << 1) &&

311 begin->out() == 0) {

312 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere

313 return b;

314 }

315

316 // To run backward over string, reverse all concatenations.

317 if (reversed_) {

318 PatchList::Patch(inst_, b.end, a.begin);

319 return Frag(b.begin, a.end);

320 }

321

322 PatchList::Patch(inst_, a.end, b.begin);

323 return Frag(a.begin, b.end);

324 }

325

326 // Given fragments for a and b, returns fragment for a\|b.

327 Frag Compiler::Alt(Frag a, Frag b) {

328 // Special case for convenience in loops.

329 if (IsNoMatch(a))

330 return b;

331 if (IsNoMatch(b))

332 return a;

333

334 int id = AllocInst(1);

335 if (id < 0)

336 return NoMatch();

337

338 inst_[id].InitAlt(a.begin, b.begin);

339 return Frag(id, PatchList::Append(inst_, a.end, b.end));

340 }

341

342 // When capturing submatches in like-Perl mode, a kOpAlt Inst

343 // treats out_ as the first choice, out1_ as the second.

344 //

345 // For *, +, and ?, if out_ causes another repetition,

346 // then the operator is greedy. If out1_ is the repetition

347 // (and out_ moves forward), then the operator is non-greedy.

348

349 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)

350 Frag Compiler::Star(Frag a, bool nongreedy) {

351 int id = AllocInst(1);

352 if (id < 0)

353 return NoMatch();

354 inst_[id].InitAlt(0, 0);

355 PatchList::Patch(inst_, a.end, id);

356 if (nongreedy) {

357 inst_[id].out1_ = a.begin;

358 return Frag(id, PatchList::Mk(id << 1));

359 } else {

360 inst_[id].set_out(a.begin);

361 return Frag(id, PatchList::Mk((id << 1) \| 1));

362 }

363 }

364

365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)

366 Frag Compiler::Plus(Frag a, bool nongreedy) {

367 // a+ is just a* with a different entry point.

368 Frag f = Star(a, nongreedy);

369 return Frag(a.begin, f.end);

370 }

371

372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)

373 Frag Compiler::Quest(Frag a, bool nongreedy) {

374 if (IsNoMatch(a))

375 return Nop();

376 int id = AllocInst(1);

377 if (id < 0)

378 return NoMatch();

379 PatchList pl;

380 if (nongreedy) {

381 inst_[id].InitAlt(0, a.begin);

382 pl = PatchList::Mk(id << 1);

383 } else {

384 inst_[id].InitAlt(a.begin, 0);

385 pl = PatchList::Mk((id << 1) \| 1);

386 }

387 return Frag(id, PatchList::Append(inst_, pl, a.end));

388 }

389

390 // Returns a fragment for the byte range lo-hi.

391 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {

392 int id = AllocInst(1);

393 if (id < 0)

394 return NoMatch();

395 inst_[id].InitByteRange(lo, hi, foldcase, 0);

396 prog_->byte_inst_count_++;

397 prog_->MarkByteRange(lo, hi);

398 if (foldcase && lo <= 'z' && hi >= 'a') {

399 if (lo < 'a')

400 lo = 'a';

401 if (hi > 'z')

402 hi = 'z';

403 if (lo <= hi)

404 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');

405 }

406 return Frag(id, PatchList::Mk(id << 1));

407 }

408

409 // Returns a no-op fragment. Sometimes unavoidable.

410 Frag Compiler::Nop() {

411 int id = AllocInst(1);

412 if (id < 0)

413 return NoMatch();

414 inst_[id].InitNop(0);

415 return Frag(id, PatchList::Mk(id << 1));

416 }

417

418 // Returns a fragment that signals a match.

419 Frag Compiler::Match(int32 match_id) {

420 int id = AllocInst(1);

421 if (id < 0)

422 return NoMatch();

423 inst_[id].InitMatch(match_id);

424 return Frag(id, nullPatchList);

425 }

426

427 // Returns a fragment matching a particular empty-width op (like ^ or $)

428 Frag Compiler::EmptyWidth(EmptyOp empty) {

429 int id = AllocInst(1);

430 if (id < 0)

431 return NoMatch();

432 inst_[id].InitEmptyWidth(empty, 0);

433 if (empty & (kEmptyBeginLine\|kEmptyEndLine))

434 prog_->MarkByteRange('\n', '\n');

435 if (empty & (kEmptyWordBoundary\|kEmptyNonWordBoundary)) {

436 int j;

437 for (int i = 0; i < 256; i = j) {

438 for (j = i + 1; j < 256 &&

439 Prog::IsWordChar(static_cast<uint8>(i)) ==

440 Prog::IsWordChar(static_cast<uint8>(j));

441 j++)

442 ;

443 prog_->MarkByteRange(i, j-1);

444 }

445 }

446 return Frag(id, PatchList::Mk(id << 1));

447 }

448

449 // Given a fragment a, returns a fragment with capturing parens around a.

450 Frag Compiler::Capture(Frag a, int n) {

451 if (IsNoMatch(a))

452 return NoMatch();

453 int id = AllocInst(2);

454 if (id < 0)

455 return NoMatch();

456 inst_[id].InitCapture(2*n, a.begin);

457 inst_[id+1].InitCapture(2*n+1, 0);

458 PatchList::Patch(inst_, a.end, id+1);

459

460 return Frag(id, PatchList::Mk((id+1) << 1));

461 }

462

463 // A Rune is a name for a Unicode code point.

464 // Returns maximum rune encoded by UTF-8 sequence of length len.

465 static int MaxRune(int len) {

466 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)

467 if (len == 1)

468 b = 7;

469 else

470 b = 8-(len+1) + 6*(len-1);

471 return (1<<b) - 1; // maximum Rune for b bits.

472 }

473

474 // The rune range compiler caches common suffix fragments,

475 // which are very common in UTF-8 (e.g., [80-bf]).

476 // The fragment suffixes are identified by their start

477 // instructions. NULL denotes the eventual end match.

478 // The Frag accumulates in rune_range_. Caching common

479 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,

480 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.

481

482 void Compiler::BeginRange() {

483 rune_cache_.clear();

484 rune_range_.begin = 0;

485 rune_range_.end = nullPatchList;

486 }

487

488 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,

489 int next) {

490 Frag f = ByteRange(lo, hi, foldcase);

491 if (next != 0) {

492 PatchList::Patch(inst_, f.end, next);

493 } else {

494 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);

495 }

496 return f.begin;

497 }

498

499 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {

500 // In Latin1 mode, there's no point in caching.

501 // In forward UTF-8 mode, only need to cache continuation bytes.

502 if (encoding_ == kEncodingLatin1 \|\|

503 (encoding_ == kEncodingUTF8 &&

504 !reversed_ &&

505 !(0x80 <= lo && hi <= 0xbf))) {

506 return UncachedRuneByteSuffix(lo, hi, foldcase, next);

507 }

508

509 uint64 key = (uint64)next << 17 \|

510 (uint64)lo << 9 \|

511 (uint64)hi << 1 \|

512 (uint64)foldcase;

513 map<uint64, int>::iterator it = rune_cache_.find(key);

514 if (it != rune_cache_.end())

515 return it->second;

516 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);

517 rune_cache_[key] = id;

518 return id;

519 }

520

521 void Compiler::AddSuffix(int id) {

522 if (rune_range_.begin == 0) {

523 rune_range_.begin = id;

524 return;

525 }

526

527 int alt = AllocInst(1);

528 if (alt < 0) {

529 rune_range_.begin = 0;

530 return;

531 }

532 inst_[alt].InitAlt(rune_range_.begin, id);

533 rune_range_.begin = alt;

534 }

535

536 Frag Compiler::EndRange() {

537 return rune_range_;

538 }

539

540 // Converts rune range lo-hi into a fragment that recognizes

541 // the bytes that would make up those runes in the current

542 // encoding (Latin 1 or UTF-8).

543 // This lets the machine work byte-by-byte even when

544 // using multibyte encodings.

545

546 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {

547 switch (encoding_) {

548 default:

549 case kEncodingUTF8:

550 AddRuneRangeUTF8(lo, hi, foldcase);

551 break;

552 case kEncodingLatin1:

553 AddRuneRangeLatin1(lo, hi, foldcase);

554 break;

555 }

556 }

557

558 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {

559 // Latin1 is easy: runes are bytes.

560 if (lo > hi \|\| lo > 0xFF)

561 return;

562 if (hi > 0xFF)

563 hi = 0xFF;

564 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),

565 foldcase, 0));

566 }

567

568 // Table describing how to make a UTF-8 matching machine

569 // for the rune range 80-10FFFF (Runeself-Runemax).

570 // This range happens frequently enough (for example /./ and /[^a-z]/)

571 // and the rune_cache_ map is slow enough that this is worth

572 // special handling. Makes compilation of a small expression

573 // with a dot in it about 10% faster.

574 // The * in the comments below mark whole sequences.

575 static struct ByteRangeProg {

576 int next;

577 int lo;

578 int hi;

579 } prog_80_10ffff[] = {

580 // Two-byte

581 { -1, 0x80, 0xBF, }, // 0: 80-BF

582 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*

583

584 // Three-byte

585 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF

586 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*

587 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF

588 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*

589

590 // Four-byte

591 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF

592 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*

593 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF

594 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*

595 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF

596 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*

597 };

598

599 void Compiler::Add_80_10ffff() {

600 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize d; silences gcc warning

601 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {

602 const ByteRangeProg& p = prog_80_10ffff[i];

603 int next = 0;

604 if (p.next >= 0)

605 next = inst[p.next];

606 inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo),

607 static_cast<uint8>(p.hi), false, next);

608 if ((p.lo & 0xC0) != 0x80)

609 AddSuffix(inst[i]);

610 }

611 }

612

613 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {

614 if (lo > hi)

615 return;

616

617 // Pick off 80-10FFFF as a common special case

618 // that can bypass the slow rune_cache_.

619 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {

620 Add_80_10ffff();

621 return;

622 }

623

624 // Split range into same-length sized ranges.

625 for (int i = 1; i < UTFmax; i++) {

626 Rune max = MaxRune(i);

627 if (lo <= max && max < hi) {

628 AddRuneRangeUTF8(lo, max, foldcase);

629 AddRuneRangeUTF8(max+1, hi, foldcase);

630 return;

631 }

632 }

633

634 // ASCII range is always a special case.

635 if (hi < Runeself) {

636 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),

637 foldcase, 0));

638 return;

639 }

640

641 // Split range into sections that agree on leading bytes.

642 for (int i = 1; i < UTFmax; i++) {

643 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence

644 if ((lo & ~m) != (hi & ~m)) {

645 if ((lo & m) != 0) {

646 AddRuneRangeUTF8(lo, lo\|m, foldcase);

647 AddRuneRangeUTF8((lo\|m)+1, hi, foldcase);

648 return;

649 }

650 if ((hi & m) != m) {

651 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);

652 AddRuneRangeUTF8(hi&~m, hi, foldcase);

653 return;

654 }

655 }

656 }

657

658 // Finally. Generate byte matching equivalent for lo-hi.

659 uint8 ulo[UTFmax], uhi[UTFmax];

660 int n = runetochar(reinterpret_cast<char*>(ulo), &lo);

661 int m = runetochar(reinterpret_cast<char*>(uhi), &hi);

662 (void)m; // USED(m)

663 DCHECK_EQ(n, m);

664

665 int id = 0;

666 if (reversed_) {

667 for (int i = 0; i < n; i++)

668 id = RuneByteSuffix(ulo[i], uhi[i], false, id);

669 } else {

670 for (int i = n-1; i >= 0; i--)

671 id = RuneByteSuffix(ulo[i], uhi[i], false, id);

672 }

673 AddSuffix(id);

674 }

675

676 // Should not be called.

677 Frag Compiler::Copy(Frag arg) {

678 // We're using WalkExponential; there should be no copying.

679 LOG(DFATAL) << "Compiler::Copy called!";

680 failed_ = true;

681 return NoMatch();

682 }

683

684 // Visits a node quickly; called once WalkExponential has

685 // decided to cut this walk short.

686 Frag Compiler::ShortVisit(Regexp* re, Frag) {

687 failed_ = true;

688 return NoMatch();

689 }

690

691 // Called before traversing a node's children during the walk.

692 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {

693 // Cut off walk if we've already failed.

694 if (failed_)

695 *stop = true;

696

697 return Frag(); // not used by caller

698 }

699

700 Frag Compiler::Literal(Rune r, bool foldcase) {

701 switch (encoding_) {

702 default:

703 return Frag();

704

705 case kEncodingLatin1:

706 return ByteRange(r, r, foldcase);

707

708 case kEncodingUTF8: {

709 if (r < Runeself) // Make common case fast.

710 return ByteRange(r, r, foldcase);

711 uint8 buf[UTFmax];

712 int n = runetochar(reinterpret_cast<char*>(buf), &r);

713 Frag f = ByteRange((uint8)buf[0], buf[0], false);

714 for (int i = 1; i < n; i++)

715 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));

716 return f;

717 }

718 }

719 }

720

721 // Called after traversing the node's children during the walk.

722 // Given their frags, build and return the frag for this re.

723 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,

724 int nchild_frags) {

725 // If a child failed, don't bother going forward, especially

726 // since the child_frags might contain Frags with NULLs in them.

727 if (failed_)

728 return NoMatch();

729

730 // Given the child fragments, return the fragment for this node.

731 switch (re->op()) {

732 case kRegexpRepeat:

733 // Should not see; code at bottom of function will print error

734 break;

735

736 case kRegexpNoMatch:

737 return NoMatch();

738

739 case kRegexpEmptyMatch:

740 return Nop();

741

742 case kRegexpHaveMatch: {

743 Frag f = Match(re->match_id());

744 // Remember unanchored match to end of string.

745 if (anchor_ != RE2::ANCHOR_BOTH)

746 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));

747 return f;

748 }

749

750 case kRegexpConcat: {

751 Frag f = child_frags[0];

752 for (int i = 1; i < nchild_frags; i++)

753 f = Cat(f, child_frags[i]);

754 return f;

755 }

756

757 case kRegexpAlternate: {

758 Frag f = child_frags[0];

759 for (int i = 1; i < nchild_frags; i++)

760 f = Alt(f, child_frags[i]);

761 return f;

762 }

763

764 case kRegexpStar:

765 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

766

767 case kRegexpPlus:

768 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

769

770 case kRegexpQuest:

771 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

772

773 case kRegexpLiteral:

774 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

775

776 case kRegexpLiteralString: {

777 // Concatenation of literals.

778 if (re->nrunes() == 0)

779 return Nop();

780 Frag f;

781 for (int i = 0; i < re->nrunes(); i++) {

782 Frag f1 = Literal(re->runes()[i],

783 (re->parse_flags()&Regexp::FoldCase) != 0);

784 if (i == 0)

785 f = f1;

786 else

787 f = Cat(f, f1);

788 }

789 return f;

790 }

791

792 case kRegexpAnyChar:

793 BeginRange();

794 AddRuneRange(0, Runemax, false);

795 return EndRange();

796

797 case kRegexpAnyByte:

798 return ByteRange(0x00, 0xFF, false);

799

800 case kRegexpCharClass: {

801 CharClass* cc = re->cc();

802 if (cc->empty()) {

803 // This can't happen.

804 LOG(DFATAL) << "No ranges in char class";

805 failed_ = true;

806 return NoMatch();

807 }

808

809 // ASCII case-folding optimization: if the char class

810 // behaves the same on A-Z as it does on a-z,

811 // discard any ranges wholly contained in A-Z

812 // and mark the other ranges as foldascii.

813 // This reduces the size of a program for

814 // (?i)abc from 3 insts per letter to 1 per letter.

815 bool foldascii = cc->FoldsASCII();

816

817 // Character class is just a big OR of the different

818 // character ranges in the class.

819 BeginRange();

820 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {

821 // ASCII case-folding optimization (see above).

822 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')

823 continue;

824

825 // If this range contains all of A-Za-z or none of it,

826 // the fold flag is unnecessary; don't bother.

827 bool fold = foldascii;

828 if ((i->lo <= 'A' && 'z' <= i->hi) \|\| i->hi < 'A' \|\| 'z' < i->lo \|\|

829 ('Z' < i->lo && i->hi < 'a'))

830 fold = false;

831

832 AddRuneRange(i->lo, i->hi, fold);

833 }

834 return EndRange();

835 }

836

837 case kRegexpCapture:

838 // If this is a non-capturing parenthesis -- (?:foo) --

839 // just use the inner expression.

840 if (re->cap() < 0)

841 return child_frags[0];

842 return Capture(child_frags[0], re->cap());

843

844 case kRegexpBeginLine:

845 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);

846

847 case kRegexpEndLine:

848 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);

849

850 case kRegexpBeginText:

851 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);

852

853 case kRegexpEndText:

854 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);

855

856 case kRegexpWordBoundary:

857 return EmptyWidth(kEmptyWordBoundary);

858

859 case kRegexpNoWordBoundary:

860 return EmptyWidth(kEmptyNonWordBoundary);

861 }

862 LOG(DFATAL) << "Missing case in Compiler: " << re->op();

863 failed_ = true;

864 return NoMatch();

865 }

866

867 // Is this regexp required to start at the beginning of the text?

868 // Only approximate; can return false for complicated regexps like (\Aa\|\Ab),

869 // but handles (\A(a\|b)). Could use the Walker to write a more exact one.

870 static bool IsAnchorStart(Regexp** pre, int depth) {

871 Regexp* re = *pre;

872 Regexp* sub;

873 // The depth limit makes sure that we don't overflow

874 // the stack on a deeply nested regexp. As the comment

875 // above says, IsAnchorStart is conservative, so returning

876 // a false negative is okay. The exact limit is somewhat arbitrary.

877 if (re == NULL \|\| depth >= 4)

878 return false;

879 switch (re->op()) {

880 default:

881 break;

882 case kRegexpConcat:

883 if (re->nsub() > 0) {

884 sub = re->sub()[0]->Incref();

885 if (IsAnchorStart(&sub, depth+1)) {

886 Regexp** subcopy = new Regexp*[re->nsub()];

887 subcopy[0] = sub; // already have reference

888 for (int i = 1; i < re->nsub(); i++)

889 subcopy[i] = re->sub()[i]->Incref();

890 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());

891 delete[] subcopy;

892 re->Decref();

893 return true;

894 }

895 sub->Decref();

896 }

897 break;

898 case kRegexpCapture:

899 sub = re->sub()[0]->Incref();

900 if (IsAnchorStart(&sub, depth+1)) {

901 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());

902 re->Decref();

903 return true;

904 }

905 sub->Decref();

906 break;

907 case kRegexpBeginText:

908 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());

909 re->Decref();

910 return true;

911 }

912 return false;

913 }

914

915 // Is this regexp required to start at the end of the text?

916 // Only approximate; can return false for complicated regexps like (a\z\|b\z),

917 // but handles ((a\|b)\z). Could use the Walker to write a more exact one.

918 static bool IsAnchorEnd(Regexp** pre, int depth) {

919 Regexp* re = *pre;

920 Regexp* sub;

921 // The depth limit makes sure that we don't overflow

922 // the stack on a deeply nested regexp. As the comment

923 // above says, IsAnchorEnd is conservative, so returning

924 // a false negative is okay. The exact limit is somewhat arbitrary.

925 if (re == NULL \|\| depth >= 4)

926 return false;

927 switch (re->op()) {

928 default:

929 break;

930 case kRegexpConcat:

931 if (re->nsub() > 0) {

932 sub = re->sub()[re->nsub() - 1]->Incref();

933 if (IsAnchorEnd(&sub, depth+1)) {

934 Regexp** subcopy = new Regexp*[re->nsub()];

935 subcopy[re->nsub() - 1] = sub; // already have reference

936 for (int i = 0; i < re->nsub() - 1; i++)

937 subcopy[i] = re->sub()[i]->Incref();

938 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());

939 delete[] subcopy;

940 re->Decref();

941 return true;

942 }

943 sub->Decref();

944 }

945 break;

946 case kRegexpCapture:

947 sub = re->sub()[0]->Incref();

948 if (IsAnchorEnd(&sub, depth+1)) {

949 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());

950 re->Decref();

951 return true;

952 }

953 sub->Decref();

954 break;

955 case kRegexpEndText:

956 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());

957 re->Decref();

958 return true;

959 }

960 return false;

961 }

962

963 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,

964 RE2::Anchor anchor) {

965 prog_->set_flags(flags);

966

967 if (flags & Regexp::Latin1)

968 encoding_ = kEncodingLatin1;

969 max_mem_ = max_mem;

970 if (max_mem <= 0) {

971 max_inst_ = 100000; // more than enough

972 } else if (max_mem <= static_cast<int64>(sizeof(Prog))) {

973 // No room for anything.

974 max_inst_ = 0;

975 } else {

976 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);

977 // Limit instruction count so that inst->id() fits nicely in an int.

978 // SparseArray also assumes that the indices (inst->id()) are ints.

979 // The call to WalkExponential uses 2*max_inst_ below,

980 // and other places in the code use 2 or 3 * prog->size().

981 // Limiting to 2^24 should avoid overflow in those places.

982 // (The point of allowing more than 32 bits of memory is to

983 // have plenty of room for the DFA states, not to use it up

984 // on the program.)

985 if (m >= 1<<24)

986 m = 1<<24;

987

988 // Inst imposes its own limit (currently bigger than 2^24 but be safe).

989 if (m > Prog::Inst::kMaxInst)

990 m = Prog::Inst::kMaxInst;

991

992 max_inst_ = static_cast<int>(m);

993 }

994

995 anchor_ = anchor;

996 }

997

998 // Compiles re, returning program.

999 // Caller is responsible for deleting prog_.

1000 // If reversed is true, compiles a program that expects

1001 // to run over the input string backward (reverses all concatenations).

1002 // The reversed flag is also recorded in the returned program.

1003 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {

1004 Compiler c;

1005

1006 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);

1007 c.reversed_ = reversed;

1008

1009 // Simplify to remove things like counted repetitions

1010 // and character classes like \d.

1011 Regexp* sre = re->Simplify();

1012 if (sre == NULL)

1013 return NULL;

1014

1015 // Record whether prog is anchored, removing the anchors.

1016 // (They get in the way of other optimizations.)

1017 bool is_anchor_start = IsAnchorStart(&sre, 0);

1018 bool is_anchor_end = IsAnchorEnd(&sre, 0);

1019

1020 // Generate fragment for entire regexp.

1021 Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_);

1022 sre->Decref();

1023 if (c.failed_)

1024 return NULL;

1025

1026 // Success! Finish by putting Match node at end, and record start.

1027 // Turn off c.reversed_ (if it is set) to force the remaining concatenations

1028 // to behave normally.

1029 c.reversed_ = false;

1030 Frag all = c.Cat(f, c.Match(0));

1031 c.prog_->set_start(all.begin);

1032

1033 if (reversed) {

1034 c.prog_->set_anchor_start(is_anchor_end);

1035 c.prog_->set_anchor_end(is_anchor_start);

1036 } else {

1037 c.prog_->set_anchor_start(is_anchor_start);

1038 c.prog_->set_anchor_end(is_anchor_end);

1039 }

1040

1041 // Also create unanchored version, which starts with a .*? loop.

1042 if (c.prog_->anchor_start()) {

1043 c.prog_->set_start_unanchored(c.prog_->start());

1044 } else {

1045 Frag unanchored = c.Cat(c.DotStar(), all);

1046 c.prog_->set_start_unanchored(unanchored.begin);

1047 }

1048

1049 c.prog_->set_reversed(reversed);

1050

1051 // Hand ownership of prog_ to caller.

1052 return c.Finish();

1053 }

1054

1055 Prog* Compiler::Finish() {

1056 if (failed_)

1057 return NULL;

1058

1059 if (prog_->start() == 0 && prog_->start_unanchored() == 0) {

1060 // No possible matches; keep Fail instruction only.

1061 inst_len_ = 1;

1062 }

1063

1064 // Trim instruction to minimum array and transfer to Prog.

1065 Trim();

1066 prog_->inst_ = inst_;

1067 prog_->size_ = inst_len_;

1068 inst_ = NULL;

1069

1070 // Compute byte map.

1071 prog_->ComputeByteMap();

1072

1073 prog_->Optimize();

1074

1075 // Record remaining memory for DFA.

1076 if (max_mem_ <= 0) {

1077 prog_->set_dfa_mem(1<<20);

1078 } else {

1079 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);

1080 if (m < 0)

1081 m = 0;

1082 prog_->set_dfa_mem(m);

1083 }

1084

1085 Prog* p = prog_;

1086 prog_ = NULL;

1087 return p;

1088 }

1089

1090 // Converts Regexp to Prog.

1091 Prog* Regexp::CompileToProg(int64 max_mem) {

1092 return Compiler::Compile(this, false, max_mem);

1093 }

1094

1095 Prog* Regexp::CompileToReverseProg(int64 max_mem) {

1096 return Compiler::Compile(this, true, max_mem);

1097 }

1098

1099 Frag Compiler::DotStar() {

1100 return Star(ByteRange(0x00, 0xff, false), true);

1101 }

1102

1103 // Compiles RE set to Prog.

1104 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,

1105 Regexp* re) {

1106 Compiler c;

1107

1108 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());

1109 c.Setup(pf, options.max_mem(), anchor);

1110

1111 // Compile alternation of fragments.

1112 Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_);

1113 re->Decref();

1114 if (c.failed_)

1115 return NULL;

1116

1117 if (anchor == RE2::UNANCHORED) {

1118 // The trailing .* was added while handling kRegexpHaveMatch.

1119 // We just have to add the leading one.

1120 all = c.Cat(c.DotStar(), all);

1121 }

1122

1123 c.prog_->set_start(all.begin);

1124 c.prog_->set_start_unanchored(all.begin);

1125 c.prog_->set_anchor_start(true);

1126 c.prog_->set_anchor_end(true);

1127

1128 Prog* prog = c.Finish();

1129 if (prog == NULL)

1130 return NULL;

1131

1132 // Make sure DFA has enough memory to operate,

1133 // since we're not going to fall back to the NFA.

1134 bool failed;

1135 StringPiece sp = "hello, world";

1136 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,

1137 NULL, &failed, NULL);

1138 if (failed) {

1139 delete prog;

1140 return NULL;

1141 }

1142

1143 return prog;

1144 }

1145

1146 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,

1147 Regexp* re) {

1148 return Compiler::CompileSet(options, anchor, re);

1149 }

1150

1151 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/bitstate.cc ('k') | third_party/re2/re2/dfa.cc » ('j') | no next file with comments »