third_party/re2/re2/simplify.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/simplify.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // Rewrite POSIX and other features in re

6 // to use simple extended regular expression features.

7 // Also sort and simplify character classes.

8

9 #include "util/util.h"

10 #include "re2/regexp.h"

11 #include "re2/walker-inl.h"

12

13 namespace re2 {

14

15 // Parses the regexp src and then simplifies it and sets *dst to the

16 // string representation of the simplified form. Returns true on success.

17 // Returns false and sets *error (if error != NULL) on error.

18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,

19 string* dst,

20 RegexpStatus* status) {

21 Regexp* re = Parse(src, flags, status);

22 if (re == NULL)

23 return false;

24 Regexp* sre = re->Simplify();

25 re->Decref();

26 if (sre == NULL) {

27 // Should not happen, since Simplify never fails.

28 LOG(ERROR) << "Simplify failed on " << src;

29 if (status) {

30 status->set_code(kRegexpInternalError);

31 status->set_error_arg(src);

32 }

33 return false;

34 }

35 *dst = sre->ToString();

36 sre->Decref();

37 return true;

38 }

39

40 // Assuming the simple_ flags on the children are accurate,

41 // is this Regexp* simple?

42 bool Regexp::ComputeSimple() {

43 Regexp** subs;

44 switch (op_) {

45 case kRegexpNoMatch:

46 case kRegexpEmptyMatch:

47 case kRegexpLiteral:

48 case kRegexpLiteralString:

49 case kRegexpBeginLine:

50 case kRegexpEndLine:

51 case kRegexpBeginText:

52 case kRegexpWordBoundary:

53 case kRegexpNoWordBoundary:

54 case kRegexpEndText:

55 case kRegexpAnyChar:

56 case kRegexpAnyByte:

57 case kRegexpHaveMatch:

58 return true;

59 case kRegexpConcat:

60 case kRegexpAlternate:

61 // These are simple as long as the subpieces are simple.

62 subs = sub();

63 for (int i = 0; i < nsub_; i++)

64 if (!subs[i]->simple())

65 return false;

66 return true;

67 case kRegexpCharClass:

68 // Simple as long as the char class is not empty, not full.

69 if (ccb_ != NULL)

70 return !ccb_->empty() && !ccb_->full();

71 return !cc_->empty() && !cc_->full();

72 case kRegexpCapture:

73 subs = sub();

74 return subs[0]->simple();

75 case kRegexpStar:

76 case kRegexpPlus:

77 case kRegexpQuest:

78 subs = sub();

79 if (!subs[0]->simple())

80 return false;

81 switch (subs[0]->op_) {

82 case kRegexpStar:

83 case kRegexpPlus:

84 case kRegexpQuest:

85 case kRegexpEmptyMatch:

86 case kRegexpNoMatch:

87 return false;

88 default:

89 break;

90 }

91 return true;

92 case kRegexpRepeat:

93 return false;

94 }

95 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;

96 return false;

97 }

98

99 // Walker subclass used by Simplify.

100 // Coalesces runs of star/plus/quest/repeat of the same literal along with any

101 // occurrences of that literal into repeats of that literal. It also works for

102 // char classes, any char and any byte.

103 // PostVisit creates the coalesced result, which should then be simplified.

104 class CoalesceWalker : public Regexp::Walker<Regexp*> {

105 public:

106 CoalesceWalker() {}

107 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,

108 Regexp** child_args, int nchild_args);

109 virtual Regexp* Copy(Regexp* re);

110 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

111

112 private:

113 // These functions are declared inside CoalesceWalker so that

114 // they can edit the private fields of the Regexps they construct.

115

116 // Returns true if r1 and r2 can be coalesced. In particular, ensures that

117 // the parse flags are consistent. (They will not be checked again later.)

118 static bool CanCoalesce(Regexp* r1, Regexp* r2);

119

120 // Coalesces r1ptr and r2ptr. In most cases, the array elements afterwards

121 // will be empty match and the coalesced op. In other cases, where part of a

122 // literal string was removed to be coalesced, the array elements afterwards

123 // will be the coalesced op and the remainder of the literal string.

124 static void DoCoalesce(Regexp r1ptr, Regexp r2ptr);

125

126 DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);

127 };

128

129 // Walker subclass used by Simplify.

130 // The simplify walk is purely post-recursive: given the simplified children,

131 // PostVisit creates the simplified result.

132 // The child_args are simplified Regexp*s.

133 class SimplifyWalker : public Regexp::Walker<Regexp*> {

134 public:

135 SimplifyWalker() {}

136 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);

137 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,

138 Regexp** child_args, int nchild_args);

139 virtual Regexp* Copy(Regexp* re);

140 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

141

142 private:

143 // These functions are declared inside SimplifyWalker so that

144 // they can edit the private fields of the Regexps they construct.

145

146 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.

147 // Caller must Decref return value when done with it.

148 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

149

150 // Simplifies the expression re{min,max} in terms of *, +, and ?.

151 // Returns a new regexp. Does not edit re. Does not consume reference to re.

152 // Caller must Decref return value when done with it.

153 static Regexp* SimplifyRepeat(Regexp* re, int min, int max,

154 Regexp::ParseFlags parse_flags);

155

156 // Simplifies a character class by expanding any named classes

157 // into rune ranges. Does not edit re. Does not consume ref to re.

158 // Caller must Decref return value when done with it.

159 static Regexp* SimplifyCharClass(Regexp* re);

160

161 DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);

162 };

163

164 // Simplifies a regular expression, returning a new regexp.

165 // The new regexp uses traditional Unix egrep features only,

166 // plus the Perl (?:) non-capturing parentheses.

167 // Otherwise, no POSIX or Perl additions. The new regexp

168 // captures exactly the same subexpressions (with the same indices)

169 // as the original.

170 // Does not edit current object.

171 // Caller must Decref() return value when done with it.

172

173 Regexp* Regexp::Simplify() {

174 CoalesceWalker cw;

175 Regexp* cre = cw.Walk(this, NULL);

176 if (cre == NULL)

177 return cre;

178 SimplifyWalker sw;

179 Regexp* sre = sw.Walk(cre, NULL);

180 cre->Decref();

181 return sre;

182 }

183

184 #define Simplify DontCallSimplify // Avoid accidental recursion

185

186 // Utility function for PostVisit implementations that compares re->sub() with

187 // child_args to determine whether any child_args changed. In the common case,

188 // where nothing changed, calls Decref() for all child_args and returns false,

189 // so PostVisit must return re->Incref(). Otherwise, returns true.

190 static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {

191 for (int i = 0; i < re->nsub(); i++) {

192 Regexp* sub = re->sub()[i];

193 Regexp* newsub = child_args[i];

194 if (newsub != sub)

195 return true;

196 }

197 for (int i = 0; i < re->nsub(); i++) {

198 Regexp* newsub = child_args[i];

199 newsub->Decref();

200 }

201 return false;

202 }

203

204 Regexp* CoalesceWalker::Copy(Regexp* re) {

205 return re->Incref();

206 }

207

208 Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {

209 // This should never be called, since we use Walk and not

210 // WalkExponential.

211 LOG(DFATAL) << "CoalesceWalker::ShortVisit called";

212 return re->Incref();

213 }

214

215 Regexp* CoalesceWalker::PostVisit(Regexp* re,

216 Regexp* parent_arg,

217 Regexp* pre_arg,

218 Regexp** child_args,

219 int nchild_args) {

220 if (re->nsub() == 0)

221 return re->Incref();

222

223 if (re->op() != kRegexpConcat) {

224 if (!ChildArgsChanged(re, child_args))

225 return re->Incref();

226

227 // Something changed. Build a new op.

228 Regexp* nre = new Regexp(re->op(), re->parse_flags());

229 nre->AllocSub(re->nsub());

230 Regexp** nre_subs = nre->sub();

231 for (int i = 0; i < re->nsub(); i++)

232 nre_subs[i] = child_args[i];

233 // Repeats and Captures have additional data that must be copied.

234 if (re->op() == kRegexpRepeat) {

235 nre->min_ = re->min();

236 nre->max_ = re->max();

237 } else if (re->op() == kRegexpCapture) {

238 nre->cap_ = re->cap();

239 }

240 return nre;

241 }

242

243 bool can_coalesce = false;

244 for (int i = 0; i < re->nsub(); i++) {

245 if (i+1 < re->nsub() &&

246 CanCoalesce(child_args[i], child_args[i+1])) {

247 can_coalesce = true;

248 break;

249 }

250 }

251 if (!can_coalesce) {

252 if (!ChildArgsChanged(re, child_args))

253 return re->Incref();

254

255 // Something changed. Build a new op.

256 Regexp* nre = new Regexp(re->op(), re->parse_flags());

257 nre->AllocSub(re->nsub());

258 Regexp** nre_subs = nre->sub();

259 for (int i = 0; i < re->nsub(); i++)

260 nre_subs[i] = child_args[i];

261 return nre;

262 }

263

264 for (int i = 0; i < re->nsub(); i++) {

265 if (i+1 < re->nsub() &&

266 CanCoalesce(child_args[i], child_args[i+1]))

267 DoCoalesce(&child_args[i], &child_args[i+1]);

268 }

269 // Determine how many empty matches were left by DoCoalesce.

270 int n = 0;

271 for (int i = n; i < re->nsub(); i++) {

272 if (child_args[i]->op() == kRegexpEmptyMatch)

273 n++;

274 }

275 // Build a new op.

276 Regexp* nre = new Regexp(re->op(), re->parse_flags());

277 nre->AllocSub(re->nsub() - n);

278 Regexp** nre_subs = nre->sub();

279 for (int i = 0, j = 0; i < re->nsub(); i++) {

280 if (child_args[i]->op() == kRegexpEmptyMatch) {

281 child_args[i]->Decref();

282 continue;

283 }

284 nre_subs[j] = child_args[i];

285 j++;

286 }

287 return nre;

288 }

289

290 bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {

291 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or

292 // any byte.

293 if ((r1->op() == kRegexpStar \|\|

294 r1->op() == kRegexpPlus \|\|

295 r1->op() == kRegexpQuest \|\|

296 r1->op() == kRegexpRepeat) &&

297 (r1->sub()[0]->op() == kRegexpLiteral \|\|

298 r1->sub()[0]->op() == kRegexpCharClass \|\|

299 r1->sub()[0]->op() == kRegexpAnyChar \|\|

300 r1->sub()[0]->op() == kRegexpAnyByte)) {

301 // r2 must be a star/plus/quest/repeat of the same literal, char class,

302 // any char or any byte.

303 if ((r2->op() == kRegexpStar \|\|

304 r2->op() == kRegexpPlus \|\|

305 r2->op() == kRegexpQuest \|\|

306 r2->op() == kRegexpRepeat) &&

307 Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&

308 // The parse flags must be consistent.

309 ((r1->parse_flags() & Regexp::NonGreedy) ==

310 (r2->parse_flags() & Regexp::NonGreedy))) {

311 return true;

312 }

313 // ... OR an occurrence of that literal, char class, any char or any byte

314 if (Regexp::Equal(r1->sub()[0], r2)) {

315 return true;

316 }

317 // ... OR a literal string that begins with that literal.

318 if (r1->sub()[0]->op() == kRegexpLiteral &&

319 r2->op() == kRegexpLiteralString &&

320 r2->runes()[0] == r1->sub()[0]->rune() &&

321 // The parse flags must be consistent.

322 ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==

323 (r2->parse_flags() & Regexp::FoldCase))) {

324 return true;

325 }

326 }

327 return false;

328 }

329

330 void CoalesceWalker::DoCoalesce(Regexp r1ptr, Regexp r2ptr) {

331 Regexp* r1 = *r1ptr;

332 Regexp* r2 = *r2ptr;

333

334 Regexp* nre = Regexp::Repeat(

335 r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);

336

337 switch (r1->op()) {

338 case kRegexpStar:

339 nre->min_ = 0;

340 nre->max_ = -1;

341 break;

342

343 case kRegexpPlus:

344 nre->min_ = 1;

345 nre->max_ = -1;

346 break;

347

348 case kRegexpQuest:

349 nre->min_ = 0;

350 nre->max_ = 1;

351 break;

352

353 case kRegexpRepeat:

354 nre->min_ = r1->min();

355 nre->max_ = r1->max();

356 break;

357

358 default:

359 LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();

360 nre->Decref();

361 return;

362 }

363

364 switch (r2->op()) {

365 case kRegexpStar:

366 nre->max_ = -1;

367 goto LeaveEmpty;

368

369 case kRegexpPlus:

370 nre->min_++;

371 nre->max_ = -1;

372 goto LeaveEmpty;

373

374 case kRegexpQuest:

375 if (nre->max() != -1)

376 nre->max_++;

377 goto LeaveEmpty;

378

379 case kRegexpRepeat:

380 nre->min_ += r2->min();

381 if (r2->max() == -1)

382 nre->max_ = -1;

383 else if (nre->max() != -1)

384 nre->max_ += r2->max();

385 goto LeaveEmpty;

386

387 case kRegexpLiteral:

388 case kRegexpCharClass:

389 case kRegexpAnyChar:

390 case kRegexpAnyByte:

391 nre->min_++;

392 if (nre->max() != -1)

393 nre->max_++;

394 goto LeaveEmpty;

395

396 LeaveEmpty:

397 *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);

398 *r2ptr = nre;

399 break;

400

401 case kRegexpLiteralString: {

402 Rune r = r1->sub()[0]->rune();

403 // Determine how much of the literal string is removed.

404 // We know that we have at least one rune. :)

405 int n = 1;

406 while (n < r2->nrunes() && r2->runes()[n] == r)

407 n++;

408 nre->min_ += n;

409 if (nre->max() != -1)

410 nre->max_ += n;

411 if (n == r2->nrunes())

412 goto LeaveEmpty;

413 *r1ptr = nre;

414 *r2ptr = Regexp::LiteralString(

415 &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());

416 break;

417 }

418

419 default:

420 LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();

421 nre->Decref();

422 return;

423 }

424

425 r1->Decref();

426 r2->Decref();

427 }

428

429 Regexp* SimplifyWalker::Copy(Regexp* re) {

430 return re->Incref();

431 }

432

433 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {

434 // This should never be called, since we use Walk and not

435 // WalkExponential.

436 LOG(DFATAL) << "SimplifyWalker::ShortVisit called";

437 return re->Incref();

438 }

439

440 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {

441 if (re->simple()) {

442 *stop = true;

443 return re->Incref();

444 }

445 return NULL;

446 }

447

448 Regexp* SimplifyWalker::PostVisit(Regexp* re,

449 Regexp* parent_arg,

450 Regexp* pre_arg,

451 Regexp** child_args,

452 int nchild_args) {

453 switch (re->op()) {

454 case kRegexpNoMatch:

455 case kRegexpEmptyMatch:

456 case kRegexpLiteral:

457 case kRegexpLiteralString:

458 case kRegexpBeginLine:

459 case kRegexpEndLine:

460 case kRegexpBeginText:

461 case kRegexpWordBoundary:

462 case kRegexpNoWordBoundary:

463 case kRegexpEndText:

464 case kRegexpAnyChar:

465 case kRegexpAnyByte:

466 case kRegexpHaveMatch:

467 // All these are always simple.

468 re->simple_ = true;

469 return re->Incref();

470

471 case kRegexpConcat:

472 case kRegexpAlternate: {

473 // These are simple as long as the subpieces are simple.

474 if (!ChildArgsChanged(re, child_args)) {

475 re->simple_ = true;

476 return re->Incref();

477 }

478 Regexp* nre = new Regexp(re->op(), re->parse_flags());

479 nre->AllocSub(re->nsub());

480 Regexp** nre_subs = nre->sub();

481 for (int i = 0; i < re->nsub(); i++)

482 nre_subs[i] = child_args[i];

483 nre->simple_ = true;

484 return nre;

485 }

486

487 case kRegexpCapture: {

488 Regexp* newsub = child_args[0];

489 if (newsub == re->sub()[0]) {

490 newsub->Decref();

491 re->simple_ = true;

492 return re->Incref();

493 }

494 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());

495 nre->AllocSub(1);

496 nre->sub()[0] = newsub;

497 nre->cap_ = re->cap();

498 nre->simple_ = true;

499 return nre;

500 }

501

502 case kRegexpStar:

503 case kRegexpPlus:

504 case kRegexpQuest: {

505 Regexp* newsub = child_args[0];

506 // Special case: repeat the empty string as much as

507 // you want, but it's still the empty string.

508 if (newsub->op() == kRegexpEmptyMatch)

509 return newsub;

510

511 // These are simple as long as the subpiece is simple.

512 if (newsub == re->sub()[0]) {

513 newsub->Decref();

514 re->simple_ = true;

515 return re->Incref();

516 }

517

518 // These are also idempotent if flags are constant.

519 if (re->op() == newsub->op() &&

520 re->parse_flags() == newsub->parse_flags())

521 return newsub;

522

523 Regexp* nre = new Regexp(re->op(), re->parse_flags());

524 nre->AllocSub(1);

525 nre->sub()[0] = newsub;

526 nre->simple_ = true;

527 return nre;

528 }

529

530 case kRegexpRepeat: {

531 Regexp* newsub = child_args[0];

532 // Special case: repeat the empty string as much as

533 // you want, but it's still the empty string.

534 if (newsub->op() == kRegexpEmptyMatch)

535 return newsub;

536

537 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,

538 re->parse_flags());

539 newsub->Decref();

540 nre->simple_ = true;

541 return nre;

542 }

543

544 case kRegexpCharClass: {

545 Regexp* nre = SimplifyCharClass(re);

546 nre->simple_ = true;

547 return nre;

548 }

549 }

550

551 LOG(ERROR) << "Simplify case not handled: " << re->op();

552 return re->Incref();

553 }

554

555 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.

556 // Returns a new Regexp, handing the ref to the caller.

557 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,

558 Regexp::ParseFlags parse_flags) {

559 Regexp* re = new Regexp(kRegexpConcat, parse_flags);

560 re->AllocSub(2);

561 Regexp** subs = re->sub();

562 subs[0] = re1;

563 subs[1] = re2;

564 return re;

565 }

566

567 // Simplifies the expression re{min,max} in terms of *, +, and ?.

568 // Returns a new regexp. Does not edit re. Does not consume reference to re.

569 // Caller must Decref return value when done with it.

570 // The result will not necessarily have the right capturing parens

571 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),

572 // but in the Regexp* representation, both (x) are marked as $1.

573 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,

574 Regexp::ParseFlags f) {

575 // x{n,} means at least n matches of x.

576 if (max == -1) {

577 // Special case: x{0,} is x*

578 if (min == 0)

579 return Regexp::Star(re->Incref(), f);

580

581 // Special case: x{1,} is x+

582 if (min == 1)

583 return Regexp::Plus(re->Incref(), f);

584

585 // General case: x{4,} is xxxx+

586 Regexp* nre = new Regexp(kRegexpConcat, f);

587 nre->AllocSub(min);

588 Regexp** nre_subs = nre->sub();

589 for (int i = 0; i < min-1; i++)

590 nre_subs[i] = re->Incref();

591 nre_subs[min-1] = Regexp::Plus(re->Incref(), f);

592 return nre;

593 }

594

595 // Special case: (x){0} matches only empty string.

596 if (min == 0 && max == 0)

597 return new Regexp(kRegexpEmptyMatch, f);

598

599 // Special case: x{1} is just x.

600 if (min == 1 && max == 1)

601 return re->Incref();

602

603 // General case: x{n,m} means n copies of x and m copies of x?.

604 // The machine will do less work if we nest the final m copies,

605 // so that x{2,5} = xx(x(x(x)?)?)?

606

607 // Build leading prefix: xx. Capturing only on the last one.

608 Regexp* nre = NULL;

609 if (min > 0) {

610 nre = new Regexp(kRegexpConcat, f);

611 nre->AllocSub(min);

612 Regexp** nre_subs = nre->sub();

613 for (int i = 0; i < min; i++)

614 nre_subs[i] = re->Incref();

615 }

616

617 // Build and attach suffix: (x(x(x)?)?)?

618 if (max > min) {

619 Regexp* suf = Regexp::Quest(re->Incref(), f);

620 for (int i = min+1; i < max; i++)

621 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);

622 if (nre == NULL)

623 nre = suf;

624 else

625 nre = Concat2(nre, suf, f);

626 }

627

628 if (nre == NULL) {

629 // Some degenerate case, like min > max, or min < max < 0.

630 // This shouldn't happen, because the parser rejects such regexps.

631 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;

632 return new Regexp(kRegexpNoMatch, f);

633 }

634

635 return nre;

636 }

637

638 // Simplifies a character class.

639 // Caller must Decref return value when done with it.

640 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {

641 CharClass* cc = re->cc();

642

643 // Special cases

644 if (cc->empty())

645 return new Regexp(kRegexpNoMatch, re->parse_flags());

646 if (cc->full())

647 return new Regexp(kRegexpAnyChar, re->parse_flags());

648

649 return re->Incref();

650 }

651

652 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »