third_party/re2/re2/re2.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/re2.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // Regular expression interface RE2.

6 //

7 // Originally the PCRE C++ wrapper, but adapted to use

8 // the new automata-based regular expression engines.

9

10 #include "re2/re2.h"

11

12 #include <stdio.h>

13 #include <string>

14 #include <errno.h>

15 #include "util/util.h"

16 #include "util/flags.h"

17 #include "util/sparse_array.h"

18 #include "re2/prog.h"

19 #include "re2/regexp.h"

20

21 DEFINE_bool(trace_re2, false, "trace RE2 execution");

22

23 namespace re2 {

24

25 // Maximum number of args we can set

26 static const int kMaxArgs = 16;

27 static const int kVecSize = 1+kMaxArgs;

28

29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Ful lMatchN> RE2::FullMatch = {};

30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::Par tialMatchN> RE2::PartialMatch = {};

31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {};

32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndCo nsumeN> RE2::FindAndConsume = {};

33

34 // This will trigger LNK2005 error in MSVC.

35 #ifndef _MSC_VER

36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h

37 #endif

38

39 RE2::Options::Options(RE2::CannedOptions opt)

40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),

41 posix_syntax_(opt == RE2::POSIX),

42 longest_match_(opt == RE2::POSIX),

43 log_errors_(opt != RE2::Quiet),

44 max_mem_(kDefaultMaxMem),

45 literal_(false),

46 never_nl_(false),

47 dot_nl_(false),

48 never_capture_(false),

49 case_sensitive_(true),

50 perl_classes_(false),

51 word_boundary_(false),

52 one_line_(false) {

53 }

54

55 // static empty things for use as const references.

56 // To avoid global constructors, initialized on demand.

57 GLOBAL_MUTEX(empty_mutex);

58 static const string *empty_string;

59 static const map<string, int> *empty_named_groups;

60 static const map<int, string> *empty_group_names;

61

62 static void InitEmpty() {

63 GLOBAL_MUTEX_LOCK(empty_mutex);

64 if (empty_string == NULL) {

65 empty_string = new string;

66 empty_named_groups = new map<string, int>;

67 empty_group_names = new map<int, string>;

68 }

69 GLOBAL_MUTEX_UNLOCK(empty_mutex);

70 }

71

72 // Converts from Regexp error code to RE2 error code.

73 // Maybe some day they will diverge. In any event, this

74 // hides the existence of Regexp from RE2 users.

75 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {

76 switch (code) {

77 case re2::kRegexpSuccess:

78 return RE2::NoError;

79 case re2::kRegexpInternalError:

80 return RE2::ErrorInternal;

81 case re2::kRegexpBadEscape:

82 return RE2::ErrorBadEscape;

83 case re2::kRegexpBadCharClass:

84 return RE2::ErrorBadCharClass;

85 case re2::kRegexpBadCharRange:

86 return RE2::ErrorBadCharRange;

87 case re2::kRegexpMissingBracket:

88 return RE2::ErrorMissingBracket;

89 case re2::kRegexpMissingParen:

90 return RE2::ErrorMissingParen;

91 case re2::kRegexpTrailingBackslash:

92 return RE2::ErrorTrailingBackslash;

93 case re2::kRegexpRepeatArgument:

94 return RE2::ErrorRepeatArgument;

95 case re2::kRegexpRepeatSize:

96 return RE2::ErrorRepeatSize;

97 case re2::kRegexpRepeatOp:

98 return RE2::ErrorRepeatOp;

99 case re2::kRegexpBadPerlOp:

100 return RE2::ErrorBadPerlOp;

101 case re2::kRegexpBadUTF8:

102 return RE2::ErrorBadUTF8;

103 case re2::kRegexpBadNamedCapture:

104 return RE2::ErrorBadNamedCapture;

105 }

106 return RE2::ErrorInternal;

107 }

108

109 static string trunc(const StringPiece& pattern) {

110 if (pattern.size() < 100)

111 return pattern.as_string();

112 return pattern.substr(0, 100).as_string() + "...";

113 }

114

115

116 RE2::RE2(const char* pattern) {

117 Init(pattern, DefaultOptions);

118 }

119

120 RE2::RE2(const string& pattern) {

121 Init(pattern, DefaultOptions);

122 }

123

124 RE2::RE2(const StringPiece& pattern) {

125 Init(pattern, DefaultOptions);

126 }

127

128 RE2::RE2(const StringPiece& pattern, const Options& options) {

129 Init(pattern, options);

130 }

131

132 int RE2::Options::ParseFlags() const {

133 int flags = Regexp::ClassNL;

134 switch (encoding()) {

135 default:

136 if (log_errors())

137 LOG(ERROR) << "Unknown encoding " << encoding();

138 break;

139 case RE2::Options::EncodingUTF8:

140 break;

141 case RE2::Options::EncodingLatin1:

142 flags \|= Regexp::Latin1;

143 break;

144 }

145

146 if (!posix_syntax())

147 flags \|= Regexp::LikePerl;

148

149 if (literal())

150 flags \|= Regexp::Literal;

151

152 if (never_nl())

153 flags \|= Regexp::NeverNL;

154

155 if (dot_nl())

156 flags \|= Regexp::DotNL;

157

158 if (never_capture())

159 flags \|= Regexp::NeverCapture;

160

161 if (!case_sensitive())

162 flags \|= Regexp::FoldCase;

163

164 if (perl_classes())

165 flags \|= Regexp::PerlClasses;

166

167 if (word_boundary())

168 flags \|= Regexp::PerlB;

169

170 if (one_line())

171 flags \|= Regexp::OneLine;

172

173 return flags;

174 }

175

176 void RE2::Init(const StringPiece& pattern, const Options& options) {

177 mutex_ = new Mutex;

178 pattern_ = pattern.as_string();

179 options_.Copy(options);

180 InitEmpty();

181 error_ = empty_string;

182 error_code_ = NoError;

183 suffix_regexp_ = NULL;

184 entire_regexp_ = NULL;

185 prog_ = NULL;

186 rprog_ = NULL;

187 named_groups_ = NULL;

188 group_names_ = NULL;

189 num_captures_ = -1;

190

191 RegexpStatus status;

192 entire_regexp_ = Regexp::Parse(

193 pattern_,

194 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),

195 &status);

196 if (entire_regexp_ == NULL) {

197 if (error_ == empty_string)

198 error_ = new string(status.Text());

199 if (options_.log_errors()) {

200 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "

201 << status.Text();

202 }

203 error_arg_ = status.error_arg().as_string();

204 error_code_ = RegexpErrorToRE2(status.code());

205 return;

206 }

207

208 prefix_.clear();

209 prefix_foldcase_ = false;

210 re2::Regexp* suffix;

211 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))

212 suffix_regexp_ = suffix;

213 else

214 suffix_regexp_ = entire_regexp_->Incref();

215

216 // Two thirds of the memory goes to the forward Prog,

217 // one third to the reverse prog, because the forward

218 // Prog has two DFAs but the reverse prog has one.

219 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);

220 if (prog_ == NULL) {

221 if (options_.log_errors())

222 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";

223 error_ = new string("pattern too large - compile failed");

224 error_code_ = RE2::ErrorPatternTooLarge;

225 return;

226 }

227

228 // Could delay this until the first match call that

229 // cares about submatch information, but the one-pass

230 // machine's memory gets cut from the DFA memory budget,

231 // and that is harder to do if the DFA has already

232 // been built.

233 is_one_pass_ = prog_->IsOnePass();

234 }

235

236 // Returns rprog_, computing it if needed.

237 re2::Prog* RE2::ReverseProg() const {

238 MutexLock l(mutex_);

239 if (rprog_ == NULL && error_ == empty_string) {

240 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);

241 if (rprog_ == NULL) {

242 if (options_.log_errors())

243 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";

244 error_ = new string("pattern too large - reverse compile failed");

245 error_code_ = RE2::ErrorPatternTooLarge;

246 return NULL;

247 }

248 }

249 return rprog_;

250 }

251

252 RE2::~RE2() {

253 if (suffix_regexp_)

254 suffix_regexp_->Decref();

255 if (entire_regexp_)

256 entire_regexp_->Decref();

257 delete mutex_;

258 delete prog_;

259 delete rprog_;

260 if (error_ != empty_string)

261 delete error_;

262 if (named_groups_ != NULL && named_groups_ != empty_named_groups)

263 delete named_groups_;

264 if (group_names_ != NULL && group_names_ != empty_group_names)

265 delete group_names_;

266 }

267

268 int RE2::ProgramSize() const {

269 if (prog_ == NULL)

270 return -1;

271 return prog_->size();

272 }

273

274 int RE2::ProgramFanout(map<int, int>* histogram) const {

275 if (prog_ == NULL)

276 return -1;

277 SparseArray<int> fanout(prog_->size());

278 prog_->Fanout(&fanout);

279 histogram->clear();

280 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {

281 // TODO(junyer): Optimise this?

282 int bucket = 0;

283 while (1 << bucket < i->second) {

284 bucket++;

285 }

286 (*histogram)[bucket]++;

287 }

288 return histogram->rbegin()->first;

289 }

290

291 // Returns num_captures_, computing it if needed, or -1 if the

292 // regexp wasn't valid on construction.

293 int RE2::NumberOfCapturingGroups() const {

294 MutexLock l(mutex_);

295 if (suffix_regexp_ == NULL)

296 return -1;

297 if (num_captures_ == -1)

298 num_captures_ = suffix_regexp_->NumCaptures();

299 return num_captures_;

300 }

301

302 // Returns named_groups_, computing it if needed.

303 const map<string, int>& RE2::NamedCapturingGroups() const {

304 MutexLock l(mutex_);

305 if (!ok())

306 return *empty_named_groups;

307 if (named_groups_ == NULL) {

308 named_groups_ = suffix_regexp_->NamedCaptures();

309 if (named_groups_ == NULL)

310 named_groups_ = empty_named_groups;

311 }

312 return *named_groups_;

313 }

314

315 // Returns group_names_, computing it if needed.

316 const map<int, string>& RE2::CapturingGroupNames() const {

317 MutexLock l(mutex_);

318 if (!ok())

319 return *empty_group_names;

320 if (group_names_ == NULL) {

321 group_names_ = suffix_regexp_->CaptureNames();

322 if (group_names_ == NULL)

323 group_names_ = empty_group_names;

324 }

325 return *group_names_;

326 }

327

328 /*** Convenience interfaces ***/

329

330 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,

331 const Arg* const args[], int n) {

332 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);

333 }

334

335 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,

336 const Arg* const args[], int n) {

337 return re.DoMatch(text, UNANCHORED, NULL, args, n);

338 }

339

340 bool RE2::ConsumeN(StringPiece* input, const RE2& re,

341 const Arg* const args[], int n) {

342 int consumed;

343 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {

344 input->remove_prefix(consumed);

345 return true;

346 } else {

347 return false;

348 }

349 }

350

351 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,

352 const Arg* const args[], int n) {

353 int consumed;

354 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {

355 input->remove_prefix(consumed);

356 return true;

357 } else {

358 return false;

359 }

360 }

361

362 // Returns the maximum submatch needed for the rewrite to be done by Replace().

363 // E.g. if rewrite == "foo \\2,\\1", returns 2.

364 int RE2::MaxSubmatch(const StringPiece& rewrite) {

365 int max = 0;

366 for (const char s = rewrite.data(), end = s + rewrite.size();

367 s < end; s++) {

368 if (*s == '\\') {

369 s++;

370 int c = (s < end) ? *s : -1;

371 if (isdigit(c)) {

372 int n = (c - '0');

373 if (n > max)

374 max = n;

375 }

376 }

377 }

378 return max;

379 }

380

381 bool RE2::Replace(string *str,

382 const RE2& re,

383 const StringPiece& rewrite) {

384 StringPiece vec[kVecSize];

385 int nvec = 1 + MaxSubmatch(rewrite);

386 if (nvec > arraysize(vec))

387 return false;

388 if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec))

389 return false;

390

391 string s;

392 if (!re.Rewrite(&s, rewrite, vec, nvec))

393 return false;

394

395 assert(vec[0].begin() >= str->data());

396 assert(vec[0].end() <= str->data()+str->size());

397 str->replace(vec[0].data() - str->data(), vec[0].size(), s);

398 return true;

399 }

400

401 int RE2::GlobalReplace(string *str,

402 const RE2& re,

403 const StringPiece& rewrite) {

404 StringPiece vec[kVecSize];

405 int nvec = 1 + MaxSubmatch(rewrite);

406 if (nvec > arraysize(vec))

407 return false;

408

409 const char* p = str->data();

410 const char* ep = p + str->size();

411 const char* lastend = NULL;

412 string out;

413 int count = 0;

414 while (p <= ep) {

415 if (!re.Match(*str, static_cast<int>(p - str->data()),

416 static_cast<int>(str->size()), UNANCHORED, vec, nvec))

417 break;

418 if (p < vec[0].begin())

419 out.append(p, vec[0].begin() - p);

420 if (vec[0].begin() == lastend && vec[0].size() == 0) {

421 // Disallow empty match at end of last match: skip ahead.

422 if (p < ep)

423 out.append(p, 1);

424 p++;

425 continue;

426 }

427 re.Rewrite(&out, rewrite, vec, nvec);

428 p = vec[0].end();

429 lastend = p;

430 count++;

431 }

432

433 if (count == 0)

434 return 0;

435

436 if (p < ep)

437 out.append(p, ep - p);

438 swap(out, *str);

439 return count;

440 }

441

442 bool RE2::Extract(const StringPiece &text,

443 const RE2& re,

444 const StringPiece &rewrite,

445 string *out) {

446 StringPiece vec[kVecSize];

447 int nvec = 1 + MaxSubmatch(rewrite);

448 if (nvec > arraysize(vec))

449 return false;

450

451 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))

452 return false;

453

454 out->clear();

455 return re.Rewrite(out, rewrite, vec, nvec);

456 }

457

458 string RE2::QuoteMeta(const StringPiece& unquoted) {

459 string result;

460 result.reserve(unquoted.size() << 1);

461

462 // Escape any ascii character not in [A-Za-z_0-9].

463 //

464 // Note that it's legal to escape a character even if it has no

465 // special meaning in a regular expression -- so this function does

466 // that. (This also makes it identical to the perl function of the

467 // same name except for the null-character special case;

468 // see `perldoc -f quotemeta`.)

469 for (int ii = 0; ii < unquoted.length(); ++ii) {

470 // Note that using 'isalnum' here raises the benchmark time from

471 // 32ns to 58ns:

472 if ((unquoted[ii] < 'a' \|\| unquoted[ii] > 'z') &&

473 (unquoted[ii] < 'A' \|\| unquoted[ii] > 'Z') &&

474 (unquoted[ii] < '0' \|\| unquoted[ii] > '9') &&

475 unquoted[ii] != '_' &&

476 // If this is the part of a UTF8 or Latin1 character, we need

477 // to copy this byte without escaping. Experimentally this is

478 // what works correctly with the regexp library.

479 !(unquoted[ii] & 128)) {

480 if (unquoted[ii] == '\0') { // Special handling for null chars.

481 // Note that this special handling is not strictly required for RE2,

482 // but this quoting is required for other regexp libraries such as

483 // PCRE.

484 // Can't use "\\0" since the next character might be a digit.

485 result += "\\x00";

486 continue;

487 }

488 result += '\\';

489 }

490 result += unquoted[ii];

491 }

492

493 return result;

494 }

495

496 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {

497 if (prog_ == NULL)

498 return false;

499

500 int n = static_cast<int>(prefix_.size());

501 if (n > maxlen)

502 n = maxlen;

503

504 // Determine initial min max from prefix_ literal.

505 string pmin, pmax;

506 pmin = prefix_.substr(0, n);

507 pmax = prefix_.substr(0, n);

508 if (prefix_foldcase_) {

509 // prefix is ASCII lowercase; change pmin to uppercase.

510 for (int i = 0; i < n; i++) {

511 if ('a' <= pmin[i] && pmin[i] <= 'z')

512 pmin[i] += 'A' - 'a';

513 }

514 }

515

516 // Add to prefix min max using PossibleMatchRange on regexp.

517 string dmin, dmax;

518 maxlen -= n;

519 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {

520 pmin += dmin;

521 pmax += dmax;

522 } else if (pmax.size() > 0) {

523 // prog_->PossibleMatchRange has failed us,

524 // but we still have useful information from prefix_.

525 // Round up pmax to allow any possible suffix.

526 pmax = PrefixSuccessor(pmax);

527 } else {

528 // Nothing useful.

529 *min = "";

530 *max = "";

531 return false;

532 }

533

534 *min = pmin;

535 *max = pmax;

536 return true;

537 }

538

539 // Avoid possible locale nonsense in standard strcasecmp.

540 // The string a is known to be all lowercase.

541 static int ascii_strcasecmp(const char* a, const char* b, int len) {

542 const char *ae = a + len;

543

544 for (; a < ae; a++, b++) {

545 uint8 x = *a;

546 uint8 y = *b;

547 if ('A' <= y && y <= 'Z')

548 y += 'a' - 'A';

549 if (x != y)

550 return x - y;

551 }

552 return 0;

553 }

554

555

556 /*** Actual matching and rewriting code ***/

557

558 bool RE2::Match(const StringPiece& text,

559 int startpos,

560 int endpos,

561 Anchor re_anchor,

562 StringPiece* submatch,

563 int nsubmatch) const {

564 if (!ok() \|\| suffix_regexp_ == NULL) {

565 if (options_.log_errors())

566 LOG(ERROR) << "Invalid RE2: " << *error_;

567 return false;

568 }

569

570 if (startpos < 0 \|\| startpos > endpos \|\| endpos > text.size()) {

571 if (options_.log_errors())

572 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["

573 << "startpos: " << startpos << ", "

574 << "endpos: " << endpos << ", "

575 << "text size: " << text.size() << "]";

576 return false;

577 }

578

579 StringPiece subtext = text;

580 subtext.remove_prefix(startpos);

581 subtext.remove_suffix(text.size() - endpos);

582

583 // Use DFAs to find exact location of match, filter out non-matches.

584

585 // Don't ask for the location if we won't use it.

586 // SearchDFA can do extra optimizations in that case.

587 StringPiece match;

588 StringPiece* matchp = &match;

589 if (nsubmatch == 0)

590 matchp = NULL;

591

592 int ncap = 1 + NumberOfCapturingGroups();

593 if (ncap > nsubmatch)

594 ncap = nsubmatch;

595

596 // If the regexp is anchored explicitly, must not be in middle of text.

597 if (prog_->anchor_start() && startpos != 0)

598 return false;

599

600 // If the regexp is anchored explicitly, update re_anchor

601 // so that we can potentially fall into a faster case below.

602 if (prog_->anchor_start() && prog_->anchor_end())

603 re_anchor = ANCHOR_BOTH;

604 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)

605 re_anchor = ANCHOR_START;

606

607 // Check for the required prefix, if any.

608 int prefixlen = 0;

609 if (!prefix_.empty()) {

610 if (startpos != 0)

611 return false;

612 prefixlen = static_cast<int>(prefix_.size());

613 if (prefixlen > subtext.size())

614 return false;

615 if (prefix_foldcase_) {

616 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)

617 return false;

618 } else {

619 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)

620 return false;

621 }

622 subtext.remove_prefix(prefixlen);

623 // If there is a required prefix, the anchor must be at least ANCHOR_START.

624 if (re_anchor != ANCHOR_BOTH)

625 re_anchor = ANCHOR_START;

626 }

627

628 Prog::Anchor anchor = Prog::kUnanchored;

629 Prog::MatchKind kind = Prog::kFirstMatch;

630 if (options_.longest_match())

631 kind = Prog::kLongestMatch;

632 bool skipped_test = false;

633

634 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);

635

636 // SearchBitState allocates a bit vector of size prog_->size() * text.size().

637 // It also allocates a stack of 3-word structures which could potentially

638 // grow as large as prog_->size() * text.size() but in practice is much

639 // smaller.

640 // Conditions for using SearchBitState:

641 const int MaxBitStateProg = 500; // prog_->size() <= Max.

642 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)

643 bool can_bit_state = prog_->size() <= MaxBitStateProg;

644 int bit_state_text_max = MaxBitStateVector / prog_->size();

645

646 bool dfa_failed = false;

647 switch (re_anchor) {

648 default:

649 case UNANCHORED: {

650 if (!prog_->SearchDFA(subtext, text, anchor, kind,

651 matchp, &dfa_failed, NULL)) {

652 if (dfa_failed) {

653 // Fall back to NFA below.

654 skipped_test = true;

655 if (FLAGS_trace_re2)

656 LOG(INFO) << "Match " << trunc(pattern_)

657 << " [" << CEscape(subtext) << "]"

658 << " DFA failed.";

659 break;

660 }

661 if (FLAGS_trace_re2)

662 LOG(INFO) << "Match " << trunc(pattern_)

663 << " [" << CEscape(subtext) << "]"

664 << " used DFA - no match.";

665 return false;

666 }

667 if (FLAGS_trace_re2)

668 LOG(INFO) << "Match " << trunc(pattern_)

669 << " [" << CEscape(subtext) << "]"

670 << " used DFA - match";

671 if (matchp == NULL) // Matched. Don't care where

672 return true;

673 // SearchDFA set match[0].end() but didn't know where the

674 // match started. Run the regexp backward from match[0].end()

675 // to find the longest possible match -- that's where it started.

676 Prog* prog = ReverseProg();

677 if (prog == NULL)

678 return false;

679 if (!prog->SearchDFA(match, text, Prog::kAnchored,

680 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {

681 if (dfa_failed) {

682 // Fall back to NFA below.

683 skipped_test = true;

684 if (FLAGS_trace_re2)

685 LOG(INFO) << "Match " << trunc(pattern_)

686 << " [" << CEscape(subtext) << "]"

687 << " reverse DFA failed.";

688 break;

689 }

690 if (FLAGS_trace_re2)

691 LOG(INFO) << "Match " << trunc(pattern_)

692 << " [" << CEscape(subtext) << "]"

693 << " DFA inconsistency.";

694 if (options_.log_errors())

695 LOG(ERROR) << "DFA inconsistency";

696 return false;

697 }

698 if (FLAGS_trace_re2)

699 LOG(INFO) << "Match " << trunc(pattern_)

700 << " [" << CEscape(subtext) << "]"

701 << " used reverse DFA.";

702 break;

703 }

704

705 case ANCHOR_BOTH:

706 case ANCHOR_START:

707 if (re_anchor == ANCHOR_BOTH)

708 kind = Prog::kFullMatch;

709 anchor = Prog::kAnchored;

710

711 // If only a small amount of text and need submatch

712 // information anyway and we're going to use OnePass or BitState

713 // to get it, we might as well not even bother with the DFA:

714 // OnePass or BitState will be fast enough.

715 // On tiny texts, OnePass outruns even the DFA, and

716 // it doesn't have the shared state and occasional mutex that

717 // the DFA does.

718 if (can_one_pass && text.size() <= 4096 &&

719 (ncap > 1 \|\| text.size() <= 8)) {

720 if (FLAGS_trace_re2)

721 LOG(INFO) << "Match " << trunc(pattern_)

722 << " [" << CEscape(subtext) << "]"

723 << " skipping DFA for OnePass.";

724 skipped_test = true;

725 break;

726 }

727 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {

728 if (FLAGS_trace_re2)

729 LOG(INFO) << "Match " << trunc(pattern_)

730 << " [" << CEscape(subtext) << "]"

731 << " skipping DFA for BitState.";

732 skipped_test = true;

733 break;

734 }

735 if (!prog_->SearchDFA(subtext, text, anchor, kind,

736 &match, &dfa_failed, NULL)) {

737 if (dfa_failed) {

738 if (FLAGS_trace_re2)

739 LOG(INFO) << "Match " << trunc(pattern_)

740 << " [" << CEscape(subtext) << "]"

741 << " DFA failed.";

742 skipped_test = true;

743 break;

744 }

745 if (FLAGS_trace_re2)

746 LOG(INFO) << "Match " << trunc(pattern_)

747 << " [" << CEscape(subtext) << "]"

748 << " used DFA - no match.";

749 return false;

750 }

751 break;

752 }

753

754 if (!skipped_test && ncap <= 1) {

755 // We know exactly where it matches. That's enough.

756 if (ncap == 1)

757 submatch[0] = match;

758 } else {

759 StringPiece subtext1;

760 if (skipped_test) {

761 // DFA ran out of memory or was skipped:

762 // need to search in entire original text.

763 subtext1 = subtext;

764 } else {

765 // DFA found the exact match location:

766 // let NFA run an anchored, full match search

767 // to find submatch locations.

768 subtext1 = match;

769 anchor = Prog::kAnchored;

770 kind = Prog::kFullMatch;

771 }

772

773 if (can_one_pass && anchor != Prog::kUnanchored) {

774 if (FLAGS_trace_re2)

775 LOG(INFO) << "Match " << trunc(pattern_)

776 << " [" << CEscape(subtext) << "]"

777 << " using OnePass.";

778 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {

779 if (!skipped_test && options_.log_errors())

780 LOG(ERROR) << "SearchOnePass inconsistency";

781 return false;

782 }

783 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {

784 if (FLAGS_trace_re2)

785 LOG(INFO) << "Match " << trunc(pattern_)

786 << " [" << CEscape(subtext) << "]"

787 << " using BitState.";

788 if (!prog_->SearchBitState(subtext1, text, anchor,

789 kind, submatch, ncap)) {

790 if (!skipped_test && options_.log_errors())

791 LOG(ERROR) << "SearchBitState inconsistency";

792 return false;

793 }

794 } else {

795 if (FLAGS_trace_re2)

796 LOG(INFO) << "Match " << trunc(pattern_)

797 << " [" << CEscape(subtext) << "]"

798 << " using NFA.";

799 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {

800 if (!skipped_test && options_.log_errors())

801 LOG(ERROR) << "SearchNFA inconsistency";

802 return false;

803 }

804 }

805 }

806

807 // Adjust overall match for required prefix that we stripped off.

808 if (prefixlen > 0 && nsubmatch > 0)

809 submatch[0] = StringPiece(submatch[0].begin() - prefixlen,

810 submatch[0].size() + prefixlen);

811

812 // Zero submatches that don't exist in the regexp.

813 for (int i = ncap; i < nsubmatch; i++)

814 submatch[i] = NULL;

815 return true;

816 }

817

818 // Internal matcher - like Match() but takes Args not StringPieces.

819 bool RE2::DoMatch(const StringPiece& text,

820 Anchor anchor,

821 int* consumed,

822 const Arg* const* args,

823 int n) const {

824 if (!ok()) {

825 if (options_.log_errors())

826 LOG(ERROR) << "Invalid RE2: " << *error_;

827 return false;

828 }

829

830 // Count number of capture groups needed.

831 int nvec;

832 if (n == 0 && consumed == NULL)

833 nvec = 0;

834 else

835 nvec = n+1;

836

837 StringPiece* vec;

838 StringPiece stkvec[kVecSize];

839 StringPiece* heapvec = NULL;

840

841 if (nvec <= arraysize(stkvec)) {

842 vec = stkvec;

843 } else {

844 vec = new StringPiece[nvec];

845 heapvec = vec;

846 }

847

848 if (!Match(text, 0, text.size(), anchor, vec, nvec)) {

849 delete[] heapvec;

850 return false;

851 }

852

853 if (consumed != NULL)

854 *consumed = static_cast<int>(vec[0].end() - text.begin());

855

856 if (n == 0 \|\| args == NULL) {

857 // We are not interested in results

858 delete[] heapvec;

859 return true;

860 }

861

862 int ncap = NumberOfCapturingGroups();

863 if (ncap < n) {

864 // RE has fewer capturing groups than number of arg pointers passed in

865 VLOG(1) << "Asked for " << n << " but only have " << ncap;

866 delete[] heapvec;

867 return false;

868 }

869

870 // If we got here, we must have matched the whole pattern.

871 for (int i = 0; i < n; i++) {

872 const StringPiece& s = vec[i+1];

873 if (!args[i]->Parse(s.data(), s.size())) {

874 // TODO: Should we indicate what the error was?

875 VLOG(1) << "Parse error on #" << i << " " << s << " "

876 << (void*)s.data() << "/" << s.size();

877 delete[] heapvec;

878 return false;

879 }

880 }

881

882 delete[] heapvec;

883 return true;

884 }

885

886 // Append the "rewrite" string, with backslash subsitutions from "vec",

887 // to string "out".

888 bool RE2::Rewrite(string *out, const StringPiece &rewrite,

889 const StringPiece *vec, int veclen) const {

890 for (const char s = rewrite.data(), end = s + rewrite.size();

891 s < end; s++) {

892 if (*s != '\\') {

893 out->push_back(*s);

894 continue;

895 }

896 s++;

897 int c = (s < end) ? *s : -1;

898 if (isdigit(c)) {

899 int n = (c - '0');

900 if (n >= veclen) {

901 if (options_.log_errors()) {

902 LOG(ERROR) << "requested group " << n

903 << " in regexp " << rewrite.data();

904 }

905 return false;

906 }

907 StringPiece snip = vec[n];

908 if (snip.size() > 0)

909 out->append(snip.data(), snip.size());

910 } else if (c == '\\') {

911 out->push_back('\\');

912 } else {

913 if (options_.log_errors())

914 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();

915 return false;

916 }

917 }

918 return true;

919 }

920

921 // Checks that the rewrite string is well-formed with respect to this

922 // regular expression.

923 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {

924 int max_token = -1;

925 for (const char s = rewrite.data(), end = s + rewrite.size();

926 s < end; s++) {

927 int c = *s;

928 if (c != '\\') {

929 continue;

930 }

931 if (++s == end) {

932 *error = "Rewrite schema error: '\\' not allowed at end.";

933 return false;

934 }

935 c = *s;

936 if (c == '\\') {

937 continue;

938 }

939 if (!isdigit(c)) {

940 *error = "Rewrite schema error: "

941 "'\\' must be followed by a digit or '\\'.";

942 return false;

943 }

944 int n = (c - '0');

945 if (max_token < n) {

946 max_token = n;

947 }

948 }

949

950 if (max_token > NumberOfCapturingGroups()) {

951 SStringPrintf(error, "Rewrite schema requests %d matches, "

952 "but the regexp only has %d parenthesized subexpressions.",

953 max_token, NumberOfCapturingGroups());

954 return false;

955 }

956 return true;

957 }

958

959 /*** Parsers for various types ***/

960

961 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {

962 // We fail if somebody asked us to store into a non-NULL void* pointer

963 return (dest == NULL);

964 }

965

966 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {

967 if (dest == NULL) return true;

968 reinterpret_cast<string*>(dest)->assign(str, n);

969 return true;

970 }

971

972 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {

973 if (dest == NULL) return true;

974 reinterpret_cast<StringPiece*>(dest)->set(str, n);

975 return true;

976 }

977

978 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {

979 if (n != 1) return false;

980 if (dest == NULL) return true;

981 (reinterpret_cast<char>(dest)) = str[0];

982 return true;

983 }

984

985 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {

986 if (n != 1) return false;

987 if (dest == NULL) return true;

988 (reinterpret_cast<unsigned char>(dest)) = str[0];

989 return true;

990 }

991

992 // Largest number spec that we are willing to parse

993 static const int kMaxNumberLength = 32;

994

995 // REQUIRES "buf" must have length at least nbuf.

996 // Copies "str" into "buf" and null-terminates.

997 // Overwrites *np with the new length.

998 static const char* TerminateNumber(char* buf, int nbuf, const char* str, int* np ,

999 bool accept_spaces) {

1000 int n = *np;

1001 if (n <= 0) return "";

1002 if (n > 0 && isspace(*str)) {

1003 // We are less forgiving than the strtoxxx() routines and do not

1004 // allow leading spaces. We do allow leading spaces for floats.

1005 if (!accept_spaces) {

1006 return "";

1007 }

1008 while (n > 0 && isspace(*str)) {

1009 n--;

1010 str++;

1011 }

1012 }

1013

1014 // Although buf has a fixed maximum size, we can still handle

1015 // arbitrarily large integers correctly by omitting leading zeros.

1016 // (Numbers that are still too long will be out of range.)

1017 // Before deciding whether str is too long,

1018 // remove leading zeros with s/000+/00/.

1019 // Leaving the leading two zeros in place means that

1020 // we don't change 0000x123 (invalid) into 0x123 (valid).

1021 // Skip over leading - before replacing.

1022 bool neg = false;

1023 if (n >= 1 && str[0] == '-') {

1024 neg = true;

1025 n--;

1026 str++;

1027 }

1028

1029 if (n >= 3 && str[0] == '0' && str[1] == '0') {

1030 while (n >= 3 && str[2] == '0') {

1031 n--;

1032 str++;

1033 }

1034 }

1035

1036 if (neg) { // make room in buf for -

1037 n++;

1038 str--;

1039 }

1040

1041 if (n > nbuf-1) return "";

1042

1043 memmove(buf, str, n);

1044 if (neg) {

1045 buf[0] = '-';

1046 }

1047 buf[n] = '\0';

1048 *np = n;

1049 return buf;

1050 }

1051

1052 bool RE2::Arg::parse_long_radix(const char* str,

1053 int n,

1054 void* dest,

1055 int radix) {

1056 if (n == 0) return false;

1057 char buf[kMaxNumberLength+1];

1058 str = TerminateNumber(buf, sizeof buf, str, &n, false);

1059 char* end;

1060 errno = 0;

1061 long r = strtol(str, &end, radix);

1062 if (end != str + n) return false; // Leftover junk

1063 if (errno) return false;

1064 if (dest == NULL) return true;

1065 (reinterpret_cast<long>(dest)) = r;

1066 return true;

1067 }

1068

1069 bool RE2::Arg::parse_ulong_radix(const char* str,

1070 int n,

1071 void* dest,

1072 int radix) {

1073 if (n == 0) return false;

1074 char buf[kMaxNumberLength+1];

1075 str = TerminateNumber(buf, sizeof buf, str, &n, false);

1076 if (str[0] == '-') {

1077 // strtoul() will silently accept negative numbers and parse

1078 // them. This module is more strict and treats them as errors.

1079 return false;

1080 }

1081

1082 char* end;

1083 errno = 0;

1084 unsigned long r = strtoul(str, &end, radix);

1085 if (end != str + n) return false; // Leftover junk

1086 if (errno) return false;

1087 if (dest == NULL) return true;

1088 (reinterpret_cast<unsigned long>(dest)) = r;

1089 return true;

1090 }

1091

1092 bool RE2::Arg::parse_short_radix(const char* str,

1093 int n,

1094 void* dest,

1095 int radix) {

1096 long r;

1097 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

1098 if ((short)r != r) return false; // Out of range

1099 if (dest == NULL) return true;

1100 (reinterpret_cast<short>(dest)) = (short)r;

1101 return true;

1102 }

1103

1104 bool RE2::Arg::parse_ushort_radix(const char* str,

1105 int n,

1106 void* dest,

1107 int radix) {

1108 unsigned long r;

1109 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

1110 if ((ushort)r != r) return false; // Out of range

1111 if (dest == NULL) return true;

1112 (reinterpret_cast<unsigned short>(dest)) = (ushort)r;

1113 return true;

1114 }

1115

1116 bool RE2::Arg::parse_int_radix(const char* str,

1117 int n,

1118 void* dest,

1119 int radix) {

1120 long r;

1121 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

1122 if ((int)r != r) return false; // Out of range

1123 if (dest == NULL) return true;

1124 (reinterpret_cast<int>(dest)) = r;

1125 return true;

1126 }

1127

1128 bool RE2::Arg::parse_uint_radix(const char* str,

1129 int n,

1130 void* dest,

1131 int radix) {

1132 unsigned long r;

1133 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

1134 if ((uint)r != r) return false; // Out of range

1135 if (dest == NULL) return true;

1136 (reinterpret_cast<unsigned int>(dest)) = r;

1137 return true;

1138 }

1139

1140 #if RE2_HAVE_LONGLONG

1141 bool RE2::Arg::parse_longlong_radix(const char* str,

1142 int n,

1143 void* dest,

1144 int radix) {

1145 if (n == 0) return false;

1146 char buf[kMaxNumberLength+1];

1147 str = TerminateNumber(buf, sizeof buf, str, &n, false);

1148 char* end;

1149 errno = 0;

1150 int64 r = strtoll(str, &end, radix);

1151 if (end != str + n) return false; // Leftover junk

1152 if (errno) return false;

1153 if (dest == NULL) return true;

1154 (reinterpret_cast<int64>(dest)) = r;

1155 return true;

1156 }

1157

1158 bool RE2::Arg::parse_ulonglong_radix(const char* str,

1159 int n,

1160 void* dest,

1161 int radix) {

1162 if (n == 0) return false;

1163 char buf[kMaxNumberLength+1];

1164 str = TerminateNumber(buf, sizeof buf, str, &n, false);

1165 if (str[0] == '-') {

1166 // strtoull() will silently accept negative numbers and parse

1167 // them. This module is more strict and treats them as errors.

1168 return false;

1169 }

1170 char* end;

1171 errno = 0;

1172 uint64 r = strtoull(str, &end, radix);

1173 if (end != str + n) return false; // Leftover junk

1174 if (errno) return false;

1175 if (dest == NULL) return true;

1176 (reinterpret_cast<uint64>(dest)) = r;

1177 return true;

1178 }

1179 #endif

1180

1181 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {

1182 if (n == 0) return false;

1183 static const int kMaxLength = 200;

1184 char buf[kMaxLength+1];

1185 str = TerminateNumber(buf, sizeof buf, str, &n, true);

1186 char* end;

1187 errno = 0;

1188 double r;

1189 if (isfloat) {

1190 r = strtof(str, &end);

1191 } else {

1192 r = strtod(str, &end);

1193 }

1194 if (end != str + n) return false; // Leftover junk

1195 if (errno) return false;

1196 if (dest == NULL) return true;

1197 if (isfloat) {

1198 (reinterpret_cast<float>(dest)) = (float)r;

1199 } else {

1200 (reinterpret_cast<double>(dest)) = r;

1201 }

1202 return true;

1203 }

1204

1205 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {

1206 return parse_double_float(str, n, false, dest);

1207 }

1208

1209 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {

1210 return parse_double_float(str, n, true, dest);

1211 }

1212

1213

1214 #define DEFINE_INTEGER_PARSERS(name) \

1215 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \

1216 return parse_##name##_radix(str, n, dest, 10); \

1217 } \

1218 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \

1219 return parse_##name##_radix(str, n, dest, 16); \

1220 } \

1221 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \

1222 return parse_##name##_radix(str, n, dest, 8); \

1223 } \

1224 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \

1225 return parse_##name##_radix(str, n, dest, 0); \

1226 }

1227

1228 DEFINE_INTEGER_PARSERS(short);

1229 DEFINE_INTEGER_PARSERS(ushort);

1230 DEFINE_INTEGER_PARSERS(int);

1231 DEFINE_INTEGER_PARSERS(uint);

1232 DEFINE_INTEGER_PARSERS(long);

1233 DEFINE_INTEGER_PARSERS(ulong);

1234 DEFINE_INTEGER_PARSERS(longlong);

1235 DEFINE_INTEGER_PARSERS(ulonglong);

1236

1237 #undef DEFINE_INTEGER_PARSERS

1238

1239 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/re2.h ('k') | third_party/re2/re2/regexp.h » ('j') | no next file with comments »