third_party/re2/re2/dfa.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/dfa.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // A DFA (deterministic finite automaton)-based regular expression search.

6 //

7 // The DFA search has two main parts: the construction of the automaton,

8 // which is represented by a graph of State structures, and the execution

9 // of the automaton over a given input string.

10 //

11 // The basic idea is that the State graph is constructed so that the

12 // execution can simply start with a state s, and then for each byte c in

13 // the input string, execute "s = s->next[c]", checking at each point whether

14 // the current s represents a matching state.

15 //

16 // The simple explanation just given does convey the essence of this code,

17 // but it omits the details of how the State graph gets constructed as well

18 // as some performance-driven optimizations to the execution of the automaton.

19 // All these details are explained in the comments for the code following

20 // the definition of class DFA.

21 //

22 // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.

23

24 #include "util/atomicops.h"

25 #include "util/flags.h"

26 #include "util/sparse_set.h"

27 #include "re2/prog.h"

28 #include "re2/stringpiece.h"

29

30 DEFINE_bool(re2_dfa_bail_when_slow, true,

31 "Whether the RE2 DFA should bail out early "

32 "if the NFA would be faster (for testing).");

33

34 namespace re2 {

35

36 #if !defined(__linux__) /* only Linux seems to have memrchr */

37 static void* memrchr(const void* s, int c, size_t n) {

38 const unsigned char* p = (const unsigned char*)s;

39 for (p += n; n > 0; n--)

40 if (*--p == c)

41 return (void*)p;

42

43 return NULL;

44 }

45 #endif

46

47 // Changing this to true compiles in prints that trace execution of the DFA.

48 // Generates a lot of output -- only useful for debugging.

49 static const bool DebugDFA = false;

50

51 // A DFA implementation of a regular expression program.

52 // Since this is entirely a forward declaration mandated by C++,

53 // some of the comments here are better understood after reading

54 // the comments in the sections that follow the DFA definition.

55 class DFA {

56 public:

57 DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem);

58 ~DFA();

59 bool ok() const { return !init_failed_; }

60 Prog::MatchKind kind() { return kind_; }

61

62 // Searches for the regular expression in text, which is considered

63 // as a subsection of context for the purposes of interpreting flags

64 // like ^ and $ and \A and \z.

65 // Returns whether a match was found.

66 // If a match is found, sets *ep to the end point of the best match in text.

67 // If "anchored", the match must begin at the start of text.

68 // If "want_earliest_match", the match that ends first is used, not

69 // necessarily the best one.

70 // If "run_forward" is true, the DFA runs from text.begin() to text.end().

71 // If it is false, the DFA runs from text.end() to text.begin(),

72 // returning the leftmost end of the match instead of the rightmost one.

73 // If the DFA cannot complete the search (for example, if it is out of

74 // memory), it sets *failed and returns false.

75 bool Search(const StringPiece& text, const StringPiece& context,

76 bool anchored, bool want_earliest_match, bool run_forward,

77 bool* failed, const char** ep, vector<int>* matches);

78

79 // Builds out all states for the entire DFA. FOR TESTING ONLY

80 // Returns number of states.

81 int BuildAllStates();

82

83 // Computes min and max for matching strings. Won't return strings

84 // bigger than maxlen.

85 bool PossibleMatchRange(string* min, string* max, int maxlen);

86

87 // These data structures are logically private, but C++ makes it too

88 // difficult to mark them as such.

89 class Workq;

90 class RWLocker;

91 class StateSaver;

92

93 // A single DFA state. The DFA is represented as a graph of these

94 // States, linked by the next_ pointers. If in state s and reading

95 // byte c, the next state should be s->next_[c].

96 struct State {

97 inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }

98 void SaveMatch(vector<int>* v);

99

100 int* inst_; // Instruction pointers in the state.

101 int ninst_; // # of inst_ pointers.

102 uint flag_; // Empty string bitfield flags in effect on the way

103 // into this state, along with kFlagMatch if this

104 // is a matching state.

105 State** next_; // Outgoing arrows from State,

106 // one per input byte class

107 };

108

109 enum {

110 kByteEndText = 256, // imaginary byte at end of text

111

112 kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags

113 kFlagMatch = 0x1000, // State.flag_: this is a matching state

114 kFlagLastWord = 0x2000, // State.flag_: last byte was a word char

115 kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left

116 };

117

118 #ifndef STL_MSVC

119 // STL function structures for use with unordered_set.

120 struct StateEqual {

121 bool operator()(const State* a, const State* b) const {

122 if (a == b)

123 return true;

124 if (a == NULL \|\| b == NULL)

125 return false;

126 if (a->ninst_ != b->ninst_)

127 return false;

128 if (a->flag_ != b->flag_)

129 return false;

130 for (int i = 0; i < a->ninst_; i++)

131 if (a->inst_[i] != b->inst_[i])

132 return false;

133 return true; // they're equal

134 }

135 };

136 #endif // STL_MSVC

137 struct StateHash {

138 size_t operator()(const State* a) const {

139 if (a == NULL)

140 return 0;

141 const char* s = reinterpret_cast<const char*>(a->inst_);

142 int len = a->ninst_ * sizeof a->inst_[0];

143 if (sizeof(size_t) == sizeof(uint32))

144 return Hash32StringWithSeed(s, len, a->flag_);

145 else

146 return static_cast<size_t>(Hash64StringWithSeed(s, len, a->flag_));

147 }

148 #ifdef STL_MSVC

149 // Less than operator.

150 bool operator()(const State* a, const State* b) const {

151 if (a == b)

152 return false;

153 if (a == NULL \|\| b == NULL)

154 return a == NULL;

155 if (a->ninst_ != b->ninst_)

156 return a->ninst_ < b->ninst_;

157 if (a->flag_ != b->flag_)

158 return a->flag_ < b->flag_;

159 for (int i = 0; i < a->ninst_; ++i)

160 if (a->inst_[i] != b->inst_[i])

161 return a->inst_[i] < b->inst_[i];

162 return false; // they're equal

163 }

164 // The two public members are required by msvc. 4 and 8 are default values.

165 // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx

166 static const size_t bucket_size = 4;

167 static const size_t min_buckets = 8;

168 #endif // STL_MSVC

169 };

170

171 #ifdef STL_MSVC

172 typedef unordered_set<State*, StateHash> StateSet;

173 #else // !STL_MSVC

174 typedef unordered_set<State*, StateHash, StateEqual> StateSet;

175 #endif // STL_MSVC

176

177

178 private:

179 // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.)

180 enum {

181 kFbUnknown = -1, // No analysis has been performed.

182 kFbMany = -2, // Many bytes will lead out of this state.

183 kFbNone = -3, // No bytes lead out of this state.

184 };

185

186 enum {

187 // Indices into start_ for unanchored searches.

188 // Add kStartAnchored for anchored searches.

189 kStartBeginText = 0, // text at beginning of context

190 kStartBeginLine = 2, // text at beginning of line

191 kStartAfterWordChar = 4, // text follows a word character

192 kStartAfterNonWordChar = 6, // text follows non-word character

193 kMaxStart = 8,

194

195 kStartAnchored = 1,

196 };

197

198 // Resets the DFA State cache, flushing all saved State* information.

199 // Releases and reacquires cache_mutex_ via cache_lock, so any

200 // State* existing before the call are not valid after the call.

201 // Use a StateSaver to preserve important states across the call.

202 // cache_mutex_.r <= L < mutex_

203 // After: cache_mutex_.w <= L < mutex_

204 void ResetCache(RWLocker* cache_lock);

205

206 // Looks up and returns the State corresponding to a Workq.

207 // L >= mutex_

208 State* WorkqToCachedState(Workq* q, uint flag);

209

210 // Looks up and returns a State matching the inst, ninst, and flag.

211 // L >= mutex_

212 State* CachedState(int* inst, int ninst, uint flag);

213

214 // Clear the cache entirely.

215 // Must hold cache_mutex_.w or be in destructor.

216 void ClearCache();

217

218 // Converts a State into a Workq: the opposite of WorkqToCachedState.

219 // L >= mutex_

220 static void StateToWorkq(State* s, Workq* q);

221

222 // Runs a State on a given byte, returning the next state.

223 State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_

224 State* RunStateOnByte(State*, int); // L >= mutex_

225

226 // Runs a Workq on a given byte followed by a set of empty-string flags,

227 // producing a new Workq in nq. If a match instruction is encountered,

228 // sets *ismatch to true.

229 // L >= mutex_

230 void RunWorkqOnByte(Workq* q, Workq* nq,

231 int c, uint flag, bool* ismatch,

232 Prog::MatchKind kind);

233

234 // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.

235 // L >= mutex_

236 void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag);

237

238 // Adds the instruction id to the Workq, following empty arrows

239 // according to flag.

240 // L >= mutex_

241 void AddToQueue(Workq* q, int id, uint flag);

242

243 // For debugging, returns a text representation of State.

244 static string DumpState(State* state);

245

246 // For debugging, returns a text representation of a Workq.

247 static string DumpWorkq(Workq* q);

248

249 // Search parameters

250 struct SearchParams {

251 SearchParams(const StringPiece& text, const StringPiece& context,

252 RWLocker* cache_lock)

253 : text(text), context(context),

254 anchored(false),

255 want_earliest_match(false),

256 run_forward(false),

257 start(NULL),

258 firstbyte(kFbUnknown),

259 cache_lock(cache_lock),

260 failed(false),

261 ep(NULL),

262 matches(NULL) { }

263

264 StringPiece text;

265 StringPiece context;

266 bool anchored;

267 bool want_earliest_match;

268 bool run_forward;

269 State* start;

270 int firstbyte;

271 RWLocker *cache_lock;

272 bool failed; // "out" parameter: whether search gave up

273 const char* ep; // "out" parameter: end pointer for match

274 vector<int>* matches;

275

276 private:

277 DISALLOW_COPY_AND_ASSIGN(SearchParams);

278 };

279

280 // Before each search, the parameters to Search are analyzed by

281 // AnalyzeSearch to determine the state in which to start and the

282 // "firstbyte" for that state, if any.

283 struct StartInfo {

284 StartInfo() : start(NULL), firstbyte(kFbUnknown) { }

285 State* start;

286 volatile int firstbyte;

287 };

288

289 // Fills in params->start and params->firstbyte using

290 // the other search parameters. Returns true on success,

291 // false on failure.

292 // cache_mutex_.r <= L < mutex_

293 bool AnalyzeSearch(SearchParams* params);

294 bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags);

295

296 // The generic search loop, inlined to create specialized versions.

297 // cache_mutex_.r <= L < mutex_

298 // Might unlock and relock cache_mutex_ via params->cache_lock.

299 inline bool InlinedSearchLoop(SearchParams* params,

300 bool have_firstbyte,

301 bool want_earliest_match,

302 bool run_forward);

303

304 // The specialized versions of InlinedSearchLoop. The three letters

305 // at the ends of the name denote the true/false values used as the

306 // last three parameters of InlinedSearchLoop.

307 // cache_mutex_.r <= L < mutex_

308 // Might unlock and relock cache_mutex_ via params->cache_lock.

309 bool SearchFFF(SearchParams* params);

310 bool SearchFFT(SearchParams* params);

311 bool SearchFTF(SearchParams* params);

312 bool SearchFTT(SearchParams* params);

313 bool SearchTFF(SearchParams* params);

314 bool SearchTFT(SearchParams* params);

315 bool SearchTTF(SearchParams* params);

316 bool SearchTTT(SearchParams* params);

317

318 // The main search loop: calls an appropriate specialized version of

319 // InlinedSearchLoop.

320 // cache_mutex_.r <= L < mutex_

321 // Might unlock and relock cache_mutex_ via params->cache_lock.

322 bool FastSearchLoop(SearchParams* params);

323

324 // For debugging, a slow search loop that calls InlinedSearchLoop

325 // directly -- because the booleans passed are not constants, the

326 // loop is not specialized like the SearchFFF etc. versions, so it

327 // runs much more slowly. Useful only for debugging.

328 // cache_mutex_.r <= L < mutex_

329 // Might unlock and relock cache_mutex_ via params->cache_lock.

330 bool SlowSearchLoop(SearchParams* params);

331

332 // Looks up bytes in bytemap_ but handles case c == kByteEndText too.

333 int ByteMap(int c) {

334 if (c == kByteEndText)

335 return prog_->bytemap_range();

336 return prog_->bytemap()[c];

337 }

338

339 // Constant after initialization.

340 Prog* prog_; // The regular expression program to run.

341 Prog::MatchKind kind_; // The kind of DFA.

342 bool init_failed_; // initialization failed (out of memory)

343

344 Mutex mutex_; // mutex_ >= cache_mutex_.r

345

346 // Scratch areas, protected by mutex_.

347 Workq* q0_; // Two pre-allocated work queues.

348 Workq* q1_;

349 int* astack_; // Pre-allocated stack for AddToQueue

350 int nastack_;

351

352 // State* cache. Many threads use and add to the cache simultaneously,

353 // holding cache_mutex_ for reading and mutex_ (above) when adding.

354 // If the cache fills and needs to be discarded, the discarding is done

355 // while holding cache_mutex_ for writing, to avoid interrupting other

356 // readers. Any State* pointers are only valid while cache_mutex_

357 // is held.

358 Mutex cache_mutex_;

359 int64 mem_budget_; // Total memory budget for all States.

360 int64 state_budget_; // Amount of memory remaining for new States.

361 StateSet state_cache_; // All States computed so far.

362 StartInfo start_[kMaxStart];

363 bool cache_warned_; // have printed to LOG(INFO) about the cache

364 };

365

366 // Shorthand for casting to uint8*.

367 static inline const uint8* BytePtr(const void* v) {

368 return reinterpret_cast<const uint8*>(v);

369 }

370

371 // Work queues

372

373 // Marks separate thread groups of different priority

374 // in the work queue when in leftmost-longest matching mode.

375 #define Mark (-1)

376

377 // Internally, the DFA uses a sparse array of

378 // program instruction pointers as a work queue.

379 // In leftmost longest mode, marks separate sections

380 // of workq that started executing at different

381 // locations in the string (earlier locations first).

382 class DFA::Workq : public SparseSet {

383 public:

384 // Constructor: n is number of normal slots, maxmark number of mark slots.

385 Workq(int n, int maxmark) :

386 SparseSet(n+maxmark),

387 n_(n),

388 maxmark_(maxmark),

389 nextmark_(n),

390 last_was_mark_(true) {

391 }

392

393 bool is_mark(int i) { return i >= n_; }

394

395 int maxmark() { return maxmark_; }

396

397 void clear() {

398 SparseSet::clear();

399 nextmark_ = n_;

400 }

401

402 void mark() {

403 if (last_was_mark_)

404 return;

405 last_was_mark_ = false;

406 SparseSet::insert_new(nextmark_++);

407 }

408

409 int size() {

410 return n_ + maxmark_;

411 }

412

413 void insert(int id) {

414 if (contains(id))

415 return;

416 insert_new(id);

417 }

418

419 void insert_new(int id) {

420 last_was_mark_ = false;

421 SparseSet::insert_new(id);

422 }

423

424 private:

425 int n_; // size excluding marks

426 int maxmark_; // maximum number of marks

427 int nextmark_; // id of next mark

428 bool last_was_mark_; // last inserted was mark

429 DISALLOW_COPY_AND_ASSIGN(Workq);

430 };

431

432 DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)

433 : prog_(prog),

434 kind_(kind),

435 init_failed_(false),

436 q0_(NULL),

437 q1_(NULL),

438 astack_(NULL),

439 mem_budget_(max_mem),

440 cache_warned_(false) {

441 if (DebugDFA)

442 fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str ());

443 int nmark = 0;

444 if (kind_ == Prog::kLongestMatch)

445 nmark = prog->size();

446 nastack_ = 2 * prog->size() + nmark;

447

448 // Account for space needed for DFA, q0, q1, astack.

449 mem_budget_ -= sizeof(DFA);

450 mem_budget_ -= (prog_->size() + nmark) *

451 (sizeof(int)+sizeof(int)) * 2; // q0, q1

452 mem_budget_ -= nastack_ * sizeof(int); // astack

453 if (mem_budget_ < 0) {

454 LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",

455 prog_->size(), max_mem);

456 init_failed_ = true;

457 return;

458 }

459

460 state_budget_ = mem_budget_;

461

462 // Make sure there is a reasonable amount of working room left.

463 // At minimum, the search requires room for two states in order

464 // to limp along, restarting frequently. We'll get better performance

465 // if there is room for a larger number of states, say 20.

466 int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) +

467 (prog_->bytemap_range()+1)sizeof(State);

468 if (state_budget_ < 20*one_state) {

469 LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",

470 prog_->size(), max_mem);

471 init_failed_ = true;

472 return;

473 }

474

475 q0_ = new Workq(prog->size(), nmark);

476 q1_ = new Workq(prog->size(), nmark);

477 astack_ = new int[nastack_];

478 }

479

480 DFA::~DFA() {

481 delete q0_;

482 delete q1_;

483 delete[] astack_;

484 ClearCache();

485 }

486

487 // In the DFA state graph, s->next[c] == NULL means that the

488 // state has not yet been computed and needs to be. We need

489 // a different special value to signal that s->next[c] is a

490 // state that can never lead to a match (and thus the search

491 // can be called off). Hence DeadState.

492 #define DeadState reinterpret_cast<State*>(1)

493

494 // Signals that the rest of the string matches no matter what it is.

495 #define FullMatchState reinterpret_cast<State*>(2)

496

497 #define SpecialStateMax FullMatchState

498

499 // Debugging printouts

500

501 // For debugging, returns a string representation of the work queue.

502 string DFA::DumpWorkq(Workq* q) {

503 string s;

504 const char* sep = "";

505 for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) {

506 if (q->is_mark(*it)) {

507 StringAppendF(&s, "\|");

508 sep = "";

509 } else {

510 StringAppendF(&s, "%s%d", sep, *it);

511 sep = ",";

512 }

513 }

514 return s;

515 }

516

517 // For debugging, returns a string representation of the state.

518 string DFA::DumpState(State* state) {

519 if (state == NULL)

520 return "_";

521 if (state == DeadState)

522 return "X";

523 if (state == FullMatchState)

524 return "*";

525 string s;

526 const char* sep = "";

527 StringAppendF(&s, "(%p)", state);

528 for (int i = 0; i < state->ninst_; i++) {

529 if (state->inst_[i] == Mark) {

530 StringAppendF(&s, "\|");

531 sep = "";

532 } else {

533 StringAppendF(&s, "%s%d", sep, state->inst_[i]);

534 sep = ",";

535 }

536 }

537 StringAppendF(&s, " flag=%#x", state->flag_);

538 return s;

539 }

540

541 //////////////////////////////////////////////////////////////////////

542 //

543 // DFA state graph construction.

544 //

545 // The DFA state graph is a heavily-linked collection of State* structures.

546 // The state_cache_ is a set of all the State structures ever allocated,

547 // so that if the same state is reached by two different paths,

548 // the same State structure can be used. This reduces allocation

549 // requirements and also avoids duplication of effort across the two

550 // identical states.

551 //

552 // A State is defined by an ordered list of instruction ids and a flag word.

553 //

554 // The choice of an ordered list of instructions differs from a typical

555 // textbook DFA implementation, which would use an unordered set.

556 // Textbook descriptions, however, only care about whether

557 // the DFA matches, not where it matches in the text. To decide where the

558 // DFA matches, we need to mimic the behavior of the dominant backtracking

559 // implementations like PCRE, which try one possible regular expression

560 // execution, then another, then another, stopping when one of them succeeds.

561 // The DFA execution tries these many executions in parallel, representing

562 // each by an instruction id. These pointers are ordered in the State.inst_

563 // list in the same order that the executions would happen in a backtracking

564 // search: if a match is found during execution of inst_[2], inst_[i] for i>=3

565 // can be discarded.

566 //

567 // Textbooks also typically do not consider context-aware empty string operators

568 // like ^ or $. These are handled by the flag word, which specifies the set

569 // of empty-string operators that should be matched when executing at the

570 // current text position. These flag bits are defined in prog.h.

571 // The flag word also contains two DFA-specific bits: kFlagMatch if the state

572 // is a matching state (one that reached a kInstMatch in the program)

573 // and kFlagLastWord if the last processed byte was a word character, for the

574 // implementation of \B and \b.

575 //

576 // The flag word also contains, shifted up 16 bits, the bits looked for by

577 // any kInstEmptyWidth instructions in the state. These provide a useful

578 // summary indicating when new flags might be useful.

579 //

580 // The permanent representation of a State's instruction ids is just an array,

581 // but while a state is being analyzed, these instruction ids are represented

582 // as a Workq, which is an array that allows iteration in insertion order.

583

584 // NOTE(rsc): The choice of State construction determines whether the DFA

585 // mimics backtracking implementations (so-called leftmost first matching) or

586 // traditional DFA implementations (so-called leftmost longest matching as

587 // prescribed by POSIX). This implementation chooses to mimic the

588 // backtracking implementations, because we want to replace PCRE. To get

589 // POSIX behavior, the states would need to be considered not as a simple

590 // ordered list of instruction ids, but as a list of unordered sets of instructi on

591 // ids. A match by a state in one set would inhibit the running of sets

592 // farther down the list but not other instruction ids in the same set. Each

593 // set would correspond to matches beginning at a given point in the string.

594 // This is implemented by separating different sets with Mark pointers.

595

596 // Looks in the State cache for a State matching q, flag.

597 // If one is found, returns it. If one is not found, allocates one,

598 // inserts it in the cache, and returns it.

599 DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) {

600 if (DEBUG_MODE)

601 mutex_.AssertHeld();

602

603 // Construct array of instruction ids for the new state.

604 // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:

605 // those are the only operators with any effect in

606 // RunWorkqOnEmptyString or RunWorkqOnByte.

607 int* inst = new int[q->size()];

608 int n = 0;

609 uint needflags = 0; // flags needed by kInstEmptyWidth instructions

610 bool sawmatch = false; // whether queue contains guaranteed kInstMatch

611 bool sawmark = false; // whether queue contains a Mark

612 if (DebugDFA)

613 fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);

614 for (Workq::iterator it = q->begin(); it != q->end(); ++it) {

615 int id = *it;

616 if (sawmatch && (kind_ == Prog::kFirstMatch \|\| q->is_mark(id)))

617 break;

618 if (q->is_mark(id)) {

619 if (n > 0 && inst[n-1] != Mark) {

620 sawmark = true;

621 inst[n++] = Mark;

622 }

623 continue;

624 }

625 Prog::Inst* ip = prog_->inst(id);

626 switch (ip->opcode()) {

627 case kInstAltMatch:

628 // This state will continue to a match no matter what

629 // the rest of the input is. If it is the highest priority match

630 // being considered, return the special FullMatchState

631 // to indicate that it's all matches from here out.

632 if (kind_ != Prog::kManyMatch &&

633 (kind_ != Prog::kFirstMatch \|\|

634 (it == q->begin() && ip->greedy(prog_))) &&

635 (kind_ != Prog::kLongestMatch \|\| !sawmark) &&

636 (flag & kFlagMatch)) {

637 delete[] inst;

638 if (DebugDFA)

639 fprintf(stderr, " -> FullMatchState\n");

640 return FullMatchState;

641 }

642 // Fall through.

643 case kInstByteRange: // These are useful.

644 case kInstEmptyWidth:

645 case kInstMatch:

646 case kInstAlt: // Not useful, but necessary [*]

647 inst[n++] = *it;

648 if (ip->opcode() == kInstEmptyWidth)

649 needflags \|= ip->empty();

650 if (ip->opcode() == kInstMatch && !prog_->anchor_end())

651 sawmatch = true;

652 break;

653

654 default: // The rest are not.

655 break;

656 }

657

658 // [*] kInstAlt would seem useless to record in a state, since

659 // we've already followed both its arrows and saved all the

660 // interesting states we can reach from there. The problem

661 // is that one of the empty-width instructions might lead

662 // back to the same kInstAlt (if an empty-width operator is starred),

663 // producing a different evaluation order depending on whether

664 // we keep the kInstAlt to begin with. Sigh.

665 // A specific case that this affects is /(^\|a)+/ matching "a".

666 // If we don't save the kInstAlt, we will match the whole "a" (0,1)

667 // but in fact the correct leftmost-first match is the leading "" (0,0).

668 }

669 DCHECK_LE(n, q->size());

670 if (n > 0 && inst[n-1] == Mark)

671 n--;

672

673 // If there are no empty-width instructions waiting to execute,

674 // then the extra flag bits will not be used, so there is no

675 // point in saving them. (Discarding them reduces the number

676 // of distinct states.)

677 if (needflags == 0)

678 flag &= kFlagMatch;

679

680 // NOTE(rsc): The code above cannot do flag &= needflags,

681 // because if the right flags were present to pass the current

682 // kInstEmptyWidth instructions, new kInstEmptyWidth instructions

683 // might be reached that in turn need different flags.

684 // The only sure thing is that if there are no kInstEmptyWidth

685 // instructions at all, no flags will be needed.

686 // We could do the extra work to figure out the full set of

687 // possibly needed flags by exploring past the kInstEmptyWidth

688 // instructions, but the check above -- are any flags needed

689 // at all? -- handles the most common case. More fine-grained

690 // analysis can only be justified by measurements showing that

691 // too many redundant states are being allocated.

692

693 // If there are no Insts in the list, it's a dead state,

694 // which is useful to signal with a special pointer so that

695 // the execution loop can stop early. This is only okay

696 // if the state is not a matching state.

697 if (n == 0 && flag == 0) {

698 delete[] inst;

699 if (DebugDFA)

700 fprintf(stderr, " -> DeadState\n");

701 return DeadState;

702 }

703

704 // If we're in longest match mode, the state is a sequence of

705 // unordered state sets separated by Marks. Sort each set

706 // to canonicalize, to reduce the number of distinct sets stored.

707 if (kind_ == Prog::kLongestMatch) {

708 int* ip = inst;

709 int* ep = ip + n;

710 while (ip < ep) {

711 int* markp = ip;

712 while (markp < ep && *markp != Mark)

713 markp++;

714 sort(ip, markp);

715 if (markp < ep)

716 markp++;

717 ip = markp;

718 }

719 }

720

721 // Save the needed empty-width flags in the top bits for use later.

722 flag \|= needflags << kFlagNeedShift;

723

724 State* state = CachedState(inst, n, flag);

725 delete[] inst;

726 return state;

727 }

728

729 // Looks in the State cache for a State matching inst, ninst, flag.

730 // If one is found, returns it. If one is not found, allocates one,

731 // inserts it in the cache, and returns it.

732 DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) {

733 if (DEBUG_MODE)

734 mutex_.AssertHeld();

735

736 // Look in the cache for a pre-existing state.

737 State state = { inst, ninst, flag, NULL };

738 StateSet::iterator it = state_cache_.find(&state);

739 if (it != state_cache_.end()) {

740 if (DebugDFA)

741 fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());

742 return *it;

743 }

744

745 // Must have enough memory for new state.

746 // In addition to what we're going to allocate,

747 // the state cache hash table seems to incur about 32 bytes per

748 // State*, empirically.

749 const int kStateCacheOverhead = 32;

750 int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot

751 int mem = sizeof(State) + nnextsizeof(State) + ninst*sizeof(int);

752 if (mem_budget_ < mem + kStateCacheOverhead) {

753 mem_budget_ = -1;

754 return NULL;

755 }

756 mem_budget_ -= mem + kStateCacheOverhead;

757

758 // Allocate new state, along with room for next and inst.

759 char* space = new char[mem];

760 State* s = reinterpret_cast<State*>(space);

761 s->next_ = reinterpret_cast<State**>(s + 1);

762 s->inst_ = reinterpret_cast<int*>(s->next_ + nnext);

763 memset(s->next_, 0, nnext*sizeof s->next_[0]);

764 memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);

765 s->ninst_ = ninst;

766 s->flag_ = flag;

767 if (DebugDFA)

768 fprintf(stderr, " -> %s\n", DumpState(s).c_str());

769

770 // Put state in cache and return it.

771 state_cache_.insert(s);

772 return s;

773 }

774

775 // Clear the cache. Must hold cache_mutex_.w or be in destructor.

776 void DFA::ClearCache() {

777 // In case state_cache_ doesn't support deleting entries

778 // during iteration, copy into a vector and then delete.

779 vector<State*> v;

780 v.reserve(state_cache_.size());

781 for (StateSet::iterator it = state_cache_.begin();

782 it != state_cache_.end(); ++it)

783 v.push_back(*it);

784 state_cache_.clear();

785 for (size_t i = 0; i < v.size(); i++)

786 delete[] reinterpret_cast<const char*>(v[i]);

787 }

788

789 // Copies insts in state s to the work queue q.

790 void DFA::StateToWorkq(State* s, Workq* q) {

791 q->clear();

792 for (int i = 0; i < s->ninst_; i++) {

793 if (s->inst_[i] == Mark)

794 q->mark();

795 else

796 q->insert_new(s->inst_[i]);

797 }

798 }

799

800 // Adds ip to the work queue, following empty arrows according to flag

801 // and expanding kInstAlt instructions (two-target gotos).

802 void DFA::AddToQueue(Workq* q, int id, uint flag) {

803

804 // Use astack_ to hold our stack of states yet to process.

805 // It is sized to have room for nastack_ == 2*prog->size() + nmark

806 // instructions, which is enough: each instruction can be

807 // processed by the switch below only once, and the processing

808 // pushes at most two instructions plus maybe a mark.

809 // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.)

810 int* stk = astack_;

811 int nstk = 0;

812

813 stk[nstk++] = id;

814 while (nstk > 0) {

815 DCHECK_LE(nstk, nastack_);

816 id = stk[--nstk];

817

818 if (id == Mark) {

819 q->mark();

820 continue;

821 }

822

823 if (id == 0)

824 continue;

825

826 // If ip is already on the queue, nothing to do.

827 // Otherwise add it. We don't actually keep all the ones

828 // that get added -- for example, kInstAlt is ignored

829 // when on a work queue -- but adding all ip's here

830 // increases the likelihood of q->contains(id),

831 // reducing the amount of duplicated work.

832 if (q->contains(id))

833 continue;

834 q->insert_new(id);

835

836 // Process instruction.

837 Prog::Inst* ip = prog_->inst(id);

838 switch (ip->opcode()) {

839 case kInstFail: // can't happen: discarded above

840 break;

841

842 case kInstByteRange: // just save these on the queue

843 case kInstMatch:

844 break;

845

846 case kInstCapture: // DFA treats captures as no-ops.

847 case kInstNop:

848 stk[nstk++] = ip->out();

849 break;

850

851 case kInstAlt: // two choices: expand both, in order

852 case kInstAltMatch:

853 // Want to visit out then out1, so push on stack in reverse order.

854 // This instruction is the [00-FF]* loop at the beginning of

855 // a leftmost-longest unanchored search, separate out from out1

856 // with a Mark, so that out1's threads (which will start farther

857 // to the right in the string being searched) are lower priority

858 // than the current ones.

859 stk[nstk++] = ip->out1();

860 if (q->maxmark() > 0 &&

861 id == prog_->start_unanchored() && id != prog_->start())

862 stk[nstk++] = Mark;

863 stk[nstk++] = ip->out();

864 break;

865

866 case kInstEmptyWidth:

867 // Continue on if we have all the right flag bits.

868 if (ip->empty() & ~flag)

869 break;

870 stk[nstk++] = ip->out();

871 break;

872 }

873 }

874 }

875

876 // Running of work queues. In the work queue, order matters:

877 // the queue is sorted in priority order. If instruction i comes before j,

878 // then the instructions that i produces during the run must come before

879 // the ones that j produces. In order to keep this invariant, all the

880 // work queue runners have to take an old queue to process and then

881 // also a new queue to fill in. It's not acceptable to add to the end of

882 // an existing queue, because new instructions will not end up in the

883 // correct position.

884

885 // Runs the work queue, processing the empty strings indicated by flag.

886 // For example, flag == kEmptyBeginLine\|kEmptyEndLine means to match

887 // both ^ and $. It is important that callers pass all flags at once:

888 // processing both ^ and $ is not the same as first processing only ^

889 // and then processing only $. Doing the two-step sequence won't match

890 // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior

891 // exhibited by existing implementations).

892 void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) {

893 newq->clear();

894 for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {

895 if (oldq->is_mark(*i))

896 AddToQueue(newq, Mark, flag);

897 else

898 AddToQueue(newq, *i, flag);

899 }

900 }

901

902 // Runs the work queue, processing the single byte c followed by any empty

903 // strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine,

904 // means to match c$. Sets the bool *ismatch to true if the end of the

905 // regular expression program has been reached (the regexp has matched).

906 void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,

907 int c, uint flag, bool* ismatch,

908 Prog::MatchKind kind) {

909 if (DEBUG_MODE)

910 mutex_.AssertHeld();

911

912 newq->clear();

913 for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {

914 if (oldq->is_mark(*i)) {

915 if (*ismatch)

916 return;

917 newq->mark();

918 continue;

919 }

920 int id = *i;

921 Prog::Inst* ip = prog_->inst(id);

922 switch (ip->opcode()) {

923 case kInstFail: // never succeeds

924 case kInstCapture: // already followed

925 case kInstNop: // already followed

926 case kInstAlt: // already followed

927 case kInstAltMatch: // already followed

928 case kInstEmptyWidth: // already followed

929 break;

930

931 case kInstByteRange: // can follow if c is in range

932 if (ip->Matches(c))

933 AddToQueue(newq, ip->out(), flag);

934 break;

935

936 case kInstMatch:

937 if (prog_->anchor_end() && c != kByteEndText)

938 break;

939 *ismatch = true;

940 if (kind == Prog::kFirstMatch) {

941 // Can stop processing work queue since we found a match.

942 return;

943 }

944 break;

945 }

946 }

947

948 if (DebugDFA)

949 fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(),

950 c, flag, DumpWorkq(newq).c_str(), *ismatch);

951 }

952

953 // Processes input byte c in state, returning new state.

954 // Caller does not hold mutex.

955 DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {

956 // Keep only one RunStateOnByte going

957 // even if the DFA is being run by multiple threads.

958 MutexLock l(&mutex_);

959 return RunStateOnByte(state, c);

960 }

961

962 // Processes input byte c in state, returning new state.

963 DFA::State* DFA::RunStateOnByte(State* state, int c) {

964 if (DEBUG_MODE)

965 mutex_.AssertHeld();

966 if (state <= SpecialStateMax) {

967 if (state == FullMatchState) {

968 // It is convenient for routines like PossibleMatchRange

969 // if we implement RunStateOnByte for FullMatchState:

970 // once you get into this state you never get out,

971 // so it's pretty easy.

972 return FullMatchState;

973 }

974 if (state == DeadState) {

975 LOG(DFATAL) << "DeadState in RunStateOnByte";

976 return NULL;

977 }

978 if (state == NULL) {

979 LOG(DFATAL) << "NULL state in RunStateOnByte";

980 return NULL;

981 }

982 LOG(DFATAL) << "Unexpected special state in RunStateOnByte";

983 return NULL;

984 }

985

986 // If someone else already computed this, return it.

987 State* ns;

988 ATOMIC_LOAD_CONSUME(ns, &state->next_[ByteMap(c)]);

989 if (ns != NULL)

990 return ns;

991

992 // Convert state into Workq.

993 StateToWorkq(state, q0_);

994

995 // Flags marking the kinds of empty-width things (^ $ etc)

996 // around this byte. Before the byte we have the flags recorded

997 // in the State structure itself. After the byte we have

998 // nothing yet (but that will change: read on).

999 uint needflag = state->flag_ >> kFlagNeedShift;

1000 uint beforeflag = state->flag_ & kFlagEmptyMask;

1001 uint oldbeforeflag = beforeflag;

1002 uint afterflag = 0;

1003

1004 if (c == '\n') {

1005 // Insert implicit $ and ^ around \n

1006 beforeflag \|= kEmptyEndLine;

1007 afterflag \|= kEmptyBeginLine;

1008 }

1009

1010 if (c == kByteEndText) {

1011 // Insert implicit $ and \z before the fake "end text" byte.

1012 beforeflag \|= kEmptyEndLine \| kEmptyEndText;

1013 }

1014

1015 // The state flag kFlagLastWord says whether the last

1016 // byte processed was a word character. Use that info to

1017 // insert empty-width (non-)word boundaries.

1018 bool islastword = (state->flag_ & kFlagLastWord) != 0;

1019 bool isword = (c != kByteEndText && Prog::IsWordChar(static_cast<uint8>(c)));

1020 if (isword == islastword)

1021 beforeflag \|= kEmptyNonWordBoundary;

1022 else

1023 beforeflag \|= kEmptyWordBoundary;

1024

1025 // Okay, finally ready to run.

1026 // Only useful to rerun on empty string if there are new, useful flags.

1027 if (beforeflag & ~oldbeforeflag & needflag) {

1028 RunWorkqOnEmptyString(q0_, q1_, beforeflag);

1029 swap(q0_, q1_);

1030 }

1031 bool ismatch = false;

1032 RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_);

1033

1034 // Most of the time, we build the state from the output of

1035 // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that

1036 // RE2::Set can tell exactly which match instructions

1037 // contributed to the match, don't swap if c is kByteEndText.

1038 // The resulting state wouldn't be correct for further processing

1039 // of the string, but we're at the end of the text so that's okay.

1040 // Leaving q0_ alone preseves the match instructions that led to

1041 // the current setting of ismatch.

1042 if (c != kByteEndText \|\| kind_ != Prog::kManyMatch)

1043 swap(q0_, q1_);

1044

1045 // Save afterflag along with ismatch and isword in new state.

1046 uint flag = afterflag;

1047 if (ismatch)

1048 flag \|= kFlagMatch;

1049 if (isword)

1050 flag \|= kFlagLastWord;

1051

1052 ns = WorkqToCachedState(q0_, flag);

1053

1054 // Flush ns before linking to it.

1055 // Write barrier before updating state->next_ so that the

1056 // main search loop can proceed without any locking, for speed.

1057 // (Otherwise it would need one mutex operation per input byte.)

1058 ATOMIC_STORE_RELEASE(&state->next_[ByteMap(c)], ns);

1059 return ns;

1060 }

1061

1062

1063 //////////////////////////////////////////////////////////////////////

1064 // DFA cache reset.

1065

1066 // Reader-writer lock helper.

1067 //

1068 // The DFA uses a reader-writer mutex to protect the state graph itself.

1069 // Traversing the state graph requires holding the mutex for reading,

1070 // and discarding the state graph and starting over requires holding the

1071 // lock for writing. If a search needs to expand the graph but is out

1072 // of memory, it will need to drop its read lock and then acquire the

1073 // write lock. Since it cannot then atomically downgrade from write lock

1074 // to read lock, it runs the rest of the search holding the write lock.

1075 // (This probably helps avoid repeated contention, but really the decision

1076 // is forced by the Mutex interface.) It's a bit complicated to keep

1077 // track of whether the lock is held for reading or writing and thread

1078 // that through the search, so instead we encapsulate it in the RWLocker

1079 // and pass that around.

1080

1081 class DFA::RWLocker {

1082 public:

1083 explicit RWLocker(Mutex* mu);

1084 ~RWLocker();

1085

1086 // If the lock is only held for reading right now,

1087 // drop the read lock and re-acquire for writing.

1088 // Subsequent calls to LockForWriting are no-ops.

1089 // Notice that the lock is released temporarily.

1090 void LockForWriting();

1091

1092 // Returns whether the lock is already held for writing.

1093 bool IsLockedForWriting() {

1094 return writing_;

1095 }

1096

1097 private:

1098 Mutex* mu_;

1099 bool writing_;

1100

1101 DISALLOW_COPY_AND_ASSIGN(RWLocker);

1102 };

1103

1104 DFA::RWLocker::RWLocker(Mutex* mu)

1105 : mu_(mu), writing_(false) {

1106

1107 mu_->ReaderLock();

1108 }

1109

1110 // This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations

1111 // does not support lock upgrade.

1112 void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {

1113 if (!writing_) {

1114 mu_->ReaderUnlock();

1115 mu_->Lock();

1116 writing_ = true;

1117 }

1118 }

1119

1120 DFA::RWLocker::~RWLocker() {

1121 if (writing_)

1122 mu_->WriterUnlock();

1123 else

1124 mu_->ReaderUnlock();

1125 }

1126

1127

1128 // When the DFA's State cache fills, we discard all the states in the

1129 // cache and start over. Many threads can be using and adding to the

1130 // cache at the same time, so we synchronize using the cache_mutex_

1131 // to keep from stepping on other threads. Specifically, all the

1132 // threads using the current cache hold cache_mutex_ for reading.

1133 // When a thread decides to flush the cache, it drops cache_mutex_

1134 // and then re-acquires it for writing. That ensures there are no

1135 // other threads accessing the cache anymore. The rest of the search

1136 // runs holding cache_mutex_ for writing, avoiding any contention

1137 // with or cache pollution caused by other threads.

1138

1139 void DFA::ResetCache(RWLocker* cache_lock) {

1140 // Re-acquire the cache_mutex_ for writing (exclusive use).

1141 bool was_writing = cache_lock->IsLockedForWriting();

1142 cache_lock->LockForWriting();

1143

1144 // If we already held cache_mutex_ for writing, it means

1145 // this invocation of Search() has already reset the

1146 // cache once already. That's a pretty clear indication

1147 // that the cache is too small. Warn about that, once.

1148 // TODO(rsc): Only warn if state_cache_.size() < some threshold.

1149 if (was_writing && !cache_warned_) {

1150 LOG(INFO) << "DFA memory cache could be too small: "

1151 << "only room for " << state_cache_.size() << " states.";

1152 cache_warned_ = true;

1153 }

1154

1155 // Clear the cache, reset the memory budget.

1156 for (int i = 0; i < kMaxStart; i++) {

1157 start_[i].start = NULL;

1158 start_[i].firstbyte = kFbUnknown;

1159 }

1160 ClearCache();

1161 mem_budget_ = state_budget_;

1162 }

1163

1164 // Typically, a couple States do need to be preserved across a cache

1165 // reset, like the State at the current point in the search.

1166 // The StateSaver class helps keep States across cache resets.

1167 // It makes a copy of the state's guts outside the cache (before the reset)

1168 // and then can be asked, after the reset, to recreate the State

1169 // in the new cache. For example, in a DFA method ("this" is a DFA):

1170 //

1171 // StateSaver saver(this, s);

1172 // ResetCache(cache_lock);

1173 // s = saver.Restore();

1174 //

1175 // The saver should always have room in the cache to re-create the state,

1176 // because resetting the cache locks out all other threads, and the cache

1177 // is known to have room for at least a couple states (otherwise the DFA

1178 // constructor fails).

1179

1180 class DFA::StateSaver {

1181 public:

1182 explicit StateSaver(DFA* dfa, State* state);

1183 ~StateSaver();

1184

1185 // Recreates and returns a state equivalent to the

1186 // original state passed to the constructor.

1187 // Returns NULL if the cache has filled, but

1188 // since the DFA guarantees to have room in the cache

1189 // for a couple states, should never return NULL

1190 // if used right after ResetCache.

1191 State* Restore();

1192

1193 private:

1194 DFA* dfa_; // the DFA to use

1195 int* inst_; // saved info from State

1196 int ninst_;

1197 uint flag_;

1198 bool is_special_; // whether original state was special

1199 State* special_; // if is_special_, the original state

1200

1201 DISALLOW_COPY_AND_ASSIGN(StateSaver);

1202 };

1203

1204 DFA::StateSaver::StateSaver(DFA* dfa, State* state) {

1205 dfa_ = dfa;

1206 if (state <= SpecialStateMax) {

1207 inst_ = NULL;

1208 ninst_ = 0;

1209 flag_ = 0;

1210 is_special_ = true;

1211 special_ = state;

1212 return;

1213 }

1214 is_special_ = false;

1215 special_ = NULL;

1216 flag_ = state->flag_;

1217 ninst_ = state->ninst_;

1218 inst_ = new int[ninst_];

1219 memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);

1220 }

1221

1222 DFA::StateSaver::~StateSaver() {

1223 if (!is_special_)

1224 delete[] inst_;

1225 }

1226

1227 DFA::State* DFA::StateSaver::Restore() {

1228 if (is_special_)

1229 return special_;

1230 MutexLock l(&dfa_->mutex_);

1231 State* s = dfa_->CachedState(inst_, ninst_, flag_);

1232 if (s == NULL)

1233 LOG(DFATAL) << "StateSaver failed to restore state.";

1234 return s;

1235 }

1236

1237

1238 //////////////////////////////////////////////////////////////////////

1239 //

1240 // DFA execution.

1241 //

1242 // The basic search loop is easy: start in a state s and then for each

1243 // byte c in the input, s = s->next[c].

1244 //

1245 // This simple description omits a few efficiency-driven complications.

1246 //

1247 // First, the State graph is constructed incrementally: it is possible

1248 // that s->next[c] is null, indicating that that state has not been

1249 // fully explored. In this case, RunStateOnByte must be invoked to

1250 // determine the next state, which is cached in s->next[c] to save

1251 // future effort. An alternative reason for s->next[c] to be null is

1252 // that the DFA has reached a so-called "dead state", in which any match

1253 // is no longer possible. In this case RunStateOnByte will return NULL

1254 // and the processing of the string can stop early.

1255 //

1256 // Second, a 256-element pointer array for s->next_ makes each State

1257 // quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[]

1258 // maps from bytes to "byte classes" and then next_ only needs to have

1259 // as many pointers as there are byte classes. A byte class is simply a

1260 // range of bytes that the regexp never distinguishes between.

1261 // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,

1262 // 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit

1263 // but in exchange we typically cut the size of a State (and thus our

1264 // memory footprint) by about 5-10x. The comments still refer to

1265 // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].

1266 //

1267 // Third, it is common for a DFA for an unanchored match to begin in a

1268 // state in which only one particular byte value can take the DFA to a

1269 // different state. That is, s->next[c] != s for only one c. In this

1270 // situation, the DFA can do better than executing the simple loop.

1271 // Instead, it can call memchr to search very quickly for the byte c.

1272 // Whether the start state has this property is determined during a

1273 // pre-compilation pass, and if so, the byte b is passed to the search

1274 // loop as the "firstbyte" argument, along with a boolean "have_firstbyte".

1275 //

1276 // Fourth, the desired behavior is to search for the leftmost-best match

1277 // (approximately, the same one that Perl would find), which is not

1278 // necessarily the match ending earliest in the string. Each time a

1279 // match is found, it must be noted, but the DFA must continue on in

1280 // hope of finding a higher-priority match. In some cases, the caller only

1281 // cares whether there is any match at all, not which one is found.

1282 // The "want_earliest_match" flag causes the search to stop at the first

1283 // match found.

1284 //

1285 // Fifth, one algorithm that uses the DFA needs it to run over the

1286 // input string backward, beginning at the end and ending at the beginning.

1287 // Passing false for the "run_forward" flag causes the DFA to run backward.

1288 //

1289 // The checks for these last three cases, which in a naive implementation

1290 // would be performed once per input byte, slow the general loop enough

1291 // to merit specialized versions of the search loop for each of the

1292 // eight possible settings of the three booleans. Rather than write

1293 // eight different functions, we write one general implementation and then

1294 // inline it to create the specialized ones.

1295 //

1296 // Note that matches are delayed by one byte, to make it easier to

1297 // accomodate match conditions depending on the next input byte (like $ and \b).

1298 // When s->next[c]->IsMatch(), it means that there is a match ending just

1299 // before byte c.

1300

1301 // The generic search loop. Searches text for a match, returning

1302 // the pointer to the end of the chosen match, or NULL if no match.

1303 // The bools are equal to the same-named variables in params, but

1304 // making them function arguments lets the inliner specialize

1305 // this function to each combination (see two paragraphs above).

1306 inline bool DFA::InlinedSearchLoop(SearchParams* params,

1307 bool have_firstbyte,

1308 bool want_earliest_match,

1309 bool run_forward) {

1310 State* start = params->start;

1311 const uint8* bp = BytePtr(params->text.begin()); // start of text

1312 const uint8* p = bp; // text scanning point

1313 const uint8* ep = BytePtr(params->text.end()); // end of text

1314 const uint8* resetp = NULL; // p at last cache reset

1315 if (!run_forward)

1316 swap(p, ep);

1317

1318 const uint8* bytemap = prog_->bytemap();

1319 const uint8* lastmatch = NULL; // most recent matching position in text

1320 bool matched = false;

1321 State* s = start;

1322

1323 if (s->IsMatch()) {

1324 matched = true;

1325 lastmatch = p;

1326 if (want_earliest_match) {

1327 params->ep = reinterpret_cast<const char*>(lastmatch);

1328 return true;

1329 }

1330 }

1331

1332 while (p != ep) {

1333 if (DebugDFA)

1334 fprintf(stderr, "@%d: %s\n", static_cast<int>(p - bp),

1335 DumpState(s).c_str());

1336 if (have_firstbyte && s == start) {

1337 // In start state, only way out is to find firstbyte,

1338 // so use optimized assembly in memchr to skip ahead.

1339 // If firstbyte isn't found, we can skip to the end

1340 // of the string.

1341 if (run_forward) {

1342 if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) {

1343 p = ep;

1344 break;

1345 }

1346 } else {

1347 if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) {

1348 p = ep;

1349 break;

1350 }

1351 p++;

1352 }

1353 }

1354

1355 int c;

1356 if (run_forward)

1357 c = *p++;

1358 else

1359 c = *--p;

1360

1361 // Note that multiple threads might be consulting

1362 // s->next_[bytemap[c]] simultaneously.

1363 // RunStateOnByte takes care of the appropriate locking,

1364 // including a memory barrier so that the unlocked access

1365 // (sometimes known as "double-checked locking") is safe.

1366 // The alternative would be either one DFA per thread

1367 // or one mutex operation per input byte.

1368 //

1369 // ns == DeadState means the state is known to be dead

1370 // (no more matches are possible).

1371 // ns == NULL means the state has not yet been computed

1372 // (need to call RunStateOnByteUnlocked).

1373 // RunStateOnByte returns ns == NULL if it is out of memory.

1374 // ns == FullMatchState means the rest of the string matches.

1375 //

1376 // Okay to use bytemap[] not ByteMap() here, because

1377 // c is known to be an actual byte and not kByteEndText.

1378

1379 State* ns;

1380 ATOMIC_LOAD_CONSUME(ns, &s->next_[bytemap[c]]);

1381 if (ns == NULL) {

1382 ns = RunStateOnByteUnlocked(s, c);

1383 if (ns == NULL) {

1384 // After we reset the cache, we hold cache_mutex exclusively,

1385 // so if resetp != NULL, it means we filled the DFA state

1386 // cache with this search alone (without any other threads).

1387 // Benchmarks show that doing a state computation on every

1388 // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the

1389 // same at about 2 MB/s. Unless we're processing an average

1390 // of 10 bytes per state computation, fail so that RE2 can

1391 // fall back to the NFA.

1392 if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL &&

1393 static_cast<unsigned long>(p - resetp) < 10*state_cache_.size()) {

1394 params->failed = true;

1395 return false;

1396 }

1397 resetp = p;

1398

1399 // Prepare to save start and s across the reset.

1400 StateSaver save_start(this, start);

1401 StateSaver save_s(this, s);

1402

1403 // Discard all the States in the cache.

1404 ResetCache(params->cache_lock);

1405

1406 // Restore start and s so we can continue.

1407 if ((start = save_start.Restore()) == NULL \|\|

1408 (s = save_s.Restore()) == NULL) {

1409 // Restore already did LOG(DFATAL).

1410 params->failed = true;

1411 return false;

1412 }

1413 ns = RunStateOnByteUnlocked(s, c);

1414 if (ns == NULL) {

1415 LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";

1416 params->failed = true;

1417 return false;

1418 }

1419 }

1420 }

1421 if (ns <= SpecialStateMax) {

1422 if (ns == DeadState) {

1423 params->ep = reinterpret_cast<const char*>(lastmatch);

1424 return matched;

1425 }

1426 // FullMatchState

1427 params->ep = reinterpret_cast<const char*>(ep);

1428 return true;

1429 }

1430 s = ns;

1431

1432 if (s->IsMatch()) {

1433 matched = true;

1434 // The DFA notices the match one byte late,

1435 // so adjust p before using it in the match.

1436 if (run_forward)

1437 lastmatch = p - 1;

1438 else

1439 lastmatch = p + 1;

1440 if (DebugDFA)

1441 fprintf(stderr, "match @%d! [%s]\n",

1442 static_cast<int>(lastmatch - bp),

1443 DumpState(s).c_str());

1444

1445 if (want_earliest_match) {

1446 params->ep = reinterpret_cast<const char*>(lastmatch);

1447 return true;

1448 }

1449 }

1450 }

1451

1452 // Process one more byte to see if it triggers a match.

1453 // (Remember, matches are delayed one byte.)

1454 int lastbyte;

1455 if (run_forward) {

1456 if (params->text.end() == params->context.end())

1457 lastbyte = kByteEndText;

1458 else

1459 lastbyte = params->text.end()[0] & 0xFF;

1460 } else {

1461 if (params->text.begin() == params->context.begin())

1462 lastbyte = kByteEndText;

1463 else

1464 lastbyte = params->text.begin()[-1] & 0xFF;

1465 }

1466

1467 State* ns;

1468 ATOMIC_LOAD_CONSUME(ns, &s->next_[ByteMap(lastbyte)]);

1469 if (ns == NULL) {

1470 ns = RunStateOnByteUnlocked(s, lastbyte);

1471 if (ns == NULL) {

1472 StateSaver save_s(this, s);

1473 ResetCache(params->cache_lock);

1474 if ((s = save_s.Restore()) == NULL) {

1475 params->failed = true;

1476 return false;

1477 }

1478 ns = RunStateOnByteUnlocked(s, lastbyte);

1479 if (ns == NULL) {

1480 LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";

1481 params->failed = true;

1482 return false;

1483 }

1484 }

1485 }

1486 s = ns;

1487 if (DebugDFA)

1488 fprintf(stderr, "@_: %s\n", DumpState(s).c_str());

1489 if (s == FullMatchState) {

1490 params->ep = reinterpret_cast<const char*>(ep);

1491 return true;

1492 }

1493 if (s > SpecialStateMax && s->IsMatch()) {

1494 matched = true;

1495 lastmatch = p;

1496 if (params->matches && kind_ == Prog::kManyMatch) {

1497 vector<int>* v = params->matches;

1498 v->clear();

1499 for (int i = 0; i < s->ninst_; i++) {

1500 Prog::Inst* ip = prog_->inst(s->inst_[i]);

1501 if (ip->opcode() == kInstMatch)

1502 v->push_back(ip->match_id());

1503 }

1504 }

1505 if (DebugDFA)

1506 fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp),

1507 DumpState(s).c_str());

1508 }

1509 params->ep = reinterpret_cast<const char*>(lastmatch);

1510 return matched;

1511 }

1512

1513 // Inline specializations of the general loop.

1514 bool DFA::SearchFFF(SearchParams* params) {

1515 return InlinedSearchLoop(params, 0, 0, 0);

1516 }

1517 bool DFA::SearchFFT(SearchParams* params) {

1518 return InlinedSearchLoop(params, 0, 0, 1);

1519 }

1520 bool DFA::SearchFTF(SearchParams* params) {

1521 return InlinedSearchLoop(params, 0, 1, 0);

1522 }

1523 bool DFA::SearchFTT(SearchParams* params) {

1524 return InlinedSearchLoop(params, 0, 1, 1);

1525 }

1526 bool DFA::SearchTFF(SearchParams* params) {

1527 return InlinedSearchLoop(params, 1, 0, 0);

1528 }

1529 bool DFA::SearchTFT(SearchParams* params) {

1530 return InlinedSearchLoop(params, 1, 0, 1);

1531 }

1532 bool DFA::SearchTTF(SearchParams* params) {

1533 return InlinedSearchLoop(params, 1, 1, 0);

1534 }

1535 bool DFA::SearchTTT(SearchParams* params) {

1536 return InlinedSearchLoop(params, 1, 1, 1);

1537 }

1538

1539 // For debugging, calls the general code directly.

1540 bool DFA::SlowSearchLoop(SearchParams* params) {

1541 return InlinedSearchLoop(params,

1542 params->firstbyte >= 0,

1543 params->want_earliest_match,

1544 params->run_forward);

1545 }

1546

1547 // For performance, calls the appropriate specialized version

1548 // of InlinedSearchLoop.

1549 bool DFA::FastSearchLoop(SearchParams* params) {

1550 // Because the methods are private, the Searches array

1551 // cannot be declared at top level.

1552 static bool (DFA::Searches[])(SearchParams) = {

1553 &DFA::SearchFFF,

1554 &DFA::SearchFFT,

1555 &DFA::SearchFTF,

1556 &DFA::SearchFTT,

1557 &DFA::SearchTFF,

1558 &DFA::SearchTFT,

1559 &DFA::SearchTTF,

1560 &DFA::SearchTTT,

1561 };

1562

1563 bool have_firstbyte = (params->firstbyte >= 0);

1564 int index = 4 * have_firstbyte +

1565 2 * params->want_earliest_match +

1566 1 * params->run_forward;

1567 return (this->*Searches[index])(params);

1568 }

1569

1570

1571 // The discussion of DFA execution above ignored the question of how

1572 // to determine the initial state for the search loop. There are two

1573 // factors that influence the choice of start state.

1574 //

1575 // The first factor is whether the search is anchored or not.

1576 // The regexp program (Prog*) itself has

1577 // two different entry points: one for anchored searches and one for

1578 // unanchored searches. (The unanchored version starts with a leading ".*?"

1579 // and then jumps to the anchored one.)

1580 //

1581 // The second factor is where text appears in the larger context, which

1582 // determines which empty-string operators can be matched at the beginning

1583 // of execution. If text is at the very beginning of context, \A and ^ match.

1584 // Otherwise if text is at the beginning of a line, then ^ matches.

1585 // Otherwise it matters whether the character before text is a word character

1586 // or a non-word character.

1587 //

1588 // The two cases (unanchored vs not) and four cases (empty-string flags)

1589 // combine to make the eight cases recorded in the DFA's begin_text_[2],

1590 // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached

1591 // StartInfos. The start state for each is filled in the first time it

1592 // is used for an actual search.

1593

1594 // Examines text, context, and anchored to determine the right start

1595 // state for the DFA search loop. Fills in params and returns true on success.

1596 // Returns false on failure.

1597 bool DFA::AnalyzeSearch(SearchParams* params) {

1598 const StringPiece& text = params->text;

1599 const StringPiece& context = params->context;

1600

1601 // Sanity check: make sure that text lies within context.

1602 if (text.begin() < context.begin() \|\| text.end() > context.end()) {

1603 LOG(DFATAL) << "Text is not inside context.";

1604 params->start = DeadState;

1605 return true;

1606 }

1607

1608 // Determine correct search type.

1609 int start;

1610 uint flags;

1611 if (params->run_forward) {

1612 if (text.begin() == context.begin()) {

1613 start = kStartBeginText;

1614 flags = kEmptyBeginText\|kEmptyBeginLine;

1615 } else if (text.begin()[-1] == '\n') {

1616 start = kStartBeginLine;

1617 flags = kEmptyBeginLine;

1618 } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {

1619 start = kStartAfterWordChar;

1620 flags = kFlagLastWord;

1621 } else {

1622 start = kStartAfterNonWordChar;

1623 flags = 0;

1624 }

1625 } else {

1626 if (text.end() == context.end()) {

1627 start = kStartBeginText;

1628 flags = kEmptyBeginText\|kEmptyBeginLine;

1629 } else if (text.end()[0] == '\n') {

1630 start = kStartBeginLine;

1631 flags = kEmptyBeginLine;

1632 } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {

1633 start = kStartAfterWordChar;

1634 flags = kFlagLastWord;

1635 } else {

1636 start = kStartAfterNonWordChar;

1637 flags = 0;

1638 }

1639 }

1640 if (params->anchored \|\| prog_->anchor_start())

1641 start \|= kStartAnchored;

1642 StartInfo* info = &start_[start];

1643

1644 // Try once without cache_lock for writing.

1645 // Try again after resetting the cache

1646 // (ResetCache will relock cache_lock for writing).

1647 if (!AnalyzeSearchHelper(params, info, flags)) {

1648 ResetCache(params->cache_lock);

1649 if (!AnalyzeSearchHelper(params, info, flags)) {

1650 LOG(DFATAL) << "Failed to analyze start state.";

1651 params->failed = true;

1652 return false;

1653 }

1654 }

1655

1656 if (DebugDFA) {

1657 int fb;

1658 ATOMIC_LOAD_RELAXED(fb, &info->firstbyte);

1659 fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n",

1660 params->anchored, params->run_forward, flags,

1661 DumpState(info->start).c_str(), fb);

1662 }

1663

1664 params->start = info->start;

1665 ATOMIC_LOAD_ACQUIRE(params->firstbyte, &info->firstbyte);

1666

1667 return true;

1668 }

1669

1670 // Fills in info if needed. Returns true on success, false on failure.

1671 bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,

1672 uint flags) {

1673 // Quick check.

1674 int fb;

1675 ATOMIC_LOAD_ACQUIRE(fb, &info->firstbyte);

1676 if (fb != kFbUnknown)

1677 return true;

1678

1679 MutexLock l(&mutex_);

1680 if (info->firstbyte != kFbUnknown)

1681 return true;

1682

1683 q0_->clear();

1684 AddToQueue(q0_,

1685 params->anchored ? prog_->start() : prog_->start_unanchored(),

1686 flags);

1687 info->start = WorkqToCachedState(q0_, flags);

1688 if (info->start == NULL)

1689 return false;

1690

1691 if (info->start == DeadState) {

1692 // Synchronize with "quick check" above.

1693 ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone);

1694 return true;

1695 }

1696

1697 if (info->start == FullMatchState) {

1698 // Synchronize with "quick check" above.

1699 ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone); // will be ignored

1700 return true;

1701 }

1702

1703 // Compute info->firstbyte by running state on all

1704 // possible byte values, looking for a single one that

1705 // leads to a different state.

1706 int firstbyte = kFbNone;

1707 for (int i = 0; i < 256; i++) {

1708 State* s = RunStateOnByte(info->start, i);

1709 if (s == NULL) {

1710 // Synchronize with "quick check" above.

1711 ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte);

1712 return false;

1713 }

1714 if (s == info->start)

1715 continue;

1716 // Goes to new state...

1717 if (firstbyte == kFbNone) {

1718 firstbyte = i; // ... first one

1719 } else {

1720 firstbyte = kFbMany; // ... too many

1721 break;

1722 }

1723 }

1724 // Synchronize with "quick check" above.

1725 ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte);

1726 return true;

1727 }

1728

1729 // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.

1730 bool DFA::Search(const StringPiece& text,

1731 const StringPiece& context,

1732 bool anchored,

1733 bool want_earliest_match,

1734 bool run_forward,

1735 bool* failed,

1736 const char** epp,

1737 vector<int>* matches) {

1738 *epp = NULL;

1739 if (!ok()) {

1740 *failed = true;

1741 return false;

1742 }

1743 *failed = false;

1744

1745 if (DebugDFA) {

1746 fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());

1747 fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",

1748 text.as_string().c_str(), anchored, want_earliest_match,

1749 run_forward, kind_);

1750 }

1751

1752 RWLocker l(&cache_mutex_);

1753 SearchParams params(text, context, &l);

1754 params.anchored = anchored;

1755 params.want_earliest_match = want_earliest_match;

1756 params.run_forward = run_forward;

1757 params.matches = matches;

1758

1759 if (!AnalyzeSearch(&params)) {

1760 *failed = true;

1761 return false;

1762 }

1763 if (params.start == DeadState)

1764 return false;

1765 if (params.start == FullMatchState) {

1766 if (run_forward == want_earliest_match)

1767 *epp = text.begin();

1768 else

1769 *epp = text.end();

1770 return true;

1771 }

1772 if (DebugDFA)

1773 fprintf(stderr, "start %s\n", DumpState(params.start).c_str());

1774 bool ret = FastSearchLoop(&params);

1775 if (params.failed) {

1776 *failed = true;

1777 return false;

1778 }

1779 *epp = params.ep;

1780 return ret;

1781 }

1782

1783 // Deletes dfa.

1784 //

1785 // This is a separate function so that

1786 // prog.h can be used without moving the definition of

1787 // class DFA out of this file. If you set

1788 // prog->dfa_ = dfa;

1789 // then you also have to set

1790 // prog->delete_dfa_ = DeleteDFA;

1791 // so that ~Prog can delete the dfa.

1792 static void DeleteDFA(DFA* dfa) {

1793 delete dfa;

1794 }

1795

1796 DFA* Prog::GetDFA(MatchKind kind) {

1797 DFAvolatile pdfa;

1798 if (kind == kFirstMatch \|\| kind == kManyMatch) {

1799 pdfa = &dfa_first_;

1800 } else {

1801 kind = kLongestMatch;

1802 pdfa = &dfa_longest_;

1803 }

1804

1805 // Quick check.

1806 DFA *dfa;

1807 ATOMIC_LOAD_ACQUIRE(dfa, pdfa);

1808 if (dfa != NULL)

1809 return dfa;

1810

1811 MutexLock l(&dfa_mutex_);

1812 dfa = *pdfa;

1813 if (dfa != NULL)

1814 return dfa;

1815

1816 // For a forward DFA, half the memory goes to each DFA.

1817 // For a reverse DFA, all the memory goes to the

1818 // "longest match" DFA, because RE2 never does reverse

1819 // "first match" searches.

1820 int64 m = dfa_mem_/2;

1821 if (reversed_) {

1822 if (kind == kLongestMatch \|\| kind == kManyMatch)

1823 m = dfa_mem_;

1824 else

1825 m = 0;

1826 }

1827 dfa = new DFA(this, kind, m);

1828 delete_dfa_ = DeleteDFA;

1829

1830 // Synchronize with "quick check" above.

1831 ATOMIC_STORE_RELEASE(pdfa, dfa);

1832

1833 return dfa;

1834 }

1835

1836

1837 // Executes the regexp program to search in text,

1838 // which itself is inside the larger context. (As a convenience,

1839 // passing a NULL context is equivalent to passing text.)

1840 // Returns true if a match is found, false if not.

1841 // If a match is found, fills in match0->end() to point at the end of the match

1842 // and sets match0->begin() to text.begin(), since the DFA can't track

1843 // where the match actually began.

1844 //

1845 // This is the only external interface (class DFA only exists in this file).

1846 //

1847 bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,

1848 Anchor anchor, MatchKind kind,

1849 StringPiece* match0, bool* failed, vector<int>* matches) {

1850 *failed = false;

1851

1852 StringPiece context = const_context;

1853 if (context.begin() == NULL)

1854 context = text;

1855 bool carat = anchor_start();

1856 bool dollar = anchor_end();

1857 if (reversed_) {

1858 bool t = carat;

1859 carat = dollar;

1860 dollar = t;

1861 }

1862 if (carat && context.begin() != text.begin())

1863 return false;

1864 if (dollar && context.end() != text.end())

1865 return false;

1866

1867 // Handle full match by running an anchored longest match

1868 // and then checking if it covers all of text.

1869 bool anchored = anchor == kAnchored \|\| anchor_start() \|\| kind == kFullMatch;

1870 bool endmatch = false;

1871 if (kind == kManyMatch) {

1872 endmatch = true;

1873 } else if (kind == kFullMatch \|\| anchor_end()) {

1874 endmatch = true;

1875 kind = kLongestMatch;

1876 }

1877

1878 // If the caller doesn't care where the match is (just whether one exists),

1879 // then we can stop at the very first match we find, the so-called

1880 // "shortest match".

1881 bool want_shortest_match = false;

1882 if (match0 == NULL && !endmatch) {

1883 want_shortest_match = true;

1884 kind = kLongestMatch;

1885 }

1886

1887 DFA* dfa = GetDFA(kind);

1888 const char* ep;

1889 bool matched = dfa->Search(text, context, anchored,

1890 want_shortest_match, !reversed_,

1891 failed, &ep, matches);

1892 if (*failed)

1893 return false;

1894 if (!matched)

1895 return false;

1896 if (endmatch && ep != (reversed_ ? text.begin() : text.end()))

1897 return false;

1898

1899 // If caller cares, record the boundary of the match.

1900 // We only know where it ends, so use the boundary of text

1901 // as the beginning.

1902 if (match0) {

1903 if (reversed_)

1904 match0->set(ep, static_cast<int>(text.end() - ep));

1905 else

1906 match0->set(text.begin(), static_cast<int>(ep - text.begin()));

1907 }

1908 return true;

1909 }

1910

1911 // Build out all states in DFA. Returns number of states.

1912 int DFA::BuildAllStates() {

1913 if (!ok())

1914 return 0;

1915

1916 // Pick out start state for unanchored search

1917 // at beginning of text.

1918 RWLocker l(&cache_mutex_);

1919 SearchParams params(NULL, NULL, &l);

1920 params.anchored = false;

1921 if (!AnalyzeSearch(&params) \|\| params.start <= SpecialStateMax)

1922 return 0;

1923

1924 // Add start state to work queue.

1925 StateSet queued;

1926 vector<State*> q;

1927 queued.insert(params.start);

1928 q.push_back(params.start);

1929

1930 // Flood to expand every state.

1931 for (size_t i = 0; i < q.size(); i++) {

1932 State* s = q[i];

1933 for (int c = 0; c < 257; c++) {

1934 State* ns = RunStateOnByteUnlocked(s, c);

1935 if (ns > SpecialStateMax && queued.find(ns) == queued.end()) {

1936 queued.insert(ns);

1937 q.push_back(ns);

1938 }

1939 }

1940 }

1941

1942 return static_cast<int>(q.size());

1943 }

1944

1945 // Build out all states in DFA for kind. Returns number of states.

1946 int Prog::BuildEntireDFA(MatchKind kind) {

1947 //LOG(ERROR) << "BuildEntireDFA is only for testing.";

1948 return GetDFA(kind)->BuildAllStates();

1949 }

1950

1951 // Computes min and max for matching string.

1952 // Won't return strings bigger than maxlen.

1953 bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {

1954 if (!ok())

1955 return false;

1956

1957 // NOTE: if future users of PossibleMatchRange want more precision when

1958 // presented with infinitely repeated elements, consider making this a

1959 // parameter to PossibleMatchRange.

1960 static int kMaxEltRepetitions = 0;

1961

1962 // Keep track of the number of times we've visited states previously. We only

1963 // revisit a given state if it's part of a repeated group, so if the value

1964 // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set

1965 // \|max\| to \|PrefixSuccessor(max)\|.

1966 //

1967 // Also note that previously_visited_states[UnseenStatePtr] will, in the STL

1968 // tradition, implicitly insert a '0' value at first use. We take advantage

1969 // of that property below.

1970 map<State*, int> previously_visited_states;

1971

1972 // Pick out start state for anchored search at beginning of text.

1973 RWLocker l(&cache_mutex_);

1974 SearchParams params(NULL, NULL, &l);

1975 params.anchored = true;

1976 if (!AnalyzeSearch(&params))

1977 return false;

1978 if (params.start == DeadState) { // No matching strings

1979 *min = "";

1980 *max = "";

1981 return true;

1982 }

1983 if (params.start == FullMatchState) // Every string matches: no max

1984 return false;

1985

1986 // The DFA is essentially a big graph rooted at params.start,

1987 // and paths in the graph correspond to accepted strings.

1988 // Each node in the graph has potentially 256+1 arrows

1989 // coming out, one for each byte plus the magic end of

1990 // text character kByteEndText.

1991

1992 // To find the smallest possible prefix of an accepted

1993 // string, we just walk the graph preferring to follow

1994 // arrows with the lowest bytes possible. To find the

1995 // largest possible prefix, we follow the largest bytes

1996 // possible.

1997

1998 // The test for whether there is an arrow from s on byte j is

1999 // ns = RunStateOnByteUnlocked(s, j);

2000 // if (ns == NULL)

2001 // return false;

2002 // if (ns != DeadState && ns->ninst > 0)

2003 // The RunStateOnByteUnlocked call asks the DFA to build out the graph.

2004 // It returns NULL only if the DFA has run out of memory,

2005 // in which case we can't be sure of anything.

2006 // The second check sees whether there was graph built

2007 // and whether it is interesting graph. Nodes might have

2008 // ns->ninst == 0 if they exist only to represent the fact

2009 // that a match was found on the previous byte.

2010

2011 // Build minimum prefix.

2012 State* s = params.start;

2013 min->clear();

2014 MutexLock lock(&mutex_);

2015 for (int i = 0; i < maxlen; i++) {

2016 if (previously_visited_states[s] > kMaxEltRepetitions) {

2017 VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions

2018 << " for state s=" << s << " and min=" << CEscape(*min);

2019 break;

2020 }

2021 previously_visited_states[s]++;

2022

2023 // Stop if min is a match.

2024 State* ns = RunStateOnByte(s, kByteEndText);

2025 if (ns == NULL) // DFA out of memory

2026 return false;

2027 if (ns != DeadState && (ns == FullMatchState \|\| ns->IsMatch()))

2028 break;

2029

2030 // Try to extend the string with low bytes.

2031 bool extended = false;

2032 for (int j = 0; j < 256; j++) {

2033 ns = RunStateOnByte(s, j);

2034 if (ns == NULL) // DFA out of memory

2035 return false;

2036 if (ns == FullMatchState \|\|

2037 (ns > SpecialStateMax && ns->ninst_ > 0)) {

2038 extended = true;

2039 min->append(1, static_cast<char>(j));

2040 s = ns;

2041 break;

2042 }

2043 }

2044 if (!extended)

2045 break;

2046 }

2047

2048 // Build maximum prefix.

2049 previously_visited_states.clear();

2050 s = params.start;

2051 max->clear();

2052 for (int i = 0; i < maxlen; i++) {

2053 if (previously_visited_states[s] > kMaxEltRepetitions) {

2054 VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions

2055 << " for state s=" << s << " and max=" << CEscape(*max);

2056 break;

2057 }

2058 previously_visited_states[s] += 1;

2059

2060 // Try to extend the string with high bytes.

2061 bool extended = false;

2062 for (int j = 255; j >= 0; j--) {

2063 State* ns = RunStateOnByte(s, j);

2064 if (ns == NULL)

2065 return false;

2066 if (ns == FullMatchState \|\|

2067 (ns > SpecialStateMax && ns->ninst_ > 0)) {

2068 extended = true;

2069 max->append(1, static_cast<char>(j));

2070 s = ns;

2071 break;

2072 }

2073 }

2074 if (!extended) {

2075 // Done, no need for PrefixSuccessor.

2076 return true;

2077 }

2078 }

2079

2080 // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b

2081 max = PrefixSuccessor(max);

2082

2083 // If there are no bytes left, we have no way to say "there is no maximum

2084 // string". We could make the interface more complicated and be able to

2085 // return "there is no maximum but here is a minimum", but that seems like

2086 // overkill -- the most common no-max case is all possible strings, so not

2087 // telling the caller that the empty string is the minimum match isn't a

2088 // great loss.

2089 if (max->empty())

2090 return false;

2091

2092 return true;

2093 }

2094

2095 // PossibleMatchRange for a Prog.

2096 bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) {

2097 DFA* dfa = NULL;

2098 {

2099 MutexLock l(&dfa_mutex_);

2100 // Have to use dfa_longest_ to get all strings for full matches.

2101 // For example, (a\|aa) never matches aa in first-match mode.

2102 dfa = dfa_longest_;

2103 if (dfa == NULL) {

2104 dfa = new DFA(this, Prog::kLongestMatch, dfa_mem_/2);

2105 ATOMIC_STORE_RELEASE(&dfa_longest_, dfa);

2106 delete_dfa_ = DeleteDFA;

2107 }

2108 }

2109 return dfa->PossibleMatchRange(min, max, maxlen);

2110 }

2111

2112 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/compile.cc ('k') | third_party/re2/re2/filtered_re2.h » ('j') | no next file with comments »