third_party/re2/re2/nfa.cc - Issue 1544433002: Replace RE2 import with a dependency

Unified Diff: third_party/re2/re2/nfa.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/re2/re2/nfa.cc

diff --git a/third_party/re2/re2/nfa.cc b/third_party/re2/re2/nfa.cc

deleted file mode 100644

index bc8996c420f3b709dd4ea539a303a69a2eca8ee2..0000000000000000000000000000000000000000

--- a/third_party/re2/re2/nfa.cc

+++ /dev/null

@@ -1,758 +0,0 @@

-// Use of this source code is governed by a BSD-style

-// license that can be found in the LICENSE file.

-// Tested by search_test.cc.

-//

-// Prog::SearchNFA, an NFA search.

-// This is an actual NFA like the theorists talk about,

-// not the pseudo-NFA found in backtracking regexp implementations.

-//

-// IMPLEMENTATION

-//

-// This algorithm is a variant of one that appeared in Rob Pike's sam editor,

-// which is a variant of the one described in Thompson's 1968 CACM paper.

-// See http://swtch.com/~rsc/regexp/ for various history. The main feature

-// over the DFA implementation is that it tracks submatch boundaries.

-//

-// When the choice of submatch boundaries is ambiguous, this particular

-// implementation makes the same choices that traditional backtracking

-// implementations (in particular, Perl and PCRE) do.

-// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential

-// time in the length of the input.

-//

-// Like Thompson's original machine and like the DFA implementation, this

-// implementation notices a match only once it is one byte past it.

-#include "re2/prog.h"

-#include "re2/regexp.h"

-#include "util/sparse_array.h"

-#include "util/sparse_set.h"

-namespace re2 {

-class NFA {

- public:

- NFA(Prog* prog);

- ~NFA();

- // Searches for a matching string.

- // * If anchored is true, only considers matches starting at offset.

- // Otherwise finds leftmost match at or after offset.

- // * If longest is true, returns the longest match starting

- // at the chosen start point. Otherwise returns the so-called

- // left-biased match, the one traditional backtracking engines

- // (like Perl and PCRE) find.

- // Records submatch boundaries in submatch[1..nsubmatch-1].

- // Submatch[0] is the entire match. When there is a choice in

- // which text matches each subexpression, the submatch boundaries

- // are chosen to match what a backtracking implementation would choose.

- bool Search(const StringPiece& text, const StringPiece& context,

- bool anchored, bool longest,

- StringPiece* submatch, int nsubmatch);

- static const int Debug = 0;

- private:

- struct Thread {

- union {

- int id;

- Thread* next; // when on free list

- };

- const char** capture;

- };

- // State for explicit stack in AddToThreadq.

- struct AddState {

- int id; // Inst to process

- int j;

- const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip

- AddState()

- : id(0), j(-1), cap_j(NULL) {}

- explicit AddState(int id)

- : id(id), j(-1), cap_j(NULL) {}

- AddState(int id, const char* cap_j, int j)

- : id(id), j(j), cap_j(cap_j) {}

- };

- // Threadq is a list of threads. The list is sorted by the order

- // in which Perl would explore that particular state -- the earlier

- // choices appear earlier in the list.

- typedef SparseArray<Thread*> Threadq;

- inline Thread* AllocThread();

- inline void FreeThread(Thread*);

- // Add id (or its children, following unlabeled arrows)

- // to the workqueue q with associated capture info.

- void AddToThreadq(Threadq* q, int id, int flag,

- const char* p, const char** capture);

- // Run runq on byte c, appending new states to nextq.

- // Updates matched_ and match_ as new, better matches are found.

- // p is position of the byte c

- // in the input string, used when processing capturing parens.

- // flag is the bitwise or of Bol, Eol, etc., specifying whether

- // ^, $ and \b match the current input point (after c).

- inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);

- // Returns text version of capture information, for debugging.

- string FormatCapture(const char** capture);

- inline void CopyCapture(const char** dst, const char** src);

- // Computes whether all matches must begin with the same first

- // byte, and if so, returns that byte. If not, returns -1.

- int ComputeFirstByte();

- Prog* prog_; // underlying program

- int start_; // start instruction in program

- int ncapture_; // number of submatches to track

- bool longest_; // whether searching for longest match

- bool endmatch_; // whether match must end at text.end()

- const char* btext_; // beginning of text being matched (for FormatCapture)

- const char* etext_; // end of text being matched (for endmatch_)

- Threadq q0_, q1_; // pre-allocated for Search.

- const char** match_; // best match so far

- bool matched_; // any match so far?

- AddState* astack_; // pre-allocated for AddToThreadq

- int nastack_;

- int first_byte_; // required first byte for match, or -1 if none

- Thread* free_threads_; // free list

- DISALLOW_COPY_AND_ASSIGN(NFA);

-};

-NFA::NFA(Prog* prog) {

- prog_ = prog;

- start_ = prog->start();

- ncapture_ = 0;

- longest_ = false;

- endmatch_ = false;

- btext_ = NULL;

- etext_ = NULL;

- q0_.resize(prog_->size());

- q1_.resize(prog_->size());

- nastack_ = 2*prog_->size();

- astack_ = new AddState[nastack_];

- match_ = NULL;

- matched_ = false;

- free_threads_ = NULL;

- first_byte_ = ComputeFirstByte();

-NFA::~NFA() {

- delete[] match_;

- delete[] astack_;

- Thread* next;

- for (Thread* t = free_threads_; t; t = next) {

- next = t->next;

- delete[] t->capture;

- delete t;

- }

-void NFA::FreeThread(Thread *t) {

- if (t == NULL)

- return;

- t->next = free_threads_;

- free_threads_ = t;

-NFA::Thread* NFA::AllocThread() {

- Thread* t = free_threads_;

- if (t == NULL) {

- t = new Thread;

- t->capture = new const char*[ncapture_];

- return t;

- }

- free_threads_ = t->next;

- return t;

-void NFA::CopyCapture(const char** dst, const char** src) {

- for (int i = 0; i < ncapture_; i+=2) {

- dst[i] = src[i];

- dst[i+1] = src[i+1];

- }

-// Follows all empty arrows from id0 and enqueues all the states reached.

-// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.

-// The pointer p is the current input position, and capture is the

-// current set of match boundaries.

-void NFA::AddToThreadq(Threadq* q, int id0, int flag,

- const char* p, const char** capture) {

- if (id0 == 0)

- return;

- // Astack_ is pre-allocated to avoid resize operations.

- // It has room for 2*prog_->size() entries, which is enough:

- // Each inst in prog can be processed at most once,

- // pushing at most two entries on stk.

- int nstk = 0;

- AddState* stk = astack_;

- stk[nstk++] = AddState(id0);

- while (nstk > 0) {

- DCHECK_LE(nstk, nastack_);

- const AddState& a = stk[--nstk];

- if (a.j >= 0)

- capture[a.j] = a.cap_j;

- int id = a.id;

- if (id == 0)

- continue;

- if (q->has_index(id)) {

- if (Debug)

- fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());

- continue;

- }

- // Create entry in q no matter what. We might fill it in below,

- // or we might not. Even if not, it is necessary to have it,

- // so that we don't revisit id while working through the explicit stack.

- q->set_new(id, NULL);

- Thread** tp = &q->find(id)->second;

- int j;

- Thread* t;

- Prog::Inst* ip = prog_->inst(id);

- switch (ip->opcode()) {

- default:

- LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";

- break;

- case kInstFail:

- break;

- case kInstAltMatch:

- // Save state; will pick up at next byte.

- t = AllocThread();

- t->id = id;

- CopyCapture(t->capture, capture);

- *tp = t;

- // fall through

- case kInstAlt:

- // Explore alternatives.

- stk[nstk++] = AddState(ip->out1());

- stk[nstk++] = AddState(ip->out());

- break;

- case kInstNop:

- // Continue on.

- stk[nstk++] = AddState(ip->out());

- break;

- case kInstCapture:

- if ((j=ip->cap()) < ncapture_) {

- // Push a dummy whose only job is to restore capture[j]

- // once we finish exploring this possibility.

- stk[nstk++] = AddState(0, capture[j], j);

- // Record capture.

- capture[j] = p;

- }

- stk[nstk++] = AddState(ip->out());

- break;

- case kInstMatch:

- case kInstByteRange:

- // Save state; will pick up at next byte.

- t = AllocThread();

- t->id = id;

- CopyCapture(t->capture, capture);

- *tp = t;

- if (Debug)

- fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);

- break;

- case kInstEmptyWidth:

- // Continue on if we have all the right flag bits.

- if (ip->empty() & ~flag)

- break;

- stk[nstk++] = AddState(ip->out());

- break;

- }

-// Run runq on byte c, appending new states to nextq.

-// Updates match as new, better matches are found.

-// p is position of the byte c in the input string,

-// used when processing capturing parens.

-// flag is the bitwise or of Bol, Eol, etc., specifying whether

-// ^, $ and \b match the current input point (after c).

-// Frees all the threads on runq.

-// If there is a shortcut to the end, returns that shortcut.

-int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {

- nextq->clear();

- for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {

- Thread* t = i->second;

- if (t == NULL)

- continue;

- if (longest_) {

- // Can skip any threads started after our current best match.

- if (matched_ && match_[0] < t->capture[0]) {

- FreeThread(t);

- continue;

- }

- int id = t->id;

- Prog::Inst* ip = prog_->inst(id);

- switch (ip->opcode()) {

- default:

- // Should only see the values handled below.

- LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";

- break;

- case kInstByteRange:

- if (ip->Matches(c))

- AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);

- break;

- case kInstAltMatch:

- if (i != runq->begin())

- break;

- // The match is ours if we want it.

- if (ip->greedy(prog_) || longest_) {

- CopyCapture((const char**)match_, t->capture);

- FreeThread(t);

- for (++i; i != runq->end(); ++i)

- FreeThread(i->second);

- runq->clear();

- matched_ = true;

- if (ip->greedy(prog_))

- return ip->out1();

- return ip->out();

- }

- break;

- case kInstMatch:

- if (endmatch_ && p != etext_)

- break;

- const char* old = t->capture[1]; // previous end pointer

- t->capture[1] = p;

- if (longest_) {

- // Leftmost-longest mode: save this match only if

- // it is either farther to the left or at the same

- // point but longer than an existing match.

- if (!matched_ || t->capture[0] < match_[0] ||

- (t->capture[0] == match_[0] && t->capture[1] > match_[1]))

- CopyCapture((const char**)match_, t->capture);

- } else {

- // Leftmost-biased mode: this match is by definition

- // better than what we've already found (see next line).

- CopyCapture((const char**)match_, t->capture);

- // Cut off the threads that can only find matches

- // worse than the one we just found: don't run the

- // rest of the current Threadq.

- t->capture[0] = old;

- FreeThread(t);

- for (++i; i != runq->end(); ++i)

- FreeThread(i->second);

- runq->clear();

- matched_ = true;

- return 0;

- }

- t->capture[0] = old;

- matched_ = true;

- break;

- }

- FreeThread(t);

- }

- runq->clear();

- return 0;

-string NFA::FormatCapture(const char** capture) {

- string s;

- for (int i = 0; i < ncapture_; i+=2) {

- if (capture[i] == NULL)

- StringAppendF(&s, "(?,?)");

- else if (capture[i+1] == NULL)

- StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));

- else

- StringAppendF(&s, "(%d,%d)",

- (int)(capture[i] - btext_),

- (int)(capture[i+1] - btext_));

- }

- return s;

-// Returns whether haystack contains needle's memory.

-static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {

- return haystack.begin() <= needle.begin() &&

- haystack.end() >= needle.end();

-bool NFA::Search(const StringPiece& text, const StringPiece& const_context,

- bool anchored, bool longest,

- StringPiece* submatch, int nsubmatch) {

- if (start_ == 0)

- return false;

- StringPiece context = const_context;

- if (context.begin() == NULL)

- context = text;

- if (!StringPieceContains(context, text)) {

- LOG(FATAL) << "Bad args: context does not contain text "

- << reinterpret_cast<const void*>(context.begin())

- << "+" << context.size() << " "

- << reinterpret_cast<const void*>(text.begin())

- << "+" << text.size();

- return false;

- }

- if (prog_->anchor_start() && context.begin() != text.begin())

- return false;

- if (prog_->anchor_end() && context.end() != text.end())

- return false;

- anchored |= prog_->anchor_start();

- if (prog_->anchor_end()) {

- longest = true;

- endmatch_ = true;

- etext_ = text.end();

- }

- if (nsubmatch < 0) {

- LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;

- return false;

- }

- // Save search parameters.

- ncapture_ = 2*nsubmatch;

- longest_ = longest;

- if (nsubmatch == 0) {

- // We need to maintain match[0], both to distinguish the

- // longest match (if longest is true) and also to tell

- // whether we've seen any matches at all.

- ncapture_ = 2;

- }

- match_ = new const char*[ncapture_];

- matched_ = false;

- memset(match_, 0, ncapture_*sizeof match_[0]);

- // For debugging prints.

- btext_ = context.begin();

- if (Debug) {

- fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",

- text.as_string().c_str(), context.as_string().c_str(), anchored,

- longest);

- }

- // Set up search.

- Threadq* runq = &q0_;

- Threadq* nextq = &q1_;

- runq->clear();

- nextq->clear();

- memset(&match_[0], 0, ncapture_*sizeof match_[0]);

- const char* bp = context.begin();

- int c = -1;

- int wasword = 0;

- if (text.begin() > context.begin()) {

- c = text.begin()[-1] & 0xFF;

- wasword = Prog::IsWordChar(static_cast<uint8>(c));

- }

- // Loop over the text, stepping the machine.

- for (const char* p = text.begin();; p++) {

- // Check for empty-width specials.

- int flag = 0;

- // ^ and \A

- if (p == context.begin())

- flag |= kEmptyBeginText | kEmptyBeginLine;

- else if (p <= context.end() && p[-1] == '\n')

- flag |= kEmptyBeginLine;

- // $ and \z

- if (p == context.end())

- flag |= kEmptyEndText | kEmptyEndLine;

- else if (p < context.end() && p[0] == '\n')

- flag |= kEmptyEndLine;

- // \b and \B

- int isword = 0;

- if (p < context.end())

- isword = Prog::IsWordChar(p[0] & 0xFF);

- if (isword != wasword)

- flag |= kEmptyWordBoundary;

- else

- flag |= kEmptyNonWordBoundary;

- if (Debug) {

- fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);

- for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {

- Thread* t = i->second;

- if (t == NULL)

- continue;

- fprintf(stderr, " %d%s", t->id,

- FormatCapture((const char**)t->capture).c_str());

- }

- fprintf(stderr, "\n");

- }

- // Process previous character (waited until now to avoid

- // repeating the flag computation above).

- // This is a no-op the first time around the loop, because

- // runq is empty.

- int id = Step(runq, nextq, c, flag, p-1);

- DCHECK_EQ(runq->size(), 0);

- swap(nextq, runq);

- nextq->clear();

- if (id != 0) {

- // We're done: full match ahead.

- p = text.end();

- for (;;) {

- Prog::Inst* ip = prog_->inst(id);

- switch (ip->opcode()) {

- default:

- LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();

- break;

- case kInstCapture:

- if (ip->cap() < ncapture_)

- match_[ip->cap()] = p;

- id = ip->out();

- continue;

- case kInstNop:

- id = ip->out();

- continue;

- case kInstMatch:

- match_[1] = p;

- matched_ = true;

- break;

- case kInstEmptyWidth:

- if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {

- LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();

- break;

- }

- id = ip->out();

- continue;

- }

- break;

- }

- break;

- }

- if (p > text.end())

- break;

- // Start a new thread if there have not been any matches.

- // (No point in starting a new thread if there have been

- // matches, since it would be to the right of the match

- // we already found.)

- if (!matched_ && (!anchored || p == text.begin())) {

- // If there's a required first byte for an unanchored search

- // and we're not in the middle of any possible matches,

- // use memchr to search for the byte quickly.

- if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&

- p < text.end() && (p[0] & 0xFF) != first_byte_) {

- p = reinterpret_cast<const char*>(memchr(p, first_byte_,

- text.end() - p));

- if (p == NULL) {

- p = text.end();

- isword = 0;

- } else {

- isword = Prog::IsWordChar(p[0] & 0xFF);

- }

- flag = Prog::EmptyFlags(context, p);

- }

- // Steal match storage (cleared but unused as of yet)

- // temporarily to hold match boundaries for new thread.

- match_[0] = p;

- AddToThreadq(runq, start_, flag, p, match_);

- match_[0] = NULL;

- }

- // If all the threads have died, stop early.

- if (runq->size() == 0) {

- if (Debug)

- fprintf(stderr, "dead\n");

- break;

- }

- if (p == text.end())

- c = 0;

- else

- c = *p & 0xFF;

- wasword = isword;

- // Will run step(runq, nextq, c, ...) on next iteration. See above.

- }

- for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)

- FreeThread(i->second);

- if (matched_) {

- for (int i = 0; i < nsubmatch; i++)

- submatch[i].set(match_[2*i],

- static_cast<int>(match_[2*i+1] - match_[2*i]));

- if (Debug)

- fprintf(stderr, "match (%d,%d)\n",

- static_cast<int>(match_[0] - btext_),

- static_cast<int>(match_[1] - btext_));

- return true;

- }

- VLOG(1) << "No matches found";

- return false;

-// Computes whether all successful matches have a common first byte,

-// and if so, returns that byte. If not, returns -1.

-int NFA::ComputeFirstByte() {

- if (start_ == 0)

- return -1;

- int b = -1; // first byte, not yet computed

- typedef SparseSet Workq;

- Workq q(prog_->size());

- q.insert(start_);

- for (Workq::iterator it = q.begin(); it != q.end(); ++it) {

- int id = *it;

- Prog::Inst* ip = prog_->inst(id);

- switch (ip->opcode()) {

- default:

- LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";

- break;

- case kInstMatch:

- // The empty string matches: no first byte.

- return -1;

- case kInstByteRange:

- // Must match only a single byte

- if (ip->lo() != ip->hi())

- return -1;

- if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')

- return -1;

- // If we haven't seen any bytes yet, record it;

- // otherwise must match the one we saw before.

- if (b == -1)

- b = ip->lo();

- else if (b != ip->lo())

- return -1;

- break;

- case kInstNop:

- case kInstCapture:

- case kInstEmptyWidth:

- // Continue on.

- // Ignore ip->empty() flags for kInstEmptyWidth

- // in order to be as conservative as possible

- // (assume all possible empty-width flags are true).

- if (ip->out())

- q.insert(ip->out());

- break;

- case kInstAlt:

- case kInstAltMatch:

- // Explore alternatives.

- if (ip->out())

- q.insert(ip->out());

- if (ip->out1())

- q.insert(ip->out1());

- break;

- case kInstFail:

- break;

- }

- return b;

-bool

-Prog::SearchNFA(const StringPiece& text, const StringPiece& context,

- Anchor anchor, MatchKind kind,

- StringPiece* match, int nmatch) {

- if (NFA::Debug)

- Dump();

- NFA nfa(this);

- StringPiece sp;

- if (kind == kFullMatch) {

- anchor = kAnchored;

- if (nmatch == 0) {

- match = &sp;

- nmatch = 1;

- }

- if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))

- return false;

- if (kind == kFullMatch && match[0].end() != text.end())

- return false;

- return true;

-// For each instruction i in the program reachable from the start, compute the

-// number of instructions reachable from i by following only empty transitions

-// and record that count as fanout[i].

-//

-// fanout holds the results and is also the work queue for the outer iteration.

-// reachable holds the reached nodes for the inner iteration.

-void Prog::Fanout(SparseArray<int>* fanout) {

- DCHECK_EQ(fanout->max_size(), size());

- SparseSet reachable(size());

- fanout->clear();

- fanout->set_new(start(), 0);

- for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {

- int* count = &i->second;

- reachable.clear();

- reachable.insert(i->index());

- for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {

- Prog::Inst* ip = inst(*j);

- switch (ip->opcode()) {

- default:

- LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";

- break;

- case kInstByteRange:

- (*count)++;

- if (!fanout->has_index(ip->out())) {

- fanout->set_new(ip->out(), 0);

- }

- break;

- case kInstAlt:

- case kInstAltMatch:

- reachable.insert(ip->out1());

- // fall through

- case kInstCapture:

- case kInstEmptyWidth:

- case kInstNop:

- reachable.insert(ip->out());

- break;

- case kInstMatch:

- case kInstFail:

- break;

- }

-} // namespace re2

« no previous file with comments | « third_party/re2/re2/mimics_pcre.cc ('k') | third_party/re2/re2/onepass.cc » ('j') | no next file with comments »