| Index: third_party/re2/re2/prog.h
|
| diff --git a/third_party/re2/re2/prog.h b/third_party/re2/re2/prog.h
|
| deleted file mode 100644
|
| index 8c5b2c4939e238955792ab9e3e1a95ff4427ee42..0000000000000000000000000000000000000000
|
| --- a/third_party/re2/re2/prog.h
|
| +++ /dev/null
|
| @@ -1,381 +0,0 @@
|
| -// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
| -// Use of this source code is governed by a BSD-style
|
| -// license that can be found in the LICENSE file.
|
| -
|
| -// Compiled representation of regular expressions.
|
| -// See regexp.h for the Regexp class, which represents a regular
|
| -// expression symbolically.
|
| -
|
| -#ifndef RE2_PROG_H__
|
| -#define RE2_PROG_H__
|
| -
|
| -#include "util/util.h"
|
| -#include "util/sparse_array.h"
|
| -#include "re2/re2.h"
|
| -
|
| -namespace re2 {
|
| -
|
| -// Simple fixed-size bitmap.
|
| -template<int Bits>
|
| -class Bitmap {
|
| - public:
|
| - Bitmap() { Reset(); }
|
| - int Size() { return Bits; }
|
| -
|
| - void Reset() {
|
| - for (int i = 0; i < Words; i++)
|
| - w_[i] = 0;
|
| - }
|
| - bool Get(int k) const {
|
| - return w_[k >> WordLog] & (1<<(k & 31));
|
| - }
|
| - void Set(int k) {
|
| - w_[k >> WordLog] |= 1<<(k & 31);
|
| - }
|
| - void Clear(int k) {
|
| - w_[k >> WordLog] &= ~(1<<(k & 31));
|
| - }
|
| - uint32 Word(int i) const {
|
| - return w_[i];
|
| - }
|
| -
|
| - private:
|
| - static const int WordLog = 5;
|
| - static const int Words = (Bits+31)/32;
|
| - uint32 w_[Words];
|
| - DISALLOW_COPY_AND_ASSIGN(Bitmap);
|
| -};
|
| -
|
| -
|
| -// Opcodes for Inst
|
| -enum InstOp {
|
| - kInstAlt = 0, // choose between out_ and out1_
|
| - kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
|
| - kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
|
| - kInstCapture, // capturing parenthesis number cap_
|
| - kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
|
| - kInstMatch, // found a match!
|
| - kInstNop, // no-op; occasionally unavoidable
|
| - kInstFail, // never match; occasionally unavoidable
|
| -};
|
| -
|
| -// Bit flags for empty-width specials
|
| -enum EmptyOp {
|
| - kEmptyBeginLine = 1<<0, // ^ - beginning of line
|
| - kEmptyEndLine = 1<<1, // $ - end of line
|
| - kEmptyBeginText = 1<<2, // \A - beginning of text
|
| - kEmptyEndText = 1<<3, // \z - end of text
|
| - kEmptyWordBoundary = 1<<4, // \b - word boundary
|
| - kEmptyNonWordBoundary = 1<<5, // \B - not \b
|
| - kEmptyAllFlags = (1<<6)-1,
|
| -};
|
| -
|
| -class Regexp;
|
| -
|
| -class DFA;
|
| -struct OneState;
|
| -
|
| -// Compiled form of regexp program.
|
| -class Prog {
|
| - public:
|
| - Prog();
|
| - ~Prog();
|
| -
|
| - // Single instruction in regexp program.
|
| - class Inst {
|
| - public:
|
| - Inst() : out_opcode_(0), out1_(0) { }
|
| -
|
| - // Constructors per opcode
|
| - void InitAlt(uint32 out, uint32 out1);
|
| - void InitByteRange(int lo, int hi, int foldcase, uint32 out);
|
| - void InitCapture(int cap, uint32 out);
|
| - void InitEmptyWidth(EmptyOp empty, uint32 out);
|
| - void InitMatch(int id);
|
| - void InitNop(uint32 out);
|
| - void InitFail();
|
| -
|
| - // Getters
|
| - int id(Prog* p) { return static_cast<int>(this - p->inst_); }
|
| - InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
| - int out() { return out_opcode_>>3; }
|
| - int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
| - int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
| - int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
| - int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
|
| - int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
| - int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
| - EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
| - bool greedy(Prog *p) {
|
| - DCHECK_EQ(opcode(), kInstAltMatch);
|
| - return p->inst(out())->opcode() == kInstByteRange;
|
| - }
|
| -
|
| - // Does this inst (an kInstByteRange) match c?
|
| - inline bool Matches(int c) {
|
| - DCHECK_EQ(opcode(), kInstByteRange);
|
| - if (foldcase_ && 'A' <= c && c <= 'Z')
|
| - c += 'a' - 'A';
|
| - return lo_ <= c && c <= hi_;
|
| - }
|
| -
|
| - // Returns string representation for debugging.
|
| - string Dump();
|
| -
|
| - // Maximum instruction id.
|
| - // (Must fit in out_opcode_, and PatchList steals another bit.)
|
| - static const int kMaxInst = (1<<28) - 1;
|
| -
|
| - private:
|
| - void set_opcode(InstOp opcode) {
|
| - out_opcode_ = (out()<<3) | opcode;
|
| - }
|
| -
|
| - void set_out(int out) {
|
| - out_opcode_ = (out<<3) | opcode();
|
| - }
|
| -
|
| - void set_out_opcode(int out, InstOp opcode) {
|
| - out_opcode_ = (out<<3) | opcode;
|
| - }
|
| -
|
| - uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
|
| - union { // additional instruction arguments:
|
| - uint32 out1_; // opcode == kInstAlt
|
| - // alternate next instruction
|
| -
|
| - int32 cap_; // opcode == kInstCapture
|
| - // Index of capture register (holds text
|
| - // position recorded by capturing parentheses).
|
| - // For \n (the submatch for the nth parentheses),
|
| - // the left parenthesis captures into register 2*n
|
| - // and the right one captures into register 2*n+1.
|
| -
|
| - int32 match_id_; // opcode == kInstMatch
|
| - // Match ID to identify this match (for re2::Set).
|
| -
|
| - struct { // opcode == kInstByteRange
|
| - uint8 lo_; // byte range is lo_-hi_ inclusive
|
| - uint8 hi_; //
|
| - uint8 foldcase_; // convert A-Z to a-z before checking range.
|
| - };
|
| -
|
| - EmptyOp empty_; // opcode == kInstEmptyWidth
|
| - // empty_ is bitwise OR of kEmpty* flags above.
|
| - };
|
| -
|
| - friend class Compiler;
|
| - friend struct PatchList;
|
| - friend class Prog;
|
| -
|
| - DISALLOW_COPY_AND_ASSIGN(Inst);
|
| - };
|
| -
|
| - // Whether to anchor the search.
|
| - enum Anchor {
|
| - kUnanchored, // match anywhere
|
| - kAnchored, // match only starting at beginning of text
|
| - };
|
| -
|
| - // Kind of match to look for (for anchor != kFullMatch)
|
| - //
|
| - // kLongestMatch mode finds the overall longest
|
| - // match but still makes its submatch choices the way
|
| - // Perl would, not in the way prescribed by POSIX.
|
| - // The POSIX rules are much more expensive to implement,
|
| - // and no one has needed them.
|
| - //
|
| - // kFullMatch is not strictly necessary -- we could use
|
| - // kLongestMatch and then check the length of the match -- but
|
| - // the matching code can run faster if it knows to consider only
|
| - // full matches.
|
| - enum MatchKind {
|
| - kFirstMatch, // like Perl, PCRE
|
| - kLongestMatch, // like egrep or POSIX
|
| - kFullMatch, // match only entire text; implies anchor==kAnchored
|
| - kManyMatch // for SearchDFA, records set of matches
|
| - };
|
| -
|
| - Inst *inst(int id) { return &inst_[id]; }
|
| - int start() { return start_; }
|
| - int start_unanchored() { return start_unanchored_; }
|
| - void set_start(int start) { start_ = start; }
|
| - void set_start_unanchored(int start) { start_unanchored_ = start; }
|
| - int size() { return size_; }
|
| - bool reversed() { return reversed_; }
|
| - void set_reversed(bool reversed) { reversed_ = reversed; }
|
| - int byte_inst_count() { return byte_inst_count_; }
|
| - const Bitmap<256>& byterange() { return byterange_; }
|
| - void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
|
| - int64 dfa_mem() { return dfa_mem_; }
|
| - int flags() { return flags_; }
|
| - void set_flags(int flags) { flags_ = flags; }
|
| - bool anchor_start() { return anchor_start_; }
|
| - void set_anchor_start(bool b) { anchor_start_ = b; }
|
| - bool anchor_end() { return anchor_end_; }
|
| - void set_anchor_end(bool b) { anchor_end_ = b; }
|
| - int bytemap_range() { return bytemap_range_; }
|
| - const uint8* bytemap() { return bytemap_; }
|
| -
|
| - // Returns string representation of program for debugging.
|
| - string Dump();
|
| - string DumpUnanchored();
|
| -
|
| - // Record that at some point in the prog, the bytes in the range
|
| - // lo-hi (inclusive) are treated as different from bytes outside the range.
|
| - // Tracking this lets the DFA collapse commonly-treated byte ranges
|
| - // when recording state pointers, greatly reducing its memory footprint.
|
| - void MarkByteRange(int lo, int hi);
|
| -
|
| - // Returns the set of kEmpty flags that are in effect at
|
| - // position p within context.
|
| - static uint32 EmptyFlags(const StringPiece& context, const char* p);
|
| -
|
| - // Returns whether byte c is a word character: ASCII only.
|
| - // Used by the implementation of \b and \B.
|
| - // This is not right for Unicode, but:
|
| - // - it's hard to get right in a byte-at-a-time matching world
|
| - // (the DFA has only one-byte lookahead).
|
| - // - even if the lookahead were possible, the Progs would be huge.
|
| - // This crude approximation is the same one PCRE uses.
|
| - static bool IsWordChar(uint8 c) {
|
| - return ('A' <= c && c <= 'Z') ||
|
| - ('a' <= c && c <= 'z') ||
|
| - ('0' <= c && c <= '9') ||
|
| - c == '_';
|
| - }
|
| -
|
| - // Execution engines. They all search for the regexp (run the prog)
|
| - // in text, which is in the larger context (used for ^ $ \b etc).
|
| - // Anchor and kind control the kind of search.
|
| - // Returns true if match found, false if not.
|
| - // If match found, fills match[0..nmatch-1] with submatch info.
|
| - // match[0] is overall match, match[1] is first set of parens, etc.
|
| - // If a particular submatch is not matched during the regexp match,
|
| - // it is set to NULL.
|
| - //
|
| - // Matching text == StringPiece(NULL, 0) is treated as any other empty
|
| - // string, but note that on return, it will not be possible to distinguish
|
| - // submatches that matched that empty string from submatches that didn't
|
| - // match anything. Either way, match[i] == NULL.
|
| -
|
| - // Search using NFA: can find submatches but kind of slow.
|
| - bool SearchNFA(const StringPiece& text, const StringPiece& context,
|
| - Anchor anchor, MatchKind kind,
|
| - StringPiece* match, int nmatch);
|
| -
|
| - // Search using DFA: much faster than NFA but only finds
|
| - // end of match and can use a lot more memory.
|
| - // Returns whether a match was found.
|
| - // If the DFA runs out of memory, sets *failed to true and returns false.
|
| - // If matches != NULL and kind == kManyMatch and there is a match,
|
| - // SearchDFA fills matches with the match IDs of the final matching state.
|
| - bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
| - Anchor anchor, MatchKind kind,
|
| - StringPiece* match0, bool* failed,
|
| - vector<int>* matches);
|
| -
|
| - // Build the entire DFA for the given match kind. FOR TESTING ONLY.
|
| - // Usually the DFA is built out incrementally, as needed, which
|
| - // avoids lots of unnecessary work. This function is useful only
|
| - // for testing purposes. Returns number of states.
|
| - int BuildEntireDFA(MatchKind kind);
|
| -
|
| - // Compute byte map.
|
| - void ComputeByteMap();
|
| -
|
| - // Run peep-hole optimizer on program.
|
| - void Optimize();
|
| -
|
| - // One-pass NFA: only correct if IsOnePass() is true,
|
| - // but much faster than NFA (competitive with PCRE)
|
| - // for those expressions.
|
| - bool IsOnePass();
|
| - bool SearchOnePass(const StringPiece& text, const StringPiece& context,
|
| - Anchor anchor, MatchKind kind,
|
| - StringPiece* match, int nmatch);
|
| -
|
| - // Bit-state backtracking. Fast on small cases but uses memory
|
| - // proportional to the product of the program size and the text size.
|
| - bool SearchBitState(const StringPiece& text, const StringPiece& context,
|
| - Anchor anchor, MatchKind kind,
|
| - StringPiece* match, int nmatch);
|
| -
|
| - static const int kMaxOnePassCapture = 5; // $0 through $4
|
| -
|
| - // Backtracking search: the gold standard against which the other
|
| - // implementations are checked. FOR TESTING ONLY.
|
| - // It allocates a ton of memory to avoid running forever.
|
| - // It is also recursive, so can't use in production (will overflow stacks).
|
| - // The name "Unsafe" here is supposed to be a flag that
|
| - // you should not be using this function.
|
| - bool UnsafeSearchBacktrack(const StringPiece& text,
|
| - const StringPiece& context,
|
| - Anchor anchor, MatchKind kind,
|
| - StringPiece* match, int nmatch);
|
| -
|
| - // Computes range for any strings matching regexp. The min and max can in
|
| - // some cases be arbitrarily precise, so the caller gets to specify the
|
| - // maximum desired length of string returned.
|
| - //
|
| - // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
| - // string s that is an anchored match for this regexp satisfies
|
| - // min <= s && s <= max.
|
| - //
|
| - // Note that PossibleMatchRange() will only consider the first copy of an
|
| - // infinitely repeated element (i.e., any regexp element followed by a '*' or
|
| - // '+' operator). Regexps with "{N}" constructions are not affected, as those
|
| - // do not compile down to infinite repetitions.
|
| - //
|
| - // Returns true on success, false on error.
|
| - bool PossibleMatchRange(string* min, string* max, int maxlen);
|
| -
|
| - // EXPERIMENTAL! SUBJECT TO CHANGE!
|
| - // Outputs the program fanout into the given sparse array.
|
| - void Fanout(SparseArray<int>* fanout);
|
| -
|
| - // Compiles a collection of regexps to Prog. Each regexp will have
|
| - // its own Match instruction recording the index in the vector.
|
| - static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
| - Regexp* re);
|
| -
|
| - private:
|
| - friend class Compiler;
|
| -
|
| - DFA* GetDFA(MatchKind kind);
|
| -
|
| - bool anchor_start_; // regexp has explicit start anchor
|
| - bool anchor_end_; // regexp has explicit end anchor
|
| - bool reversed_; // whether program runs backward over input
|
| - bool did_onepass_; // has IsOnePass been called?
|
| -
|
| - int start_; // entry point for program
|
| - int start_unanchored_; // unanchored entry point for program
|
| - int size_; // number of instructions
|
| - int byte_inst_count_; // number of kInstByteRange instructions
|
| - int bytemap_range_; // bytemap_[x] < bytemap_range_
|
| - int flags_; // regexp parse flags
|
| - int onepass_statesize_; // byte size of each OneState* node
|
| -
|
| - Inst* inst_; // pointer to instruction array
|
| -
|
| - Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
|
| - DFA* volatile dfa_first_; // DFA cached for kFirstMatch
|
| - DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
|
| - int64 dfa_mem_; // Maximum memory for DFAs.
|
| - void (*delete_dfa_)(DFA* dfa);
|
| -
|
| - Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
|
| - // commonly-treated byte range.
|
| - uint8 bytemap_[256]; // map from input bytes to byte classes
|
| - uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
|
| -
|
| - uint8* onepass_nodes_; // data for OnePass nodes
|
| - OneState* onepass_start_; // start node for OnePass program
|
| -
|
| - DISALLOW_COPY_AND_ASSIGN(Prog);
|
| -};
|
| -
|
| -} // namespace re2
|
| -
|
| -#endif // RE2_PROG_H__
|
|
|