third_party/re2/re2/prog.h - Issue 1544433002: Replace RE2 import with a dependency

Unified Diff: third_party/re2/re2/prog.h

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/re2/re2/prog.h

diff --git a/third_party/re2/re2/prog.h b/third_party/re2/re2/prog.h

deleted file mode 100644

index 8c5b2c4939e238955792ab9e3e1a95ff4427ee42..0000000000000000000000000000000000000000

--- a/third_party/re2/re2/prog.h

+++ /dev/null

@@ -1,381 +0,0 @@

-// Use of this source code is governed by a BSD-style

-// license that can be found in the LICENSE file.

-// Compiled representation of regular expressions.

-// See regexp.h for the Regexp class, which represents a regular

-// expression symbolically.

-#ifndef RE2_PROG_H__

-#define RE2_PROG_H__

-#include "util/util.h"

-#include "util/sparse_array.h"

-#include "re2/re2.h"

-namespace re2 {

-// Simple fixed-size bitmap.

-template<int Bits>

-class Bitmap {

- public:

- Bitmap() { Reset(); }

- int Size() { return Bits; }

- void Reset() {

- for (int i = 0; i < Words; i++)

- w_[i] = 0;

- }

- bool Get(int k) const {

- return w_[k >> WordLog] & (1<<(k & 31));

- }

- void Set(int k) {

- w_[k >> WordLog] |= 1<<(k & 31);

- }

- void Clear(int k) {

- w_[k >> WordLog] &= ~(1<<(k & 31));

- }

- uint32 Word(int i) const {

- return w_[i];

- }

- private:

- static const int WordLog = 5;

- static const int Words = (Bits+31)/32;

- uint32 w_[Words];

- DISALLOW_COPY_AND_ASSIGN(Bitmap);

-};

-// Opcodes for Inst

-enum InstOp {

- kInstAlt = 0, // choose between out_ and out1_

- kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.

- kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]

- kInstCapture, // capturing parenthesis number cap_

- kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_

- kInstMatch, // found a match!

- kInstNop, // no-op; occasionally unavoidable

- kInstFail, // never match; occasionally unavoidable

-};

-// Bit flags for empty-width specials

-enum EmptyOp {

- kEmptyBeginLine = 1<<0, // ^ - beginning of line

- kEmptyEndLine = 1<<1, // $ - end of line

- kEmptyBeginText = 1<<2, // \A - beginning of text

- kEmptyEndText = 1<<3, // \z - end of text

- kEmptyWordBoundary = 1<<4, // \b - word boundary

- kEmptyNonWordBoundary = 1<<5, // \B - not \b

- kEmptyAllFlags = (1<<6)-1,

-};

-class Regexp;

-class DFA;

-struct OneState;

-// Compiled form of regexp program.

-class Prog {

- public:

- Prog();

- ~Prog();

- // Single instruction in regexp program.

- class Inst {

- public:

- Inst() : out_opcode_(0), out1_(0) { }

- // Constructors per opcode

- void InitAlt(uint32 out, uint32 out1);

- void InitByteRange(int lo, int hi, int foldcase, uint32 out);

- void InitCapture(int cap, uint32 out);

- void InitEmptyWidth(EmptyOp empty, uint32 out);

- void InitMatch(int id);

- void InitNop(uint32 out);

- void InitFail();

- // Getters

- int id(Prog* p) { return static_cast<int>(this - p->inst_); }

- InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }

- int out() { return out_opcode_>>3; }

- int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }

- int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }

- int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }

- int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }

- int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }

- int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }

- EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }

- bool greedy(Prog *p) {

- DCHECK_EQ(opcode(), kInstAltMatch);

- return p->inst(out())->opcode() == kInstByteRange;

- }

- // Does this inst (an kInstByteRange) match c?

- inline bool Matches(int c) {

- DCHECK_EQ(opcode(), kInstByteRange);

- if (foldcase_ && 'A' <= c && c <= 'Z')

- c += 'a' - 'A';

- return lo_ <= c && c <= hi_;

- }

- // Returns string representation for debugging.

- string Dump();

- // Maximum instruction id.

- // (Must fit in out_opcode_, and PatchList steals another bit.)

- static const int kMaxInst = (1<<28) - 1;

- private:

- void set_opcode(InstOp opcode) {

- out_opcode_ = (out()<<3) | opcode;

- }

- void set_out(int out) {

- out_opcode_ = (out<<3) | opcode();

- }

- void set_out_opcode(int out, InstOp opcode) {

- out_opcode_ = (out<<3) | opcode;

- }

- uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode

- union { // additional instruction arguments:

- uint32 out1_; // opcode == kInstAlt

- // alternate next instruction

- int32 cap_; // opcode == kInstCapture

- // Index of capture register (holds text

- // position recorded by capturing parentheses).

- // For \n (the submatch for the nth parentheses),

- // the left parenthesis captures into register 2*n

- // and the right one captures into register 2*n+1.

- int32 match_id_; // opcode == kInstMatch

- // Match ID to identify this match (for re2::Set).

- struct { // opcode == kInstByteRange

- uint8 lo_; // byte range is lo_-hi_ inclusive

- uint8 hi_; //

- uint8 foldcase_; // convert A-Z to a-z before checking range.

- };

- EmptyOp empty_; // opcode == kInstEmptyWidth

- // empty_ is bitwise OR of kEmpty* flags above.

- };

- friend class Compiler;

- friend struct PatchList;

- friend class Prog;

- DISALLOW_COPY_AND_ASSIGN(Inst);

- };

- // Whether to anchor the search.

- enum Anchor {

- kUnanchored, // match anywhere

- kAnchored, // match only starting at beginning of text

- };

- // Kind of match to look for (for anchor != kFullMatch)

- //

- // kLongestMatch mode finds the overall longest

- // match but still makes its submatch choices the way

- // Perl would, not in the way prescribed by POSIX.

- // The POSIX rules are much more expensive to implement,

- // and no one has needed them.

- //

- // kFullMatch is not strictly necessary -- we could use

- // kLongestMatch and then check the length of the match -- but

- // the matching code can run faster if it knows to consider only

- // full matches.

- enum MatchKind {

- kFirstMatch, // like Perl, PCRE

- kLongestMatch, // like egrep or POSIX

- kFullMatch, // match only entire text; implies anchor==kAnchored

- kManyMatch // for SearchDFA, records set of matches

- };

- Inst *inst(int id) { return &inst_[id]; }

- int start() { return start_; }

- int start_unanchored() { return start_unanchored_; }

- void set_start(int start) { start_ = start; }

- void set_start_unanchored(int start) { start_unanchored_ = start; }

- int size() { return size_; }

- bool reversed() { return reversed_; }

- void set_reversed(bool reversed) { reversed_ = reversed; }

- int byte_inst_count() { return byte_inst_count_; }

- const Bitmap<256>& byterange() { return byterange_; }

- void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }

- int64 dfa_mem() { return dfa_mem_; }

- int flags() { return flags_; }

- void set_flags(int flags) { flags_ = flags; }

- bool anchor_start() { return anchor_start_; }

- void set_anchor_start(bool b) { anchor_start_ = b; }

- bool anchor_end() { return anchor_end_; }

- void set_anchor_end(bool b) { anchor_end_ = b; }

- int bytemap_range() { return bytemap_range_; }

- const uint8* bytemap() { return bytemap_; }

- // Returns string representation of program for debugging.

- string Dump();

- string DumpUnanchored();

- // Record that at some point in the prog, the bytes in the range

- // lo-hi (inclusive) are treated as different from bytes outside the range.

- // Tracking this lets the DFA collapse commonly-treated byte ranges

- // when recording state pointers, greatly reducing its memory footprint.

- void MarkByteRange(int lo, int hi);

- // Returns the set of kEmpty flags that are in effect at

- // position p within context.

- static uint32 EmptyFlags(const StringPiece& context, const char* p);

- // Returns whether byte c is a word character: ASCII only.

- // Used by the implementation of \b and \B.

- // This is not right for Unicode, but:

- // - it's hard to get right in a byte-at-a-time matching world

- // (the DFA has only one-byte lookahead).

- // - even if the lookahead were possible, the Progs would be huge.

- // This crude approximation is the same one PCRE uses.

- static bool IsWordChar(uint8 c) {

- return ('A' <= c && c <= 'Z') ||

- ('a' <= c && c <= 'z') ||

- ('0' <= c && c <= '9') ||

- c == '_';

- }

- // Execution engines. They all search for the regexp (run the prog)

- // in text, which is in the larger context (used for ^ $ \b etc).

- // Anchor and kind control the kind of search.

- // Returns true if match found, false if not.

- // If match found, fills match[0..nmatch-1] with submatch info.

- // match[0] is overall match, match[1] is first set of parens, etc.

- // If a particular submatch is not matched during the regexp match,

- // it is set to NULL.

- //

- // Matching text == StringPiece(NULL, 0) is treated as any other empty

- // string, but note that on return, it will not be possible to distinguish

- // submatches that matched that empty string from submatches that didn't

- // match anything. Either way, match[i] == NULL.

- // Search using NFA: can find submatches but kind of slow.

- bool SearchNFA(const StringPiece& text, const StringPiece& context,

- Anchor anchor, MatchKind kind,

- StringPiece* match, int nmatch);

- // Search using DFA: much faster than NFA but only finds

- // end of match and can use a lot more memory.

- // Returns whether a match was found.

- // If the DFA runs out of memory, sets *failed to true and returns false.

- // If matches != NULL and kind == kManyMatch and there is a match,

- // SearchDFA fills matches with the match IDs of the final matching state.

- bool SearchDFA(const StringPiece& text, const StringPiece& context,

- Anchor anchor, MatchKind kind,

- StringPiece* match0, bool* failed,

- vector<int>* matches);

- // Build the entire DFA for the given match kind. FOR TESTING ONLY.

- // Usually the DFA is built out incrementally, as needed, which

- // avoids lots of unnecessary work. This function is useful only

- // for testing purposes. Returns number of states.

- int BuildEntireDFA(MatchKind kind);

- // Compute byte map.

- void ComputeByteMap();

- // Run peep-hole optimizer on program.

- void Optimize();

- // One-pass NFA: only correct if IsOnePass() is true,

- // but much faster than NFA (competitive with PCRE)

- // for those expressions.

- bool IsOnePass();

- bool SearchOnePass(const StringPiece& text, const StringPiece& context,

- Anchor anchor, MatchKind kind,

- StringPiece* match, int nmatch);

- // Bit-state backtracking. Fast on small cases but uses memory

- // proportional to the product of the program size and the text size.

- bool SearchBitState(const StringPiece& text, const StringPiece& context,

- Anchor anchor, MatchKind kind,

- StringPiece* match, int nmatch);

- static const int kMaxOnePassCapture = 5; // $0 through $4

- // Backtracking search: the gold standard against which the other

- // implementations are checked. FOR TESTING ONLY.

- // It allocates a ton of memory to avoid running forever.

- // It is also recursive, so can't use in production (will overflow stacks).

- // The name "Unsafe" here is supposed to be a flag that

- // you should not be using this function.

- bool UnsafeSearchBacktrack(const StringPiece& text,

- const StringPiece& context,

- Anchor anchor, MatchKind kind,

- StringPiece* match, int nmatch);

- // Computes range for any strings matching regexp. The min and max can in

- // some cases be arbitrarily precise, so the caller gets to specify the

- // maximum desired length of string returned.

- //

- // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any

- // string s that is an anchored match for this regexp satisfies

- // min <= s && s <= max.

- //

- // Note that PossibleMatchRange() will only consider the first copy of an

- // infinitely repeated element (i.e., any regexp element followed by a '*' or

- // '+' operator). Regexps with "{N}" constructions are not affected, as those

- // do not compile down to infinite repetitions.

- //

- // Returns true on success, false on error.

- bool PossibleMatchRange(string* min, string* max, int maxlen);

- // EXPERIMENTAL! SUBJECT TO CHANGE!

- // Outputs the program fanout into the given sparse array.

- void Fanout(SparseArray<int>* fanout);

- // Compiles a collection of regexps to Prog. Each regexp will have

- // its own Match instruction recording the index in the vector.

- static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,

- Regexp* re);

- private:

- friend class Compiler;

- DFA* GetDFA(MatchKind kind);

- bool anchor_start_; // regexp has explicit start anchor

- bool anchor_end_; // regexp has explicit end anchor

- bool reversed_; // whether program runs backward over input

- bool did_onepass_; // has IsOnePass been called?

- int start_; // entry point for program

- int start_unanchored_; // unanchored entry point for program

- int size_; // number of instructions

- int byte_inst_count_; // number of kInstByteRange instructions

- int bytemap_range_; // bytemap_[x] < bytemap_range_

- int flags_; // regexp parse flags

- int onepass_statesize_; // byte size of each OneState* node

- Inst* inst_; // pointer to instruction array

- Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_

- DFA* volatile dfa_first_; // DFA cached for kFirstMatch

- DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch

- int64 dfa_mem_; // Maximum memory for DFAs.

- void (*delete_dfa_)(DFA* dfa);

- Bitmap<256> byterange_; // byterange.Get(x) true if x ends a

- // commonly-treated byte range.

- uint8 bytemap_[256]; // map from input bytes to byte classes

- uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x

- uint8* onepass_nodes_; // data for OnePass nodes

- OneState* onepass_start_; // start node for OnePass program

- DISALLOW_COPY_AND_ASSIGN(Prog);

-};

-} // namespace re2

-#endif // RE2_PROG_H__

« no previous file with comments | « third_party/re2/re2/prefilter_tree.cc ('k') | third_party/re2/re2/prog.cc » ('j') | no next file with comments »