Index: third_party/re2/re2/compile.cc |
diff --git a/third_party/re2/re2/compile.cc b/third_party/re2/re2/compile.cc |
deleted file mode 100644 |
index 50375242e5e26f8acbfb109efd244e3bdc930999..0000000000000000000000000000000000000000 |
--- a/third_party/re2/re2/compile.cc |
+++ /dev/null |
@@ -1,1151 +0,0 @@ |
-// Copyright 2007 The RE2 Authors. All Rights Reserved. |
-// Use of this source code is governed by a BSD-style |
-// license that can be found in the LICENSE file. |
- |
-// Compile regular expression to Prog. |
-// |
-// Prog and Inst are defined in prog.h. |
-// This file's external interface is just Regexp::CompileToProg. |
-// The Compiler class defined in this file is private. |
- |
-#include "re2/prog.h" |
-#include "re2/re2.h" |
-#include "re2/regexp.h" |
-#include "re2/walker-inl.h" |
- |
-namespace re2 { |
- |
-// List of pointers to Inst* that need to be filled in (patched). |
-// Because the Inst* haven't been filled in yet, |
-// we can use the Inst* word to hold the list's "next" pointer. |
-// It's kind of sleazy, but it works well in practice. |
-// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. |
-// |
-// Because the out and out1 fields in Inst are no longer pointers, |
-// we can't use pointers directly here either. Instead, p refers |
-// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). |
-// p == 0 represents the NULL list. This is okay because instruction #0 |
-// is always the fail instruction, which never appears on a list. |
- |
-struct PatchList { |
- uint32 p; |
- |
- // Returns patch list containing just p. |
- static PatchList Mk(uint32 p); |
- |
- // Patches all the entries on l to have value v. |
- // Caller must not ever use patch list again. |
- static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); |
- |
- // Deref returns the next pointer pointed at by p. |
- static PatchList Deref(Prog::Inst *inst0, PatchList l); |
- |
- // Appends two patch lists and returns result. |
- static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); |
-}; |
- |
-static PatchList nullPatchList = { 0 }; |
- |
-// Returns patch list containing just p. |
-PatchList PatchList::Mk(uint32 p) { |
- PatchList l; |
- l.p = p; |
- return l; |
-} |
- |
-// Returns the next pointer pointed at by l. |
-PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { |
- Prog::Inst* ip = &inst0[l.p>>1]; |
- if (l.p&1) |
- l.p = ip->out1(); |
- else |
- l.p = ip->out(); |
- return l; |
-} |
- |
-// Patches all the entries on l to have value v. |
-void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { |
- while (l.p != 0) { |
- Prog::Inst* ip = &inst0[l.p>>1]; |
- if (l.p&1) { |
- l.p = ip->out1(); |
- ip->out1_ = val; |
- } else { |
- l.p = ip->out(); |
- ip->set_out(val); |
- } |
- } |
-} |
- |
-// Appends two patch lists and returns result. |
-PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { |
- if (l1.p == 0) |
- return l2; |
- if (l2.p == 0) |
- return l1; |
- |
- PatchList l = l1; |
- for (;;) { |
- PatchList next = PatchList::Deref(inst0, l); |
- if (next.p == 0) |
- break; |
- l = next; |
- } |
- |
- Prog::Inst* ip = &inst0[l.p>>1]; |
- if (l.p&1) |
- ip->out1_ = l2.p; |
- else |
- ip->set_out(l2.p); |
- |
- return l1; |
-} |
- |
-// Compiled program fragment. |
-struct Frag { |
- uint32 begin; |
- PatchList end; |
- |
- Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector |
- Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} |
-}; |
- |
-// Input encodings. |
-enum Encoding { |
- kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) |
- kEncodingLatin1, // Latin1 (0-FF) |
-}; |
- |
-class Compiler : public Regexp::Walker<Frag> { |
- public: |
- explicit Compiler(); |
- ~Compiler(); |
- |
- // Compiles Regexp to a new Prog. |
- // Caller is responsible for deleting Prog when finished with it. |
- // If reversed is true, compiles for walking over the input |
- // string backward (reverses all concatenations). |
- static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); |
- |
- // Compiles alternation of all the re to a new Prog. |
- // Each re has a match with an id equal to its index in the vector. |
- static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
- Regexp* re); |
- |
- // Interface for Regexp::Walker, which helps traverse the Regexp. |
- // The walk is purely post-recursive: given the machines for the |
- // children, PostVisit combines them to create the machine for |
- // the current node. The child_args are Frags. |
- // The Compiler traverses the Regexp parse tree, visiting |
- // each node in depth-first order. It invokes PreVisit before |
- // visiting the node's children and PostVisit after visiting |
- // the children. |
- Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); |
- Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, |
- int nchild_args); |
- Frag ShortVisit(Regexp* re, Frag parent_arg); |
- Frag Copy(Frag arg); |
- |
- // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? |
- Frag Plus(Frag a, bool nongreedy); |
- Frag Star(Frag a, bool nongreedy); |
- Frag Quest(Frag a, bool nongreedy); |
- |
- // Given fragment a, returns (a) capturing as \n. |
- Frag Capture(Frag a, int n); |
- |
- // Given fragments a and b, returns ab; a|b |
- Frag Cat(Frag a, Frag b); |
- Frag Alt(Frag a, Frag b); |
- |
- // Returns a fragment that can't match anything. |
- Frag NoMatch(); |
- |
- // Returns a fragment that matches the empty string. |
- Frag Match(int32 id); |
- |
- // Returns a no-op fragment. |
- Frag Nop(); |
- |
- // Returns a fragment matching the byte range lo-hi. |
- Frag ByteRange(int lo, int hi, bool foldcase); |
- |
- // Returns a fragment matching an empty-width special op. |
- Frag EmptyWidth(EmptyOp op); |
- |
- // Adds n instructions to the program. |
- // Returns the index of the first one. |
- // Returns -1 if no more instructions are available. |
- int AllocInst(int n); |
- |
- // Deletes unused instructions. |
- void Trim(); |
- |
- // Rune range compiler. |
- |
- // Begins a new alternation. |
- void BeginRange(); |
- |
- // Adds a fragment matching the rune range lo-hi. |
- void AddRuneRange(Rune lo, Rune hi, bool foldcase); |
- void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); |
- void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); |
- void Add_80_10ffff(); |
- |
- // New suffix that matches the byte range lo-hi, then goes to next. |
- int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); |
- int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); |
- |
- // Adds a suffix to alternation. |
- void AddSuffix(int id); |
- |
- // Returns the alternation of all the added suffixes. |
- Frag EndRange(); |
- |
- // Single rune. |
- Frag Literal(Rune r, bool foldcase); |
- |
- void Setup(Regexp::ParseFlags, int64, RE2::Anchor); |
- Prog* Finish(); |
- |
- // Returns .* where dot = any byte |
- Frag DotStar(); |
- |
- private: |
- Prog* prog_; // Program being built. |
- bool failed_; // Did we give up compiling? |
- Encoding encoding_; // Input encoding |
- bool reversed_; // Should program run backward over text? |
- |
- int max_inst_; // Maximum number of instructions. |
- |
- Prog::Inst* inst_; // Pointer to first instruction. |
- int inst_len_; // Number of instructions used. |
- int inst_cap_; // Number of instructions allocated. |
- |
- int64 max_mem_; // Total memory budget. |
- |
- map<uint64, int> rune_cache_; |
- Frag rune_range_; |
- |
- RE2::Anchor anchor_; // anchor mode for RE2::Set |
- |
- DISALLOW_COPY_AND_ASSIGN(Compiler); |
-}; |
- |
-Compiler::Compiler() { |
- prog_ = new Prog(); |
- failed_ = false; |
- encoding_ = kEncodingUTF8; |
- reversed_ = false; |
- inst_ = NULL; |
- inst_len_ = 0; |
- inst_cap_ = 0; |
- max_inst_ = 1; // make AllocInst for fail instruction okay |
- max_mem_ = 0; |
- int fail = AllocInst(1); |
- inst_[fail].InitFail(); |
- max_inst_ = 0; // Caller must change |
-} |
- |
-Compiler::~Compiler() { |
- delete prog_; |
- delete[] inst_; |
-} |
- |
-int Compiler::AllocInst(int n) { |
- if (failed_ || inst_len_ + n > max_inst_) { |
- failed_ = true; |
- return -1; |
- } |
- |
- if (inst_len_ + n > inst_cap_) { |
- if (inst_cap_ == 0) |
- inst_cap_ = 8; |
- while (inst_len_ + n > inst_cap_) |
- inst_cap_ *= 2; |
- Prog::Inst* ip = new Prog::Inst[inst_cap_]; |
- memmove(ip, inst_, inst_len_ * sizeof ip[0]); |
- memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); |
- delete[] inst_; |
- inst_ = ip; |
- } |
- int id = inst_len_; |
- inst_len_ += n; |
- return id; |
-} |
- |
-void Compiler::Trim() { |
- if (inst_len_ < inst_cap_) { |
- Prog::Inst* ip = new Prog::Inst[inst_len_]; |
- memmove(ip, inst_, inst_len_ * sizeof ip[0]); |
- delete[] inst_; |
- inst_ = ip; |
- inst_cap_ = inst_len_; |
- } |
-} |
- |
-// These routines are somewhat hard to visualize in text -- |
-// see http://swtch.com/~rsc/regexp/regexp1.html for |
-// pictures explaining what is going on here. |
- |
-// Returns an unmatchable fragment. |
-Frag Compiler::NoMatch() { |
- return Frag(0, nullPatchList); |
-} |
- |
-// Is a an unmatchable fragment? |
-static bool IsNoMatch(Frag a) { |
- return a.begin == 0; |
-} |
- |
-// Given fragments a and b, returns fragment for ab. |
-Frag Compiler::Cat(Frag a, Frag b) { |
- if (IsNoMatch(a) || IsNoMatch(b)) |
- return NoMatch(); |
- |
- // Elide no-op. |
- Prog::Inst* begin = &inst_[a.begin]; |
- if (begin->opcode() == kInstNop && |
- a.end.p == (a.begin << 1) && |
- begin->out() == 0) { |
- PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere |
- return b; |
- } |
- |
- // To run backward over string, reverse all concatenations. |
- if (reversed_) { |
- PatchList::Patch(inst_, b.end, a.begin); |
- return Frag(b.begin, a.end); |
- } |
- |
- PatchList::Patch(inst_, a.end, b.begin); |
- return Frag(a.begin, b.end); |
-} |
- |
-// Given fragments for a and b, returns fragment for a|b. |
-Frag Compiler::Alt(Frag a, Frag b) { |
- // Special case for convenience in loops. |
- if (IsNoMatch(a)) |
- return b; |
- if (IsNoMatch(b)) |
- return a; |
- |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- |
- inst_[id].InitAlt(a.begin, b.begin); |
- return Frag(id, PatchList::Append(inst_, a.end, b.end)); |
-} |
- |
-// When capturing submatches in like-Perl mode, a kOpAlt Inst |
-// treats out_ as the first choice, out1_ as the second. |
-// |
-// For *, +, and ?, if out_ causes another repetition, |
-// then the operator is greedy. If out1_ is the repetition |
-// (and out_ moves forward), then the operator is non-greedy. |
- |
-// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) |
-Frag Compiler::Star(Frag a, bool nongreedy) { |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- inst_[id].InitAlt(0, 0); |
- PatchList::Patch(inst_, a.end, id); |
- if (nongreedy) { |
- inst_[id].out1_ = a.begin; |
- return Frag(id, PatchList::Mk(id << 1)); |
- } else { |
- inst_[id].set_out(a.begin); |
- return Frag(id, PatchList::Mk((id << 1) | 1)); |
- } |
-} |
- |
-// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) |
-Frag Compiler::Plus(Frag a, bool nongreedy) { |
- // a+ is just a* with a different entry point. |
- Frag f = Star(a, nongreedy); |
- return Frag(a.begin, f.end); |
-} |
- |
-// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) |
-Frag Compiler::Quest(Frag a, bool nongreedy) { |
- if (IsNoMatch(a)) |
- return Nop(); |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- PatchList pl; |
- if (nongreedy) { |
- inst_[id].InitAlt(0, a.begin); |
- pl = PatchList::Mk(id << 1); |
- } else { |
- inst_[id].InitAlt(a.begin, 0); |
- pl = PatchList::Mk((id << 1) | 1); |
- } |
- return Frag(id, PatchList::Append(inst_, pl, a.end)); |
-} |
- |
-// Returns a fragment for the byte range lo-hi. |
-Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- inst_[id].InitByteRange(lo, hi, foldcase, 0); |
- prog_->byte_inst_count_++; |
- prog_->MarkByteRange(lo, hi); |
- if (foldcase && lo <= 'z' && hi >= 'a') { |
- if (lo < 'a') |
- lo = 'a'; |
- if (hi > 'z') |
- hi = 'z'; |
- if (lo <= hi) |
- prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); |
- } |
- return Frag(id, PatchList::Mk(id << 1)); |
-} |
- |
-// Returns a no-op fragment. Sometimes unavoidable. |
-Frag Compiler::Nop() { |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- inst_[id].InitNop(0); |
- return Frag(id, PatchList::Mk(id << 1)); |
-} |
- |
-// Returns a fragment that signals a match. |
-Frag Compiler::Match(int32 match_id) { |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- inst_[id].InitMatch(match_id); |
- return Frag(id, nullPatchList); |
-} |
- |
-// Returns a fragment matching a particular empty-width op (like ^ or $) |
-Frag Compiler::EmptyWidth(EmptyOp empty) { |
- int id = AllocInst(1); |
- if (id < 0) |
- return NoMatch(); |
- inst_[id].InitEmptyWidth(empty, 0); |
- if (empty & (kEmptyBeginLine|kEmptyEndLine)) |
- prog_->MarkByteRange('\n', '\n'); |
- if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { |
- int j; |
- for (int i = 0; i < 256; i = j) { |
- for (j = i + 1; j < 256 && |
- Prog::IsWordChar(static_cast<uint8>(i)) == |
- Prog::IsWordChar(static_cast<uint8>(j)); |
- j++) |
- ; |
- prog_->MarkByteRange(i, j-1); |
- } |
- } |
- return Frag(id, PatchList::Mk(id << 1)); |
-} |
- |
-// Given a fragment a, returns a fragment with capturing parens around a. |
-Frag Compiler::Capture(Frag a, int n) { |
- if (IsNoMatch(a)) |
- return NoMatch(); |
- int id = AllocInst(2); |
- if (id < 0) |
- return NoMatch(); |
- inst_[id].InitCapture(2*n, a.begin); |
- inst_[id+1].InitCapture(2*n+1, 0); |
- PatchList::Patch(inst_, a.end, id+1); |
- |
- return Frag(id, PatchList::Mk((id+1) << 1)); |
-} |
- |
-// A Rune is a name for a Unicode code point. |
-// Returns maximum rune encoded by UTF-8 sequence of length len. |
-static int MaxRune(int len) { |
- int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) |
- if (len == 1) |
- b = 7; |
- else |
- b = 8-(len+1) + 6*(len-1); |
- return (1<<b) - 1; // maximum Rune for b bits. |
-} |
- |
-// The rune range compiler caches common suffix fragments, |
-// which are very common in UTF-8 (e.g., [80-bf]). |
-// The fragment suffixes are identified by their start |
-// instructions. NULL denotes the eventual end match. |
-// The Frag accumulates in rune_range_. Caching common |
-// suffixes reduces the UTF-8 "." from 32 to 24 instructions, |
-// and it reduces the corresponding one-pass NFA from 16 nodes to 8. |
- |
-void Compiler::BeginRange() { |
- rune_cache_.clear(); |
- rune_range_.begin = 0; |
- rune_range_.end = nullPatchList; |
-} |
- |
-int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, |
- int next) { |
- Frag f = ByteRange(lo, hi, foldcase); |
- if (next != 0) { |
- PatchList::Patch(inst_, f.end, next); |
- } else { |
- rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); |
- } |
- return f.begin; |
-} |
- |
-int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { |
- // In Latin1 mode, there's no point in caching. |
- // In forward UTF-8 mode, only need to cache continuation bytes. |
- if (encoding_ == kEncodingLatin1 || |
- (encoding_ == kEncodingUTF8 && |
- !reversed_ && |
- !(0x80 <= lo && hi <= 0xbf))) { |
- return UncachedRuneByteSuffix(lo, hi, foldcase, next); |
- } |
- |
- uint64 key = (uint64)next << 17 | |
- (uint64)lo << 9 | |
- (uint64)hi << 1 | |
- (uint64)foldcase; |
- map<uint64, int>::iterator it = rune_cache_.find(key); |
- if (it != rune_cache_.end()) |
- return it->second; |
- int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); |
- rune_cache_[key] = id; |
- return id; |
-} |
- |
-void Compiler::AddSuffix(int id) { |
- if (rune_range_.begin == 0) { |
- rune_range_.begin = id; |
- return; |
- } |
- |
- int alt = AllocInst(1); |
- if (alt < 0) { |
- rune_range_.begin = 0; |
- return; |
- } |
- inst_[alt].InitAlt(rune_range_.begin, id); |
- rune_range_.begin = alt; |
-} |
- |
-Frag Compiler::EndRange() { |
- return rune_range_; |
-} |
- |
-// Converts rune range lo-hi into a fragment that recognizes |
-// the bytes that would make up those runes in the current |
-// encoding (Latin 1 or UTF-8). |
-// This lets the machine work byte-by-byte even when |
-// using multibyte encodings. |
- |
-void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { |
- switch (encoding_) { |
- default: |
- case kEncodingUTF8: |
- AddRuneRangeUTF8(lo, hi, foldcase); |
- break; |
- case kEncodingLatin1: |
- AddRuneRangeLatin1(lo, hi, foldcase); |
- break; |
- } |
-} |
- |
-void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { |
- // Latin1 is easy: runes *are* bytes. |
- if (lo > hi || lo > 0xFF) |
- return; |
- if (hi > 0xFF) |
- hi = 0xFF; |
- AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), |
- foldcase, 0)); |
-} |
- |
-// Table describing how to make a UTF-8 matching machine |
-// for the rune range 80-10FFFF (Runeself-Runemax). |
-// This range happens frequently enough (for example /./ and /[^a-z]/) |
-// and the rune_cache_ map is slow enough that this is worth |
-// special handling. Makes compilation of a small expression |
-// with a dot in it about 10% faster. |
-// The * in the comments below mark whole sequences. |
-static struct ByteRangeProg { |
- int next; |
- int lo; |
- int hi; |
-} prog_80_10ffff[] = { |
- // Two-byte |
- { -1, 0x80, 0xBF, }, // 0: 80-BF |
- { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* |
- |
- // Three-byte |
- { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF |
- { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* |
- { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF |
- { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* |
- |
- // Four-byte |
- { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF |
- { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* |
- { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF |
- { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* |
- { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF |
- { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* |
-}; |
- |
-void Compiler::Add_80_10ffff() { |
- int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning |
- for (int i = 0; i < arraysize(prog_80_10ffff); i++) { |
- const ByteRangeProg& p = prog_80_10ffff[i]; |
- int next = 0; |
- if (p.next >= 0) |
- next = inst[p.next]; |
- inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo), |
- static_cast<uint8>(p.hi), false, next); |
- if ((p.lo & 0xC0) != 0x80) |
- AddSuffix(inst[i]); |
- } |
-} |
- |
-void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { |
- if (lo > hi) |
- return; |
- |
- // Pick off 80-10FFFF as a common special case |
- // that can bypass the slow rune_cache_. |
- if (lo == 0x80 && hi == 0x10ffff && !reversed_) { |
- Add_80_10ffff(); |
- return; |
- } |
- |
- // Split range into same-length sized ranges. |
- for (int i = 1; i < UTFmax; i++) { |
- Rune max = MaxRune(i); |
- if (lo <= max && max < hi) { |
- AddRuneRangeUTF8(lo, max, foldcase); |
- AddRuneRangeUTF8(max+1, hi, foldcase); |
- return; |
- } |
- } |
- |
- // ASCII range is always a special case. |
- if (hi < Runeself) { |
- AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), |
- foldcase, 0)); |
- return; |
- } |
- |
- // Split range into sections that agree on leading bytes. |
- for (int i = 1; i < UTFmax; i++) { |
- uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence |
- if ((lo & ~m) != (hi & ~m)) { |
- if ((lo & m) != 0) { |
- AddRuneRangeUTF8(lo, lo|m, foldcase); |
- AddRuneRangeUTF8((lo|m)+1, hi, foldcase); |
- return; |
- } |
- if ((hi & m) != m) { |
- AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); |
- AddRuneRangeUTF8(hi&~m, hi, foldcase); |
- return; |
- } |
- } |
- } |
- |
- // Finally. Generate byte matching equivalent for lo-hi. |
- uint8 ulo[UTFmax], uhi[UTFmax]; |
- int n = runetochar(reinterpret_cast<char*>(ulo), &lo); |
- int m = runetochar(reinterpret_cast<char*>(uhi), &hi); |
- (void)m; // USED(m) |
- DCHECK_EQ(n, m); |
- |
- int id = 0; |
- if (reversed_) { |
- for (int i = 0; i < n; i++) |
- id = RuneByteSuffix(ulo[i], uhi[i], false, id); |
- } else { |
- for (int i = n-1; i >= 0; i--) |
- id = RuneByteSuffix(ulo[i], uhi[i], false, id); |
- } |
- AddSuffix(id); |
-} |
- |
-// Should not be called. |
-Frag Compiler::Copy(Frag arg) { |
- // We're using WalkExponential; there should be no copying. |
- LOG(DFATAL) << "Compiler::Copy called!"; |
- failed_ = true; |
- return NoMatch(); |
-} |
- |
-// Visits a node quickly; called once WalkExponential has |
-// decided to cut this walk short. |
-Frag Compiler::ShortVisit(Regexp* re, Frag) { |
- failed_ = true; |
- return NoMatch(); |
-} |
- |
-// Called before traversing a node's children during the walk. |
-Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { |
- // Cut off walk if we've already failed. |
- if (failed_) |
- *stop = true; |
- |
- return Frag(); // not used by caller |
-} |
- |
-Frag Compiler::Literal(Rune r, bool foldcase) { |
- switch (encoding_) { |
- default: |
- return Frag(); |
- |
- case kEncodingLatin1: |
- return ByteRange(r, r, foldcase); |
- |
- case kEncodingUTF8: { |
- if (r < Runeself) // Make common case fast. |
- return ByteRange(r, r, foldcase); |
- uint8 buf[UTFmax]; |
- int n = runetochar(reinterpret_cast<char*>(buf), &r); |
- Frag f = ByteRange((uint8)buf[0], buf[0], false); |
- for (int i = 1; i < n; i++) |
- f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); |
- return f; |
- } |
- } |
-} |
- |
-// Called after traversing the node's children during the walk. |
-// Given their frags, build and return the frag for this re. |
-Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, |
- int nchild_frags) { |
- // If a child failed, don't bother going forward, especially |
- // since the child_frags might contain Frags with NULLs in them. |
- if (failed_) |
- return NoMatch(); |
- |
- // Given the child fragments, return the fragment for this node. |
- switch (re->op()) { |
- case kRegexpRepeat: |
- // Should not see; code at bottom of function will print error |
- break; |
- |
- case kRegexpNoMatch: |
- return NoMatch(); |
- |
- case kRegexpEmptyMatch: |
- return Nop(); |
- |
- case kRegexpHaveMatch: { |
- Frag f = Match(re->match_id()); |
- // Remember unanchored match to end of string. |
- if (anchor_ != RE2::ANCHOR_BOTH) |
- f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); |
- return f; |
- } |
- |
- case kRegexpConcat: { |
- Frag f = child_frags[0]; |
- for (int i = 1; i < nchild_frags; i++) |
- f = Cat(f, child_frags[i]); |
- return f; |
- } |
- |
- case kRegexpAlternate: { |
- Frag f = child_frags[0]; |
- for (int i = 1; i < nchild_frags; i++) |
- f = Alt(f, child_frags[i]); |
- return f; |
- } |
- |
- case kRegexpStar: |
- return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
- |
- case kRegexpPlus: |
- return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
- |
- case kRegexpQuest: |
- return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
- |
- case kRegexpLiteral: |
- return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); |
- |
- case kRegexpLiteralString: { |
- // Concatenation of literals. |
- if (re->nrunes() == 0) |
- return Nop(); |
- Frag f; |
- for (int i = 0; i < re->nrunes(); i++) { |
- Frag f1 = Literal(re->runes()[i], |
- (re->parse_flags()&Regexp::FoldCase) != 0); |
- if (i == 0) |
- f = f1; |
- else |
- f = Cat(f, f1); |
- } |
- return f; |
- } |
- |
- case kRegexpAnyChar: |
- BeginRange(); |
- AddRuneRange(0, Runemax, false); |
- return EndRange(); |
- |
- case kRegexpAnyByte: |
- return ByteRange(0x00, 0xFF, false); |
- |
- case kRegexpCharClass: { |
- CharClass* cc = re->cc(); |
- if (cc->empty()) { |
- // This can't happen. |
- LOG(DFATAL) << "No ranges in char class"; |
- failed_ = true; |
- return NoMatch(); |
- } |
- |
- // ASCII case-folding optimization: if the char class |
- // behaves the same on A-Z as it does on a-z, |
- // discard any ranges wholly contained in A-Z |
- // and mark the other ranges as foldascii. |
- // This reduces the size of a program for |
- // (?i)abc from 3 insts per letter to 1 per letter. |
- bool foldascii = cc->FoldsASCII(); |
- |
- // Character class is just a big OR of the different |
- // character ranges in the class. |
- BeginRange(); |
- for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { |
- // ASCII case-folding optimization (see above). |
- if (foldascii && 'A' <= i->lo && i->hi <= 'Z') |
- continue; |
- |
- // If this range contains all of A-Za-z or none of it, |
- // the fold flag is unnecessary; don't bother. |
- bool fold = foldascii; |
- if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || |
- ('Z' < i->lo && i->hi < 'a')) |
- fold = false; |
- |
- AddRuneRange(i->lo, i->hi, fold); |
- } |
- return EndRange(); |
- } |
- |
- case kRegexpCapture: |
- // If this is a non-capturing parenthesis -- (?:foo) -- |
- // just use the inner expression. |
- if (re->cap() < 0) |
- return child_frags[0]; |
- return Capture(child_frags[0], re->cap()); |
- |
- case kRegexpBeginLine: |
- return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); |
- |
- case kRegexpEndLine: |
- return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); |
- |
- case kRegexpBeginText: |
- return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); |
- |
- case kRegexpEndText: |
- return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); |
- |
- case kRegexpWordBoundary: |
- return EmptyWidth(kEmptyWordBoundary); |
- |
- case kRegexpNoWordBoundary: |
- return EmptyWidth(kEmptyNonWordBoundary); |
- } |
- LOG(DFATAL) << "Missing case in Compiler: " << re->op(); |
- failed_ = true; |
- return NoMatch(); |
-} |
- |
-// Is this regexp required to start at the beginning of the text? |
-// Only approximate; can return false for complicated regexps like (\Aa|\Ab), |
-// but handles (\A(a|b)). Could use the Walker to write a more exact one. |
-static bool IsAnchorStart(Regexp** pre, int depth) { |
- Regexp* re = *pre; |
- Regexp* sub; |
- // The depth limit makes sure that we don't overflow |
- // the stack on a deeply nested regexp. As the comment |
- // above says, IsAnchorStart is conservative, so returning |
- // a false negative is okay. The exact limit is somewhat arbitrary. |
- if (re == NULL || depth >= 4) |
- return false; |
- switch (re->op()) { |
- default: |
- break; |
- case kRegexpConcat: |
- if (re->nsub() > 0) { |
- sub = re->sub()[0]->Incref(); |
- if (IsAnchorStart(&sub, depth+1)) { |
- Regexp** subcopy = new Regexp*[re->nsub()]; |
- subcopy[0] = sub; // already have reference |
- for (int i = 1; i < re->nsub(); i++) |
- subcopy[i] = re->sub()[i]->Incref(); |
- *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); |
- delete[] subcopy; |
- re->Decref(); |
- return true; |
- } |
- sub->Decref(); |
- } |
- break; |
- case kRegexpCapture: |
- sub = re->sub()[0]->Incref(); |
- if (IsAnchorStart(&sub, depth+1)) { |
- *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); |
- re->Decref(); |
- return true; |
- } |
- sub->Decref(); |
- break; |
- case kRegexpBeginText: |
- *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
- re->Decref(); |
- return true; |
- } |
- return false; |
-} |
- |
-// Is this regexp required to start at the end of the text? |
-// Only approximate; can return false for complicated regexps like (a\z|b\z), |
-// but handles ((a|b)\z). Could use the Walker to write a more exact one. |
-static bool IsAnchorEnd(Regexp** pre, int depth) { |
- Regexp* re = *pre; |
- Regexp* sub; |
- // The depth limit makes sure that we don't overflow |
- // the stack on a deeply nested regexp. As the comment |
- // above says, IsAnchorEnd is conservative, so returning |
- // a false negative is okay. The exact limit is somewhat arbitrary. |
- if (re == NULL || depth >= 4) |
- return false; |
- switch (re->op()) { |
- default: |
- break; |
- case kRegexpConcat: |
- if (re->nsub() > 0) { |
- sub = re->sub()[re->nsub() - 1]->Incref(); |
- if (IsAnchorEnd(&sub, depth+1)) { |
- Regexp** subcopy = new Regexp*[re->nsub()]; |
- subcopy[re->nsub() - 1] = sub; // already have reference |
- for (int i = 0; i < re->nsub() - 1; i++) |
- subcopy[i] = re->sub()[i]->Incref(); |
- *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); |
- delete[] subcopy; |
- re->Decref(); |
- return true; |
- } |
- sub->Decref(); |
- } |
- break; |
- case kRegexpCapture: |
- sub = re->sub()[0]->Incref(); |
- if (IsAnchorEnd(&sub, depth+1)) { |
- *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); |
- re->Decref(); |
- return true; |
- } |
- sub->Decref(); |
- break; |
- case kRegexpEndText: |
- *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
- re->Decref(); |
- return true; |
- } |
- return false; |
-} |
- |
-void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, |
- RE2::Anchor anchor) { |
- prog_->set_flags(flags); |
- |
- if (flags & Regexp::Latin1) |
- encoding_ = kEncodingLatin1; |
- max_mem_ = max_mem; |
- if (max_mem <= 0) { |
- max_inst_ = 100000; // more than enough |
- } else if (max_mem <= static_cast<int64>(sizeof(Prog))) { |
- // No room for anything. |
- max_inst_ = 0; |
- } else { |
- int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); |
- // Limit instruction count so that inst->id() fits nicely in an int. |
- // SparseArray also assumes that the indices (inst->id()) are ints. |
- // The call to WalkExponential uses 2*max_inst_ below, |
- // and other places in the code use 2 or 3 * prog->size(). |
- // Limiting to 2^24 should avoid overflow in those places. |
- // (The point of allowing more than 32 bits of memory is to |
- // have plenty of room for the DFA states, not to use it up |
- // on the program.) |
- if (m >= 1<<24) |
- m = 1<<24; |
- |
- // Inst imposes its own limit (currently bigger than 2^24 but be safe). |
- if (m > Prog::Inst::kMaxInst) |
- m = Prog::Inst::kMaxInst; |
- |
- max_inst_ = static_cast<int>(m); |
- } |
- |
- anchor_ = anchor; |
-} |
- |
-// Compiles re, returning program. |
-// Caller is responsible for deleting prog_. |
-// If reversed is true, compiles a program that expects |
-// to run over the input string backward (reverses all concatenations). |
-// The reversed flag is also recorded in the returned program. |
-Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { |
- Compiler c; |
- |
- c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); |
- c.reversed_ = reversed; |
- |
- // Simplify to remove things like counted repetitions |
- // and character classes like \d. |
- Regexp* sre = re->Simplify(); |
- if (sre == NULL) |
- return NULL; |
- |
- // Record whether prog is anchored, removing the anchors. |
- // (They get in the way of other optimizations.) |
- bool is_anchor_start = IsAnchorStart(&sre, 0); |
- bool is_anchor_end = IsAnchorEnd(&sre, 0); |
- |
- // Generate fragment for entire regexp. |
- Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_); |
- sre->Decref(); |
- if (c.failed_) |
- return NULL; |
- |
- // Success! Finish by putting Match node at end, and record start. |
- // Turn off c.reversed_ (if it is set) to force the remaining concatenations |
- // to behave normally. |
- c.reversed_ = false; |
- Frag all = c.Cat(f, c.Match(0)); |
- c.prog_->set_start(all.begin); |
- |
- if (reversed) { |
- c.prog_->set_anchor_start(is_anchor_end); |
- c.prog_->set_anchor_end(is_anchor_start); |
- } else { |
- c.prog_->set_anchor_start(is_anchor_start); |
- c.prog_->set_anchor_end(is_anchor_end); |
- } |
- |
- // Also create unanchored version, which starts with a .*? loop. |
- if (c.prog_->anchor_start()) { |
- c.prog_->set_start_unanchored(c.prog_->start()); |
- } else { |
- Frag unanchored = c.Cat(c.DotStar(), all); |
- c.prog_->set_start_unanchored(unanchored.begin); |
- } |
- |
- c.prog_->set_reversed(reversed); |
- |
- // Hand ownership of prog_ to caller. |
- return c.Finish(); |
-} |
- |
-Prog* Compiler::Finish() { |
- if (failed_) |
- return NULL; |
- |
- if (prog_->start() == 0 && prog_->start_unanchored() == 0) { |
- // No possible matches; keep Fail instruction only. |
- inst_len_ = 1; |
- } |
- |
- // Trim instruction to minimum array and transfer to Prog. |
- Trim(); |
- prog_->inst_ = inst_; |
- prog_->size_ = inst_len_; |
- inst_ = NULL; |
- |
- // Compute byte map. |
- prog_->ComputeByteMap(); |
- |
- prog_->Optimize(); |
- |
- // Record remaining memory for DFA. |
- if (max_mem_ <= 0) { |
- prog_->set_dfa_mem(1<<20); |
- } else { |
- int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); |
- if (m < 0) |
- m = 0; |
- prog_->set_dfa_mem(m); |
- } |
- |
- Prog* p = prog_; |
- prog_ = NULL; |
- return p; |
-} |
- |
-// Converts Regexp to Prog. |
-Prog* Regexp::CompileToProg(int64 max_mem) { |
- return Compiler::Compile(this, false, max_mem); |
-} |
- |
-Prog* Regexp::CompileToReverseProg(int64 max_mem) { |
- return Compiler::Compile(this, true, max_mem); |
-} |
- |
-Frag Compiler::DotStar() { |
- return Star(ByteRange(0x00, 0xff, false), true); |
-} |
- |
-// Compiles RE set to Prog. |
-Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
- Regexp* re) { |
- Compiler c; |
- |
- Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); |
- c.Setup(pf, options.max_mem(), anchor); |
- |
- // Compile alternation of fragments. |
- Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_); |
- re->Decref(); |
- if (c.failed_) |
- return NULL; |
- |
- if (anchor == RE2::UNANCHORED) { |
- // The trailing .* was added while handling kRegexpHaveMatch. |
- // We just have to add the leading one. |
- all = c.Cat(c.DotStar(), all); |
- } |
- |
- c.prog_->set_start(all.begin); |
- c.prog_->set_start_unanchored(all.begin); |
- c.prog_->set_anchor_start(true); |
- c.prog_->set_anchor_end(true); |
- |
- Prog* prog = c.Finish(); |
- if (prog == NULL) |
- return NULL; |
- |
- // Make sure DFA has enough memory to operate, |
- // since we're not going to fall back to the NFA. |
- bool failed; |
- StringPiece sp = "hello, world"; |
- prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, |
- NULL, &failed, NULL); |
- if (failed) { |
- delete prog; |
- return NULL; |
- } |
- |
- return prog; |
-} |
- |
-Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
- Regexp* re) { |
- return Compiler::CompileSet(options, anchor, re); |
-} |
- |
-} // namespace re2 |