| OLD | NEW |
| (Empty) |
| 1 // Copyright 2007 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Compile regular expression to Prog. | |
| 6 // | |
| 7 // Prog and Inst are defined in prog.h. | |
| 8 // This file's external interface is just Regexp::CompileToProg. | |
| 9 // The Compiler class defined in this file is private. | |
| 10 | |
| 11 #include "re2/prog.h" | |
| 12 #include "re2/re2.h" | |
| 13 #include "re2/regexp.h" | |
| 14 #include "re2/walker-inl.h" | |
| 15 | |
| 16 namespace re2 { | |
| 17 | |
| 18 // List of pointers to Inst* that need to be filled in (patched). | |
| 19 // Because the Inst* haven't been filled in yet, | |
| 20 // we can use the Inst* word to hold the list's "next" pointer. | |
| 21 // It's kind of sleazy, but it works well in practice. | |
| 22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. | |
| 23 // | |
| 24 // Because the out and out1 fields in Inst are no longer pointers, | |
| 25 // we can't use pointers directly here either. Instead, p refers | |
| 26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). | |
| 27 // p == 0 represents the NULL list. This is okay because instruction #0 | |
| 28 // is always the fail instruction, which never appears on a list. | |
| 29 | |
| 30 struct PatchList { | |
| 31 uint32 p; | |
| 32 | |
| 33 // Returns patch list containing just p. | |
| 34 static PatchList Mk(uint32 p); | |
| 35 | |
| 36 // Patches all the entries on l to have value v. | |
| 37 // Caller must not ever use patch list again. | |
| 38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); | |
| 39 | |
| 40 // Deref returns the next pointer pointed at by p. | |
| 41 static PatchList Deref(Prog::Inst *inst0, PatchList l); | |
| 42 | |
| 43 // Appends two patch lists and returns result. | |
| 44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); | |
| 45 }; | |
| 46 | |
| 47 static PatchList nullPatchList = { 0 }; | |
| 48 | |
| 49 // Returns patch list containing just p. | |
| 50 PatchList PatchList::Mk(uint32 p) { | |
| 51 PatchList l; | |
| 52 l.p = p; | |
| 53 return l; | |
| 54 } | |
| 55 | |
| 56 // Returns the next pointer pointed at by l. | |
| 57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { | |
| 58 Prog::Inst* ip = &inst0[l.p>>1]; | |
| 59 if (l.p&1) | |
| 60 l.p = ip->out1(); | |
| 61 else | |
| 62 l.p = ip->out(); | |
| 63 return l; | |
| 64 } | |
| 65 | |
| 66 // Patches all the entries on l to have value v. | |
| 67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { | |
| 68 while (l.p != 0) { | |
| 69 Prog::Inst* ip = &inst0[l.p>>1]; | |
| 70 if (l.p&1) { | |
| 71 l.p = ip->out1(); | |
| 72 ip->out1_ = val; | |
| 73 } else { | |
| 74 l.p = ip->out(); | |
| 75 ip->set_out(val); | |
| 76 } | |
| 77 } | |
| 78 } | |
| 79 | |
| 80 // Appends two patch lists and returns result. | |
| 81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { | |
| 82 if (l1.p == 0) | |
| 83 return l2; | |
| 84 if (l2.p == 0) | |
| 85 return l1; | |
| 86 | |
| 87 PatchList l = l1; | |
| 88 for (;;) { | |
| 89 PatchList next = PatchList::Deref(inst0, l); | |
| 90 if (next.p == 0) | |
| 91 break; | |
| 92 l = next; | |
| 93 } | |
| 94 | |
| 95 Prog::Inst* ip = &inst0[l.p>>1]; | |
| 96 if (l.p&1) | |
| 97 ip->out1_ = l2.p; | |
| 98 else | |
| 99 ip->set_out(l2.p); | |
| 100 | |
| 101 return l1; | |
| 102 } | |
| 103 | |
| 104 // Compiled program fragment. | |
| 105 struct Frag { | |
| 106 uint32 begin; | |
| 107 PatchList end; | |
| 108 | |
| 109 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector | |
| 110 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} | |
| 111 }; | |
| 112 | |
// Encodings the compiled program can read its input in.
enum Encoding {
  kEncodingUTF8 = 1,    // UTF-8 (0-10FFFF)
  kEncodingLatin1 = 2,  // Latin1 (0-FF)
};
| 118 | |
| 119 class Compiler : public Regexp::Walker<Frag> { | |
| 120 public: | |
| 121 explicit Compiler(); | |
| 122 ~Compiler(); | |
| 123 | |
| 124 // Compiles Regexp to a new Prog. | |
| 125 // Caller is responsible for deleting Prog when finished with it. | |
| 126 // If reversed is true, compiles for walking over the input | |
| 127 // string backward (reverses all concatenations). | |
| 128 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); | |
| 129 | |
| 130 // Compiles alternation of all the re to a new Prog. | |
| 131 // Each re has a match with an id equal to its index in the vector. | |
| 132 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
| 133 Regexp* re); | |
| 134 | |
| 135 // Interface for Regexp::Walker, which helps traverse the Regexp. | |
| 136 // The walk is purely post-recursive: given the machines for the | |
| 137 // children, PostVisit combines them to create the machine for | |
| 138 // the current node. The child_args are Frags. | |
| 139 // The Compiler traverses the Regexp parse tree, visiting | |
| 140 // each node in depth-first order. It invokes PreVisit before | |
| 141 // visiting the node's children and PostVisit after visiting | |
| 142 // the children. | |
| 143 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); | |
| 144 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, | |
| 145 int nchild_args); | |
| 146 Frag ShortVisit(Regexp* re, Frag parent_arg); | |
| 147 Frag Copy(Frag arg); | |
| 148 | |
| 149 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? | |
| 150 Frag Plus(Frag a, bool nongreedy); | |
| 151 Frag Star(Frag a, bool nongreedy); | |
| 152 Frag Quest(Frag a, bool nongreedy); | |
| 153 | |
| 154 // Given fragment a, returns (a) capturing as \n. | |
| 155 Frag Capture(Frag a, int n); | |
| 156 | |
| 157 // Given fragments a and b, returns ab; a|b | |
| 158 Frag Cat(Frag a, Frag b); | |
| 159 Frag Alt(Frag a, Frag b); | |
| 160 | |
| 161 // Returns a fragment that can't match anything. | |
| 162 Frag NoMatch(); | |
| 163 | |
| 164 // Returns a fragment that matches the empty string. | |
| 165 Frag Match(int32 id); | |
| 166 | |
| 167 // Returns a no-op fragment. | |
| 168 Frag Nop(); | |
| 169 | |
| 170 // Returns a fragment matching the byte range lo-hi. | |
| 171 Frag ByteRange(int lo, int hi, bool foldcase); | |
| 172 | |
| 173 // Returns a fragment matching an empty-width special op. | |
| 174 Frag EmptyWidth(EmptyOp op); | |
| 175 | |
| 176 // Adds n instructions to the program. | |
| 177 // Returns the index of the first one. | |
| 178 // Returns -1 if no more instructions are available. | |
| 179 int AllocInst(int n); | |
| 180 | |
| 181 // Deletes unused instructions. | |
| 182 void Trim(); | |
| 183 | |
| 184 // Rune range compiler. | |
| 185 | |
| 186 // Begins a new alternation. | |
| 187 void BeginRange(); | |
| 188 | |
| 189 // Adds a fragment matching the rune range lo-hi. | |
| 190 void AddRuneRange(Rune lo, Rune hi, bool foldcase); | |
| 191 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); | |
| 192 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); | |
| 193 void Add_80_10ffff(); | |
| 194 | |
| 195 // New suffix that matches the byte range lo-hi, then goes to next. | |
| 196 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); | |
| 197 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); | |
| 198 | |
| 199 // Adds a suffix to alternation. | |
| 200 void AddSuffix(int id); | |
| 201 | |
| 202 // Returns the alternation of all the added suffixes. | |
| 203 Frag EndRange(); | |
| 204 | |
| 205 // Single rune. | |
| 206 Frag Literal(Rune r, bool foldcase); | |
| 207 | |
| 208 void Setup(Regexp::ParseFlags, int64, RE2::Anchor); | |
| 209 Prog* Finish(); | |
| 210 | |
| 211 // Returns .* where dot = any byte | |
| 212 Frag DotStar(); | |
| 213 | |
| 214 private: | |
| 215 Prog* prog_; // Program being built. | |
| 216 bool failed_; // Did we give up compiling? | |
| 217 Encoding encoding_; // Input encoding | |
| 218 bool reversed_; // Should program run backward over text? | |
| 219 | |
| 220 int max_inst_; // Maximum number of instructions. | |
| 221 | |
| 222 Prog::Inst* inst_; // Pointer to first instruction. | |
| 223 int inst_len_; // Number of instructions used. | |
| 224 int inst_cap_; // Number of instructions allocated. | |
| 225 | |
| 226 int64 max_mem_; // Total memory budget. | |
| 227 | |
| 228 map<uint64, int> rune_cache_; | |
| 229 Frag rune_range_; | |
| 230 | |
| 231 RE2::Anchor anchor_; // anchor mode for RE2::Set | |
| 232 | |
| 233 DISALLOW_COPY_AND_ASSIGN(Compiler); | |
| 234 }; | |
| 235 | |
| 236 Compiler::Compiler() { | |
| 237 prog_ = new Prog(); | |
| 238 failed_ = false; | |
| 239 encoding_ = kEncodingUTF8; | |
| 240 reversed_ = false; | |
| 241 inst_ = NULL; | |
| 242 inst_len_ = 0; | |
| 243 inst_cap_ = 0; | |
| 244 max_inst_ = 1; // make AllocInst for fail instruction okay | |
| 245 max_mem_ = 0; | |
| 246 int fail = AllocInst(1); | |
| 247 inst_[fail].InitFail(); | |
| 248 max_inst_ = 0; // Caller must change | |
| 249 } | |
| 250 | |
| 251 Compiler::~Compiler() { | |
| 252 delete prog_; | |
| 253 delete[] inst_; | |
| 254 } | |
| 255 | |
| 256 int Compiler::AllocInst(int n) { | |
| 257 if (failed_ || inst_len_ + n > max_inst_) { | |
| 258 failed_ = true; | |
| 259 return -1; | |
| 260 } | |
| 261 | |
| 262 if (inst_len_ + n > inst_cap_) { | |
| 263 if (inst_cap_ == 0) | |
| 264 inst_cap_ = 8; | |
| 265 while (inst_len_ + n > inst_cap_) | |
| 266 inst_cap_ *= 2; | |
| 267 Prog::Inst* ip = new Prog::Inst[inst_cap_]; | |
| 268 memmove(ip, inst_, inst_len_ * sizeof ip[0]); | |
| 269 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); | |
| 270 delete[] inst_; | |
| 271 inst_ = ip; | |
| 272 } | |
| 273 int id = inst_len_; | |
| 274 inst_len_ += n; | |
| 275 return id; | |
| 276 } | |
| 277 | |
| 278 void Compiler::Trim() { | |
| 279 if (inst_len_ < inst_cap_) { | |
| 280 Prog::Inst* ip = new Prog::Inst[inst_len_]; | |
| 281 memmove(ip, inst_, inst_len_ * sizeof ip[0]); | |
| 282 delete[] inst_; | |
| 283 inst_ = ip; | |
| 284 inst_cap_ = inst_len_; | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 // These routines are somewhat hard to visualize in text -- | |
| 289 // see http://swtch.com/~rsc/regexp/regexp1.html for | |
| 290 // pictures explaining what is going on here. | |
| 291 | |
| 292 // Returns an unmatchable fragment. | |
| 293 Frag Compiler::NoMatch() { | |
| 294 return Frag(0, nullPatchList); | |
| 295 } | |
| 296 | |
| 297 // Is a an unmatchable fragment? | |
| 298 static bool IsNoMatch(Frag a) { | |
| 299 return a.begin == 0; | |
| 300 } | |
| 301 | |
| 302 // Given fragments a and b, returns fragment for ab. | |
| 303 Frag Compiler::Cat(Frag a, Frag b) { | |
| 304 if (IsNoMatch(a) || IsNoMatch(b)) | |
| 305 return NoMatch(); | |
| 306 | |
| 307 // Elide no-op. | |
| 308 Prog::Inst* begin = &inst_[a.begin]; | |
| 309 if (begin->opcode() == kInstNop && | |
| 310 a.end.p == (a.begin << 1) && | |
| 311 begin->out() == 0) { | |
| 312 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere | |
| 313 return b; | |
| 314 } | |
| 315 | |
| 316 // To run backward over string, reverse all concatenations. | |
| 317 if (reversed_) { | |
| 318 PatchList::Patch(inst_, b.end, a.begin); | |
| 319 return Frag(b.begin, a.end); | |
| 320 } | |
| 321 | |
| 322 PatchList::Patch(inst_, a.end, b.begin); | |
| 323 return Frag(a.begin, b.end); | |
| 324 } | |
| 325 | |
| 326 // Given fragments for a and b, returns fragment for a|b. | |
| 327 Frag Compiler::Alt(Frag a, Frag b) { | |
| 328 // Special case for convenience in loops. | |
| 329 if (IsNoMatch(a)) | |
| 330 return b; | |
| 331 if (IsNoMatch(b)) | |
| 332 return a; | |
| 333 | |
| 334 int id = AllocInst(1); | |
| 335 if (id < 0) | |
| 336 return NoMatch(); | |
| 337 | |
| 338 inst_[id].InitAlt(a.begin, b.begin); | |
| 339 return Frag(id, PatchList::Append(inst_, a.end, b.end)); | |
| 340 } | |
| 341 | |
| 342 // When capturing submatches in like-Perl mode, a kOpAlt Inst | |
| 343 // treats out_ as the first choice, out1_ as the second. | |
| 344 // | |
| 345 // For *, +, and ?, if out_ causes another repetition, | |
| 346 // then the operator is greedy. If out1_ is the repetition | |
| 347 // (and out_ moves forward), then the operator is non-greedy. | |
| 348 | |
| 349 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) | |
| 350 Frag Compiler::Star(Frag a, bool nongreedy) { | |
| 351 int id = AllocInst(1); | |
| 352 if (id < 0) | |
| 353 return NoMatch(); | |
| 354 inst_[id].InitAlt(0, 0); | |
| 355 PatchList::Patch(inst_, a.end, id); | |
| 356 if (nongreedy) { | |
| 357 inst_[id].out1_ = a.begin; | |
| 358 return Frag(id, PatchList::Mk(id << 1)); | |
| 359 } else { | |
| 360 inst_[id].set_out(a.begin); | |
| 361 return Frag(id, PatchList::Mk((id << 1) | 1)); | |
| 362 } | |
| 363 } | |
| 364 | |
| 365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) | |
| 366 Frag Compiler::Plus(Frag a, bool nongreedy) { | |
| 367 // a+ is just a* with a different entry point. | |
| 368 Frag f = Star(a, nongreedy); | |
| 369 return Frag(a.begin, f.end); | |
| 370 } | |
| 371 | |
| 372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) | |
| 373 Frag Compiler::Quest(Frag a, bool nongreedy) { | |
| 374 if (IsNoMatch(a)) | |
| 375 return Nop(); | |
| 376 int id = AllocInst(1); | |
| 377 if (id < 0) | |
| 378 return NoMatch(); | |
| 379 PatchList pl; | |
| 380 if (nongreedy) { | |
| 381 inst_[id].InitAlt(0, a.begin); | |
| 382 pl = PatchList::Mk(id << 1); | |
| 383 } else { | |
| 384 inst_[id].InitAlt(a.begin, 0); | |
| 385 pl = PatchList::Mk((id << 1) | 1); | |
| 386 } | |
| 387 return Frag(id, PatchList::Append(inst_, pl, a.end)); | |
| 388 } | |
| 389 | |
| 390 // Returns a fragment for the byte range lo-hi. | |
| 391 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { | |
| 392 int id = AllocInst(1); | |
| 393 if (id < 0) | |
| 394 return NoMatch(); | |
| 395 inst_[id].InitByteRange(lo, hi, foldcase, 0); | |
| 396 prog_->byte_inst_count_++; | |
| 397 prog_->MarkByteRange(lo, hi); | |
| 398 if (foldcase && lo <= 'z' && hi >= 'a') { | |
| 399 if (lo < 'a') | |
| 400 lo = 'a'; | |
| 401 if (hi > 'z') | |
| 402 hi = 'z'; | |
| 403 if (lo <= hi) | |
| 404 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); | |
| 405 } | |
| 406 return Frag(id, PatchList::Mk(id << 1)); | |
| 407 } | |
| 408 | |
| 409 // Returns a no-op fragment. Sometimes unavoidable. | |
| 410 Frag Compiler::Nop() { | |
| 411 int id = AllocInst(1); | |
| 412 if (id < 0) | |
| 413 return NoMatch(); | |
| 414 inst_[id].InitNop(0); | |
| 415 return Frag(id, PatchList::Mk(id << 1)); | |
| 416 } | |
| 417 | |
| 418 // Returns a fragment that signals a match. | |
| 419 Frag Compiler::Match(int32 match_id) { | |
| 420 int id = AllocInst(1); | |
| 421 if (id < 0) | |
| 422 return NoMatch(); | |
| 423 inst_[id].InitMatch(match_id); | |
| 424 return Frag(id, nullPatchList); | |
| 425 } | |
| 426 | |
| 427 // Returns a fragment matching a particular empty-width op (like ^ or $) | |
| 428 Frag Compiler::EmptyWidth(EmptyOp empty) { | |
| 429 int id = AllocInst(1); | |
| 430 if (id < 0) | |
| 431 return NoMatch(); | |
| 432 inst_[id].InitEmptyWidth(empty, 0); | |
| 433 if (empty & (kEmptyBeginLine|kEmptyEndLine)) | |
| 434 prog_->MarkByteRange('\n', '\n'); | |
| 435 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { | |
| 436 int j; | |
| 437 for (int i = 0; i < 256; i = j) { | |
| 438 for (j = i + 1; j < 256 && | |
| 439 Prog::IsWordChar(static_cast<uint8>(i)) == | |
| 440 Prog::IsWordChar(static_cast<uint8>(j)); | |
| 441 j++) | |
| 442 ; | |
| 443 prog_->MarkByteRange(i, j-1); | |
| 444 } | |
| 445 } | |
| 446 return Frag(id, PatchList::Mk(id << 1)); | |
| 447 } | |
| 448 | |
| 449 // Given a fragment a, returns a fragment with capturing parens around a. | |
| 450 Frag Compiler::Capture(Frag a, int n) { | |
| 451 if (IsNoMatch(a)) | |
| 452 return NoMatch(); | |
| 453 int id = AllocInst(2); | |
| 454 if (id < 0) | |
| 455 return NoMatch(); | |
| 456 inst_[id].InitCapture(2*n, a.begin); | |
| 457 inst_[id+1].InitCapture(2*n+1, 0); | |
| 458 PatchList::Patch(inst_, a.end, id+1); | |
| 459 | |
| 460 return Frag(id, PatchList::Mk((id+1) << 1)); | |
| 461 } | |
| 462 | |
// A Rune is a name for a Unicode code point.
// Returns the largest rune that a UTF-8 sequence of len bytes can
// encode (len < UTFmax).  A one-byte sequence carries 7 rune bits;
// a longer one carries 8-(len+1) bits in its lead byte plus 6 bits in
// each of the len-1 continuation bytes.
static int MaxRune(int len) {
  const int bits = (len == 1) ? 7 : 8-(len+1) + 6*(len-1);
  return (1 << bits) - 1;
}
| 473 | |
| 474 // The rune range compiler caches common suffix fragments, | |
| 475 // which are very common in UTF-8 (e.g., [80-bf]). | |
| 476 // The fragment suffixes are identified by their start | |
| 477 // instructions. NULL denotes the eventual end match. | |
| 478 // The Frag accumulates in rune_range_. Caching common | |
| 479 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, | |
| 480 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. | |
| 481 | |
| 482 void Compiler::BeginRange() { | |
| 483 rune_cache_.clear(); | |
| 484 rune_range_.begin = 0; | |
| 485 rune_range_.end = nullPatchList; | |
| 486 } | |
| 487 | |
| 488 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, | |
| 489 int next) { | |
| 490 Frag f = ByteRange(lo, hi, foldcase); | |
| 491 if (next != 0) { | |
| 492 PatchList::Patch(inst_, f.end, next); | |
| 493 } else { | |
| 494 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); | |
| 495 } | |
| 496 return f.begin; | |
| 497 } | |
| 498 | |
| 499 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { | |
| 500 // In Latin1 mode, there's no point in caching. | |
| 501 // In forward UTF-8 mode, only need to cache continuation bytes. | |
| 502 if (encoding_ == kEncodingLatin1 || | |
| 503 (encoding_ == kEncodingUTF8 && | |
| 504 !reversed_ && | |
| 505 !(0x80 <= lo && hi <= 0xbf))) { | |
| 506 return UncachedRuneByteSuffix(lo, hi, foldcase, next); | |
| 507 } | |
| 508 | |
| 509 uint64 key = (uint64)next << 17 | | |
| 510 (uint64)lo << 9 | | |
| 511 (uint64)hi << 1 | | |
| 512 (uint64)foldcase; | |
| 513 map<uint64, int>::iterator it = rune_cache_.find(key); | |
| 514 if (it != rune_cache_.end()) | |
| 515 return it->second; | |
| 516 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); | |
| 517 rune_cache_[key] = id; | |
| 518 return id; | |
| 519 } | |
| 520 | |
| 521 void Compiler::AddSuffix(int id) { | |
| 522 if (rune_range_.begin == 0) { | |
| 523 rune_range_.begin = id; | |
| 524 return; | |
| 525 } | |
| 526 | |
| 527 int alt = AllocInst(1); | |
| 528 if (alt < 0) { | |
| 529 rune_range_.begin = 0; | |
| 530 return; | |
| 531 } | |
| 532 inst_[alt].InitAlt(rune_range_.begin, id); | |
| 533 rune_range_.begin = alt; | |
| 534 } | |
| 535 | |
| 536 Frag Compiler::EndRange() { | |
| 537 return rune_range_; | |
| 538 } | |
| 539 | |
| 540 // Converts rune range lo-hi into a fragment that recognizes | |
| 541 // the bytes that would make up those runes in the current | |
| 542 // encoding (Latin 1 or UTF-8). | |
| 543 // This lets the machine work byte-by-byte even when | |
| 544 // using multibyte encodings. | |
| 545 | |
| 546 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { | |
| 547 switch (encoding_) { | |
| 548 default: | |
| 549 case kEncodingUTF8: | |
| 550 AddRuneRangeUTF8(lo, hi, foldcase); | |
| 551 break; | |
| 552 case kEncodingLatin1: | |
| 553 AddRuneRangeLatin1(lo, hi, foldcase); | |
| 554 break; | |
| 555 } | |
| 556 } | |
| 557 | |
| 558 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { | |
| 559 // Latin1 is easy: runes *are* bytes. | |
| 560 if (lo > hi || lo > 0xFF) | |
| 561 return; | |
| 562 if (hi > 0xFF) | |
| 563 hi = 0xFF; | |
| 564 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), | |
| 565 foldcase, 0)); | |
| 566 } | |
| 567 | |
// Precomputed recipe for the UTF-8 machine matching the rune range
// 80-10FFFF (Runeself-Runemax).  The range shows up often enough
// (for example /./ and /[^a-z]/), and the rune_cache_ map is slow
// enough, that a dedicated table is worthwhile: it makes compiling a
// small expression containing a dot about 10% faster.
//
// Each entry matches a byte in lo-hi and then continues to entry
// number next (-1 means there is no following entry).
// A * in the comments below marks a whole sequence.
struct ByteRangeProg {
  int next;
  int lo;
  int hi;
};

static ByteRangeProg prog_80_10ffff[] = {
  // Two-byte
  { -1, 0x80, 0xBF, },  // 0:  80-BF
  {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*

  // Three-byte
  {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
  {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
  {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
  {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*

  // Four-byte
  {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
  {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
  {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
  {  8, 0xF1, 0xF3, },  // 9:  F1-F3 80-BF 80-BF 80-BF*
  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
};
| 598 | |
| 599 void Compiler::Add_80_10ffff() { | |
| 600 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize
d; silences gcc warning | |
| 601 for (int i = 0; i < arraysize(prog_80_10ffff); i++) { | |
| 602 const ByteRangeProg& p = prog_80_10ffff[i]; | |
| 603 int next = 0; | |
| 604 if (p.next >= 0) | |
| 605 next = inst[p.next]; | |
| 606 inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo), | |
| 607 static_cast<uint8>(p.hi), false, next); | |
| 608 if ((p.lo & 0xC0) != 0x80) | |
| 609 AddSuffix(inst[i]); | |
| 610 } | |
| 611 } | |
| 612 | |
| 613 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { | |
| 614 if (lo > hi) | |
| 615 return; | |
| 616 | |
| 617 // Pick off 80-10FFFF as a common special case | |
| 618 // that can bypass the slow rune_cache_. | |
| 619 if (lo == 0x80 && hi == 0x10ffff && !reversed_) { | |
| 620 Add_80_10ffff(); | |
| 621 return; | |
| 622 } | |
| 623 | |
| 624 // Split range into same-length sized ranges. | |
| 625 for (int i = 1; i < UTFmax; i++) { | |
| 626 Rune max = MaxRune(i); | |
| 627 if (lo <= max && max < hi) { | |
| 628 AddRuneRangeUTF8(lo, max, foldcase); | |
| 629 AddRuneRangeUTF8(max+1, hi, foldcase); | |
| 630 return; | |
| 631 } | |
| 632 } | |
| 633 | |
| 634 // ASCII range is always a special case. | |
| 635 if (hi < Runeself) { | |
| 636 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), | |
| 637 foldcase, 0)); | |
| 638 return; | |
| 639 } | |
| 640 | |
| 641 // Split range into sections that agree on leading bytes. | |
| 642 for (int i = 1; i < UTFmax; i++) { | |
| 643 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence | |
| 644 if ((lo & ~m) != (hi & ~m)) { | |
| 645 if ((lo & m) != 0) { | |
| 646 AddRuneRangeUTF8(lo, lo|m, foldcase); | |
| 647 AddRuneRangeUTF8((lo|m)+1, hi, foldcase); | |
| 648 return; | |
| 649 } | |
| 650 if ((hi & m) != m) { | |
| 651 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); | |
| 652 AddRuneRangeUTF8(hi&~m, hi, foldcase); | |
| 653 return; | |
| 654 } | |
| 655 } | |
| 656 } | |
| 657 | |
| 658 // Finally. Generate byte matching equivalent for lo-hi. | |
| 659 uint8 ulo[UTFmax], uhi[UTFmax]; | |
| 660 int n = runetochar(reinterpret_cast<char*>(ulo), &lo); | |
| 661 int m = runetochar(reinterpret_cast<char*>(uhi), &hi); | |
| 662 (void)m; // USED(m) | |
| 663 DCHECK_EQ(n, m); | |
| 664 | |
| 665 int id = 0; | |
| 666 if (reversed_) { | |
| 667 for (int i = 0; i < n; i++) | |
| 668 id = RuneByteSuffix(ulo[i], uhi[i], false, id); | |
| 669 } else { | |
| 670 for (int i = n-1; i >= 0; i--) | |
| 671 id = RuneByteSuffix(ulo[i], uhi[i], false, id); | |
| 672 } | |
| 673 AddSuffix(id); | |
| 674 } | |
| 675 | |
| 676 // Should not be called. | |
| 677 Frag Compiler::Copy(Frag arg) { | |
| 678 // We're using WalkExponential; there should be no copying. | |
| 679 LOG(DFATAL) << "Compiler::Copy called!"; | |
| 680 failed_ = true; | |
| 681 return NoMatch(); | |
| 682 } | |
| 683 | |
| 684 // Visits a node quickly; called once WalkExponential has | |
| 685 // decided to cut this walk short. | |
| 686 Frag Compiler::ShortVisit(Regexp* re, Frag) { | |
| 687 failed_ = true; | |
| 688 return NoMatch(); | |
| 689 } | |
| 690 | |
| 691 // Called before traversing a node's children during the walk. | |
| 692 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { | |
| 693 // Cut off walk if we've already failed. | |
| 694 if (failed_) | |
| 695 *stop = true; | |
| 696 | |
| 697 return Frag(); // not used by caller | |
| 698 } | |
| 699 | |
| 700 Frag Compiler::Literal(Rune r, bool foldcase) { | |
| 701 switch (encoding_) { | |
| 702 default: | |
| 703 return Frag(); | |
| 704 | |
| 705 case kEncodingLatin1: | |
| 706 return ByteRange(r, r, foldcase); | |
| 707 | |
| 708 case kEncodingUTF8: { | |
| 709 if (r < Runeself) // Make common case fast. | |
| 710 return ByteRange(r, r, foldcase); | |
| 711 uint8 buf[UTFmax]; | |
| 712 int n = runetochar(reinterpret_cast<char*>(buf), &r); | |
| 713 Frag f = ByteRange((uint8)buf[0], buf[0], false); | |
| 714 for (int i = 1; i < n; i++) | |
| 715 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); | |
| 716 return f; | |
| 717 } | |
| 718 } | |
| 719 } | |
| 720 | |
| 721 // Called after traversing the node's children during the walk. | |
| 722 // Given their frags, build and return the frag for this re. | |
| 723 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, | |
| 724 int nchild_frags) { | |
| 725 // If a child failed, don't bother going forward, especially | |
| 726 // since the child_frags might contain Frags with NULLs in them. | |
| 727 if (failed_) | |
| 728 return NoMatch(); | |
| 729 | |
| 730 // Given the child fragments, return the fragment for this node. | |
| 731 switch (re->op()) { | |
| 732 case kRegexpRepeat: | |
| 733 // Should not see; code at bottom of function will print error | |
| 734 break; | |
| 735 | |
| 736 case kRegexpNoMatch: | |
| 737 return NoMatch(); | |
| 738 | |
| 739 case kRegexpEmptyMatch: | |
| 740 return Nop(); | |
| 741 | |
| 742 case kRegexpHaveMatch: { | |
| 743 Frag f = Match(re->match_id()); | |
| 744 // Remember unanchored match to end of string. | |
| 745 if (anchor_ != RE2::ANCHOR_BOTH) | |
| 746 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); | |
| 747 return f; | |
| 748 } | |
| 749 | |
| 750 case kRegexpConcat: { | |
| 751 Frag f = child_frags[0]; | |
| 752 for (int i = 1; i < nchild_frags; i++) | |
| 753 f = Cat(f, child_frags[i]); | |
| 754 return f; | |
| 755 } | |
| 756 | |
| 757 case kRegexpAlternate: { | |
| 758 Frag f = child_frags[0]; | |
| 759 for (int i = 1; i < nchild_frags; i++) | |
| 760 f = Alt(f, child_frags[i]); | |
| 761 return f; | |
| 762 } | |
| 763 | |
| 764 case kRegexpStar: | |
| 765 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); | |
| 766 | |
| 767 case kRegexpPlus: | |
| 768 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); | |
| 769 | |
| 770 case kRegexpQuest: | |
| 771 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); | |
| 772 | |
| 773 case kRegexpLiteral: | |
| 774 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); | |
| 775 | |
| 776 case kRegexpLiteralString: { | |
| 777 // Concatenation of literals. | |
| 778 if (re->nrunes() == 0) | |
| 779 return Nop(); | |
| 780 Frag f; | |
| 781 for (int i = 0; i < re->nrunes(); i++) { | |
| 782 Frag f1 = Literal(re->runes()[i], | |
| 783 (re->parse_flags()&Regexp::FoldCase) != 0); | |
| 784 if (i == 0) | |
| 785 f = f1; | |
| 786 else | |
| 787 f = Cat(f, f1); | |
| 788 } | |
| 789 return f; | |
| 790 } | |
| 791 | |
| 792 case kRegexpAnyChar: | |
| 793 BeginRange(); | |
| 794 AddRuneRange(0, Runemax, false); | |
| 795 return EndRange(); | |
| 796 | |
| 797 case kRegexpAnyByte: | |
| 798 return ByteRange(0x00, 0xFF, false); | |
| 799 | |
| 800 case kRegexpCharClass: { | |
| 801 CharClass* cc = re->cc(); | |
| 802 if (cc->empty()) { | |
| 803 // This can't happen. | |
| 804 LOG(DFATAL) << "No ranges in char class"; | |
| 805 failed_ = true; | |
| 806 return NoMatch(); | |
| 807 } | |
| 808 | |
| 809 // ASCII case-folding optimization: if the char class | |
| 810 // behaves the same on A-Z as it does on a-z, | |
| 811 // discard any ranges wholly contained in A-Z | |
| 812 // and mark the other ranges as foldascii. | |
| 813 // This reduces the size of a program for | |
| 814 // (?i)abc from 3 insts per letter to 1 per letter. | |
| 815 bool foldascii = cc->FoldsASCII(); | |
| 816 | |
| 817 // Character class is just a big OR of the different | |
| 818 // character ranges in the class. | |
| 819 BeginRange(); | |
| 820 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { | |
| 821 // ASCII case-folding optimization (see above). | |
| 822 if (foldascii && 'A' <= i->lo && i->hi <= 'Z') | |
| 823 continue; | |
| 824 | |
| 825 // If this range contains all of A-Za-z or none of it, | |
| 826 // the fold flag is unnecessary; don't bother. | |
| 827 bool fold = foldascii; | |
| 828 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || | |
| 829 ('Z' < i->lo && i->hi < 'a')) | |
| 830 fold = false; | |
| 831 | |
| 832 AddRuneRange(i->lo, i->hi, fold); | |
| 833 } | |
| 834 return EndRange(); | |
| 835 } | |
| 836 | |
| 837 case kRegexpCapture: | |
| 838 // If this is a non-capturing parenthesis -- (?:foo) -- | |
| 839 // just use the inner expression. | |
| 840 if (re->cap() < 0) | |
| 841 return child_frags[0]; | |
| 842 return Capture(child_frags[0], re->cap()); | |
| 843 | |
| 844 case kRegexpBeginLine: | |
| 845 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); | |
| 846 | |
| 847 case kRegexpEndLine: | |
| 848 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); | |
| 849 | |
| 850 case kRegexpBeginText: | |
| 851 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); | |
| 852 | |
| 853 case kRegexpEndText: | |
| 854 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); | |
| 855 | |
| 856 case kRegexpWordBoundary: | |
| 857 return EmptyWidth(kEmptyWordBoundary); | |
| 858 | |
| 859 case kRegexpNoWordBoundary: | |
| 860 return EmptyWidth(kEmptyNonWordBoundary); | |
| 861 } | |
| 862 LOG(DFATAL) << "Missing case in Compiler: " << re->op(); | |
| 863 failed_ = true; | |
| 864 return NoMatch(); | |
| 865 } | |
| 866 | |
| 867 // Is this regexp required to start at the beginning of the text? | |
| 868 // Only approximate; can return false for complicated regexps like (\Aa|\Ab), | |
| 869 // but handles (\A(a|b)). Could use the Walker to write a more exact one. | |
| 870 static bool IsAnchorStart(Regexp** pre, int depth) { | |
| 871 Regexp* re = *pre; | |
| 872 Regexp* sub; | |
| 873 // The depth limit makes sure that we don't overflow | |
| 874 // the stack on a deeply nested regexp. As the comment | |
| 875 // above says, IsAnchorStart is conservative, so returning | |
| 876 // a false negative is okay. The exact limit is somewhat arbitrary. | |
| 877 if (re == NULL || depth >= 4) | |
| 878 return false; | |
| 879 switch (re->op()) { | |
| 880 default: | |
| 881 break; | |
| 882 case kRegexpConcat: | |
| 883 if (re->nsub() > 0) { | |
| 884 sub = re->sub()[0]->Incref(); | |
| 885 if (IsAnchorStart(&sub, depth+1)) { | |
| 886 Regexp** subcopy = new Regexp*[re->nsub()]; | |
| 887 subcopy[0] = sub; // already have reference | |
| 888 for (int i = 1; i < re->nsub(); i++) | |
| 889 subcopy[i] = re->sub()[i]->Incref(); | |
| 890 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); | |
| 891 delete[] subcopy; | |
| 892 re->Decref(); | |
| 893 return true; | |
| 894 } | |
| 895 sub->Decref(); | |
| 896 } | |
| 897 break; | |
| 898 case kRegexpCapture: | |
| 899 sub = re->sub()[0]->Incref(); | |
| 900 if (IsAnchorStart(&sub, depth+1)) { | |
| 901 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); | |
| 902 re->Decref(); | |
| 903 return true; | |
| 904 } | |
| 905 sub->Decref(); | |
| 906 break; | |
| 907 case kRegexpBeginText: | |
| 908 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); | |
| 909 re->Decref(); | |
| 910 return true; | |
| 911 } | |
| 912 return false; | |
| 913 } | |
| 914 | |
| 915 // Is this regexp required to start at the end of the text? | |
| 916 // Only approximate; can return false for complicated regexps like (a\z|b\z), | |
| 917 // but handles ((a|b)\z). Could use the Walker to write a more exact one. | |
| 918 static bool IsAnchorEnd(Regexp** pre, int depth) { | |
| 919 Regexp* re = *pre; | |
| 920 Regexp* sub; | |
| 921 // The depth limit makes sure that we don't overflow | |
| 922 // the stack on a deeply nested regexp. As the comment | |
| 923 // above says, IsAnchorEnd is conservative, so returning | |
| 924 // a false negative is okay. The exact limit is somewhat arbitrary. | |
| 925 if (re == NULL || depth >= 4) | |
| 926 return false; | |
| 927 switch (re->op()) { | |
| 928 default: | |
| 929 break; | |
| 930 case kRegexpConcat: | |
| 931 if (re->nsub() > 0) { | |
| 932 sub = re->sub()[re->nsub() - 1]->Incref(); | |
| 933 if (IsAnchorEnd(&sub, depth+1)) { | |
| 934 Regexp** subcopy = new Regexp*[re->nsub()]; | |
| 935 subcopy[re->nsub() - 1] = sub; // already have reference | |
| 936 for (int i = 0; i < re->nsub() - 1; i++) | |
| 937 subcopy[i] = re->sub()[i]->Incref(); | |
| 938 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); | |
| 939 delete[] subcopy; | |
| 940 re->Decref(); | |
| 941 return true; | |
| 942 } | |
| 943 sub->Decref(); | |
| 944 } | |
| 945 break; | |
| 946 case kRegexpCapture: | |
| 947 sub = re->sub()[0]->Incref(); | |
| 948 if (IsAnchorEnd(&sub, depth+1)) { | |
| 949 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); | |
| 950 re->Decref(); | |
| 951 return true; | |
| 952 } | |
| 953 sub->Decref(); | |
| 954 break; | |
| 955 case kRegexpEndText: | |
| 956 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); | |
| 957 re->Decref(); | |
| 958 return true; | |
| 959 } | |
| 960 return false; | |
| 961 } | |
| 962 | |
| 963 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, | |
| 964 RE2::Anchor anchor) { | |
| 965 prog_->set_flags(flags); | |
| 966 | |
| 967 if (flags & Regexp::Latin1) | |
| 968 encoding_ = kEncodingLatin1; | |
| 969 max_mem_ = max_mem; | |
| 970 if (max_mem <= 0) { | |
| 971 max_inst_ = 100000; // more than enough | |
| 972 } else if (max_mem <= static_cast<int64>(sizeof(Prog))) { | |
| 973 // No room for anything. | |
| 974 max_inst_ = 0; | |
| 975 } else { | |
| 976 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); | |
| 977 // Limit instruction count so that inst->id() fits nicely in an int. | |
| 978 // SparseArray also assumes that the indices (inst->id()) are ints. | |
| 979 // The call to WalkExponential uses 2*max_inst_ below, | |
| 980 // and other places in the code use 2 or 3 * prog->size(). | |
| 981 // Limiting to 2^24 should avoid overflow in those places. | |
| 982 // (The point of allowing more than 32 bits of memory is to | |
| 983 // have plenty of room for the DFA states, not to use it up | |
| 984 // on the program.) | |
| 985 if (m >= 1<<24) | |
| 986 m = 1<<24; | |
| 987 | |
| 988 // Inst imposes its own limit (currently bigger than 2^24 but be safe). | |
| 989 if (m > Prog::Inst::kMaxInst) | |
| 990 m = Prog::Inst::kMaxInst; | |
| 991 | |
| 992 max_inst_ = static_cast<int>(m); | |
| 993 } | |
| 994 | |
| 995 anchor_ = anchor; | |
| 996 } | |
| 997 | |
| 998 // Compiles re, returning program. | |
| 999 // Caller is responsible for deleting prog_. | |
| 1000 // If reversed is true, compiles a program that expects | |
| 1001 // to run over the input string backward (reverses all concatenations). | |
| 1002 // The reversed flag is also recorded in the returned program. | |
| 1003 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { | |
| 1004 Compiler c; | |
| 1005 | |
| 1006 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); | |
| 1007 c.reversed_ = reversed; | |
| 1008 | |
| 1009 // Simplify to remove things like counted repetitions | |
| 1010 // and character classes like \d. | |
| 1011 Regexp* sre = re->Simplify(); | |
| 1012 if (sre == NULL) | |
| 1013 return NULL; | |
| 1014 | |
| 1015 // Record whether prog is anchored, removing the anchors. | |
| 1016 // (They get in the way of other optimizations.) | |
| 1017 bool is_anchor_start = IsAnchorStart(&sre, 0); | |
| 1018 bool is_anchor_end = IsAnchorEnd(&sre, 0); | |
| 1019 | |
| 1020 // Generate fragment for entire regexp. | |
| 1021 Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_); | |
| 1022 sre->Decref(); | |
| 1023 if (c.failed_) | |
| 1024 return NULL; | |
| 1025 | |
| 1026 // Success! Finish by putting Match node at end, and record start. | |
| 1027 // Turn off c.reversed_ (if it is set) to force the remaining concatenations | |
| 1028 // to behave normally. | |
| 1029 c.reversed_ = false; | |
| 1030 Frag all = c.Cat(f, c.Match(0)); | |
| 1031 c.prog_->set_start(all.begin); | |
| 1032 | |
| 1033 if (reversed) { | |
| 1034 c.prog_->set_anchor_start(is_anchor_end); | |
| 1035 c.prog_->set_anchor_end(is_anchor_start); | |
| 1036 } else { | |
| 1037 c.prog_->set_anchor_start(is_anchor_start); | |
| 1038 c.prog_->set_anchor_end(is_anchor_end); | |
| 1039 } | |
| 1040 | |
| 1041 // Also create unanchored version, which starts with a .*? loop. | |
| 1042 if (c.prog_->anchor_start()) { | |
| 1043 c.prog_->set_start_unanchored(c.prog_->start()); | |
| 1044 } else { | |
| 1045 Frag unanchored = c.Cat(c.DotStar(), all); | |
| 1046 c.prog_->set_start_unanchored(unanchored.begin); | |
| 1047 } | |
| 1048 | |
| 1049 c.prog_->set_reversed(reversed); | |
| 1050 | |
| 1051 // Hand ownership of prog_ to caller. | |
| 1052 return c.Finish(); | |
| 1053 } | |
| 1054 | |
| 1055 Prog* Compiler::Finish() { | |
| 1056 if (failed_) | |
| 1057 return NULL; | |
| 1058 | |
| 1059 if (prog_->start() == 0 && prog_->start_unanchored() == 0) { | |
| 1060 // No possible matches; keep Fail instruction only. | |
| 1061 inst_len_ = 1; | |
| 1062 } | |
| 1063 | |
| 1064 // Trim instruction to minimum array and transfer to Prog. | |
| 1065 Trim(); | |
| 1066 prog_->inst_ = inst_; | |
| 1067 prog_->size_ = inst_len_; | |
| 1068 inst_ = NULL; | |
| 1069 | |
| 1070 // Compute byte map. | |
| 1071 prog_->ComputeByteMap(); | |
| 1072 | |
| 1073 prog_->Optimize(); | |
| 1074 | |
| 1075 // Record remaining memory for DFA. | |
| 1076 if (max_mem_ <= 0) { | |
| 1077 prog_->set_dfa_mem(1<<20); | |
| 1078 } else { | |
| 1079 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); | |
| 1080 if (m < 0) | |
| 1081 m = 0; | |
| 1082 prog_->set_dfa_mem(m); | |
| 1083 } | |
| 1084 | |
| 1085 Prog* p = prog_; | |
| 1086 prog_ = NULL; | |
| 1087 return p; | |
| 1088 } | |
| 1089 | |
| 1090 // Converts Regexp to Prog. | |
| 1091 Prog* Regexp::CompileToProg(int64 max_mem) { | |
| 1092 return Compiler::Compile(this, false, max_mem); | |
| 1093 } | |
| 1094 | |
| 1095 Prog* Regexp::CompileToReverseProg(int64 max_mem) { | |
| 1096 return Compiler::Compile(this, true, max_mem); | |
| 1097 } | |
| 1098 | |
| 1099 Frag Compiler::DotStar() { | |
| 1100 return Star(ByteRange(0x00, 0xff, false), true); | |
| 1101 } | |
| 1102 | |
| 1103 // Compiles RE set to Prog. | |
| 1104 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
| 1105 Regexp* re) { | |
| 1106 Compiler c; | |
| 1107 | |
| 1108 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); | |
| 1109 c.Setup(pf, options.max_mem(), anchor); | |
| 1110 | |
| 1111 // Compile alternation of fragments. | |
| 1112 Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_); | |
| 1113 re->Decref(); | |
| 1114 if (c.failed_) | |
| 1115 return NULL; | |
| 1116 | |
| 1117 if (anchor == RE2::UNANCHORED) { | |
| 1118 // The trailing .* was added while handling kRegexpHaveMatch. | |
| 1119 // We just have to add the leading one. | |
| 1120 all = c.Cat(c.DotStar(), all); | |
| 1121 } | |
| 1122 | |
| 1123 c.prog_->set_start(all.begin); | |
| 1124 c.prog_->set_start_unanchored(all.begin); | |
| 1125 c.prog_->set_anchor_start(true); | |
| 1126 c.prog_->set_anchor_end(true); | |
| 1127 | |
| 1128 Prog* prog = c.Finish(); | |
| 1129 if (prog == NULL) | |
| 1130 return NULL; | |
| 1131 | |
| 1132 // Make sure DFA has enough memory to operate, | |
| 1133 // since we're not going to fall back to the NFA. | |
| 1134 bool failed; | |
| 1135 StringPiece sp = "hello, world"; | |
| 1136 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, | |
| 1137 NULL, &failed, NULL); | |
| 1138 if (failed) { | |
| 1139 delete prog; | |
| 1140 return NULL; | |
| 1141 } | |
| 1142 | |
| 1143 return prog; | |
| 1144 } | |
| 1145 | |
| 1146 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
| 1147 Regexp* re) { | |
| 1148 return Compiler::CompileSet(options, anchor, re); | |
| 1149 } | |
| 1150 | |
| 1151 } // namespace re2 | |
| OLD | NEW |