OLD | NEW |
| (Empty) |
1 // Copyright 2007 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Compile regular expression to Prog. | |
6 // | |
7 // Prog and Inst are defined in prog.h. | |
8 // This file's external interface is just Regexp::CompileToProg. | |
9 // The Compiler class defined in this file is private. | |
10 | |
11 #include "re2/prog.h" | |
12 #include "re2/re2.h" | |
13 #include "re2/regexp.h" | |
14 #include "re2/walker-inl.h" | |
15 | |
16 namespace re2 { | |
17 | |
18 // List of pointers to Inst* that need to be filled in (patched). | |
19 // Because the Inst* haven't been filled in yet, | |
20 // we can use the Inst* word to hold the list's "next" pointer. | |
21 // It's kind of sleazy, but it works well in practice. | |
22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. | |
23 // | |
24 // Because the out and out1 fields in Inst are no longer pointers, | |
25 // we can't use pointers directly here either. Instead, p refers | |
26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). | |
27 // p == 0 represents the NULL list. This is okay because instruction #0 | |
28 // is always the fail instruction, which never appears on a list. | |
29 | |
30 struct PatchList { | |
31 uint32 p; | |
32 | |
33 // Returns patch list containing just p. | |
34 static PatchList Mk(uint32 p); | |
35 | |
36 // Patches all the entries on l to have value v. | |
37 // Caller must not ever use patch list again. | |
38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); | |
39 | |
40 // Deref returns the next pointer pointed at by p. | |
41 static PatchList Deref(Prog::Inst *inst0, PatchList l); | |
42 | |
43 // Appends two patch lists and returns result. | |
44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); | |
45 }; | |
46 | |
47 static PatchList nullPatchList = { 0 }; | |
48 | |
49 // Returns patch list containing just p. | |
50 PatchList PatchList::Mk(uint32 p) { | |
51 PatchList l; | |
52 l.p = p; | |
53 return l; | |
54 } | |
55 | |
56 // Returns the next pointer pointed at by l. | |
57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { | |
58 Prog::Inst* ip = &inst0[l.p>>1]; | |
59 if (l.p&1) | |
60 l.p = ip->out1(); | |
61 else | |
62 l.p = ip->out(); | |
63 return l; | |
64 } | |
65 | |
66 // Patches all the entries on l to have value v. | |
67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { | |
68 while (l.p != 0) { | |
69 Prog::Inst* ip = &inst0[l.p>>1]; | |
70 if (l.p&1) { | |
71 l.p = ip->out1(); | |
72 ip->out1_ = val; | |
73 } else { | |
74 l.p = ip->out(); | |
75 ip->set_out(val); | |
76 } | |
77 } | |
78 } | |
79 | |
80 // Appends two patch lists and returns result. | |
81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { | |
82 if (l1.p == 0) | |
83 return l2; | |
84 if (l2.p == 0) | |
85 return l1; | |
86 | |
87 PatchList l = l1; | |
88 for (;;) { | |
89 PatchList next = PatchList::Deref(inst0, l); | |
90 if (next.p == 0) | |
91 break; | |
92 l = next; | |
93 } | |
94 | |
95 Prog::Inst* ip = &inst0[l.p>>1]; | |
96 if (l.p&1) | |
97 ip->out1_ = l2.p; | |
98 else | |
99 ip->set_out(l2.p); | |
100 | |
101 return l1; | |
102 } | |
103 | |
104 // Compiled program fragment. | |
105 struct Frag { | |
106 uint32 begin; | |
107 PatchList end; | |
108 | |
109 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector | |
110 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} | |
111 }; | |
112 | |
113 // Input encodings. | |
114 enum Encoding { | |
115 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) | |
116 kEncodingLatin1, // Latin1 (0-FF) | |
117 }; | |
118 | |
119 class Compiler : public Regexp::Walker<Frag> { | |
120 public: | |
121 explicit Compiler(); | |
122 ~Compiler(); | |
123 | |
124 // Compiles Regexp to a new Prog. | |
125 // Caller is responsible for deleting Prog when finished with it. | |
126 // If reversed is true, compiles for walking over the input | |
127 // string backward (reverses all concatenations). | |
128 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); | |
129 | |
130 // Compiles alternation of all the re to a new Prog. | |
131 // Each re has a match with an id equal to its index in the vector. | |
132 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
133 Regexp* re); | |
134 | |
135 // Interface for Regexp::Walker, which helps traverse the Regexp. | |
136 // The walk is purely post-recursive: given the machines for the | |
137 // children, PostVisit combines them to create the machine for | |
138 // the current node. The child_args are Frags. | |
139 // The Compiler traverses the Regexp parse tree, visiting | |
140 // each node in depth-first order. It invokes PreVisit before | |
141 // visiting the node's children and PostVisit after visiting | |
142 // the children. | |
143 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); | |
144 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, | |
145 int nchild_args); | |
146 Frag ShortVisit(Regexp* re, Frag parent_arg); | |
147 Frag Copy(Frag arg); | |
148 | |
149 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? | |
150 Frag Plus(Frag a, bool nongreedy); | |
151 Frag Star(Frag a, bool nongreedy); | |
152 Frag Quest(Frag a, bool nongreedy); | |
153 | |
154 // Given fragment a, returns (a) capturing as \n. | |
155 Frag Capture(Frag a, int n); | |
156 | |
157 // Given fragments a and b, returns ab; a|b | |
158 Frag Cat(Frag a, Frag b); | |
159 Frag Alt(Frag a, Frag b); | |
160 | |
161 // Returns a fragment that can't match anything. | |
162 Frag NoMatch(); | |
163 | |
164 // Returns a fragment that matches the empty string. | |
165 Frag Match(int32 id); | |
166 | |
167 // Returns a no-op fragment. | |
168 Frag Nop(); | |
169 | |
170 // Returns a fragment matching the byte range lo-hi. | |
171 Frag ByteRange(int lo, int hi, bool foldcase); | |
172 | |
173 // Returns a fragment matching an empty-width special op. | |
174 Frag EmptyWidth(EmptyOp op); | |
175 | |
176 // Adds n instructions to the program. | |
177 // Returns the index of the first one. | |
178 // Returns -1 if no more instructions are available. | |
179 int AllocInst(int n); | |
180 | |
181 // Deletes unused instructions. | |
182 void Trim(); | |
183 | |
184 // Rune range compiler. | |
185 | |
186 // Begins a new alternation. | |
187 void BeginRange(); | |
188 | |
189 // Adds a fragment matching the rune range lo-hi. | |
190 void AddRuneRange(Rune lo, Rune hi, bool foldcase); | |
191 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); | |
192 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); | |
193 void Add_80_10ffff(); | |
194 | |
195 // New suffix that matches the byte range lo-hi, then goes to next. | |
196 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); | |
197 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); | |
198 | |
199 // Adds a suffix to alternation. | |
200 void AddSuffix(int id); | |
201 | |
202 // Returns the alternation of all the added suffixes. | |
203 Frag EndRange(); | |
204 | |
205 // Single rune. | |
206 Frag Literal(Rune r, bool foldcase); | |
207 | |
208 void Setup(Regexp::ParseFlags, int64, RE2::Anchor); | |
209 Prog* Finish(); | |
210 | |
211 // Returns .* where dot = any byte | |
212 Frag DotStar(); | |
213 | |
214 private: | |
215 Prog* prog_; // Program being built. | |
216 bool failed_; // Did we give up compiling? | |
217 Encoding encoding_; // Input encoding | |
218 bool reversed_; // Should program run backward over text? | |
219 | |
220 int max_inst_; // Maximum number of instructions. | |
221 | |
222 Prog::Inst* inst_; // Pointer to first instruction. | |
223 int inst_len_; // Number of instructions used. | |
224 int inst_cap_; // Number of instructions allocated. | |
225 | |
226 int64 max_mem_; // Total memory budget. | |
227 | |
228 map<uint64, int> rune_cache_; | |
229 Frag rune_range_; | |
230 | |
231 RE2::Anchor anchor_; // anchor mode for RE2::Set | |
232 | |
233 DISALLOW_COPY_AND_ASSIGN(Compiler); | |
234 }; | |
235 | |
236 Compiler::Compiler() { | |
237 prog_ = new Prog(); | |
238 failed_ = false; | |
239 encoding_ = kEncodingUTF8; | |
240 reversed_ = false; | |
241 inst_ = NULL; | |
242 inst_len_ = 0; | |
243 inst_cap_ = 0; | |
244 max_inst_ = 1; // make AllocInst for fail instruction okay | |
245 max_mem_ = 0; | |
246 int fail = AllocInst(1); | |
247 inst_[fail].InitFail(); | |
248 max_inst_ = 0; // Caller must change | |
249 } | |
250 | |
251 Compiler::~Compiler() { | |
252 delete prog_; | |
253 delete[] inst_; | |
254 } | |
255 | |
256 int Compiler::AllocInst(int n) { | |
257 if (failed_ || inst_len_ + n > max_inst_) { | |
258 failed_ = true; | |
259 return -1; | |
260 } | |
261 | |
262 if (inst_len_ + n > inst_cap_) { | |
263 if (inst_cap_ == 0) | |
264 inst_cap_ = 8; | |
265 while (inst_len_ + n > inst_cap_) | |
266 inst_cap_ *= 2; | |
267 Prog::Inst* ip = new Prog::Inst[inst_cap_]; | |
268 memmove(ip, inst_, inst_len_ * sizeof ip[0]); | |
269 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); | |
270 delete[] inst_; | |
271 inst_ = ip; | |
272 } | |
273 int id = inst_len_; | |
274 inst_len_ += n; | |
275 return id; | |
276 } | |
277 | |
278 void Compiler::Trim() { | |
279 if (inst_len_ < inst_cap_) { | |
280 Prog::Inst* ip = new Prog::Inst[inst_len_]; | |
281 memmove(ip, inst_, inst_len_ * sizeof ip[0]); | |
282 delete[] inst_; | |
283 inst_ = ip; | |
284 inst_cap_ = inst_len_; | |
285 } | |
286 } | |
287 | |
288 // These routines are somewhat hard to visualize in text -- | |
289 // see http://swtch.com/~rsc/regexp/regexp1.html for | |
290 // pictures explaining what is going on here. | |
291 | |
292 // Returns an unmatchable fragment. | |
293 Frag Compiler::NoMatch() { | |
294 return Frag(0, nullPatchList); | |
295 } | |
296 | |
297 // Is a an unmatchable fragment? | |
298 static bool IsNoMatch(Frag a) { | |
299 return a.begin == 0; | |
300 } | |
301 | |
302 // Given fragments a and b, returns fragment for ab. | |
303 Frag Compiler::Cat(Frag a, Frag b) { | |
304 if (IsNoMatch(a) || IsNoMatch(b)) | |
305 return NoMatch(); | |
306 | |
307 // Elide no-op. | |
308 Prog::Inst* begin = &inst_[a.begin]; | |
309 if (begin->opcode() == kInstNop && | |
310 a.end.p == (a.begin << 1) && | |
311 begin->out() == 0) { | |
312 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere | |
313 return b; | |
314 } | |
315 | |
316 // To run backward over string, reverse all concatenations. | |
317 if (reversed_) { | |
318 PatchList::Patch(inst_, b.end, a.begin); | |
319 return Frag(b.begin, a.end); | |
320 } | |
321 | |
322 PatchList::Patch(inst_, a.end, b.begin); | |
323 return Frag(a.begin, b.end); | |
324 } | |
325 | |
326 // Given fragments for a and b, returns fragment for a|b. | |
327 Frag Compiler::Alt(Frag a, Frag b) { | |
328 // Special case for convenience in loops. | |
329 if (IsNoMatch(a)) | |
330 return b; | |
331 if (IsNoMatch(b)) | |
332 return a; | |
333 | |
334 int id = AllocInst(1); | |
335 if (id < 0) | |
336 return NoMatch(); | |
337 | |
338 inst_[id].InitAlt(a.begin, b.begin); | |
339 return Frag(id, PatchList::Append(inst_, a.end, b.end)); | |
340 } | |
341 | |
342 // When capturing submatches in like-Perl mode, a kOpAlt Inst | |
343 // treats out_ as the first choice, out1_ as the second. | |
344 // | |
345 // For *, +, and ?, if out_ causes another repetition, | |
346 // then the operator is greedy. If out1_ is the repetition | |
347 // (and out_ moves forward), then the operator is non-greedy. | |
348 | |
349 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) | |
350 Frag Compiler::Star(Frag a, bool nongreedy) { | |
351 int id = AllocInst(1); | |
352 if (id < 0) | |
353 return NoMatch(); | |
354 inst_[id].InitAlt(0, 0); | |
355 PatchList::Patch(inst_, a.end, id); | |
356 if (nongreedy) { | |
357 inst_[id].out1_ = a.begin; | |
358 return Frag(id, PatchList::Mk(id << 1)); | |
359 } else { | |
360 inst_[id].set_out(a.begin); | |
361 return Frag(id, PatchList::Mk((id << 1) | 1)); | |
362 } | |
363 } | |
364 | |
365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) | |
366 Frag Compiler::Plus(Frag a, bool nongreedy) { | |
367 // a+ is just a* with a different entry point. | |
368 Frag f = Star(a, nongreedy); | |
369 return Frag(a.begin, f.end); | |
370 } | |
371 | |
372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) | |
373 Frag Compiler::Quest(Frag a, bool nongreedy) { | |
374 if (IsNoMatch(a)) | |
375 return Nop(); | |
376 int id = AllocInst(1); | |
377 if (id < 0) | |
378 return NoMatch(); | |
379 PatchList pl; | |
380 if (nongreedy) { | |
381 inst_[id].InitAlt(0, a.begin); | |
382 pl = PatchList::Mk(id << 1); | |
383 } else { | |
384 inst_[id].InitAlt(a.begin, 0); | |
385 pl = PatchList::Mk((id << 1) | 1); | |
386 } | |
387 return Frag(id, PatchList::Append(inst_, pl, a.end)); | |
388 } | |
389 | |
390 // Returns a fragment for the byte range lo-hi. | |
391 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { | |
392 int id = AllocInst(1); | |
393 if (id < 0) | |
394 return NoMatch(); | |
395 inst_[id].InitByteRange(lo, hi, foldcase, 0); | |
396 prog_->byte_inst_count_++; | |
397 prog_->MarkByteRange(lo, hi); | |
398 if (foldcase && lo <= 'z' && hi >= 'a') { | |
399 if (lo < 'a') | |
400 lo = 'a'; | |
401 if (hi > 'z') | |
402 hi = 'z'; | |
403 if (lo <= hi) | |
404 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); | |
405 } | |
406 return Frag(id, PatchList::Mk(id << 1)); | |
407 } | |
408 | |
409 // Returns a no-op fragment. Sometimes unavoidable. | |
410 Frag Compiler::Nop() { | |
411 int id = AllocInst(1); | |
412 if (id < 0) | |
413 return NoMatch(); | |
414 inst_[id].InitNop(0); | |
415 return Frag(id, PatchList::Mk(id << 1)); | |
416 } | |
417 | |
418 // Returns a fragment that signals a match. | |
419 Frag Compiler::Match(int32 match_id) { | |
420 int id = AllocInst(1); | |
421 if (id < 0) | |
422 return NoMatch(); | |
423 inst_[id].InitMatch(match_id); | |
424 return Frag(id, nullPatchList); | |
425 } | |
426 | |
427 // Returns a fragment matching a particular empty-width op (like ^ or $) | |
428 Frag Compiler::EmptyWidth(EmptyOp empty) { | |
429 int id = AllocInst(1); | |
430 if (id < 0) | |
431 return NoMatch(); | |
432 inst_[id].InitEmptyWidth(empty, 0); | |
433 if (empty & (kEmptyBeginLine|kEmptyEndLine)) | |
434 prog_->MarkByteRange('\n', '\n'); | |
435 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { | |
436 int j; | |
437 for (int i = 0; i < 256; i = j) { | |
438 for (j = i + 1; j < 256 && | |
439 Prog::IsWordChar(static_cast<uint8>(i)) == | |
440 Prog::IsWordChar(static_cast<uint8>(j)); | |
441 j++) | |
442 ; | |
443 prog_->MarkByteRange(i, j-1); | |
444 } | |
445 } | |
446 return Frag(id, PatchList::Mk(id << 1)); | |
447 } | |
448 | |
449 // Given a fragment a, returns a fragment with capturing parens around a. | |
450 Frag Compiler::Capture(Frag a, int n) { | |
451 if (IsNoMatch(a)) | |
452 return NoMatch(); | |
453 int id = AllocInst(2); | |
454 if (id < 0) | |
455 return NoMatch(); | |
456 inst_[id].InitCapture(2*n, a.begin); | |
457 inst_[id+1].InitCapture(2*n+1, 0); | |
458 PatchList::Patch(inst_, a.end, id+1); | |
459 | |
460 return Frag(id, PatchList::Mk((id+1) << 1)); | |
461 } | |
462 | |
463 // A Rune is a name for a Unicode code point. | |
464 // Returns maximum rune encoded by UTF-8 sequence of length len. | |
465 static int MaxRune(int len) { | |
466 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) | |
467 if (len == 1) | |
468 b = 7; | |
469 else | |
470 b = 8-(len+1) + 6*(len-1); | |
471 return (1<<b) - 1; // maximum Rune for b bits. | |
472 } | |
473 | |
474 // The rune range compiler caches common suffix fragments, | |
475 // which are very common in UTF-8 (e.g., [80-bf]). | |
476 // The fragment suffixes are identified by their start | |
477 // instructions. NULL denotes the eventual end match. | |
478 // The Frag accumulates in rune_range_. Caching common | |
479 // suffixes reduces the UTF-8 "." from 32 to 24 instructions, | |
480 // and it reduces the corresponding one-pass NFA from 16 nodes to 8. | |
481 | |
482 void Compiler::BeginRange() { | |
483 rune_cache_.clear(); | |
484 rune_range_.begin = 0; | |
485 rune_range_.end = nullPatchList; | |
486 } | |
487 | |
488 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, | |
489 int next) { | |
490 Frag f = ByteRange(lo, hi, foldcase); | |
491 if (next != 0) { | |
492 PatchList::Patch(inst_, f.end, next); | |
493 } else { | |
494 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); | |
495 } | |
496 return f.begin; | |
497 } | |
498 | |
499 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { | |
500 // In Latin1 mode, there's no point in caching. | |
501 // In forward UTF-8 mode, only need to cache continuation bytes. | |
502 if (encoding_ == kEncodingLatin1 || | |
503 (encoding_ == kEncodingUTF8 && | |
504 !reversed_ && | |
505 !(0x80 <= lo && hi <= 0xbf))) { | |
506 return UncachedRuneByteSuffix(lo, hi, foldcase, next); | |
507 } | |
508 | |
509 uint64 key = (uint64)next << 17 | | |
510 (uint64)lo << 9 | | |
511 (uint64)hi << 1 | | |
512 (uint64)foldcase; | |
513 map<uint64, int>::iterator it = rune_cache_.find(key); | |
514 if (it != rune_cache_.end()) | |
515 return it->second; | |
516 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); | |
517 rune_cache_[key] = id; | |
518 return id; | |
519 } | |
520 | |
521 void Compiler::AddSuffix(int id) { | |
522 if (rune_range_.begin == 0) { | |
523 rune_range_.begin = id; | |
524 return; | |
525 } | |
526 | |
527 int alt = AllocInst(1); | |
528 if (alt < 0) { | |
529 rune_range_.begin = 0; | |
530 return; | |
531 } | |
532 inst_[alt].InitAlt(rune_range_.begin, id); | |
533 rune_range_.begin = alt; | |
534 } | |
535 | |
536 Frag Compiler::EndRange() { | |
537 return rune_range_; | |
538 } | |
539 | |
540 // Converts rune range lo-hi into a fragment that recognizes | |
541 // the bytes that would make up those runes in the current | |
542 // encoding (Latin 1 or UTF-8). | |
543 // This lets the machine work byte-by-byte even when | |
544 // using multibyte encodings. | |
545 | |
546 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { | |
547 switch (encoding_) { | |
548 default: | |
549 case kEncodingUTF8: | |
550 AddRuneRangeUTF8(lo, hi, foldcase); | |
551 break; | |
552 case kEncodingLatin1: | |
553 AddRuneRangeLatin1(lo, hi, foldcase); | |
554 break; | |
555 } | |
556 } | |
557 | |
558 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { | |
559 // Latin1 is easy: runes *are* bytes. | |
560 if (lo > hi || lo > 0xFF) | |
561 return; | |
562 if (hi > 0xFF) | |
563 hi = 0xFF; | |
564 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), | |
565 foldcase, 0)); | |
566 } | |
567 | |
568 // Table describing how to make a UTF-8 matching machine | |
569 // for the rune range 80-10FFFF (Runeself-Runemax). | |
570 // This range happens frequently enough (for example /./ and /[^a-z]/) | |
571 // and the rune_cache_ map is slow enough that this is worth | |
572 // special handling. Makes compilation of a small expression | |
573 // with a dot in it about 10% faster. | |
574 // The * in the comments below mark whole sequences. | |
575 static struct ByteRangeProg { | |
576 int next; | |
577 int lo; | |
578 int hi; | |
579 } prog_80_10ffff[] = { | |
580 // Two-byte | |
581 { -1, 0x80, 0xBF, }, // 0: 80-BF | |
582 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* | |
583 | |
584 // Three-byte | |
585 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF | |
586 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* | |
587 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF | |
588 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* | |
589 | |
590 // Four-byte | |
591 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF | |
592 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* | |
593 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF | |
594 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* | |
595 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF | |
596 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* | |
597 }; | |
598 | |
599 void Compiler::Add_80_10ffff() { | |
600 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize
d; silences gcc warning | |
601 for (int i = 0; i < arraysize(prog_80_10ffff); i++) { | |
602 const ByteRangeProg& p = prog_80_10ffff[i]; | |
603 int next = 0; | |
604 if (p.next >= 0) | |
605 next = inst[p.next]; | |
606 inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo), | |
607 static_cast<uint8>(p.hi), false, next); | |
608 if ((p.lo & 0xC0) != 0x80) | |
609 AddSuffix(inst[i]); | |
610 } | |
611 } | |
612 | |
613 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { | |
614 if (lo > hi) | |
615 return; | |
616 | |
617 // Pick off 80-10FFFF as a common special case | |
618 // that can bypass the slow rune_cache_. | |
619 if (lo == 0x80 && hi == 0x10ffff && !reversed_) { | |
620 Add_80_10ffff(); | |
621 return; | |
622 } | |
623 | |
624 // Split range into same-length sized ranges. | |
625 for (int i = 1; i < UTFmax; i++) { | |
626 Rune max = MaxRune(i); | |
627 if (lo <= max && max < hi) { | |
628 AddRuneRangeUTF8(lo, max, foldcase); | |
629 AddRuneRangeUTF8(max+1, hi, foldcase); | |
630 return; | |
631 } | |
632 } | |
633 | |
634 // ASCII range is always a special case. | |
635 if (hi < Runeself) { | |
636 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), | |
637 foldcase, 0)); | |
638 return; | |
639 } | |
640 | |
641 // Split range into sections that agree on leading bytes. | |
642 for (int i = 1; i < UTFmax; i++) { | |
643 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence | |
644 if ((lo & ~m) != (hi & ~m)) { | |
645 if ((lo & m) != 0) { | |
646 AddRuneRangeUTF8(lo, lo|m, foldcase); | |
647 AddRuneRangeUTF8((lo|m)+1, hi, foldcase); | |
648 return; | |
649 } | |
650 if ((hi & m) != m) { | |
651 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); | |
652 AddRuneRangeUTF8(hi&~m, hi, foldcase); | |
653 return; | |
654 } | |
655 } | |
656 } | |
657 | |
658 // Finally. Generate byte matching equivalent for lo-hi. | |
659 uint8 ulo[UTFmax], uhi[UTFmax]; | |
660 int n = runetochar(reinterpret_cast<char*>(ulo), &lo); | |
661 int m = runetochar(reinterpret_cast<char*>(uhi), &hi); | |
662 (void)m; // USED(m) | |
663 DCHECK_EQ(n, m); | |
664 | |
665 int id = 0; | |
666 if (reversed_) { | |
667 for (int i = 0; i < n; i++) | |
668 id = RuneByteSuffix(ulo[i], uhi[i], false, id); | |
669 } else { | |
670 for (int i = n-1; i >= 0; i--) | |
671 id = RuneByteSuffix(ulo[i], uhi[i], false, id); | |
672 } | |
673 AddSuffix(id); | |
674 } | |
675 | |
676 // Should not be called. | |
677 Frag Compiler::Copy(Frag arg) { | |
678 // We're using WalkExponential; there should be no copying. | |
679 LOG(DFATAL) << "Compiler::Copy called!"; | |
680 failed_ = true; | |
681 return NoMatch(); | |
682 } | |
683 | |
684 // Visits a node quickly; called once WalkExponential has | |
685 // decided to cut this walk short. | |
686 Frag Compiler::ShortVisit(Regexp* re, Frag) { | |
687 failed_ = true; | |
688 return NoMatch(); | |
689 } | |
690 | |
691 // Called before traversing a node's children during the walk. | |
692 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { | |
693 // Cut off walk if we've already failed. | |
694 if (failed_) | |
695 *stop = true; | |
696 | |
697 return Frag(); // not used by caller | |
698 } | |
699 | |
700 Frag Compiler::Literal(Rune r, bool foldcase) { | |
701 switch (encoding_) { | |
702 default: | |
703 return Frag(); | |
704 | |
705 case kEncodingLatin1: | |
706 return ByteRange(r, r, foldcase); | |
707 | |
708 case kEncodingUTF8: { | |
709 if (r < Runeself) // Make common case fast. | |
710 return ByteRange(r, r, foldcase); | |
711 uint8 buf[UTFmax]; | |
712 int n = runetochar(reinterpret_cast<char*>(buf), &r); | |
713 Frag f = ByteRange((uint8)buf[0], buf[0], false); | |
714 for (int i = 1; i < n; i++) | |
715 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); | |
716 return f; | |
717 } | |
718 } | |
719 } | |
720 | |
721 // Called after traversing the node's children during the walk. | |
722 // Given their frags, build and return the frag for this re. | |
723 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, | |
724 int nchild_frags) { | |
725 // If a child failed, don't bother going forward, especially | |
726 // since the child_frags might contain Frags with NULLs in them. | |
727 if (failed_) | |
728 return NoMatch(); | |
729 | |
730 // Given the child fragments, return the fragment for this node. | |
731 switch (re->op()) { | |
732 case kRegexpRepeat: | |
733 // Should not see; code at bottom of function will print error | |
734 break; | |
735 | |
736 case kRegexpNoMatch: | |
737 return NoMatch(); | |
738 | |
739 case kRegexpEmptyMatch: | |
740 return Nop(); | |
741 | |
742 case kRegexpHaveMatch: { | |
743 Frag f = Match(re->match_id()); | |
744 // Remember unanchored match to end of string. | |
745 if (anchor_ != RE2::ANCHOR_BOTH) | |
746 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); | |
747 return f; | |
748 } | |
749 | |
750 case kRegexpConcat: { | |
751 Frag f = child_frags[0]; | |
752 for (int i = 1; i < nchild_frags; i++) | |
753 f = Cat(f, child_frags[i]); | |
754 return f; | |
755 } | |
756 | |
757 case kRegexpAlternate: { | |
758 Frag f = child_frags[0]; | |
759 for (int i = 1; i < nchild_frags; i++) | |
760 f = Alt(f, child_frags[i]); | |
761 return f; | |
762 } | |
763 | |
764 case kRegexpStar: | |
765 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); | |
766 | |
767 case kRegexpPlus: | |
768 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); | |
769 | |
770 case kRegexpQuest: | |
771 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); | |
772 | |
773 case kRegexpLiteral: | |
774 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); | |
775 | |
776 case kRegexpLiteralString: { | |
777 // Concatenation of literals. | |
778 if (re->nrunes() == 0) | |
779 return Nop(); | |
780 Frag f; | |
781 for (int i = 0; i < re->nrunes(); i++) { | |
782 Frag f1 = Literal(re->runes()[i], | |
783 (re->parse_flags()&Regexp::FoldCase) != 0); | |
784 if (i == 0) | |
785 f = f1; | |
786 else | |
787 f = Cat(f, f1); | |
788 } | |
789 return f; | |
790 } | |
791 | |
792 case kRegexpAnyChar: | |
793 BeginRange(); | |
794 AddRuneRange(0, Runemax, false); | |
795 return EndRange(); | |
796 | |
797 case kRegexpAnyByte: | |
798 return ByteRange(0x00, 0xFF, false); | |
799 | |
800 case kRegexpCharClass: { | |
801 CharClass* cc = re->cc(); | |
802 if (cc->empty()) { | |
803 // This can't happen. | |
804 LOG(DFATAL) << "No ranges in char class"; | |
805 failed_ = true; | |
806 return NoMatch(); | |
807 } | |
808 | |
809 // ASCII case-folding optimization: if the char class | |
810 // behaves the same on A-Z as it does on a-z, | |
811 // discard any ranges wholly contained in A-Z | |
812 // and mark the other ranges as foldascii. | |
813 // This reduces the size of a program for | |
814 // (?i)abc from 3 insts per letter to 1 per letter. | |
815 bool foldascii = cc->FoldsASCII(); | |
816 | |
817 // Character class is just a big OR of the different | |
818 // character ranges in the class. | |
819 BeginRange(); | |
820 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { | |
821 // ASCII case-folding optimization (see above). | |
822 if (foldascii && 'A' <= i->lo && i->hi <= 'Z') | |
823 continue; | |
824 | |
825 // If this range contains all of A-Za-z or none of it, | |
826 // the fold flag is unnecessary; don't bother. | |
827 bool fold = foldascii; | |
828 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || | |
829 ('Z' < i->lo && i->hi < 'a')) | |
830 fold = false; | |
831 | |
832 AddRuneRange(i->lo, i->hi, fold); | |
833 } | |
834 return EndRange(); | |
835 } | |
836 | |
837 case kRegexpCapture: | |
838 // If this is a non-capturing parenthesis -- (?:foo) -- | |
839 // just use the inner expression. | |
840 if (re->cap() < 0) | |
841 return child_frags[0]; | |
842 return Capture(child_frags[0], re->cap()); | |
843 | |
844 case kRegexpBeginLine: | |
845 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); | |
846 | |
847 case kRegexpEndLine: | |
848 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); | |
849 | |
850 case kRegexpBeginText: | |
851 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); | |
852 | |
853 case kRegexpEndText: | |
854 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); | |
855 | |
856 case kRegexpWordBoundary: | |
857 return EmptyWidth(kEmptyWordBoundary); | |
858 | |
859 case kRegexpNoWordBoundary: | |
860 return EmptyWidth(kEmptyNonWordBoundary); | |
861 } | |
862 LOG(DFATAL) << "Missing case in Compiler: " << re->op(); | |
863 failed_ = true; | |
864 return NoMatch(); | |
865 } | |
866 | |
867 // Is this regexp required to start at the beginning of the text? | |
868 // Only approximate; can return false for complicated regexps like (\Aa|\Ab), | |
869 // but handles (\A(a|b)). Could use the Walker to write a more exact one. | |
870 static bool IsAnchorStart(Regexp** pre, int depth) { | |
871 Regexp* re = *pre; | |
872 Regexp* sub; | |
873 // The depth limit makes sure that we don't overflow | |
874 // the stack on a deeply nested regexp. As the comment | |
875 // above says, IsAnchorStart is conservative, so returning | |
876 // a false negative is okay. The exact limit is somewhat arbitrary. | |
877 if (re == NULL || depth >= 4) | |
878 return false; | |
879 switch (re->op()) { | |
880 default: | |
881 break; | |
882 case kRegexpConcat: | |
883 if (re->nsub() > 0) { | |
884 sub = re->sub()[0]->Incref(); | |
885 if (IsAnchorStart(&sub, depth+1)) { | |
886 Regexp** subcopy = new Regexp*[re->nsub()]; | |
887 subcopy[0] = sub; // already have reference | |
888 for (int i = 1; i < re->nsub(); i++) | |
889 subcopy[i] = re->sub()[i]->Incref(); | |
890 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); | |
891 delete[] subcopy; | |
892 re->Decref(); | |
893 return true; | |
894 } | |
895 sub->Decref(); | |
896 } | |
897 break; | |
898 case kRegexpCapture: | |
899 sub = re->sub()[0]->Incref(); | |
900 if (IsAnchorStart(&sub, depth+1)) { | |
901 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); | |
902 re->Decref(); | |
903 return true; | |
904 } | |
905 sub->Decref(); | |
906 break; | |
907 case kRegexpBeginText: | |
908 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); | |
909 re->Decref(); | |
910 return true; | |
911 } | |
912 return false; | |
913 } | |
914 | |
915 // Is this regexp required to start at the end of the text? | |
916 // Only approximate; can return false for complicated regexps like (a\z|b\z), | |
917 // but handles ((a|b)\z). Could use the Walker to write a more exact one. | |
918 static bool IsAnchorEnd(Regexp** pre, int depth) { | |
919 Regexp* re = *pre; | |
920 Regexp* sub; | |
921 // The depth limit makes sure that we don't overflow | |
922 // the stack on a deeply nested regexp. As the comment | |
923 // above says, IsAnchorEnd is conservative, so returning | |
924 // a false negative is okay. The exact limit is somewhat arbitrary. | |
925 if (re == NULL || depth >= 4) | |
926 return false; | |
927 switch (re->op()) { | |
928 default: | |
929 break; | |
930 case kRegexpConcat: | |
931 if (re->nsub() > 0) { | |
932 sub = re->sub()[re->nsub() - 1]->Incref(); | |
933 if (IsAnchorEnd(&sub, depth+1)) { | |
934 Regexp** subcopy = new Regexp*[re->nsub()]; | |
935 subcopy[re->nsub() - 1] = sub; // already have reference | |
936 for (int i = 0; i < re->nsub() - 1; i++) | |
937 subcopy[i] = re->sub()[i]->Incref(); | |
938 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); | |
939 delete[] subcopy; | |
940 re->Decref(); | |
941 return true; | |
942 } | |
943 sub->Decref(); | |
944 } | |
945 break; | |
946 case kRegexpCapture: | |
947 sub = re->sub()[0]->Incref(); | |
948 if (IsAnchorEnd(&sub, depth+1)) { | |
949 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); | |
950 re->Decref(); | |
951 return true; | |
952 } | |
953 sub->Decref(); | |
954 break; | |
955 case kRegexpEndText: | |
956 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); | |
957 re->Decref(); | |
958 return true; | |
959 } | |
960 return false; | |
961 } | |
962 | |
963 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, | |
964 RE2::Anchor anchor) { | |
965 prog_->set_flags(flags); | |
966 | |
967 if (flags & Regexp::Latin1) | |
968 encoding_ = kEncodingLatin1; | |
969 max_mem_ = max_mem; | |
970 if (max_mem <= 0) { | |
971 max_inst_ = 100000; // more than enough | |
972 } else if (max_mem <= static_cast<int64>(sizeof(Prog))) { | |
973 // No room for anything. | |
974 max_inst_ = 0; | |
975 } else { | |
976 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); | |
977 // Limit instruction count so that inst->id() fits nicely in an int. | |
978 // SparseArray also assumes that the indices (inst->id()) are ints. | |
979 // The call to WalkExponential uses 2*max_inst_ below, | |
980 // and other places in the code use 2 or 3 * prog->size(). | |
981 // Limiting to 2^24 should avoid overflow in those places. | |
982 // (The point of allowing more than 32 bits of memory is to | |
983 // have plenty of room for the DFA states, not to use it up | |
984 // on the program.) | |
985 if (m >= 1<<24) | |
986 m = 1<<24; | |
987 | |
988 // Inst imposes its own limit (currently bigger than 2^24 but be safe). | |
989 if (m > Prog::Inst::kMaxInst) | |
990 m = Prog::Inst::kMaxInst; | |
991 | |
992 max_inst_ = static_cast<int>(m); | |
993 } | |
994 | |
995 anchor_ = anchor; | |
996 } | |
997 | |
998 // Compiles re, returning program. | |
999 // Caller is responsible for deleting prog_. | |
1000 // If reversed is true, compiles a program that expects | |
1001 // to run over the input string backward (reverses all concatenations). | |
1002 // The reversed flag is also recorded in the returned program. | |
1003 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { | |
1004 Compiler c; | |
1005 | |
1006 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); | |
1007 c.reversed_ = reversed; | |
1008 | |
1009 // Simplify to remove things like counted repetitions | |
1010 // and character classes like \d. | |
1011 Regexp* sre = re->Simplify(); | |
1012 if (sre == NULL) | |
1013 return NULL; | |
1014 | |
1015 // Record whether prog is anchored, removing the anchors. | |
1016 // (They get in the way of other optimizations.) | |
1017 bool is_anchor_start = IsAnchorStart(&sre, 0); | |
1018 bool is_anchor_end = IsAnchorEnd(&sre, 0); | |
1019 | |
1020 // Generate fragment for entire regexp. | |
1021 Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_); | |
1022 sre->Decref(); | |
1023 if (c.failed_) | |
1024 return NULL; | |
1025 | |
1026 // Success! Finish by putting Match node at end, and record start. | |
1027 // Turn off c.reversed_ (if it is set) to force the remaining concatenations | |
1028 // to behave normally. | |
1029 c.reversed_ = false; | |
1030 Frag all = c.Cat(f, c.Match(0)); | |
1031 c.prog_->set_start(all.begin); | |
1032 | |
1033 if (reversed) { | |
1034 c.prog_->set_anchor_start(is_anchor_end); | |
1035 c.prog_->set_anchor_end(is_anchor_start); | |
1036 } else { | |
1037 c.prog_->set_anchor_start(is_anchor_start); | |
1038 c.prog_->set_anchor_end(is_anchor_end); | |
1039 } | |
1040 | |
1041 // Also create unanchored version, which starts with a .*? loop. | |
1042 if (c.prog_->anchor_start()) { | |
1043 c.prog_->set_start_unanchored(c.prog_->start()); | |
1044 } else { | |
1045 Frag unanchored = c.Cat(c.DotStar(), all); | |
1046 c.prog_->set_start_unanchored(unanchored.begin); | |
1047 } | |
1048 | |
1049 c.prog_->set_reversed(reversed); | |
1050 | |
1051 // Hand ownership of prog_ to caller. | |
1052 return c.Finish(); | |
1053 } | |
1054 | |
1055 Prog* Compiler::Finish() { | |
1056 if (failed_) | |
1057 return NULL; | |
1058 | |
1059 if (prog_->start() == 0 && prog_->start_unanchored() == 0) { | |
1060 // No possible matches; keep Fail instruction only. | |
1061 inst_len_ = 1; | |
1062 } | |
1063 | |
1064 // Trim instruction to minimum array and transfer to Prog. | |
1065 Trim(); | |
1066 prog_->inst_ = inst_; | |
1067 prog_->size_ = inst_len_; | |
1068 inst_ = NULL; | |
1069 | |
1070 // Compute byte map. | |
1071 prog_->ComputeByteMap(); | |
1072 | |
1073 prog_->Optimize(); | |
1074 | |
1075 // Record remaining memory for DFA. | |
1076 if (max_mem_ <= 0) { | |
1077 prog_->set_dfa_mem(1<<20); | |
1078 } else { | |
1079 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); | |
1080 if (m < 0) | |
1081 m = 0; | |
1082 prog_->set_dfa_mem(m); | |
1083 } | |
1084 | |
1085 Prog* p = prog_; | |
1086 prog_ = NULL; | |
1087 return p; | |
1088 } | |
1089 | |
1090 // Converts Regexp to Prog. | |
1091 Prog* Regexp::CompileToProg(int64 max_mem) { | |
1092 return Compiler::Compile(this, false, max_mem); | |
1093 } | |
1094 | |
1095 Prog* Regexp::CompileToReverseProg(int64 max_mem) { | |
1096 return Compiler::Compile(this, true, max_mem); | |
1097 } | |
1098 | |
1099 Frag Compiler::DotStar() { | |
1100 return Star(ByteRange(0x00, 0xff, false), true); | |
1101 } | |
1102 | |
1103 // Compiles RE set to Prog. | |
1104 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
1105 Regexp* re) { | |
1106 Compiler c; | |
1107 | |
1108 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); | |
1109 c.Setup(pf, options.max_mem(), anchor); | |
1110 | |
1111 // Compile alternation of fragments. | |
1112 Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_); | |
1113 re->Decref(); | |
1114 if (c.failed_) | |
1115 return NULL; | |
1116 | |
1117 if (anchor == RE2::UNANCHORED) { | |
1118 // The trailing .* was added while handling kRegexpHaveMatch. | |
1119 // We just have to add the leading one. | |
1120 all = c.Cat(c.DotStar(), all); | |
1121 } | |
1122 | |
1123 c.prog_->set_start(all.begin); | |
1124 c.prog_->set_start_unanchored(all.begin); | |
1125 c.prog_->set_anchor_start(true); | |
1126 c.prog_->set_anchor_end(true); | |
1127 | |
1128 Prog* prog = c.Finish(); | |
1129 if (prog == NULL) | |
1130 return NULL; | |
1131 | |
1132 // Make sure DFA has enough memory to operate, | |
1133 // since we're not going to fall back to the NFA. | |
1134 bool failed; | |
1135 StringPiece sp = "hello, world"; | |
1136 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, | |
1137 NULL, &failed, NULL); | |
1138 if (failed) { | |
1139 delete prog; | |
1140 return NULL; | |
1141 } | |
1142 | |
1143 return prog; | |
1144 } | |
1145 | |
1146 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
1147 Regexp* re) { | |
1148 return Compiler::CompileSet(options, anchor, re); | |
1149 } | |
1150 | |
1151 } // namespace re2 | |
OLD | NEW |