Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(138)

Side by Side Diff: third_party/re2/re2/compile.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/bitstate.cc ('k') | third_party/re2/re2/dfa.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10
11 #include "re2/prog.h"
12 #include "re2/re2.h"
13 #include "re2/regexp.h"
14 #include "re2/walker-inl.h"
15
16 namespace re2 {
17
18 // List of pointers to Inst* that need to be filled in (patched).
19 // Because the Inst* haven't been filled in yet,
20 // we can use the Inst* word to hold the list's "next" pointer.
21 // It's kind of sleazy, but it works well in practice.
22 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23 //
24 // Because the out and out1 fields in Inst are no longer pointers,
25 // we can't use pointers directly here either. Instead, p refers
26 // to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27 // p == 0 represents the NULL list. This is okay because instruction #0
28 // is always the fail instruction, which never appears on a list.
29
30 struct PatchList {
31 uint32 p;
32
33 // Returns patch list containing just p.
34 static PatchList Mk(uint32 p);
35
36 // Patches all the entries on l to have value v.
37 // Caller must not ever use patch list again.
38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39
40 // Deref returns the next pointer pointed at by p.
41 static PatchList Deref(Prog::Inst *inst0, PatchList l);
42
43 // Appends two patch lists and returns result.
44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45 };
46
47 static PatchList nullPatchList = { 0 };
48
49 // Returns patch list containing just p.
50 PatchList PatchList::Mk(uint32 p) {
51 PatchList l;
52 l.p = p;
53 return l;
54 }
55
56 // Returns the next pointer pointed at by l.
57 PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58 Prog::Inst* ip = &inst0[l.p>>1];
59 if (l.p&1)
60 l.p = ip->out1();
61 else
62 l.p = ip->out();
63 return l;
64 }
65
66 // Patches all the entries on l to have value v.
67 void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68 while (l.p != 0) {
69 Prog::Inst* ip = &inst0[l.p>>1];
70 if (l.p&1) {
71 l.p = ip->out1();
72 ip->out1_ = val;
73 } else {
74 l.p = ip->out();
75 ip->set_out(val);
76 }
77 }
78 }
79
80 // Appends two patch lists and returns result.
81 PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82 if (l1.p == 0)
83 return l2;
84 if (l2.p == 0)
85 return l1;
86
87 PatchList l = l1;
88 for (;;) {
89 PatchList next = PatchList::Deref(inst0, l);
90 if (next.p == 0)
91 break;
92 l = next;
93 }
94
95 Prog::Inst* ip = &inst0[l.p>>1];
96 if (l.p&1)
97 ip->out1_ = l2.p;
98 else
99 ip->set_out(l2.p);
100
101 return l1;
102 }
103
104 // Compiled program fragment.
105 struct Frag {
106 uint32 begin;
107 PatchList end;
108
109 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
110 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
111 };
112
113 // Input encodings.
114 enum Encoding {
115 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
116 kEncodingLatin1, // Latin1 (0-FF)
117 };
118
119 class Compiler : public Regexp::Walker<Frag> {
120 public:
121 explicit Compiler();
122 ~Compiler();
123
124 // Compiles Regexp to a new Prog.
125 // Caller is responsible for deleting Prog when finished with it.
126 // If reversed is true, compiles for walking over the input
127 // string backward (reverses all concatenations).
128 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
129
130 // Compiles alternation of all the re to a new Prog.
131 // Each re has a match with an id equal to its index in the vector.
132 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
133 Regexp* re);
134
135 // Interface for Regexp::Walker, which helps traverse the Regexp.
136 // The walk is purely post-recursive: given the machines for the
137 // children, PostVisit combines them to create the machine for
138 // the current node. The child_args are Frags.
139 // The Compiler traverses the Regexp parse tree, visiting
140 // each node in depth-first order. It invokes PreVisit before
141 // visiting the node's children and PostVisit after visiting
142 // the children.
143 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
144 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
145 int nchild_args);
146 Frag ShortVisit(Regexp* re, Frag parent_arg);
147 Frag Copy(Frag arg);
148
149 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
150 Frag Plus(Frag a, bool nongreedy);
151 Frag Star(Frag a, bool nongreedy);
152 Frag Quest(Frag a, bool nongreedy);
153
154 // Given fragment a, returns (a) capturing as \n.
155 Frag Capture(Frag a, int n);
156
157 // Given fragments a and b, returns ab; a|b
158 Frag Cat(Frag a, Frag b);
159 Frag Alt(Frag a, Frag b);
160
161 // Returns a fragment that can't match anything.
162 Frag NoMatch();
163
164 // Returns a fragment that matches the empty string.
165 Frag Match(int32 id);
166
167 // Returns a no-op fragment.
168 Frag Nop();
169
170 // Returns a fragment matching the byte range lo-hi.
171 Frag ByteRange(int lo, int hi, bool foldcase);
172
173 // Returns a fragment matching an empty-width special op.
174 Frag EmptyWidth(EmptyOp op);
175
176 // Adds n instructions to the program.
177 // Returns the index of the first one.
178 // Returns -1 if no more instructions are available.
179 int AllocInst(int n);
180
181 // Deletes unused instructions.
182 void Trim();
183
184 // Rune range compiler.
185
186 // Begins a new alternation.
187 void BeginRange();
188
189 // Adds a fragment matching the rune range lo-hi.
190 void AddRuneRange(Rune lo, Rune hi, bool foldcase);
191 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
192 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
193 void Add_80_10ffff();
194
195 // New suffix that matches the byte range lo-hi, then goes to next.
196 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
197 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
198
199 // Adds a suffix to alternation.
200 void AddSuffix(int id);
201
202 // Returns the alternation of all the added suffixes.
203 Frag EndRange();
204
205 // Single rune.
206 Frag Literal(Rune r, bool foldcase);
207
208 void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
209 Prog* Finish();
210
211 // Returns .* where dot = any byte
212 Frag DotStar();
213
214 private:
215 Prog* prog_; // Program being built.
216 bool failed_; // Did we give up compiling?
217 Encoding encoding_; // Input encoding
218 bool reversed_; // Should program run backward over text?
219
220 int max_inst_; // Maximum number of instructions.
221
222 Prog::Inst* inst_; // Pointer to first instruction.
223 int inst_len_; // Number of instructions used.
224 int inst_cap_; // Number of instructions allocated.
225
226 int64 max_mem_; // Total memory budget.
227
228 map<uint64, int> rune_cache_;
229 Frag rune_range_;
230
231 RE2::Anchor anchor_; // anchor mode for RE2::Set
232
233 DISALLOW_COPY_AND_ASSIGN(Compiler);
234 };
235
236 Compiler::Compiler() {
237 prog_ = new Prog();
238 failed_ = false;
239 encoding_ = kEncodingUTF8;
240 reversed_ = false;
241 inst_ = NULL;
242 inst_len_ = 0;
243 inst_cap_ = 0;
244 max_inst_ = 1; // make AllocInst for fail instruction okay
245 max_mem_ = 0;
246 int fail = AllocInst(1);
247 inst_[fail].InitFail();
248 max_inst_ = 0; // Caller must change
249 }
250
251 Compiler::~Compiler() {
252 delete prog_;
253 delete[] inst_;
254 }
255
256 int Compiler::AllocInst(int n) {
257 if (failed_ || inst_len_ + n > max_inst_) {
258 failed_ = true;
259 return -1;
260 }
261
262 if (inst_len_ + n > inst_cap_) {
263 if (inst_cap_ == 0)
264 inst_cap_ = 8;
265 while (inst_len_ + n > inst_cap_)
266 inst_cap_ *= 2;
267 Prog::Inst* ip = new Prog::Inst[inst_cap_];
268 memmove(ip, inst_, inst_len_ * sizeof ip[0]);
269 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
270 delete[] inst_;
271 inst_ = ip;
272 }
273 int id = inst_len_;
274 inst_len_ += n;
275 return id;
276 }
277
278 void Compiler::Trim() {
279 if (inst_len_ < inst_cap_) {
280 Prog::Inst* ip = new Prog::Inst[inst_len_];
281 memmove(ip, inst_, inst_len_ * sizeof ip[0]);
282 delete[] inst_;
283 inst_ = ip;
284 inst_cap_ = inst_len_;
285 }
286 }
287
288 // These routines are somewhat hard to visualize in text --
289 // see http://swtch.com/~rsc/regexp/regexp1.html for
290 // pictures explaining what is going on here.
291
292 // Returns an unmatchable fragment.
293 Frag Compiler::NoMatch() {
294 return Frag(0, nullPatchList);
295 }
296
297 // Is a an unmatchable fragment?
298 static bool IsNoMatch(Frag a) {
299 return a.begin == 0;
300 }
301
302 // Given fragments a and b, returns fragment for ab.
303 Frag Compiler::Cat(Frag a, Frag b) {
304 if (IsNoMatch(a) || IsNoMatch(b))
305 return NoMatch();
306
307 // Elide no-op.
308 Prog::Inst* begin = &inst_[a.begin];
309 if (begin->opcode() == kInstNop &&
310 a.end.p == (a.begin << 1) &&
311 begin->out() == 0) {
312 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere
313 return b;
314 }
315
316 // To run backward over string, reverse all concatenations.
317 if (reversed_) {
318 PatchList::Patch(inst_, b.end, a.begin);
319 return Frag(b.begin, a.end);
320 }
321
322 PatchList::Patch(inst_, a.end, b.begin);
323 return Frag(a.begin, b.end);
324 }
325
326 // Given fragments for a and b, returns fragment for a|b.
327 Frag Compiler::Alt(Frag a, Frag b) {
328 // Special case for convenience in loops.
329 if (IsNoMatch(a))
330 return b;
331 if (IsNoMatch(b))
332 return a;
333
334 int id = AllocInst(1);
335 if (id < 0)
336 return NoMatch();
337
338 inst_[id].InitAlt(a.begin, b.begin);
339 return Frag(id, PatchList::Append(inst_, a.end, b.end));
340 }
341
342 // When capturing submatches in like-Perl mode, a kOpAlt Inst
343 // treats out_ as the first choice, out1_ as the second.
344 //
345 // For *, +, and ?, if out_ causes another repetition,
346 // then the operator is greedy. If out1_ is the repetition
347 // (and out_ moves forward), then the operator is non-greedy.
348
349 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
350 Frag Compiler::Star(Frag a, bool nongreedy) {
351 int id = AllocInst(1);
352 if (id < 0)
353 return NoMatch();
354 inst_[id].InitAlt(0, 0);
355 PatchList::Patch(inst_, a.end, id);
356 if (nongreedy) {
357 inst_[id].out1_ = a.begin;
358 return Frag(id, PatchList::Mk(id << 1));
359 } else {
360 inst_[id].set_out(a.begin);
361 return Frag(id, PatchList::Mk((id << 1) | 1));
362 }
363 }
364
365 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
366 Frag Compiler::Plus(Frag a, bool nongreedy) {
367 // a+ is just a* with a different entry point.
368 Frag f = Star(a, nongreedy);
369 return Frag(a.begin, f.end);
370 }
371
372 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
373 Frag Compiler::Quest(Frag a, bool nongreedy) {
374 if (IsNoMatch(a))
375 return Nop();
376 int id = AllocInst(1);
377 if (id < 0)
378 return NoMatch();
379 PatchList pl;
380 if (nongreedy) {
381 inst_[id].InitAlt(0, a.begin);
382 pl = PatchList::Mk(id << 1);
383 } else {
384 inst_[id].InitAlt(a.begin, 0);
385 pl = PatchList::Mk((id << 1) | 1);
386 }
387 return Frag(id, PatchList::Append(inst_, pl, a.end));
388 }
389
390 // Returns a fragment for the byte range lo-hi.
391 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
392 int id = AllocInst(1);
393 if (id < 0)
394 return NoMatch();
395 inst_[id].InitByteRange(lo, hi, foldcase, 0);
396 prog_->byte_inst_count_++;
397 prog_->MarkByteRange(lo, hi);
398 if (foldcase && lo <= 'z' && hi >= 'a') {
399 if (lo < 'a')
400 lo = 'a';
401 if (hi > 'z')
402 hi = 'z';
403 if (lo <= hi)
404 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
405 }
406 return Frag(id, PatchList::Mk(id << 1));
407 }
408
409 // Returns a no-op fragment. Sometimes unavoidable.
410 Frag Compiler::Nop() {
411 int id = AllocInst(1);
412 if (id < 0)
413 return NoMatch();
414 inst_[id].InitNop(0);
415 return Frag(id, PatchList::Mk(id << 1));
416 }
417
418 // Returns a fragment that signals a match.
419 Frag Compiler::Match(int32 match_id) {
420 int id = AllocInst(1);
421 if (id < 0)
422 return NoMatch();
423 inst_[id].InitMatch(match_id);
424 return Frag(id, nullPatchList);
425 }
426
427 // Returns a fragment matching a particular empty-width op (like ^ or $)
428 Frag Compiler::EmptyWidth(EmptyOp empty) {
429 int id = AllocInst(1);
430 if (id < 0)
431 return NoMatch();
432 inst_[id].InitEmptyWidth(empty, 0);
433 if (empty & (kEmptyBeginLine|kEmptyEndLine))
434 prog_->MarkByteRange('\n', '\n');
435 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
436 int j;
437 for (int i = 0; i < 256; i = j) {
438 for (j = i + 1; j < 256 &&
439 Prog::IsWordChar(static_cast<uint8>(i)) ==
440 Prog::IsWordChar(static_cast<uint8>(j));
441 j++)
442 ;
443 prog_->MarkByteRange(i, j-1);
444 }
445 }
446 return Frag(id, PatchList::Mk(id << 1));
447 }
448
449 // Given a fragment a, returns a fragment with capturing parens around a.
450 Frag Compiler::Capture(Frag a, int n) {
451 if (IsNoMatch(a))
452 return NoMatch();
453 int id = AllocInst(2);
454 if (id < 0)
455 return NoMatch();
456 inst_[id].InitCapture(2*n, a.begin);
457 inst_[id+1].InitCapture(2*n+1, 0);
458 PatchList::Patch(inst_, a.end, id+1);
459
460 return Frag(id, PatchList::Mk((id+1) << 1));
461 }
462
463 // A Rune is a name for a Unicode code point.
464 // Returns maximum rune encoded by UTF-8 sequence of length len.
465 static int MaxRune(int len) {
466 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
467 if (len == 1)
468 b = 7;
469 else
470 b = 8-(len+1) + 6*(len-1);
471 return (1<<b) - 1; // maximum Rune for b bits.
472 }
473
474 // The rune range compiler caches common suffix fragments,
475 // which are very common in UTF-8 (e.g., [80-bf]).
476 // The fragment suffixes are identified by their start
477 // instructions. NULL denotes the eventual end match.
478 // The Frag accumulates in rune_range_. Caching common
479 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
480 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
481
482 void Compiler::BeginRange() {
483 rune_cache_.clear();
484 rune_range_.begin = 0;
485 rune_range_.end = nullPatchList;
486 }
487
488 int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
489 int next) {
490 Frag f = ByteRange(lo, hi, foldcase);
491 if (next != 0) {
492 PatchList::Patch(inst_, f.end, next);
493 } else {
494 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
495 }
496 return f.begin;
497 }
498
499 int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
500 // In Latin1 mode, there's no point in caching.
501 // In forward UTF-8 mode, only need to cache continuation bytes.
502 if (encoding_ == kEncodingLatin1 ||
503 (encoding_ == kEncodingUTF8 &&
504 !reversed_ &&
505 !(0x80 <= lo && hi <= 0xbf))) {
506 return UncachedRuneByteSuffix(lo, hi, foldcase, next);
507 }
508
509 uint64 key = (uint64)next << 17 |
510 (uint64)lo << 9 |
511 (uint64)hi << 1 |
512 (uint64)foldcase;
513 map<uint64, int>::iterator it = rune_cache_.find(key);
514 if (it != rune_cache_.end())
515 return it->second;
516 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
517 rune_cache_[key] = id;
518 return id;
519 }
520
521 void Compiler::AddSuffix(int id) {
522 if (rune_range_.begin == 0) {
523 rune_range_.begin = id;
524 return;
525 }
526
527 int alt = AllocInst(1);
528 if (alt < 0) {
529 rune_range_.begin = 0;
530 return;
531 }
532 inst_[alt].InitAlt(rune_range_.begin, id);
533 rune_range_.begin = alt;
534 }
535
536 Frag Compiler::EndRange() {
537 return rune_range_;
538 }
539
540 // Converts rune range lo-hi into a fragment that recognizes
541 // the bytes that would make up those runes in the current
542 // encoding (Latin 1 or UTF-8).
543 // This lets the machine work byte-by-byte even when
544 // using multibyte encodings.
545
546 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
547 switch (encoding_) {
548 default:
549 case kEncodingUTF8:
550 AddRuneRangeUTF8(lo, hi, foldcase);
551 break;
552 case kEncodingLatin1:
553 AddRuneRangeLatin1(lo, hi, foldcase);
554 break;
555 }
556 }
557
558 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
559 // Latin1 is easy: runes *are* bytes.
560 if (lo > hi || lo > 0xFF)
561 return;
562 if (hi > 0xFF)
563 hi = 0xFF;
564 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
565 foldcase, 0));
566 }
567
568 // Table describing how to make a UTF-8 matching machine
569 // for the rune range 80-10FFFF (Runeself-Runemax).
570 // This range happens frequently enough (for example /./ and /[^a-z]/)
571 // and the rune_cache_ map is slow enough that this is worth
572 // special handling. Makes compilation of a small expression
573 // with a dot in it about 10% faster.
574 // The * in the comments below mark whole sequences.
575 static struct ByteRangeProg {
576 int next;
577 int lo;
578 int hi;
579 } prog_80_10ffff[] = {
580 // Two-byte
581 { -1, 0x80, 0xBF, }, // 0: 80-BF
582 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
583
584 // Three-byte
585 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
586 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
587 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
588 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
589
590 // Four-byte
591 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
592 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
593 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
594 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
595 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
596 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
597 };
598
599 void Compiler::Add_80_10ffff() {
600 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialize d; silences gcc warning
601 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
602 const ByteRangeProg& p = prog_80_10ffff[i];
603 int next = 0;
604 if (p.next >= 0)
605 next = inst[p.next];
606 inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo),
607 static_cast<uint8>(p.hi), false, next);
608 if ((p.lo & 0xC0) != 0x80)
609 AddSuffix(inst[i]);
610 }
611 }
612
613 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
614 if (lo > hi)
615 return;
616
617 // Pick off 80-10FFFF as a common special case
618 // that can bypass the slow rune_cache_.
619 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
620 Add_80_10ffff();
621 return;
622 }
623
624 // Split range into same-length sized ranges.
625 for (int i = 1; i < UTFmax; i++) {
626 Rune max = MaxRune(i);
627 if (lo <= max && max < hi) {
628 AddRuneRangeUTF8(lo, max, foldcase);
629 AddRuneRangeUTF8(max+1, hi, foldcase);
630 return;
631 }
632 }
633
634 // ASCII range is always a special case.
635 if (hi < Runeself) {
636 AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
637 foldcase, 0));
638 return;
639 }
640
641 // Split range into sections that agree on leading bytes.
642 for (int i = 1; i < UTFmax; i++) {
643 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
644 if ((lo & ~m) != (hi & ~m)) {
645 if ((lo & m) != 0) {
646 AddRuneRangeUTF8(lo, lo|m, foldcase);
647 AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
648 return;
649 }
650 if ((hi & m) != m) {
651 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
652 AddRuneRangeUTF8(hi&~m, hi, foldcase);
653 return;
654 }
655 }
656 }
657
658 // Finally. Generate byte matching equivalent for lo-hi.
659 uint8 ulo[UTFmax], uhi[UTFmax];
660 int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
661 int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
662 (void)m; // USED(m)
663 DCHECK_EQ(n, m);
664
665 int id = 0;
666 if (reversed_) {
667 for (int i = 0; i < n; i++)
668 id = RuneByteSuffix(ulo[i], uhi[i], false, id);
669 } else {
670 for (int i = n-1; i >= 0; i--)
671 id = RuneByteSuffix(ulo[i], uhi[i], false, id);
672 }
673 AddSuffix(id);
674 }
675
676 // Should not be called.
677 Frag Compiler::Copy(Frag arg) {
678 // We're using WalkExponential; there should be no copying.
679 LOG(DFATAL) << "Compiler::Copy called!";
680 failed_ = true;
681 return NoMatch();
682 }
683
684 // Visits a node quickly; called once WalkExponential has
685 // decided to cut this walk short.
686 Frag Compiler::ShortVisit(Regexp* re, Frag) {
687 failed_ = true;
688 return NoMatch();
689 }
690
691 // Called before traversing a node's children during the walk.
692 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
693 // Cut off walk if we've already failed.
694 if (failed_)
695 *stop = true;
696
697 return Frag(); // not used by caller
698 }
699
700 Frag Compiler::Literal(Rune r, bool foldcase) {
701 switch (encoding_) {
702 default:
703 return Frag();
704
705 case kEncodingLatin1:
706 return ByteRange(r, r, foldcase);
707
708 case kEncodingUTF8: {
709 if (r < Runeself) // Make common case fast.
710 return ByteRange(r, r, foldcase);
711 uint8 buf[UTFmax];
712 int n = runetochar(reinterpret_cast<char*>(buf), &r);
713 Frag f = ByteRange((uint8)buf[0], buf[0], false);
714 for (int i = 1; i < n; i++)
715 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
716 return f;
717 }
718 }
719 }
720
721 // Called after traversing the node's children during the walk.
722 // Given their frags, build and return the frag for this re.
723 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
724 int nchild_frags) {
725 // If a child failed, don't bother going forward, especially
726 // since the child_frags might contain Frags with NULLs in them.
727 if (failed_)
728 return NoMatch();
729
730 // Given the child fragments, return the fragment for this node.
731 switch (re->op()) {
732 case kRegexpRepeat:
733 // Should not see; code at bottom of function will print error
734 break;
735
736 case kRegexpNoMatch:
737 return NoMatch();
738
739 case kRegexpEmptyMatch:
740 return Nop();
741
742 case kRegexpHaveMatch: {
743 Frag f = Match(re->match_id());
744 // Remember unanchored match to end of string.
745 if (anchor_ != RE2::ANCHOR_BOTH)
746 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
747 return f;
748 }
749
750 case kRegexpConcat: {
751 Frag f = child_frags[0];
752 for (int i = 1; i < nchild_frags; i++)
753 f = Cat(f, child_frags[i]);
754 return f;
755 }
756
757 case kRegexpAlternate: {
758 Frag f = child_frags[0];
759 for (int i = 1; i < nchild_frags; i++)
760 f = Alt(f, child_frags[i]);
761 return f;
762 }
763
764 case kRegexpStar:
765 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
766
767 case kRegexpPlus:
768 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
769
770 case kRegexpQuest:
771 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
772
773 case kRegexpLiteral:
774 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
775
776 case kRegexpLiteralString: {
777 // Concatenation of literals.
778 if (re->nrunes() == 0)
779 return Nop();
780 Frag f;
781 for (int i = 0; i < re->nrunes(); i++) {
782 Frag f1 = Literal(re->runes()[i],
783 (re->parse_flags()&Regexp::FoldCase) != 0);
784 if (i == 0)
785 f = f1;
786 else
787 f = Cat(f, f1);
788 }
789 return f;
790 }
791
792 case kRegexpAnyChar:
793 BeginRange();
794 AddRuneRange(0, Runemax, false);
795 return EndRange();
796
797 case kRegexpAnyByte:
798 return ByteRange(0x00, 0xFF, false);
799
800 case kRegexpCharClass: {
801 CharClass* cc = re->cc();
802 if (cc->empty()) {
803 // This can't happen.
804 LOG(DFATAL) << "No ranges in char class";
805 failed_ = true;
806 return NoMatch();
807 }
808
809 // ASCII case-folding optimization: if the char class
810 // behaves the same on A-Z as it does on a-z,
811 // discard any ranges wholly contained in A-Z
812 // and mark the other ranges as foldascii.
813 // This reduces the size of a program for
814 // (?i)abc from 3 insts per letter to 1 per letter.
815 bool foldascii = cc->FoldsASCII();
816
817 // Character class is just a big OR of the different
818 // character ranges in the class.
819 BeginRange();
820 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
821 // ASCII case-folding optimization (see above).
822 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
823 continue;
824
825 // If this range contains all of A-Za-z or none of it,
826 // the fold flag is unnecessary; don't bother.
827 bool fold = foldascii;
828 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
829 ('Z' < i->lo && i->hi < 'a'))
830 fold = false;
831
832 AddRuneRange(i->lo, i->hi, fold);
833 }
834 return EndRange();
835 }
836
837 case kRegexpCapture:
838 // If this is a non-capturing parenthesis -- (?:foo) --
839 // just use the inner expression.
840 if (re->cap() < 0)
841 return child_frags[0];
842 return Capture(child_frags[0], re->cap());
843
844 case kRegexpBeginLine:
845 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
846
847 case kRegexpEndLine:
848 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
849
850 case kRegexpBeginText:
851 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
852
853 case kRegexpEndText:
854 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
855
856 case kRegexpWordBoundary:
857 return EmptyWidth(kEmptyWordBoundary);
858
859 case kRegexpNoWordBoundary:
860 return EmptyWidth(kEmptyNonWordBoundary);
861 }
862 LOG(DFATAL) << "Missing case in Compiler: " << re->op();
863 failed_ = true;
864 return NoMatch();
865 }
866
867 // Is this regexp required to start at the beginning of the text?
868 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
869 // but handles (\A(a|b)). Could use the Walker to write a more exact one.
870 static bool IsAnchorStart(Regexp** pre, int depth) {
871 Regexp* re = *pre;
872 Regexp* sub;
873 // The depth limit makes sure that we don't overflow
874 // the stack on a deeply nested regexp. As the comment
875 // above says, IsAnchorStart is conservative, so returning
876 // a false negative is okay. The exact limit is somewhat arbitrary.
877 if (re == NULL || depth >= 4)
878 return false;
879 switch (re->op()) {
880 default:
881 break;
882 case kRegexpConcat:
883 if (re->nsub() > 0) {
884 sub = re->sub()[0]->Incref();
885 if (IsAnchorStart(&sub, depth+1)) {
886 Regexp** subcopy = new Regexp*[re->nsub()];
887 subcopy[0] = sub; // already have reference
888 for (int i = 1; i < re->nsub(); i++)
889 subcopy[i] = re->sub()[i]->Incref();
890 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
891 delete[] subcopy;
892 re->Decref();
893 return true;
894 }
895 sub->Decref();
896 }
897 break;
898 case kRegexpCapture:
899 sub = re->sub()[0]->Incref();
900 if (IsAnchorStart(&sub, depth+1)) {
901 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
902 re->Decref();
903 return true;
904 }
905 sub->Decref();
906 break;
907 case kRegexpBeginText:
908 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
909 re->Decref();
910 return true;
911 }
912 return false;
913 }
914
915 // Is this regexp required to start at the end of the text?
916 // Only approximate; can return false for complicated regexps like (a\z|b\z),
917 // but handles ((a|b)\z). Could use the Walker to write a more exact one.
918 static bool IsAnchorEnd(Regexp** pre, int depth) {
919 Regexp* re = *pre;
920 Regexp* sub;
921 // The depth limit makes sure that we don't overflow
922 // the stack on a deeply nested regexp. As the comment
923 // above says, IsAnchorEnd is conservative, so returning
924 // a false negative is okay. The exact limit is somewhat arbitrary.
925 if (re == NULL || depth >= 4)
926 return false;
927 switch (re->op()) {
928 default:
929 break;
930 case kRegexpConcat:
931 if (re->nsub() > 0) {
932 sub = re->sub()[re->nsub() - 1]->Incref();
933 if (IsAnchorEnd(&sub, depth+1)) {
934 Regexp** subcopy = new Regexp*[re->nsub()];
935 subcopy[re->nsub() - 1] = sub; // already have reference
936 for (int i = 0; i < re->nsub() - 1; i++)
937 subcopy[i] = re->sub()[i]->Incref();
938 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
939 delete[] subcopy;
940 re->Decref();
941 return true;
942 }
943 sub->Decref();
944 }
945 break;
946 case kRegexpCapture:
947 sub = re->sub()[0]->Incref();
948 if (IsAnchorEnd(&sub, depth+1)) {
949 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
950 re->Decref();
951 return true;
952 }
953 sub->Decref();
954 break;
955 case kRegexpEndText:
956 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
957 re->Decref();
958 return true;
959 }
960 return false;
961 }
962
963 void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
964 RE2::Anchor anchor) {
965 prog_->set_flags(flags);
966
967 if (flags & Regexp::Latin1)
968 encoding_ = kEncodingLatin1;
969 max_mem_ = max_mem;
970 if (max_mem <= 0) {
971 max_inst_ = 100000; // more than enough
972 } else if (max_mem <= static_cast<int64>(sizeof(Prog))) {
973 // No room for anything.
974 max_inst_ = 0;
975 } else {
976 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
977 // Limit instruction count so that inst->id() fits nicely in an int.
978 // SparseArray also assumes that the indices (inst->id()) are ints.
979 // The call to WalkExponential uses 2*max_inst_ below,
980 // and other places in the code use 2 or 3 * prog->size().
981 // Limiting to 2^24 should avoid overflow in those places.
982 // (The point of allowing more than 32 bits of memory is to
983 // have plenty of room for the DFA states, not to use it up
984 // on the program.)
985 if (m >= 1<<24)
986 m = 1<<24;
987
988 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
989 if (m > Prog::Inst::kMaxInst)
990 m = Prog::Inst::kMaxInst;
991
992 max_inst_ = static_cast<int>(m);
993 }
994
995 anchor_ = anchor;
996 }
997
998 // Compiles re, returning program.
999 // Caller is responsible for deleting prog_.
1000 // If reversed is true, compiles a program that expects
1001 // to run over the input string backward (reverses all concatenations).
1002 // The reversed flag is also recorded in the returned program.
1003 Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
1004 Compiler c;
1005
1006 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
1007 c.reversed_ = reversed;
1008
1009 // Simplify to remove things like counted repetitions
1010 // and character classes like \d.
1011 Regexp* sre = re->Simplify();
1012 if (sre == NULL)
1013 return NULL;
1014
1015 // Record whether prog is anchored, removing the anchors.
1016 // (They get in the way of other optimizations.)
1017 bool is_anchor_start = IsAnchorStart(&sre, 0);
1018 bool is_anchor_end = IsAnchorEnd(&sre, 0);
1019
1020 // Generate fragment for entire regexp.
1021 Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
1022 sre->Decref();
1023 if (c.failed_)
1024 return NULL;
1025
1026 // Success! Finish by putting Match node at end, and record start.
1027 // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1028 // to behave normally.
1029 c.reversed_ = false;
1030 Frag all = c.Cat(f, c.Match(0));
1031 c.prog_->set_start(all.begin);
1032
1033 if (reversed) {
1034 c.prog_->set_anchor_start(is_anchor_end);
1035 c.prog_->set_anchor_end(is_anchor_start);
1036 } else {
1037 c.prog_->set_anchor_start(is_anchor_start);
1038 c.prog_->set_anchor_end(is_anchor_end);
1039 }
1040
1041 // Also create unanchored version, which starts with a .*? loop.
1042 if (c.prog_->anchor_start()) {
1043 c.prog_->set_start_unanchored(c.prog_->start());
1044 } else {
1045 Frag unanchored = c.Cat(c.DotStar(), all);
1046 c.prog_->set_start_unanchored(unanchored.begin);
1047 }
1048
1049 c.prog_->set_reversed(reversed);
1050
1051 // Hand ownership of prog_ to caller.
1052 return c.Finish();
1053 }
1054
1055 Prog* Compiler::Finish() {
1056 if (failed_)
1057 return NULL;
1058
1059 if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1060 // No possible matches; keep Fail instruction only.
1061 inst_len_ = 1;
1062 }
1063
1064 // Trim instruction to minimum array and transfer to Prog.
1065 Trim();
1066 prog_->inst_ = inst_;
1067 prog_->size_ = inst_len_;
1068 inst_ = NULL;
1069
1070 // Compute byte map.
1071 prog_->ComputeByteMap();
1072
1073 prog_->Optimize();
1074
1075 // Record remaining memory for DFA.
1076 if (max_mem_ <= 0) {
1077 prog_->set_dfa_mem(1<<20);
1078 } else {
1079 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1080 if (m < 0)
1081 m = 0;
1082 prog_->set_dfa_mem(m);
1083 }
1084
1085 Prog* p = prog_;
1086 prog_ = NULL;
1087 return p;
1088 }
1089
1090 // Converts Regexp to Prog.
1091 Prog* Regexp::CompileToProg(int64 max_mem) {
1092 return Compiler::Compile(this, false, max_mem);
1093 }
1094
1095 Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1096 return Compiler::Compile(this, true, max_mem);
1097 }
1098
1099 Frag Compiler::DotStar() {
1100 return Star(ByteRange(0x00, 0xff, false), true);
1101 }
1102
1103 // Compiles RE set to Prog.
1104 Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1105 Regexp* re) {
1106 Compiler c;
1107
1108 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1109 c.Setup(pf, options.max_mem(), anchor);
1110
1111 // Compile alternation of fragments.
1112 Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_);
1113 re->Decref();
1114 if (c.failed_)
1115 return NULL;
1116
1117 if (anchor == RE2::UNANCHORED) {
1118 // The trailing .* was added while handling kRegexpHaveMatch.
1119 // We just have to add the leading one.
1120 all = c.Cat(c.DotStar(), all);
1121 }
1122
1123 c.prog_->set_start(all.begin);
1124 c.prog_->set_start_unanchored(all.begin);
1125 c.prog_->set_anchor_start(true);
1126 c.prog_->set_anchor_end(true);
1127
1128 Prog* prog = c.Finish();
1129 if (prog == NULL)
1130 return NULL;
1131
1132 // Make sure DFA has enough memory to operate,
1133 // since we're not going to fall back to the NFA.
1134 bool failed;
1135 StringPiece sp = "hello, world";
1136 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1137 NULL, &failed, NULL);
1138 if (failed) {
1139 delete prog;
1140 return NULL;
1141 }
1142
1143 return prog;
1144 }
1145
1146 Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1147 Regexp* re) {
1148 return Compiler::CompileSet(options, anchor, re);
1149 }
1150
1151 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/bitstate.cc ('k') | third_party/re2/re2/dfa.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698