OLD | NEW |
1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Tested by search_test.cc. | 5 // Tested by search_test.cc. |
6 // | 6 // |
7 // Prog::SearchOnePass is an efficient implementation of | 7 // Prog::SearchOnePass is an efficient implementation of |
8 // regular expression search with submatch tracking for | 8 // regular expression search with submatch tracking for |
9 // what I call "one-pass regular expressions". (An alternate | 9 // what I call "one-pass regular expressions". (An alternate |
10 // name might be "backtracking-free regular expressions".) | 10 // name might be "backtracking-free regular expressions".) |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
46 // One-pass regular expressions get used a lot when RE is | 46 // One-pass regular expressions get used a lot when RE is |
47 // used for parsing simple strings, so it pays off to | 47 // used for parsing simple strings, so it pays off to |
48 // notice them and handle them efficiently. | 48 // notice them and handle them efficiently. |
49 // | 49 // |
50 // See also Anne Brüggemann-Klein and Derick Wood, | 50 // See also Anne Brüggemann-Klein and Derick Wood, |
51 // "One-unambiguous regular languages", Information and Computation 142(2). | 51 // "One-unambiguous regular languages", Information and Computation 142(2). |
52 | 52 |
53 #include <string.h> | 53 #include <string.h> |
54 #include <map> | 54 #include <map> |
55 #include "util/util.h" | 55 #include "util/util.h" |
56 #include "util/arena.h" | |
57 #include "util/sparse_set.h" | 56 #include "util/sparse_set.h" |
58 #include "re2/prog.h" | 57 #include "re2/prog.h" |
59 #include "re2/stringpiece.h" | 58 #include "re2/stringpiece.h" |
60 | 59 |
61 namespace re2 { | 60 namespace re2 { |
62 | 61 |
63 static const int Debug = 0; | 62 static const int Debug = 0; |
64 | 63 |
65 // The key insight behind this implementation is that the | 64 // The key insight behind this implementation is that the |
66 // non-determinism in an NFA for a one-pass regular expression | 65 // non-determinism in an NFA for a one-pass regular expression |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
119 // To execute a one-pass regular expression program, we can build | 118 // To execute a one-pass regular expression program, we can build |
120 // a DFA (no non-determinism) that has at most as many states as | 119 // a DFA (no non-determinism) that has at most as many states as |
121 // the NFA (compare this to the possibly exponential number of states | 120 // the NFA (compare this to the possibly exponential number of states |
122 // in the general case). Each state records, for each possible | 121 // in the general case). Each state records, for each possible |
123 // input byte, the next state along with the conditions required | 122 // input byte, the next state along with the conditions required |
124 // before entering that state -- empty-width flags that must be true | 123 // before entering that state -- empty-width flags that must be true |
125 // and capture operations that must be performed. It also records | 124 // and capture operations that must be performed. It also records |
126 // whether a set of conditions required to finish a match at that | 125 // whether a set of conditions required to finish a match at that |
127 // point in the input rather than process the next byte. | 126 // point in the input rather than process the next byte. |
128 | 127 |
129 // A state in the one-pass NFA (aka DFA) - just an array of actions. | |
130 struct OneState; | |
131 | |
132 // A state in the one-pass NFA - just an array of actions indexed | 128 // A state in the one-pass NFA - just an array of actions indexed |
133 // by the bytemap_[] of the next input byte. (The bytemap | 129 // by the bytemap_[] of the next input byte. (The bytemap |
134 // maps next input bytes into equivalence classes, to reduce | 130 // maps next input bytes into equivalence classes, to reduce |
135 // the memory footprint.) | 131 // the memory footprint.) |
136 struct OneState { | 132 struct OneState { |
137 uint32 matchcond; // conditions to match right now. | 133 uint32 matchcond; // conditions to match right now. |
138 uint32 action[1]; | 134 uint32 action[1]; |
139 }; | 135 }; |
140 | 136 |
141 // The uint32 conditions in the action are a combination of | 137 // The uint32 conditions in the action are a combination of |
(...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
328 matchcap[i] = cap[i]; | 324 matchcap[i] = cap[i]; |
329 matchcap[1] = p; | 325 matchcap[1] = p; |
330 matched = true; | 326 matched = true; |
331 } | 327 } |
332 } | 328 } |
333 | 329 |
334 done: | 330 done: |
335 if (!matched) | 331 if (!matched) |
336 return false; | 332 return false; |
337 for (int i = 0; i < nmatch; i++) | 333 for (int i = 0; i < nmatch; i++) |
338 match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); | 334 match[i].set(matchcap[2*i], |
| 335 static_cast<int>(matchcap[2*i+1] - matchcap[2*i])); |
339 return true; | 336 return true; |
340 } | 337 } |
341 | 338 |
342 | 339 |
343 // Analysis to determine whether a given regexp program is one-pass. | 340 // Analysis to determine whether a given regexp program is one-pass. |
344 | 341 |
345 // If ip is not on workq, adds ip to work queue and returns true. | 342 // If ip is not on workq, adds ip to work queue and returns true. |
346 // If ip is already on work queue, does nothing and returns false. | 343 // If ip is already on work queue, does nothing and returns false. |
347 // If ip is NULL, does nothing and returns true (pretends to add it). | 344 // If ip is NULL, does nothing and returns true (pretends to add it). |
348 typedef SparseSet Instq; | 345 typedef SparseSet Instq; |
(...skipping 256 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
605 return true; | 602 return true; |
606 | 603 |
607 fail: | 604 fail: |
608 delete[] stack; | 605 delete[] stack; |
609 delete[] nodebyid; | 606 delete[] nodebyid; |
610 delete[] nodes; | 607 delete[] nodes; |
611 return false; | 608 return false; |
612 } | 609 } |
613 | 610 |
614 } // namespace re2 | 611 } // namespace re2 |
OLD | NEW |