| OLD | NEW |
| 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. |
| 2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 // Tested by search_test.cc. | 5 // Tested by search_test.cc. |
| 6 // | 6 // |
| 7 // Prog::SearchOnePass is an efficient implementation of | 7 // Prog::SearchOnePass is an efficient implementation of |
| 8 // regular expression search with submatch tracking for | 8 // regular expression search with submatch tracking for |
| 9 // what I call "one-pass regular expressions". (An alternate | 9 // what I call "one-pass regular expressions". (An alternate |
| 10 // name might be "backtracking-free regular expressions".) | 10 // name might be "backtracking-free regular expressions".) |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 46 // One-pass regular expressions get used a lot when RE is | 46 // One-pass regular expressions get used a lot when RE is |
| 47 // used for parsing simple strings, so it pays off to | 47 // used for parsing simple strings, so it pays off to |
| 48 // notice them and handle them efficiently. | 48 // notice them and handle them efficiently. |
| 49 // | 49 // |
| 50 // See also Anne Brüggemann-Klein and Derick Wood, | 50 // See also Anne Brüggemann-Klein and Derick Wood, |
| 51 // "One-unambiguous regular languages", Information and Computation 142(2). | 51 // "One-unambiguous regular languages", Information and Computation 142(2). |
| 52 | 52 |
| 53 #include <string.h> | 53 #include <string.h> |
| 54 #include <map> | 54 #include <map> |
| 55 #include "util/util.h" | 55 #include "util/util.h" |
| 56 #include "util/arena.h" | |
| 57 #include "util/sparse_set.h" | 56 #include "util/sparse_set.h" |
| 58 #include "re2/prog.h" | 57 #include "re2/prog.h" |
| 59 #include "re2/stringpiece.h" | 58 #include "re2/stringpiece.h" |
| 60 | 59 |
| 61 namespace re2 { | 60 namespace re2 { |
| 62 | 61 |
| 63 static const int Debug = 0; | 62 static const int Debug = 0; |
| 64 | 63 |
| 65 // The key insight behind this implementation is that the | 64 // The key insight behind this implementation is that the |
| 66 // non-determinism in an NFA for a one-pass regular expression | 65 // non-determinism in an NFA for a one-pass regular expression |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 119 // To execute a one-pass regular expression program, we can build | 118 // To execute a one-pass regular expression program, we can build |
| 120 // a DFA (no non-determinism) that has at most as many states as | 119 // a DFA (no non-determinism) that has at most as many states as |
| 121 // the NFA (compare this to the possibly exponential number of states | 120 // the NFA (compare this to the possibly exponential number of states |
| 122 // in the general case). Each state records, for each possible | 121 // in the general case). Each state records, for each possible |
| 123 // input byte, the next state along with the conditions required | 122 // input byte, the next state along with the conditions required |
| 124 // before entering that state -- empty-width flags that must be true | 123 // before entering that state -- empty-width flags that must be true |
| 125 // and capture operations that must be performed. It also records | 124 // and capture operations that must be performed. It also records |
| 126 // whether a set of conditions required to finish a match at that | 125 // whether a set of conditions required to finish a match at that |
| 127 // point in the input rather than process the next byte. | 126 // point in the input rather than process the next byte. |
| 128 | 127 |
| 129 // A state in the one-pass NFA (aka DFA) - just an array of actions. | |
| 130 struct OneState; | |
| 131 | |
| 132 // A state in the one-pass NFA - just an array of actions indexed | 128 // A state in the one-pass NFA - just an array of actions indexed |
| 133 // by the bytemap_[] of the next input byte. (The bytemap | 129 // by the bytemap_[] of the next input byte. (The bytemap |
| 134 // maps next input bytes into equivalence classes, to reduce | 130 // maps next input bytes into equivalence classes, to reduce |
| 135 // the memory footprint.) | 131 // the memory footprint.) |
| 136 struct OneState { | 132 struct OneState { |
| 137 uint32 matchcond; // conditions to match right now. | 133 uint32 matchcond; // conditions to match right now. |
| 138 uint32 action[1]; | 134 uint32 action[1]; |
| 139 }; | 135 }; |
| 140 | 136 |
| 141 // The uint32 conditions in the action are a combination of | 137 // The uint32 conditions in the action are a combination of |
| (...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 328 matchcap[i] = cap[i]; | 324 matchcap[i] = cap[i]; |
| 329 matchcap[1] = p; | 325 matchcap[1] = p; |
| 330 matched = true; | 326 matched = true; |
| 331 } | 327 } |
| 332 } | 328 } |
| 333 | 329 |
| 334 done: | 330 done: |
| 335 if (!matched) | 331 if (!matched) |
| 336 return false; | 332 return false; |
| 337 for (int i = 0; i < nmatch; i++) | 333 for (int i = 0; i < nmatch; i++) |
| 338 match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); | 334 match[i].set(matchcap[2*i], |
| 335 static_cast<int>(matchcap[2*i+1] - matchcap[2*i])); |
| 339 return true; | 336 return true; |
| 340 } | 337 } |
| 341 | 338 |
| 342 | 339 |
| 343 // Analysis to determine whether a given regexp program is one-pass. | 340 // Analysis to determine whether a given regexp program is one-pass. |
| 344 | 341 |
| 345 // If ip is not on workq, adds ip to work queue and returns true. | 342 // If ip is not on workq, adds ip to work queue and returns true. |
| 346 // If ip is already on work queue, does nothing and returns false. | 343 // If ip is already on work queue, does nothing and returns false. |
| 347 // If ip is NULL, does nothing and returns true (pretends to add it). | 344 // If ip is NULL, does nothing and returns true (pretends to add it). |
| 348 typedef SparseSet Instq; | 345 typedef SparseSet Instq; |
| (...skipping 256 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 605 return true; | 602 return true; |
| 606 | 603 |
| 607 fail: | 604 fail: |
| 608 delete[] stack; | 605 delete[] stack; |
| 609 delete[] nodebyid; | 606 delete[] nodebyid; |
| 610 delete[] nodes; | 607 delete[] nodes; |
| 611 return false; | 608 return false; |
| 612 } | 609 } |
| 613 | 610 |
| 614 } // namespace re2 | 611 } // namespace re2 |
| OLD | NEW |