Index: fusl/src/regex/regexec.c |
diff --git a/fusl/src/regex/regexec.c b/fusl/src/regex/regexec.c |
new file mode 100644 |
index 0000000000000000000000000000000000000000..16c5d0ac5c7b5ae6f4a7aaa9313d474682ef6a85 |
--- /dev/null |
+++ b/fusl/src/regex/regexec.c |
@@ -0,0 +1,1013 @@ |
+/* |
+ regexec.c - TRE POSIX compatible matching functions (and more). |
+ |
+ Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> |
+ All rights reserved. |
+ |
+ Redistribution and use in source and binary forms, with or without |
+ modification, are permitted provided that the following conditions |
+ are met: |
+ |
+ 1. Redistributions of source code must retain the above copyright |
+ notice, this list of conditions and the following disclaimer. |
+ |
+ 2. Redistributions in binary form must reproduce the above copyright |
+ notice, this list of conditions and the following disclaimer in the |
+ documentation and/or other materials provided with the distribution. |
+ |
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS |
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
+ |
+*/ |
+ |
+#include <stdlib.h> |
+#include <string.h> |
+#include <wchar.h> |
+#include <wctype.h> |
+#include <limits.h> |
+ |
+#include <regex.h> |
+ |
+#include "tre.h" |
+ |
+#include <assert.h> |
+ |
+static void |
+tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, |
+ const tre_tnfa_t *tnfa, int *tags, int match_eo); |
+ |
+/*********************************************************************** |
+ from tre-match-utils.h |
+***********************************************************************/ |
+ |
+#define GET_NEXT_WCHAR() do { \ |
+ prev_c = next_c; pos += pos_add_next; \ |
+ if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \ |
+ if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \ |
+ else pos_add_next++; \ |
+ } \ |
+ str_byte += pos_add_next; \ |
+ } while (0) |
+ |
+#define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c)) |
+ |
+#define CHECK_ASSERTIONS(assertions) \ |
+ (((assertions & ASSERT_AT_BOL) \ |
+ && (pos > 0 || reg_notbol) \ |
+ && (prev_c != L'\n' || !reg_newline)) \ |
+ || ((assertions & ASSERT_AT_EOL) \ |
+ && (next_c != L'\0' || reg_noteol) \ |
+ && (next_c != L'\n' || !reg_newline)) \ |
+ || ((assertions & ASSERT_AT_BOW) \ |
+ && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ |
+ || ((assertions & ASSERT_AT_EOW) \ |
+ && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ |
+ || ((assertions & ASSERT_AT_WB) \ |
+ && (pos != 0 && next_c != L'\0' \ |
+ && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ |
+ || ((assertions & ASSERT_AT_WB_NEG) \ |
+ && (pos == 0 || next_c == L'\0' \ |
+ || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) |
+ |
+#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ |
+ (((trans_i->assertions & ASSERT_CHAR_CLASS) \ |
+ && !(tnfa->cflags & REG_ICASE) \ |
+ && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \ |
+ || ((trans_i->assertions & ASSERT_CHAR_CLASS) \ |
+ && (tnfa->cflags & REG_ICASE) \ |
+ && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ |
+ && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ |
+ || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ |
+ && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ |
+ tnfa->cflags & REG_ICASE))) |
+ |
+ |
+ |
+ |
+/* Returns 1 if `t1' wins `t2', 0 otherwise. */ |
+static int |
+tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, |
+ int *t1, int *t2) |
+{ |
+ int i; |
+ for (i = 0; i < num_tags; i++) |
+ { |
+ if (tag_directions[i] == TRE_TAG_MINIMIZE) |
+ { |
+ if (t1[i] < t2[i]) |
+ return 1; |
+ if (t1[i] > t2[i]) |
+ return 0; |
+ } |
+ else |
+ { |
+ if (t1[i] > t2[i]) |
+ return 1; |
+ if (t1[i] < t2[i]) |
+ return 0; |
+ } |
+ } |
+ /* assert(0);*/ |
+ return 0; |
+} |
+ |
+static int |
+tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) |
+{ |
+ while (*classes != (tre_ctype_t)0) |
+ if ((!icase && tre_isctype(wc, *classes)) |
+ || (icase && (tre_isctype(tre_toupper(wc), *classes) |
+ || tre_isctype(tre_tolower(wc), *classes)))) |
+ return 1; /* Match. */ |
+ else |
+ classes++; |
+ return 0; /* No match. */ |
+} |
+ |
+ |
+/*********************************************************************** |
+ from tre-match-parallel.c |
+***********************************************************************/ |
+ |
+/* |
+ This algorithm searches for matches basically by reading characters |
+ in the searched string one by one, starting at the beginning. All |
+ matching paths in the TNFA are traversed in parallel. When two or |
+ more paths reach the same state, exactly one is chosen according to |
+ tag ordering rules; if returning submatches is not required it does |
+ not matter which path is chosen. |
+ |
+ The worst case time required for finding the leftmost and longest |
+ match, or determining that there is no match, is always linearly |
+ dependent on the length of the text being searched. |
+ |
+ This algorithm cannot handle TNFAs with back referencing nodes. |
+ See `tre-match-backtrack.c'. |
+*/ |
+ |
+typedef struct { |
+ tre_tnfa_transition_t *state; |
+ int *tags; |
+} tre_tnfa_reach_t; |
+ |
+typedef struct { |
+ int pos; |
+ int **tags; |
+} tre_reach_pos_t; |
+ |
+ |
+static reg_errcode_t |
+tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, |
+ int *match_tags, int eflags, |
+ int *match_end_ofs) |
+{ |
+ /* State variables required by GET_NEXT_WCHAR. */ |
+ tre_char_t prev_c = 0, next_c = 0; |
+ const char *str_byte = string; |
+ int pos = -1; |
+ int pos_add_next = 1; |
+#ifdef TRE_MBSTATE |
+ mbstate_t mbstate; |
+#endif /* TRE_MBSTATE */ |
+ int reg_notbol = eflags & REG_NOTBOL; |
+ int reg_noteol = eflags & REG_NOTEOL; |
+ int reg_newline = tnfa->cflags & REG_NEWLINE; |
+ reg_errcode_t ret; |
+ |
+ char *buf; |
+ tre_tnfa_transition_t *trans_i; |
+ tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; |
+ tre_reach_pos_t *reach_pos; |
+ int *tag_i; |
+ int num_tags, i; |
+ |
+ int match_eo = -1; /* end offset of match (-1 if no match found yet) */ |
+ int new_match = 0; |
+ int *tmp_tags = NULL; |
+ int *tmp_iptr; |
+ |
+#ifdef TRE_MBSTATE |
+ memset(&mbstate, '\0', sizeof(mbstate)); |
+#endif /* TRE_MBSTATE */ |
+ |
+ if (!match_tags) |
+ num_tags = 0; |
+ else |
+ num_tags = tnfa->num_tags; |
+ |
+ /* Allocate memory for temporary data required for matching. This needs to |
+ be done for every matching operation to be thread safe. This allocates |
+ everything in a single large block from the stack frame using alloca() |
+ or with malloc() if alloca is unavailable. */ |
+ { |
+ int tbytes, rbytes, pbytes, xbytes, total_bytes; |
+ char *tmp_buf; |
+ /* Compute the length of the block we need. */ |
+ tbytes = sizeof(*tmp_tags) * num_tags; |
+ rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); |
+ pbytes = sizeof(*reach_pos) * tnfa->num_states; |
+ xbytes = sizeof(int) * num_tags; |
+ total_bytes = |
+ (sizeof(long) - 1) * 4 /* for alignment paddings */ |
+ + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; |
+ |
+ /* Allocate the memory. */ |
+ buf = xmalloc((unsigned)total_bytes); |
+ if (buf == NULL) |
+ return REG_ESPACE; |
+ memset(buf, 0, (size_t)total_bytes); |
+ |
+ /* Get the various pointers within tmp_buf (properly aligned). */ |
+ tmp_tags = (void *)buf; |
+ tmp_buf = buf + tbytes; |
+ tmp_buf += ALIGN(tmp_buf, long); |
+ reach_next = (void *)tmp_buf; |
+ tmp_buf += rbytes; |
+ tmp_buf += ALIGN(tmp_buf, long); |
+ reach = (void *)tmp_buf; |
+ tmp_buf += rbytes; |
+ tmp_buf += ALIGN(tmp_buf, long); |
+ reach_pos = (void *)tmp_buf; |
+ tmp_buf += pbytes; |
+ tmp_buf += ALIGN(tmp_buf, long); |
+ for (i = 0; i < tnfa->num_states; i++) |
+ { |
+ reach[i].tags = (void *)tmp_buf; |
+ tmp_buf += xbytes; |
+ reach_next[i].tags = (void *)tmp_buf; |
+ tmp_buf += xbytes; |
+ } |
+ } |
+ |
+ for (i = 0; i < tnfa->num_states; i++) |
+ reach_pos[i].pos = -1; |
+ |
+ GET_NEXT_WCHAR(); |
+ pos = 0; |
+ |
+ reach_next_i = reach_next; |
+ while (1) |
+ { |
+ /* If no match found yet, add the initial states to `reach_next'. */ |
+ if (match_eo < 0) |
+ { |
+ trans_i = tnfa->initial; |
+ while (trans_i->state != NULL) |
+ { |
+ if (reach_pos[trans_i->state_id].pos < pos) |
+ { |
+ if (trans_i->assertions |
+ && CHECK_ASSERTIONS(trans_i->assertions)) |
+ { |
+ trans_i++; |
+ continue; |
+ } |
+ |
+ reach_next_i->state = trans_i->state; |
+ for (i = 0; i < num_tags; i++) |
+ reach_next_i->tags[i] = -1; |
+ tag_i = trans_i->tags; |
+ if (tag_i) |
+ while (*tag_i >= 0) |
+ { |
+ if (*tag_i < num_tags) |
+ reach_next_i->tags[*tag_i] = pos; |
+ tag_i++; |
+ } |
+ if (reach_next_i->state == tnfa->final) |
+ { |
+ match_eo = pos; |
+ new_match = 1; |
+ for (i = 0; i < num_tags; i++) |
+ match_tags[i] = reach_next_i->tags[i]; |
+ } |
+ reach_pos[trans_i->state_id].pos = pos; |
+ reach_pos[trans_i->state_id].tags = &reach_next_i->tags; |
+ reach_next_i++; |
+ } |
+ trans_i++; |
+ } |
+ reach_next_i->state = NULL; |
+ } |
+ else |
+ { |
+ if (num_tags == 0 || reach_next_i == reach_next) |
+ /* We have found a match. */ |
+ break; |
+ } |
+ |
+ /* Check for end of string. */ |
+ if (!next_c) break; |
+ |
+ GET_NEXT_WCHAR(); |
+ |
+ /* Swap `reach' and `reach_next'. */ |
+ reach_i = reach; |
+ reach = reach_next; |
+ reach_next = reach_i; |
+ |
+ /* For each state in `reach', weed out states that don't fulfill the |
+ minimal matching conditions. */ |
+ if (tnfa->num_minimals && new_match) |
+ { |
+ new_match = 0; |
+ reach_next_i = reach_next; |
+ for (reach_i = reach; reach_i->state; reach_i++) |
+ { |
+ int skip = 0; |
+ for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) |
+ { |
+ int end = tnfa->minimal_tags[i]; |
+ int start = tnfa->minimal_tags[i + 1]; |
+ if (end >= num_tags) |
+ { |
+ skip = 1; |
+ break; |
+ } |
+ else if (reach_i->tags[start] == match_tags[start] |
+ && reach_i->tags[end] < match_tags[end]) |
+ { |
+ skip = 1; |
+ break; |
+ } |
+ } |
+ if (!skip) |
+ { |
+ reach_next_i->state = reach_i->state; |
+ tmp_iptr = reach_next_i->tags; |
+ reach_next_i->tags = reach_i->tags; |
+ reach_i->tags = tmp_iptr; |
+ reach_next_i++; |
+ } |
+ } |
+ reach_next_i->state = NULL; |
+ |
+ /* Swap `reach' and `reach_next'. */ |
+ reach_i = reach; |
+ reach = reach_next; |
+ reach_next = reach_i; |
+ } |
+ |
+ /* For each state in `reach' see if there is a transition leaving with |
+ the current input symbol to a state not yet in `reach_next', and |
+ add the destination states to `reach_next'. */ |
+ reach_next_i = reach_next; |
+ for (reach_i = reach; reach_i->state; reach_i++) |
+ { |
+ for (trans_i = reach_i->state; trans_i->state; trans_i++) |
+ { |
+ /* Does this transition match the input symbol? */ |
+ if (trans_i->code_min <= (tre_cint_t)prev_c && |
+ trans_i->code_max >= (tre_cint_t)prev_c) |
+ { |
+ if (trans_i->assertions |
+ && (CHECK_ASSERTIONS(trans_i->assertions) |
+ || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) |
+ { |
+ continue; |
+ } |
+ |
+ /* Compute the tags after this transition. */ |
+ for (i = 0; i < num_tags; i++) |
+ tmp_tags[i] = reach_i->tags[i]; |
+ tag_i = trans_i->tags; |
+ if (tag_i != NULL) |
+ while (*tag_i >= 0) |
+ { |
+ if (*tag_i < num_tags) |
+ tmp_tags[*tag_i] = pos; |
+ tag_i++; |
+ } |
+ |
+ if (reach_pos[trans_i->state_id].pos < pos) |
+ { |
+ /* Found an unvisited node. */ |
+ reach_next_i->state = trans_i->state; |
+ tmp_iptr = reach_next_i->tags; |
+ reach_next_i->tags = tmp_tags; |
+ tmp_tags = tmp_iptr; |
+ reach_pos[trans_i->state_id].pos = pos; |
+ reach_pos[trans_i->state_id].tags = &reach_next_i->tags; |
+ |
+ if (reach_next_i->state == tnfa->final |
+ && (match_eo == -1 |
+ || (num_tags > 0 |
+ && reach_next_i->tags[0] <= match_tags[0]))) |
+ { |
+ match_eo = pos; |
+ new_match = 1; |
+ for (i = 0; i < num_tags; i++) |
+ match_tags[i] = reach_next_i->tags[i]; |
+ } |
+ reach_next_i++; |
+ |
+ } |
+ else |
+ { |
+ assert(reach_pos[trans_i->state_id].pos == pos); |
+ /* Another path has also reached this state. We choose |
+ the winner by examining the tag values for both |
+ paths. */ |
+ if (tre_tag_order(num_tags, tnfa->tag_directions, |
+ tmp_tags, |
+ *reach_pos[trans_i->state_id].tags)) |
+ { |
+ /* The new path wins. */ |
+ tmp_iptr = *reach_pos[trans_i->state_id].tags; |
+ *reach_pos[trans_i->state_id].tags = tmp_tags; |
+ if (trans_i->state == tnfa->final) |
+ { |
+ match_eo = pos; |
+ new_match = 1; |
+ for (i = 0; i < num_tags; i++) |
+ match_tags[i] = tmp_tags[i]; |
+ } |
+ tmp_tags = tmp_iptr; |
+ } |
+ } |
+ } |
+ } |
+ } |
+ reach_next_i->state = NULL; |
+ } |
+ |
+ *match_end_ofs = match_eo; |
+ ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; |
+error_exit: |
+ xfree(buf); |
+ return ret; |
+} |
+ |
+ |
+ |
+/*********************************************************************** |
+ from tre-match-backtrack.c |
+***********************************************************************/ |
+ |
+/* |
+ This matcher is for regexps that use back referencing. Regexp matching |
+ with back referencing is an NP-complete problem on the number of back |
+ references. The easiest way to match them is to use a backtracking |
+ routine which basically goes through all possible paths in the TNFA |
+ and chooses the one which results in the best (leftmost and longest) |
+ match. This can be spectacularly expensive and may run out of stack |
+ space, but there really is no better known generic algorithm. Quoting |
+ Henry Spencer from comp.compilers: |
+ <URL: http://compilers.iecc.com/comparch/article/93-03-102> |
+ |
+ POSIX.2 REs require longest match, which is really exciting to |
+ implement since the obsolete ("basic") variant also includes |
+ \<digit>. I haven't found a better way of tackling this than doing |
+ a preliminary match using a DFA (or simulation) on a modified RE |
+ that just replicates subREs for \<digit>, and then doing a |
+ backtracking match to determine whether the subRE matches were |
+ right. This can be rather slow, but I console myself with the |
+ thought that people who use \<digit> deserve very slow execution. |
+ (Pun unintentional but very appropriate.) |
+ |
+*/ |
+ |
+typedef struct { |
+ int pos; |
+ const char *str_byte; |
+ tre_tnfa_transition_t *state; |
+ int state_id; |
+ int next_c; |
+ int *tags; |
+#ifdef TRE_MBSTATE |
+ mbstate_t mbstate; |
+#endif /* TRE_MBSTATE */ |
+} tre_backtrack_item_t; |
+ |
+typedef struct tre_backtrack_struct { |
+ tre_backtrack_item_t item; |
+ struct tre_backtrack_struct *prev; |
+ struct tre_backtrack_struct *next; |
+} *tre_backtrack_t; |
+ |
+#ifdef TRE_MBSTATE |
+#define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate) |
+#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate |
+#else /* !TRE_MBSTATE */ |
+#define BT_STACK_MBSTATE_IN |
+#define BT_STACK_MBSTATE_OUT |
+#endif /* !TRE_MBSTATE */ |
+ |
+#define tre_bt_mem_new tre_mem_new |
+#define tre_bt_mem_alloc tre_mem_alloc |
+#define tre_bt_mem_destroy tre_mem_destroy |
+ |
+ |
+#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ |
+ do \ |
+ { \ |
+ int i; \ |
+ if (!stack->next) \ |
+ { \ |
+ tre_backtrack_t s; \ |
+ s = tre_bt_mem_alloc(mem, sizeof(*s)); \ |
+ if (!s) \ |
+ { \ |
+ tre_bt_mem_destroy(mem); \ |
+ if (tags) \ |
+ xfree(tags); \ |
+ if (pmatch) \ |
+ xfree(pmatch); \ |
+ if (states_seen) \ |
+ xfree(states_seen); \ |
+ return REG_ESPACE; \ |
+ } \ |
+ s->prev = stack; \ |
+ s->next = NULL; \ |
+ s->item.tags = tre_bt_mem_alloc(mem, \ |
+ sizeof(*tags) * tnfa->num_tags); \ |
+ if (!s->item.tags) \ |
+ { \ |
+ tre_bt_mem_destroy(mem); \ |
+ if (tags) \ |
+ xfree(tags); \ |
+ if (pmatch) \ |
+ xfree(pmatch); \ |
+ if (states_seen) \ |
+ xfree(states_seen); \ |
+ return REG_ESPACE; \ |
+ } \ |
+ stack->next = s; \ |
+ stack = s; \ |
+ } \ |
+ else \ |
+ stack = stack->next; \ |
+ stack->item.pos = (_pos); \ |
+ stack->item.str_byte = (_str_byte); \ |
+ stack->item.state = (_state); \ |
+ stack->item.state_id = (_state_id); \ |
+ stack->item.next_c = (_next_c); \ |
+ for (i = 0; i < tnfa->num_tags; i++) \ |
+ stack->item.tags[i] = (_tags)[i]; \ |
+ BT_STACK_MBSTATE_IN; \ |
+ } \ |
+ while (0) |
+ |
+#define BT_STACK_POP() \ |
+ do \ |
+ { \ |
+ int i; \ |
+ assert(stack->prev); \ |
+ pos = stack->item.pos; \ |
+ str_byte = stack->item.str_byte; \ |
+ state = stack->item.state; \ |
+ next_c = stack->item.next_c; \ |
+ for (i = 0; i < tnfa->num_tags; i++) \ |
+ tags[i] = stack->item.tags[i]; \ |
+ BT_STACK_MBSTATE_OUT; \ |
+ stack = stack->prev; \ |
+ } \ |
+ while (0) |
+ |
+#undef MIN |
+#define MIN(a, b) ((a) <= (b) ? (a) : (b)) |
+ |
+static reg_errcode_t |
+tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, |
+ int *match_tags, int eflags, int *match_end_ofs) |
+{ |
+ /* State variables required by GET_NEXT_WCHAR. */ |
+ tre_char_t prev_c = 0, next_c = 0; |
+ const char *str_byte = string; |
+ int pos = 0; |
+ int pos_add_next = 1; |
+#ifdef TRE_MBSTATE |
+ mbstate_t mbstate; |
+#endif /* TRE_MBSTATE */ |
+ int reg_notbol = eflags & REG_NOTBOL; |
+ int reg_noteol = eflags & REG_NOTEOL; |
+ int reg_newline = tnfa->cflags & REG_NEWLINE; |
+ |
+ /* These are used to remember the necessary values of the above |
+ variables to return to the position where the current search |
+ started from. */ |
+ int next_c_start; |
+ const char *str_byte_start; |
+ int pos_start = -1; |
+#ifdef TRE_MBSTATE |
+ mbstate_t mbstate_start; |
+#endif /* TRE_MBSTATE */ |
+ |
+ /* End offset of best match so far, or -1 if no match found yet. */ |
+ int match_eo = -1; |
+ /* Tag arrays. */ |
+ int *next_tags, *tags = NULL; |
+ /* Current TNFA state. */ |
+ tre_tnfa_transition_t *state; |
+ int *states_seen = NULL; |
+ |
+ /* Memory allocator to for allocating the backtracking stack. */ |
+ tre_mem_t mem = tre_bt_mem_new(); |
+ |
+ /* The backtracking stack. */ |
+ tre_backtrack_t stack; |
+ |
+ tre_tnfa_transition_t *trans_i; |
+ regmatch_t *pmatch = NULL; |
+ int ret; |
+ |
+#ifdef TRE_MBSTATE |
+ memset(&mbstate, '\0', sizeof(mbstate)); |
+#endif /* TRE_MBSTATE */ |
+ |
+ if (!mem) |
+ return REG_ESPACE; |
+ stack = tre_bt_mem_alloc(mem, sizeof(*stack)); |
+ if (!stack) |
+ { |
+ ret = REG_ESPACE; |
+ goto error_exit; |
+ } |
+ stack->prev = NULL; |
+ stack->next = NULL; |
+ |
+ if (tnfa->num_tags) |
+ { |
+ tags = xmalloc(sizeof(*tags) * tnfa->num_tags); |
+ if (!tags) |
+ { |
+ ret = REG_ESPACE; |
+ goto error_exit; |
+ } |
+ } |
+ if (tnfa->num_submatches) |
+ { |
+ pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); |
+ if (!pmatch) |
+ { |
+ ret = REG_ESPACE; |
+ goto error_exit; |
+ } |
+ } |
+ if (tnfa->num_states) |
+ { |
+ states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); |
+ if (!states_seen) |
+ { |
+ ret = REG_ESPACE; |
+ goto error_exit; |
+ } |
+ } |
+ |
+ retry: |
+ { |
+ int i; |
+ for (i = 0; i < tnfa->num_tags; i++) |
+ { |
+ tags[i] = -1; |
+ if (match_tags) |
+ match_tags[i] = -1; |
+ } |
+ for (i = 0; i < tnfa->num_states; i++) |
+ states_seen[i] = 0; |
+ } |
+ |
+ state = NULL; |
+ pos = pos_start; |
+ GET_NEXT_WCHAR(); |
+ pos_start = pos; |
+ next_c_start = next_c; |
+ str_byte_start = str_byte; |
+#ifdef TRE_MBSTATE |
+ mbstate_start = mbstate; |
+#endif /* TRE_MBSTATE */ |
+ |
+ /* Handle initial states. */ |
+ next_tags = NULL; |
+ for (trans_i = tnfa->initial; trans_i->state; trans_i++) |
+ { |
+ if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) |
+ { |
+ continue; |
+ } |
+ if (state == NULL) |
+ { |
+ /* Start from this state. */ |
+ state = trans_i->state; |
+ next_tags = trans_i->tags; |
+ } |
+ else |
+ { |
+ /* Backtrack to this state. */ |
+ BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, |
+ trans_i->state_id, next_c, tags, mbstate); |
+ { |
+ int *tmp = trans_i->tags; |
+ if (tmp) |
+ while (*tmp >= 0) |
+ stack->item.tags[*tmp++] = pos; |
+ } |
+ } |
+ } |
+ |
+ if (next_tags) |
+ for (; *next_tags >= 0; next_tags++) |
+ tags[*next_tags] = pos; |
+ |
+ |
+ if (state == NULL) |
+ goto backtrack; |
+ |
+ while (1) |
+ { |
+ tre_tnfa_transition_t *next_state; |
+ int empty_br_match; |
+ |
+ if (state == tnfa->final) |
+ { |
+ if (match_eo < pos |
+ || (match_eo == pos |
+ && match_tags |
+ && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, |
+ tags, match_tags))) |
+ { |
+ int i; |
+ /* This match wins the previous match. */ |
+ match_eo = pos; |
+ if (match_tags) |
+ for (i = 0; i < tnfa->num_tags; i++) |
+ match_tags[i] = tags[i]; |
+ } |
+ /* Our TNFAs never have transitions leaving from the final state, |
+ so we jump right to backtracking. */ |
+ goto backtrack; |
+ } |
+ |
+ /* Go to the next character in the input string. */ |
+ empty_br_match = 0; |
+ trans_i = state; |
+ if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) |
+ { |
+ /* This is a back reference state. All transitions leaving from |
+ this state have the same back reference "assertion". Instead |
+ of reading the next character, we match the back reference. */ |
+ int so, eo, bt = trans_i->u.backref; |
+ int bt_len; |
+ int result; |
+ |
+ /* Get the substring we need to match against. Remember to |
+ turn off REG_NOSUB temporarily. */ |
+ tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, |
+ tnfa, tags, pos); |
+ so = pmatch[bt].rm_so; |
+ eo = pmatch[bt].rm_eo; |
+ bt_len = eo - so; |
+ |
+ result = strncmp((const char*)string + so, str_byte - 1, |
+ (size_t)bt_len); |
+ |
+ if (result == 0) |
+ { |
+ /* Back reference matched. Check for infinite loop. */ |
+ if (bt_len == 0) |
+ empty_br_match = 1; |
+ if (empty_br_match && states_seen[trans_i->state_id]) |
+ { |
+ goto backtrack; |
+ } |
+ |
+ states_seen[trans_i->state_id] = empty_br_match; |
+ |
+ /* Advance in input string and resync `prev_c', `next_c' |
+ and pos. */ |
+ str_byte += bt_len - 1; |
+ pos += bt_len - 1; |
+ GET_NEXT_WCHAR(); |
+ } |
+ else |
+ { |
+ goto backtrack; |
+ } |
+ } |
+ else |
+ { |
+ /* Check for end of string. */ |
+ if (next_c == L'\0') |
+ goto backtrack; |
+ |
+ /* Read the next character. */ |
+ GET_NEXT_WCHAR(); |
+ } |
+ |
+ next_state = NULL; |
+ for (trans_i = state; trans_i->state; trans_i++) |
+ { |
+ if (trans_i->code_min <= (tre_cint_t)prev_c |
+ && trans_i->code_max >= (tre_cint_t)prev_c) |
+ { |
+ if (trans_i->assertions |
+ && (CHECK_ASSERTIONS(trans_i->assertions) |
+ || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) |
+ { |
+ continue; |
+ } |
+ |
+ if (next_state == NULL) |
+ { |
+ /* First matching transition. */ |
+ next_state = trans_i->state; |
+ next_tags = trans_i->tags; |
+ } |
+ else |
+ { |
+ /* Second matching transition. We may need to backtrack here |
+ to take this transition instead of the first one, so we |
+ push this transition in the backtracking stack so we can |
+ jump back here if needed. */ |
+ BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, |
+ trans_i->state_id, next_c, tags, mbstate); |
+ { |
+ int *tmp; |
+ for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) |
+ stack->item.tags[*tmp] = pos; |
+ } |
+#if 0 /* XXX - it's important not to look at all transitions here to keep |
+ the stack small! */ |
+ break; |
+#endif |
+ } |
+ } |
+ } |
+ |
+ if (next_state != NULL) |
+ { |
+ /* Matching transitions were found. Take the first one. */ |
+ state = next_state; |
+ |
+ /* Update the tag values. */ |
+ if (next_tags) |
+ while (*next_tags >= 0) |
+ tags[*next_tags++] = pos; |
+ } |
+ else |
+ { |
+ backtrack: |
+ /* A matching transition was not found. Try to backtrack. */ |
+ if (stack->prev) |
+ { |
+ if (stack->item.state->assertions & ASSERT_BACKREF) |
+ { |
+ states_seen[stack->item.state_id] = 0; |
+ } |
+ |
+ BT_STACK_POP(); |
+ } |
+ else if (match_eo < 0) |
+ { |
+ /* Try starting from a later position in the input string. */ |
+ /* Check for end of string. */ |
+ if (next_c == L'\0') |
+ { |
+ break; |
+ } |
+ next_c = next_c_start; |
+#ifdef TRE_MBSTATE |
+ mbstate = mbstate_start; |
+#endif /* TRE_MBSTATE */ |
+ str_byte = str_byte_start; |
+ goto retry; |
+ } |
+ else |
+ { |
+ break; |
+ } |
+ } |
+ } |
+ |
+ ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; |
+ *match_end_ofs = match_eo; |
+ |
+ error_exit: |
+ tre_bt_mem_destroy(mem); |
+#ifndef TRE_USE_ALLOCA |
+ if (tags) |
+ xfree(tags); |
+ if (pmatch) |
+ xfree(pmatch); |
+ if (states_seen) |
+ xfree(states_seen); |
+#endif /* !TRE_USE_ALLOCA */ |
+ |
+ return ret; |
+} |
+ |
+/*********************************************************************** |
+ from regexec.c |
+***********************************************************************/ |
+ |
+/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match |
+ endpoint values. */ |
+static void |
+tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, |
+ const tre_tnfa_t *tnfa, int *tags, int match_eo) |
+{ |
+ tre_submatch_data_t *submatch_data; |
+ unsigned int i, j; |
+ int *parents; |
+ |
+ i = 0; |
+ if (match_eo >= 0 && !(cflags & REG_NOSUB)) |
+ { |
+ /* Construct submatch offsets from the tags. */ |
+ submatch_data = tnfa->submatch_data; |
+ while (i < tnfa->num_submatches && i < nmatch) |
+ { |
+ if (submatch_data[i].so_tag == tnfa->end_tag) |
+ pmatch[i].rm_so = match_eo; |
+ else |
+ pmatch[i].rm_so = tags[submatch_data[i].so_tag]; |
+ |
+ if (submatch_data[i].eo_tag == tnfa->end_tag) |
+ pmatch[i].rm_eo = match_eo; |
+ else |
+ pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; |
+ |
+ /* If either of the endpoints were not used, this submatch |
+ was not part of the match. */ |
+ if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) |
+ pmatch[i].rm_so = pmatch[i].rm_eo = -1; |
+ |
+ i++; |
+ } |
+ /* Reset all submatches that are not within all of their parent |
+ submatches. */ |
+ i = 0; |
+ while (i < tnfa->num_submatches && i < nmatch) |
+ { |
+ if (pmatch[i].rm_eo == -1) |
+ assert(pmatch[i].rm_so == -1); |
+ assert(pmatch[i].rm_so <= pmatch[i].rm_eo); |
+ |
+ parents = submatch_data[i].parents; |
+ if (parents != NULL) |
+ for (j = 0; parents[j] >= 0; j++) |
+ { |
+ if (pmatch[i].rm_so < pmatch[parents[j]].rm_so |
+ || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) |
+ pmatch[i].rm_so = pmatch[i].rm_eo = -1; |
+ } |
+ i++; |
+ } |
+ } |
+ |
+ while (i < nmatch) |
+ { |
+ pmatch[i].rm_so = -1; |
+ pmatch[i].rm_eo = -1; |
+ i++; |
+ } |
+} |
+ |
+ |
+/* |
+ Wrapper functions for POSIX compatible regexp matching. |
+*/ |
+ |
+int |
+regexec(const regex_t *restrict preg, const char *restrict string, |
+ size_t nmatch, regmatch_t pmatch[restrict], int eflags) |
+{ |
+ tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; |
+ reg_errcode_t status; |
+ int *tags = NULL, eo; |
+ if (tnfa->cflags & REG_NOSUB) nmatch = 0; |
+ if (tnfa->num_tags > 0 && nmatch > 0) |
+ { |
+ tags = xmalloc(sizeof(*tags) * tnfa->num_tags); |
+ if (tags == NULL) |
+ return REG_ESPACE; |
+ } |
+ |
+ /* Dispatch to the appropriate matcher. */ |
+ if (tnfa->have_backrefs) |
+ { |
+ /* The regex has back references, use the backtracking matcher. */ |
+ status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo); |
+ } |
+ else |
+ { |
+ /* Exact matching, no back references, use the parallel matcher. */ |
+ status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo); |
+ } |
+ |
+ if (status == REG_OK) |
+ /* A match was found, so fill the submatch registers. */ |
+ tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); |
+ if (tags) |
+ xfree(tags); |
+ return status; |
+} |