fusl/src/regex/regcomp.c - Issue 1573973002: Add a "fork" of musl as //fusl.

Unified Diff: fusl/src/regex/regcomp.c

Issue 1573973002: Add a "fork" of musl as //fusl. (Closed) Base URL: https://github.com/domokit/mojo.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: fusl/src/regex/regcomp.c

diff --git a/fusl/src/regex/regcomp.c b/fusl/src/regex/regcomp.c

new file mode 100644

index 0000000000000000000000000000000000000000..330de46759730b55b90d9080fb7798d37ae5db86

--- /dev/null

+++ b/fusl/src/regex/regcomp.c

@@ -0,0 +1,2910 @@

+/*

+ regcomp.c - TRE POSIX compatible regex compilation functions.

+ Redistribution and use in source and binary forms, with or without

+ modification, are permitted provided that the following conditions

+ are met:

+ 1. Redistributions of source code must retain the above copyright

+ notice, this list of conditions and the following disclaimer.

+ 2. Redistributions in binary form must reproduce the above copyright

+ notice, this list of conditions and the following disclaimer in the

+ documentation and/or other materials provided with the distribution.

+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS

+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#include <string.h>

+#include <stdlib.h>

+#include <regex.h>

+#include <limits.h>

+#include <stdint.h>

+#include <ctype.h>

+#include "tre.h"

+#include <assert.h>

+/***********************************************************************

+ from tre-compile.h

+***********************************************************************/

+typedef struct {

+ int position;

+ int code_min;

+ int code_max;

+ int *tags;

+ int assertions;

+ tre_ctype_t class;

+ tre_ctype_t *neg_classes;

+ int backref;

+} tre_pos_and_tags_t;

+/***********************************************************************

+ from tre-ast.c and tre-ast.h

+***********************************************************************/

+/* The different AST node types. */

+typedef enum {

+ LITERAL,

+ CATENATION,

+ ITERATION,

+ UNION

+} tre_ast_type_t;

+/* Special subtypes of TRE_LITERAL. */

+#define EMPTY -1 /* Empty leaf (denotes empty string). */

+#define ASSERTION -2 /* Assertion leaf. */

+#define TAG -3 /* Tag leaf. */

+#define BACKREF -4 /* Back reference leaf. */

+#define IS_SPECIAL(x) ((x)->code_min < 0)

+#define IS_EMPTY(x) ((x)->code_min == EMPTY)

+#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)

+#define IS_TAG(x) ((x)->code_min == TAG)

+#define IS_BACKREF(x) ((x)->code_min == BACKREF)

+/* A generic AST node. All AST nodes consist of this node on the top

+ level with `obj' pointing to the actual content. */

+typedef struct {

+ tre_ast_type_t type; /* Type of the node. */

+ void *obj; /* Pointer to actual node. */

+ int nullable;

+ int submatch_id;

+ int num_submatches;

+ int num_tags;

+ tre_pos_and_tags_t *firstpos;

+ tre_pos_and_tags_t *lastpos;

+} tre_ast_node_t;

+/* A "literal" node. These are created for assertions, back references,

+ tags, matching parameter settings, and all expressions that match one

+ character. */

+typedef struct {

+ long code_min;

+ long code_max;

+ int position;

+ tre_ctype_t class;

+ tre_ctype_t *neg_classes;

+} tre_literal_t;

+/* A "catenation" node. These are created when two regexps are concatenated.

+ If there are more than one subexpressions in sequence, the `left' part

+ holds all but the last, and `right' part holds the last subexpression

+ (catenation is left associative). */

+typedef struct {

+ tre_ast_node_t *left;

+ tre_ast_node_t *right;

+} tre_catenation_t;

+/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"

+ operators. */

+typedef struct {

+ /* Subexpression to match. */

+ tre_ast_node_t *arg;

+ /* Minimum number of consecutive matches. */

+ int min;

+ /* Maximum number of consecutive matches. */

+ int max;

+ /* If 0, match as many characters as possible, if 1 match as few as

+ possible. Note that this does not always mean the same thing as

+ matching as many/few repetitions as possible. */

+ unsigned int minimal:1;

+} tre_iteration_t;

+/* An "union" node. These are created for the "|" operator. */

+typedef struct {

+ tre_ast_node_t *left;

+ tre_ast_node_t *right;

+} tre_union_t;

+static tre_ast_node_t *

+tre_ast_new_node(tre_mem_t mem, int type, void *obj)

+ tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);

+ if (!node || !obj)

+ return 0;

+ node->obj = obj;

+ node->type = type;

+ node->nullable = -1;

+ node->submatch_id = -1;

+ return node;

+static tre_ast_node_t *

+tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)

+ tre_ast_node_t *node;

+ tre_literal_t *lit;

+ lit = tre_mem_calloc(mem, sizeof *lit);

+ node = tre_ast_new_node(mem, LITERAL, lit);

+ if (!node)

+ return 0;

+ lit->code_min = code_min;

+ lit->code_max = code_max;

+ lit->position = position;

+ return node;

+static tre_ast_node_t *

+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)

+ tre_ast_node_t *node;

+ tre_iteration_t *iter;

+ iter = tre_mem_calloc(mem, sizeof *iter);

+ node = tre_ast_new_node(mem, ITERATION, iter);

+ if (!node)

+ return 0;

+ iter->arg = arg;

+ iter->min = min;

+ iter->max = max;

+ iter->minimal = minimal;

+ node->num_submatches = arg->num_submatches;

+ return node;

+static tre_ast_node_t *

+tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)

+ tre_ast_node_t *node;

+ tre_union_t *un;

+ if (!left)

+ return right;

+ un = tre_mem_calloc(mem, sizeof *un);

+ node = tre_ast_new_node(mem, UNION, un);

+ if (!node || !right)

+ return 0;

+ un->left = left;

+ un->right = right;

+ node->num_submatches = left->num_submatches + right->num_submatches;

+ return node;

+static tre_ast_node_t *

+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)

+ tre_ast_node_t *node;

+ tre_catenation_t *cat;

+ if (!left)

+ return right;

+ cat = tre_mem_calloc(mem, sizeof *cat);

+ node = tre_ast_new_node(mem, CATENATION, cat);

+ if (!node)

+ return 0;

+ cat->left = left;

+ cat->right = right;

+ node->num_submatches = left->num_submatches + right->num_submatches;

+ return node;

+/***********************************************************************

+ from tre-stack.c and tre-stack.h

+***********************************************************************/

+typedef struct tre_stack_rec tre_stack_t;

+/* Creates a new stack object. `size' is initial size in bytes, `max_size'

+ is maximum size, and `increment' specifies how much more space will be

+ allocated with realloc() if all space gets used up. Returns the stack

+ object or NULL if out of memory. */

+static tre_stack_t *

+tre_stack_new(int size, int max_size, int increment);

+/* Frees the stack object. */

+static void

+tre_stack_destroy(tre_stack_t *s);

+/* Returns the current number of objects in the stack. */

+static int

+tre_stack_num_objects(tre_stack_t *s);

+/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes

+ `value' on top of stack `s'. Returns REG_ESPACE if out of memory.

+ This tries to realloc() more space before failing if maximum size

+ has not yet been reached. Returns REG_OK if successful. */

+#define declare_pushf(typetag, type) \

+ static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)

+declare_pushf(voidptr, void *);

+declare_pushf(int, int);

+/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost

+ element off of stack `s' and returns it. The stack must not be

+ empty. */

+#define declare_popf(typetag, type) \

+ static type tre_stack_pop_ ## typetag(tre_stack_t *s)

+declare_popf(voidptr, void *);

+declare_popf(int, int);

+/* Just to save some typing. */

+#define STACK_PUSH(s, typetag, value) \

+ do \

+ { \

+ status = tre_stack_push_ ## typetag(s, value); \

+ } \

+ while (/*CONSTCOND*/0)

+#define STACK_PUSHX(s, typetag, value) \

+ { \

+ status = tre_stack_push_ ## typetag(s, value); \

+ if (status != REG_OK) \

+ break; \

+ }

+#define STACK_PUSHR(s, typetag, value) \

+ { \

+ reg_errcode_t _status; \

+ _status = tre_stack_push_ ## typetag(s, value); \

+ if (_status != REG_OK) \

+ return _status; \

+ }

+union tre_stack_item {

+ void *voidptr_value;

+ int int_value;

+};

+struct tre_stack_rec {

+ int size;

+ int max_size;

+ int increment;

+ int ptr;

+ union tre_stack_item *stack;

+};

+static tre_stack_t *

+tre_stack_new(int size, int max_size, int increment)

+ tre_stack_t *s;

+ s = xmalloc(sizeof(*s));

+ if (s != NULL)

+ {

+ s->stack = xmalloc(sizeof(*s->stack) * size);

+ if (s->stack == NULL)

+ {

+ xfree(s);

+ return NULL;

+ }

+ s->size = size;

+ s->max_size = max_size;

+ s->increment = increment;

+ s->ptr = 0;

+ }

+ return s;

+static void

+tre_stack_destroy(tre_stack_t *s)

+ xfree(s->stack);

+ xfree(s);

+static int

+tre_stack_num_objects(tre_stack_t *s)

+ return s->ptr;

+static reg_errcode_t

+tre_stack_push(tre_stack_t *s, union tre_stack_item value)

+ if (s->ptr < s->size)

+ {

+ s->stack[s->ptr] = value;

+ s->ptr++;

+ }

+ else

+ {

+ if (s->size >= s->max_size)

+ {

+ return REG_ESPACE;

+ }

+ else

+ {

+ union tre_stack_item *new_buffer;

+ int new_size;

+ new_size = s->size + s->increment;

+ if (new_size > s->max_size)

+ new_size = s->max_size;

+ new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);

+ if (new_buffer == NULL)

+ {

+ return REG_ESPACE;

+ }

+ assert(new_size > s->size);

+ s->size = new_size;

+ s->stack = new_buffer;

+ tre_stack_push(s, value);

+ }

+ return REG_OK;

+#define define_pushf(typetag, type) \

+ declare_pushf(typetag, type) { \

+ union tre_stack_item item; \

+ item.typetag ## _value = value; \

+ return tre_stack_push(s, item); \

+define_pushf(int, int)

+define_pushf(voidptr, void *)

+#define define_popf(typetag, type) \

+ declare_popf(typetag, type) { \

+ return s->stack[--s->ptr].typetag ## _value; \

+ }

+define_popf(int, int)

+define_popf(voidptr, void *)

+/***********************************************************************

+ from tre-parse.c and tre-parse.h

+***********************************************************************/

+/* Parse context. */

+typedef struct {

+ /* Memory allocator. The AST is allocated using this. */

+ tre_mem_t mem;

+ /* Stack used for keeping track of regexp syntax. */

+ tre_stack_t *stack;

+ /* The parsed node after a parse function returns. */

+ tre_ast_node_t *n;

+ /* Position in the regexp pattern after a parse function returns. */

+ const char *s;

+ /* The first character of the regexp. */

+ const char *re;

+ /* Current submatch ID. */

+ int submatch_id;

+ /* Current position (number of literal). */

+ int position;

+ /* The highest back reference or -1 if none seen so far. */

+ int max_backref;

+ /* Compilation flags. */

+ int cflags;

+} tre_parse_ctx_t;

+/* Some macros for expanding \w, \s, etc. */

+static const struct {

+ char c;

+ const char *expansion;

+} tre_macros[] = {

+ {'t', "\t"}, {'n', "\n"}, {'r', "\r"},

+ {'f', "\f"}, {'a', "\a"}, {'e', "\033"},

+ {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},

+ {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},

+ { 0, 0 }

+};

+/* Expands a macro delimited by `regex' and `regex_end' to `buf', which

+ must have at least `len' items. Sets buf[0] to zero if the there

+ is no match in `tre_macros'. */

+static const char *tre_expand_macro(const char *s)

+ int i;

+ for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);

+ return tre_macros[i].expansion;

+static int

+tre_compare_lit(const void *a, const void *b)

+ const tre_literal_t *const *la = a;

+ const tre_literal_t *const *lb = b;

+ /* assumes the range of valid code_min is < INT_MAX */

+ return la[0]->code_min - lb[0]->code_min;

+struct literals {

+ tre_mem_t mem;

+ tre_literal_t **a;

+ int len;

+ int cap;

+};

+static tre_literal_t *tre_new_lit(struct literals *p)

+ tre_literal_t **a;

+ if (p->len >= p->cap) {

+ if (p->cap >= 1<<15)

+ return 0;

+ p->cap *= 2;

+ a = xrealloc(p->a, p->cap * sizeof *p->a);

+ if (!a)

+ return 0;

+ p->a = a;

+ }

+ a = p->a + p->len++;

+ *a = tre_mem_calloc(p->mem, sizeof **a);

+ return *a;

+static int add_icase_literals(struct literals *ls, int min, int max)

+ tre_literal_t *lit;

+ int b, e, c;

+ for (c=min; c<=max; ) {

+ /* assumes islower(c) and isupper(c) are exclusive

+ and toupper(c)!=c if islower(c).

+ multiple opposite case characters are not supported */

+ if (tre_islower(c)) {

+ b = e = tre_toupper(c);

+ for (c++, e++; c<=max; c++, e++)

+ if (tre_toupper(c) != e) break;

+ } else if (tre_isupper(c)) {

+ b = e = tre_tolower(c);

+ for (c++, e++; c<=max; c++, e++)

+ if (tre_tolower(c) != e) break;

+ } else {

+ c++;

+ continue;

+ }

+ lit = tre_new_lit(ls);

+ if (!lit)

+ return -1;

+ lit->code_min = b;

+ lit->code_max = e-1;

+ lit->position = -1;

+ }

+ return 0;

+/* Maximum number of character classes in a negated bracket expression. */

+#define MAX_NEG_CLASSES 64

+struct neg {

+ int negate;

+ int len;

+ tre_ctype_t a[MAX_NEG_CLASSES];

+};

+// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges

+/*

+bracket grammar:

+Bracket = '[' List ']' | '[^' List ']'

+List = Term | List Term

+Term = Char | Range | Chclass | Eqclass

+Range = Char '-' Char | Char '-' '-'

+Char = Coll | coll_single

+Meta = ']' | '-'

+Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]'

+Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]'

+Chclass = '[:' class ':]'

+coll_single is a single char collating element but it can be

+ '-' only at the beginning or end of a List and

+ ']' only at the beginning of a List and

+ '^' anywhere except after the openning '['

+*/

+static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)

+ const char *start = s;

+ tre_ctype_t class;

+ int min, max;

+ wchar_t wc;

+ int len;

+ for (;;) {

+ class = 0;

+ len = mbtowc(&wc, s, -1);

+ if (len <= 0)

+ return *s ? REG_BADPAT : REG_EBRACK;

+ if (*s == ']' && s != start) {

+ ctx->s = s+1;

+ return REG_OK;

+ }

+ if (*s == '-' && s != start && s[1] != ']' &&

+ /* extension: [a-z--@] is accepted as [a-z]|[--@] */

+ (s[1] != '-' || s[2] == ']'))

+ return REG_ERANGE;

+ if (*s == '[' && (s[1] == '.' || s[1] == '='))

+ /* collating symbols and equivalence classes are not supported */

+ return REG_ECOLLATE;

+ if (*s == '[' && s[1] == ':') {

+ char tmp[CHARCLASS_NAME_MAX+1];

+ s += 2;

+ for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {

+ if (s[len] == ':') {

+ memcpy(tmp, s, len);

+ tmp[len] = 0;

+ class = tre_ctype(tmp);

+ break;

+ }

+ if (!class || s[len+1] != ']')

+ return REG_ECTYPE;

+ min = 0;

+ max = TRE_CHAR_MAX;

+ s += len+2;

+ } else {

+ min = max = wc;

+ s += len;

+ if (*s == '-' && s[1] != ']') {

+ s++;

+ len = mbtowc(&wc, s, -1);

+ max = wc;

+ /* XXX - Should use collation order instead of

+ encoding values in character ranges. */

+ if (len <= 0 || min > max)

+ return REG_ERANGE;

+ s += len;

+ }

+ if (class && neg->negate) {

+ if (neg->len >= MAX_NEG_CLASSES)

+ return REG_ESPACE;

+ neg->a[neg->len++] = class;

+ } else {

+ tre_literal_t *lit = tre_new_lit(ls);

+ if (!lit)

+ return REG_ESPACE;

+ lit->code_min = min;

+ lit->code_max = max;

+ lit->class = class;

+ lit->position = -1;

+ /* Add opposite-case codepoints if REG_ICASE is present.

+ It seems that POSIX requires that bracket negation

+ should happen before case-folding, but most practical

+ implementations do it the other way around. Changing

+ the order would need efficient representation of

+ case-fold ranges and bracket range sets even with

+ simple patterns so this is ok for now. */

+ if (ctx->cflags & REG_ICASE && !class)

+ if (add_icase_literals(ls, min, max))

+ return REG_ESPACE;

+ }

+static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)

+ int i, max, min, negmax, negmin;

+ tre_ast_node_t *node = 0, *n;

+ tre_ctype_t *nc = 0;

+ tre_literal_t *lit;

+ struct literals ls;

+ struct neg neg;

+ reg_errcode_t err;

+ ls.mem = ctx->mem;

+ ls.len = 0;

+ ls.cap = 32;

+ ls.a = xmalloc(ls.cap * sizeof *ls.a);

+ if (!ls.a)

+ return REG_ESPACE;

+ neg.len = 0;

+ neg.negate = *s == '^';

+ if (neg.negate)

+ s++;

+ err = parse_bracket_terms(ctx, s, &ls, &neg);

+ if (err != REG_OK)

+ goto parse_bracket_done;

+ if (neg.negate) {

+ /* Sort the array if we need to negate it. */

+ qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);

+ /* extra lit for the last negated range */

+ lit = tre_new_lit(&ls);

+ if (!lit) {

+ err = REG_ESPACE;

+ goto parse_bracket_done;

+ }

+ lit->code_min = TRE_CHAR_MAX+1;

+ lit->code_max = TRE_CHAR_MAX+1;

+ lit->position = -1;

+ /* negated classes */

+ if (neg.len) {

+ nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);

+ if (!nc) {

+ err = REG_ESPACE;

+ goto parse_bracket_done;

+ }

+ memcpy(nc, neg.a, neg.len*sizeof *neg.a);

+ nc[neg.len] = 0;

+ }

+ /* Build a union of the items in the array, negated if necessary. */

+ negmax = negmin = 0;

+ for (i = 0; i < ls.len; i++) {

+ lit = ls.a[i];

+ min = lit->code_min;

+ max = lit->code_max;

+ if (neg.negate) {

+ if (min <= negmin) {

+ /* Overlap. */

+ negmin = MAX(max + 1, negmin);

+ continue;

+ }

+ negmax = min - 1;

+ lit->code_min = negmin;

+ lit->code_max = negmax;

+ negmin = max + 1;

+ }

+ lit->position = ctx->position;

+ lit->neg_classes = nc;

+ n = tre_ast_new_node(ctx->mem, LITERAL, lit);

+ node = tre_ast_new_union(ctx->mem, node, n);

+ if (!node) {

+ err = REG_ESPACE;

+ break;

+ }

+parse_bracket_done:

+ xfree(ls.a);

+ ctx->position++;

+ ctx->n = node;

+ return err;

+static const char *parse_dup_count(const char *s, int *n)

+ *n = -1;

+ if (!isdigit(*s))

+ return s;

+ *n = 0;

+ for (;;) {

+ *n = 10 * *n + (*s - '0');

+ s++;

+ if (!isdigit(*s) || *n > RE_DUP_MAX)

+ break;

+ }

+ return s;

+static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)

+ int min, max;

+ s = parse_dup_count(s, &min);

+ if (*s == ',')

+ s = parse_dup_count(s+1, &max);

+ else

+ max = min;

+ if (

+ (max < min && max >= 0) ||

+ max > RE_DUP_MAX ||

+ min > RE_DUP_MAX ||

+ min < 0 ||

+ (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||

+ *s++ != '}'

+ )

+ return REG_BADBR;

+ if (min == 0 && max == 0)

+ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);

+ else

+ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);

+ if (!ctx->n)

+ return REG_ESPACE;

+ ctx->s = s;

+ return REG_OK;

+static int hexval(unsigned c)

+ if (c-'0'<10) return c-'0';

+ c |= 32;

+ if (c-'a'<6) return c-'a'+10;

+ return -1;

+static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)

+ if (node->submatch_id >= 0) {

+ tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);

+ if (!n)

+ return REG_ESPACE;

+ n = tre_ast_new_catenation(ctx->mem, n, node);

+ if (!n)

+ return REG_ESPACE;

+ n->num_submatches = node->num_submatches;

+ node = n;

+ }

+ node->submatch_id = subid;

+ node->num_submatches++;

+ ctx->n = node;

+ return REG_OK;

+/*

+BRE grammar:

+Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$'

+Branch = Atom | Branch Atom

+Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}'

+(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)

+ERE grammar:

+Regex = Branch | Regex '|' Branch

+Branch = Atom | Branch Atom

+Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}'

+(a*+?, ^*, $+, \X, {, (|a) are unspecified)

+*/

+static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)

+ int len, ere = ctx->cflags & REG_EXTENDED;

+ const char *p;

+ tre_ast_node_t *node;

+ wchar_t wc;

+ switch (*s) {

+ case '[':

+ return parse_bracket(ctx, s+1);

+ case '\\':

+ p = tre_expand_macro(s+1);

+ if (p) {

+ /* assume \X expansion is a single atom */

+ reg_errcode_t err = parse_atom(ctx, p);

+ ctx->s = s+2;

+ return err;

+ }

+ /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */

+ switch (*++s) {

+ case 0:

+ return REG_EESCAPE;

+ case 'b':

+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);

+ break;

+ case 'B':

+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);

+ break;

+ case '<':

+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);

+ break;

+ case '>':

+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);

+ break;

+ case 'x':

+ s++;

+ int i, v = 0, c;

+ len = 2;

+ if (*s == '{') {

+ len = 8;

+ s++;

+ }

+ for (i=0; i<len && v<0x110000; i++) {

+ c = hexval(s[i]);

+ if (c < 0) break;

+ v = 16*v + c;

+ }

+ s += i;

+ if (len == 8) {

+ if (*s != '}')

+ return REG_EBRACE;

+ s++;

+ }

+ node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);

+ ctx->position++;

+ s--;

+ break;

+ default:

+ if (!ere && (unsigned)*s-'1' < 9) {

+ /* back reference */

+ int val = *s - '0';

+ node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);

+ ctx->max_backref = MAX(val, ctx->max_backref);

+ } else {

+ /* extension: accept unknown escaped char

+ as a literal */

+ goto parse_literal;

+ }

+ ctx->position++;

+ }

+ s++;

+ break;

+ case '.':

+ if (ctx->cflags & REG_NEWLINE) {

+ tre_ast_node_t *tmp1, *tmp2;

+ tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);

+ tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);

+ if (tmp1 && tmp2)

+ node = tre_ast_new_union(ctx->mem, tmp1, tmp2);

+ else

+ node = 0;

+ } else {

+ node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);

+ }

+ s++;

+ break;

+ case '^':

+ /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */

+ if (!ere && s != ctx->re)

+ goto parse_literal;

+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);

+ s++;

+ break;

+ case '$':

+ /* '$' is special everywhere in EREs, and in the end of the string in BREs. */

+ if (!ere && s[1])

+ goto parse_literal;

+ node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);

+ s++;

+ break;

+ case '*':

+ case '|':

+ case '{':

+ case '+':

+ case '?':

+ if (!ere)

+ goto parse_literal;

+ case 0:

+ node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);

+ break;

+ default:

+parse_literal:

+ len = mbtowc(&wc, s, -1);

+ if (len < 0)

+ return REG_BADPAT;

+ if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {

+ tre_ast_node_t *tmp1, *tmp2;

+ /* multiple opposite case characters are not supported */

+ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);

+ tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);

+ if (tmp1 && tmp2)

+ node = tre_ast_new_union(ctx->mem, tmp1, tmp2);

+ else

+ node = 0;

+ } else {

+ node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);

+ }

+ ctx->position++;

+ s += len;

+ break;

+ }

+ if (!node)

+ return REG_ESPACE;

+ ctx->n = node;

+ ctx->s = s;

+ return REG_OK;

+#define PUSHPTR(err, s, v) do { \

+ if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \

+ return err; \

+} while(0)

+#define PUSHINT(err, s, v) do { \

+ if ((err = tre_stack_push_int(s, v)) != REG_OK) \

+ return err; \

+} while(0)

+static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)

+ tre_ast_node_t *nbranch=0, *nunion=0;

+ int ere = ctx->cflags & REG_EXTENDED;

+ const char *s = ctx->re;

+ int subid = 0;

+ int depth = 0;

+ reg_errcode_t err;

+ tre_stack_t *stack = ctx->stack;

+ PUSHINT(err, stack, subid++);

+ for (;;) {

+ if ((!ere && *s == '\\' && s[1] == '(') ||

+ (ere && *s == '(')) {

+ PUSHPTR(err, stack, nunion);

+ PUSHPTR(err, stack, nbranch);

+ PUSHINT(err, stack, subid++);

+ s++;

+ if (!ere)

+ s++;

+ depth++;

+ nbranch = nunion = 0;

+ continue;

+ }

+ if ((!ere && *s == '\\' && s[1] == ')') ||

+ (ere && *s == ')' && depth)) {

+ ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);

+ if (!ctx->n)

+ return REG_ESPACE;

+ } else {

+ err = parse_atom(ctx, s);

+ if (err != REG_OK)

+ return err;

+ s = ctx->s;

+ }

+ parse_iter:

+ /* extension: repetitions are accepted after an empty node

+ eg. (+), ^*, a$?, a|{2} */

+ switch (*s) {

+ case '+':

+ case '?':

+ if (!ere)

+ break;

+ /* fallthrough */

+ case '*':;

+ int min=0, max=-1;

+ if (*s == '+')

+ min = 1;

+ if (*s == '?')

+ max = 1;

+ s++;

+ ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);

+ if (!ctx->n)

+ return REG_ESPACE;

+ /* extension: multiple consecutive *+?{,} is unspecified,

+ but (a+)+ has to be supported so accepting a++ makes

+ sense, note however that the RE_DUP_MAX limit can be

+ circumvented: (a{255}){255} uses a lot of memory.. */

+ goto parse_iter;

+ case '\\':

+ if (ere || s[1] != '{')

+ break;

+ s++;

+ goto parse_brace;

+ case '{':

+ if (!ere)

+ break;

+ parse_brace:

+ err = parse_dup(ctx, s+1);

+ if (err != REG_OK)

+ return err;

+ s = ctx->s;

+ goto parse_iter;

+ }

+ nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);

+ if ((ere && *s == '|') ||

+ (ere && *s == ')' && depth) ||

+ (!ere && *s == '\\' && s[1] == ')') ||

+ !*s) {

+ /* extension: empty branch is unspecified (), (|a), (a|)

+ here they are not rejected but match on empty string */

+ int c = *s;

+ nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);

+ nbranch = 0;

+ if (c != '|') {

+ if (c == '\\') {

+ if (!depth) return REG_EPAREN;

+ s+=2;

+ } else if (c == ')')

+ s++;

+ depth--;

+ err = marksub(ctx, nunion, tre_stack_pop_int(stack));

+ if (err != REG_OK)

+ return err;

+ if (!c && depth<0) {

+ ctx->submatch_id = subid;

+ return REG_OK;

+ }

+ if (!c || depth<0)

+ return REG_EPAREN;

+ nbranch = tre_stack_pop_voidptr(stack);

+ nunion = tre_stack_pop_voidptr(stack);

+ goto parse_iter;

+ }

+ s++;

+ }

+/***********************************************************************

+ from tre-compile.c

+***********************************************************************/

+/*

+ TODO:

+ - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive

+ function calls.

+*/

+/*

+ Algorithms to setup tags so that submatch addressing can be done.

+*/

+/* Inserts a catenation node to the root of the tree given in `node'.

+ As the left child a new tag with number `tag_id' to `node' is added,

+ and the right child is the old root. */

+static reg_errcode_t

+tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)

+ tre_catenation_t *c;

+ c = tre_mem_alloc(mem, sizeof(*c));

+ if (c == NULL)

+ return REG_ESPACE;

+ c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);

+ if (c->left == NULL)

+ return REG_ESPACE;

+ c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));

+ if (c->right == NULL)

+ return REG_ESPACE;

+ c->right->obj = node->obj;

+ c->right->type = node->type;

+ c->right->nullable = -1;

+ c->right->submatch_id = -1;

+ c->right->firstpos = NULL;

+ c->right->lastpos = NULL;

+ c->right->num_tags = 0;

+ node->obj = c;

+ node->type = CATENATION;

+ return REG_OK;

+/* Inserts a catenation node to the root of the tree given in `node'.

+ As the right child a new tag with number `tag_id' to `node' is added,

+ and the left child is the old root. */

+static reg_errcode_t

+tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)

+ tre_catenation_t *c;

+ c = tre_mem_alloc(mem, sizeof(*c));

+ if (c == NULL)

+ return REG_ESPACE;

+ c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);

+ if (c->right == NULL)

+ return REG_ESPACE;

+ c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));

+ if (c->left == NULL)

+ return REG_ESPACE;

+ c->left->obj = node->obj;

+ c->left->type = node->type;

+ c->left->nullable = -1;

+ c->left->submatch_id = -1;

+ c->left->firstpos = NULL;

+ c->left->lastpos = NULL;

+ c->left->num_tags = 0;

+ node->obj = c;

+ node->type = CATENATION;

+ return REG_OK;

+typedef enum {

+ ADDTAGS_RECURSE,

+ ADDTAGS_AFTER_ITERATION,

+ ADDTAGS_AFTER_UNION_LEFT,

+ ADDTAGS_AFTER_UNION_RIGHT,

+ ADDTAGS_AFTER_CAT_LEFT,

+ ADDTAGS_AFTER_CAT_RIGHT,

+ ADDTAGS_SET_SUBMATCH_END

+} tre_addtags_symbol_t;

+typedef struct {

+ int tag;

+ int next_tag;

+} tre_tag_states_t;

+/* Go through `regset' and set submatch data for submatches that are

+ using this tag. */

+static void

+tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)

+ int i;

+ for (i = 0; regset[i] >= 0; i++)

+ {

+ int id = regset[i] / 2;

+ int start = !(regset[i] % 2);

+ if (start)

+ tnfa->submatch_data[id].so_tag = tag;

+ else

+ tnfa->submatch_data[id].eo_tag = tag;

+ }

+ regset[0] = -1;

+/* Adds tags to appropriate locations in the parse tree in `tree', so that

+ subexpressions marked for submatch addressing can be traced. */

+static reg_errcode_t

+tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,

+ tre_tnfa_t *tnfa)

+ reg_errcode_t status = REG_OK;

+ tre_addtags_symbol_t symbol;

+ tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */

+ int bottom = tre_stack_num_objects(stack);

+ /* True for first pass (counting number of needed tags) */

+ int first_pass = (mem == NULL || tnfa == NULL);

+ int *regset, *orig_regset;

+ int num_tags = 0; /* Total number of tags. */

+ int num_minimals = 0; /* Number of special minimal tags. */

+ int tag = 0; /* The tag that is to be added next. */

+ int next_tag = 1; /* Next tag to use after this one. */

+ int *parents; /* Stack of submatches the current submatch is

+ contained in. */

+ int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */

+ tre_tag_states_t *saved_states;

+ tre_tag_direction_t direction = TRE_TAG_MINIMIZE;

+ if (!first_pass)

+ {

+ tnfa->end_tag = 0;

+ tnfa->minimal_tags[0] = -1;

+ }

+ regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));

+ if (regset == NULL)

+ return REG_ESPACE;

+ regset[0] = -1;

+ orig_regset = regset;

+ parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));

+ if (parents == NULL)

+ {

+ xfree(regset);

+ return REG_ESPACE;

+ }

+ parents[0] = -1;

+ saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));

+ if (saved_states == NULL)

+ {

+ xfree(regset);

+ xfree(parents);

+ return REG_ESPACE;

+ }

+ else

+ {

+ unsigned int i;

+ for (i = 0; i <= tnfa->num_submatches; i++)

+ saved_states[i].tag = -1;

+ }

+ STACK_PUSH(stack, voidptr, node);

+ STACK_PUSH(stack, int, ADDTAGS_RECURSE);

+ while (tre_stack_num_objects(stack) > bottom)

+ {

+ if (status != REG_OK)

+ break;

+ symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);

+ switch (symbol)

+ {

+ case ADDTAGS_SET_SUBMATCH_END:

+ {

+ int id = tre_stack_pop_int(stack);

+ int i;

+ /* Add end of this submatch to regset. */

+ for (i = 0; regset[i] >= 0; i++);

+ regset[i] = id * 2 + 1;

+ regset[i + 1] = -1;

+ /* Pop this submatch from the parents stack. */

+ for (i = 0; parents[i] >= 0; i++);

+ parents[i - 1] = -1;

+ break;

+ }

+ case ADDTAGS_RECURSE:

+ node = tre_stack_pop_voidptr(stack);

+ if (node->submatch_id >= 0)

+ {

+ int id = node->submatch_id;

+ int i;

+ /* Add start of this submatch to regset. */

+ for (i = 0; regset[i] >= 0; i++);

+ regset[i] = id * 2;

+ regset[i + 1] = -1;

+ if (!first_pass)

+ {

+ for (i = 0; parents[i] >= 0; i++);

+ tnfa->submatch_data[id].parents = NULL;

+ if (i > 0)

+ {

+ int *p = xmalloc(sizeof(*p) * (i + 1));

+ if (p == NULL)

+ {

+ status = REG_ESPACE;

+ break;

+ }

+ assert(tnfa->submatch_data[id].parents == NULL);

+ tnfa->submatch_data[id].parents = p;

+ for (i = 0; parents[i] >= 0; i++)

+ p[i] = parents[i];

+ p[i] = -1;

+ }

+ /* Add end of this submatch to regset after processing this

+ node. */

+ STACK_PUSHX(stack, int, node->submatch_id);

+ STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);

+ }

+ switch (node->type)

+ {

+ case LITERAL:

+ {

+ tre_literal_t *lit = node->obj;

+ if (!IS_SPECIAL(lit) || IS_BACKREF(lit))

+ {

+ int i;

+ if (regset[0] >= 0)

+ {

+ /* Regset is not empty, so add a tag before the

+ literal or backref. */

+ if (!first_pass)

+ {

+ status = tre_add_tag_left(mem, node, tag);

+ tnfa->tag_directions[tag] = direction;

+ if (minimal_tag >= 0)

+ {

+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);

+ tnfa->minimal_tags[i] = tag;

+ tnfa->minimal_tags[i + 1] = minimal_tag;

+ tnfa->minimal_tags[i + 2] = -1;

+ minimal_tag = -1;

+ num_minimals++;

+ }

+ tre_purge_regset(regset, tnfa, tag);

+ }

+ else

+ {

+ node->num_tags = 1;

+ }

+ regset[0] = -1;

+ tag = next_tag;

+ num_tags++;

+ next_tag++;

+ }

+ else

+ {

+ assert(!IS_TAG(lit));

+ }

+ break;

+ }

+ case CATENATION:

+ {

+ tre_catenation_t *cat = node->obj;

+ tre_ast_node_t *left = cat->left;

+ tre_ast_node_t *right = cat->right;

+ int reserved_tag = -1;

+ /* After processing right child. */

+ STACK_PUSHX(stack, voidptr, node);

+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);

+ /* Process right child. */

+ STACK_PUSHX(stack, voidptr, right);

+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);

+ /* After processing left child. */

+ STACK_PUSHX(stack, int, next_tag + left->num_tags);

+ if (left->num_tags > 0 && right->num_tags > 0)

+ {

+ /* Reserve the next tag to the right child. */

+ reserved_tag = next_tag;

+ next_tag++;

+ }

+ STACK_PUSHX(stack, int, reserved_tag);

+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);

+ /* Process left child. */

+ STACK_PUSHX(stack, voidptr, left);

+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);

+ }

+ break;

+ case ITERATION:

+ {

+ tre_iteration_t *iter = node->obj;

+ if (first_pass)

+ {

+ STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);

+ }

+ else

+ {

+ STACK_PUSHX(stack, int, tag);

+ STACK_PUSHX(stack, int, iter->minimal);

+ }

+ STACK_PUSHX(stack, voidptr, node);

+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);

+ STACK_PUSHX(stack, voidptr, iter->arg);

+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);

+ /* Regset is not empty, so add a tag here. */

+ if (regset[0] >= 0 || iter->minimal)

+ {

+ if (!first_pass)

+ {

+ int i;

+ status = tre_add_tag_left(mem, node, tag);

+ if (iter->minimal)

+ tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;

+ else

+ tnfa->tag_directions[tag] = direction;

+ if (minimal_tag >= 0)

+ {

+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);

+ tnfa->minimal_tags[i] = tag;

+ tnfa->minimal_tags[i + 1] = minimal_tag;

+ tnfa->minimal_tags[i + 2] = -1;

+ minimal_tag = -1;

+ num_minimals++;

+ }

+ tre_purge_regset(regset, tnfa, tag);

+ }

+ regset[0] = -1;

+ tag = next_tag;

+ num_tags++;

+ next_tag++;

+ }

+ direction = TRE_TAG_MINIMIZE;

+ }

+ break;

+ case UNION:

+ {

+ tre_union_t *uni = node->obj;

+ tre_ast_node_t *left = uni->left;

+ tre_ast_node_t *right = uni->right;

+ int left_tag;

+ int right_tag;

+ if (regset[0] >= 0)

+ {

+ left_tag = next_tag;

+ right_tag = next_tag + 1;

+ }

+ else

+ {

+ left_tag = tag;

+ right_tag = next_tag;

+ }

+ /* After processing right child. */

+ STACK_PUSHX(stack, int, right_tag);

+ STACK_PUSHX(stack, int, left_tag);

+ STACK_PUSHX(stack, voidptr, regset);

+ STACK_PUSHX(stack, int, regset[0] >= 0);

+ STACK_PUSHX(stack, voidptr, node);

+ STACK_PUSHX(stack, voidptr, right);

+ STACK_PUSHX(stack, voidptr, left);

+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);

+ /* Process right child. */

+ STACK_PUSHX(stack, voidptr, right);

+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);

+ /* After processing left child. */

+ STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);

+ /* Process left child. */

+ STACK_PUSHX(stack, voidptr, left);

+ STACK_PUSHX(stack, int, ADDTAGS_RECURSE);

+ /* Regset is not empty, so add a tag here. */

+ if (regset[0] >= 0)

+ {

+ if (!first_pass)

+ {

+ int i;

+ status = tre_add_tag_left(mem, node, tag);

+ tnfa->tag_directions[tag] = direction;

+ if (minimal_tag >= 0)

+ {

+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);

+ tnfa->minimal_tags[i] = tag;

+ tnfa->minimal_tags[i + 1] = minimal_tag;

+ tnfa->minimal_tags[i + 2] = -1;

+ minimal_tag = -1;

+ num_minimals++;

+ }

+ tre_purge_regset(regset, tnfa, tag);

+ }

+ regset[0] = -1;

+ tag = next_tag;

+ num_tags++;

+ next_tag++;

+ }

+ if (node->num_submatches > 0)

+ {

+ /* The next two tags are reserved for markers. */

+ next_tag++;

+ tag = next_tag;

+ next_tag++;

+ }

+ break;

+ }

+ if (node->submatch_id >= 0)

+ {

+ int i;

+ /* Push this submatch on the parents stack. */

+ for (i = 0; parents[i] >= 0; i++);

+ parents[i] = node->submatch_id;

+ parents[i + 1] = -1;

+ }

+ break; /* end case: ADDTAGS_RECURSE */

+ case ADDTAGS_AFTER_ITERATION:

+ {

+ int minimal = 0;

+ int enter_tag;

+ node = tre_stack_pop_voidptr(stack);

+ if (first_pass)

+ {

+ node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags

+ + tre_stack_pop_int(stack);

+ minimal_tag = -1;

+ }

+ else

+ {

+ minimal = tre_stack_pop_int(stack);

+ enter_tag = tre_stack_pop_int(stack);

+ if (minimal)

+ minimal_tag = enter_tag;

+ }

+ if (!first_pass)

+ {

+ if (minimal)

+ direction = TRE_TAG_MINIMIZE;

+ else

+ direction = TRE_TAG_MAXIMIZE;

+ }

+ break;

+ }

+ case ADDTAGS_AFTER_CAT_LEFT:

+ {

+ int new_tag = tre_stack_pop_int(stack);

+ next_tag = tre_stack_pop_int(stack);

+ if (new_tag >= 0)

+ {

+ tag = new_tag;

+ }

+ break;

+ }

+ case ADDTAGS_AFTER_CAT_RIGHT:

+ node = tre_stack_pop_voidptr(stack);

+ if (first_pass)

+ node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags

+ + ((tre_catenation_t *)node->obj)->right->num_tags;

+ break;

+ case ADDTAGS_AFTER_UNION_LEFT:

+ /* Lift the bottom of the `regset' array so that when processing

+ the right operand the items currently in the array are

+ invisible. The original bottom was saved at ADDTAGS_UNION and

+ will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */

+ while (*regset >= 0)

+ regset++;

+ break;

+ case ADDTAGS_AFTER_UNION_RIGHT:

+ {

+ int added_tags, tag_left, tag_right;

+ tre_ast_node_t *left = tre_stack_pop_voidptr(stack);

+ tre_ast_node_t *right = tre_stack_pop_voidptr(stack);

+ node = tre_stack_pop_voidptr(stack);

+ added_tags = tre_stack_pop_int(stack);

+ if (first_pass)

+ {

+ node->num_tags = ((tre_union_t *)node->obj)->left->num_tags

+ + ((tre_union_t *)node->obj)->right->num_tags + added_tags

+ + ((node->num_submatches > 0) ? 2 : 0);

+ }

+ regset = tre_stack_pop_voidptr(stack);

+ tag_left = tre_stack_pop_int(stack);

+ tag_right = tre_stack_pop_int(stack);

+ /* Add tags after both children, the left child gets a smaller

+ tag than the right child. This guarantees that we prefer

+ the left child over the right child. */

+ /* XXX - This is not always necessary (if the children have

+ tags which must be seen for every match of that child). */

+ /* XXX - Check if this is the only place where tre_add_tag_right

+ is used. If so, use tre_add_tag_left (putting the tag before

+ the child as opposed after the child) and throw away

+ tre_add_tag_right. */

+ if (node->num_submatches > 0)

+ {

+ if (!first_pass)

+ {

+ status = tre_add_tag_right(mem, left, tag_left);

+ tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;

+ if (status == REG_OK)

+ status = tre_add_tag_right(mem, right, tag_right);

+ tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;

+ }

+ num_tags += 2;

+ }

+ direction = TRE_TAG_MAXIMIZE;

+ break;

+ }

+ default:

+ assert(0);

+ break;

+ } /* end switch(symbol) */

+ } /* end while(tre_stack_num_objects(stack) > bottom) */

+ if (!first_pass)

+ tre_purge_regset(regset, tnfa, tag);

+ if (!first_pass && minimal_tag >= 0)

+ {

+ int i;

+ for (i = 0; tnfa->minimal_tags[i] >= 0; i++);

+ tnfa->minimal_tags[i] = tag;

+ tnfa->minimal_tags[i + 1] = minimal_tag;

+ tnfa->minimal_tags[i + 2] = -1;

+ minimal_tag = -1;

+ num_minimals++;

+ }

+ assert(tree->num_tags == num_tags);

+ tnfa->end_tag = num_tags;

+ tnfa->num_tags = num_tags;

+ tnfa->num_minimals = num_minimals;

+ xfree(orig_regset);

+ xfree(parents);

+ xfree(saved_states);

+ return status;

+/*

+ AST to TNFA compilation routines.

+*/

+typedef enum {

+ COPY_RECURSE,

+ COPY_SET_RESULT_PTR

+} tre_copyast_symbol_t;

+/* Flags for tre_copy_ast(). */

+#define COPY_REMOVE_TAGS 1

+#define COPY_MAXIMIZE_FIRST_TAG 2

+static reg_errcode_t

+tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,

+ int flags, int *pos_add, tre_tag_direction_t *tag_directions,

+ tre_ast_node_t **copy, int *max_pos)

+ reg_errcode_t status = REG_OK;

+ int bottom = tre_stack_num_objects(stack);

+ int num_copied = 0;

+ int first_tag = 1;

+ tre_ast_node_t **result = copy;

+ tre_copyast_symbol_t symbol;

+ STACK_PUSH(stack, voidptr, ast);

+ STACK_PUSH(stack, int, COPY_RECURSE);

+ while (status == REG_OK && tre_stack_num_objects(stack) > bottom)

+ {

+ tre_ast_node_t *node;

+ if (status != REG_OK)

+ break;

+ symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);

+ switch (symbol)

+ {

+ case COPY_SET_RESULT_PTR:

+ result = tre_stack_pop_voidptr(stack);

+ break;

+ case COPY_RECURSE:

+ node = tre_stack_pop_voidptr(stack);

+ switch (node->type)

+ {

+ case LITERAL:

+ {

+ tre_literal_t *lit = node->obj;

+ int pos = lit->position;

+ int min = lit->code_min;

+ int max = lit->code_max;

+ if (!IS_SPECIAL(lit) || IS_BACKREF(lit))

+ {

+ /* XXX - e.g. [ab] has only one position but two

+ nodes, so we are creating holes in the state space

+ here. Not fatal, just wastes memory. */

+ pos += *pos_add;

+ num_copied++;

+ }

+ else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))

+ {

+ /* Change this tag to empty. */

+ min = EMPTY;

+ max = pos = -1;

+ }

+ else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)

+ && first_tag)

+ {

+ /* Maximize the first tag. */

+ tag_directions[max] = TRE_TAG_MAXIMIZE;

+ first_tag = 0;

+ }

+ *result = tre_ast_new_literal(mem, min, max, pos);

+ if (*result == NULL)

+ status = REG_ESPACE;

+ else {

+ tre_literal_t *p = (*result)->obj;

+ p->class = lit->class;

+ p->neg_classes = lit->neg_classes;

+ }

+ if (pos > *max_pos)

+ *max_pos = pos;

+ break;

+ }

+ case UNION:

+ {

+ tre_union_t *uni = node->obj;

+ tre_union_t *tmp;

+ *result = tre_ast_new_union(mem, uni->left, uni->right);

+ if (*result == NULL)

+ {

+ status = REG_ESPACE;

+ break;

+ }

+ tmp = (*result)->obj;

+ result = &tmp->left;

+ STACK_PUSHX(stack, voidptr, uni->right);

+ STACK_PUSHX(stack, int, COPY_RECURSE);

+ STACK_PUSHX(stack, voidptr, &tmp->right);

+ STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);

+ STACK_PUSHX(stack, voidptr, uni->left);

+ STACK_PUSHX(stack, int, COPY_RECURSE);

+ break;

+ }

+ case CATENATION:

+ {

+ tre_catenation_t *cat = node->obj;

+ tre_catenation_t *tmp;

+ *result = tre_ast_new_catenation(mem, cat->left, cat->right);

+ if (*result == NULL)

+ {

+ status = REG_ESPACE;

+ break;

+ }

+ tmp = (*result)->obj;

+ tmp->left = NULL;

+ tmp->right = NULL;

+ result = &tmp->left;

+ STACK_PUSHX(stack, voidptr, cat->right);

+ STACK_PUSHX(stack, int, COPY_RECURSE);

+ STACK_PUSHX(stack, voidptr, &tmp->right);

+ STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);

+ STACK_PUSHX(stack, voidptr, cat->left);

+ STACK_PUSHX(stack, int, COPY_RECURSE);

+ break;

+ }

+ case ITERATION:

+ {

+ tre_iteration_t *iter = node->obj;

+ STACK_PUSHX(stack, voidptr, iter->arg);

+ STACK_PUSHX(stack, int, COPY_RECURSE);

+ *result = tre_ast_new_iter(mem, iter->arg, iter->min,

+ iter->max, iter->minimal);

+ if (*result == NULL)

+ {

+ status = REG_ESPACE;

+ break;

+ }

+ iter = (*result)->obj;

+ result = &iter->arg;

+ break;

+ }

+ default:

+ assert(0);

+ break;

+ }

+ break;

+ }

+ *pos_add += num_copied;

+ return status;

+typedef enum {

+ EXPAND_RECURSE,

+ EXPAND_AFTER_ITER

+} tre_expand_ast_symbol_t;

+/* Expands each iteration node that has a finite nonzero minimum or maximum

+ iteration count to a catenated sequence of copies of the node. */

+static reg_errcode_t

+tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,

+ int *position, tre_tag_direction_t *tag_directions)

+ reg_errcode_t status = REG_OK;

+ int bottom = tre_stack_num_objects(stack);

+ int pos_add = 0;

+ int pos_add_total = 0;

+ int max_pos = 0;

+ int iter_depth = 0;

+ STACK_PUSHR(stack, voidptr, ast);

+ STACK_PUSHR(stack, int, EXPAND_RECURSE);

+ while (status == REG_OK && tre_stack_num_objects(stack) > bottom)

+ {

+ tre_ast_node_t *node;

+ tre_expand_ast_symbol_t symbol;

+ if (status != REG_OK)

+ break;

+ symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);

+ node = tre_stack_pop_voidptr(stack);

+ switch (symbol)

+ {

+ case EXPAND_RECURSE:

+ switch (node->type)

+ {

+ case LITERAL:

+ {

+ tre_literal_t *lit= node->obj;

+ if (!IS_SPECIAL(lit) || IS_BACKREF(lit))

+ {

+ lit->position += pos_add;

+ if (lit->position > max_pos)

+ max_pos = lit->position;

+ }

+ break;

+ }

+ case UNION:

+ {

+ tre_union_t *uni = node->obj;

+ STACK_PUSHX(stack, voidptr, uni->right);

+ STACK_PUSHX(stack, int, EXPAND_RECURSE);

+ STACK_PUSHX(stack, voidptr, uni->left);

+ STACK_PUSHX(stack, int, EXPAND_RECURSE);

+ break;

+ }

+ case CATENATION:

+ {

+ tre_catenation_t *cat = node->obj;

+ STACK_PUSHX(stack, voidptr, cat->right);

+ STACK_PUSHX(stack, int, EXPAND_RECURSE);

+ STACK_PUSHX(stack, voidptr, cat->left);

+ STACK_PUSHX(stack, int, EXPAND_RECURSE);

+ break;

+ }

+ case ITERATION:

+ {

+ tre_iteration_t *iter = node->obj;

+ STACK_PUSHX(stack, int, pos_add);

+ STACK_PUSHX(stack, voidptr, node);

+ STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);

+ STACK_PUSHX(stack, voidptr, iter->arg);

+ STACK_PUSHX(stack, int, EXPAND_RECURSE);

+ /* If we are going to expand this node at EXPAND_AFTER_ITER

+ then don't increase the `pos' fields of the nodes now, it

+ will get done when expanding. */

+ if (iter->min > 1 || iter->max > 1)

+ pos_add = 0;

+ iter_depth++;

+ break;

+ }

+ default:

+ assert(0);

+ break;

+ }

+ break;

+ case EXPAND_AFTER_ITER:

+ {

+ tre_iteration_t *iter = node->obj;

+ int pos_add_last;

+ pos_add = tre_stack_pop_int(stack);

+ pos_add_last = pos_add;

+ if (iter->min > 1 || iter->max > 1)

+ {

+ tre_ast_node_t *seq1 = NULL, *seq2 = NULL;

+ int j;

+ int pos_add_save = pos_add;

+ /* Create a catenated sequence of copies of the node. */

+ for (j = 0; j < iter->min; j++)

+ {

+ tre_ast_node_t *copy;

+ /* Remove tags from all but the last copy. */

+ int flags = ((j + 1 < iter->min)

+ ? COPY_REMOVE_TAGS

+ : COPY_MAXIMIZE_FIRST_TAG);

+ pos_add_save = pos_add;

+ status = tre_copy_ast(mem, stack, iter->arg, flags,

+ &pos_add, tag_directions, &copy,

+ &max_pos);

+ if (status != REG_OK)

+ return status;

+ if (seq1 != NULL)

+ seq1 = tre_ast_new_catenation(mem, seq1, copy);

+ else

+ seq1 = copy;

+ if (seq1 == NULL)

+ return REG_ESPACE;

+ }

+ if (iter->max == -1)

+ {

+ /* No upper limit. */

+ pos_add_save = pos_add;

+ status = tre_copy_ast(mem, stack, iter->arg, 0,

+ &pos_add, NULL, &seq2, &max_pos);

+ if (status != REG_OK)

+ return status;

+ seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);

+ if (seq2 == NULL)

+ return REG_ESPACE;

+ }

+ else

+ {

+ for (j = iter->min; j < iter->max; j++)

+ {

+ tre_ast_node_t *tmp, *copy;

+ pos_add_save = pos_add;

+ status = tre_copy_ast(mem, stack, iter->arg, 0,

+ &pos_add, NULL, &copy, &max_pos);

+ if (status != REG_OK)

+ return status;

+ if (seq2 != NULL)

+ seq2 = tre_ast_new_catenation(mem, copy, seq2);

+ else

+ seq2 = copy;

+ if (seq2 == NULL)

+ return REG_ESPACE;

+ tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);

+ if (tmp == NULL)

+ return REG_ESPACE;

+ seq2 = tre_ast_new_union(mem, tmp, seq2);

+ if (seq2 == NULL)

+ return REG_ESPACE;

+ }

+ pos_add = pos_add_save;

+ if (seq1 == NULL)

+ seq1 = seq2;

+ else if (seq2 != NULL)

+ seq1 = tre_ast_new_catenation(mem, seq1, seq2);

+ if (seq1 == NULL)

+ return REG_ESPACE;

+ node->obj = seq1->obj;

+ node->type = seq1->type;

+ }

+ iter_depth--;

+ pos_add_total += pos_add - pos_add_last;

+ if (iter_depth == 0)

+ pos_add = pos_add_total;

+ break;

+ }

+ default:

+ assert(0);

+ break;

+ }

+ *position += pos_add_total;

+ /* `max_pos' should never be larger than `*position' if the above

+ code works, but just an extra safeguard let's make sure

+ `*position' is set large enough so enough memory will be

+ allocated for the transition table. */

+ if (max_pos > *position)

+ *position = max_pos;

+ return status;

+static tre_pos_and_tags_t *

+tre_set_empty(tre_mem_t mem)

+ tre_pos_and_tags_t *new_set;

+ new_set = tre_mem_calloc(mem, sizeof(*new_set));

+ if (new_set == NULL)

+ return NULL;

+ new_set[0].position = -1;

+ new_set[0].code_min = -1;

+ new_set[0].code_max = -1;

+ return new_set;

+static tre_pos_and_tags_t *

+tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,

+ tre_ctype_t class, tre_ctype_t *neg_classes, int backref)

+ tre_pos_and_tags_t *new_set;

+ new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);

+ if (new_set == NULL)

+ return NULL;

+ new_set[0].position = position;

+ new_set[0].code_min = code_min;

+ new_set[0].code_max = code_max;

+ new_set[0].class = class;

+ new_set[0].neg_classes = neg_classes;

+ new_set[0].backref = backref;

+ new_set[1].position = -1;

+ new_set[1].code_min = -1;

+ new_set[1].code_max = -1;

+ return new_set;

+static tre_pos_and_tags_t *

+tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,

+ int *tags, int assertions)

+ int s1, s2, i, j;

+ tre_pos_and_tags_t *new_set;

+ int *new_tags;

+ int num_tags;

+ for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);

+ for (s1 = 0; set1[s1].position >= 0; s1++);

+ for (s2 = 0; set2[s2].position >= 0; s2++);

+ new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));

+ if (!new_set )

+ return NULL;

+ for (s1 = 0; set1[s1].position >= 0; s1++)

+ {

+ new_set[s1].position = set1[s1].position;

+ new_set[s1].code_min = set1[s1].code_min;

+ new_set[s1].code_max = set1[s1].code_max;

+ new_set[s1].assertions = set1[s1].assertions | assertions;

+ new_set[s1].class = set1[s1].class;

+ new_set[s1].neg_classes = set1[s1].neg_classes;

+ new_set[s1].backref = set1[s1].backref;

+ if (set1[s1].tags == NULL && tags == NULL)

+ new_set[s1].tags = NULL;

+ else

+ {

+ for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);

+ new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)

+ * (i + num_tags + 1)));

+ if (new_tags == NULL)

+ return NULL;

+ for (j = 0; j < i; j++)

+ new_tags[j] = set1[s1].tags[j];

+ for (i = 0; i < num_tags; i++)

+ new_tags[j + i] = tags[i];

+ new_tags[j + i] = -1;

+ new_set[s1].tags = new_tags;

+ }

+ for (s2 = 0; set2[s2].position >= 0; s2++)

+ {

+ new_set[s1 + s2].position = set2[s2].position;

+ new_set[s1 + s2].code_min = set2[s2].code_min;

+ new_set[s1 + s2].code_max = set2[s2].code_max;

+ /* XXX - why not | assertions here as well? */

+ new_set[s1 + s2].assertions = set2[s2].assertions;

+ new_set[s1 + s2].class = set2[s2].class;

+ new_set[s1 + s2].neg_classes = set2[s2].neg_classes;

+ new_set[s1 + s2].backref = set2[s2].backref;

+ if (set2[s2].tags == NULL)

+ new_set[s1 + s2].tags = NULL;

+ else

+ {

+ for (i = 0; set2[s2].tags[i] >= 0; i++);

+ new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));

+ if (new_tags == NULL)

+ return NULL;

+ for (j = 0; j < i; j++)

+ new_tags[j] = set2[s2].tags[j];

+ new_tags[j] = -1;

+ new_set[s1 + s2].tags = new_tags;

+ }

+ new_set[s1 + s2].position = -1;

+ return new_set;

+/* Finds the empty path through `node' which is the one that should be

+ taken according to POSIX.2 rules, and adds the tags on that path to

+ `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is

+ set to the number of tags seen on the path. */

+static reg_errcode_t

+tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,

+ int *assertions, int *num_tags_seen)

+ tre_literal_t *lit;

+ tre_union_t *uni;

+ tre_catenation_t *cat;

+ tre_iteration_t *iter;

+ int i;

+ int bottom = tre_stack_num_objects(stack);

+ reg_errcode_t status = REG_OK;

+ if (num_tags_seen)

+ *num_tags_seen = 0;

+ status = tre_stack_push_voidptr(stack, node);

+ /* Walk through the tree recursively. */

+ while (status == REG_OK && tre_stack_num_objects(stack) > bottom)

+ {

+ node = tre_stack_pop_voidptr(stack);

+ switch (node->type)

+ {

+ case LITERAL:

+ lit = (tre_literal_t *)node->obj;

+ switch (lit->code_min)

+ {

+ case TAG:

+ if (lit->code_max >= 0)

+ {

+ if (tags != NULL)

+ {

+ /* Add the tag to `tags'. */

+ for (i = 0; tags[i] >= 0; i++)

+ if (tags[i] == lit->code_max)

+ break;

+ if (tags[i] < 0)

+ {

+ tags[i] = lit->code_max;

+ tags[i + 1] = -1;

+ }

+ if (num_tags_seen)

+ (*num_tags_seen)++;

+ }

+ break;

+ case ASSERTION:

+ assert(lit->code_max >= 1

+ || lit->code_max <= ASSERT_LAST);

+ if (assertions != NULL)

+ *assertions |= lit->code_max;

+ break;

+ case EMPTY:

+ break;

+ default:

+ assert(0);

+ break;

+ }

+ break;

+ case UNION:

+ /* Subexpressions starting earlier take priority over ones

+ starting later, so we prefer the left subexpression over the

+ right subexpression. */

+ uni = (tre_union_t *)node->obj;

+ if (uni->left->nullable)

+ STACK_PUSHX(stack, voidptr, uni->left)

+ else if (uni->right->nullable)

+ STACK_PUSHX(stack, voidptr, uni->right)

+ else

+ assert(0);

+ break;

+ case CATENATION:

+ /* The path must go through both children. */

+ cat = (tre_catenation_t *)node->obj;

+ assert(cat->left->nullable);

+ assert(cat->right->nullable);

+ STACK_PUSHX(stack, voidptr, cat->left);

+ STACK_PUSHX(stack, voidptr, cat->right);

+ break;

+ case ITERATION:

+ /* A match with an empty string is preferred over no match at

+ all, so we go through the argument if possible. */

+ iter = (tre_iteration_t *)node->obj;

+ if (iter->arg->nullable)

+ STACK_PUSHX(stack, voidptr, iter->arg);

+ break;

+ default:

+ assert(0);

+ break;

+ }

+ return status;

+typedef enum {

+ NFL_RECURSE,

+ NFL_POST_UNION,

+ NFL_POST_CATENATION,

+ NFL_POST_ITERATION

+} tre_nfl_stack_symbol_t;

+/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for

+ the nodes of the AST `tree'. */

+static reg_errcode_t

+tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)

+ int bottom = tre_stack_num_objects(stack);

+ STACK_PUSHR(stack, voidptr, tree);

+ STACK_PUSHR(stack, int, NFL_RECURSE);

+ while (tre_stack_num_objects(stack) > bottom)

+ {

+ tre_nfl_stack_symbol_t symbol;

+ tre_ast_node_t *node;

+ symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);

+ node = tre_stack_pop_voidptr(stack);

+ switch (symbol)

+ {

+ case NFL_RECURSE:

+ switch (node->type)

+ {

+ case LITERAL:

+ {

+ tre_literal_t *lit = (tre_literal_t *)node->obj;

+ if (IS_BACKREF(lit))

+ {

+ /* Back references: nullable = false, firstpos = {i},

+ lastpos = {i}. */

+ node->nullable = 0;

+ node->firstpos = tre_set_one(mem, lit->position, 0,

+ TRE_CHAR_MAX, 0, NULL, -1);

+ if (!node->firstpos)

+ return REG_ESPACE;

+ node->lastpos = tre_set_one(mem, lit->position, 0,

+ TRE_CHAR_MAX, 0, NULL,

+ (int)lit->code_max);

+ if (!node->lastpos)

+ return REG_ESPACE;

+ }

+ else if (lit->code_min < 0)

+ {

+ /* Tags, empty strings, params, and zero width assertions:

+ nullable = true, firstpos = {}, and lastpos = {}. */

+ node->nullable = 1;

+ node->firstpos = tre_set_empty(mem);

+ if (!node->firstpos)

+ return REG_ESPACE;

+ node->lastpos = tre_set_empty(mem);

+ if (!node->lastpos)

+ return REG_ESPACE;

+ }

+ else

+ {

+ /* Literal at position i: nullable = false, firstpos = {i},

+ lastpos = {i}. */

+ node->nullable = 0;

+ node->firstpos =

+ tre_set_one(mem, lit->position, (int)lit->code_min,

+ (int)lit->code_max, 0, NULL, -1);

+ if (!node->firstpos)

+ return REG_ESPACE;

+ node->lastpos = tre_set_one(mem, lit->position,

+ (int)lit->code_min,

+ (int)lit->code_max,

+ lit->class, lit->neg_classes,

+ -1);

+ if (!node->lastpos)

+ return REG_ESPACE;

+ }

+ break;

+ }

+ case UNION:

+ /* Compute the attributes for the two subtrees, and after that

+ for this node. */

+ STACK_PUSHR(stack, voidptr, node);

+ STACK_PUSHR(stack, int, NFL_POST_UNION);

+ STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);

+ STACK_PUSHR(stack, int, NFL_RECURSE);

+ STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);

+ STACK_PUSHR(stack, int, NFL_RECURSE);

+ break;

+ case CATENATION:

+ /* Compute the attributes for the two subtrees, and after that

+ for this node. */

+ STACK_PUSHR(stack, voidptr, node);

+ STACK_PUSHR(stack, int, NFL_POST_CATENATION);

+ STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);

+ STACK_PUSHR(stack, int, NFL_RECURSE);

+ STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);

+ STACK_PUSHR(stack, int, NFL_RECURSE);

+ break;

+ case ITERATION:

+ /* Compute the attributes for the subtree, and after that for

+ this node. */

+ STACK_PUSHR(stack, voidptr, node);

+ STACK_PUSHR(stack, int, NFL_POST_ITERATION);

+ STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);

+ STACK_PUSHR(stack, int, NFL_RECURSE);

+ break;

+ }

+ break; /* end case: NFL_RECURSE */

+ case NFL_POST_UNION:

+ {

+ tre_union_t *uni = (tre_union_t *)node->obj;

+ node->nullable = uni->left->nullable || uni->right->nullable;

+ node->firstpos = tre_set_union(mem, uni->left->firstpos,

+ uni->right->firstpos, NULL, 0);

+ if (!node->firstpos)

+ return REG_ESPACE;

+ node->lastpos = tre_set_union(mem, uni->left->lastpos,

+ uni->right->lastpos, NULL, 0);

+ if (!node->lastpos)

+ return REG_ESPACE;

+ break;

+ }

+ case NFL_POST_ITERATION:

+ {

+ tre_iteration_t *iter = (tre_iteration_t *)node->obj;

+ if (iter->min == 0 || iter->arg->nullable)

+ node->nullable = 1;

+ else

+ node->nullable = 0;

+ node->firstpos = iter->arg->firstpos;

+ node->lastpos = iter->arg->lastpos;

+ break;

+ }

+ case NFL_POST_CATENATION:

+ {

+ int num_tags, *tags, assertions;

+ reg_errcode_t status;

+ tre_catenation_t *cat = node->obj;

+ node->nullable = cat->left->nullable && cat->right->nullable;

+ /* Compute firstpos. */

+ if (cat->left->nullable)

+ {

+ /* The left side matches the empty string. Make a first pass

+ with tre_match_empty() to get the number of tags and

+ parameters. */

+ status = tre_match_empty(stack, cat->left,

+ NULL, NULL, &num_tags);

+ if (status != REG_OK)

+ return status;

+ /* Allocate arrays for the tags and parameters. */

+ tags = xmalloc(sizeof(*tags) * (num_tags + 1));

+ if (!tags)

+ return REG_ESPACE;

+ tags[0] = -1;

+ assertions = 0;

+ /* Second pass with tre_mach_empty() to get the list of

+ tags and parameters. */

+ status = tre_match_empty(stack, cat->left, tags,

+ &assertions, NULL);

+ if (status != REG_OK)

+ {

+ xfree(tags);

+ return status;

+ }

+ node->firstpos =

+ tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,

+ tags, assertions);

+ xfree(tags);

+ if (!node->firstpos)

+ return REG_ESPACE;

+ }

+ else

+ {

+ node->firstpos = cat->left->firstpos;

+ }

+ /* Compute lastpos. */

+ if (cat->right->nullable)

+ {

+ /* The right side matches the empty string. Make a first pass

+ with tre_match_empty() to get the number of tags and

+ parameters. */

+ status = tre_match_empty(stack, cat->right,

+ NULL, NULL, &num_tags);

+ if (status != REG_OK)

+ return status;

+ /* Allocate arrays for the tags and parameters. */

+ tags = xmalloc(sizeof(int) * (num_tags + 1));

+ if (!tags)

+ return REG_ESPACE;

+ tags[0] = -1;

+ assertions = 0;

+ /* Second pass with tre_mach_empty() to get the list of

+ tags and parameters. */

+ status = tre_match_empty(stack, cat->right, tags,

+ &assertions, NULL);

+ if (status != REG_OK)

+ {

+ xfree(tags);

+ return status;

+ }

+ node->lastpos =

+ tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,

+ tags, assertions);

+ xfree(tags);

+ if (!node->lastpos)

+ return REG_ESPACE;

+ }

+ else

+ {

+ node->lastpos = cat->right->lastpos;

+ }

+ break;

+ }

+ default:

+ assert(0);

+ break;

+ }

+ return REG_OK;

+/* Adds a transition from each position in `p1' to each position in `p2'. */

+static reg_errcode_t

+tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,

+ tre_tnfa_transition_t *transitions,

+ int *counts, int *offs)

+ tre_pos_and_tags_t *orig_p2 = p2;

+ tre_tnfa_transition_t *trans;

+ int i, j, k, l, dup, prev_p2_pos;

+ if (transitions != NULL)

+ while (p1->position >= 0)

+ {

+ p2 = orig_p2;

+ prev_p2_pos = -1;

+ while (p2->position >= 0)

+ {

+ /* Optimization: if this position was already handled, skip it. */

+ if (p2->position == prev_p2_pos)

+ {

+ p2++;

+ continue;

+ }

+ prev_p2_pos = p2->position;

+ /* Set `trans' to point to the next unused transition from

+ position `p1->position'. */

+ trans = transitions + offs[p1->position];

+ while (trans->state != NULL)

+ {

+#if 0

+ /* If we find a previous transition from `p1->position' to

+ `p2->position', it is overwritten. This can happen only

+ if there are nested loops in the regexp, like in "((a)*)*".

+ In POSIX.2 repetition using the outer loop is always

+ preferred over using the inner loop. Therefore the

+ transition for the inner loop is useless and can be thrown

+ away. */

+ /* XXX - The same position is used for all nodes in a bracket

+ expression, so this optimization cannot be used (it will

+ break bracket expressions) unless I figure out a way to

+ detect it here. */

+ if (trans->state_id == p2->position)

+ {

+ break;

+ }

+#endif

+ trans++;

+ }

+ if (trans->state == NULL)

+ (trans + 1)->state = NULL;

+ /* Use the character ranges, assertions, etc. from `p1' for

+ the transition from `p1' to `p2'. */

+ trans->code_min = p1->code_min;

+ trans->code_max = p1->code_max;

+ trans->state = transitions + offs[p2->position];

+ trans->state_id = p2->position;

+ trans->assertions = p1->assertions | p2->assertions

+ | (p1->class ? ASSERT_CHAR_CLASS : 0)

+ | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);

+ if (p1->backref >= 0)

+ {

+ assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);

+ assert(p2->backref < 0);

+ trans->u.backref = p1->backref;

+ trans->assertions |= ASSERT_BACKREF;

+ }

+ else

+ trans->u.class = p1->class;

+ if (p1->neg_classes != NULL)

+ {

+ for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);

+ trans->neg_classes =

+ xmalloc(sizeof(*trans->neg_classes) * (i + 1));

+ if (trans->neg_classes == NULL)

+ return REG_ESPACE;

+ for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)

+ trans->neg_classes[i] = p1->neg_classes[i];

+ trans->neg_classes[i] = (tre_ctype_t)0;

+ }

+ else

+ trans->neg_classes = NULL;

+ /* Find out how many tags this transition has. */

+ i = 0;

+ if (p1->tags != NULL)

+ while(p1->tags[i] >= 0)

+ i++;

+ j = 0;

+ if (p2->tags != NULL)

+ while(p2->tags[j] >= 0)

+ j++;

+ /* If we are overwriting a transition, free the old tag array. */

+ if (trans->tags != NULL)

+ xfree(trans->tags);

+ trans->tags = NULL;

+ /* If there were any tags, allocate an array and fill it. */

+ if (i + j > 0)

+ {

+ trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));

+ if (!trans->tags)

+ return REG_ESPACE;

+ i = 0;

+ if (p1->tags != NULL)

+ while(p1->tags[i] >= 0)

+ {

+ trans->tags[i] = p1->tags[i];

+ i++;

+ }

+ l = i;

+ j = 0;

+ if (p2->tags != NULL)

+ while (p2->tags[j] >= 0)

+ {

+ /* Don't add duplicates. */

+ dup = 0;

+ for (k = 0; k < i; k++)

+ if (trans->tags[k] == p2->tags[j])

+ {

+ dup = 1;

+ break;

+ }

+ if (!dup)

+ trans->tags[l++] = p2->tags[j];

+ j++;

+ }

+ trans->tags[l] = -1;

+ }

+ p2++;

+ }

+ p1++;

+ }

+ else

+ /* Compute a maximum limit for the number of transitions leaving

+ from each state. */

+ while (p1->position >= 0)

+ {

+ p2 = orig_p2;

+ while (p2->position >= 0)

+ {

+ counts[p1->position]++;

+ p2++;

+ }

+ p1++;

+ }

+ return REG_OK;

+/* Converts the syntax tree to a TNFA. All the transitions in the TNFA are

+ labelled with one character range (there are no transitions on empty

+ strings). The TNFA takes O(n^2) space in the worst case, `n' is size of

+ the regexp. */

+static reg_errcode_t

+tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,

+ int *counts, int *offs)

+ tre_union_t *uni;

+ tre_catenation_t *cat;

+ tre_iteration_t *iter;

+ reg_errcode_t errcode = REG_OK;

+ /* XXX - recurse using a stack!. */

+ switch (node->type)

+ {

+ case LITERAL:

+ break;

+ case UNION:

+ uni = (tre_union_t *)node->obj;

+ errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);

+ if (errcode != REG_OK)

+ return errcode;

+ errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);

+ break;

+ case CATENATION:

+ cat = (tre_catenation_t *)node->obj;

+ /* Add a transition from each position in cat->left->lastpos

+ to each position in cat->right->firstpos. */

+ errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,

+ transitions, counts, offs);

+ if (errcode != REG_OK)

+ return errcode;

+ errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);

+ if (errcode != REG_OK)

+ return errcode;

+ errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);

+ break;

+ case ITERATION:

+ iter = (tre_iteration_t *)node->obj;

+ assert(iter->max == -1 || iter->max == 1);

+ if (iter->max == -1)

+ {

+ assert(iter->min == 0 || iter->min == 1);

+ /* Add a transition from each last position in the iterated

+ expression to each first position. */

+ errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,

+ transitions, counts, offs);

+ if (errcode != REG_OK)

+ return errcode;

+ }

+ errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);

+ break;

+ }

+ return errcode;

+#define ERROR_EXIT(err) \

+ do \

+ { \

+ errcode = err; \

+ if (/*CONSTCOND*/1) \

+ goto error_exit; \

+ } \

+ while (/*CONSTCOND*/0)

+int

+regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)

+ tre_stack_t *stack;

+ tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;

+ tre_pos_and_tags_t *p;

+ int *counts = NULL, *offs = NULL;

+ int i, add = 0;

+ tre_tnfa_transition_t *transitions, *initial;

+ tre_tnfa_t *tnfa = NULL;

+ tre_submatch_data_t *submatch_data;

+ tre_tag_direction_t *tag_directions = NULL;

+ reg_errcode_t errcode;

+ tre_mem_t mem;

+ /* Parse context. */

+ tre_parse_ctx_t parse_ctx;

+ /* Allocate a stack used throughout the compilation process for various

+ purposes. */

+ stack = tre_stack_new(512, 10240, 128);

+ if (!stack)

+ return REG_ESPACE;

+ /* Allocate a fast memory allocator. */

+ mem = tre_mem_new();

+ if (!mem)

+ {

+ tre_stack_destroy(stack);

+ return REG_ESPACE;

+ }

+ /* Parse the regexp. */

+ memset(&parse_ctx, 0, sizeof(parse_ctx));

+ parse_ctx.mem = mem;

+ parse_ctx.stack = stack;

+ parse_ctx.re = regex;

+ parse_ctx.cflags = cflags;

+ parse_ctx.max_backref = -1;

+ errcode = tre_parse(&parse_ctx);

+ if (errcode != REG_OK)

+ ERROR_EXIT(errcode);

+ preg->re_nsub = parse_ctx.submatch_id - 1;

+ tree = parse_ctx.n;

+#ifdef TRE_DEBUG

+ tre_ast_print(tree);

+#endif /* TRE_DEBUG */

+ /* Referring to nonexistent subexpressions is illegal. */

+ if (parse_ctx.max_backref > (int)preg->re_nsub)

+ ERROR_EXIT(REG_ESUBREG);

+ /* Allocate the TNFA struct. */

+ tnfa = xcalloc(1, sizeof(tre_tnfa_t));

+ if (tnfa == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ tnfa->have_backrefs = parse_ctx.max_backref >= 0;

+ tnfa->have_approx = 0;

+ tnfa->num_submatches = parse_ctx.submatch_id;

+ /* Set up tags for submatch addressing. If REG_NOSUB is set and the

+ regexp does not have back references, this can be skipped. */

+ if (tnfa->have_backrefs || !(cflags & REG_NOSUB))

+ {

+ /* Figure out how many tags we will need. */

+ errcode = tre_add_tags(NULL, stack, tree, tnfa);

+ if (errcode != REG_OK)

+ ERROR_EXIT(errcode);

+ if (tnfa->num_tags > 0)

+ {

+ tag_directions = xmalloc(sizeof(*tag_directions)

+ * (tnfa->num_tags + 1));

+ if (tag_directions == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ tnfa->tag_directions = tag_directions;

+ memset(tag_directions, -1,

+ sizeof(*tag_directions) * (tnfa->num_tags + 1));

+ }

+ tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,

+ sizeof(*tnfa->minimal_tags));

+ if (tnfa->minimal_tags == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,

+ sizeof(*submatch_data));

+ if (submatch_data == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ tnfa->submatch_data = submatch_data;

+ errcode = tre_add_tags(mem, stack, tree, tnfa);

+ if (errcode != REG_OK)

+ ERROR_EXIT(errcode);

+ }

+ /* Expand iteration nodes. */

+ errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,

+ tag_directions);

+ if (errcode != REG_OK)

+ ERROR_EXIT(errcode);

+ /* Add a dummy node for the final state.

+ XXX - For certain patterns this dummy node can be optimized away,

+ for example "a*" or "ab*". Figure out a simple way to detect

+ this possibility. */

+ tmp_ast_l = tree;

+ tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);

+ if (tmp_ast_r == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);

+ if (tree == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ errcode = tre_compute_nfl(mem, stack, tree);

+ if (errcode != REG_OK)

+ ERROR_EXIT(errcode);

+ counts = xmalloc(sizeof(int) * parse_ctx.position);

+ if (counts == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ offs = xmalloc(sizeof(int) * parse_ctx.position);

+ if (offs == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ for (i = 0; i < parse_ctx.position; i++)

+ counts[i] = 0;

+ tre_ast_to_tnfa(tree, NULL, counts, NULL);

+ add = 0;

+ for (i = 0; i < parse_ctx.position; i++)

+ {

+ offs[i] = add;

+ add += counts[i] + 1;

+ counts[i] = 0;

+ }

+ transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));

+ if (transitions == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ tnfa->transitions = transitions;

+ tnfa->num_transitions = add;

+ errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);

+ if (errcode != REG_OK)

+ ERROR_EXIT(errcode);

+ tnfa->firstpos_chars = NULL;

+ p = tree->firstpos;

+ i = 0;

+ while (p->position >= 0)

+ {

+ i++;

+ p++;

+ }

+ initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));

+ if (initial == NULL)

+ ERROR_EXIT(REG_ESPACE);

+ tnfa->initial = initial;

+ i = 0;

+ for (p = tree->firstpos; p->position >= 0; p++)

+ {

+ initial[i].state = transitions + offs[p->position];

+ initial[i].state_id = p->position;

+ initial[i].tags = NULL;

+ /* Copy the arrays p->tags, and p->params, they are allocated

+ from a tre_mem object. */

+ if (p->tags)

+ {

+ int j;

+ for (j = 0; p->tags[j] >= 0; j++);

+ initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));

+ if (!initial[i].tags)

+ ERROR_EXIT(REG_ESPACE);

+ memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));

+ }

+ initial[i].assertions = p->assertions;

+ i++;

+ }

+ initial[i].state = NULL;

+ tnfa->num_transitions = add;

+ tnfa->final = transitions + offs[tree->lastpos[0].position];

+ tnfa->num_states = parse_ctx.position;

+ tnfa->cflags = cflags;

+ tre_mem_destroy(mem);

+ tre_stack_destroy(stack);

+ xfree(counts);

+ xfree(offs);

+ preg->TRE_REGEX_T_FIELD = (void *)tnfa;

+ return REG_OK;

+ error_exit:

+ /* Free everything that was allocated and return the error code. */

+ tre_mem_destroy(mem);

+ if (stack != NULL)

+ tre_stack_destroy(stack);

+ if (counts != NULL)

+ xfree(counts);

+ if (offs != NULL)

+ xfree(offs);

+ preg->TRE_REGEX_T_FIELD = (void *)tnfa;

+ regfree(preg);

+ return errcode;

+void

+regfree(regex_t *preg)

+ tre_tnfa_t *tnfa;

+ unsigned int i;

+ tre_tnfa_transition_t *trans;

+ tnfa = (void *)preg->TRE_REGEX_T_FIELD;

+ if (!tnfa)

+ return;

+ for (i = 0; i < tnfa->num_transitions; i++)

+ if (tnfa->transitions[i].state)

+ {

+ if (tnfa->transitions[i].tags)

+ xfree(tnfa->transitions[i].tags);

+ if (tnfa->transitions[i].neg_classes)

+ xfree(tnfa->transitions[i].neg_classes);

+ }

+ if (tnfa->transitions)

+ xfree(tnfa->transitions);

+ if (tnfa->initial)

+ {

+ for (trans = tnfa->initial; trans->state; trans++)

+ {

+ if (trans->tags)

+ xfree(trans->tags);

+ }

+ xfree(tnfa->initial);

+ }

+ if (tnfa->submatch_data)

+ {

+ for (i = 0; i < tnfa->num_submatches; i++)

+ if (tnfa->submatch_data[i].parents)

+ xfree(tnfa->submatch_data[i].parents);

+ xfree(tnfa->submatch_data);

+ }

+ if (tnfa->tag_directions)

+ xfree(tnfa->tag_directions);

+ if (tnfa->firstpos_chars)

+ xfree(tnfa->firstpos_chars);

+ if (tnfa->minimal_tags)

+ xfree(tnfa->minimal_tags);

+ xfree(tnfa);

« no previous file with comments | « fusl/src/regex/glob.c ('k') | fusl/src/regex/regerror.c » ('j') | no next file with comments »