fusl/src/regex/regexec.c - Issue 1573973002: Add a "fork" of musl as //fusl.

Side by Side Diff: fusl/src/regex/regexec.c

Issue 1573973002: Add a "fork" of musl as //fusl. (Closed) Base URL: https://github.com/domokit/mojo.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 /*

	2 regexec.c - TRE POSIX compatible matching functions (and more).

	3

	4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>

	5 All rights reserved.

	6

	7 Redistribution and use in source and binary forms, with or without

	8 modification, are permitted provided that the following conditions

	9 are met:

	10

	11 1. Redistributions of source code must retain the above copyright

	12 notice, this list of conditions and the following disclaimer.

	13

	14 2. Redistributions in binary form must reproduce the above copyright

	15 notice, this list of conditions and the following disclaimer in the

	16 documentation and/or other materials provided with the distribution.

	17

	18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS

	19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29

	30 */

	31

	32 #include <stdlib.h>

	33 #include <string.h>

	34 #include <wchar.h>

	35 #include <wctype.h>

	36 #include <limits.h>

	37

	38 #include <regex.h>

	39

	40 #include "tre.h"

	41

	42 #include <assert.h>

	43

	44 static void

	45 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,

	46 const tre_tnfa_t tnfa, int tags, int match_eo);

	47

	48 /***********************************************************************

	49 from tre-match-utils.h

	50 ***********************************************************************/

	51

	52 #define GET_NEXT_WCHAR() do { \

	53 prev_c = next_c; pos += pos_add_next; \

	54 if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \

	55 if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \

	56 else pos_add_next++; \

	57 } \

	58 str_byte += pos_add_next; \

	59 } while (0)

	60

	61 #define IS_WORD_CHAR(c) ((c) == L'_' \|\| tre_isalnum(c))

	62

	63 #define CHECK_ASSERTIONS(assertions) \

	64 (((assertions & ASSERT_AT_BOL) \

	65 && (pos > 0 \|\| reg_notbol) \

	66 && (prev_c != L'\n' \|\| !reg_newline)) \

	67 \|\| ((assertions & ASSERT_AT_EOL) \

	68 && (next_c != L'\0' \|\| reg_noteol) \

	69 && (next_c != L'\n' \|\| !reg_newline)) \

	70 \|\| ((assertions & ASSERT_AT_BOW) \

	71 && (IS_WORD_CHAR(prev_c) \|\| !IS_WORD_CHAR(next_c))) \

	72 \|\| ((assertions & ASSERT_AT_EOW) \

	73 && (!IS_WORD_CHAR(prev_c) \|\| IS_WORD_CHAR(next_c))) \

	74 \|\| ((assertions & ASSERT_AT_WB) \

	75 && (pos != 0 && next_c != L'\0' \

	76 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \

	77 \|\| ((assertions & ASSERT_AT_WB_NEG) \

	78 && (pos == 0 \|\| next_c == L'\0' \

	79 \|\| IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))

	80

	81 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \

	82 (((trans_i->assertions & ASSERT_CHAR_CLASS) \

	83 && !(tnfa->cflags & REG_ICASE) \

	84 && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \

	85 \|\| ((trans_i->assertions & ASSERT_CHAR_CLASS) \

	86 && (tnfa->cflags & REG_ICASE) \

	87 && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \

	88 && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \

	89 \|\| ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \

	90 && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\

	91 tnfa->cflags & REG_ICASE)))

	92

	93

	94

	95

	96 /* Returns 1 if `t1' wins `t2', 0 otherwise. */

	97 static int

	98 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,

	99 int t1, int t2)

	100 {

	101 int i;

	102 for (i = 0; i < num_tags; i++)

	103 {

	104 if (tag_directions[i] == TRE_TAG_MINIMIZE)

	105 {

	106 if (t1[i] < t2[i])

	107 return 1;

	108 if (t1[i] > t2[i])

	109 return 0;

	110 }

	111 else

	112 {

	113 if (t1[i] > t2[i])

	114 return 1;

	115 if (t1[i] < t2[i])

	116 return 0;

	117 }

	118 }

	119 /* assert(0);*/

	120 return 0;

	121 }

	122

	123 static int

	124 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)

	125 {

	126 while (*classes != (tre_ctype_t)0)

	127 if ((!icase && tre_isctype(wc, *classes))

	128 \|\| (icase && (tre_isctype(tre_toupper(wc), *classes)

	129 \|\| tre_isctype(tre_tolower(wc), *classes))))

	130 return 1; /* Match. */

	131 else

	132 classes++;

	133 return 0; /* No match. */

	134 }

	135

	136

	137 /***********************************************************************

	138 from tre-match-parallel.c

	139 ***********************************************************************/

	140

	141 /*

	142 This algorithm searches for matches basically by reading characters

	143 in the searched string one by one, starting at the beginning. All

	144 matching paths in the TNFA are traversed in parallel. When two or

	145 more paths reach the same state, exactly one is chosen according to

	146 tag ordering rules; if returning submatches is not required it does

	147 not matter which path is chosen.

	148

	149 The worst case time required for finding the leftmost and longest

	150 match, or determining that there is no match, is always linearly

	151 dependent on the length of the text being searched.

	152

	153 This algorithm cannot handle TNFAs with back referencing nodes.

	154 See `tre-match-backtrack.c'.

	155 */

	156

	157 typedef struct {

	158 tre_tnfa_transition_t *state;

	159 int *tags;

	160 } tre_tnfa_reach_t;

	161

	162 typedef struct {

	163 int pos;

	164 int **tags;

	165 } tre_reach_pos_t;

	166

	167

	168 static reg_errcode_t

	169 tre_tnfa_run_parallel(const tre_tnfa_t tnfa, const void string,

	170 int *match_tags, int eflags,

	171 int *match_end_ofs)

	172 {

	173 /* State variables required by GET_NEXT_WCHAR. */

	174 tre_char_t prev_c = 0, next_c = 0;

	175 const char *str_byte = string;

	176 int pos = -1;

	177 int pos_add_next = 1;

	178 #ifdef TRE_MBSTATE

	179 mbstate_t mbstate;

	180 #endif /* TRE_MBSTATE */

	181 int reg_notbol = eflags & REG_NOTBOL;

	182 int reg_noteol = eflags & REG_NOTEOL;

	183 int reg_newline = tnfa->cflags & REG_NEWLINE;

	184 reg_errcode_t ret;

	185

	186 char *buf;

	187 tre_tnfa_transition_t *trans_i;

	188 tre_tnfa_reach_t reach, reach_next, reach_i, reach_next_i;

	189 tre_reach_pos_t *reach_pos;

	190 int *tag_i;

	191 int num_tags, i;

	192

	193 int match_eo = -1; /* end offset of match (-1 if no match found yet) */

	194 int new_match = 0;

	195 int *tmp_tags = NULL;

	196 int *tmp_iptr;

	197

	198 #ifdef TRE_MBSTATE

	199 memset(&mbstate, '\0', sizeof(mbstate));

	200 #endif /* TRE_MBSTATE */

	201

	202 if (!match_tags)

	203 num_tags = 0;

	204 else

	205 num_tags = tnfa->num_tags;

	206

	207 /* Allocate memory for temporary data required for matching. This needs to

	208 be done for every matching operation to be thread safe. This allocates

	209 everything in a single large block from the stack frame using alloca()

	210 or with malloc() if alloca is unavailable. */

	211 {

	212 int tbytes, rbytes, pbytes, xbytes, total_bytes;

	213 char *tmp_buf;

	214 /* Compute the length of the block we need. */

	215 tbytes = sizeof(tmp_tags) num_tags;

	216 rbytes = sizeof(reach_next) (tnfa->num_states + 1);

	217 pbytes = sizeof(reach_pos) tnfa->num_states;

	218 xbytes = sizeof(int) * num_tags;

	219 total_bytes =

	220 (sizeof(long) - 1) * 4 /* for alignment paddings */

	221 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;

	222

	223 /* Allocate the memory. */

	224 buf = xmalloc((unsigned)total_bytes);

	225 if (buf == NULL)

	226 return REG_ESPACE;

	227 memset(buf, 0, (size_t)total_bytes);

	228

	229 /* Get the various pointers within tmp_buf (properly aligned). */

	230 tmp_tags = (void *)buf;

	231 tmp_buf = buf + tbytes;

	232 tmp_buf += ALIGN(tmp_buf, long);

	233 reach_next = (void *)tmp_buf;

	234 tmp_buf += rbytes;

	235 tmp_buf += ALIGN(tmp_buf, long);

	236 reach = (void *)tmp_buf;

	237 tmp_buf += rbytes;

	238 tmp_buf += ALIGN(tmp_buf, long);

	239 reach_pos = (void *)tmp_buf;

	240 tmp_buf += pbytes;

	241 tmp_buf += ALIGN(tmp_buf, long);

	242 for (i = 0; i < tnfa->num_states; i++)

	243 {

	244 reach[i].tags = (void *)tmp_buf;

	245 tmp_buf += xbytes;

	246 reach_next[i].tags = (void *)tmp_buf;

	247 tmp_buf += xbytes;

	248 }

	249 }

	250

	251 for (i = 0; i < tnfa->num_states; i++)

	252 reach_pos[i].pos = -1;

	253

	254 GET_NEXT_WCHAR();

	255 pos = 0;

	256

	257 reach_next_i = reach_next;

	258 while (1)

	259 {

	260 /* If no match found yet, add the initial states to `reach_next'. */

	261 if (match_eo < 0)

	262 {

	263 trans_i = tnfa->initial;

	264 while (trans_i->state != NULL)

	265 {

	266 if (reach_pos[trans_i->state_id].pos < pos)

	267 {

	268 if (trans_i->assertions

	269 && CHECK_ASSERTIONS(trans_i->assertions))

	270 {

	271 trans_i++;

	272 continue;

	273 }

	274

	275 reach_next_i->state = trans_i->state;

	276 for (i = 0; i < num_tags; i++)

	277 reach_next_i->tags[i] = -1;

	278 tag_i = trans_i->tags;

	279 if (tag_i)

	280 while (*tag_i >= 0)

	281 {

	282 if (*tag_i < num_tags)

	283 reach_next_i->tags[*tag_i] = pos;

	284 tag_i++;

	285 }

	286 if (reach_next_i->state == tnfa->final)

	287 {

	288 match_eo = pos;

	289 new_match = 1;

	290 for (i = 0; i < num_tags; i++)

	291 match_tags[i] = reach_next_i->tags[i];

	292 }

	293 reach_pos[trans_i->state_id].pos = pos;

	294 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;

	295 reach_next_i++;

	296 }

	297 trans_i++;

	298 }

	299 reach_next_i->state = NULL;

	300 }

	301 else

	302 {

	303 if (num_tags == 0 \|\| reach_next_i == reach_next)

	304 /* We have found a match. */

	305 break;

	306 }

	307

	308 /* Check for end of string. */

	309 if (!next_c) break;

	310

	311 GET_NEXT_WCHAR();

	312

	313 /* Swap `reach' and `reach_next'. */

	314 reach_i = reach;

	315 reach = reach_next;

	316 reach_next = reach_i;

	317

	318 /* For each state in `reach', weed out states that don't fulfill the

	319 minimal matching conditions. */

	320 if (tnfa->num_minimals && new_match)

	321 {

	322 new_match = 0;

	323 reach_next_i = reach_next;

	324 for (reach_i = reach; reach_i->state; reach_i++)

	325 {

	326 int skip = 0;

	327 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)

	328 {

	329 int end = tnfa->minimal_tags[i];

	330 int start = tnfa->minimal_tags[i + 1];

	331 if (end >= num_tags)

	332 {

	333 skip = 1;

	334 break;

	335 }

	336 else if (reach_i->tags[start] == match_tags[start]

	337 && reach_i->tags[end] < match_tags[end])

	338 {

	339 skip = 1;

	340 break;

	341 }

	342 }

	343 if (!skip)

	344 {

	345 reach_next_i->state = reach_i->state;

	346 tmp_iptr = reach_next_i->tags;

	347 reach_next_i->tags = reach_i->tags;

	348 reach_i->tags = tmp_iptr;

	349 reach_next_i++;

	350 }

	351 }

	352 reach_next_i->state = NULL;

	353

	354 /* Swap `reach' and `reach_next'. */

	355 reach_i = reach;

	356 reach = reach_next;

	357 reach_next = reach_i;

	358 }

	359

	360 /* For each state in `reach' see if there is a transition leaving with

	361 the current input symbol to a state not yet in `reach_next', and

	362 add the destination states to `reach_next'. */

	363 reach_next_i = reach_next;

	364 for (reach_i = reach; reach_i->state; reach_i++)

	365 {

	366 for (trans_i = reach_i->state; trans_i->state; trans_i++)

	367 {

	368 /* Does this transition match the input symbol? */

	369 if (trans_i->code_min <= (tre_cint_t)prev_c &&

	370 trans_i->code_max >= (tre_cint_t)prev_c)

	371 {

	372 if (trans_i->assertions

	373 && (CHECK_ASSERTIONS(trans_i->assertions)

	374 \|\| CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))

	375 {

	376 continue;

	377 }

	378

	379 /* Compute the tags after this transition. */

	380 for (i = 0; i < num_tags; i++)

	381 tmp_tags[i] = reach_i->tags[i];

	382 tag_i = trans_i->tags;

	383 if (tag_i != NULL)

	384 while (*tag_i >= 0)

	385 {

	386 if (*tag_i < num_tags)

	387 tmp_tags[*tag_i] = pos;

	388 tag_i++;

	389 }

	390

	391 if (reach_pos[trans_i->state_id].pos < pos)

	392 {

	393 /* Found an unvisited node. */

	394 reach_next_i->state = trans_i->state;

	395 tmp_iptr = reach_next_i->tags;

	396 reach_next_i->tags = tmp_tags;

	397 tmp_tags = tmp_iptr;

	398 reach_pos[trans_i->state_id].pos = pos;

	399 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;

	400

	401 if (reach_next_i->state == tnfa->final

	402 && (match_eo == -1

	403 \|\| (num_tags > 0

	404 && reach_next_i->tags[0] <= match_tags[0])))

	405 {

	406 match_eo = pos;

	407 new_match = 1;

	408 for (i = 0; i < num_tags; i++)

	409 match_tags[i] = reach_next_i->tags[i];

	410 }

	411 reach_next_i++;

	412

	413 }

	414 else

	415 {

	416 assert(reach_pos[trans_i->state_id].pos == pos);

	417 /* Another path has also reached this state. We choose

	418 the winner by examining the tag values for both

	419 paths. */

	420 if (tre_tag_order(num_tags, tnfa->tag_directions,

	421 tmp_tags,

	422 *reach_pos[trans_i->state_id].tags))

	423 {

	424 /* The new path wins. */

	425 tmp_iptr = *reach_pos[trans_i->state_id].tags;

	426 *reach_pos[trans_i->state_id].tags = tmp_tags;

	427 if (trans_i->state == tnfa->final)

	428 {

	429 match_eo = pos;

	430 new_match = 1;

	431 for (i = 0; i < num_tags; i++)

	432 match_tags[i] = tmp_tags[i];

	433 }

	434 tmp_tags = tmp_iptr;

	435 }

	436 }

	437 }

	438 }

	439 }

	440 reach_next_i->state = NULL;

	441 }

	442

	443 *match_end_ofs = match_eo;

	444 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;

	445 error_exit:

	446 xfree(buf);

	447 return ret;

	448 }

	449

	450

	451

	452 /***********************************************************************

	453 from tre-match-backtrack.c

	454 ***********************************************************************/

	455

	456 /*

	457 This matcher is for regexps that use back referencing. Regexp matching

	458 with back referencing is an NP-complete problem on the number of back

	459 references. The easiest way to match them is to use a backtracking

	460 routine which basically goes through all possible paths in the TNFA

	461 and chooses the one which results in the best (leftmost and longest)

	462 match. This can be spectacularly expensive and may run out of stack

	463 space, but there really is no better known generic algorithm. Quoting

	464 Henry Spencer from comp.compilers:

	465 <URL: http://compilers.iecc.com/comparch/article/93-03-102>

	466

	467 POSIX.2 REs require longest match, which is really exciting to

	468 implement since the obsolete ("basic") variant also includes

	469 \<digit>. I haven't found a better way of tackling this than doing

	470 a preliminary match using a DFA (or simulation) on a modified RE

	471 that just replicates subREs for \<digit>, and then doing a

	472 backtracking match to determine whether the subRE matches were

	473 right. This can be rather slow, but I console myself with the

	474 thought that people who use \<digit> deserve very slow execution.

	475 (Pun unintentional but very appropriate.)

	476

	477 */

	478

	479 typedef struct {

	480 int pos;

	481 const char *str_byte;

	482 tre_tnfa_transition_t *state;

	483 int state_id;

	484 int next_c;

	485 int *tags;

	486 #ifdef TRE_MBSTATE

	487 mbstate_t mbstate;

	488 #endif /* TRE_MBSTATE */

	489 } tre_backtrack_item_t;

	490

	491 typedef struct tre_backtrack_struct {

	492 tre_backtrack_item_t item;

	493 struct tre_backtrack_struct *prev;

	494 struct tre_backtrack_struct *next;

	495 } *tre_backtrack_t;

	496

	497 #ifdef TRE_MBSTATE

	498 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)

	499 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate

	500 #else /* !TRE_MBSTATE */

	501 #define BT_STACK_MBSTATE_IN

	502 #define BT_STACK_MBSTATE_OUT

	503 #endif /* !TRE_MBSTATE */

	504

	505 #define tre_bt_mem_new tre_mem_new

	506 #define tre_bt_mem_alloc tre_mem_alloc

	507 #define tre_bt_mem_destroy tre_mem_destroy

	508

	509

	510 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _t ags, _mbstate) \

	511 do \

	512 { \

	513 int i; \

	514 if (!stack->next) \

	515 { \

	516 tre_backtrack_t s; \

	517 s = tre_bt_mem_alloc(mem, sizeof(*s)); \

	518 if (!s) \

	519 { \

	520 tre_bt_mem_destroy(mem); \

	521 if (tags) \

	522 xfree(tags); \

	523 if (pmatch) \

	524 xfree(pmatch); \

	525 if (states_seen) \

	526 xfree(states_seen); \

	527 return REG_ESPACE; \

	528 } \

	529 s->prev = stack; \

	530 s->next = NULL; \

	531 s->item.tags = tre_bt_mem_alloc(mem, \

	532 sizeof(tags) tnfa->num_tags); \

	533 if (!s->item.tags) \

	534 { \

	535 tre_bt_mem_destroy(mem); \

	536 if (tags) \

	537 xfree(tags); \

	538 if (pmatch) \

	539 xfree(pmatch); \

	540 if (states_seen) \

	541 xfree(states_seen); \

	542 return REG_ESPACE; \

	543 } \

	544 stack->next = s; \

	545 stack = s; \

	546 } \

	547 else \

	548 stack = stack->next; \

	549 stack->item.pos = (_pos); \

	550 stack->item.str_byte = (_str_byte); \

	551 stack->item.state = (_state); \

	552 stack->item.state_id = (_state_id); \

	553 stack->item.next_c = (_next_c); \

	554 for (i = 0; i < tnfa->num_tags; i++) \

	555 stack->item.tags[i] = (_tags)[i]; \

	556 BT_STACK_MBSTATE_IN; \

	557 } \

	558 while (0)

	559

	560 #define BT_STACK_POP() \

	561 do \

	562 { \

	563 int i; \

	564 assert(stack->prev); \

	565 pos = stack->item.pos; \

	566 str_byte = stack->item.str_byte; \

	567 state = stack->item.state; \

	568 next_c = stack->item.next_c; \

	569 for (i = 0; i < tnfa->num_tags; i++) \

	570 tags[i] = stack->item.tags[i]; \

	571 BT_STACK_MBSTATE_OUT; \

	572 stack = stack->prev; \

	573 } \

	574 while (0)

	575

	576 #undef MIN

	577 #define MIN(a, b) ((a) <= (b) ? (a) : (b))

	578

	579 static reg_errcode_t

	580 tre_tnfa_run_backtrack(const tre_tnfa_t tnfa, const void string,

	581 int match_tags, int eflags, int match_end_ofs)

	582 {

	583 /* State variables required by GET_NEXT_WCHAR. */

	584 tre_char_t prev_c = 0, next_c = 0;

	585 const char *str_byte = string;

	586 int pos = 0;

	587 int pos_add_next = 1;

	588 #ifdef TRE_MBSTATE

	589 mbstate_t mbstate;

	590 #endif /* TRE_MBSTATE */

	591 int reg_notbol = eflags & REG_NOTBOL;

	592 int reg_noteol = eflags & REG_NOTEOL;

	593 int reg_newline = tnfa->cflags & REG_NEWLINE;

	594

	595 /* These are used to remember the necessary values of the above

	596 variables to return to the position where the current search

	597 started from. */

	598 int next_c_start;

	599 const char *str_byte_start;

	600 int pos_start = -1;

	601 #ifdef TRE_MBSTATE

	602 mbstate_t mbstate_start;

	603 #endif /* TRE_MBSTATE */

	604

	605 /* End offset of best match so far, or -1 if no match found yet. */

	606 int match_eo = -1;

	607 /* Tag arrays. */

	608 int next_tags, tags = NULL;

	609 /* Current TNFA state. */

	610 tre_tnfa_transition_t *state;

	611 int *states_seen = NULL;

	612

	613 /* Memory allocator to for allocating the backtracking stack. */

	614 tre_mem_t mem = tre_bt_mem_new();

	615

	616 /* The backtracking stack. */

	617 tre_backtrack_t stack;

	618

	619 tre_tnfa_transition_t *trans_i;

	620 regmatch_t *pmatch = NULL;

	621 int ret;

	622

	623 #ifdef TRE_MBSTATE

	624 memset(&mbstate, '\0', sizeof(mbstate));

	625 #endif /* TRE_MBSTATE */

	626

	627 if (!mem)

	628 return REG_ESPACE;

	629 stack = tre_bt_mem_alloc(mem, sizeof(*stack));

	630 if (!stack)

	631 {

	632 ret = REG_ESPACE;

	633 goto error_exit;

	634 }

	635 stack->prev = NULL;

	636 stack->next = NULL;

	637

	638 if (tnfa->num_tags)

	639 {

	640 tags = xmalloc(sizeof(tags) tnfa->num_tags);

	641 if (!tags)

	642 {

	643 ret = REG_ESPACE;

	644 goto error_exit;

	645 }

	646 }

	647 if (tnfa->num_submatches)

	648 {

	649 pmatch = xmalloc(sizeof(pmatch) tnfa->num_submatches);

	650 if (!pmatch)

	651 {

	652 ret = REG_ESPACE;

	653 goto error_exit;

	654 }

	655 }

	656 if (tnfa->num_states)

	657 {

	658 states_seen = xmalloc(sizeof(states_seen) tnfa->num_states);

	659 if (!states_seen)

	660 {

	661 ret = REG_ESPACE;

	662 goto error_exit;

	663 }

	664 }

	665

	666 retry:

	667 {

	668 int i;

	669 for (i = 0; i < tnfa->num_tags; i++)

	670 {

	671 tags[i] = -1;

	672 if (match_tags)

	673 match_tags[i] = -1;

	674 }

	675 for (i = 0; i < tnfa->num_states; i++)

	676 states_seen[i] = 0;

	677 }

	678

	679 state = NULL;

	680 pos = pos_start;

	681 GET_NEXT_WCHAR();

	682 pos_start = pos;

	683 next_c_start = next_c;

	684 str_byte_start = str_byte;

	685 #ifdef TRE_MBSTATE

	686 mbstate_start = mbstate;

	687 #endif /* TRE_MBSTATE */

	688

	689 /* Handle initial states. */

	690 next_tags = NULL;

	691 for (trans_i = tnfa->initial; trans_i->state; trans_i++)

	692 {

	693 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))

	694 {

	695 continue;

	696 }

	697 if (state == NULL)

	698 {

	699 /* Start from this state. */

	700 state = trans_i->state;

	701 next_tags = trans_i->tags;

	702 }

	703 else

	704 {

	705 /* Backtrack to this state. */

	706 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,

	707 trans_i->state_id, next_c, tags, mbstate);

	708 {

	709 int *tmp = trans_i->tags;

	710 if (tmp)

	711 while (*tmp >= 0)

	712 stack->item.tags[*tmp++] = pos;

	713 }

	714 }

	715 }

	716

	717 if (next_tags)

	718 for (; *next_tags >= 0; next_tags++)

	719 tags[*next_tags] = pos;

	720

	721

	722 if (state == NULL)

	723 goto backtrack;

	724

	725 while (1)

	726 {

	727 tre_tnfa_transition_t *next_state;

	728 int empty_br_match;

	729

	730 if (state == tnfa->final)

	731 {

	732 if (match_eo < pos

	733 \|\| (match_eo == pos

	734 && match_tags

	735 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,

	736 tags, match_tags)))

	737 {

	738 int i;

	739 /* This match wins the previous match. */

	740 match_eo = pos;

	741 if (match_tags)

	742 for (i = 0; i < tnfa->num_tags; i++)

	743 match_tags[i] = tags[i];

	744 }

	745 /* Our TNFAs never have transitions leaving from the final state,

	746 so we jump right to backtracking. */

	747 goto backtrack;

	748 }

	749

	750 /* Go to the next character in the input string. */

	751 empty_br_match = 0;

	752 trans_i = state;

	753 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)

	754 {

	755 /* This is a back reference state. All transitions leaving from

	756 this state have the same back reference "assertion". Instead

	757 of reading the next character, we match the back reference. */

	758 int so, eo, bt = trans_i->u.backref;

	759 int bt_len;

	760 int result;

	761

	762 /* Get the substring we need to match against. Remember to

	763 turn off REG_NOSUB temporarily. */

	764 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,

	765 tnfa, tags, pos);

	766 so = pmatch[bt].rm_so;

	767 eo = pmatch[bt].rm_eo;

	768 bt_len = eo - so;

	769

	770 result = strncmp((const char*)string + so, str_byte - 1,

	771 (size_t)bt_len);

	772

	773 if (result == 0)

	774 {

	775 /* Back reference matched. Check for infinite loop. */

	776 if (bt_len == 0)

	777 empty_br_match = 1;

	778 if (empty_br_match && states_seen[trans_i->state_id])

	779 {

	780 goto backtrack;

	781 }

	782

	783 states_seen[trans_i->state_id] = empty_br_match;

	784

	785 /* Advance in input string and resync `prev_c', `next_c'

	786 and pos. */

	787 str_byte += bt_len - 1;

	788 pos += bt_len - 1;

	789 GET_NEXT_WCHAR();

	790 }

	791 else

	792 {

	793 goto backtrack;

	794 }

	795 }

	796 else

	797 {

	798 /* Check for end of string. */

	799 if (next_c == L'\0')

	800 goto backtrack;

	801

	802 /* Read the next character. */

	803 GET_NEXT_WCHAR();

	804 }

	805

	806 next_state = NULL;

	807 for (trans_i = state; trans_i->state; trans_i++)

	808 {

	809 if (trans_i->code_min <= (tre_cint_t)prev_c

	810 && trans_i->code_max >= (tre_cint_t)prev_c)

	811 {

	812 if (trans_i->assertions

	813 && (CHECK_ASSERTIONS(trans_i->assertions)

	814 \|\| CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))

	815 {

	816 continue;

	817 }

	818

	819 if (next_state == NULL)

	820 {

	821 /* First matching transition. */

	822 next_state = trans_i->state;

	823 next_tags = trans_i->tags;

	824 }

	825 else

	826 {

	827 /* Second matching transition. We may need to backtrack here

	828 to take this transition instead of the first one, so we

	829 push this transition in the backtracking stack so we can

	830 jump back here if needed. */

	831 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,

	832 trans_i->state_id, next_c, tags, mbstate);

	833 {

	834 int *tmp;

	835 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)

	836 stack->item.tags[*tmp] = pos;

	837 }

	838 #if 0 /* XXX - it's important not to look at all transitions here to keep

	839 the stack small! */

	840 break;

	841 #endif

	842 }

	843 }

	844 }

	845

	846 if (next_state != NULL)

	847 {

	848 /* Matching transitions were found. Take the first one. */

	849 state = next_state;

	850

	851 /* Update the tag values. */

	852 if (next_tags)

	853 while (*next_tags >= 0)

	854 tags[*next_tags++] = pos;

	855 }

	856 else

	857 {

	858 backtrack:

	859 /* A matching transition was not found. Try to backtrack. */

	860 if (stack->prev)

	861 {

	862 if (stack->item.state->assertions & ASSERT_BACKREF)

	863 {

	864 states_seen[stack->item.state_id] = 0;

	865 }

	866

	867 BT_STACK_POP();

	868 }

	869 else if (match_eo < 0)

	870 {

	871 /* Try starting from a later position in the input string. */

	872 /* Check for end of string. */

	873 if (next_c == L'\0')

	874 {

	875 break;

	876 }

	877 next_c = next_c_start;

	878 #ifdef TRE_MBSTATE

	879 mbstate = mbstate_start;

	880 #endif /* TRE_MBSTATE */

	881 str_byte = str_byte_start;

	882 goto retry;

	883 }

	884 else

	885 {

	886 break;

	887 }

	888 }

	889 }

	890

	891 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;

	892 *match_end_ofs = match_eo;

	893

	894 error_exit:

	895 tre_bt_mem_destroy(mem);

	896 #ifndef TRE_USE_ALLOCA

	897 if (tags)

	898 xfree(tags);

	899 if (pmatch)

	900 xfree(pmatch);

	901 if (states_seen)

	902 xfree(states_seen);

	903 #endif /* !TRE_USE_ALLOCA */

	904

	905 return ret;

	906 }

	907

	908 /***********************************************************************

	909 from regexec.c

	910 ***********************************************************************/

	911

	912 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match

	913 endpoint values. */

	914 static void

	915 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,

	916 const tre_tnfa_t tnfa, int tags, int match_eo)

	917 {

	918 tre_submatch_data_t *submatch_data;

	919 unsigned int i, j;

	920 int *parents;

	921

	922 i = 0;

	923 if (match_eo >= 0 && !(cflags & REG_NOSUB))

	924 {

	925 /* Construct submatch offsets from the tags. */

	926 submatch_data = tnfa->submatch_data;

	927 while (i < tnfa->num_submatches && i < nmatch)

	928 {

	929 if (submatch_data[i].so_tag == tnfa->end_tag)

	930 pmatch[i].rm_so = match_eo;

	931 else

	932 pmatch[i].rm_so = tags[submatch_data[i].so_tag];

	933

	934 if (submatch_data[i].eo_tag == tnfa->end_tag)

	935 pmatch[i].rm_eo = match_eo;

	936 else

	937 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];

	938

	939 /* If either of the endpoints were not used, this submatch

	940 was not part of the match. */

	941 if (pmatch[i].rm_so == -1 \|\| pmatch[i].rm_eo == -1)

	942 pmatch[i].rm_so = pmatch[i].rm_eo = -1;

	943

	944 i++;

	945 }

	946 /* Reset all submatches that are not within all of their parent

	947 submatches. */

	948 i = 0;

	949 while (i < tnfa->num_submatches && i < nmatch)

	950 {

	951 if (pmatch[i].rm_eo == -1)

	952 assert(pmatch[i].rm_so == -1);

	953 assert(pmatch[i].rm_so <= pmatch[i].rm_eo);

	954

	955 parents = submatch_data[i].parents;

	956 if (parents != NULL)

	957 for (j = 0; parents[j] >= 0; j++)

	958 {

	959 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so

	960 \|\| pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)

	961 pmatch[i].rm_so = pmatch[i].rm_eo = -1;

	962 }

	963 i++;

	964 }

	965 }

	966

	967 while (i < nmatch)

	968 {

	969 pmatch[i].rm_so = -1;

	970 pmatch[i].rm_eo = -1;

	971 i++;

	972 }

	973 }

	974

	975

	976 /*

	977 Wrapper functions for POSIX compatible regexp matching.

	978 */

	979

	980 int

	981 regexec(const regex_t restrict preg, const char restrict string,

	982 size_t nmatch, regmatch_t pmatch[restrict], int eflags)

	983 {

	984 tre_tnfa_t tnfa = (void )preg->TRE_REGEX_T_FIELD;

	985 reg_errcode_t status;

	986 int *tags = NULL, eo;

	987 if (tnfa->cflags & REG_NOSUB) nmatch = 0;

	988 if (tnfa->num_tags > 0 && nmatch > 0)

	989 {

	990 tags = xmalloc(sizeof(tags) tnfa->num_tags);

	991 if (tags == NULL)

	992 return REG_ESPACE;

	993 }

	994

	995 /* Dispatch to the appropriate matcher. */

	996 if (tnfa->have_backrefs)

	997 {

	998 /* The regex has back references, use the backtracking matcher. */

	999 status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);

	1000 }

	1001 else

	1002 {

	1003 /* Exact matching, no back references, use the parallel matcher. */

	1004 status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);

	1005 }

	1006

	1007 if (status == REG_OK)

	1008 /* A match was found, so fill the submatch registers. */

	1009 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);

	1010 if (tags)

	1011 xfree(tags);

	1012 return status;

	1013 }

OLD	NEW

« no previous file with comments | « fusl/src/regex/regerror.c ('k') | fusl/src/regex/tre.h » ('j') | no next file with comments »