OLD | NEW |
(Empty) | |
| 1 /* |
| 2 regexec.c - TRE POSIX compatible matching functions (and more). |
| 3 |
| 4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> |
| 5 All rights reserved. |
| 6 |
| 7 Redistribution and use in source and binary forms, with or without |
| 8 modification, are permitted provided that the following conditions |
| 9 are met: |
| 10 |
| 11 1. Redistributions of source code must retain the above copyright |
| 12 notice, this list of conditions and the following disclaimer. |
| 13 |
| 14 2. Redistributions in binary form must reproduce the above copyright |
| 15 notice, this list of conditions and the following disclaimer in the |
| 16 documentation and/or other materials provided with the distribution. |
| 17 |
| 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS |
| 19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 |
| 30 */ |
| 31 |
| 32 #include <stdlib.h> |
| 33 #include <string.h> |
| 34 #include <wchar.h> |
| 35 #include <wctype.h> |
| 36 #include <limits.h> |
| 37 |
| 38 #include <regex.h> |
| 39 |
| 40 #include "tre.h" |
| 41 |
| 42 #include <assert.h> |
| 43 |
| 44 static void |
| 45 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, |
| 46 const tre_tnfa_t *tnfa, int *tags, int match_eo); |
| 47 |
| 48 /*********************************************************************** |
| 49 from tre-match-utils.h |
| 50 ***********************************************************************/ |
| 51 |
| 52 #define GET_NEXT_WCHAR() do { \ |
| 53 prev_c = next_c; pos += pos_add_next; \ |
| 54 if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \ |
| 55 if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \ |
| 56 else pos_add_next++; \ |
| 57 } \ |
| 58 str_byte += pos_add_next; \ |
| 59 } while (0) |
| 60 |
| 61 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c)) |
| 62 |
| 63 #define CHECK_ASSERTIONS(assertions) \ |
| 64 (((assertions & ASSERT_AT_BOL) \ |
| 65 && (pos > 0 || reg_notbol) \ |
| 66 && (prev_c != L'\n' || !reg_newline)) \ |
| 67 || ((assertions & ASSERT_AT_EOL) \ |
| 68 && (next_c != L'\0' || reg_noteol) \ |
| 69 && (next_c != L'\n' || !reg_newline)) \ |
| 70 || ((assertions & ASSERT_AT_BOW) \ |
| 71 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ |
| 72 || ((assertions & ASSERT_AT_EOW) \ |
| 73 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ |
| 74 || ((assertions & ASSERT_AT_WB) \ |
| 75 && (pos != 0 && next_c != L'\0' \ |
| 76 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ |
| 77 || ((assertions & ASSERT_AT_WB_NEG) \ |
| 78 && (pos == 0 || next_c == L'\0' \ |
| 79 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) |
| 80 |
| 81 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ |
| 82 (((trans_i->assertions & ASSERT_CHAR_CLASS) \ |
| 83 && !(tnfa->cflags & REG_ICASE) \ |
| 84 && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \ |
| 85 || ((trans_i->assertions & ASSERT_CHAR_CLASS) \ |
| 86 && (tnfa->cflags & REG_ICASE) \ |
| 87 && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ |
| 88 && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ |
| 89 || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ |
| 90 && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ |
| 91 tnfa->cflags & REG_ICASE))) |
| 92 |
| 93 |
| 94 |
| 95 |
| 96 /* Returns 1 if `t1' wins `t2', 0 otherwise. */ |
| 97 static int |
| 98 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, |
| 99 int *t1, int *t2) |
| 100 { |
| 101 int i; |
| 102 for (i = 0; i < num_tags; i++) |
| 103 { |
| 104 if (tag_directions[i] == TRE_TAG_MINIMIZE) |
| 105 { |
| 106 if (t1[i] < t2[i]) |
| 107 return 1; |
| 108 if (t1[i] > t2[i]) |
| 109 return 0; |
| 110 } |
| 111 else |
| 112 { |
| 113 if (t1[i] > t2[i]) |
| 114 return 1; |
| 115 if (t1[i] < t2[i]) |
| 116 return 0; |
| 117 } |
| 118 } |
| 119 /* assert(0);*/ |
| 120 return 0; |
| 121 } |
| 122 |
| 123 static int |
| 124 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase) |
| 125 { |
| 126 while (*classes != (tre_ctype_t)0) |
| 127 if ((!icase && tre_isctype(wc, *classes)) |
| 128 || (icase && (tre_isctype(tre_toupper(wc), *classes) |
| 129 || tre_isctype(tre_tolower(wc), *classes)))) |
| 130 return 1; /* Match. */ |
| 131 else |
| 132 classes++; |
| 133 return 0; /* No match. */ |
| 134 } |
| 135 |
| 136 |
| 137 /*********************************************************************** |
| 138 from tre-match-parallel.c |
| 139 ***********************************************************************/ |
| 140 |
| 141 /* |
| 142 This algorithm searches for matches basically by reading characters |
| 143 in the searched string one by one, starting at the beginning. All |
| 144 matching paths in the TNFA are traversed in parallel. When two or |
| 145 more paths reach the same state, exactly one is chosen according to |
| 146 tag ordering rules; if returning submatches is not required it does |
| 147 not matter which path is chosen. |
| 148 |
| 149 The worst case time required for finding the leftmost and longest |
| 150 match, or determining that there is no match, is always linearly |
| 151 dependent on the length of the text being searched. |
| 152 |
| 153 This algorithm cannot handle TNFAs with back referencing nodes. |
| 154 See `tre-match-backtrack.c'. |
| 155 */ |
| 156 |
| 157 typedef struct { |
| 158 tre_tnfa_transition_t *state; |
| 159 int *tags; |
| 160 } tre_tnfa_reach_t; |
| 161 |
| 162 typedef struct { |
| 163 int pos; |
| 164 int **tags; |
| 165 } tre_reach_pos_t; |
| 166 |
| 167 |
| 168 static reg_errcode_t |
| 169 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, |
| 170 int *match_tags, int eflags, |
| 171 int *match_end_ofs) |
| 172 { |
| 173 /* State variables required by GET_NEXT_WCHAR. */ |
| 174 tre_char_t prev_c = 0, next_c = 0; |
| 175 const char *str_byte = string; |
| 176 int pos = -1; |
| 177 int pos_add_next = 1; |
| 178 #ifdef TRE_MBSTATE |
| 179 mbstate_t mbstate; |
| 180 #endif /* TRE_MBSTATE */ |
| 181 int reg_notbol = eflags & REG_NOTBOL; |
| 182 int reg_noteol = eflags & REG_NOTEOL; |
| 183 int reg_newline = tnfa->cflags & REG_NEWLINE; |
| 184 reg_errcode_t ret; |
| 185 |
| 186 char *buf; |
| 187 tre_tnfa_transition_t *trans_i; |
| 188 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; |
| 189 tre_reach_pos_t *reach_pos; |
| 190 int *tag_i; |
| 191 int num_tags, i; |
| 192 |
| 193 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ |
| 194 int new_match = 0; |
| 195 int *tmp_tags = NULL; |
| 196 int *tmp_iptr; |
| 197 |
| 198 #ifdef TRE_MBSTATE |
| 199 memset(&mbstate, '\0', sizeof(mbstate)); |
| 200 #endif /* TRE_MBSTATE */ |
| 201 |
| 202 if (!match_tags) |
| 203 num_tags = 0; |
| 204 else |
| 205 num_tags = tnfa->num_tags; |
| 206 |
| 207 /* Allocate memory for temporary data required for matching. This needs to |
| 208 be done for every matching operation to be thread safe. This allocates |
| 209 everything in a single large block from the stack frame using alloca() |
| 210 or with malloc() if alloca is unavailable. */ |
| 211 { |
| 212 int tbytes, rbytes, pbytes, xbytes, total_bytes; |
| 213 char *tmp_buf; |
| 214 /* Compute the length of the block we need. */ |
| 215 tbytes = sizeof(*tmp_tags) * num_tags; |
| 216 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); |
| 217 pbytes = sizeof(*reach_pos) * tnfa->num_states; |
| 218 xbytes = sizeof(int) * num_tags; |
| 219 total_bytes = |
| 220 (sizeof(long) - 1) * 4 /* for alignment paddings */ |
| 221 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes; |
| 222 |
| 223 /* Allocate the memory. */ |
| 224 buf = xmalloc((unsigned)total_bytes); |
| 225 if (buf == NULL) |
| 226 return REG_ESPACE; |
| 227 memset(buf, 0, (size_t)total_bytes); |
| 228 |
| 229 /* Get the various pointers within tmp_buf (properly aligned). */ |
| 230 tmp_tags = (void *)buf; |
| 231 tmp_buf = buf + tbytes; |
| 232 tmp_buf += ALIGN(tmp_buf, long); |
| 233 reach_next = (void *)tmp_buf; |
| 234 tmp_buf += rbytes; |
| 235 tmp_buf += ALIGN(tmp_buf, long); |
| 236 reach = (void *)tmp_buf; |
| 237 tmp_buf += rbytes; |
| 238 tmp_buf += ALIGN(tmp_buf, long); |
| 239 reach_pos = (void *)tmp_buf; |
| 240 tmp_buf += pbytes; |
| 241 tmp_buf += ALIGN(tmp_buf, long); |
| 242 for (i = 0; i < tnfa->num_states; i++) |
| 243 { |
| 244 reach[i].tags = (void *)tmp_buf; |
| 245 tmp_buf += xbytes; |
| 246 reach_next[i].tags = (void *)tmp_buf; |
| 247 tmp_buf += xbytes; |
| 248 } |
| 249 } |
| 250 |
| 251 for (i = 0; i < tnfa->num_states; i++) |
| 252 reach_pos[i].pos = -1; |
| 253 |
| 254 GET_NEXT_WCHAR(); |
| 255 pos = 0; |
| 256 |
| 257 reach_next_i = reach_next; |
| 258 while (1) |
| 259 { |
| 260 /* If no match found yet, add the initial states to `reach_next'. */ |
| 261 if (match_eo < 0) |
| 262 { |
| 263 trans_i = tnfa->initial; |
| 264 while (trans_i->state != NULL) |
| 265 { |
| 266 if (reach_pos[trans_i->state_id].pos < pos) |
| 267 { |
| 268 if (trans_i->assertions |
| 269 && CHECK_ASSERTIONS(trans_i->assertions)) |
| 270 { |
| 271 trans_i++; |
| 272 continue; |
| 273 } |
| 274 |
| 275 reach_next_i->state = trans_i->state; |
| 276 for (i = 0; i < num_tags; i++) |
| 277 reach_next_i->tags[i] = -1; |
| 278 tag_i = trans_i->tags; |
| 279 if (tag_i) |
| 280 while (*tag_i >= 0) |
| 281 { |
| 282 if (*tag_i < num_tags) |
| 283 reach_next_i->tags[*tag_i] = pos; |
| 284 tag_i++; |
| 285 } |
| 286 if (reach_next_i->state == tnfa->final) |
| 287 { |
| 288 match_eo = pos; |
| 289 new_match = 1; |
| 290 for (i = 0; i < num_tags; i++) |
| 291 match_tags[i] = reach_next_i->tags[i]; |
| 292 } |
| 293 reach_pos[trans_i->state_id].pos = pos; |
| 294 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; |
| 295 reach_next_i++; |
| 296 } |
| 297 trans_i++; |
| 298 } |
| 299 reach_next_i->state = NULL; |
| 300 } |
| 301 else |
| 302 { |
| 303 if (num_tags == 0 || reach_next_i == reach_next) |
| 304 /* We have found a match. */ |
| 305 break; |
| 306 } |
| 307 |
| 308 /* Check for end of string. */ |
| 309 if (!next_c) break; |
| 310 |
| 311 GET_NEXT_WCHAR(); |
| 312 |
| 313 /* Swap `reach' and `reach_next'. */ |
| 314 reach_i = reach; |
| 315 reach = reach_next; |
| 316 reach_next = reach_i; |
| 317 |
| 318 /* For each state in `reach', weed out states that don't fulfill the |
| 319 minimal matching conditions. */ |
| 320 if (tnfa->num_minimals && new_match) |
| 321 { |
| 322 new_match = 0; |
| 323 reach_next_i = reach_next; |
| 324 for (reach_i = reach; reach_i->state; reach_i++) |
| 325 { |
| 326 int skip = 0; |
| 327 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) |
| 328 { |
| 329 int end = tnfa->minimal_tags[i]; |
| 330 int start = tnfa->minimal_tags[i + 1]; |
| 331 if (end >= num_tags) |
| 332 { |
| 333 skip = 1; |
| 334 break; |
| 335 } |
| 336 else if (reach_i->tags[start] == match_tags[start] |
| 337 && reach_i->tags[end] < match_tags[end]) |
| 338 { |
| 339 skip = 1; |
| 340 break; |
| 341 } |
| 342 } |
| 343 if (!skip) |
| 344 { |
| 345 reach_next_i->state = reach_i->state; |
| 346 tmp_iptr = reach_next_i->tags; |
| 347 reach_next_i->tags = reach_i->tags; |
| 348 reach_i->tags = tmp_iptr; |
| 349 reach_next_i++; |
| 350 } |
| 351 } |
| 352 reach_next_i->state = NULL; |
| 353 |
| 354 /* Swap `reach' and `reach_next'. */ |
| 355 reach_i = reach; |
| 356 reach = reach_next; |
| 357 reach_next = reach_i; |
| 358 } |
| 359 |
| 360 /* For each state in `reach' see if there is a transition leaving with |
| 361 the current input symbol to a state not yet in `reach_next', and |
| 362 add the destination states to `reach_next'. */ |
| 363 reach_next_i = reach_next; |
| 364 for (reach_i = reach; reach_i->state; reach_i++) |
| 365 { |
| 366 for (trans_i = reach_i->state; trans_i->state; trans_i++) |
| 367 { |
| 368 /* Does this transition match the input symbol? */ |
| 369 if (trans_i->code_min <= (tre_cint_t)prev_c && |
| 370 trans_i->code_max >= (tre_cint_t)prev_c) |
| 371 { |
| 372 if (trans_i->assertions |
| 373 && (CHECK_ASSERTIONS(trans_i->assertions) |
| 374 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) |
| 375 { |
| 376 continue; |
| 377 } |
| 378 |
| 379 /* Compute the tags after this transition. */ |
| 380 for (i = 0; i < num_tags; i++) |
| 381 tmp_tags[i] = reach_i->tags[i]; |
| 382 tag_i = trans_i->tags; |
| 383 if (tag_i != NULL) |
| 384 while (*tag_i >= 0) |
| 385 { |
| 386 if (*tag_i < num_tags) |
| 387 tmp_tags[*tag_i] = pos; |
| 388 tag_i++; |
| 389 } |
| 390 |
| 391 if (reach_pos[trans_i->state_id].pos < pos) |
| 392 { |
| 393 /* Found an unvisited node. */ |
| 394 reach_next_i->state = trans_i->state; |
| 395 tmp_iptr = reach_next_i->tags; |
| 396 reach_next_i->tags = tmp_tags; |
| 397 tmp_tags = tmp_iptr; |
| 398 reach_pos[trans_i->state_id].pos = pos; |
| 399 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; |
| 400 |
| 401 if (reach_next_i->state == tnfa->final |
| 402 && (match_eo == -1 |
| 403 || (num_tags > 0 |
| 404 && reach_next_i->tags[0] <= match_tags[0]))) |
| 405 { |
| 406 match_eo = pos; |
| 407 new_match = 1; |
| 408 for (i = 0; i < num_tags; i++) |
| 409 match_tags[i] = reach_next_i->tags[i]; |
| 410 } |
| 411 reach_next_i++; |
| 412 |
| 413 } |
| 414 else |
| 415 { |
| 416 assert(reach_pos[trans_i->state_id].pos == pos); |
| 417 /* Another path has also reached this state. We choose |
| 418 the winner by examining the tag values for both |
| 419 paths. */ |
| 420 if (tre_tag_order(num_tags, tnfa->tag_directions, |
| 421 tmp_tags, |
| 422 *reach_pos[trans_i->state_id].tags)) |
| 423 { |
| 424 /* The new path wins. */ |
| 425 tmp_iptr = *reach_pos[trans_i->state_id].tags; |
| 426 *reach_pos[trans_i->state_id].tags = tmp_tags; |
| 427 if (trans_i->state == tnfa->final) |
| 428 { |
| 429 match_eo = pos; |
| 430 new_match = 1; |
| 431 for (i = 0; i < num_tags; i++) |
| 432 match_tags[i] = tmp_tags[i]; |
| 433 } |
| 434 tmp_tags = tmp_iptr; |
| 435 } |
| 436 } |
| 437 } |
| 438 } |
| 439 } |
| 440 reach_next_i->state = NULL; |
| 441 } |
| 442 |
| 443 *match_end_ofs = match_eo; |
| 444 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; |
| 445 error_exit: |
| 446 xfree(buf); |
| 447 return ret; |
| 448 } |
| 449 |
| 450 |
| 451 |
| 452 /*********************************************************************** |
| 453 from tre-match-backtrack.c |
| 454 ***********************************************************************/ |
| 455 |
| 456 /* |
| 457 This matcher is for regexps that use back referencing. Regexp matching |
| 458 with back referencing is an NP-complete problem on the number of back |
| 459 references. The easiest way to match them is to use a backtracking |
| 460 routine which basically goes through all possible paths in the TNFA |
| 461 and chooses the one which results in the best (leftmost and longest) |
| 462 match. This can be spectacularly expensive and may run out of stack |
| 463 space, but there really is no better known generic algorithm. Quoting |
| 464 Henry Spencer from comp.compilers: |
| 465 <URL: http://compilers.iecc.com/comparch/article/93-03-102> |
| 466 |
| 467 POSIX.2 REs require longest match, which is really exciting to |
| 468 implement since the obsolete ("basic") variant also includes |
| 469 \<digit>. I haven't found a better way of tackling this than doing |
| 470 a preliminary match using a DFA (or simulation) on a modified RE |
| 471 that just replicates subREs for \<digit>, and then doing a |
| 472 backtracking match to determine whether the subRE matches were |
| 473 right. This can be rather slow, but I console myself with the |
| 474 thought that people who use \<digit> deserve very slow execution. |
| 475 (Pun unintentional but very appropriate.) |
| 476 |
| 477 */ |
| 478 |
| 479 typedef struct { |
| 480 int pos; |
| 481 const char *str_byte; |
| 482 tre_tnfa_transition_t *state; |
| 483 int state_id; |
| 484 int next_c; |
| 485 int *tags; |
| 486 #ifdef TRE_MBSTATE |
| 487 mbstate_t mbstate; |
| 488 #endif /* TRE_MBSTATE */ |
| 489 } tre_backtrack_item_t; |
| 490 |
| 491 typedef struct tre_backtrack_struct { |
| 492 tre_backtrack_item_t item; |
| 493 struct tre_backtrack_struct *prev; |
| 494 struct tre_backtrack_struct *next; |
| 495 } *tre_backtrack_t; |
| 496 |
| 497 #ifdef TRE_MBSTATE |
| 498 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate) |
| 499 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate |
| 500 #else /* !TRE_MBSTATE */ |
| 501 #define BT_STACK_MBSTATE_IN |
| 502 #define BT_STACK_MBSTATE_OUT |
| 503 #endif /* !TRE_MBSTATE */ |
| 504 |
| 505 #define tre_bt_mem_new tre_mem_new |
| 506 #define tre_bt_mem_alloc tre_mem_alloc |
| 507 #define tre_bt_mem_destroy tre_mem_destroy |
| 508 |
| 509 |
| 510 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _t
ags, _mbstate) \ |
| 511 do \ |
| 512 { \ |
| 513 int i; \ |
| 514 if (!stack->next) \ |
| 515 { \ |
| 516 tre_backtrack_t s; \ |
| 517 s = tre_bt_mem_alloc(mem, sizeof(*s)); \ |
| 518 if (!s) \ |
| 519 { \ |
| 520 tre_bt_mem_destroy(mem); \ |
| 521 if (tags) \ |
| 522 xfree(tags); \ |
| 523 if (pmatch) \ |
| 524 xfree(pmatch); \ |
| 525 if (states_seen) \ |
| 526 xfree(states_seen); \ |
| 527 return REG_ESPACE; \ |
| 528 } \ |
| 529 s->prev = stack; \ |
| 530 s->next = NULL; \ |
| 531 s->item.tags = tre_bt_mem_alloc(mem, \ |
| 532 sizeof(*tags) * tnfa->num_tags); \ |
| 533 if (!s->item.tags) \ |
| 534 { \ |
| 535 tre_bt_mem_destroy(mem); \ |
| 536 if (tags) \ |
| 537 xfree(tags); \ |
| 538 if (pmatch) \ |
| 539 xfree(pmatch); \ |
| 540 if (states_seen) \ |
| 541 xfree(states_seen); \ |
| 542 return REG_ESPACE; \ |
| 543 } \ |
| 544 stack->next = s; \ |
| 545 stack = s; \ |
| 546 } \ |
| 547 else \ |
| 548 stack = stack->next; \ |
| 549 stack->item.pos = (_pos); \ |
| 550 stack->item.str_byte = (_str_byte); \ |
| 551 stack->item.state = (_state); \ |
| 552 stack->item.state_id = (_state_id); \ |
| 553 stack->item.next_c = (_next_c); \ |
| 554 for (i = 0; i < tnfa->num_tags; i++) \ |
| 555 stack->item.tags[i] = (_tags)[i]; \ |
| 556 BT_STACK_MBSTATE_IN; \ |
| 557 } \ |
| 558 while (0) |
| 559 |
| 560 #define BT_STACK_POP() \ |
| 561 do \ |
| 562 { \ |
| 563 int i; \ |
| 564 assert(stack->prev); \ |
| 565 pos = stack->item.pos; \ |
| 566 str_byte = stack->item.str_byte; \ |
| 567 state = stack->item.state; \ |
| 568 next_c = stack->item.next_c; \ |
| 569 for (i = 0; i < tnfa->num_tags; i++) \ |
| 570 tags[i] = stack->item.tags[i]; \ |
| 571 BT_STACK_MBSTATE_OUT; \ |
| 572 stack = stack->prev; \ |
| 573 } \ |
| 574 while (0) |
| 575 |
| 576 #undef MIN |
| 577 #define MIN(a, b) ((a) <= (b) ? (a) : (b)) |
| 578 |
| 579 static reg_errcode_t |
| 580 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, |
| 581 int *match_tags, int eflags, int *match_end_ofs) |
| 582 { |
| 583 /* State variables required by GET_NEXT_WCHAR. */ |
| 584 tre_char_t prev_c = 0, next_c = 0; |
| 585 const char *str_byte = string; |
| 586 int pos = 0; |
| 587 int pos_add_next = 1; |
| 588 #ifdef TRE_MBSTATE |
| 589 mbstate_t mbstate; |
| 590 #endif /* TRE_MBSTATE */ |
| 591 int reg_notbol = eflags & REG_NOTBOL; |
| 592 int reg_noteol = eflags & REG_NOTEOL; |
| 593 int reg_newline = tnfa->cflags & REG_NEWLINE; |
| 594 |
| 595 /* These are used to remember the necessary values of the above |
| 596 variables to return to the position where the current search |
| 597 started from. */ |
| 598 int next_c_start; |
| 599 const char *str_byte_start; |
| 600 int pos_start = -1; |
| 601 #ifdef TRE_MBSTATE |
| 602 mbstate_t mbstate_start; |
| 603 #endif /* TRE_MBSTATE */ |
| 604 |
| 605 /* End offset of best match so far, or -1 if no match found yet. */ |
| 606 int match_eo = -1; |
| 607 /* Tag arrays. */ |
| 608 int *next_tags, *tags = NULL; |
| 609 /* Current TNFA state. */ |
| 610 tre_tnfa_transition_t *state; |
| 611 int *states_seen = NULL; |
| 612 |
| 613 /* Memory allocator to for allocating the backtracking stack. */ |
| 614 tre_mem_t mem = tre_bt_mem_new(); |
| 615 |
| 616 /* The backtracking stack. */ |
| 617 tre_backtrack_t stack; |
| 618 |
| 619 tre_tnfa_transition_t *trans_i; |
| 620 regmatch_t *pmatch = NULL; |
| 621 int ret; |
| 622 |
| 623 #ifdef TRE_MBSTATE |
| 624 memset(&mbstate, '\0', sizeof(mbstate)); |
| 625 #endif /* TRE_MBSTATE */ |
| 626 |
| 627 if (!mem) |
| 628 return REG_ESPACE; |
| 629 stack = tre_bt_mem_alloc(mem, sizeof(*stack)); |
| 630 if (!stack) |
| 631 { |
| 632 ret = REG_ESPACE; |
| 633 goto error_exit; |
| 634 } |
| 635 stack->prev = NULL; |
| 636 stack->next = NULL; |
| 637 |
| 638 if (tnfa->num_tags) |
| 639 { |
| 640 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); |
| 641 if (!tags) |
| 642 { |
| 643 ret = REG_ESPACE; |
| 644 goto error_exit; |
| 645 } |
| 646 } |
| 647 if (tnfa->num_submatches) |
| 648 { |
| 649 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); |
| 650 if (!pmatch) |
| 651 { |
| 652 ret = REG_ESPACE; |
| 653 goto error_exit; |
| 654 } |
| 655 } |
| 656 if (tnfa->num_states) |
| 657 { |
| 658 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); |
| 659 if (!states_seen) |
| 660 { |
| 661 ret = REG_ESPACE; |
| 662 goto error_exit; |
| 663 } |
| 664 } |
| 665 |
| 666 retry: |
| 667 { |
| 668 int i; |
| 669 for (i = 0; i < tnfa->num_tags; i++) |
| 670 { |
| 671 tags[i] = -1; |
| 672 if (match_tags) |
| 673 match_tags[i] = -1; |
| 674 } |
| 675 for (i = 0; i < tnfa->num_states; i++) |
| 676 states_seen[i] = 0; |
| 677 } |
| 678 |
| 679 state = NULL; |
| 680 pos = pos_start; |
| 681 GET_NEXT_WCHAR(); |
| 682 pos_start = pos; |
| 683 next_c_start = next_c; |
| 684 str_byte_start = str_byte; |
| 685 #ifdef TRE_MBSTATE |
| 686 mbstate_start = mbstate; |
| 687 #endif /* TRE_MBSTATE */ |
| 688 |
| 689 /* Handle initial states. */ |
| 690 next_tags = NULL; |
| 691 for (trans_i = tnfa->initial; trans_i->state; trans_i++) |
| 692 { |
| 693 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) |
| 694 { |
| 695 continue; |
| 696 } |
| 697 if (state == NULL) |
| 698 { |
| 699 /* Start from this state. */ |
| 700 state = trans_i->state; |
| 701 next_tags = trans_i->tags; |
| 702 } |
| 703 else |
| 704 { |
| 705 /* Backtrack to this state. */ |
| 706 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, |
| 707 trans_i->state_id, next_c, tags, mbstate); |
| 708 { |
| 709 int *tmp = trans_i->tags; |
| 710 if (tmp) |
| 711 while (*tmp >= 0) |
| 712 stack->item.tags[*tmp++] = pos; |
| 713 } |
| 714 } |
| 715 } |
| 716 |
| 717 if (next_tags) |
| 718 for (; *next_tags >= 0; next_tags++) |
| 719 tags[*next_tags] = pos; |
| 720 |
| 721 |
| 722 if (state == NULL) |
| 723 goto backtrack; |
| 724 |
| 725 while (1) |
| 726 { |
| 727 tre_tnfa_transition_t *next_state; |
| 728 int empty_br_match; |
| 729 |
| 730 if (state == tnfa->final) |
| 731 { |
| 732 if (match_eo < pos |
| 733 || (match_eo == pos |
| 734 && match_tags |
| 735 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, |
| 736 tags, match_tags))) |
| 737 { |
| 738 int i; |
| 739 /* This match wins the previous match. */ |
| 740 match_eo = pos; |
| 741 if (match_tags) |
| 742 for (i = 0; i < tnfa->num_tags; i++) |
| 743 match_tags[i] = tags[i]; |
| 744 } |
| 745 /* Our TNFAs never have transitions leaving from the final state, |
| 746 so we jump right to backtracking. */ |
| 747 goto backtrack; |
| 748 } |
| 749 |
| 750 /* Go to the next character in the input string. */ |
| 751 empty_br_match = 0; |
| 752 trans_i = state; |
| 753 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) |
| 754 { |
| 755 /* This is a back reference state. All transitions leaving from |
| 756 this state have the same back reference "assertion". Instead |
| 757 of reading the next character, we match the back reference. */ |
| 758 int so, eo, bt = trans_i->u.backref; |
| 759 int bt_len; |
| 760 int result; |
| 761 |
| 762 /* Get the substring we need to match against. Remember to |
| 763 turn off REG_NOSUB temporarily. */ |
| 764 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, |
| 765 tnfa, tags, pos); |
| 766 so = pmatch[bt].rm_so; |
| 767 eo = pmatch[bt].rm_eo; |
| 768 bt_len = eo - so; |
| 769 |
| 770 result = strncmp((const char*)string + so, str_byte - 1, |
| 771 (size_t)bt_len); |
| 772 |
| 773 if (result == 0) |
| 774 { |
| 775 /* Back reference matched. Check for infinite loop. */ |
| 776 if (bt_len == 0) |
| 777 empty_br_match = 1; |
| 778 if (empty_br_match && states_seen[trans_i->state_id]) |
| 779 { |
| 780 goto backtrack; |
| 781 } |
| 782 |
| 783 states_seen[trans_i->state_id] = empty_br_match; |
| 784 |
| 785 /* Advance in input string and resync `prev_c', `next_c' |
| 786 and pos. */ |
| 787 str_byte += bt_len - 1; |
| 788 pos += bt_len - 1; |
| 789 GET_NEXT_WCHAR(); |
| 790 } |
| 791 else |
| 792 { |
| 793 goto backtrack; |
| 794 } |
| 795 } |
| 796 else |
| 797 { |
| 798 /* Check for end of string. */ |
| 799 if (next_c == L'\0') |
| 800 goto backtrack; |
| 801 |
| 802 /* Read the next character. */ |
| 803 GET_NEXT_WCHAR(); |
| 804 } |
| 805 |
| 806 next_state = NULL; |
| 807 for (trans_i = state; trans_i->state; trans_i++) |
| 808 { |
| 809 if (trans_i->code_min <= (tre_cint_t)prev_c |
| 810 && trans_i->code_max >= (tre_cint_t)prev_c) |
| 811 { |
| 812 if (trans_i->assertions |
| 813 && (CHECK_ASSERTIONS(trans_i->assertions) |
| 814 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) |
| 815 { |
| 816 continue; |
| 817 } |
| 818 |
| 819 if (next_state == NULL) |
| 820 { |
| 821 /* First matching transition. */ |
| 822 next_state = trans_i->state; |
| 823 next_tags = trans_i->tags; |
| 824 } |
| 825 else |
| 826 { |
| 827 /* Second matching transition. We may need to backtrack here |
| 828 to take this transition instead of the first one, so we |
| 829 push this transition in the backtracking stack so we can |
| 830 jump back here if needed. */ |
| 831 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state, |
| 832 trans_i->state_id, next_c, tags, mbstate); |
| 833 { |
| 834 int *tmp; |
| 835 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) |
| 836 stack->item.tags[*tmp] = pos; |
| 837 } |
| 838 #if 0 /* XXX - it's important not to look at all transitions here to keep |
| 839 the stack small! */ |
| 840 break; |
| 841 #endif |
| 842 } |
| 843 } |
| 844 } |
| 845 |
| 846 if (next_state != NULL) |
| 847 { |
| 848 /* Matching transitions were found. Take the first one. */ |
| 849 state = next_state; |
| 850 |
| 851 /* Update the tag values. */ |
| 852 if (next_tags) |
| 853 while (*next_tags >= 0) |
| 854 tags[*next_tags++] = pos; |
| 855 } |
| 856 else |
| 857 { |
| 858 backtrack: |
| 859 /* A matching transition was not found. Try to backtrack. */ |
| 860 if (stack->prev) |
| 861 { |
| 862 if (stack->item.state->assertions & ASSERT_BACKREF) |
| 863 { |
| 864 states_seen[stack->item.state_id] = 0; |
| 865 } |
| 866 |
| 867 BT_STACK_POP(); |
| 868 } |
| 869 else if (match_eo < 0) |
| 870 { |
| 871 /* Try starting from a later position in the input string. */ |
| 872 /* Check for end of string. */ |
| 873 if (next_c == L'\0') |
| 874 { |
| 875 break; |
| 876 } |
| 877 next_c = next_c_start; |
| 878 #ifdef TRE_MBSTATE |
| 879 mbstate = mbstate_start; |
| 880 #endif /* TRE_MBSTATE */ |
| 881 str_byte = str_byte_start; |
| 882 goto retry; |
| 883 } |
| 884 else |
| 885 { |
| 886 break; |
| 887 } |
| 888 } |
| 889 } |
| 890 |
| 891 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; |
| 892 *match_end_ofs = match_eo; |
| 893 |
| 894 error_exit: |
| 895 tre_bt_mem_destroy(mem); |
| 896 #ifndef TRE_USE_ALLOCA |
| 897 if (tags) |
| 898 xfree(tags); |
| 899 if (pmatch) |
| 900 xfree(pmatch); |
| 901 if (states_seen) |
| 902 xfree(states_seen); |
| 903 #endif /* !TRE_USE_ALLOCA */ |
| 904 |
| 905 return ret; |
| 906 } |
| 907 |
| 908 /*********************************************************************** |
| 909 from regexec.c |
| 910 ***********************************************************************/ |
| 911 |
| 912 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match |
| 913 endpoint values. */ |
| 914 static void |
| 915 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, |
| 916 const tre_tnfa_t *tnfa, int *tags, int match_eo) |
| 917 { |
| 918 tre_submatch_data_t *submatch_data; |
| 919 unsigned int i, j; |
| 920 int *parents; |
| 921 |
| 922 i = 0; |
| 923 if (match_eo >= 0 && !(cflags & REG_NOSUB)) |
| 924 { |
| 925 /* Construct submatch offsets from the tags. */ |
| 926 submatch_data = tnfa->submatch_data; |
| 927 while (i < tnfa->num_submatches && i < nmatch) |
| 928 { |
| 929 if (submatch_data[i].so_tag == tnfa->end_tag) |
| 930 pmatch[i].rm_so = match_eo; |
| 931 else |
| 932 pmatch[i].rm_so = tags[submatch_data[i].so_tag]; |
| 933 |
| 934 if (submatch_data[i].eo_tag == tnfa->end_tag) |
| 935 pmatch[i].rm_eo = match_eo; |
| 936 else |
| 937 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; |
| 938 |
| 939 /* If either of the endpoints were not used, this submatch |
| 940 was not part of the match. */ |
| 941 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) |
| 942 pmatch[i].rm_so = pmatch[i].rm_eo = -1; |
| 943 |
| 944 i++; |
| 945 } |
| 946 /* Reset all submatches that are not within all of their parent |
| 947 submatches. */ |
| 948 i = 0; |
| 949 while (i < tnfa->num_submatches && i < nmatch) |
| 950 { |
| 951 if (pmatch[i].rm_eo == -1) |
| 952 assert(pmatch[i].rm_so == -1); |
| 953 assert(pmatch[i].rm_so <= pmatch[i].rm_eo); |
| 954 |
| 955 parents = submatch_data[i].parents; |
| 956 if (parents != NULL) |
| 957 for (j = 0; parents[j] >= 0; j++) |
| 958 { |
| 959 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so |
| 960 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) |
| 961 pmatch[i].rm_so = pmatch[i].rm_eo = -1; |
| 962 } |
| 963 i++; |
| 964 } |
| 965 } |
| 966 |
| 967 while (i < nmatch) |
| 968 { |
| 969 pmatch[i].rm_so = -1; |
| 970 pmatch[i].rm_eo = -1; |
| 971 i++; |
| 972 } |
| 973 } |
| 974 |
| 975 |
| 976 /* |
| 977 Wrapper functions for POSIX compatible regexp matching. |
| 978 */ |
| 979 |
| 980 int |
| 981 regexec(const regex_t *restrict preg, const char *restrict string, |
| 982 size_t nmatch, regmatch_t pmatch[restrict], int eflags) |
| 983 { |
| 984 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; |
| 985 reg_errcode_t status; |
| 986 int *tags = NULL, eo; |
| 987 if (tnfa->cflags & REG_NOSUB) nmatch = 0; |
| 988 if (tnfa->num_tags > 0 && nmatch > 0) |
| 989 { |
| 990 tags = xmalloc(sizeof(*tags) * tnfa->num_tags); |
| 991 if (tags == NULL) |
| 992 return REG_ESPACE; |
| 993 } |
| 994 |
| 995 /* Dispatch to the appropriate matcher. */ |
| 996 if (tnfa->have_backrefs) |
| 997 { |
| 998 /* The regex has back references, use the backtracking matcher. */ |
| 999 status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo); |
| 1000 } |
| 1001 else |
| 1002 { |
| 1003 /* Exact matching, no back references, use the parallel matcher. */ |
| 1004 status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo); |
| 1005 } |
| 1006 |
| 1007 if (status == REG_OK) |
| 1008 /* A match was found, so fill the submatch registers. */ |
| 1009 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); |
| 1010 if (tags) |
| 1011 xfree(tags); |
| 1012 return status; |
| 1013 } |
OLD | NEW |