| OLD | NEW |
| 1 /* | 1 /* |
| 2 regcomp.c - TRE POSIX compatible regex compilation functions. | 2 regcomp.c - TRE POSIX compatible regex compilation functions. |
| 3 | 3 |
| 4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> | 4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> |
| 5 All rights reserved. | 5 All rights reserved. |
| 6 | 6 |
| 7 Redistribution and use in source and binary forms, with or without | 7 Redistribution and use in source and binary forms, with or without |
| 8 modification, are permitted provided that the following conditions | 8 modification, are permitted provided that the following conditions |
| 9 are met: | 9 are met: |
| 10 | 10 |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 91 tags, matching parameter settings, and all expressions that match one | 91 tags, matching parameter settings, and all expressions that match one |
| 92 character. */ | 92 character. */ |
| 93 typedef struct { | 93 typedef struct { |
| 94 long code_min; | 94 long code_min; |
| 95 long code_max; | 95 long code_max; |
| 96 int position; | 96 int position; |
| 97 tre_ctype_t class; | 97 tre_ctype_t class; |
| 98 tre_ctype_t* neg_classes; | 98 tre_ctype_t* neg_classes; |
| 99 } tre_literal_t; | 99 } tre_literal_t; |
| 100 | 100 |
| 101 /* A "catenation" node.» These are created when two regexps are concatenated. | 101 /* A "catenation" node. These are created when two regexps are concatenated. |
| 102 If there are more than one subexpressions in sequence, the `left' part | 102 If there are more than one subexpressions in sequence, the `left' part |
| 103 holds all but the last, and `right' part holds the last subexpression | 103 holds all but the last, and `right' part holds the last subexpression |
| 104 (catenation is left associative). */ | 104 (catenation is left associative). */ |
| 105 typedef struct { | 105 typedef struct { |
| 106 tre_ast_node_t* left; | 106 tre_ast_node_t* left; |
| 107 tre_ast_node_t* right; | 107 tre_ast_node_t* right; |
| 108 } tre_catenation_t; | 108 } tre_catenation_t; |
| 109 | 109 |
| 110 /* An "iteration" node.» These are created for the "*", "+", "?", and "{m,n}" | 110 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}" |
| 111 operators. */ | 111 operators. */ |
| 112 typedef struct { | 112 typedef struct { |
| 113 /* Subexpression to match. */ | 113 /* Subexpression to match. */ |
| 114 tre_ast_node_t* arg; | 114 tre_ast_node_t* arg; |
| 115 /* Minimum number of consecutive matches. */ | 115 /* Minimum number of consecutive matches. */ |
| 116 int min; | 116 int min; |
| 117 /* Maximum number of consecutive matches. */ | 117 /* Maximum number of consecutive matches. */ |
| 118 int max; | 118 int max; |
| 119 /* If 0, match as many characters as possible, if 1 match as few as | 119 /* If 0, match as many characters as possible, if 1 match as few as |
| 120 possible.» Note that this does not always mean the same thing as | 120 possible. Note that this does not always mean the same thing as |
| 121 matching as many/few repetitions as possible. */ | 121 matching as many/few repetitions as possible. */ |
| 122 unsigned int minimal : 1; | 122 unsigned int minimal : 1; |
| 123 } tre_iteration_t; | 123 } tre_iteration_t; |
| 124 | 124 |
| 125 /* An "union" node. These are created for the "|" operator. */ | 125 /* An "union" node. These are created for the "|" operator. */ |
| 126 typedef struct { | 126 typedef struct { |
| 127 tre_ast_node_t* left; | 127 tre_ast_node_t* left; |
| 128 tre_ast_node_t* right; | 128 tre_ast_node_t* right; |
| 129 } tre_union_t; | 129 } tre_union_t; |
| 130 | 130 |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 211 node->num_submatches = left->num_submatches + right->num_submatches; | 211 node->num_submatches = left->num_submatches + right->num_submatches; |
| 212 return node; | 212 return node; |
| 213 } | 213 } |
| 214 | 214 |
| 215 /*********************************************************************** | 215 /*********************************************************************** |
| 216 from tre-stack.c and tre-stack.h | 216 from tre-stack.c and tre-stack.h |
| 217 ***********************************************************************/ | 217 ***********************************************************************/ |
| 218 | 218 |
| 219 typedef struct tre_stack_rec tre_stack_t; | 219 typedef struct tre_stack_rec tre_stack_t; |
| 220 | 220 |
| 221 /* Creates a new stack object.» `size' is initial size in bytes, `max_size' | 221 /* Creates a new stack object. `size' is initial size in bytes, `max_size' |
| 222 is maximum size, and `increment' specifies how much more space will be | 222 is maximum size, and `increment' specifies how much more space will be |
| 223 allocated with realloc() if all space gets used up.» Returns the stack | 223 allocated with realloc() if all space gets used up. Returns the stack |
| 224 object or NULL if out of memory. */ | 224 object or NULL if out of memory. */ |
| 225 static tre_stack_t* tre_stack_new(int size, int max_size, int increment); | 225 static tre_stack_t* tre_stack_new(int size, int max_size, int increment); |
| 226 | 226 |
| 227 /* Frees the stack object. */ | 227 /* Frees the stack object. */ |
| 228 static void tre_stack_destroy(tre_stack_t* s); | 228 static void tre_stack_destroy(tre_stack_t* s); |
| 229 | 229 |
| 230 /* Returns the current number of objects in the stack. */ | 230 /* Returns the current number of objects in the stack. */ |
| 231 static int tre_stack_num_objects(tre_stack_t* s); | 231 static int tre_stack_num_objects(tre_stack_t* s); |
| 232 | 232 |
| 233 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes | 233 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes |
| (...skipping 1245 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1479 case ADDTAGS_AFTER_CAT_RIGHT: | 1479 case ADDTAGS_AFTER_CAT_RIGHT: |
| 1480 node = tre_stack_pop_voidptr(stack); | 1480 node = tre_stack_pop_voidptr(stack); |
| 1481 if (first_pass) | 1481 if (first_pass) |
| 1482 node->num_tags = ((tre_catenation_t*)node->obj)->left->num_tags + | 1482 node->num_tags = ((tre_catenation_t*)node->obj)->left->num_tags + |
| 1483 ((tre_catenation_t*)node->obj)->right->num_tags; | 1483 ((tre_catenation_t*)node->obj)->right->num_tags; |
| 1484 break; | 1484 break; |
| 1485 | 1485 |
| 1486 case ADDTAGS_AFTER_UNION_LEFT: | 1486 case ADDTAGS_AFTER_UNION_LEFT: |
| 1487 /* Lift the bottom of the `regset' array so that when processing | 1487 /* Lift the bottom of the `regset' array so that when processing |
| 1488 the right operand the items currently in the array are | 1488 the right operand the items currently in the array are |
| 1489 invisible.» The original bottom was saved at ADDTAGS_UNION and | 1489 invisible. The original bottom was saved at ADDTAGS_UNION and |
| 1490 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ | 1490 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ |
| 1491 while (*regset >= 0) | 1491 while (*regset >= 0) |
| 1492 regset++; | 1492 regset++; |
| 1493 break; | 1493 break; |
| 1494 | 1494 |
| 1495 case ADDTAGS_AFTER_UNION_RIGHT: { | 1495 case ADDTAGS_AFTER_UNION_RIGHT: { |
| 1496 int added_tags, tag_left, tag_right; | 1496 int added_tags, tag_left, tag_right; |
| 1497 tre_ast_node_t* left = tre_stack_pop_voidptr(stack); | 1497 tre_ast_node_t* left = tre_stack_pop_voidptr(stack); |
| 1498 tre_ast_node_t* right = tre_stack_pop_voidptr(stack); | 1498 tre_ast_node_t* right = tre_stack_pop_voidptr(stack); |
| 1499 node = tre_stack_pop_voidptr(stack); | 1499 node = tre_stack_pop_voidptr(stack); |
| 1500 added_tags = tre_stack_pop_int(stack); | 1500 added_tags = tre_stack_pop_int(stack); |
| 1501 if (first_pass) { | 1501 if (first_pass) { |
| 1502 node->num_tags = ((tre_union_t*)node->obj)->left->num_tags + | 1502 node->num_tags = ((tre_union_t*)node->obj)->left->num_tags + |
| 1503 ((tre_union_t*)node->obj)->right->num_tags + | 1503 ((tre_union_t*)node->obj)->right->num_tags + |
| 1504 added_tags + ((node->num_submatches > 0) ? 2 : 0); | 1504 added_tags + ((node->num_submatches > 0) ? 2 : 0); |
| 1505 } | 1505 } |
| 1506 regset = tre_stack_pop_voidptr(stack); | 1506 regset = tre_stack_pop_voidptr(stack); |
| 1507 tag_left = tre_stack_pop_int(stack); | 1507 tag_left = tre_stack_pop_int(stack); |
| 1508 tag_right = tre_stack_pop_int(stack); | 1508 tag_right = tre_stack_pop_int(stack); |
| 1509 | 1509 |
| 1510 /* Add tags after both children, the left child gets a smaller | 1510 /* Add tags after both children, the left child gets a smaller |
| 1511 tag than the right child. This guarantees that we prefer | 1511 tag than the right child. This guarantees that we prefer |
| 1512 the left child over the right child. */ | 1512 the left child over the right child. */ |
| 1513 /* XXX - This is not always necessary (if the children have | 1513 /* XXX - This is not always necessary (if the children have |
| 1514 tags which must be seen for every match of that child). */ | 1514 tags which must be seen for every match of that child). */ |
| 1515 /* XXX - Check if this is the only place where tre_add_tag_right | 1515 /* XXX - Check if this is the only place where tre_add_tag_right |
| 1516 is used.» If so, use tre_add_tag_left (putting the tag before | 1516 is used. If so, use tre_add_tag_left (putting the tag before |
| 1517 the child as opposed after the child) and throw away | 1517 the child as opposed after the child) and throw away |
| 1518 tre_add_tag_right. */ | 1518 tre_add_tag_right. */ |
| 1519 if (node->num_submatches > 0) { | 1519 if (node->num_submatches > 0) { |
| 1520 if (!first_pass) { | 1520 if (!first_pass) { |
| 1521 status = tre_add_tag_right(mem, left, tag_left); | 1521 status = tre_add_tag_right(mem, left, tag_left); |
| 1522 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; | 1522 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; |
| 1523 if (status == REG_OK) | 1523 if (status == REG_OK) |
| 1524 status = tre_add_tag_right(mem, right, tag_right); | 1524 status = tre_add_tag_right(mem, right, tag_right); |
| 1525 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; | 1525 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; |
| 1526 } | 1526 } |
| (...skipping 777 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2304 /* Optimization: if this position was already handled, skip it. */ | 2304 /* Optimization: if this position was already handled, skip it. */ |
| 2305 if (p2->position == prev_p2_pos) { | 2305 if (p2->position == prev_p2_pos) { |
| 2306 p2++; | 2306 p2++; |
| 2307 continue; | 2307 continue; |
| 2308 } | 2308 } |
| 2309 prev_p2_pos = p2->position; | 2309 prev_p2_pos = p2->position; |
| 2310 /* Set `trans' to point to the next unused transition from | 2310 /* Set `trans' to point to the next unused transition from |
| 2311 position `p1->position'. */ | 2311 position `p1->position'. */ |
| 2312 trans = transitions + offs[p1->position]; | 2312 trans = transitions + offs[p1->position]; |
| 2313 while (trans->state != NULL) { | 2313 while (trans->state != NULL) { |
| 2314 #if 0 | |
| 2315 /* If we find a previous transition from `p1->position' to | |
| 2316 `p2->position', it is overwritten. This can happen only | |
| 2317 if there are nested loops in the regexp, like in "((a)*)*". | |
| 2318 In POSIX.2 repetition using the outer loop is always | |
| 2319 preferred over using the inner loop. Therefore the | |
| 2320 transition for the inner loop is useless and can be thrown | |
| 2321 away. */ | |
| 2322 /* XXX - The same position is used for all nodes in a bracket | |
| 2323 expression, so this optimization cannot be used (it will | |
| 2324 break bracket expressions) unless I figure out a way to | |
| 2325 detect it here. */ | |
| 2326 if (trans->state_id == p2->position) | |
| 2327 { | |
| 2328 break; | |
| 2329 } | |
| 2330 #endif | |
| 2331 trans++; | 2314 trans++; |
| 2332 } | 2315 } |
| 2333 | 2316 |
| 2334 if (trans->state == NULL) | 2317 if (trans->state == NULL) |
| 2335 (trans + 1)->state = NULL; | 2318 (trans + 1)->state = NULL; |
| 2336 /* Use the character ranges, assertions, etc. from `p1' for | 2319 /* Use the character ranges, assertions, etc. from `p1' for |
| 2337 the transition from `p1' to `p2'. */ | 2320 the transition from `p1' to `p2'. */ |
| 2338 trans->code_min = p1->code_min; | 2321 trans->code_min = p1->code_min; |
| 2339 trans->code_max = p1->code_max; | 2322 trans->code_max = p1->code_max; |
| 2340 trans->state = transitions + offs[p2->position]; | 2323 trans->state = transitions + offs[p2->position]; |
| (...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2417 p2 = orig_p2; | 2400 p2 = orig_p2; |
| 2418 while (p2->position >= 0) { | 2401 while (p2->position >= 0) { |
| 2419 counts[p1->position]++; | 2402 counts[p1->position]++; |
| 2420 p2++; | 2403 p2++; |
| 2421 } | 2404 } |
| 2422 p1++; | 2405 p1++; |
| 2423 } | 2406 } |
| 2424 return REG_OK; | 2407 return REG_OK; |
| 2425 } | 2408 } |
| 2426 | 2409 |
| 2427 /* Converts the syntax tree to a TNFA.» All the transitions in the TNFA are | 2410 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are |
| 2428 labelled with one character range (there are no transitions on empty | 2411 labelled with one character range (there are no transitions on empty |
| 2429 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of | 2412 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of |
| 2430 the regexp. */ | 2413 the regexp. */ |
| 2431 static reg_errcode_t tre_ast_to_tnfa(tre_ast_node_t* node, | 2414 static reg_errcode_t tre_ast_to_tnfa(tre_ast_node_t* node, |
| 2432 tre_tnfa_transition_t* transitions, | 2415 tre_tnfa_transition_t* transitions, |
| 2433 int* counts, | 2416 int* counts, |
| 2434 int* offs) { | 2417 int* offs) { |
| 2435 tre_union_t* uni; | 2418 tre_union_t* uni; |
| 2436 tre_catenation_t* cat; | 2419 tre_catenation_t* cat; |
| 2437 tre_iteration_t* iter; | 2420 tre_iteration_t* iter; |
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2579 } | 2562 } |
| 2580 | 2563 |
| 2581 /* Expand iteration nodes. */ | 2564 /* Expand iteration nodes. */ |
| 2582 errcode = | 2565 errcode = |
| 2583 tre_expand_ast(mem, stack, tree, &parse_ctx.position, tag_directions); | 2566 tre_expand_ast(mem, stack, tree, &parse_ctx.position, tag_directions); |
| 2584 if (errcode != REG_OK) | 2567 if (errcode != REG_OK) |
| 2585 ERROR_EXIT(errcode); | 2568 ERROR_EXIT(errcode); |
| 2586 | 2569 |
| 2587 /* Add a dummy node for the final state. | 2570 /* Add a dummy node for the final state. |
| 2588 XXX - For certain patterns this dummy node can be optimized away, | 2571 XXX - For certain patterns this dummy node can be optimized away, |
| 2589 for example "a*" or "ab*".» Figure out a simple way to detect | 2572 for example "a*" or "ab*". Figure out a simple way to detect |
| 2590 this possibility. */ | 2573 this possibility. */ |
| 2591 tmp_ast_l = tree; | 2574 tmp_ast_l = tree; |
| 2592 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); | 2575 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); |
| 2593 if (tmp_ast_r == NULL) | 2576 if (tmp_ast_r == NULL) |
| 2594 ERROR_EXIT(REG_ESPACE); | 2577 ERROR_EXIT(REG_ESPACE); |
| 2595 | 2578 |
| 2596 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); | 2579 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); |
| 2597 if (tree == NULL) | 2580 if (tree == NULL) |
| 2598 ERROR_EXIT(REG_ESPACE); | 2581 ERROR_EXIT(REG_ESPACE); |
| 2599 | 2582 |
| (...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2726 } | 2709 } |
| 2727 | 2710 |
| 2728 if (tnfa->tag_directions) | 2711 if (tnfa->tag_directions) |
| 2729 xfree(tnfa->tag_directions); | 2712 xfree(tnfa->tag_directions); |
| 2730 if (tnfa->firstpos_chars) | 2713 if (tnfa->firstpos_chars) |
| 2731 xfree(tnfa->firstpos_chars); | 2714 xfree(tnfa->firstpos_chars); |
| 2732 if (tnfa->minimal_tags) | 2715 if (tnfa->minimal_tags) |
| 2733 xfree(tnfa->minimal_tags); | 2716 xfree(tnfa->minimal_tags); |
| 2734 xfree(tnfa); | 2717 xfree(tnfa); |
| 2735 } | 2718 } |
| OLD | NEW |