| OLD | NEW |
| 1 /* | 1 /* |
| 2 regcomp.c - TRE POSIX compatible regex compilation functions. | 2 regcomp.c - TRE POSIX compatible regex compilation functions. |
| 3 | 3 |
| 4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> | 4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> |
| 5 All rights reserved. | 5 All rights reserved. |
| 6 | 6 |
| 7 Redistribution and use in source and binary forms, with or without | 7 Redistribution and use in source and binary forms, with or without |
| 8 modification, are permitted provided that the following conditions | 8 modification, are permitted provided that the following conditions |
| 9 are met: | 9 are met: |
| 10 | 10 |
| (...skipping 690 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 701 *n = 0; | 701 *n = 0; |
| 702 for (;;) { | 702 for (;;) { |
| 703 *n = 10 * *n + (*s - '0'); | 703 *n = 10 * *n + (*s - '0'); |
| 704 s++; | 704 s++; |
| 705 if (!isdigit(*s) || *n > RE_DUP_MAX) | 705 if (!isdigit(*s) || *n > RE_DUP_MAX) |
| 706 break; | 706 break; |
| 707 } | 707 } |
| 708 return s; | 708 return s; |
| 709 } | 709 } |
| 710 | 710 |
| 711 static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s) | 711 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax) |
| 712 { | 712 { |
| 713 int min, max; | 713 int min, max; |
| 714 | 714 |
| 715 s = parse_dup_count(s, &min); | 715 s = parse_dup_count(s, &min); |
| 716 if (*s == ',') | 716 if (*s == ',') |
| 717 s = parse_dup_count(s+1, &max); | 717 s = parse_dup_count(s+1, &max); |
| 718 else | 718 else |
| 719 max = min; | 719 max = min; |
| 720 | 720 |
| 721 if ( | 721 if ( |
| 722 (max < min && max >= 0) || | 722 (max < min && max >= 0) || |
| 723 max > RE_DUP_MAX || | 723 max > RE_DUP_MAX || |
| 724 min > RE_DUP_MAX || | 724 min > RE_DUP_MAX || |
| 725 min < 0 || | 725 min < 0 || |
| 726 » » (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') || | 726 » » (!ere && *s++ != '\\') || |
| 727 *s++ != '}' | 727 *s++ != '}' |
| 728 ) | 728 ) |
| 729 » » return REG_BADBR; | 729 » » return 0; |
| 730 | 730 » *pmin = min; |
| 731 » if (min == 0 && max == 0) | 731 » *pmax = max; |
| 732 » » ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); | 732 » return s; |
| 733 » else | |
| 734 » » ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); | |
| 735 » if (!ctx->n) | |
| 736 » » return REG_ESPACE; | |
| 737 » ctx->s = s; | |
| 738 » return REG_OK; | |
| 739 } | 733 } |
| 740 | 734 |
| 741 static int hexval(unsigned c) | 735 static int hexval(unsigned c) |
| 742 { | 736 { |
| 743 if (c-'0'<10) return c-'0'; | 737 if (c-'0'<10) return c-'0'; |
| 744 c |= 32; | 738 c |= 32; |
| 745 if (c-'a'<6) return c-'a'+10; | 739 if (c-'a'<6) return c-'a'+10; |
| 746 return -1; | 740 return -1; |
| 747 } | 741 } |
| 748 | 742 |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 827 c = hexval(s[i]); | 821 c = hexval(s[i]); |
| 828 if (c < 0) break; | 822 if (c < 0) break; |
| 829 v = 16*v + c; | 823 v = 16*v + c; |
| 830 } | 824 } |
| 831 s += i; | 825 s += i; |
| 832 if (len == 8) { | 826 if (len == 8) { |
| 833 if (*s != '}') | 827 if (*s != '}') |
| 834 return REG_EBRACE; | 828 return REG_EBRACE; |
| 835 s++; | 829 s++; |
| 836 } | 830 } |
| 837 » » » node = tre_ast_new_literal(ctx->mem, v, v, ctx->position
); | 831 » » » node = tre_ast_new_literal(ctx->mem, v, v, ctx->position
++); |
| 838 » » » ctx->position++; | |
| 839 s--; | 832 s--; |
| 840 break; | 833 break; |
| 834 case '{': |
| 835 case '+': |
| 836 case '?': |
| 837 /* extension: treat \+, \? as repetitions in BRE */ |
| 838 /* reject repetitions after empty expression in BRE */ |
| 839 if (!ere) |
| 840 return REG_BADRPT; |
| 841 case '|': |
| 842 /* extension: treat \| as alternation in BRE */ |
| 843 if (!ere) { |
| 844 node = tre_ast_new_literal(ctx->mem, EMPTY, -1,
-1); |
| 845 s--; |
| 846 goto end; |
| 847 } |
| 848 /* fallthrough */ |
| 841 default: | 849 default: |
| 842 if (!ere && (unsigned)*s-'1' < 9) { | 850 if (!ere && (unsigned)*s-'1' < 9) { |
| 843 /* back reference */ | 851 /* back reference */ |
| 844 int val = *s - '0'; | 852 int val = *s - '0'; |
| 845 » » » » node = tre_ast_new_literal(ctx->mem, BACKREF, va
l, ctx->position); | 853 » » » » node = tre_ast_new_literal(ctx->mem, BACKREF, va
l, ctx->position++); |
| 846 ctx->max_backref = MAX(val, ctx->max_backref); | 854 ctx->max_backref = MAX(val, ctx->max_backref); |
| 847 } else { | 855 } else { |
| 848 /* extension: accept unknown escaped char | 856 /* extension: accept unknown escaped char |
| 849 as a literal */ | 857 as a literal */ |
| 850 goto parse_literal; | 858 goto parse_literal; |
| 851 } | 859 } |
| 852 ctx->position++; | |
| 853 } | 860 } |
| 854 s++; | 861 s++; |
| 855 break; | 862 break; |
| 856 case '.': | 863 case '.': |
| 857 if (ctx->cflags & REG_NEWLINE) { | 864 if (ctx->cflags & REG_NEWLINE) { |
| 858 tre_ast_node_t *tmp1, *tmp2; | 865 tre_ast_node_t *tmp1, *tmp2; |
| 859 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->pos
ition++); | 866 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->pos
ition++); |
| 860 tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MA
X, ctx->position++); | 867 tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MA
X, ctx->position++); |
| 861 if (tmp1 && tmp2) | 868 if (tmp1 && tmp2) |
| 862 node = tre_ast_new_union(ctx->mem, tmp1, tmp2); | 869 node = tre_ast_new_union(ctx->mem, tmp1, tmp2); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 875 s++; | 882 s++; |
| 876 break; | 883 break; |
| 877 case '$': | 884 case '$': |
| 878 /* '$' is special everywhere in EREs, and in the end of the stri
ng in BREs. */ | 885 /* '$' is special everywhere in EREs, and in the end of the stri
ng in BREs. */ |
| 879 if (!ere && s[1]) | 886 if (!ere && s[1]) |
| 880 goto parse_literal; | 887 goto parse_literal; |
| 881 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -
1); | 888 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -
1); |
| 882 s++; | 889 s++; |
| 883 break; | 890 break; |
| 884 case '*': | 891 case '*': |
| 885 » case '|': | 892 » » return REG_BADPAT; |
| 886 case '{': | 893 case '{': |
| 887 case '+': | 894 case '+': |
| 888 case '?': | 895 case '?': |
| 896 /* reject repetitions after empty expression in ERE */ |
| 897 if (ere) |
| 898 return REG_BADRPT; |
| 899 case '|': |
| 889 if (!ere) | 900 if (!ere) |
| 890 goto parse_literal; | 901 goto parse_literal; |
| 891 case 0: | 902 case 0: |
| 892 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); | 903 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); |
| 893 break; | 904 break; |
| 894 default: | 905 default: |
| 895 parse_literal: | 906 parse_literal: |
| 896 len = mbtowc(&wc, s, -1); | 907 len = mbtowc(&wc, s, -1); |
| 897 if (len < 0) | 908 if (len < 0) |
| 898 return REG_BADPAT; | 909 return REG_BADPAT; |
| 899 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(w
c))) { | 910 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(w
c))) { |
| 900 tre_ast_node_t *tmp1, *tmp2; | 911 tre_ast_node_t *tmp1, *tmp2; |
| 901 /* multiple opposite case characters are not supported *
/ | 912 /* multiple opposite case characters are not supported *
/ |
| 902 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tr
e_toupper(wc), ctx->position); | 913 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tr
e_toupper(wc), ctx->position); |
| 903 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tr
e_tolower(wc), ctx->position); | 914 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tr
e_tolower(wc), ctx->position); |
| 904 if (tmp1 && tmp2) | 915 if (tmp1 && tmp2) |
| 905 node = tre_ast_new_union(ctx->mem, tmp1, tmp2); | 916 node = tre_ast_new_union(ctx->mem, tmp1, tmp2); |
| 906 else | 917 else |
| 907 node = 0; | 918 node = 0; |
| 908 } else { | 919 } else { |
| 909 node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->positi
on); | 920 node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->positi
on); |
| 910 } | 921 } |
| 911 ctx->position++; | 922 ctx->position++; |
| 912 s += len; | 923 s += len; |
| 913 break; | 924 break; |
| 914 } | 925 } |
| 926 end: |
| 915 if (!node) | 927 if (!node) |
| 916 return REG_ESPACE; | 928 return REG_ESPACE; |
| 917 ctx->n = node; | 929 ctx->n = node; |
| 918 ctx->s = s; | 930 ctx->s = s; |
| 919 return REG_OK; | 931 return REG_OK; |
| 920 } | 932 } |
| 921 | 933 |
| 922 #define PUSHPTR(err, s, v) do { \ | 934 #define PUSHPTR(err, s, v) do { \ |
| 923 if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \ | 935 if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \ |
| 924 return err; \ | 936 return err; \ |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 959 if (!ctx->n) | 971 if (!ctx->n) |
| 960 return REG_ESPACE; | 972 return REG_ESPACE; |
| 961 } else { | 973 } else { |
| 962 err = parse_atom(ctx, s); | 974 err = parse_atom(ctx, s); |
| 963 if (err != REG_OK) | 975 if (err != REG_OK) |
| 964 return err; | 976 return err; |
| 965 s = ctx->s; | 977 s = ctx->s; |
| 966 } | 978 } |
| 967 | 979 |
| 968 parse_iter: | 980 parse_iter: |
| 969 » » /* extension: repetitions are accepted after an empty node | 981 » » /* extension: repetitions are rejected after an empty node |
| 970 » » eg. (+), ^*, a$?, a|{2} */ | 982 » » eg. (+), |*, {2}, but assertions are not treated as empty |
| 971 » » switch (*s) { | 983 » » so ^* or $? are accepted currently. */ |
| 972 » » case '+': | 984 » » for (;;) { |
| 973 » » case '?': | 985 » » » int min, max; |
| 974 » » » if (!ere) | 986 |
| 987 » » » if (*s!='\\' && *s!='*') { |
| 988 » » » » if (!ere) |
| 989 » » » » » break; |
| 990 » » » » if (*s!='+' && *s!='?' && *s!='{') |
| 991 » » » » » break; |
| 992 » » » } |
| 993 » » » if (*s=='\\' && ere) |
| 975 break; | 994 break; |
| 976 » » » /* fallthrough */ | 995 » » » /* extension: treat \+, \? as repetitions in BRE */ |
| 977 » » case '*':; | 996 » » » if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{') |
| 978 » » » int min=0, max=-1; | 997 » » » » break; |
| 979 » » » if (*s == '+') | 998 » » » if (*s=='\\') |
| 980 » » » » min = 1; | 999 » » » » s++; |
| 981 » » » if (*s == '?') | 1000 |
| 982 » » » » max = 1; | |
| 983 » » » s++; | |
| 984 » » » ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0)
; | |
| 985 » » » if (!ctx->n) | |
| 986 » » » » return REG_ESPACE; | |
| 987 /* extension: multiple consecutive *+?{,} is unspecified
, | 1001 /* extension: multiple consecutive *+?{,} is unspecified
, |
| 988 but (a+)+ has to be supported so accepting a++ makes | 1002 but (a+)+ has to be supported so accepting a++ makes |
| 989 sense, note however that the RE_DUP_MAX limit can be | 1003 sense, note however that the RE_DUP_MAX limit can be |
| 990 circumvented: (a{255}){255} uses a lot of memory.. */ | 1004 circumvented: (a{255}){255} uses a lot of memory.. */ |
| 991 » » » goto parse_iter; | 1005 » » » if (*s=='{') { |
| 992 » » case '\\': | 1006 » » » » s = parse_dup(s+1, ere, &min, &max); |
| 993 » » » if (ere || s[1] != '{') | 1007 » » » » if (!s) |
| 994 » » » » break; | 1008 » » » » » return REG_BADBR; |
| 995 » » » s++; | 1009 » » » } else { |
| 996 » » » goto parse_brace; | 1010 » » » » min=0; |
| 997 » » case '{': | 1011 » » » » max=-1; |
| 998 » » » if (!ere) | 1012 » » » » if (*s == '+') |
| 999 » » » » break; | 1013 » » » » » min = 1; |
| 1000 » » parse_brace: | 1014 » » » » if (*s == '?') |
| 1001 » » » err = parse_dup(ctx, s+1); | 1015 » » » » » max = 1; |
| 1002 » » » if (err != REG_OK) | 1016 » » » » s++; |
| 1003 » » » » return err; | 1017 » » » } |
| 1004 » » » s = ctx->s; | 1018 » » » if (max == 0) |
| 1005 » » » goto parse_iter; | 1019 » » » » ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1
, -1); |
| 1020 » » » else |
| 1021 » » » » ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min,
max, 0); |
| 1022 » » » if (!ctx->n) |
| 1023 » » » » return REG_ESPACE; |
| 1006 } | 1024 } |
| 1007 | 1025 |
| 1008 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); | 1026 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); |
| 1009 if ((ere && *s == '|') || | 1027 if ((ere && *s == '|') || |
| 1010 (ere && *s == ')' && depth) || | 1028 (ere && *s == ')' && depth) || |
| 1011 (!ere && *s == '\\' && s[1] == ')') || | 1029 (!ere && *s == '\\' && s[1] == ')') || |
| 1030 /* extension: treat \| as alternation in BRE */ |
| 1031 (!ere && *s == '\\' && s[1] == '|') || |
| 1012 !*s) { | 1032 !*s) { |
| 1013 /* extension: empty branch is unspecified (), (|a), (a|) | 1033 /* extension: empty branch is unspecified (), (|a), (a|) |
| 1014 here they are not rejected but match on empty string
*/ | 1034 here they are not rejected but match on empty string
*/ |
| 1015 int c = *s; | 1035 int c = *s; |
| 1016 nunion = tre_ast_new_union(ctx->mem, nunion, nbranch); | 1036 nunion = tre_ast_new_union(ctx->mem, nunion, nbranch); |
| 1017 nbranch = 0; | 1037 nbranch = 0; |
| 1018 » » » if (c != '|') { | 1038 |
| 1039 » » » if (c == '\\' && s[1] == '|') { |
| 1040 » » » » s+=2; |
| 1041 » » » } else if (c == '|') { |
| 1042 » » » » s++; |
| 1043 » » » } else { |
| 1019 if (c == '\\') { | 1044 if (c == '\\') { |
| 1020 if (!depth) return REG_EPAREN; | 1045 if (!depth) return REG_EPAREN; |
| 1021 s+=2; | 1046 s+=2; |
| 1022 } else if (c == ')') | 1047 } else if (c == ')') |
| 1023 s++; | 1048 s++; |
| 1024 depth--; | 1049 depth--; |
| 1025 err = marksub(ctx, nunion, tre_stack_pop_int(sta
ck)); | 1050 err = marksub(ctx, nunion, tre_stack_pop_int(sta
ck)); |
| 1026 if (err != REG_OK) | 1051 if (err != REG_OK) |
| 1027 return err; | 1052 return err; |
| 1028 if (!c && depth<0) { | 1053 if (!c && depth<0) { |
| 1029 ctx->submatch_id = subid; | 1054 ctx->submatch_id = subid; |
| 1030 return REG_OK; | 1055 return REG_OK; |
| 1031 } | 1056 } |
| 1032 if (!c || depth<0) | 1057 if (!c || depth<0) |
| 1033 return REG_EPAREN; | 1058 return REG_EPAREN; |
| 1034 nbranch = tre_stack_pop_voidptr(stack); | 1059 nbranch = tre_stack_pop_voidptr(stack); |
| 1035 nunion = tre_stack_pop_voidptr(stack); | 1060 nunion = tre_stack_pop_voidptr(stack); |
| 1036 goto parse_iter; | 1061 goto parse_iter; |
| 1037 } | 1062 } |
| 1038 s++; | |
| 1039 } | 1063 } |
| 1040 } | 1064 } |
| 1041 } | 1065 } |
| 1042 | 1066 |
| 1043 | 1067 |
| 1044 /*********************************************************************** | 1068 /*********************************************************************** |
| 1045 from tre-compile.c | 1069 from tre-compile.c |
| 1046 ***********************************************************************/ | 1070 ***********************************************************************/ |
| 1047 | 1071 |
| 1048 | 1072 |
| (...skipping 1608 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2657 tre_submatch_data_t *submatch_data; | 2681 tre_submatch_data_t *submatch_data; |
| 2658 tre_tag_direction_t *tag_directions = NULL; | 2682 tre_tag_direction_t *tag_directions = NULL; |
| 2659 reg_errcode_t errcode; | 2683 reg_errcode_t errcode; |
| 2660 tre_mem_t mem; | 2684 tre_mem_t mem; |
| 2661 | 2685 |
| 2662 /* Parse context. */ | 2686 /* Parse context. */ |
| 2663 tre_parse_ctx_t parse_ctx; | 2687 tre_parse_ctx_t parse_ctx; |
| 2664 | 2688 |
| 2665 /* Allocate a stack used throughout the compilation process for various | 2689 /* Allocate a stack used throughout the compilation process for various |
| 2666 purposes. */ | 2690 purposes. */ |
| 2667 stack = tre_stack_new(512, 10240, 128); | 2691 stack = tre_stack_new(512, 1024000, 128); |
| 2668 if (!stack) | 2692 if (!stack) |
| 2669 return REG_ESPACE; | 2693 return REG_ESPACE; |
| 2670 /* Allocate a fast memory allocator. */ | 2694 /* Allocate a fast memory allocator. */ |
| 2671 mem = tre_mem_new(); | 2695 mem = tre_mem_new(); |
| 2672 if (!mem) | 2696 if (!mem) |
| 2673 { | 2697 { |
| 2674 tre_stack_destroy(stack); | 2698 tre_stack_destroy(stack); |
| 2675 return REG_ESPACE; | 2699 return REG_ESPACE; |
| 2676 } | 2700 } |
| 2677 | 2701 |
| (...skipping 223 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2901 } | 2925 } |
| 2902 | 2926 |
| 2903 if (tnfa->tag_directions) | 2927 if (tnfa->tag_directions) |
| 2904 xfree(tnfa->tag_directions); | 2928 xfree(tnfa->tag_directions); |
| 2905 if (tnfa->firstpos_chars) | 2929 if (tnfa->firstpos_chars) |
| 2906 xfree(tnfa->firstpos_chars); | 2930 xfree(tnfa->firstpos_chars); |
| 2907 if (tnfa->minimal_tags) | 2931 if (tnfa->minimal_tags) |
| 2908 xfree(tnfa->minimal_tags); | 2932 xfree(tnfa->minimal_tags); |
| 2909 xfree(tnfa); | 2933 xfree(tnfa); |
| 2910 } | 2934 } |
| OLD | NEW |