| OLD | NEW |
| 1 /* | 1 /* |
| 2 * An implementation of what I call the "Sea of Stars" algorithm for | 2 * An implementation of what I call the "Sea of Stars" algorithm for |
| 3 * POSIX fnmatch(). The basic idea is that we factor the pattern into | 3 * POSIX fnmatch(). The basic idea is that we factor the pattern into |
| 4 * a head component (which we match first and can reject without ever | 4 * a head component (which we match first and can reject without ever |
| 5 * measuring the length of the string), an optional tail component | 5 * measuring the length of the string), an optional tail component |
| 6 * (which only exists if the pattern contains at least one star), and | 6 * (which only exists if the pattern contains at least one star), and |
| 7 * an optional "sea of stars", a set of star-separated components | 7 * an optional "sea of stars", a set of star-separated components |
| 8 * between the head and tail. After the head and tail matches have | 8 * between the head and tail. After the head and tail matches have |
| 9 * been removed from the input string, the components in the "sea of | 9 * been removed from the input string, the components in the "sea of |
| 10 * stars" are matched sequentially by searching for their first | 10 * stars" are matched sequentially by searching for their first |
| 11 * occurrence past the end of the previous match. | 11 * occurrence past the end of the previous match. |
| 12 * | 12 * |
| 13 * - Rich Felker, April 2012 | 13 * - Rich Felker, April 2012 |
| 14 */ | 14 */ |
| 15 | 15 |
| 16 #include <string.h> | 16 #include <string.h> |
| 17 #include <fnmatch.h> | 17 #include <fnmatch.h> |
| 18 #include <stdlib.h> | 18 #include <stdlib.h> |
| 19 #include <wchar.h> | 19 #include <wchar.h> |
| 20 #include <wctype.h> | 20 #include <wctype.h> |
| 21 #include "locale_impl.h" | 21 #include "locale_impl.h" |
| 22 | 22 |
| 23 #define END 0 | 23 #define END 0 |
| 24 #define UNMATCHABLE -2 | 24 #define UNMATCHABLE -2 |
| 25 #define BRACKET -3 | 25 #define BRACKET -3 |
| 26 #define QUESTION -4 | 26 #define QUESTION -4 |
| 27 #define STAR -5 | 27 #define STAR -5 |
| 28 | 28 |
| 29 static int str_next(const char *str, size_t n, size_t *step) | 29 static int str_next(const char* str, size_t n, size_t* step) { |
| 30 { | 30 if (!n) { |
| 31 » if (!n) { | 31 *step = 0; |
| 32 » » *step = 0; | 32 return 0; |
| 33 » » return 0; | 33 } |
| 34 » } | 34 if (str[0] >= 128U) { |
| 35 » if (str[0] >= 128U) { | 35 wchar_t wc; |
| 36 » » wchar_t wc; | 36 int k = mbtowc(&wc, str, n); |
| 37 » » int k = mbtowc(&wc, str, n); | 37 if (k < 0) { |
| 38 » » if (k<0) { | 38 *step = 1; |
| 39 » » » *step = 1; | 39 return -1; |
| 40 » » » return -1; | 40 } |
| 41 » » } | 41 *step = k; |
| 42 » » *step = k; | 42 return wc; |
| 43 » » return wc; | 43 } |
| 44 » } | 44 *step = 1; |
| 45 » *step = 1; | 45 return str[0]; |
| 46 » return str[0]; | 46 } |
| 47 } | 47 |
| 48 | 48 static int pat_next(const char* pat, size_t m, size_t* step, int flags) { |
| 49 static int pat_next(const char *pat, size_t m, size_t *step, int flags) | 49 int esc = 0; |
| 50 { | 50 if (!m || !*pat) { |
| 51 » int esc = 0; | 51 *step = 0; |
| 52 » if (!m || !*pat) { | 52 return END; |
| 53 » » *step = 0; | 53 } |
| 54 » » return END; | 54 *step = 1; |
| 55 » } | 55 if (pat[0] == '\\' && pat[1] && !(flags & FNM_NOESCAPE)) { |
| 56 » *step = 1; | 56 *step = 2; |
| 57 » if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) { | 57 pat++; |
| 58 » » *step = 2; | 58 esc = 1; |
| 59 » » pat++; | 59 goto escaped; |
| 60 » » esc = 1; | 60 } |
| 61 » » goto escaped; | 61 if (pat[0] == '[') { |
| 62 » } | 62 size_t k = 1; |
| 63 » if (pat[0]=='[') { | 63 if (k < m) |
| 64 » » size_t k = 1; | 64 if (pat[k] == '^' || pat[k] == '!') |
| 65 » » if (k<m) if (pat[k] == '^' || pat[k] == '!') k++; | 65 k++; |
| 66 » » if (k<m) if (pat[k] == ']') k++; | 66 if (k < m) |
| 67 » » for (; k<m && pat[k] && pat[k]!=']'; k++) { | 67 if (pat[k] == ']') |
| 68 » » » if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' |
| pat[k+1]=='.' || pat[k+1]=='=')) { | 68 k++; |
| 69 » » » » int z = pat[k+1]; | 69 for (; k < m && pat[k] && pat[k] != ']'; k++) { |
| 70 » » » » k+=2; | 70 if (k + 1 < m && pat[k + 1] && pat[k] == '[' && |
| 71 » » » » if (k<m && pat[k]) k++; | 71 (pat[k + 1] == ':' || pat[k + 1] == '.' || pat[k + 1] == '=')) { |
| 72 » » » » while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=
']')) k++; | 72 int z = pat[k + 1]; |
| 73 » » » » if (k==m || !pat[k]) break; | 73 k += 2; |
| 74 » » » } | 74 if (k < m && pat[k]) |
| 75 » » } | 75 k++; |
| 76 » » if (k==m || !pat[k]) { | 76 while (k < m && pat[k] && (pat[k - 1] != z || pat[k] != ']')) |
| 77 » » » *step = 1; | 77 k++; |
| 78 » » » return '['; | 78 if (k == m || !pat[k]) |
| 79 » » } | 79 break; |
| 80 » » *step = k+1; | 80 } |
| 81 » » return BRACKET; | 81 } |
| 82 » } | 82 if (k == m || !pat[k]) { |
| 83 » if (pat[0] == '*') | 83 *step = 1; |
| 84 » » return STAR; | 84 return '['; |
| 85 » if (pat[0] == '?') | 85 } |
| 86 » » return QUESTION; | 86 *step = k + 1; |
| 87 return BRACKET; |
| 88 } |
| 89 if (pat[0] == '*') |
| 90 return STAR; |
| 91 if (pat[0] == '?') |
| 92 return QUESTION; |
| 87 escaped: | 93 escaped: |
| 88 if (pat[0] >= 128U) { | 94 if (pat[0] >= 128U) { |
| 89 wchar_t wc; | 95 wchar_t wc; |
| 90 int k = mbtowc(&wc, pat, m); | 96 int k = mbtowc(&wc, pat, m); |
| 91 if (k<0) { | 97 if (k < 0) { |
| 92 *step = 0; | 98 *step = 0; |
| 93 return UNMATCHABLE; | 99 return UNMATCHABLE; |
| 94 } | 100 } |
| 95 *step = k + esc; | 101 *step = k + esc; |
| 96 return wc; | 102 return wc; |
| 97 } | 103 } |
| 98 return pat[0]; | 104 return pat[0]; |
| 99 } | 105 } |
| 100 | 106 |
| 101 static int casefold(int k) | 107 static int casefold(int k) { |
| 102 { | 108 int c = towupper(k); |
| 103 int c = towupper(k); | 109 return c == k ? towlower(k) : c; |
| 104 return c == k ? towlower(k) : c; | 110 } |
| 105 } | 111 |
| 106 | 112 static int match_bracket(const char* p, int k, int kfold) { |
| 107 static int match_bracket(const char *p, int k, int kfold) | 113 wchar_t wc; |
| 108 { | 114 int inv = 0; |
| 109 wchar_t wc; | 115 p++; |
| 110 int inv = 0; | 116 if (*p == '^' || *p == '!') { |
| 111 p++; | 117 inv = 1; |
| 112 if (*p=='^' || *p=='!') { | 118 p++; |
| 113 inv = 1; | 119 } |
| 114 p++; | 120 if (*p == ']') { |
| 115 } | 121 if (k == ']') |
| 116 if (*p==']') { | 122 return !inv; |
| 117 if (k==']') return !inv; | 123 p++; |
| 118 p++; | 124 } else if (*p == '-') { |
| 119 } else if (*p=='-') { | 125 if (k == '-') |
| 120 if (k=='-') return !inv; | 126 return !inv; |
| 121 p++; | 127 p++; |
| 122 } | 128 } |
| 123 wc = p[-1]; | 129 wc = p[-1]; |
| 124 for (; *p != ']'; p++) { | 130 for (; *p != ']'; p++) { |
| 125 if (p[0]=='-' && p[1]!=']') { | 131 if (p[0] == '-' && p[1] != ']') { |
| 126 wchar_t wc2; | 132 wchar_t wc2; |
| 127 int l = mbtowc(&wc2, p+1, 4); | 133 int l = mbtowc(&wc2, p + 1, 4); |
| 128 if (l < 0) return 0; | 134 if (l < 0) |
| 129 if (wc <= wc2) | 135 return 0; |
| 130 if ((unsigned)k-wc <= wc2-wc || | 136 if (wc <= wc2) |
| 131 (unsigned)kfold-wc <= wc2-wc) | 137 if ((unsigned)k - wc <= wc2 - wc || (unsigned)kfold - wc <= wc2 - wc) |
| 132 return !inv; | 138 return !inv; |
| 133 p += l-1; | 139 p += l - 1; |
| 134 continue; | 140 continue; |
| 135 } | 141 } |
| 136 if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) { | 142 if (p[0] == '[' && (p[1] == ':' || p[1] == '.' || p[1] == '=')) { |
| 137 const char *p0 = p+2; | 143 const char* p0 = p + 2; |
| 138 int z = p[1]; | 144 int z = p[1]; |
| 139 p+=3; | 145 p += 3; |
| 140 while (p[-1]!=z || p[0]!=']') p++; | 146 while (p[-1] != z || p[0] != ']') |
| 141 if (z == ':' && p-1-p0 < 16) { | 147 p++; |
| 142 char buf[16]; | 148 if (z == ':' && p - 1 - p0 < 16) { |
| 143 memcpy(buf, p0, p-1-p0); | 149 char buf[16]; |
| 144 buf[p-1-p0] = 0; | 150 memcpy(buf, p0, p - 1 - p0); |
| 145 if (iswctype(k, wctype(buf)) || | 151 buf[p - 1 - p0] = 0; |
| 146 iswctype(kfold, wctype(buf))) | 152 if (iswctype(k, wctype(buf)) || iswctype(kfold, wctype(buf))) |
| 147 return !inv; | 153 return !inv; |
| 148 } | 154 } |
| 149 continue; | 155 continue; |
| 150 } | 156 } |
| 151 if (*p < 128U) { | 157 if (*p < 128U) { |
| 152 wc = (unsigned char)*p; | 158 wc = (unsigned char)*p; |
| 153 } else { | 159 } else { |
| 154 int l = mbtowc(&wc, p, 4); | 160 int l = mbtowc(&wc, p, 4); |
| 155 if (l < 0) return 0; | 161 if (l < 0) |
| 156 p += l-1; | 162 return 0; |
| 157 } | 163 p += l - 1; |
| 158 if (wc==k || wc==kfold) return !inv; | 164 } |
| 159 } | 165 if (wc == k || wc == kfold) |
| 160 return inv; | 166 return !inv; |
| 161 } | 167 } |
| 162 | 168 return inv; |
| 163 static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
, int flags) | 169 } |
| 164 { | 170 |
| 165 const char *p, *ptail, *endpat; | 171 static int fnmatch_internal(const char* pat, |
| 166 const char *s, *stail, *endstr; | 172 size_t m, |
| 167 size_t pinc, sinc, tailcnt=0; | 173 const char* str, |
| 168 int c, k, kfold; | 174 size_t n, |
| 169 | 175 int flags) { |
| 170 if (flags & FNM_PERIOD) { | 176 const char *p, *ptail, *endpat; |
| 171 if (*str == '.' && *pat != '.') | 177 const char *s, *stail, *endstr; |
| 172 return FNM_NOMATCH; | 178 size_t pinc, sinc, tailcnt = 0; |
| 173 } | 179 int c, k, kfold; |
| 174 for (;;) { | 180 |
| 175 switch ((c = pat_next(pat, m, &pinc, flags))) { | 181 if (flags & FNM_PERIOD) { |
| 176 case UNMATCHABLE: | 182 if (*str == '.' && *pat != '.') |
| 177 return FNM_NOMATCH; | 183 return FNM_NOMATCH; |
| 178 case STAR: | 184 } |
| 179 pat++; | 185 for (;;) { |
| 180 m--; | 186 switch ((c = pat_next(pat, m, &pinc, flags))) { |
| 181 break; | 187 case UNMATCHABLE: |
| 182 default: | 188 return FNM_NOMATCH; |
| 183 k = str_next(str, n, &sinc); | 189 case STAR: |
| 184 if (k <= 0) | 190 pat++; |
| 185 return (c==END) ? 0 : FNM_NOMATCH; | 191 m--; |
| 186 str += sinc; | 192 break; |
| 187 n -= sinc; | 193 default: |
| 188 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; | 194 k = str_next(str, n, &sinc); |
| 189 if (c == BRACKET) { | 195 if (k <= 0) |
| 190 if (!match_bracket(pat, k, kfold)) | 196 return (c == END) ? 0 : FNM_NOMATCH; |
| 191 return FNM_NOMATCH; | 197 str += sinc; |
| 192 } else if (c != QUESTION && k != c && kfold != c) { | 198 n -= sinc; |
| 193 return FNM_NOMATCH; | 199 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
| 194 } | 200 if (c == BRACKET) { |
| 195 pat+=pinc; | 201 if (!match_bracket(pat, k, kfold)) |
| 196 m-=pinc; | 202 return FNM_NOMATCH; |
| 197 continue; | 203 } else if (c != QUESTION && k != c && kfold != c) { |
| 198 } | 204 return FNM_NOMATCH; |
| 199 break; | 205 } |
| 200 } | 206 pat += pinc; |
| 201 | 207 m -= pinc; |
| 202 /* Compute real pat length if it was initially unknown/-1 */ | 208 continue; |
| 203 m = strnlen(pat, m); | 209 } |
| 204 endpat = pat + m; | 210 break; |
| 205 | 211 } |
| 206 /* Find the last * in pat and count chars needed after it */ | 212 |
| 207 for (p=ptail=pat; p<endpat; p+=pinc) { | 213 /* Compute real pat length if it was initially unknown/-1 */ |
| 208 switch (pat_next(p, endpat-p, &pinc, flags)) { | 214 m = strnlen(pat, m); |
| 209 case UNMATCHABLE: | 215 endpat = pat + m; |
| 210 return FNM_NOMATCH; | 216 |
| 211 case STAR: | 217 /* Find the last * in pat and count chars needed after it */ |
| 212 tailcnt=0; | 218 for (p = ptail = pat; p < endpat; p += pinc) { |
| 213 ptail = p+1; | 219 switch (pat_next(p, endpat - p, &pinc, flags)) { |
| 214 break; | 220 case UNMATCHABLE: |
| 215 default: | 221 return FNM_NOMATCH; |
| 216 tailcnt++; | 222 case STAR: |
| 217 break; | 223 tailcnt = 0; |
| 218 } | 224 ptail = p + 1; |
| 219 } | 225 break; |
| 220 | 226 default: |
| 221 /* Past this point we need not check for UNMATCHABLE in pat, | 227 tailcnt++; |
| 222 * because all of pat has already been parsed once. */ | 228 break; |
| 223 | 229 } |
| 224 /* Compute real str length if it was initially unknown/-1 */ | 230 } |
| 225 n = strnlen(str, n); | 231 |
| 226 endstr = str + n; | 232 /* Past this point we need not check for UNMATCHABLE in pat, |
| 227 if (n < tailcnt) return FNM_NOMATCH; | 233 * because all of pat has already been parsed once. */ |
| 228 | 234 |
| 229 /* Find the final tailcnt chars of str, accounting for UTF-8. | 235 /* Compute real str length if it was initially unknown/-1 */ |
| 230 * On illegal sequences we may get it wrong, but in that case | 236 n = strnlen(str, n); |
| 231 * we necessarily have a matching failure anyway. */ | 237 endstr = str + n; |
| 232 for (s=endstr; s>str && tailcnt; tailcnt--) { | 238 if (n < tailcnt) |
| 233 if (s[-1] < 128U || MB_CUR_MAX==1) s--; | 239 return FNM_NOMATCH; |
| 234 else while ((unsigned char)*--s-0x80U<0x40 && s>str); | 240 |
| 235 } | 241 /* Find the final tailcnt chars of str, accounting for UTF-8. |
| 236 if (tailcnt) return FNM_NOMATCH; | 242 * On illegal sequences we may get it wrong, but in that case |
| 237 stail = s; | 243 * we necessarily have a matching failure anyway. */ |
| 238 | 244 for (s = endstr; s > str && tailcnt; tailcnt--) { |
| 239 /* Check that the pat and str tails match */ | 245 if (s[-1] < 128U || MB_CUR_MAX == 1) |
| 240 p = ptail; | 246 s--; |
| 241 for (;;) { | 247 else |
| 242 c = pat_next(p, endpat-p, &pinc, flags); | 248 while ((unsigned char)*--s - 0x80U < 0x40 && s > str) |
| 243 p += pinc; | 249 ; |
| 244 if ((k = str_next(s, endstr-s, &sinc)) <= 0) { | 250 } |
| 245 if (c != END) return FNM_NOMATCH; | 251 if (tailcnt) |
| 246 break; | 252 return FNM_NOMATCH; |
| 247 } | 253 stail = s; |
| 248 s += sinc; | 254 |
| 249 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; | 255 /* Check that the pat and str tails match */ |
| 250 if (c == BRACKET) { | 256 p = ptail; |
| 251 if (!match_bracket(p-pinc, k, kfold)) | 257 for (;;) { |
| 252 return FNM_NOMATCH; | 258 c = pat_next(p, endpat - p, &pinc, flags); |
| 253 } else if (c != QUESTION && k != c && kfold != c) { | 259 p += pinc; |
| 254 return FNM_NOMATCH; | 260 if ((k = str_next(s, endstr - s, &sinc)) <= 0) { |
| 255 } | 261 if (c != END) |
| 256 } | 262 return FNM_NOMATCH; |
| 257 | 263 break; |
| 258 /* We're all done with the tails now, so throw them out */ | 264 } |
| 259 endstr = stail; | 265 s += sinc; |
| 260 endpat = ptail; | 266 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
| 261 | 267 if (c == BRACKET) { |
| 262 /* Match pattern components until there are none left */ | 268 if (!match_bracket(p - pinc, k, kfold)) |
| 263 while (pat<endpat) { | 269 return FNM_NOMATCH; |
| 264 p = pat; | 270 } else if (c != QUESTION && k != c && kfold != c) { |
| 265 s = str; | 271 return FNM_NOMATCH; |
| 266 for (;;) { | 272 } |
| 267 c = pat_next(p, endpat-p, &pinc, flags); | 273 } |
| 268 p += pinc; | 274 |
| 269 /* Encountering * completes/commits a component */ | 275 /* We're all done with the tails now, so throw them out */ |
| 270 if (c == STAR) { | 276 endstr = stail; |
| 271 pat = p; | 277 endpat = ptail; |
| 272 str = s; | 278 |
| 273 break; | 279 /* Match pattern components until there are none left */ |
| 274 } | 280 while (pat < endpat) { |
| 275 k = str_next(s, endstr-s, &sinc); | 281 p = pat; |
| 276 if (!k) | 282 s = str; |
| 277 return FNM_NOMATCH; | 283 for (;;) { |
| 278 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; | 284 c = pat_next(p, endpat - p, &pinc, flags); |
| 279 if (c == BRACKET) { | 285 p += pinc; |
| 280 if (!match_bracket(p-pinc, k, kfold)) | 286 /* Encountering * completes/commits a component */ |
| 281 break; | 287 if (c == STAR) { |
| 282 } else if (c != QUESTION && k != c && kfold != c) { | 288 pat = p; |
| 283 break; | 289 str = s; |
| 284 } | 290 break; |
| 285 s += sinc; | 291 } |
| 286 } | 292 k = str_next(s, endstr - s, &sinc); |
| 287 if (c == STAR) continue; | 293 if (!k) |
| 288 /* If we failed, advance str, by 1 char if it's a valid | 294 return FNM_NOMATCH; |
| 289 * char, or past all invalid bytes otherwise. */ | 295 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
| 290 k = str_next(str, endstr-str, &sinc); | 296 if (c == BRACKET) { |
| 291 if (k > 0) str += sinc; | 297 if (!match_bracket(p - pinc, k, kfold)) |
| 292 else for (str++; str_next(str, endstr-str, &sinc)<0; str++); | 298 break; |
| 293 } | 299 } else if (c != QUESTION && k != c && kfold != c) { |
| 294 | 300 break; |
| 295 return 0; | 301 } |
| 296 } | 302 s += sinc; |
| 297 | 303 } |
| 298 int fnmatch(const char *pat, const char *str, int flags) | 304 if (c == STAR) |
| 299 { | 305 continue; |
| 300 const char *s, *p; | 306 /* If we failed, advance str, by 1 char if it's a valid |
| 301 size_t inc; | 307 * char, or past all invalid bytes otherwise. */ |
| 302 int c; | 308 k = str_next(str, endstr - str, &sinc); |
| 303 if (flags & FNM_PATHNAME) for (;;) { | 309 if (k > 0) |
| 304 for (s=str; *s && *s!='/'; s++); | 310 str += sinc; |
| 305 for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=
inc); | 311 else |
| 306 if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR))) | 312 for (str++; str_next(str, endstr - str, &sinc) < 0; str++) |
| 307 return FNM_NOMATCH; | 313 ; |
| 308 if (fnmatch_internal(pat, p-pat, str, s-str, flags)) | 314 } |
| 309 return FNM_NOMATCH; | 315 |
| 310 if (!c) return 0; | 316 return 0; |
| 311 str = s+1; | 317 } |
| 312 pat = p+inc; | 318 |
| 313 } else if (flags & FNM_LEADING_DIR) { | 319 int fnmatch(const char* pat, const char* str, int flags) { |
| 314 for (s=str; *s; s++) { | 320 const char *s, *p; |
| 315 if (*s != '/') continue; | 321 size_t inc; |
| 316 if (!fnmatch_internal(pat, -1, str, s-str, flags)) | 322 int c; |
| 317 return 0; | 323 if (flags & FNM_PATHNAME) |
| 318 } | 324 for (;;) { |
| 319 } | 325 for (s = str; *s && *s != '/'; s++) |
| 320 return fnmatch_internal(pat, -1, str, -1, flags); | 326 ; |
| 321 } | 327 for (p = pat; (c = pat_next(p, -1, &inc, flags)) != END && c != '/'; |
| 328 p += inc) |
| 329 ; |
| 330 if (c != *s && (!*s || !(flags & FNM_LEADING_DIR))) |
| 331 return FNM_NOMATCH; |
| 332 if (fnmatch_internal(pat, p - pat, str, s - str, flags)) |
| 333 return FNM_NOMATCH; |
| 334 if (!c) |
| 335 return 0; |
| 336 str = s + 1; |
| 337 pat = p + inc; |
| 338 } |
| 339 else if (flags & FNM_LEADING_DIR) { |
| 340 for (s = str; *s; s++) { |
| 341 if (*s != '/') |
| 342 continue; |
| 343 if (!fnmatch_internal(pat, -1, str, s - str, flags)) |
| 344 return 0; |
| 345 } |
| 346 } |
| 347 return fnmatch_internal(pat, -1, str, -1, flags); |
| 348 } |
| OLD | NEW |