OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * An implementation of what I call the "Sea of Stars" algorithm for |
| 3 * POSIX fnmatch(). The basic idea is that we factor the pattern into |
| 4 * a head component (which we match first and can reject without ever |
| 5 * measuring the length of the string), an optional tail component |
| 6 * (which only exists if the pattern contains at least one star), and |
| 7 * an optional "sea of stars", a set of star-separated components |
| 8 * between the head and tail. After the head and tail matches have |
| 9 * been removed from the input string, the components in the "sea of |
| 10 * stars" are matched sequentially by searching for their first |
| 11 * occurrence past the end of the previous match. |
| 12 * |
| 13 * - Rich Felker, April 2012 |
| 14 */ |
| 15 |
| 16 #include <string.h> |
| 17 #include <fnmatch.h> |
| 18 #include <stdlib.h> |
| 19 #include <wchar.h> |
| 20 #include <wctype.h> |
| 21 #include "locale_impl.h" |
| 22 |
| 23 #define END 0 |
| 24 #define UNMATCHABLE -2 |
| 25 #define BRACKET -3 |
| 26 #define QUESTION -4 |
| 27 #define STAR -5 |
| 28 |
| 29 static int str_next(const char *str, size_t n, size_t *step) |
| 30 { |
| 31 if (!n) { |
| 32 *step = 0; |
| 33 return 0; |
| 34 } |
| 35 if (str[0] >= 128U) { |
| 36 wchar_t wc; |
| 37 int k = mbtowc(&wc, str, n); |
| 38 if (k<0) { |
| 39 *step = 1; |
| 40 return -1; |
| 41 } |
| 42 *step = k; |
| 43 return wc; |
| 44 } |
| 45 *step = 1; |
| 46 return str[0]; |
| 47 } |
| 48 |
| 49 static int pat_next(const char *pat, size_t m, size_t *step, int flags) |
| 50 { |
| 51 int esc = 0; |
| 52 if (!m || !*pat) { |
| 53 *step = 0; |
| 54 return END; |
| 55 } |
| 56 *step = 1; |
| 57 if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) { |
| 58 *step = 2; |
| 59 pat++; |
| 60 esc = 1; |
| 61 goto escaped; |
| 62 } |
| 63 if (pat[0]=='[') { |
| 64 size_t k = 1; |
| 65 if (k<m) if (pat[k] == '^' || pat[k] == '!') k++; |
| 66 if (k<m) if (pat[k] == ']') k++; |
| 67 for (; k<m && pat[k] && pat[k]!=']'; k++) { |
| 68 if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' |
| pat[k+1]=='.' || pat[k+1]=='=')) { |
| 69 int z = pat[k+1]; |
| 70 k+=2; |
| 71 if (k<m && pat[k]) k++; |
| 72 while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=
']')) k++; |
| 73 if (k==m || !pat[k]) break; |
| 74 } |
| 75 } |
| 76 if (k==m || !pat[k]) { |
| 77 *step = 1; |
| 78 return '['; |
| 79 } |
| 80 *step = k+1; |
| 81 return BRACKET; |
| 82 } |
| 83 if (pat[0] == '*') |
| 84 return STAR; |
| 85 if (pat[0] == '?') |
| 86 return QUESTION; |
| 87 escaped: |
| 88 if (pat[0] >= 128U) { |
| 89 wchar_t wc; |
| 90 int k = mbtowc(&wc, pat, m); |
| 91 if (k<0) { |
| 92 *step = 0; |
| 93 return UNMATCHABLE; |
| 94 } |
| 95 *step = k + esc; |
| 96 return wc; |
| 97 } |
| 98 return pat[0]; |
| 99 } |
| 100 |
| 101 static int casefold(int k) |
| 102 { |
| 103 int c = towupper(k); |
| 104 return c == k ? towlower(k) : c; |
| 105 } |
| 106 |
| 107 static int match_bracket(const char *p, int k, int kfold) |
| 108 { |
| 109 wchar_t wc; |
| 110 int inv = 0; |
| 111 p++; |
| 112 if (*p=='^' || *p=='!') { |
| 113 inv = 1; |
| 114 p++; |
| 115 } |
| 116 if (*p==']') { |
| 117 if (k==']') return !inv; |
| 118 p++; |
| 119 } else if (*p=='-') { |
| 120 if (k=='-') return !inv; |
| 121 p++; |
| 122 } |
| 123 wc = p[-1]; |
| 124 for (; *p != ']'; p++) { |
| 125 if (p[0]=='-' && p[1]!=']') { |
| 126 wchar_t wc2; |
| 127 int l = mbtowc(&wc2, p+1, 4); |
| 128 if (l < 0) return 0; |
| 129 if (wc <= wc2) |
| 130 if ((unsigned)k-wc <= wc2-wc || |
| 131 (unsigned)kfold-wc <= wc2-wc) |
| 132 return !inv; |
| 133 p += l-1; |
| 134 continue; |
| 135 } |
| 136 if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) { |
| 137 const char *p0 = p+2; |
| 138 int z = p[1]; |
| 139 p+=3; |
| 140 while (p[-1]!=z || p[0]!=']') p++; |
| 141 if (z == ':' && p-1-p0 < 16) { |
| 142 char buf[16]; |
| 143 memcpy(buf, p0, p-1-p0); |
| 144 buf[p-1-p0] = 0; |
| 145 if (iswctype(k, wctype(buf)) || |
| 146 iswctype(kfold, wctype(buf))) |
| 147 return !inv; |
| 148 } |
| 149 continue; |
| 150 } |
| 151 if (*p < 128U) { |
| 152 wc = (unsigned char)*p; |
| 153 } else { |
| 154 int l = mbtowc(&wc, p, 4); |
| 155 if (l < 0) return 0; |
| 156 p += l-1; |
| 157 } |
| 158 if (wc==k || wc==kfold) return !inv; |
| 159 } |
| 160 return inv; |
| 161 } |
| 162 |
| 163 static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
, int flags) |
| 164 { |
| 165 const char *p, *ptail, *endpat; |
| 166 const char *s, *stail, *endstr; |
| 167 size_t pinc, sinc, tailcnt=0; |
| 168 int c, k, kfold; |
| 169 |
| 170 if (flags & FNM_PERIOD) { |
| 171 if (*str == '.' && *pat != '.') |
| 172 return FNM_NOMATCH; |
| 173 } |
| 174 for (;;) { |
| 175 switch ((c = pat_next(pat, m, &pinc, flags))) { |
| 176 case UNMATCHABLE: |
| 177 return FNM_NOMATCH; |
| 178 case STAR: |
| 179 pat++; |
| 180 m--; |
| 181 break; |
| 182 default: |
| 183 k = str_next(str, n, &sinc); |
| 184 if (k <= 0) |
| 185 return (c==END) ? 0 : FNM_NOMATCH; |
| 186 str += sinc; |
| 187 n -= sinc; |
| 188 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
| 189 if (c == BRACKET) { |
| 190 if (!match_bracket(pat, k, kfold)) |
| 191 return FNM_NOMATCH; |
| 192 } else if (c != QUESTION && k != c && kfold != c) { |
| 193 return FNM_NOMATCH; |
| 194 } |
| 195 pat+=pinc; |
| 196 m-=pinc; |
| 197 continue; |
| 198 } |
| 199 break; |
| 200 } |
| 201 |
| 202 /* Compute real pat length if it was initially unknown/-1 */ |
| 203 m = strnlen(pat, m); |
| 204 endpat = pat + m; |
| 205 |
| 206 /* Find the last * in pat and count chars needed after it */ |
| 207 for (p=ptail=pat; p<endpat; p+=pinc) { |
| 208 switch (pat_next(p, endpat-p, &pinc, flags)) { |
| 209 case UNMATCHABLE: |
| 210 return FNM_NOMATCH; |
| 211 case STAR: |
| 212 tailcnt=0; |
| 213 ptail = p+1; |
| 214 break; |
| 215 default: |
| 216 tailcnt++; |
| 217 break; |
| 218 } |
| 219 } |
| 220 |
| 221 /* Past this point we need not check for UNMATCHABLE in pat, |
| 222 * because all of pat has already been parsed once. */ |
| 223 |
| 224 /* Compute real str length if it was initially unknown/-1 */ |
| 225 n = strnlen(str, n); |
| 226 endstr = str + n; |
| 227 if (n < tailcnt) return FNM_NOMATCH; |
| 228 |
| 229 /* Find the final tailcnt chars of str, accounting for UTF-8. |
| 230 * On illegal sequences we may get it wrong, but in that case |
| 231 * we necessarily have a matching failure anyway. */ |
| 232 for (s=endstr; s>str && tailcnt; tailcnt--) { |
| 233 if (s[-1] < 128U || MB_CUR_MAX==1) s--; |
| 234 else while ((unsigned char)*--s-0x80U<0x40 && s>str); |
| 235 } |
| 236 if (tailcnt) return FNM_NOMATCH; |
| 237 stail = s; |
| 238 |
| 239 /* Check that the pat and str tails match */ |
| 240 p = ptail; |
| 241 for (;;) { |
| 242 c = pat_next(p, endpat-p, &pinc, flags); |
| 243 p += pinc; |
| 244 if ((k = str_next(s, endstr-s, &sinc)) <= 0) { |
| 245 if (c != END) return FNM_NOMATCH; |
| 246 break; |
| 247 } |
| 248 s += sinc; |
| 249 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
| 250 if (c == BRACKET) { |
| 251 if (!match_bracket(p-pinc, k, kfold)) |
| 252 return FNM_NOMATCH; |
| 253 } else if (c != QUESTION && k != c && kfold != c) { |
| 254 return FNM_NOMATCH; |
| 255 } |
| 256 } |
| 257 |
| 258 /* We're all done with the tails now, so throw them out */ |
| 259 endstr = stail; |
| 260 endpat = ptail; |
| 261 |
| 262 /* Match pattern components until there are none left */ |
| 263 while (pat<endpat) { |
| 264 p = pat; |
| 265 s = str; |
| 266 for (;;) { |
| 267 c = pat_next(p, endpat-p, &pinc, flags); |
| 268 p += pinc; |
| 269 /* Encountering * completes/commits a component */ |
| 270 if (c == STAR) { |
| 271 pat = p; |
| 272 str = s; |
| 273 break; |
| 274 } |
| 275 k = str_next(s, endstr-s, &sinc); |
| 276 if (!k) |
| 277 return FNM_NOMATCH; |
| 278 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
| 279 if (c == BRACKET) { |
| 280 if (!match_bracket(p-pinc, k, kfold)) |
| 281 break; |
| 282 } else if (c != QUESTION && k != c && kfold != c) { |
| 283 break; |
| 284 } |
| 285 s += sinc; |
| 286 } |
| 287 if (c == STAR) continue; |
| 288 /* If we failed, advance str, by 1 char if it's a valid |
| 289 * char, or past all invalid bytes otherwise. */ |
| 290 k = str_next(str, endstr-str, &sinc); |
| 291 if (k > 0) str += sinc; |
| 292 else for (str++; str_next(str, endstr-str, &sinc)<0; str++); |
| 293 } |
| 294 |
| 295 return 0; |
| 296 } |
| 297 |
| 298 int fnmatch(const char *pat, const char *str, int flags) |
| 299 { |
| 300 const char *s, *p; |
| 301 size_t inc; |
| 302 int c; |
| 303 if (flags & FNM_PATHNAME) for (;;) { |
| 304 for (s=str; *s && *s!='/'; s++); |
| 305 for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=
inc); |
| 306 if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR))) |
| 307 return FNM_NOMATCH; |
| 308 if (fnmatch_internal(pat, p-pat, str, s-str, flags)) |
| 309 return FNM_NOMATCH; |
| 310 if (!c) return 0; |
| 311 str = s+1; |
| 312 pat = p+inc; |
| 313 } else if (flags & FNM_LEADING_DIR) { |
| 314 for (s=str; *s; s++) { |
| 315 if (*s != '/') continue; |
| 316 if (!fnmatch_internal(pat, -1, str, s-str, flags)) |
| 317 return 0; |
| 318 } |
| 319 } |
| 320 return fnmatch_internal(pat, -1, str, -1, flags); |
| 321 } |
OLD | NEW |