OLD | NEW |
1 /* | 1 /* |
2 * An implementation of what I call the "Sea of Stars" algorithm for | 2 * An implementation of what I call the "Sea of Stars" algorithm for |
3 * POSIX fnmatch(). The basic idea is that we factor the pattern into | 3 * POSIX fnmatch(). The basic idea is that we factor the pattern into |
4 * a head component (which we match first and can reject without ever | 4 * a head component (which we match first and can reject without ever |
5 * measuring the length of the string), an optional tail component | 5 * measuring the length of the string), an optional tail component |
6 * (which only exists if the pattern contains at least one star), and | 6 * (which only exists if the pattern contains at least one star), and |
7 * an optional "sea of stars", a set of star-separated components | 7 * an optional "sea of stars", a set of star-separated components |
8 * between the head and tail. After the head and tail matches have | 8 * between the head and tail. After the head and tail matches have |
9 * been removed from the input string, the components in the "sea of | 9 * been removed from the input string, the components in the "sea of |
10 * stars" are matched sequentially by searching for their first | 10 * stars" are matched sequentially by searching for their first |
11 * occurrence past the end of the previous match. | 11 * occurrence past the end of the previous match. |
12 * | 12 * |
13 * - Rich Felker, April 2012 | 13 * - Rich Felker, April 2012 |
14 */ | 14 */ |
15 | 15 |
16 #include <string.h> | 16 #include <string.h> |
17 #include <fnmatch.h> | 17 #include <fnmatch.h> |
18 #include <stdlib.h> | 18 #include <stdlib.h> |
19 #include <wchar.h> | 19 #include <wchar.h> |
20 #include <wctype.h> | 20 #include <wctype.h> |
21 #include "locale_impl.h" | 21 #include "locale_impl.h" |
22 | 22 |
23 #define END 0 | 23 #define END 0 |
24 #define UNMATCHABLE -2 | 24 #define UNMATCHABLE -2 |
25 #define BRACKET -3 | 25 #define BRACKET -3 |
26 #define QUESTION -4 | 26 #define QUESTION -4 |
27 #define STAR -5 | 27 #define STAR -5 |
28 | 28 |
29 static int str_next(const char *str, size_t n, size_t *step) | 29 static int str_next(const char* str, size_t n, size_t* step) { |
30 { | 30 if (!n) { |
31 » if (!n) { | 31 *step = 0; |
32 » » *step = 0; | 32 return 0; |
33 » » return 0; | 33 } |
34 » } | 34 if (str[0] >= 128U) { |
35 » if (str[0] >= 128U) { | 35 wchar_t wc; |
36 » » wchar_t wc; | 36 int k = mbtowc(&wc, str, n); |
37 » » int k = mbtowc(&wc, str, n); | 37 if (k < 0) { |
38 » » if (k<0) { | 38 *step = 1; |
39 » » » *step = 1; | 39 return -1; |
40 » » » return -1; | 40 } |
41 » » } | 41 *step = k; |
42 » » *step = k; | 42 return wc; |
43 » » return wc; | 43 } |
44 » } | 44 *step = 1; |
45 » *step = 1; | 45 return str[0]; |
46 » return str[0]; | 46 } |
47 } | 47 |
48 | 48 static int pat_next(const char* pat, size_t m, size_t* step, int flags) { |
49 static int pat_next(const char *pat, size_t m, size_t *step, int flags) | 49 int esc = 0; |
50 { | 50 if (!m || !*pat) { |
51 » int esc = 0; | 51 *step = 0; |
52 » if (!m || !*pat) { | 52 return END; |
53 » » *step = 0; | 53 } |
54 » » return END; | 54 *step = 1; |
55 » } | 55 if (pat[0] == '\\' && pat[1] && !(flags & FNM_NOESCAPE)) { |
56 » *step = 1; | 56 *step = 2; |
57 » if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) { | 57 pat++; |
58 » » *step = 2; | 58 esc = 1; |
59 » » pat++; | 59 goto escaped; |
60 » » esc = 1; | 60 } |
61 » » goto escaped; | 61 if (pat[0] == '[') { |
62 » } | 62 size_t k = 1; |
63 » if (pat[0]=='[') { | 63 if (k < m) |
64 » » size_t k = 1; | 64 if (pat[k] == '^' || pat[k] == '!') |
65 » » if (k<m) if (pat[k] == '^' || pat[k] == '!') k++; | 65 k++; |
66 » » if (k<m) if (pat[k] == ']') k++; | 66 if (k < m) |
67 » » for (; k<m && pat[k] && pat[k]!=']'; k++) { | 67 if (pat[k] == ']') |
68 » » » if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' |
| pat[k+1]=='.' || pat[k+1]=='=')) { | 68 k++; |
69 » » » » int z = pat[k+1]; | 69 for (; k < m && pat[k] && pat[k] != ']'; k++) { |
70 » » » » k+=2; | 70 if (k + 1 < m && pat[k + 1] && pat[k] == '[' && |
71 » » » » if (k<m && pat[k]) k++; | 71 (pat[k + 1] == ':' || pat[k + 1] == '.' || pat[k + 1] == '=')) { |
72 » » » » while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=
']')) k++; | 72 int z = pat[k + 1]; |
73 » » » » if (k==m || !pat[k]) break; | 73 k += 2; |
74 » » » } | 74 if (k < m && pat[k]) |
75 » » } | 75 k++; |
76 » » if (k==m || !pat[k]) { | 76 while (k < m && pat[k] && (pat[k - 1] != z || pat[k] != ']')) |
77 » » » *step = 1; | 77 k++; |
78 » » » return '['; | 78 if (k == m || !pat[k]) |
79 » » } | 79 break; |
80 » » *step = k+1; | 80 } |
81 » » return BRACKET; | 81 } |
82 » } | 82 if (k == m || !pat[k]) { |
83 » if (pat[0] == '*') | 83 *step = 1; |
84 » » return STAR; | 84 return '['; |
85 » if (pat[0] == '?') | 85 } |
86 » » return QUESTION; | 86 *step = k + 1; |
| 87 return BRACKET; |
| 88 } |
| 89 if (pat[0] == '*') |
| 90 return STAR; |
| 91 if (pat[0] == '?') |
| 92 return QUESTION; |
87 escaped: | 93 escaped: |
88 if (pat[0] >= 128U) { | 94 if (pat[0] >= 128U) { |
89 wchar_t wc; | 95 wchar_t wc; |
90 int k = mbtowc(&wc, pat, m); | 96 int k = mbtowc(&wc, pat, m); |
91 if (k<0) { | 97 if (k < 0) { |
92 *step = 0; | 98 *step = 0; |
93 return UNMATCHABLE; | 99 return UNMATCHABLE; |
94 } | 100 } |
95 *step = k + esc; | 101 *step = k + esc; |
96 return wc; | 102 return wc; |
97 } | 103 } |
98 return pat[0]; | 104 return pat[0]; |
99 } | 105 } |
100 | 106 |
101 static int casefold(int k) | 107 static int casefold(int k) { |
102 { | 108 int c = towupper(k); |
103 int c = towupper(k); | 109 return c == k ? towlower(k) : c; |
104 return c == k ? towlower(k) : c; | 110 } |
105 } | 111 |
106 | 112 static int match_bracket(const char* p, int k, int kfold) { |
107 static int match_bracket(const char *p, int k, int kfold) | 113 wchar_t wc; |
108 { | 114 int inv = 0; |
109 wchar_t wc; | 115 p++; |
110 int inv = 0; | 116 if (*p == '^' || *p == '!') { |
111 p++; | 117 inv = 1; |
112 if (*p=='^' || *p=='!') { | 118 p++; |
113 inv = 1; | 119 } |
114 p++; | 120 if (*p == ']') { |
115 } | 121 if (k == ']') |
116 if (*p==']') { | 122 return !inv; |
117 if (k==']') return !inv; | 123 p++; |
118 p++; | 124 } else if (*p == '-') { |
119 } else if (*p=='-') { | 125 if (k == '-') |
120 if (k=='-') return !inv; | 126 return !inv; |
121 p++; | 127 p++; |
122 } | 128 } |
123 wc = p[-1]; | 129 wc = p[-1]; |
124 for (; *p != ']'; p++) { | 130 for (; *p != ']'; p++) { |
125 if (p[0]=='-' && p[1]!=']') { | 131 if (p[0] == '-' && p[1] != ']') { |
126 wchar_t wc2; | 132 wchar_t wc2; |
127 int l = mbtowc(&wc2, p+1, 4); | 133 int l = mbtowc(&wc2, p + 1, 4); |
128 if (l < 0) return 0; | 134 if (l < 0) |
129 if (wc <= wc2) | 135 return 0; |
130 if ((unsigned)k-wc <= wc2-wc || | 136 if (wc <= wc2) |
131 (unsigned)kfold-wc <= wc2-wc) | 137 if ((unsigned)k - wc <= wc2 - wc || (unsigned)kfold - wc <= wc2 - wc) |
132 return !inv; | 138 return !inv; |
133 p += l-1; | 139 p += l - 1; |
134 continue; | 140 continue; |
135 } | 141 } |
136 if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) { | 142 if (p[0] == '[' && (p[1] == ':' || p[1] == '.' || p[1] == '=')) { |
137 const char *p0 = p+2; | 143 const char* p0 = p + 2; |
138 int z = p[1]; | 144 int z = p[1]; |
139 p+=3; | 145 p += 3; |
140 while (p[-1]!=z || p[0]!=']') p++; | 146 while (p[-1] != z || p[0] != ']') |
141 if (z == ':' && p-1-p0 < 16) { | 147 p++; |
142 char buf[16]; | 148 if (z == ':' && p - 1 - p0 < 16) { |
143 memcpy(buf, p0, p-1-p0); | 149 char buf[16]; |
144 buf[p-1-p0] = 0; | 150 memcpy(buf, p0, p - 1 - p0); |
145 if (iswctype(k, wctype(buf)) || | 151 buf[p - 1 - p0] = 0; |
146 iswctype(kfold, wctype(buf))) | 152 if (iswctype(k, wctype(buf)) || iswctype(kfold, wctype(buf))) |
147 return !inv; | 153 return !inv; |
148 } | 154 } |
149 continue; | 155 continue; |
150 } | 156 } |
151 if (*p < 128U) { | 157 if (*p < 128U) { |
152 wc = (unsigned char)*p; | 158 wc = (unsigned char)*p; |
153 } else { | 159 } else { |
154 int l = mbtowc(&wc, p, 4); | 160 int l = mbtowc(&wc, p, 4); |
155 if (l < 0) return 0; | 161 if (l < 0) |
156 p += l-1; | 162 return 0; |
157 } | 163 p += l - 1; |
158 if (wc==k || wc==kfold) return !inv; | 164 } |
159 } | 165 if (wc == k || wc == kfold) |
160 return inv; | 166 return !inv; |
161 } | 167 } |
162 | 168 return inv; |
163 static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
, int flags) | 169 } |
164 { | 170 |
165 const char *p, *ptail, *endpat; | 171 static int fnmatch_internal(const char* pat, |
166 const char *s, *stail, *endstr; | 172 size_t m, |
167 size_t pinc, sinc, tailcnt=0; | 173 const char* str, |
168 int c, k, kfold; | 174 size_t n, |
169 | 175 int flags) { |
170 if (flags & FNM_PERIOD) { | 176 const char *p, *ptail, *endpat; |
171 if (*str == '.' && *pat != '.') | 177 const char *s, *stail, *endstr; |
172 return FNM_NOMATCH; | 178 size_t pinc, sinc, tailcnt = 0; |
173 } | 179 int c, k, kfold; |
174 for (;;) { | 180 |
175 switch ((c = pat_next(pat, m, &pinc, flags))) { | 181 if (flags & FNM_PERIOD) { |
176 case UNMATCHABLE: | 182 if (*str == '.' && *pat != '.') |
177 return FNM_NOMATCH; | 183 return FNM_NOMATCH; |
178 case STAR: | 184 } |
179 pat++; | 185 for (;;) { |
180 m--; | 186 switch ((c = pat_next(pat, m, &pinc, flags))) { |
181 break; | 187 case UNMATCHABLE: |
182 default: | 188 return FNM_NOMATCH; |
183 k = str_next(str, n, &sinc); | 189 case STAR: |
184 if (k <= 0) | 190 pat++; |
185 return (c==END) ? 0 : FNM_NOMATCH; | 191 m--; |
186 str += sinc; | 192 break; |
187 n -= sinc; | 193 default: |
188 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; | 194 k = str_next(str, n, &sinc); |
189 if (c == BRACKET) { | 195 if (k <= 0) |
190 if (!match_bracket(pat, k, kfold)) | 196 return (c == END) ? 0 : FNM_NOMATCH; |
191 return FNM_NOMATCH; | 197 str += sinc; |
192 } else if (c != QUESTION && k != c && kfold != c) { | 198 n -= sinc; |
193 return FNM_NOMATCH; | 199 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
194 } | 200 if (c == BRACKET) { |
195 pat+=pinc; | 201 if (!match_bracket(pat, k, kfold)) |
196 m-=pinc; | 202 return FNM_NOMATCH; |
197 continue; | 203 } else if (c != QUESTION && k != c && kfold != c) { |
198 } | 204 return FNM_NOMATCH; |
199 break; | 205 } |
200 } | 206 pat += pinc; |
201 | 207 m -= pinc; |
202 /* Compute real pat length if it was initially unknown/-1 */ | 208 continue; |
203 m = strnlen(pat, m); | 209 } |
204 endpat = pat + m; | 210 break; |
205 | 211 } |
206 /* Find the last * in pat and count chars needed after it */ | 212 |
207 for (p=ptail=pat; p<endpat; p+=pinc) { | 213 /* Compute real pat length if it was initially unknown/-1 */ |
208 switch (pat_next(p, endpat-p, &pinc, flags)) { | 214 m = strnlen(pat, m); |
209 case UNMATCHABLE: | 215 endpat = pat + m; |
210 return FNM_NOMATCH; | 216 |
211 case STAR: | 217 /* Find the last * in pat and count chars needed after it */ |
212 tailcnt=0; | 218 for (p = ptail = pat; p < endpat; p += pinc) { |
213 ptail = p+1; | 219 switch (pat_next(p, endpat - p, &pinc, flags)) { |
214 break; | 220 case UNMATCHABLE: |
215 default: | 221 return FNM_NOMATCH; |
216 tailcnt++; | 222 case STAR: |
217 break; | 223 tailcnt = 0; |
218 } | 224 ptail = p + 1; |
219 } | 225 break; |
220 | 226 default: |
221 /* Past this point we need not check for UNMATCHABLE in pat, | 227 tailcnt++; |
222 * because all of pat has already been parsed once. */ | 228 break; |
223 | 229 } |
224 /* Compute real str length if it was initially unknown/-1 */ | 230 } |
225 n = strnlen(str, n); | 231 |
226 endstr = str + n; | 232 /* Past this point we need not check for UNMATCHABLE in pat, |
227 if (n < tailcnt) return FNM_NOMATCH; | 233 * because all of pat has already been parsed once. */ |
228 | 234 |
229 /* Find the final tailcnt chars of str, accounting for UTF-8. | 235 /* Compute real str length if it was initially unknown/-1 */ |
230 * On illegal sequences we may get it wrong, but in that case | 236 n = strnlen(str, n); |
231 * we necessarily have a matching failure anyway. */ | 237 endstr = str + n; |
232 for (s=endstr; s>str && tailcnt; tailcnt--) { | 238 if (n < tailcnt) |
233 if (s[-1] < 128U || MB_CUR_MAX==1) s--; | 239 return FNM_NOMATCH; |
234 else while ((unsigned char)*--s-0x80U<0x40 && s>str); | 240 |
235 } | 241 /* Find the final tailcnt chars of str, accounting for UTF-8. |
236 if (tailcnt) return FNM_NOMATCH; | 242 * On illegal sequences we may get it wrong, but in that case |
237 stail = s; | 243 * we necessarily have a matching failure anyway. */ |
238 | 244 for (s = endstr; s > str && tailcnt; tailcnt--) { |
239 /* Check that the pat and str tails match */ | 245 if (s[-1] < 128U || MB_CUR_MAX == 1) |
240 p = ptail; | 246 s--; |
241 for (;;) { | 247 else |
242 c = pat_next(p, endpat-p, &pinc, flags); | 248 while ((unsigned char)*--s - 0x80U < 0x40 && s > str) |
243 p += pinc; | 249 ; |
244 if ((k = str_next(s, endstr-s, &sinc)) <= 0) { | 250 } |
245 if (c != END) return FNM_NOMATCH; | 251 if (tailcnt) |
246 break; | 252 return FNM_NOMATCH; |
247 } | 253 stail = s; |
248 s += sinc; | 254 |
249 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; | 255 /* Check that the pat and str tails match */ |
250 if (c == BRACKET) { | 256 p = ptail; |
251 if (!match_bracket(p-pinc, k, kfold)) | 257 for (;;) { |
252 return FNM_NOMATCH; | 258 c = pat_next(p, endpat - p, &pinc, flags); |
253 } else if (c != QUESTION && k != c && kfold != c) { | 259 p += pinc; |
254 return FNM_NOMATCH; | 260 if ((k = str_next(s, endstr - s, &sinc)) <= 0) { |
255 } | 261 if (c != END) |
256 } | 262 return FNM_NOMATCH; |
257 | 263 break; |
258 /* We're all done with the tails now, so throw them out */ | 264 } |
259 endstr = stail; | 265 s += sinc; |
260 endpat = ptail; | 266 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
261 | 267 if (c == BRACKET) { |
262 /* Match pattern components until there are none left */ | 268 if (!match_bracket(p - pinc, k, kfold)) |
263 while (pat<endpat) { | 269 return FNM_NOMATCH; |
264 p = pat; | 270 } else if (c != QUESTION && k != c && kfold != c) { |
265 s = str; | 271 return FNM_NOMATCH; |
266 for (;;) { | 272 } |
267 c = pat_next(p, endpat-p, &pinc, flags); | 273 } |
268 p += pinc; | 274 |
269 /* Encountering * completes/commits a component */ | 275 /* We're all done with the tails now, so throw them out */ |
270 if (c == STAR) { | 276 endstr = stail; |
271 pat = p; | 277 endpat = ptail; |
272 str = s; | 278 |
273 break; | 279 /* Match pattern components until there are none left */ |
274 } | 280 while (pat < endpat) { |
275 k = str_next(s, endstr-s, &sinc); | 281 p = pat; |
276 if (!k) | 282 s = str; |
277 return FNM_NOMATCH; | 283 for (;;) { |
278 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; | 284 c = pat_next(p, endpat - p, &pinc, flags); |
279 if (c == BRACKET) { | 285 p += pinc; |
280 if (!match_bracket(p-pinc, k, kfold)) | 286 /* Encountering * completes/commits a component */ |
281 break; | 287 if (c == STAR) { |
282 } else if (c != QUESTION && k != c && kfold != c) { | 288 pat = p; |
283 break; | 289 str = s; |
284 } | 290 break; |
285 s += sinc; | 291 } |
286 } | 292 k = str_next(s, endstr - s, &sinc); |
287 if (c == STAR) continue; | 293 if (!k) |
288 /* If we failed, advance str, by 1 char if it's a valid | 294 return FNM_NOMATCH; |
289 * char, or past all invalid bytes otherwise. */ | 295 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; |
290 k = str_next(str, endstr-str, &sinc); | 296 if (c == BRACKET) { |
291 if (k > 0) str += sinc; | 297 if (!match_bracket(p - pinc, k, kfold)) |
292 else for (str++; str_next(str, endstr-str, &sinc)<0; str++); | 298 break; |
293 } | 299 } else if (c != QUESTION && k != c && kfold != c) { |
294 | 300 break; |
295 return 0; | 301 } |
296 } | 302 s += sinc; |
297 | 303 } |
298 int fnmatch(const char *pat, const char *str, int flags) | 304 if (c == STAR) |
299 { | 305 continue; |
300 const char *s, *p; | 306 /* If we failed, advance str, by 1 char if it's a valid |
301 size_t inc; | 307 * char, or past all invalid bytes otherwise. */ |
302 int c; | 308 k = str_next(str, endstr - str, &sinc); |
303 if (flags & FNM_PATHNAME) for (;;) { | 309 if (k > 0) |
304 for (s=str; *s && *s!='/'; s++); | 310 str += sinc; |
305 for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=
inc); | 311 else |
306 if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR))) | 312 for (str++; str_next(str, endstr - str, &sinc) < 0; str++) |
307 return FNM_NOMATCH; | 313 ; |
308 if (fnmatch_internal(pat, p-pat, str, s-str, flags)) | 314 } |
309 return FNM_NOMATCH; | 315 |
310 if (!c) return 0; | 316 return 0; |
311 str = s+1; | 317 } |
312 pat = p+inc; | 318 |
313 } else if (flags & FNM_LEADING_DIR) { | 319 int fnmatch(const char* pat, const char* str, int flags) { |
314 for (s=str; *s; s++) { | 320 const char *s, *p; |
315 if (*s != '/') continue; | 321 size_t inc; |
316 if (!fnmatch_internal(pat, -1, str, s-str, flags)) | 322 int c; |
317 return 0; | 323 if (flags & FNM_PATHNAME) |
318 } | 324 for (;;) { |
319 } | 325 for (s = str; *s && *s != '/'; s++) |
320 return fnmatch_internal(pat, -1, str, -1, flags); | 326 ; |
321 } | 327 for (p = pat; (c = pat_next(p, -1, &inc, flags)) != END && c != '/'; |
| 328 p += inc) |
| 329 ; |
| 330 if (c != *s && (!*s || !(flags & FNM_LEADING_DIR))) |
| 331 return FNM_NOMATCH; |
| 332 if (fnmatch_internal(pat, p - pat, str, s - str, flags)) |
| 333 return FNM_NOMATCH; |
| 334 if (!c) |
| 335 return 0; |
| 336 str = s + 1; |
| 337 pat = p + inc; |
| 338 } |
| 339 else if (flags & FNM_LEADING_DIR) { |
| 340 for (s = str; *s; s++) { |
| 341 if (*s != '/') |
| 342 continue; |
| 343 if (!fnmatch_internal(pat, -1, str, s - str, flags)) |
| 344 return 0; |
| 345 } |
| 346 } |
| 347 return fnmatch_internal(pat, -1, str, -1, flags); |
| 348 } |
OLD | NEW |