OLD | NEW |
| (Empty) |
1 /* Byte-wise substring search, using the Two-Way algorithm. | |
2 Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc. | |
3 This file is part of the GNU C Library. | |
4 Written by Eric Blake <ebb9@byu.net>, 2008. | |
5 | |
6 This program is free software; you can redistribute it and/or modify | |
7 it under the terms of the GNU General Public License as published by | |
8 the Free Software Foundation; either version 3, or (at your option) | |
9 any later version. | |
10 | |
11 This program is distributed in the hope that it will be useful, | |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 GNU General Public License for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License along | |
17 with this program; if not, write to the Free Software Foundation, | |
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | |
19 | |
20 /* Before including this file, you need to include <config.h> and | |
21 <string.h>, and define: | |
22 RESULT_TYPE A macro that expands to the return type. | |
23 AVAILABLE(h, h_l, j, n_l) | |
24 A macro that returns nonzero if there are | |
25 at least N_L bytes left starting at H[J]. | |
26 H is 'unsigned char *', H_L, J, and N_L | |
27 are 'size_t'; H_L is an lvalue. For | |
28 NUL-terminated searches, H_L can be | |
29 modified each iteration to avoid having | |
30 to compute the end of H up front. | |
31 | |
32 For case-insensitivity, you may optionally define: | |
33 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L | |
34 characters of P1 and P2 are equal. | |
35 CANON_ELEMENT(c) A macro that canonicalizes an element right after | |
36 it has been fetched from one of the two strings. | |
37 The argument is an 'unsigned char'; the result | |
38 must be an 'unsigned char' as well. | |
39 | |
40 This file undefines the macros documented above, and defines | |
41 LONG_NEEDLE_THRESHOLD. | |
42 */ | |
43 | |
44 #include <limits.h> | |
45 #include <stdint.h> | |
46 | |
47 /* We use the Two-Way string matching algorithm, which guarantees | |
48 linear complexity with constant space. Additionally, for long | |
49 needles, we also use a bad character shift table similar to the | |
50 Boyer-Moore algorithm to achieve improved (potentially sub-linear) | |
51 performance. | |
52 | |
53 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | |
54 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | |
55 */ | |
56 | |
57 /* Point at which computing a bad-byte shift table is likely to be | |
58 worthwhile. Small needles should not compute a table, since it | |
59 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a | |
60 speedup no greater than a factor of NEEDLE_LEN. The larger the | |
61 needle, the better the potential performance gain. On the other | |
62 hand, on non-POSIX systems with CHAR_BIT larger than eight, the | |
63 memory required for the table is prohibitive. */ | |
64 #if CHAR_BIT < 10 | |
65 # define LONG_NEEDLE_THRESHOLD 32U | |
66 #else | |
67 # define LONG_NEEDLE_THRESHOLD SIZE_MAX | |
68 #endif | |
69 | |
70 #ifndef MAX | |
71 # define MAX(a, b) ((a < b) ? (b) : (a)) | |
72 #endif | |
73 | |
74 #ifndef CANON_ELEMENT | |
75 # define CANON_ELEMENT(c) c | |
76 #endif | |
77 #ifndef CMP_FUNC | |
78 # define CMP_FUNC memcmp | |
79 #endif | |
80 | |
81 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. | |
82 Return the index of the first byte in the right half, and set | |
83 *PERIOD to the global period of the right half. | |
84 | |
85 The global period of a string is the smallest index (possibly its | |
86 length) at which all remaining bytes in the string are repetitions | |
87 of the prefix (the last repetition may be a subset of the prefix). | |
88 | |
89 When NEEDLE is factored into two halves, a local period is the | |
90 length of the smallest word that shares a suffix with the left half | |
91 and shares a prefix with the right half. All factorizations of a | |
92 non-empty NEEDLE have a local period of at least 1 and no greater | |
93 than NEEDLE_LEN. | |
94 | |
95 A critical factorization has the property that the local period | |
96 equals the global period. All strings have at least one critical | |
97 factorization with the left half smaller than the global period. | |
98 | |
99 Given an ordered alphabet, a critical factorization can be computed | |
100 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | |
101 larger of two ordered maximal suffixes. The ordered maximal | |
102 suffixes are determined by lexicographic comparison of | |
103 periodicity. */ | |
104 static size_t | |
105 critical_factorization (const unsigned char *needle, size_t needle_len, | |
106 size_t *period) | |
107 { | |
108 /* Index of last byte of left half, or SIZE_MAX. */ | |
109 size_t max_suffix, max_suffix_rev; | |
110 size_t j; /* Index into NEEDLE for current candidate suffix. */ | |
111 size_t k; /* Offset into current period. */ | |
112 size_t p; /* Intermediate period. */ | |
113 unsigned char a, b; /* Current comparison bytes. */ | |
114 | |
115 /* Invariants: | |
116 0 <= j < NEEDLE_LEN - 1 | |
117 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | |
118 min(max_suffix, max_suffix_rev) < global period of NEEDLE | |
119 1 <= p <= global period of NEEDLE | |
120 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | |
121 1 <= k <= p | |
122 */ | |
123 | |
124 /* Perform lexicographic search. */ | |
125 max_suffix = SIZE_MAX; | |
126 j = 0; | |
127 k = p = 1; | |
128 while (j + k < needle_len) | |
129 { | |
130 a = CANON_ELEMENT (needle[j + k]); | |
131 b = CANON_ELEMENT (needle[max_suffix + k]); | |
132 if (a < b) | |
133 { | |
134 /* Suffix is smaller, period is entire prefix so far. */ | |
135 j += k; | |
136 k = 1; | |
137 p = j - max_suffix; | |
138 } | |
139 else if (a == b) | |
140 { | |
141 /* Advance through repetition of the current period. */ | |
142 if (k != p) | |
143 ++k; | |
144 else | |
145 { | |
146 j += p; | |
147 k = 1; | |
148 } | |
149 } | |
150 else /* b < a */ | |
151 { | |
152 /* Suffix is larger, start over from current location. */ | |
153 max_suffix = j++; | |
154 k = p = 1; | |
155 } | |
156 } | |
157 *period = p; | |
158 | |
159 /* Perform reverse lexicographic search. */ | |
160 max_suffix_rev = SIZE_MAX; | |
161 j = 0; | |
162 k = p = 1; | |
163 while (j + k < needle_len) | |
164 { | |
165 a = CANON_ELEMENT (needle[j + k]); | |
166 b = CANON_ELEMENT (needle[max_suffix_rev + k]); | |
167 if (b < a) | |
168 { | |
169 /* Suffix is smaller, period is entire prefix so far. */ | |
170 j += k; | |
171 k = 1; | |
172 p = j - max_suffix_rev; | |
173 } | |
174 else if (a == b) | |
175 { | |
176 /* Advance through repetition of the current period. */ | |
177 if (k != p) | |
178 ++k; | |
179 else | |
180 { | |
181 j += p; | |
182 k = 1; | |
183 } | |
184 } | |
185 else /* a < b */ | |
186 { | |
187 /* Suffix is larger, start over from current location. */ | |
188 max_suffix_rev = j++; | |
189 k = p = 1; | |
190 } | |
191 } | |
192 | |
193 /* Choose the longer suffix. Return the first byte of the right | |
194 half, rather than the last byte of the left half. */ | |
195 if (max_suffix_rev + 1 < max_suffix + 1) | |
196 return max_suffix + 1; | |
197 *period = p; | |
198 return max_suffix_rev + 1; | |
199 } | |
200 | |
201 /* Return the first location of non-empty NEEDLE within HAYSTACK, or | |
202 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This | |
203 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. | |
204 Performance is guaranteed to be linear, with an initialization cost | |
205 of 2 * NEEDLE_LEN comparisons. | |
206 | |
207 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at | |
208 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. | |
209 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * | |
210 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ | |
211 static RETURN_TYPE | |
212 two_way_short_needle (const unsigned char *haystack, size_t haystack_len, | |
213 const unsigned char *needle, size_t needle_len) | |
214 { | |
215 size_t i; /* Index into current byte of NEEDLE. */ | |
216 size_t j; /* Index into current window of HAYSTACK. */ | |
217 size_t period; /* The period of the right half of needle. */ | |
218 size_t suffix; /* The index of the right half of needle. */ | |
219 | |
220 /* Factor the needle into two halves, such that the left half is | |
221 smaller than the global period, and the right half is | |
222 periodic (with a period as large as NEEDLE_LEN - suffix). */ | |
223 suffix = critical_factorization (needle, needle_len, &period); | |
224 | |
225 /* Perform the search. Each iteration compares the right half | |
226 first. */ | |
227 if (CMP_FUNC (needle, needle + period, suffix) == 0) | |
228 { | |
229 /* Entire needle is periodic; a mismatch can only advance by the | |
230 period, so use memory to avoid rescanning known occurrences | |
231 of the period. */ | |
232 size_t memory = 0; | |
233 j = 0; | |
234 while (AVAILABLE (haystack, haystack_len, j, needle_len)) | |
235 { | |
236 /* Scan for matches in right half. */ | |
237 i = MAX (suffix, memory); | |
238 while (i < needle_len && (CANON_ELEMENT (needle[i]) | |
239 == CANON_ELEMENT (haystack[i + j]))) | |
240 ++i; | |
241 if (needle_len <= i) | |
242 { | |
243 /* Scan for matches in left half. */ | |
244 i = suffix - 1; | |
245 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) | |
246 == CANON_ELEMENT (haystack[i + j]))) | |
247 --i; | |
248 if (i + 1 < memory + 1) | |
249 return (RETURN_TYPE) (haystack + j); | |
250 /* No match, so remember how many repetitions of period | |
251 on the right half were scanned. */ | |
252 j += period; | |
253 memory = needle_len - period; | |
254 } | |
255 else | |
256 { | |
257 j += i - suffix + 1; | |
258 memory = 0; | |
259 } | |
260 } | |
261 } | |
262 else | |
263 { | |
264 /* The two halves of needle are distinct; no extra memory is | |
265 required, and any mismatch results in a maximal shift. */ | |
266 period = MAX (suffix, needle_len - suffix) + 1; | |
267 j = 0; | |
268 while (AVAILABLE (haystack, haystack_len, j, needle_len)) | |
269 { | |
270 /* Scan for matches in right half. */ | |
271 i = suffix; | |
272 while (i < needle_len && (CANON_ELEMENT (needle[i]) | |
273 == CANON_ELEMENT (haystack[i + j]))) | |
274 ++i; | |
275 if (needle_len <= i) | |
276 { | |
277 /* Scan for matches in left half. */ | |
278 i = suffix - 1; | |
279 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) | |
280 == CANON_ELEMENT (haystack[i + j]))) | |
281 --i; | |
282 if (i == SIZE_MAX) | |
283 return (RETURN_TYPE) (haystack + j); | |
284 j += period; | |
285 } | |
286 else | |
287 j += i - suffix + 1; | |
288 } | |
289 } | |
290 return NULL; | |
291 } | |
292 | |
293 /* Return the first location of non-empty NEEDLE within HAYSTACK, or | |
294 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This | |
295 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. | |
296 Performance is guaranteed to be linear, with an initialization cost | |
297 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. | |
298 | |
299 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at | |
300 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, | |
301 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. | |
302 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * | |
303 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and | |
304 sublinear performance is not possible. */ | |
305 static RETURN_TYPE | |
306 two_way_long_needle (const unsigned char *haystack, size_t haystack_len, | |
307 const unsigned char *needle, size_t needle_len) | |
308 { | |
309 size_t i; /* Index into current byte of NEEDLE. */ | |
310 size_t j; /* Index into current window of HAYSTACK. */ | |
311 size_t period; /* The period of the right half of needle. */ | |
312 size_t suffix; /* The index of the right half of needle. */ | |
313 size_t shift_table[1U << CHAR_BIT]; /* See below. */ | |
314 | |
315 /* Factor the needle into two halves, such that the left half is | |
316 smaller than the global period, and the right half is | |
317 periodic (with a period as large as NEEDLE_LEN - suffix). */ | |
318 suffix = critical_factorization (needle, needle_len, &period); | |
319 | |
320 /* Populate shift_table. For each possible byte value c, | |
321 shift_table[c] is the distance from the last occurrence of c to | |
322 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. | |
323 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ | |
324 for (i = 0; i < 1U << CHAR_BIT; i++) | |
325 shift_table[i] = needle_len; | |
326 for (i = 0; i < needle_len; i++) | |
327 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; | |
328 | |
329 /* Perform the search. Each iteration compares the right half | |
330 first. */ | |
331 if (CMP_FUNC (needle, needle + period, suffix) == 0) | |
332 { | |
333 /* Entire needle is periodic; a mismatch can only advance by the | |
334 period, so use memory to avoid rescanning known occurrences | |
335 of the period. */ | |
336 size_t memory = 0; | |
337 size_t shift; | |
338 j = 0; | |
339 while (AVAILABLE (haystack, haystack_len, j, needle_len)) | |
340 { | |
341 /* Check the last byte first; if it does not match, then | |
342 shift to the next possible match location. */ | |
343 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | |
344 if (0 < shift) | |
345 { | |
346 if (memory && shift < period) | |
347 { | |
348 /* Since needle is periodic, but the last period has | |
349 a byte out of place, there can be no match until | |
350 after the mismatch. */ | |
351 shift = needle_len - period; | |
352 memory = 0; | |
353 } | |
354 j += shift; | |
355 continue; | |
356 } | |
357 /* Scan for matches in right half. The last byte has | |
358 already been matched, by virtue of the shift table. */ | |
359 i = MAX (suffix, memory); | |
360 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | |
361 == CANON_ELEMENT (haystack[i + j]))) | |
362 ++i; | |
363 if (needle_len - 1 <= i) | |
364 { | |
365 /* Scan for matches in left half. */ | |
366 i = suffix - 1; | |
367 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) | |
368 == CANON_ELEMENT (haystack[i + j]))) | |
369 --i; | |
370 if (i + 1 < memory + 1) | |
371 return (RETURN_TYPE) (haystack + j); | |
372 /* No match, so remember how many repetitions of period | |
373 on the right half were scanned. */ | |
374 j += period; | |
375 memory = needle_len - period; | |
376 } | |
377 else | |
378 { | |
379 j += i - suffix + 1; | |
380 memory = 0; | |
381 } | |
382 } | |
383 } | |
384 else | |
385 { | |
386 /* The two halves of needle are distinct; no extra memory is | |
387 required, and any mismatch results in a maximal shift. */ | |
388 size_t shift; | |
389 period = MAX (suffix, needle_len - suffix) + 1; | |
390 j = 0; | |
391 while (AVAILABLE (haystack, haystack_len, j, needle_len)) | |
392 { | |
393 /* Check the last byte first; if it does not match, then | |
394 shift to the next possible match location. */ | |
395 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | |
396 if (0 < shift) | |
397 { | |
398 j += shift; | |
399 continue; | |
400 } | |
401 /* Scan for matches in right half. The last byte has | |
402 already been matched, by virtue of the shift table. */ | |
403 i = suffix; | |
404 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | |
405 == CANON_ELEMENT (haystack[i + j]))) | |
406 ++i; | |
407 if (needle_len - 1 <= i) | |
408 { | |
409 /* Scan for matches in left half. */ | |
410 i = suffix - 1; | |
411 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) | |
412 == CANON_ELEMENT (haystack[i + j]))) | |
413 --i; | |
414 if (i == SIZE_MAX) | |
415 return (RETURN_TYPE) (haystack + j); | |
416 j += period; | |
417 } | |
418 else | |
419 j += i - suffix + 1; | |
420 } | |
421 } | |
422 return NULL; | |
423 } | |
424 | |
425 #undef AVAILABLE | |
426 #undef CANON_ELEMENT | |
427 #undef CMP_FUNC | |
428 #undef MAX | |
429 #undef RETURN_TYPE | |
OLD | NEW |