OLD | NEW |
1 /* Byte-wise substring search, using the Two-Way algorithm. | 1 /* Byte-wise substring search, using the Two-Way algorithm. |
2 Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc. | 2 Copyright (C) 2008-2012 Free Software Foundation, Inc. |
3 This file is part of the GNU C Library. | 3 This file is part of the GNU C Library. |
4 Written by Eric Blake <ebb9@byu.net>, 2008. | 4 Written by Eric Blake <ebb9@byu.net>, 2008. |
5 | 5 |
6 This program is free software; you can redistribute it and/or modify | 6 This program is free software; you can redistribute it and/or modify |
7 it under the terms of the GNU General Public License as published by | 7 it under the terms of the GNU General Public License as published by |
8 the Free Software Foundation; either version 3, or (at your option) | 8 the Free Software Foundation; either version 3, or (at your option) |
9 any later version. | 9 any later version. |
10 | 10 |
11 This program is distributed in the hope that it will be useful, | 11 This program is distributed in the hope that it will be useful, |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 GNU General Public License for more details. | 14 GNU General Public License for more details. |
15 | 15 |
16 You should have received a copy of the GNU General Public License along | 16 You should have received a copy of the GNU General Public License along |
17 with this program; if not, write to the Free Software Foundation, | 17 with this program; if not, see <http://www.gnu.org/licenses/>. */ |
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ | |
19 | 18 |
20 /* Before including this file, you need to include <config.h> and | 19 /* Before including this file, you need to include <config.h> and |
21 <string.h>, and define: | 20 <string.h>, and define: |
22 RESULT_TYPE A macro that expands to the return type. | 21 RESULT_TYPE A macro that expands to the return type. |
23 AVAILABLE(h, h_l, j, n_l) | 22 AVAILABLE(h, h_l, j, n_l) |
24 A macro that returns nonzero if there are | 23 A macro that returns nonzero if there are |
25 at least N_L bytes left starting at H[J]. | 24 at least N_L bytes left starting at H[J]. |
26 H is 'unsigned char *', H_L, J, and N_L | 25 H is 'unsigned char *', H_L, J, and N_L |
27 are 'size_t'; H_L is an lvalue. For | 26 are 'size_t'; H_L is an lvalue. For |
28 NUL-terminated searches, H_L can be | 27 NUL-terminated searches, H_L can be |
29 modified each iteration to avoid having | 28 modified each iteration to avoid having |
30 to compute the end of H up front. | 29 to compute the end of H up front. |
31 | 30 |
32 For case-insensitivity, you may optionally define: | 31 For case-insensitivity, you may optionally define: |
33 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L | 32 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L |
34 characters of P1 and P2 are equal. | 33 characters of P1 and P2 are equal. |
35 CANON_ELEMENT(c) A macro that canonicalizes an element right after | 34 CANON_ELEMENT(c) A macro that canonicalizes an element right after |
36 it has been fetched from one of the two strings. | 35 it has been fetched from one of the two strings. |
37 The argument is an 'unsigned char'; the result | 36 The argument is an 'unsigned char'; the result |
38 must be an 'unsigned char' as well. | 37 must be an 'unsigned char' as well. |
39 | 38 |
40 This file undefines the macros documented above, and defines | 39 This file undefines the macros documented above, and defines |
41 LONG_NEEDLE_THRESHOLD. | 40 LONG_NEEDLE_THRESHOLD. |
42 */ | 41 */ |
43 | 42 |
44 #include <limits.h> | 43 #include <limits.h> |
45 #include <stdint.h> | 44 #include <stdint.h> |
46 | 45 |
47 /* We use the Two-Way string matching algorithm, which guarantees | 46 /* We use the Two-Way string matching algorithm (also known as |
48 linear complexity with constant space. Additionally, for long | 47 Chrochemore-Perrin), which guarantees linear complexity with |
49 needles, we also use a bad character shift table similar to the | 48 constant space. Additionally, for long needles, we also use a bad |
50 Boyer-Moore algorithm to achieve improved (potentially sub-linear) | 49 character shift table similar to the Boyer-Moore algorithm to |
51 performance. | 50 achieve improved (potentially sub-linear) performance. |
52 | 51 |
53 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 | 52 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, |
54 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | 53 http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, |
| 54 http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&typ
e=pdf |
55 */ | 55 */ |
56 | 56 |
57 /* Point at which computing a bad-byte shift table is likely to be | 57 /* Point at which computing a bad-byte shift table is likely to be |
58 worthwhile. Small needles should not compute a table, since it | 58 worthwhile. Small needles should not compute a table, since it |
59 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a | 59 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a |
60 speedup no greater than a factor of NEEDLE_LEN. The larger the | 60 speedup no greater than a factor of NEEDLE_LEN. The larger the |
61 needle, the better the potential performance gain. On the other | 61 needle, the better the potential performance gain. On the other |
62 hand, on non-POSIX systems with CHAR_BIT larger than eight, the | 62 hand, on non-POSIX systems with CHAR_BIT larger than eight, the |
63 memory required for the table is prohibitive. */ | 63 memory required for the table is prohibitive. */ |
64 #if CHAR_BIT < 10 | 64 #if CHAR_BIT < 10 |
(...skipping 23 matching lines...) Expand all Loading... |
88 | 88 |
89 When NEEDLE is factored into two halves, a local period is the | 89 When NEEDLE is factored into two halves, a local period is the |
90 length of the smallest word that shares a suffix with the left half | 90 length of the smallest word that shares a suffix with the left half |
91 and shares a prefix with the right half. All factorizations of a | 91 and shares a prefix with the right half. All factorizations of a |
92 non-empty NEEDLE have a local period of at least 1 and no greater | 92 non-empty NEEDLE have a local period of at least 1 and no greater |
93 than NEEDLE_LEN. | 93 than NEEDLE_LEN. |
94 | 94 |
95 A critical factorization has the property that the local period | 95 A critical factorization has the property that the local period |
96 equals the global period. All strings have at least one critical | 96 equals the global period. All strings have at least one critical |
97 factorization with the left half smaller than the global period. | 97 factorization with the left half smaller than the global period. |
| 98 And while some strings have more than one critical factorization, |
| 99 it is provable that with an ordered alphabet, at least one of the |
| 100 critical factorizations corresponds to a maximal suffix. |
98 | 101 |
99 Given an ordered alphabet, a critical factorization can be computed | 102 Given an ordered alphabet, a critical factorization can be computed |
100 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the | 103 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
101 larger of two ordered maximal suffixes. The ordered maximal | 104 shorter of two ordered maximal suffixes. The ordered maximal |
102 suffixes are determined by lexicographic comparison of | 105 suffixes are determined by lexicographic comparison while tracking |
103 periodicity. */ | 106 periodicity. */ |
104 static size_t | 107 static size_t |
105 critical_factorization (const unsigned char *needle, size_t needle_len, | 108 critical_factorization (const unsigned char *needle, size_t needle_len, |
106 size_t *period) | 109 size_t *period) |
107 { | 110 { |
108 /* Index of last byte of left half, or SIZE_MAX. */ | 111 /* Index of last byte of left half, or SIZE_MAX. */ |
109 size_t max_suffix, max_suffix_rev; | 112 size_t max_suffix, max_suffix_rev; |
110 size_t j; /* Index into NEEDLE for current candidate suffix. */ | 113 size_t j; /* Index into NEEDLE for current candidate suffix. */ |
111 size_t k; /* Offset into current period. */ | 114 size_t k; /* Offset into current period. */ |
112 size_t p; /* Intermediate period. */ | 115 size_t p; /* Intermediate period. */ |
113 unsigned char a, b; /* Current comparison bytes. */ | 116 unsigned char a, b; /* Current comparison bytes. */ |
114 | 117 |
| 118 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered |
| 119 out 0-length needles. */ |
| 120 if (needle_len < 3) |
| 121 { |
| 122 *period = 1; |
| 123 return needle_len - 1; |
| 124 } |
| 125 |
115 /* Invariants: | 126 /* Invariants: |
116 0 <= j < NEEDLE_LEN - 1 | 127 0 <= j < NEEDLE_LEN - 1 |
117 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) | 128 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
118 min(max_suffix, max_suffix_rev) < global period of NEEDLE | 129 min(max_suffix, max_suffix_rev) < global period of NEEDLE |
119 1 <= p <= global period of NEEDLE | 130 1 <= p <= global period of NEEDLE |
120 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] | 131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
121 1 <= k <= p | 132 1 <= k <= p |
122 */ | 133 */ |
123 | 134 |
124 /* Perform lexicographic search. */ | 135 /* Perform lexicographic search. */ |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
183 } | 194 } |
184 } | 195 } |
185 else /* a < b */ | 196 else /* a < b */ |
186 { | 197 { |
187 /* Suffix is larger, start over from current location. */ | 198 /* Suffix is larger, start over from current location. */ |
188 max_suffix_rev = j++; | 199 max_suffix_rev = j++; |
189 k = p = 1; | 200 k = p = 1; |
190 } | 201 } |
191 } | 202 } |
192 | 203 |
193 /* Choose the longer suffix. Return the first byte of the right | 204 /* Choose the shorter suffix. Return the index of the first byte of |
194 half, rather than the last byte of the left half. */ | 205 the right half, rather than the last byte of the left half. |
| 206 |
| 207 For some examples, 'banana' has two critical factorizations, both |
| 208 exposed by the two lexicographic extreme suffixes of 'anana' and |
| 209 'nana', where both suffixes have a period of 2. On the other |
| 210 hand, with 'aab' and 'bba', both strings have a single critical |
| 211 factorization of the last byte, with the suffix having a period |
| 212 of 1. While the maximal lexicographic suffix of 'aab' is 'b', |
| 213 the maximal lexicographic suffix of 'bba' is 'ba', which is not a |
| 214 critical factorization. Conversely, the maximal reverse |
| 215 lexicographic suffix of 'a' works for 'bba', but not 'ab' for |
| 216 'aab'. The shorter suffix of the two will always be a critical |
| 217 factorization. */ |
195 if (max_suffix_rev + 1 < max_suffix + 1) | 218 if (max_suffix_rev + 1 < max_suffix + 1) |
196 return max_suffix + 1; | 219 return max_suffix + 1; |
197 *period = p; | 220 *period = p; |
198 return max_suffix_rev + 1; | 221 return max_suffix_rev + 1; |
199 } | 222 } |
200 | 223 |
201 /* Return the first location of non-empty NEEDLE within HAYSTACK, or | 224 /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
202 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This | 225 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
203 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. | 226 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. |
204 Performance is guaranteed to be linear, with an initialization cost | 227 Performance is guaranteed to be linear, with an initialization cost |
(...skipping 14 matching lines...) Expand all Loading... |
219 | 242 |
220 /* Factor the needle into two halves, such that the left half is | 243 /* Factor the needle into two halves, such that the left half is |
221 smaller than the global period, and the right half is | 244 smaller than the global period, and the right half is |
222 periodic (with a period as large as NEEDLE_LEN - suffix). */ | 245 periodic (with a period as large as NEEDLE_LEN - suffix). */ |
223 suffix = critical_factorization (needle, needle_len, &period); | 246 suffix = critical_factorization (needle, needle_len, &period); |
224 | 247 |
225 /* Perform the search. Each iteration compares the right half | 248 /* Perform the search. Each iteration compares the right half |
226 first. */ | 249 first. */ |
227 if (CMP_FUNC (needle, needle + period, suffix) == 0) | 250 if (CMP_FUNC (needle, needle + period, suffix) == 0) |
228 { | 251 { |
229 /* Entire needle is periodic; a mismatch can only advance by the | 252 /* Entire needle is periodic; a mismatch in the left half can |
230 period, so use memory to avoid rescanning known occurrences | 253 only advance by the period, so use memory to avoid rescanning |
231 of the period. */ | 254 known occurrences of the period in the right half. */ |
232 size_t memory = 0; | 255 size_t memory = 0; |
233 j = 0; | 256 j = 0; |
234 while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 257 while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
235 { | 258 { |
236 /* Scan for matches in right half. */ | 259 /* Scan for matches in right half. */ |
237 i = MAX (suffix, memory); | 260 i = MAX (suffix, memory); |
238 while (i < needle_len && (CANON_ELEMENT (needle[i]) | 261 while (i < needle_len && (CANON_ELEMENT (needle[i]) |
239 == CANON_ELEMENT (haystack[i + j]))) | 262 == CANON_ELEMENT (haystack[i + j]))) |
240 ++i; | 263 ++i; |
241 if (needle_len <= i) | 264 if (needle_len <= i) |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
323 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ | 346 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ |
324 for (i = 0; i < 1U << CHAR_BIT; i++) | 347 for (i = 0; i < 1U << CHAR_BIT; i++) |
325 shift_table[i] = needle_len; | 348 shift_table[i] = needle_len; |
326 for (i = 0; i < needle_len; i++) | 349 for (i = 0; i < needle_len; i++) |
327 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; | 350 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; |
328 | 351 |
329 /* Perform the search. Each iteration compares the right half | 352 /* Perform the search. Each iteration compares the right half |
330 first. */ | 353 first. */ |
331 if (CMP_FUNC (needle, needle + period, suffix) == 0) | 354 if (CMP_FUNC (needle, needle + period, suffix) == 0) |
332 { | 355 { |
333 /* Entire needle is periodic; a mismatch can only advance by the | 356 /* Entire needle is periodic; a mismatch in the left half can |
334 period, so use memory to avoid rescanning known occurrences | 357 only advance by the period, so use memory to avoid rescanning |
335 of the period. */ | 358 known occurrences of the period in the right half. */ |
336 size_t memory = 0; | 359 size_t memory = 0; |
337 size_t shift; | 360 size_t shift; |
338 j = 0; | 361 j = 0; |
339 while (AVAILABLE (haystack, haystack_len, j, needle_len)) | 362 while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
340 { | 363 { |
341 /* Check the last byte first; if it does not match, then | 364 /* Check the last byte first; if it does not match, then |
342 shift to the next possible match location. */ | 365 shift to the next possible match location. */ |
343 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; | 366 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
344 if (0 < shift) | 367 if (0 < shift) |
345 { | 368 { |
346 if (memory && shift < period) | 369 if (memory && shift < period) |
347 { | 370 { |
348 /* Since needle is periodic, but the last period has | 371 /* Since needle is periodic, but the last period has |
349 a byte out of place, there can be no match until | 372 a byte out of place, there can be no match until |
350 after the mismatch. */ | 373 after the mismatch. */ |
351 shift = needle_len - period; | 374 shift = needle_len - period; |
352 memory = 0; | |
353 } | 375 } |
| 376 memory = 0; |
354 j += shift; | 377 j += shift; |
355 continue; | 378 continue; |
356 } | 379 } |
357 /* Scan for matches in right half. The last byte has | 380 /* Scan for matches in right half. The last byte has |
358 already been matched, by virtue of the shift table. */ | 381 already been matched, by virtue of the shift table. */ |
359 i = MAX (suffix, memory); | 382 i = MAX (suffix, memory); |
360 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) | 383 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) |
361 == CANON_ELEMENT (haystack[i + j]))) | 384 == CANON_ELEMENT (haystack[i + j]))) |
362 ++i; | 385 ++i; |
363 if (needle_len - 1 <= i) | 386 if (needle_len - 1 <= i) |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
420 } | 443 } |
421 } | 444 } |
422 return NULL; | 445 return NULL; |
423 } | 446 } |
424 | 447 |
425 #undef AVAILABLE | 448 #undef AVAILABLE |
426 #undef CANON_ELEMENT | 449 #undef CANON_ELEMENT |
427 #undef CMP_FUNC | 450 #undef CMP_FUNC |
428 #undef MAX | 451 #undef MAX |
429 #undef RETURN_TYPE | 452 #undef RETURN_TYPE |
OLD | NEW |