Index: gdb/gnulib/import/str-two-way.h |
diff --git a/gdb/gnulib/str-two-way.h b/gdb/gnulib/import/str-two-way.h |
similarity index 86% |
rename from gdb/gnulib/str-two-way.h |
rename to gdb/gnulib/import/str-two-way.h |
index 7868eb81530e03445e02e2cddef1b6d301d1d134..af8f77b512c5dc987279fee8d71bd285cf483e44 100644 |
--- a/gdb/gnulib/str-two-way.h |
+++ b/gdb/gnulib/import/str-two-way.h |
@@ -1,5 +1,5 @@ |
/* Byte-wise substring search, using the Two-Way algorithm. |
- Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc. |
+ Copyright (C) 2008-2012 Free Software Foundation, Inc. |
This file is part of the GNU C Library. |
Written by Eric Blake <ebb9@byu.net>, 2008. |
@@ -14,8 +14,7 @@ |
GNU General Public License for more details. |
You should have received a copy of the GNU General Public License along |
- with this program; if not, write to the Free Software Foundation, |
- Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ |
+ with this program; if not, see <http://www.gnu.org/licenses/>. */ |
/* Before including this file, you need to include <config.h> and |
<string.h>, and define: |
@@ -44,14 +43,15 @@ |
#include <limits.h> |
#include <stdint.h> |
-/* We use the Two-Way string matching algorithm, which guarantees |
- linear complexity with constant space. Additionally, for long |
- needles, we also use a bad character shift table similar to the |
- Boyer-Moore algorithm to achieve improved (potentially sub-linear) |
- performance. |
+/* We use the Two-Way string matching algorithm (also known as |
+ Chrochemore-Perrin), which guarantees linear complexity with |
+ constant space. Additionally, for long needles, we also use a bad |
+ character shift table similar to the Boyer-Moore algorithm to |
+ achieve improved (potentially sub-linear) performance. |
- See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 |
- and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm |
+ See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, |
+ http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, |
+ http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf |
*/ |
/* Point at which computing a bad-byte shift table is likely to be |
@@ -95,11 +95,14 @@ |
A critical factorization has the property that the local period |
equals the global period. All strings have at least one critical |
factorization with the left half smaller than the global period. |
+ And while some strings have more than one critical factorization, |
+ it is provable that with an ordered alphabet, at least one of the |
+ critical factorizations corresponds to a maximal suffix. |
Given an ordered alphabet, a critical factorization can be computed |
in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
- larger of two ordered maximal suffixes. The ordered maximal |
- suffixes are determined by lexicographic comparison of |
+ shorter of two ordered maximal suffixes. The ordered maximal |
+ suffixes are determined by lexicographic comparison while tracking |
periodicity. */ |
static size_t |
critical_factorization (const unsigned char *needle, size_t needle_len, |
@@ -112,6 +115,14 @@ critical_factorization (const unsigned char *needle, size_t needle_len, |
size_t p; /* Intermediate period. */ |
unsigned char a, b; /* Current comparison bytes. */ |
+ /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered |
+ out 0-length needles. */ |
+ if (needle_len < 3) |
+ { |
+ *period = 1; |
+ return needle_len - 1; |
+ } |
+ |
/* Invariants: |
0 <= j < NEEDLE_LEN - 1 |
-1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
@@ -190,8 +201,20 @@ critical_factorization (const unsigned char *needle, size_t needle_len, |
} |
} |
- /* Choose the longer suffix. Return the first byte of the right |
- half, rather than the last byte of the left half. */ |
+ /* Choose the shorter suffix. Return the index of the first byte of |
+ the right half, rather than the last byte of the left half. |
+ |
+ For some examples, 'banana' has two critical factorizations, both |
+ exposed by the two lexicographic extreme suffixes of 'anana' and |
+ 'nana', where both suffixes have a period of 2. On the other |
+ hand, with 'aab' and 'bba', both strings have a single critical |
+ factorization of the last byte, with the suffix having a period |
+ of 1. While the maximal lexicographic suffix of 'aab' is 'b', |
+ the maximal lexicographic suffix of 'bba' is 'ba', which is not a |
+ critical factorization. Conversely, the maximal reverse |
+ lexicographic suffix of 'a' works for 'bba', but not 'ab' for |
+ 'aab'. The shorter suffix of the two will always be a critical |
+ factorization. */ |
if (max_suffix_rev + 1 < max_suffix + 1) |
return max_suffix + 1; |
*period = p; |
@@ -226,9 +249,9 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
first. */ |
if (CMP_FUNC (needle, needle + period, suffix) == 0) |
{ |
- /* Entire needle is periodic; a mismatch can only advance by the |
- period, so use memory to avoid rescanning known occurrences |
- of the period. */ |
+ /* Entire needle is periodic; a mismatch in the left half can |
+ only advance by the period, so use memory to avoid rescanning |
+ known occurrences of the period in the right half. */ |
size_t memory = 0; |
j = 0; |
while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
@@ -330,9 +353,9 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
first. */ |
if (CMP_FUNC (needle, needle + period, suffix) == 0) |
{ |
- /* Entire needle is periodic; a mismatch can only advance by the |
- period, so use memory to avoid rescanning known occurrences |
- of the period. */ |
+ /* Entire needle is periodic; a mismatch in the left half can |
+ only advance by the period, so use memory to avoid rescanning |
+ known occurrences of the period in the right half. */ |
size_t memory = 0; |
size_t shift; |
j = 0; |
@@ -349,8 +372,8 @@ two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
a byte out of place, there can be no match until |
after the mismatch. */ |
shift = needle_len - period; |
- memory = 0; |
} |
+ memory = 0; |
j += shift; |
continue; |
} |