| Index: courgette/third_party/qsufsort.h
|
| diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h
|
| index 399768c474ecf1e76e53a5d46861460fb24d192c..f1bad03cf701cfeb7c4be47a2c848d7e643efd71 100644
|
| --- a/courgette/third_party/qsufsort.h
|
| +++ b/courgette/third_party/qsufsort.h
|
| @@ -13,10 +13,12 @@
|
| 2010-05-26 - Use a paged array for V and I. The address space may be too
|
| fragmented for these big arrays to be contiguous.
|
| --Stephen Adams <sra@chromium.org>
|
| - 2015-08-03 - Extrat qsufsort to a separate file as template.
|
| + 2015-08-03 - Extract qsufsort to a separate file as template.
|
| --Samuel Huang <huangs@chromium.org>
|
| 2015-08-19 - Optimize split() and search(), add comments.
|
| --Samuel Huang <huangs@chromium.org>
|
| + 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution.
|
| + --Samuel Huang <huangs@chromium.org>
|
| */
|
|
|
| #include <algorithm>
|
| @@ -33,88 +35,57 @@ namespace qsuf {
|
| // (2) indentation,
|
| // (3) using 'const',
|
| // (4) changing the V and I parameters from int* to template <typename T>.
|
| -// (5) optimizing split() and search(); fix styles.
|
| +// (5) optimizing search(); fix styles.
|
| +// (6) rewriting split() to use std::sort(), followed by I, V resolution.
|
| //
|
| // The code appears to be a rewritten version of the suffix array algorithm
|
| // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
|
| // Sadakane, special cased for bytes.
|
|
|
| +// Before: V[I[start:end]] = end - 1.
|
| +// After: V[I[start:end]] take values in [start, end), forming "groups" of
|
| +// suffixes that are unresolved up to length |h|. But if a group has size 1,
|
| +// i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on
|
| +// the next iteration we'd skip it.
|
| template <typename T> static void split(T I, T V, int start, int end, int h) {
|
| - // For small interval, apply selection sort.
|
| - if (end - start < 6) {
|
| - for (int i = start; i < end; ) {
|
| - int skip = 1;
|
| - int best = V[I[i] + h];
|
| - for (int j = i + 1; j < end; j++) {
|
| - int cur = V[I[j] + h];
|
| - if (best > cur) {
|
| - best = cur;
|
| - int tmp = I[i];
|
| - I[i] = I[j];
|
| - I[j] = tmp;
|
| - skip = 1;
|
| - } else if (best == cur) {
|
| - int tmp = I[i + skip];
|
| - I[i + skip] = I[j];
|
| - I[j] = tmp;
|
| - ++skip;
|
| - }
|
| - }
|
| - if (skip == 1) {
|
| - V[I[i]] = i;
|
| - I[i] = -1;
|
| - } else {
|
| - for (int j = i, jend = i + skip; j < jend; j++)
|
| - V[I[j]] = jend - 1;
|
| - }
|
| - i += skip;
|
| - }
|
| + if (start >= end)
|
| return;
|
| - }
|
|
|
| - // Split [start, end) into 3 intervals:
|
| - // [start, j) with secondary keys < pivot,
|
| - // [j, k) with secondary keys == pivot,
|
| - // [k, end) with secondary keys > pivot.
|
| - int pivot = V[I[(start + end) / 2] + h];
|
| - int j = start;
|
| - int k = end;
|
| - for (int i = start; i < k; ) {
|
| - int cur = V[I[i] + h];
|
| - if (cur < pivot) {
|
| - if (i != j) {
|
| - int tmp = I[i];
|
| - I[i] = I[j];
|
| - I[j] = tmp;
|
| - }
|
| - ++i;
|
| - ++j;
|
| - } else if (cur > pivot) {
|
| - --k;
|
| - int tmp = I[i];
|
| - I[i] = I[k];
|
| - I[k] = tmp;
|
| - } else {
|
| - ++i;
|
| + // Sort I[start:end] using V[I[start:end] + h] as key.
|
| + std::sort(I.begin() + start, I.begin() + end,
|
| + [&](int a, int b) { return V[a + h] < V[b + h]; });
|
| +
|
| + // V[I[start:end] + h] is sorted. [start,end) can be split into distinct
|
| + // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b),
|
| + // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign
|
| + // I[a] := -1 to indicate completion.
|
| + int group_last_member = end - 1;
|
| + int group_key = V[I[group_last_member] + h];
|
| +
|
| + // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass.
|
| + for (int i = end - 2; i >= start; --i) {
|
| + int cur_key = V[I[i] + h];
|
| +
|
| + // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since
|
| + // we're reassigning all V[] values of |end - 1| to value in [begin, end)
|
| + // (that never appear elsewhere), so if we find anything in [begin, end),
|
| + // we know it used to be |end - 1|, and can reinterpret accordingly.
|
| + if (cur_key >= start && cur_key < end)
|
| + cur_key = end - 1;
|
| +
|
| + // Did we find new group?
|
| + if (group_key != cur_key) {
|
| + // If previous group found has 1 member, then mark completion.
|
| + if (group_last_member == i + 1)
|
| + I[i + 1] = -1;
|
| + group_last_member = i;
|
| + group_key = cur_key;
|
| }
|
| + V[I[i]] = group_last_member;
|
| }
|
|
|
| - // Recurse on the "< pivot" piece.
|
| - if (start < j)
|
| - split<T>(I, V, start, j, h);
|
| -
|
| - // Update the "== pivot" piece.
|
| - if (j == k - 1) {
|
| - V[I[j]] = j;
|
| - I[j] = -1;
|
| - } else {
|
| - for (int i = j; i < k; ++i)
|
| - V[I[i]] = k - 1;
|
| - }
|
| -
|
| - // Recurse on the "> pivot" piece.
|
| - if (k < end)
|
| - split<T>(I, V, k, end, h);
|
| + if (group_last_member == start)
|
| + I[start] = -1;
|
| }
|
|
|
| template <class T>
|
|
|