Index: courgette/third_party/qsufsort.h |
diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h |
index 399768c474ecf1e76e53a5d46861460fb24d192c..f1bad03cf701cfeb7c4be47a2c848d7e643efd71 100644 |
--- a/courgette/third_party/qsufsort.h |
+++ b/courgette/third_party/qsufsort.h |
@@ -13,10 +13,12 @@ |
2010-05-26 - Use a paged array for V and I. The address space may be too |
fragmented for these big arrays to be contiguous. |
--Stephen Adams <sra@chromium.org> |
- 2015-08-03 - Extrat qsufsort to a separate file as template. |
+ 2015-08-03 - Extract qsufsort to a separate file as template. |
--Samuel Huang <huangs@chromium.org> |
2015-08-19 - Optimize split() and search(), add comments. |
--Samuel Huang <huangs@chromium.org> |
+ 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution. |
+ --Samuel Huang <huangs@chromium.org> |
*/ |
#include <algorithm> |
@@ -33,88 +35,57 @@ namespace qsuf { |
// (2) indentation, |
// (3) using 'const', |
// (4) changing the V and I parameters from int* to template <typename T>. |
-// (5) optimizing split() and search(); fix styles. |
+// (5) optimizing search(); fix styles. |
+// (6) rewriting split() to use std::sort(), followed by I, V resolution. |
// |
// The code appears to be a rewritten version of the suffix array algorithm |
// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
// Sadakane, special cased for bytes. |
+// Before: V[I[start:end]] = end - 1. |
+// After: V[I[start:end]] take values in [start, end), forming "groups" of |
+// suffixes that are unresolved up to length |h|. But if a group has size 1, |
+// i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on |
+// the next iteration we'd skip it. |
template <typename T> static void split(T I, T V, int start, int end, int h) { |
- // For small interval, apply selection sort. |
- if (end - start < 6) { |
- for (int i = start; i < end; ) { |
- int skip = 1; |
- int best = V[I[i] + h]; |
- for (int j = i + 1; j < end; j++) { |
- int cur = V[I[j] + h]; |
- if (best > cur) { |
- best = cur; |
- int tmp = I[i]; |
- I[i] = I[j]; |
- I[j] = tmp; |
- skip = 1; |
- } else if (best == cur) { |
- int tmp = I[i + skip]; |
- I[i + skip] = I[j]; |
- I[j] = tmp; |
- ++skip; |
- } |
- } |
- if (skip == 1) { |
- V[I[i]] = i; |
- I[i] = -1; |
- } else { |
- for (int j = i, jend = i + skip; j < jend; j++) |
- V[I[j]] = jend - 1; |
- } |
- i += skip; |
- } |
+ if (start >= end) |
return; |
- } |
- // Split [start, end) into 3 intervals: |
- // [start, j) with secondary keys < pivot, |
- // [j, k) with secondary keys == pivot, |
- // [k, end) with secondary keys > pivot. |
- int pivot = V[I[(start + end) / 2] + h]; |
- int j = start; |
- int k = end; |
- for (int i = start; i < k; ) { |
- int cur = V[I[i] + h]; |
- if (cur < pivot) { |
- if (i != j) { |
- int tmp = I[i]; |
- I[i] = I[j]; |
- I[j] = tmp; |
- } |
- ++i; |
- ++j; |
- } else if (cur > pivot) { |
- --k; |
- int tmp = I[i]; |
- I[i] = I[k]; |
- I[k] = tmp; |
- } else { |
- ++i; |
+ // Sort I[start:end] using V[I[start:end] + h] as key. |
+ std::sort(I.begin() + start, I.begin() + end, |
+ [&](int a, int b) { return V[a + h] < V[b + h]; }); |
+ |
+ // V[I[start:end] + h] is sorted. [start,end) can be split into distinct |
+ // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b), |
+ // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign |
+ // I[a] := -1 to indicate completion. |
+ int group_last_member = end - 1; |
+ int group_key = V[I[group_last_member] + h]; |
+ |
+ // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass. |
+ for (int i = end - 2; i >= start; --i) { |
+ int cur_key = V[I[i] + h]; |
+ |
+ // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since |
+ // we're reassigning all V[] values of |end - 1| to value in [begin, end) |
+ // (that never appear elsewhere), so if we find anything in [begin, end), |
+ // we know it used to be |end - 1|, and can reinterpret accordingly. |
+ if (cur_key >= start && cur_key < end) |
+ cur_key = end - 1; |
+ |
+ // Did we find new group? |
+ if (group_key != cur_key) { |
+ // If previous group found has 1 member, then mark completion. |
+ if (group_last_member == i + 1) |
+ I[i + 1] = -1; |
+ group_last_member = i; |
+ group_key = cur_key; |
} |
+ V[I[i]] = group_last_member; |
} |
- // Recurse on the "< pivot" piece. |
- if (start < j) |
- split<T>(I, V, start, j, h); |
- |
- // Update the "== pivot" piece. |
- if (j == k - 1) { |
- V[I[j]] = j; |
- I[j] = -1; |
- } else { |
- for (int i = j; i < k; ++i) |
- V[I[i]] = k - 1; |
- } |
- |
- // Recurse on the "> pivot" piece. |
- if (k < end) |
- split<T>(I, V, k, end, h); |
+ if (group_last_member == start) |
+ I[start] = -1; |
} |
template <class T> |