courgette/third_party/qsufsort.h - Issue 1914303002: [Courgette] Fix sorting in QSufSort's split().

Unified Diff: courgette/third_party/qsufsort.h

Issue 1914303002: [Courgette] Fix sorting in QSufSort's split(). (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: courgette/third_party/qsufsort.h

diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h

index 399768c474ecf1e76e53a5d46861460fb24d192c..f1bad03cf701cfeb7c4be47a2c848d7e643efd71 100644

--- a/courgette/third_party/qsufsort.h

+++ b/courgette/third_party/qsufsort.h

@@ -13,10 +13,12 @@

2010-05-26 - Use a paged array for V and I. The address space may be too

fragmented for these big arrays to be contiguous.

--Stephen Adams <sra@chromium.org>

- 2015-08-03 - Extrat qsufsort to a separate file as template.

+ 2015-08-03 - Extract qsufsort to a separate file as template.

--Samuel Huang <huangs@chromium.org>

2015-08-19 - Optimize split() and search(), add comments.

--Samuel Huang <huangs@chromium.org>

+ 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution.

+ --Samuel Huang <huangs@chromium.org>

#include <algorithm>

@@ -33,88 +35,57 @@ namespace qsuf {

// (2) indentation,

// (3) using 'const',

// (4) changing the V and I parameters from int* to template <typename T>.

-// (5) optimizing split() and search(); fix styles.

+// (5) optimizing search(); fix styles.

+// (6) rewriting split() to use std::sort(), followed by I, V resolution.

// The code appears to be a rewritten version of the suffix array algorithm

// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko

// Sadakane, special cased for bytes.

+// Before: V[I[start:end]] = end - 1.

+// After: V[I[start:end]] take values in [start, end), forming "groups" of

+// suffixes that are unresolved up to length |h|. But if a group has size 1,

+// i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on

+// the next iteration we'd skip it.

template <typename T> static void split(T I, T V, int start, int end, int h) {

- // For small interval, apply selection sort.

- if (end - start < 6) {

- for (int i = start; i < end; ) {

- int skip = 1;

- int best = V[I[i] + h];

- for (int j = i + 1; j < end; j++) {

- int cur = V[I[j] + h];

- if (best > cur) {

- best = cur;

- int tmp = I[i];

- I[i] = I[j];

- I[j] = tmp;

- skip = 1;

- } else if (best == cur) {

- int tmp = I[i + skip];

- I[i + skip] = I[j];

- I[j] = tmp;

- ++skip;

- }

- if (skip == 1) {

- V[I[i]] = i;

- I[i] = -1;

- } else {

- for (int j = i, jend = i + skip; j < jend; j++)

- V[I[j]] = jend - 1;

- }

- i += skip;

- }

+ if (start >= end)

return;

- }

- // Split [start, end) into 3 intervals:

- // [start, j) with secondary keys < pivot,

- // [j, k) with secondary keys == pivot,

- // [k, end) with secondary keys > pivot.

- int pivot = V[I[(start + end) / 2] + h];

- int j = start;

- int k = end;

- for (int i = start; i < k; ) {

- int cur = V[I[i] + h];

- if (cur < pivot) {

- if (i != j) {

- int tmp = I[i];

- I[i] = I[j];

- I[j] = tmp;

- }

- ++i;

- ++j;

- } else if (cur > pivot) {

- --k;

- int tmp = I[i];

- I[i] = I[k];

- I[k] = tmp;

- } else {

- ++i;

+ // Sort I[start:end] using V[I[start:end] + h] as key.

+ std::sort(I.begin() + start, I.begin() + end,

+ [&](int a, int b) { return V[a + h] < V[b + h]; });

+ // V[I[start:end] + h] is sorted. [start,end) can be split into distinct

+ // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b),

+ // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign

+ // I[a] := -1 to indicate completion.

+ int group_last_member = end - 1;

+ int group_key = V[I[group_last_member] + h];

+ // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass.

+ for (int i = end - 2; i >= start; --i) {

+ int cur_key = V[I[i] + h];

+ // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since

+ // we're reassigning all V[] values of |end - 1| to value in [begin, end)

+ // (that never appear elsewhere), so if we find anything in [begin, end),

+ // we know it used to be |end - 1|, and can reinterpret accordingly.

+ if (cur_key >= start && cur_key < end)

+ cur_key = end - 1;

+ // Did we find new group?

+ if (group_key != cur_key) {

+ // If previous group found has 1 member, then mark completion.

+ if (group_last_member == i + 1)

+ I[i + 1] = -1;

+ group_last_member = i;

+ group_key = cur_key;

}

+ V[I[i]] = group_last_member;

}

- // Recurse on the "< pivot" piece.

- if (start < j)

- split<T>(I, V, start, j, h);

- // Update the "== pivot" piece.

- if (j == k - 1) {

- V[I[j]] = j;

- I[j] = -1;

- } else {

- for (int i = j; i < k; ++i)

- V[I[i]] = k - 1;

- }

- // Recurse on the "> pivot" piece.

- if (k < end)

- split<T>(I, V, k, end, h);

+ if (group_last_member == start)

+ I[start] = -1;

}

template <class T>

« no previous file with comments | « courgette/third_party/paged_array_unittest.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »