Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2523)

Unified Diff: courgette/third_party/qsufsort.h

Issue 1914303002: [Courgette] Fix sorting in QSufSort's split(). (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « courgette/third_party/paged_array_unittest.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: courgette/third_party/qsufsort.h
diff --git a/courgette/third_party/qsufsort.h b/courgette/third_party/qsufsort.h
index 399768c474ecf1e76e53a5d46861460fb24d192c..f1bad03cf701cfeb7c4be47a2c848d7e643efd71 100644
--- a/courgette/third_party/qsufsort.h
+++ b/courgette/third_party/qsufsort.h
@@ -13,10 +13,12 @@
2010-05-26 - Use a paged array for V and I. The address space may be too
fragmented for these big arrays to be contiguous.
--Stephen Adams <sra@chromium.org>
- 2015-08-03 - Extrat qsufsort to a separate file as template.
+ 2015-08-03 - Extract qsufsort to a separate file as template.
--Samuel Huang <huangs@chromium.org>
2015-08-19 - Optimize split() and search(), add comments.
--Samuel Huang <huangs@chromium.org>
+ 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution.
+ --Samuel Huang <huangs@chromium.org>
*/
#include <algorithm>
@@ -33,88 +35,57 @@ namespace qsuf {
// (2) indentation,
// (3) using 'const',
// (4) changing the V and I parameters from int* to template <typename T>.
-// (5) optimizing split() and search(); fix styles.
+// (5) optimizing search(); fix styles.
+// (6) rewriting split() to use std::sort(), followed by I, V resolution.
//
// The code appears to be a rewritten version of the suffix array algorithm
// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
// Sadakane, special cased for bytes.
+// Before: V[I[start:end]] = end - 1.
+// After: V[I[start:end]] take values in [start, end), forming "groups" of
+// suffixes that are unresolved up to length |h|. But if a group has size 1,
+// i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on
+// the next iteration we'd skip it.
template <typename T> static void split(T I, T V, int start, int end, int h) {
- // For small interval, apply selection sort.
- if (end - start < 6) {
- for (int i = start; i < end; ) {
- int skip = 1;
- int best = V[I[i] + h];
- for (int j = i + 1; j < end; j++) {
- int cur = V[I[j] + h];
- if (best > cur) {
- best = cur;
- int tmp = I[i];
- I[i] = I[j];
- I[j] = tmp;
- skip = 1;
- } else if (best == cur) {
- int tmp = I[i + skip];
- I[i + skip] = I[j];
- I[j] = tmp;
- ++skip;
- }
- }
- if (skip == 1) {
- V[I[i]] = i;
- I[i] = -1;
- } else {
- for (int j = i, jend = i + skip; j < jend; j++)
- V[I[j]] = jend - 1;
- }
- i += skip;
- }
+ if (start >= end)
return;
- }
- // Split [start, end) into 3 intervals:
- // [start, j) with secondary keys < pivot,
- // [j, k) with secondary keys == pivot,
- // [k, end) with secondary keys > pivot.
- int pivot = V[I[(start + end) / 2] + h];
- int j = start;
- int k = end;
- for (int i = start; i < k; ) {
- int cur = V[I[i] + h];
- if (cur < pivot) {
- if (i != j) {
- int tmp = I[i];
- I[i] = I[j];
- I[j] = tmp;
- }
- ++i;
- ++j;
- } else if (cur > pivot) {
- --k;
- int tmp = I[i];
- I[i] = I[k];
- I[k] = tmp;
- } else {
- ++i;
+ // Sort I[start:end] using V[I[start:end] + h] as key.
+ std::sort(I.begin() + start, I.begin() + end,
+ [&](int a, int b) { return V[a + h] < V[b + h]; });
+
+ // V[I[start:end] + h] is sorted. [start,end) can be split into distinct
+ // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b),
+ // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign
+ // I[a] := -1 to indicate completion.
+ int group_last_member = end - 1;
+ int group_key = V[I[group_last_member] + h];
+
+ // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass.
+ for (int i = end - 2; i >= start; --i) {
+ int cur_key = V[I[i] + h];
+
+ // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since
+ // we're reassigning all V[] values of |end - 1| to value in [begin, end)
+ // (that never appear elsewhere), so if we find anything in [begin, end),
+ // we know it used to be |end - 1|, and can reinterpret accordingly.
+ if (cur_key >= start && cur_key < end)
+ cur_key = end - 1;
+
+ // Did we find new group?
+ if (group_key != cur_key) {
+ // If previous group found has 1 member, then mark completion.
+ if (group_last_member == i + 1)
+ I[i + 1] = -1;
+ group_last_member = i;
+ group_key = cur_key;
}
+ V[I[i]] = group_last_member;
}
- // Recurse on the "< pivot" piece.
- if (start < j)
- split<T>(I, V, start, j, h);
-
- // Update the "== pivot" piece.
- if (j == k - 1) {
- V[I[j]] = j;
- I[j] = -1;
- } else {
- for (int i = j; i < k; ++i)
- V[I[i]] = k - 1;
- }
-
- // Recurse on the "> pivot" piece.
- if (k < end)
- split<T>(I, V, k, end, h);
+ if (group_last_member == start)
+ I[start] = -1;
}
template <class T>
« no previous file with comments | « courgette/third_party/paged_array_unittest.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698