Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(435)

Side by Side Diff: courgette/third_party/qsufsort.h

Issue 1914303002: [Courgette] Fix sorting in QSufSort's split(). (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 qsufsort.h -- Suffix array generation. 2 qsufsort.h -- Suffix array generation.
3 3
4 Copyright 2003 Colin Percival 4 Copyright 2003 Colin Percival
5 5
6 For the terms under which this work may be distributed, please see 6 For the terms under which this work may be distributed, please see
7 the adjoining file "LICENSE". 7 the adjoining file "LICENSE".
8 8
9 ChangeLog: 9 ChangeLog:
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
11 values throughout. 11 values throughout.
12 --Benjamin Smedberg <benjamin@smedbergs.us> 12 --Benjamin Smedberg <benjamin@smedbergs.us>
13 2010-05-26 - Use a paged array for V and I. The address space may be too 13 2010-05-26 - Use a paged array for V and I. The address space may be too
14 fragmented for these big arrays to be contiguous. 14 fragmented for these big arrays to be contiguous.
15 --Stephen Adams <sra@chromium.org> 15 --Stephen Adams <sra@chromium.org>
16 2015-08-03 - Extrat qsufsort to a separate file as template. 16 2015-08-03 - Extract qsufsort to a separate file as template.
17 --Samuel Huang <huangs@chromium.org> 17 --Samuel Huang <huangs@chromium.org>
18 2015-08-19 - Optimize split() and search(), add comments. 18 2015-08-19 - Optimize split() and search(), add comments.
19 --Samuel Huang <huangs@chromium.org> 19 --Samuel Huang <huangs@chromium.org>
20 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution.
21 --Samuel Huang <huangs@chromium.org>
20 */ 22 */
21 23
22 #include <algorithm> 24 #include <algorithm>
23 #include <cstring> 25 #include <cstring>
24 26
25 namespace courgette { 27 namespace courgette {
26 namespace qsuf { 28 namespace qsuf {
27 29
28 // ------------------------------------------------------------------------ 30 // ------------------------------------------------------------------------
29 // 31 //
30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the 32 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the
31 // code formatting and variable names. The changes from the original are: 33 // code formatting and variable names. The changes from the original are:
32 // (1) replacing tabs with spaces, 34 // (1) replacing tabs with spaces,
33 // (2) indentation, 35 // (2) indentation,
34 // (3) using 'const', 36 // (3) using 'const',
35 // (4) changing the V and I parameters from int* to template <typename T>. 37 // (4) changing the V and I parameters from int* to template <typename T>.
36 // (5) optimizing split() and search(); fix styles. 38 // (5) optimizing search(); fix styles.
39 // (6) rewriting split() to use std::sort(), followed by I, V resolution.
37 // 40 //
38 // The code appears to be a rewritten version of the suffix array algorithm 41 // The code appears to be a rewritten version of the suffix array algorithm
39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko 42 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
40 // Sadakane, special cased for bytes. 43 // Sadakane, special cased for bytes.
41 44
45 // Before: V[I[start:end]] = end - 1.
46 // After: V[I[start:end]] take values in [start, end), forming "groups" of
47 // suffixes that are unresolved up to length |h|. But if a group has size 1,
48 // i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on
49 // the next iteration we'd skip it.
42 template <typename T> static void split(T I, T V, int start, int end, int h) { 50 template <typename T> static void split(T I, T V, int start, int end, int h) {
43 // For small interval, apply selection sort. 51 if (start >= end)
44 if (end - start < 6) { 52 return;
45 for (int i = start; i < end; ) { 53
46 int skip = 1; 54 // Sort I[start:end] using V[I[start:end] + h] as key.
47 int best = V[I[i] + h]; 55 std::sort(I.begin() + start, I.begin() + end,
48 for (int j = i + 1; j < end; j++) { 56 [&](int a, int b) { return V[a + h] < V[b + h]; });
49 int cur = V[I[j] + h]; 57
50 if (best > cur) { 58 // V[I[start:end] + h] is sorted. [start,end) can be split into distinct
51 best = cur; 59 // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b),
52 int tmp = I[i]; 60 // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign
53 I[i] = I[j]; 61 // I[a] := -1 to indicate completion.
54 I[j] = tmp; 62 int group_last_member = end - 1;
55 skip = 1; 63 int group_key = V[I[group_last_member] + h];
56 } else if (best == cur) { 64
57 int tmp = I[i + skip]; 65 // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass.
58 I[i + skip] = I[j]; 66 for (int i = end - 2; i >= start; --i) {
59 I[j] = tmp; 67 int cur_key = V[I[i] + h];
60 ++skip; 68
61 } 69 // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since
62 } 70 // we're reassigning all V[] values of |end - 1| to value in [begin, end)
63 if (skip == 1) { 71 // (that never appear elsewhere), so if we find anything in [begin, end),
64 V[I[i]] = i; 72 // we know it used to be |end - 1|, and can reinterpret accordingly.
65 I[i] = -1; 73 if (cur_key >= start && cur_key < end)
66 } else { 74 cur_key = end - 1;
67 for (int j = i, jend = i + skip; j < jend; j++) 75
68 V[I[j]] = jend - 1; 76 // Did we find new group?
69 } 77 if (group_key != cur_key) {
70 i += skip; 78 // If previous group found has 1 member, then mark completion.
79 if (group_last_member == i + 1)
80 I[i + 1] = -1;
81 group_last_member = i;
82 group_key = cur_key;
71 } 83 }
72 return; 84 V[I[i]] = group_last_member;
73 } 85 }
74 86
75 // Split [start, end) into 3 intervals: 87 if (group_last_member == start)
76 // [start, j) with secondary keys < pivot, 88 I[start] = -1;
77 // [j, k) with secondary keys == pivot,
78 // [k, end) with secondary keys > pivot.
79 int pivot = V[I[(start + end) / 2] + h];
80 int j = start;
81 int k = end;
82 for (int i = start; i < k; ) {
83 int cur = V[I[i] + h];
84 if (cur < pivot) {
85 if (i != j) {
86 int tmp = I[i];
87 I[i] = I[j];
88 I[j] = tmp;
89 }
90 ++i;
91 ++j;
92 } else if (cur > pivot) {
93 --k;
94 int tmp = I[i];
95 I[i] = I[k];
96 I[k] = tmp;
97 } else {
98 ++i;
99 }
100 }
101
102 // Recurse on the "< pivot" piece.
103 if (start < j)
104 split<T>(I, V, start, j, h);
105
106 // Update the "== pivot" piece.
107 if (j == k - 1) {
108 V[I[j]] = j;
109 I[j] = -1;
110 } else {
111 for (int i = j; i < k; ++i)
112 V[I[i]] = k - 1;
113 }
114
115 // Recurse on the "> pivot" piece.
116 if (k < end)
117 split<T>(I, V, k, end, h);
118 } 89 }
119 90
120 template <class T> 91 template <class T>
121 static void 92 static void
122 qsufsort(T I, T V,const unsigned char *old,int oldsize) 93 qsufsort(T I, T V,const unsigned char *old,int oldsize)
123 { 94 {
124 int buckets[256]; 95 int buckets[256];
125 int i,h,len; 96 int i,h,len;
126 97
127 for(i=0;i<256;i++) buckets[i]=0; 98 for(i=0;i<256;i++) buckets[i]=0;
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
190 } 161 }
191 *pos = I[hi]; 162 *pos = I[hi];
192 return y; 163 return y;
193 } 164 }
194 165
195 // End of 'verbatim' code. 166 // End of 'verbatim' code.
196 // ------------------------------------------------------------------------ 167 // ------------------------------------------------------------------------
197 168
198 } // namespace qsuf 169 } // namespace qsuf
199 } // namespace courgette 170 } // namespace courgette
OLDNEW
« no previous file with comments | « courgette/third_party/paged_array_unittest.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698