OLD | NEW |
1 /* | 1 /* |
2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
3 | 3 |
4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
5 | 5 |
6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
8 | 8 |
9 ChangeLog: | 9 ChangeLog: |
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
11 values throughout. | 11 values throughout. |
12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
16 2015-08-03 - Extrat qsufsort to a separate file as template. | 16 2015-08-03 - Extract QSufSort to a separate file as template. |
17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
18 2015-08-19 - Optimize split() and search(), add comments. | 18 2015-08-19 - Optimize split() and search(), add comments. |
19 --Samuel Huang <huangs@chromium.org> | 19 --Samuel Huang <huangs@chromium.org> |
| 20 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection |
| 21 algorithm, which QSufSort originally used. Reference: |
| 22 http://www.larsson.dogma.net/qsufsort.c |
| 23 --Samuel Huang <huangs@chromium.org> |
20 */ | 24 */ |
21 | 25 |
22 #include <algorithm> | 26 #include <algorithm> |
23 #include <cstring> | 27 #include <cstring> |
24 | 28 |
25 namespace courgette { | 29 namespace courgette { |
26 namespace qsuf { | 30 namespace qsuf { |
27 | 31 |
28 // ------------------------------------------------------------------------ | 32 // ------------------------------------------------------------------------ |
29 // | 33 // |
30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 34 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
31 // code formatting and variable names. The changes from the original are: | 35 // code formatting and variable names. The changes from the original are: |
32 // (1) replacing tabs with spaces, | 36 // (1) replacing tabs with spaces, |
33 // (2) indentation, | 37 // (2) indentation, |
34 // (3) using 'const', | 38 // (3) using 'const', |
35 // (4) changing the V and I parameters from int* to template <typename T>. | 39 // (4) changing the V and I parameters from int* to template <typename T>. |
36 // (5) optimizing split() and search(); fix styles. | 40 // (5) optimizing split() and search(); fix styles. |
37 // | 41 // |
38 // The code appears to be a rewritten version of the suffix array algorithm | 42 // The code appears to be a rewritten version of the suffix array algorithm |
39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 43 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
40 // Sadakane, special cased for bytes. | 44 // Sadakane, special cased for bytes. |
41 | 45 |
42 template <typename T> static void split(T I, T V, int start, int end, int h) { | 46 namespace { |
| 47 |
| 48 template <typename T> T median3(const T& a, const T& b, const T& c) { |
| 49 if (a < b) |
| 50 return b < c ? b : (a < c ? c : a); |
| 51 return b > c ? b : (a > c ? c : a); |
| 52 } |
| 53 |
| 54 } // namespace |
| 55 |
| 56 template <typename T> void split(T I, T V, int start, int end, int h) { |
43 // For small interval, apply selection sort. | 57 // For small interval, apply selection sort. |
44 if (end - start < 6) { | 58 if (end - start < 16) { |
45 for (int i = start; i < end; ) { | 59 for (int i = start; i < end; ) { |
46 int skip = 1; | 60 int skip = 1; |
47 int best = V[I[i] + h]; | 61 int best = V[I[i] + h]; |
48 for (int j = i + 1; j < end; j++) { | 62 for (int j = i + 1; j < end; j++) { |
49 int cur = V[I[j] + h]; | 63 int cur = V[I[j] + h]; |
50 if (best > cur) { | 64 if (best > cur) { |
51 best = cur; | 65 best = cur; |
52 int tmp = I[i]; | 66 int tmp = I[i]; |
53 I[i] = I[j]; | 67 I[i] = I[j]; |
54 I[j] = tmp; | 68 I[j] = tmp; |
(...skipping 10 matching lines...) Expand all Loading... |
65 I[i] = -1; | 79 I[i] = -1; |
66 } else { | 80 } else { |
67 for (int j = i, jend = i + skip; j < jend; j++) | 81 for (int j = i, jend = i + skip; j < jend; j++) |
68 V[I[j]] = jend - 1; | 82 V[I[j]] = jend - 1; |
69 } | 83 } |
70 i += skip; | 84 i += skip; |
71 } | 85 } |
72 return; | 86 return; |
73 } | 87 } |
74 | 88 |
| 89 // Select pivot, algorithm by Bentley & McIlroy. |
| 90 int n = end - start; |
| 91 int mid = start + (n >> 1); |
| 92 int pivot = V[I[mid] + h]; |
| 93 int p1 = V[I[start] + h]; |
| 94 int p2 = V[I[end - 1] + h]; |
| 95 if (n > 40) { // Big array: Pseudomedian of 9. |
| 96 int s = n >> 3; |
| 97 pivot = median3(pivot, V[I[mid - s] + h], V[I[mid + s] + h]); |
| 98 p1 = median3(p1, V[I[start + s] + h], V[I[start + s + s] + h]); |
| 99 p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]); |
| 100 } // Else medium array: Pseudomedian of 3. |
| 101 pivot = median3(pivot, p1, p2); |
| 102 |
75 // Split [start, end) into 3 intervals: | 103 // Split [start, end) into 3 intervals: |
76 // [start, j) with secondary keys < pivot, | 104 // [start, j) with secondary keys < pivot, |
77 // [j, k) with secondary keys == pivot, | 105 // [j, k) with secondary keys == pivot, |
78 // [k, end) with secondary keys > pivot. | 106 // [k, end) with secondary keys > pivot. |
79 int pivot = V[I[(start + end) / 2] + h]; | |
80 int j = start; | 107 int j = start; |
81 int k = end; | 108 int k = end; |
82 for (int i = start; i < k; ) { | 109 for (int i = start; i < k; ) { |
83 int cur = V[I[i] + h]; | 110 int cur = V[I[i] + h]; |
84 if (cur < pivot) { | 111 if (cur < pivot) { |
85 if (i != j) { | 112 if (i != j) { |
86 int tmp = I[i]; | 113 int tmp = I[i]; |
87 I[i] = I[j]; | 114 I[i] = I[j]; |
88 I[j] = tmp; | 115 I[j] = tmp; |
89 } | 116 } |
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
190 } | 217 } |
191 *pos = I[hi]; | 218 *pos = I[hi]; |
192 return y; | 219 return y; |
193 } | 220 } |
194 | 221 |
195 // End of 'verbatim' code. | 222 // End of 'verbatim' code. |
196 // ------------------------------------------------------------------------ | 223 // ------------------------------------------------------------------------ |
197 | 224 |
198 } // namespace qsuf | 225 } // namespace qsuf |
199 } // namespace courgette | 226 } // namespace courgette |
OLD | NEW |