OLD | NEW |
1 /* | 1 /* |
2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
3 | 3 |
4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
5 | 5 |
6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
8 | 8 |
9 ChangeLog: | 9 ChangeLog: |
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
11 values throughout. | 11 values throughout. |
12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
16 2015-08-03 - Extrat qsufsort to a separate file as template. | 16 2015-08-03 - Extract qsufsort to a separate file as template. |
17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
18 2015-08-19 - Optimize split() and search(), add comments. | 18 2015-08-19 - Optimize split() and search(), add comments. |
19 --Samuel Huang <huangs@chromium.org> | 19 --Samuel Huang <huangs@chromium.org> |
| 20 2016-04-2? - Rewrite split() to use std::sort() with I, V, resolution. |
| 21 --Samuel Huang <huangs@chromium.org> |
20 */ | 22 */ |
21 | 23 |
22 #include <algorithm> | 24 #include <algorithm> |
23 #include <cstring> | 25 #include <cstring> |
24 | 26 |
25 namespace courgette { | 27 namespace courgette { |
26 namespace qsuf { | 28 namespace qsuf { |
27 | 29 |
28 // ------------------------------------------------------------------------ | 30 // ------------------------------------------------------------------------ |
29 // | 31 // |
30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 32 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
31 // code formatting and variable names. The changes from the original are: | 33 // code formatting and variable names. The changes from the original are: |
32 // (1) replacing tabs with spaces, | 34 // (1) replacing tabs with spaces, |
33 // (2) indentation, | 35 // (2) indentation, |
34 // (3) using 'const', | 36 // (3) using 'const', |
35 // (4) changing the V and I parameters from int* to template <typename T>. | 37 // (4) changing the V and I parameters from int* to template <typename T>. |
36 // (5) optimizing split() and search(); fix styles. | 38 // (5) optimizing search(); fix styles. |
| 39 // (6) rewriting split() to use std::sort(), followed by I, V resolution. |
37 // | 40 // |
38 // The code appears to be a rewritten version of the suffix array algorithm | 41 // The code appears to be a rewritten version of the suffix array algorithm |
39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 42 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
40 // Sadakane, special cased for bytes. | 43 // Sadakane, special cased for bytes. |
41 | 44 |
| 45 // Before: V[I[start:end]] = end - 1. |
| 46 // After: V[I[start:end]] take values in [start, end), forming "groups" of |
| 47 // suffixes that are unresolved up to length |h|. But if a group has size 1, |
| 48 // i.e., V[I[i]] == i and is unique, then we reassign I[i] to be negative so on |
| 49 // the next iteration we'd skip it. |
42 template <typename T> static void split(T I, T V, int start, int end, int h) { | 50 template <typename T> static void split(T I, T V, int start, int end, int h) { |
43 // For small interval, apply selection sort. | 51 if (start >= end) |
44 if (end - start < 6) { | 52 return; |
45 for (int i = start; i < end; ) { | 53 |
46 int skip = 1; | 54 // Sort I[start:end] using V[I[start:end] + h] as key. |
47 int best = V[I[i] + h]; | 55 std::sort(I.begin() + start, I.begin() + end, |
48 for (int j = i + 1; j < end; j++) { | 56 [&](int a, int b) { return V[a + h] < V[b + h]; }); |
49 int cur = V[I[j] + h]; | 57 |
50 if (best > cur) { | 58 // V[I[start:end] + h] is sorted. [start,end) can be split into distinct |
51 best = cur; | 59 // "groups" {[a,b)} that statisfy V[I[a:b] + h] = const. For each group [a,b), |
52 int tmp = I[i]; | 60 // reassign V[I[a:b]] := b - 1. For size-1 groups (b - a == 1), reassign |
53 I[i] = I[j]; | 61 // I[a] := -1 to indicate completion. |
54 I[j] = tmp; | 62 int group_last_member = end - 1; |
55 skip = 1; | 63 int group_key = V[I[group_last_member] + h]; |
56 } else if (best == cur) { | 64 |
57 int tmp = I[i + skip]; | 65 // Iterate backwards to assign V[I[a:b]] := b - 1 in one pass. |
58 I[i + skip] = I[j]; | 66 for (int i = end - 2; i >= start; --i) { |
59 I[j] = tmp; | 67 int cur_key = V[I[i] + h]; |
60 ++skip; | 68 |
61 } | 69 // We update V[I[a:b]], but this can trample V[I[i] + h]! Fortunately, since |
62 } | 70 // we're reassigning all V[] values of |end - 1| to value in [begin, end) |
63 if (skip == 1) { | 71 // (that never appear elsewhere), so if we find anything in [begin, end), |
64 V[I[i]] = i; | 72 // we know it used to be |end - 1|, and can reinterpret accordingly. |
65 I[i] = -1; | 73 if (cur_key >= start && cur_key < end) |
66 } else { | 74 cur_key = end - 1; |
67 for (int j = i, jend = i + skip; j < jend; j++) | 75 |
68 V[I[j]] = jend - 1; | 76 // Did we find new group? |
69 } | 77 if (group_key != cur_key) { |
70 i += skip; | 78 // If previous group found has 1 member, then mark completion. |
| 79 if (group_last_member == i + 1) |
| 80 I[i + 1] = -1; |
| 81 group_last_member = i; |
| 82 group_key = cur_key; |
71 } | 83 } |
72 return; | 84 V[I[i]] = group_last_member; |
73 } | 85 } |
74 | 86 |
75 // Split [start, end) into 3 intervals: | 87 if (group_last_member == start) |
76 // [start, j) with secondary keys < pivot, | 88 I[start] = -1; |
77 // [j, k) with secondary keys == pivot, | |
78 // [k, end) with secondary keys > pivot. | |
79 int pivot = V[I[(start + end) / 2] + h]; | |
80 int j = start; | |
81 int k = end; | |
82 for (int i = start; i < k; ) { | |
83 int cur = V[I[i] + h]; | |
84 if (cur < pivot) { | |
85 if (i != j) { | |
86 int tmp = I[i]; | |
87 I[i] = I[j]; | |
88 I[j] = tmp; | |
89 } | |
90 ++i; | |
91 ++j; | |
92 } else if (cur > pivot) { | |
93 --k; | |
94 int tmp = I[i]; | |
95 I[i] = I[k]; | |
96 I[k] = tmp; | |
97 } else { | |
98 ++i; | |
99 } | |
100 } | |
101 | |
102 // Recurse on the "< pivot" piece. | |
103 if (start < j) | |
104 split<T>(I, V, start, j, h); | |
105 | |
106 // Update the "== pivot" piece. | |
107 if (j == k - 1) { | |
108 V[I[j]] = j; | |
109 I[j] = -1; | |
110 } else { | |
111 for (int i = j; i < k; ++i) | |
112 V[I[i]] = k - 1; | |
113 } | |
114 | |
115 // Recurse on the "> pivot" piece. | |
116 if (k < end) | |
117 split<T>(I, V, k, end, h); | |
118 } | 89 } |
119 | 90 |
120 template <class T> | 91 template <class T> |
121 static void | 92 static void |
122 qsufsort(T I, T V,const unsigned char *old,int oldsize) | 93 qsufsort(T I, T V,const unsigned char *old,int oldsize) |
123 { | 94 { |
124 int buckets[256]; | 95 int buckets[256]; |
125 int i,h,len; | 96 int i,h,len; |
126 | 97 |
127 for(i=0;i<256;i++) buckets[i]=0; | 98 for(i=0;i<256;i++) buckets[i]=0; |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
190 } | 161 } |
191 *pos = I[hi]; | 162 *pos = I[hi]; |
192 return y; | 163 return y; |
193 } | 164 } |
194 | 165 |
195 // End of 'verbatim' code. | 166 // End of 'verbatim' code. |
196 // ------------------------------------------------------------------------ | 167 // ------------------------------------------------------------------------ |
197 | 168 |
198 } // namespace qsuf | 169 } // namespace qsuf |
199 } // namespace courgette | 170 } // namespace courgette |
OLD | NEW |