OLD | NEW |
| (Empty) |
1 /* | |
2 qsufsort.h -- Suffix array generation. | |
3 | |
4 Copyright 2003 Colin Percival | |
5 | |
6 For the terms under which this work may be distributed, please see | |
7 the adjoining file "LICENSE". | |
8 | |
9 ChangeLog: | |
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | |
11 values throughout. | |
12 --Benjamin Smedberg <benjamin@smedbergs.us> | |
13 2010-05-26 - Use a paged array for V and I. The address space may be too | |
14 fragmented for these big arrays to be contiguous. | |
15 --Stephen Adams <sra@chromium.org> | |
16 2015-08-03 - Extract QSufSort to a separate file as template. | |
17 --Samuel Huang <huangs@chromium.org> | |
18 2015-08-19 - Optimize split() and search(), add comments. | |
19 --Samuel Huang <huangs@chromium.org> | |
20 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection | |
21 algorithm, which QSufSort originally used. Reference: | |
22 http://www.larsson.dogma.net/qsufsort.c | |
23 --Samuel Huang <huangs@chromium.org> | |
24 */ | |
25 | |
26 #include <algorithm> | |
27 #include <cstring> | |
28 | |
29 namespace courgette { | |
30 namespace qsuf { | |
31 | |
32 // ------------------------------------------------------------------------ | |
33 // | |
34 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | |
35 // code formatting and variable names. The changes from the original are: | |
36 // (1) replacing tabs with spaces, | |
37 // (2) indentation, | |
38 // (3) using 'const', | |
39 // (4) changing the V and I parameters from int* to template <typename T>. | |
40 // (5) optimizing split() and search(); fix styles. | |
41 // | |
42 // The code appears to be a rewritten version of the suffix array algorithm | |
43 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | |
44 // Sadakane, special cased for bytes. | |
45 | |
46 namespace { | |
47 | |
48 template <typename T> T median3(const T& a, const T& b, const T& c) { | |
49 if (a < b) | |
50 return b < c ? b : (a < c ? c : a); | |
51 return b > c ? b : (a > c ? c : a); | |
52 } | |
53 | |
54 } // namespace | |
55 | |
56 template <typename T> void split(T I, T V, int start, int end, int h) { | |
57 // For small interval, apply selection sort. | |
58 if (end - start < 16) { | |
59 for (int i = start; i < end; ) { | |
60 int skip = 1; | |
61 int best = V[I[i] + h]; | |
62 for (int j = i + 1; j < end; j++) { | |
63 int cur = V[I[j] + h]; | |
64 if (best > cur) { | |
65 best = cur; | |
66 int tmp = I[i]; | |
67 I[i] = I[j]; | |
68 I[j] = tmp; | |
69 skip = 1; | |
70 } else if (best == cur) { | |
71 int tmp = I[i + skip]; | |
72 I[i + skip] = I[j]; | |
73 I[j] = tmp; | |
74 ++skip; | |
75 } | |
76 } | |
77 if (skip == 1) { | |
78 V[I[i]] = i; | |
79 I[i] = -1; | |
80 } else { | |
81 for (int j = i, jend = i + skip; j < jend; j++) | |
82 V[I[j]] = jend - 1; | |
83 } | |
84 i += skip; | |
85 } | |
86 return; | |
87 } | |
88 | |
89 // Select pivot, algorithm by Bentley & McIlroy. | |
90 int n = end - start; | |
91 int mid = start + (n >> 1); | |
92 int pivot = V[I[mid] + h]; | |
93 int p1 = V[I[start] + h]; | |
94 int p2 = V[I[end - 1] + h]; | |
95 if (n > 40) { // Big array: Pseudomedian of 9. | |
96 int s = n >> 3; | |
97 pivot = median3(pivot, V[I[mid - s] + h], V[I[mid + s] + h]); | |
98 p1 = median3(p1, V[I[start + s] + h], V[I[start + s + s] + h]); | |
99 p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]); | |
100 } // Else medium array: Pseudomedian of 3. | |
101 pivot = median3(pivot, p1, p2); | |
102 | |
103 // Split [start, end) into 3 intervals: | |
104 // [start, j) with secondary keys < pivot, | |
105 // [j, k) with secondary keys == pivot, | |
106 // [k, end) with secondary keys > pivot. | |
107 int j = start; | |
108 int k = end; | |
109 for (int i = start; i < k; ) { | |
110 int cur = V[I[i] + h]; | |
111 if (cur < pivot) { | |
112 if (i != j) { | |
113 int tmp = I[i]; | |
114 I[i] = I[j]; | |
115 I[j] = tmp; | |
116 } | |
117 ++i; | |
118 ++j; | |
119 } else if (cur > pivot) { | |
120 --k; | |
121 int tmp = I[i]; | |
122 I[i] = I[k]; | |
123 I[k] = tmp; | |
124 } else { | |
125 ++i; | |
126 } | |
127 } | |
128 | |
129 // Recurse on the "< pivot" piece. | |
130 if (start < j) | |
131 split<T>(I, V, start, j, h); | |
132 | |
133 // Update the "== pivot" piece. | |
134 if (j == k - 1) { | |
135 V[I[j]] = j; | |
136 I[j] = -1; | |
137 } else { | |
138 for (int i = j; i < k; ++i) | |
139 V[I[i]] = k - 1; | |
140 } | |
141 | |
142 // Recurse on the "> pivot" piece. | |
143 if (k < end) | |
144 split<T>(I, V, k, end, h); | |
145 } | |
146 | |
147 template <class T> | |
148 static void | |
149 qsufsort(T I, T V,const unsigned char *old,int oldsize) | |
150 { | |
151 int buckets[256]; | |
152 int i,h,len; | |
153 | |
154 for(i=0;i<256;i++) buckets[i]=0; | |
155 for(i=0;i<oldsize;i++) buckets[old[i]]++; | |
156 for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; | |
157 for(i=255;i>0;i--) buckets[i]=buckets[i-1]; | |
158 buckets[0]=0; | |
159 | |
160 for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i; | |
161 I[0]=oldsize; | |
162 for(i=0;i<oldsize;i++) V[i]=buckets[old[i]]; | |
163 V[oldsize]=0; | |
164 for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1; | |
165 I[0]=-1; | |
166 | |
167 for(h=1;I[0]!=-(oldsize+1);h+=h) { | |
168 len=0; | |
169 for(i=0;i<oldsize+1;) { | |
170 if(I[i]<0) { | |
171 len-=I[i]; | |
172 i-=I[i]; | |
173 } else { | |
174 if(len) I[i-len]=-len; | |
175 len=V[I[i]]+1-i; | |
176 split<T>(I,V,i,i+len,h); | |
177 i+=len; | |
178 len=0; | |
179 }; | |
180 }; | |
181 if(len) I[i-len]=-len; | |
182 }; | |
183 | |
184 for(i=0;i<oldsize+1;i++) I[V[i]]=i; | |
185 } | |
186 | |
187 static int | |
188 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne
wsize) | |
189 { | |
190 int i; | |
191 | |
192 for(i=0;(i<oldsize)&&(i<newsize);i++) | |
193 if(old[i]!=newbuf[i]) break; | |
194 | |
195 return i; | |
196 } | |
197 | |
198 template <class T> | |
199 static int search(T I, const unsigned char *old, int oldsize, | |
200 const unsigned char *newbuf, int newsize, int *pos) { | |
201 int lo = 0; | |
202 int hi = oldsize; | |
203 while (hi - lo >= 2) { | |
204 int mid = (lo + hi) / 2; | |
205 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) { | |
206 lo = mid; | |
207 } else { | |
208 hi = mid; | |
209 } | |
210 } | |
211 | |
212 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); | |
213 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); | |
214 if(x > y) { | |
215 *pos = I[lo]; | |
216 return x; | |
217 } | |
218 *pos = I[hi]; | |
219 return y; | |
220 } | |
221 | |
222 // End of 'verbatim' code. | |
223 // ------------------------------------------------------------------------ | |
224 | |
225 } // namespace qsuf | |
226 } // namespace courgette | |
OLD | NEW |