OLD | NEW |
1 /* | 1 /* |
2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
3 | 3 |
4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
5 | 5 |
6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
8 | 8 |
9 ChangeLog: | 9 ChangeLog: |
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
11 values throughout. | 11 values throughout. |
12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
16 2015-08-03 - Extract QSufSort to a separate file as template. | 16 2015-08-03 - Extract QSufSort to a separate file as template. |
17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
18 2015-08-19 - Optimize split() and search(), add comments. | 18 2015-08-19 - Optimize split() and search(), add comments. |
19 --Samuel Huang <huangs@chromium.org> | 19 --Samuel Huang <huangs@chromium.org> |
20 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection | 20 2016-04-27 - Change split() to use Bentley & McIlroy's pivot selection |
21 algorithm, which QSufSort originally used. Reference: | 21 algorithm, which QSufSort originally used. Reference: |
22 http://www.larsson.dogma.net/qsufsort.c | 22 http://www.larsson.dogma.net/qsufsort.c |
23 --Samuel Huang <huangs@chromium.org> | 23 --Samuel Huang <huangs@chromium.org> |
24 */ | 24 */ |
25 | 25 |
| 26 #ifndef COURGETTE_THIRD_PARTY_BSDIFF_QSUFSORT_H_ |
| 27 #define COURGETTE_THIRD_PARTY_BSDIFF_QSUFSORT_H_ |
| 28 |
26 #include <algorithm> | 29 #include <algorithm> |
27 #include <cstring> | 30 #include <cstring> |
28 | 31 |
29 namespace courgette { | 32 namespace courgette { |
30 namespace qsuf { | 33 namespace qsuf { |
31 | 34 |
32 // ------------------------------------------------------------------------ | 35 // ------------------------------------------------------------------------ |
33 // | 36 // |
34 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 37 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
35 // code formatting and variable names. The changes from the original are: | 38 // code formatting and variable names. The changes from the original are: |
36 // (1) replacing tabs with spaces, | 39 // (1) replacing tabs with spaces, |
37 // (2) indentation, | 40 // (2) indentation and spacing, |
38 // (3) using 'const', | 41 // (3) using 'const', |
39 // (4) changing the V and I parameters from int* to template <typename T>. | 42 // (4) changing the V and I parameters from int* to template <typename T>. |
40 // (5) optimizing split() and search(); fix styles. | 43 // (5) optimizing split() and search(); fix styles. |
41 // | 44 // |
42 // The code appears to be a rewritten version of the suffix array algorithm | 45 // The code appears to be a rewritten version of the suffix array algorithm |
43 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 46 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
44 // Sadakane, special cased for bytes. | 47 // Sadakane, special cased for bytes. |
45 | 48 |
46 namespace { | 49 namespace { |
47 | 50 |
48 template <typename T> T median3(const T& a, const T& b, const T& c) { | 51 template <typename T> |
| 52 T median3(const T& a, const T& b, const T& c) { |
49 if (a < b) | 53 if (a < b) |
50 return b < c ? b : (a < c ? c : a); | 54 return b < c ? b : (a < c ? c : a); |
51 return b > c ? b : (a > c ? c : a); | 55 return b > c ? b : (a > c ? c : a); |
52 } | 56 } |
53 | 57 |
54 } // namespace | 58 } // namespace |
55 | 59 |
56 template <typename T> void split(T I, T V, int start, int end, int h) { | 60 template <typename T> |
| 61 void split(T I, T V, int start, int end, int h) { |
57 // For small interval, apply selection sort. | 62 // For small interval, apply selection sort. |
58 if (end - start < 16) { | 63 if (end - start < 16) { |
59 for (int i = start; i < end; ) { | 64 for (int i = start; i < end;) { |
60 int skip = 1; | 65 int skip = 1; |
61 int best = V[I[i] + h]; | 66 int best = V[I[i] + h]; |
62 for (int j = i + 1; j < end; j++) { | 67 for (int j = i + 1; j < end; j++) { |
63 int cur = V[I[j] + h]; | 68 int cur = V[I[j] + h]; |
64 if (best > cur) { | 69 if (best > cur) { |
65 best = cur; | 70 best = cur; |
66 int tmp = I[i]; | 71 int tmp = I[i]; |
67 I[i] = I[j]; | 72 I[i] = I[j]; |
68 I[j] = tmp; | 73 I[j] = tmp; |
69 skip = 1; | 74 skip = 1; |
(...skipping 29 matching lines...) Expand all Loading... |
99 p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]); | 104 p2 = median3(p2, V[I[end - 1 - s] + h], V[I[end - 1 - s - s] + h]); |
100 } // Else medium array: Pseudomedian of 3. | 105 } // Else medium array: Pseudomedian of 3. |
101 pivot = median3(pivot, p1, p2); | 106 pivot = median3(pivot, p1, p2); |
102 | 107 |
103 // Split [start, end) into 3 intervals: | 108 // Split [start, end) into 3 intervals: |
104 // [start, j) with secondary keys < pivot, | 109 // [start, j) with secondary keys < pivot, |
105 // [j, k) with secondary keys == pivot, | 110 // [j, k) with secondary keys == pivot, |
106 // [k, end) with secondary keys > pivot. | 111 // [k, end) with secondary keys > pivot. |
107 int j = start; | 112 int j = start; |
108 int k = end; | 113 int k = end; |
109 for (int i = start; i < k; ) { | 114 for (int i = start; i < k;) { |
110 int cur = V[I[i] + h]; | 115 int cur = V[I[i] + h]; |
111 if (cur < pivot) { | 116 if (cur < pivot) { |
112 if (i != j) { | 117 if (i != j) { |
113 int tmp = I[i]; | 118 int tmp = I[i]; |
114 I[i] = I[j]; | 119 I[i] = I[j]; |
115 I[j] = tmp; | 120 I[j] = tmp; |
116 } | 121 } |
117 ++i; | 122 ++i; |
118 ++j; | 123 ++j; |
119 } else if (cur > pivot) { | 124 } else if (cur > pivot) { |
(...skipping 18 matching lines...) Expand all Loading... |
138 for (int i = j; i < k; ++i) | 143 for (int i = j; i < k; ++i) |
139 V[I[i]] = k - 1; | 144 V[I[i]] = k - 1; |
140 } | 145 } |
141 | 146 |
142 // Recurse on the "> pivot" piece. | 147 // Recurse on the "> pivot" piece. |
143 if (k < end) | 148 if (k < end) |
144 split<T>(I, V, k, end, h); | 149 split<T>(I, V, k, end, h); |
145 } | 150 } |
146 | 151 |
147 template <class T> | 152 template <class T> |
148 static void | 153 static void qsufsort(T I, T V, const unsigned char* old, int oldsize) { |
149 qsufsort(T I, T V,const unsigned char *old,int oldsize) | |
150 { | |
151 int buckets[256]; | 154 int buckets[256]; |
152 int i,h,len; | 155 int i, h, len; |
153 | 156 |
154 for(i=0;i<256;i++) buckets[i]=0; | 157 for (i = 0; i < 256; i++) |
155 for(i=0;i<oldsize;i++) buckets[old[i]]++; | 158 buckets[i] = 0; |
156 for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; | 159 for (i = 0; i < oldsize; i++) |
157 for(i=255;i>0;i--) buckets[i]=buckets[i-1]; | 160 buckets[old[i]]++; |
158 buckets[0]=0; | 161 for (i = 1; i < 256; i++) |
| 162 buckets[i] += buckets[i - 1]; |
| 163 for (i = 255; i > 0; i--) |
| 164 buckets[i] = buckets[i - 1]; |
| 165 buckets[0] = 0; |
159 | 166 |
160 for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i; | 167 for (i = 0; i < oldsize; i++) |
161 I[0]=oldsize; | 168 I[++buckets[old[i]]] = i; |
162 for(i=0;i<oldsize;i++) V[i]=buckets[old[i]]; | 169 I[0] = oldsize; |
163 V[oldsize]=0; | 170 for (i = 0; i < oldsize; i++) |
164 for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1; | 171 V[i] = buckets[old[i]]; |
165 I[0]=-1; | 172 V[oldsize] = 0; |
| 173 for (i = 1; i < 256; i++) |
| 174 if (buckets[i] == buckets[i - 1] + 1) |
| 175 I[buckets[i]] = -1; |
| 176 I[0] = -1; |
166 | 177 |
167 for(h=1;I[0]!=-(oldsize+1);h+=h) { | 178 for (h = 1; I[0] != -(oldsize + 1); h += h) { |
168 len=0; | 179 len = 0; |
169 for(i=0;i<oldsize+1;) { | 180 for (i = 0; i < oldsize + 1;) { |
170 if(I[i]<0) { | 181 if (I[i] < 0) { |
171 len-=I[i]; | 182 len -= I[i]; |
172 i-=I[i]; | 183 i -= I[i]; |
173 } else { | 184 } else { |
174 if(len) I[i-len]=-len; | 185 if (len) |
175 len=V[I[i]]+1-i; | 186 I[i - len] = -len; |
176 split<T>(I,V,i,i+len,h); | 187 len = V[I[i]] + 1 - i; |
177 i+=len; | 188 split<T>(I, V, i, i + len, h); |
178 len=0; | 189 i += len; |
| 190 len = 0; |
179 }; | 191 }; |
180 }; | 192 }; |
181 if(len) I[i-len]=-len; | 193 if (len) |
| 194 I[i - len] = -len; |
182 }; | 195 }; |
183 | 196 |
184 for(i=0;i<oldsize+1;i++) I[V[i]]=i; | 197 for (i = 0; i < oldsize + 1; i++) |
| 198 I[V[i]] = i; |
185 } | 199 } |
186 | 200 |
187 static int | 201 static int matchlen(const unsigned char* old, |
188 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne
wsize) | 202 int oldsize, |
189 { | 203 const unsigned char* newbuf, |
| 204 int newsize) { |
190 int i; | 205 int i; |
191 | 206 |
192 for(i=0;(i<oldsize)&&(i<newsize);i++) | 207 for (i = 0; (i < oldsize) && (i < newsize); i++) |
193 if(old[i]!=newbuf[i]) break; | 208 if (old[i] != newbuf[i]) |
| 209 break; |
194 | 210 |
195 return i; | 211 return i; |
196 } | 212 } |
197 | 213 |
198 template <class T> | 214 template <class T> |
199 static int search(T I, const unsigned char *old, int oldsize, | 215 static int search(T I, |
200 const unsigned char *newbuf, int newsize, int *pos) { | 216 const unsigned char* old, |
| 217 int oldsize, |
| 218 const unsigned char* newbuf, |
| 219 int newsize, |
| 220 int* pos) { |
201 int lo = 0; | 221 int lo = 0; |
202 int hi = oldsize; | 222 int hi = oldsize; |
203 while (hi - lo >= 2) { | 223 while (hi - lo >= 2) { |
204 int mid = (lo + hi) / 2; | 224 int mid = (lo + hi) / 2; |
205 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) { | 225 if (memcmp(old + I[mid], newbuf, std::min(oldsize - I[mid], newsize)) < 0) { |
206 lo = mid; | 226 lo = mid; |
207 } else { | 227 } else { |
208 hi = mid; | 228 hi = mid; |
209 } | 229 } |
210 } | 230 } |
211 | 231 |
212 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); | 232 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); |
213 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); | 233 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); |
214 if(x > y) { | 234 if (x > y) { |
215 *pos = I[lo]; | 235 *pos = I[lo]; |
216 return x; | 236 return x; |
217 } | 237 } |
218 *pos = I[hi]; | 238 *pos = I[hi]; |
219 return y; | 239 return y; |
220 } | 240 } |
221 | 241 |
222 // End of 'verbatim' code. | 242 // End of 'verbatim' code. |
223 // ------------------------------------------------------------------------ | 243 // ------------------------------------------------------------------------ |
224 | 244 |
225 } // namespace qsuf | 245 } // namespace qsuf |
226 } // namespace courgette | 246 } // namespace courgette |
| 247 #endif // COURGETTE_THIRD_PARTY_BSDIFF_QSUFSORT_H_ |
OLD | NEW |