OLD | NEW |
1 /* | 1 /* |
2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
3 | 3 |
4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
5 | 5 |
6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
8 | 8 |
9 ChangeLog: | 9 ChangeLog: |
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
11 values throughout. | 11 values throughout. |
12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
16 2015-08-03 - Extrat qsufsort to a separate file as template. | 16 2015-08-03 - Extrat qsufsort to a separate file as template. |
17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
| 18 2015-08-19 - Optimize split() and search(), add comments. |
| 19 --Samuel Huang <huangs@chromium.org> |
18 */ | 20 */ |
19 | 21 |
20 #include <algorithm> | 22 #include <algorithm> |
21 #include <cstring> | 23 #include <cstring> |
22 | 24 |
23 namespace courgette { | 25 namespace courgette { |
24 namespace qsuf { | 26 namespace qsuf { |
25 | 27 |
26 // ------------------------------------------------------------------------ | 28 // ------------------------------------------------------------------------ |
27 // | 29 // |
28 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
29 // code formatting and variable names. The changes from the original are: | 31 // code formatting and variable names. The changes from the original are: |
30 // (1) replacing tabs with spaces, | 32 // (1) replacing tabs with spaces, |
31 // (2) indentation, | 33 // (2) indentation, |
32 // (3) using 'const', | 34 // (3) using 'const', |
33 // (4) changing the V and I parameters from int* to template <typename T>. | 35 // (4) changing the V and I parameters from int* to template <typename T>. |
| 36 // (5) optimizing split() and search(); fix styles. |
34 // | 37 // |
35 // The code appears to be a rewritten version of the suffix array algorithm | 38 // The code appears to be a rewritten version of the suffix array algorithm |
36 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
37 // Sadakane, special cased for bytes. | 40 // Sadakane, special cased for bytes. |
38 | 41 |
39 template <typename T> | 42 template <typename T> static void split(T I, T V, int start, int end, int h) { |
40 static void | 43 // For small interval, apply selection sort. |
41 split(T I,T V,int start,int len,int h) | 44 if (end - start < 6) { |
42 { | 45 for (int i = start; i < end; ) { |
43 int i,j,k,x,tmp,jj,kk; | 46 int skip = 1; |
| 47 int best = V[I[i] + h]; |
| 48 for (int j = i + 1; j < end; j++) { |
| 49 int cur = V[I[j] + h]; |
| 50 if (best > cur) { |
| 51 best = cur; |
| 52 int tmp = I[i]; |
| 53 I[i] = I[j]; |
| 54 I[j] = tmp; |
| 55 skip = 1; |
| 56 } else if (best == cur) { |
| 57 int tmp = I[i + skip]; |
| 58 I[i + skip] = I[j]; |
| 59 I[j] = tmp; |
| 60 ++skip; |
| 61 } |
| 62 } |
| 63 if (skip == 1) { |
| 64 V[I[i]] = i; |
| 65 I[i] = -1; |
| 66 } else { |
| 67 for (int j = i, jend = i + skip; j < jend; j++) |
| 68 V[I[j]] = jend - 1; |
| 69 } |
| 70 i += skip; |
| 71 } |
| 72 return; |
| 73 } |
44 | 74 |
45 if(len<16) { | 75 // Split [start, end) into 3 intervals: |
46 for(k=start;k<start+len;k+=j) { | 76 // [start, j) with secondary keys < pivot, |
47 j=1;x=V[I[k]+h]; | 77 // [j, k) with secondary keys == pivot, |
48 for(i=1;k+i<start+len;i++) { | 78 // [k, end) with secondary keys > pivot. |
49 if(V[I[k+i]+h]<x) { | 79 int pivot = V[I[(start + end) / 2] + h]; |
50 x=V[I[k+i]+h]; | 80 int j = start; |
51 j=0; | 81 int k = end; |
52 }; | 82 for (int i = start; i < k; ) { |
53 if(V[I[k+i]+h]==x) { | 83 int cur = V[I[i] + h]; |
54 tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp; | 84 if (cur < pivot) { |
55 j++; | 85 if (i != j) { |
56 }; | 86 int tmp = I[i]; |
57 }; | 87 I[i] = I[j]; |
58 for(i=0;i<j;i++) V[I[k+i]]=k+j-1; | 88 I[j] = tmp; |
59 if(j==1) I[k]=-1; | 89 } |
60 }; | 90 ++i; |
61 return; | 91 ++j; |
62 }; | 92 } else if (cur > pivot) { |
| 93 --k; |
| 94 int tmp = I[i]; |
| 95 I[i] = I[k]; |
| 96 I[k] = tmp; |
| 97 } else { |
| 98 ++i; |
| 99 } |
| 100 } |
63 | 101 |
64 x=V[I[start+len/2]+h]; | 102 // Recurse on the "< pivot" piece. |
65 jj=0;kk=0; | 103 if (start < j) |
66 for(i=start;i<start+len;i++) { | 104 split<T>(I, V, start, j, h); |
67 if(V[I[i]+h]<x) jj++; | |
68 if(V[I[i]+h]==x) kk++; | |
69 }; | |
70 jj+=start;kk+=jj; | |
71 | 105 |
72 i=start;j=0;k=0; | 106 // Update the "== pivot" piece. |
73 while(i<jj) { | 107 if (j == k - 1) { |
74 if(V[I[i]+h]<x) { | 108 V[I[j]] = j; |
75 i++; | 109 I[j] = -1; |
76 } else if(V[I[i]+h]==x) { | 110 } else { |
77 tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp; | 111 for (int i = j; i < k; ++i) |
78 j++; | 112 V[I[i]] = k - 1; |
79 } else { | 113 } |
80 tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp; | |
81 k++; | |
82 }; | |
83 }; | |
84 | 114 |
85 while(jj+j<kk) { | 115 // Recurse on the "> pivot" piece. |
86 if(V[I[jj+j]+h]==x) { | 116 if (k < end) |
87 j++; | 117 split<T>(I, V, k, end, h); |
88 } else { | |
89 tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp; | |
90 k++; | |
91 }; | |
92 }; | |
93 | |
94 if(jj>start) split<T>(I,V,start,jj-start,h); | |
95 | |
96 for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1; | |
97 if(jj==kk-1) I[jj]=-1; | |
98 | |
99 if(start+len>kk) split<T>(I,V,kk,start+len-kk,h); | |
100 } | 118 } |
101 | 119 |
102 template <class T> | 120 template <class T> |
103 static void | 121 static void |
104 qsufsort(T I, T V,const unsigned char *old,int oldsize) | 122 qsufsort(T I, T V,const unsigned char *old,int oldsize) |
105 { | 123 { |
106 int buckets[256]; | 124 int buckets[256]; |
107 int i,h,len; | 125 int i,h,len; |
108 | 126 |
109 for(i=0;i<256;i++) buckets[i]=0; | 127 for(i=0;i<256;i++) buckets[i]=0; |
(...skipping 11 matching lines...) Expand all Loading... |
121 | 139 |
122 for(h=1;I[0]!=-(oldsize+1);h+=h) { | 140 for(h=1;I[0]!=-(oldsize+1);h+=h) { |
123 len=0; | 141 len=0; |
124 for(i=0;i<oldsize+1;) { | 142 for(i=0;i<oldsize+1;) { |
125 if(I[i]<0) { | 143 if(I[i]<0) { |
126 len-=I[i]; | 144 len-=I[i]; |
127 i-=I[i]; | 145 i-=I[i]; |
128 } else { | 146 } else { |
129 if(len) I[i-len]=-len; | 147 if(len) I[i-len]=-len; |
130 len=V[I[i]]+1-i; | 148 len=V[I[i]]+1-i; |
131 split<T>(I,V,i,len,h); | 149 split<T>(I,V,i,i+len,h); |
132 i+=len; | 150 i+=len; |
133 len=0; | 151 len=0; |
134 }; | 152 }; |
135 }; | 153 }; |
136 if(len) I[i-len]=-len; | 154 if(len) I[i-len]=-len; |
137 }; | 155 }; |
138 | 156 |
139 for(i=0;i<oldsize+1;i++) I[V[i]]=i; | 157 for(i=0;i<oldsize+1;i++) I[V[i]]=i; |
140 } | 158 } |
141 | 159 |
142 static int | 160 static int |
143 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne
wsize) | 161 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne
wsize) |
144 { | 162 { |
145 int i; | 163 int i; |
146 | 164 |
147 for(i=0;(i<oldsize)&&(i<newsize);i++) | 165 for(i=0;(i<oldsize)&&(i<newsize);i++) |
148 if(old[i]!=newbuf[i]) break; | 166 if(old[i]!=newbuf[i]) break; |
149 | 167 |
150 return i; | 168 return i; |
151 } | 169 } |
152 | 170 |
153 template <class T> | 171 template <class T> |
154 static int | 172 static int search(T I, const unsigned char *old, int oldsize, |
155 search(T I,const unsigned char *old,int oldsize, | 173 const unsigned char *newbuf, int newsize, int *pos) { |
156 const unsigned char *newbuf,int newsize,int st,int en,int *pos) | 174 int lo = 0; |
157 { | 175 int hi = oldsize; |
158 int x,y; | 176 while (hi - lo >= 2) { |
159 | 177 int mid = (lo + hi) / 2; |
160 if(en-st<2) { | 178 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) { |
161 x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize); | 179 lo = mid; |
162 y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize); | |
163 | |
164 if(x>y) { | |
165 *pos=I[st]; | |
166 return x; | |
167 } else { | 180 } else { |
168 *pos=I[en]; | 181 hi = mid; |
169 return y; | |
170 } | 182 } |
171 } | 183 } |
172 | 184 |
173 x=st+(en-st)/2; | 185 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); |
174 if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) { | 186 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); |
175 return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos); | 187 if(x > y) { |
176 } else { | 188 *pos = I[lo]; |
177 return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos); | 189 return x; |
178 } | 190 } |
| 191 *pos = I[hi]; |
| 192 return y; |
179 } | 193 } |
180 | 194 |
181 // End of 'verbatim' code. | 195 // End of 'verbatim' code. |
182 // ------------------------------------------------------------------------ | 196 // ------------------------------------------------------------------------ |
183 | 197 |
184 } // namespace qsuf | 198 } // namespace qsuf |
185 } // namespace courgette | 199 } // namespace courgette |
OLD | NEW |