| OLD | NEW |
| 1 /* | 1 /* |
| 2 qsufsort.h -- Suffix array generation. | 2 qsufsort.h -- Suffix array generation. |
| 3 | 3 |
| 4 Copyright 2003 Colin Percival | 4 Copyright 2003 Colin Percival |
| 5 | 5 |
| 6 For the terms under which this work may be distributed, please see | 6 For the terms under which this work may be distributed, please see |
| 7 the adjoining file "LICENSE". | 7 the adjoining file "LICENSE". |
| 8 | 8 |
| 9 ChangeLog: | 9 ChangeLog: |
| 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit | 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit |
| 11 values throughout. | 11 values throughout. |
| 12 --Benjamin Smedberg <benjamin@smedbergs.us> | 12 --Benjamin Smedberg <benjamin@smedbergs.us> |
| 13 2010-05-26 - Use a paged array for V and I. The address space may be too | 13 2010-05-26 - Use a paged array for V and I. The address space may be too |
| 14 fragmented for these big arrays to be contiguous. | 14 fragmented for these big arrays to be contiguous. |
| 15 --Stephen Adams <sra@chromium.org> | 15 --Stephen Adams <sra@chromium.org> |
| 16 2015-08-03 - Extrat qsufsort to a separate file as template. | 16 2015-08-03 - Extrat qsufsort to a separate file as template. |
| 17 --Samuel Huang <huangs@chromium.org> | 17 --Samuel Huang <huangs@chromium.org> |
| 18 2015-08-19 - Optimize split() and search(), add comments. |
| 19 --Samuel Huang <huangs@chromium.org> |
| 18 */ | 20 */ |
| 19 | 21 |
| 20 #include <algorithm> | 22 #include <algorithm> |
| 21 #include <cstring> | 23 #include <cstring> |
| 22 | 24 |
| 23 namespace courgette { | 25 namespace courgette { |
| 24 namespace qsuf { | 26 namespace qsuf { |
| 25 | 27 |
| 26 // ------------------------------------------------------------------------ | 28 // ------------------------------------------------------------------------ |
| 27 // | 29 // |
| 28 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the | 30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the |
| 29 // code formatting and variable names. The changes from the original are: | 31 // code formatting and variable names. The changes from the original are: |
| 30 // (1) replacing tabs with spaces, | 32 // (1) replacing tabs with spaces, |
| 31 // (2) indentation, | 33 // (2) indentation, |
| 32 // (3) using 'const', | 34 // (3) using 'const', |
| 33 // (4) changing the V and I parameters from int* to template <typename T>. | 35 // (4) changing the V and I parameters from int* to template <typename T>. |
| 36 // (5) optimizing split() and search(); fix styles. |
| 34 // | 37 // |
| 35 // The code appears to be a rewritten version of the suffix array algorithm | 38 // The code appears to be a rewritten version of the suffix array algorithm |
| 36 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko | 39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko |
| 37 // Sadakane, special cased for bytes. | 40 // Sadakane, special cased for bytes. |
| 38 | 41 |
| 39 template <typename T> | 42 template <typename T> static void split(T I, T V, int start, int end, int h) { |
| 40 static void | 43 // For small interval, apply selection sort. |
| 41 split(T I,T V,int start,int len,int h) | 44 if (end - start < 6) { |
| 42 { | 45 for (int i = start; i < end; ) { |
| 43 int i,j,k,x,tmp,jj,kk; | 46 int skip = 1; |
| 47 int best = V[I[i] + h]; |
| 48 for (int j = i + 1; j < end; j++) { |
| 49 int cur = V[I[j] + h]; |
| 50 if (best > cur) { |
| 51 best = cur; |
| 52 int tmp = I[i]; |
| 53 I[i] = I[j]; |
| 54 I[j] = tmp; |
| 55 skip = 1; |
| 56 } else if (best == cur) { |
| 57 int tmp = I[i + skip]; |
| 58 I[i + skip] = I[j]; |
| 59 I[j] = tmp; |
| 60 ++skip; |
| 61 } |
| 62 } |
| 63 if (skip == 1) { |
| 64 V[I[i]] = i; |
| 65 I[i] = -1; |
| 66 } else { |
| 67 for (int j = i, jend = i + skip; j < jend; j++) |
| 68 V[I[j]] = jend - 1; |
| 69 } |
| 70 i += skip; |
| 71 } |
| 72 return; |
| 73 } |
| 44 | 74 |
| 45 if(len<16) { | 75 // Split [start, end) into 3 intervals: |
| 46 for(k=start;k<start+len;k+=j) { | 76 // [start, j) with secondary keys < pivot, |
| 47 j=1;x=V[I[k]+h]; | 77 // [j, k) with secondary keys == pivot, |
| 48 for(i=1;k+i<start+len;i++) { | 78 // [k, end) with secondary keys > pivot. |
| 49 if(V[I[k+i]+h]<x) { | 79 int pivot = V[I[(start + end) / 2] + h]; |
| 50 x=V[I[k+i]+h]; | 80 int j = start; |
| 51 j=0; | 81 int k = end; |
| 52 }; | 82 for (int i = start; i < k; ) { |
| 53 if(V[I[k+i]+h]==x) { | 83 int cur = V[I[i] + h]; |
| 54 tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp; | 84 if (cur < pivot) { |
| 55 j++; | 85 if (i != j) { |
| 56 }; | 86 int tmp = I[i]; |
| 57 }; | 87 I[i] = I[j]; |
| 58 for(i=0;i<j;i++) V[I[k+i]]=k+j-1; | 88 I[j] = tmp; |
| 59 if(j==1) I[k]=-1; | 89 } |
| 60 }; | 90 ++i; |
| 61 return; | 91 ++j; |
| 62 }; | 92 } else if (cur > pivot) { |
| 93 --k; |
| 94 int tmp = I[i]; |
| 95 I[i] = I[k]; |
| 96 I[k] = tmp; |
| 97 } else { |
| 98 ++i; |
| 99 } |
| 100 } |
| 63 | 101 |
| 64 x=V[I[start+len/2]+h]; | 102 // Recurse on the "< pivot" piece. |
| 65 jj=0;kk=0; | 103 if (start < j) |
| 66 for(i=start;i<start+len;i++) { | 104 split<T>(I, V, start, j, h); |
| 67 if(V[I[i]+h]<x) jj++; | |
| 68 if(V[I[i]+h]==x) kk++; | |
| 69 }; | |
| 70 jj+=start;kk+=jj; | |
| 71 | 105 |
| 72 i=start;j=0;k=0; | 106 // Update the "== pivot" piece. |
| 73 while(i<jj) { | 107 if (j == k - 1) { |
| 74 if(V[I[i]+h]<x) { | 108 V[I[j]] = j; |
| 75 i++; | 109 I[j] = -1; |
| 76 } else if(V[I[i]+h]==x) { | 110 } else { |
| 77 tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp; | 111 for (int i = j; i < k; ++i) |
| 78 j++; | 112 V[I[i]] = k - 1; |
| 79 } else { | 113 } |
| 80 tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp; | |
| 81 k++; | |
| 82 }; | |
| 83 }; | |
| 84 | 114 |
| 85 while(jj+j<kk) { | 115 // Recurse on the "> pivot" piece. |
| 86 if(V[I[jj+j]+h]==x) { | 116 if (k < end) |
| 87 j++; | 117 split<T>(I, V, k, end, h); |
| 88 } else { | |
| 89 tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp; | |
| 90 k++; | |
| 91 }; | |
| 92 }; | |
| 93 | |
| 94 if(jj>start) split<T>(I,V,start,jj-start,h); | |
| 95 | |
| 96 for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1; | |
| 97 if(jj==kk-1) I[jj]=-1; | |
| 98 | |
| 99 if(start+len>kk) split<T>(I,V,kk,start+len-kk,h); | |
| 100 } | 118 } |
| 101 | 119 |
| 102 template <class T> | 120 template <class T> |
| 103 static void | 121 static void |
| 104 qsufsort(T I, T V,const unsigned char *old,int oldsize) | 122 qsufsort(T I, T V,const unsigned char *old,int oldsize) |
| 105 { | 123 { |
| 106 int buckets[256]; | 124 int buckets[256]; |
| 107 int i,h,len; | 125 int i,h,len; |
| 108 | 126 |
| 109 for(i=0;i<256;i++) buckets[i]=0; | 127 for(i=0;i<256;i++) buckets[i]=0; |
| (...skipping 11 matching lines...) Expand all Loading... |
| 121 | 139 |
| 122 for(h=1;I[0]!=-(oldsize+1);h+=h) { | 140 for(h=1;I[0]!=-(oldsize+1);h+=h) { |
| 123 len=0; | 141 len=0; |
| 124 for(i=0;i<oldsize+1;) { | 142 for(i=0;i<oldsize+1;) { |
| 125 if(I[i]<0) { | 143 if(I[i]<0) { |
| 126 len-=I[i]; | 144 len-=I[i]; |
| 127 i-=I[i]; | 145 i-=I[i]; |
| 128 } else { | 146 } else { |
| 129 if(len) I[i-len]=-len; | 147 if(len) I[i-len]=-len; |
| 130 len=V[I[i]]+1-i; | 148 len=V[I[i]]+1-i; |
| 131 split<T>(I,V,i,len,h); | 149 split<T>(I,V,i,i+len,h); |
| 132 i+=len; | 150 i+=len; |
| 133 len=0; | 151 len=0; |
| 134 }; | 152 }; |
| 135 }; | 153 }; |
| 136 if(len) I[i-len]=-len; | 154 if(len) I[i-len]=-len; |
| 137 }; | 155 }; |
| 138 | 156 |
| 139 for(i=0;i<oldsize+1;i++) I[V[i]]=i; | 157 for(i=0;i<oldsize+1;i++) I[V[i]]=i; |
| 140 } | 158 } |
| 141 | 159 |
| 142 static int | 160 static int |
| 143 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne
wsize) | 161 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne
wsize) |
| 144 { | 162 { |
| 145 int i; | 163 int i; |
| 146 | 164 |
| 147 for(i=0;(i<oldsize)&&(i<newsize);i++) | 165 for(i=0;(i<oldsize)&&(i<newsize);i++) |
| 148 if(old[i]!=newbuf[i]) break; | 166 if(old[i]!=newbuf[i]) break; |
| 149 | 167 |
| 150 return i; | 168 return i; |
| 151 } | 169 } |
| 152 | 170 |
| 153 template <class T> | 171 template <class T> |
| 154 static int | 172 static int search(T I, const unsigned char *old, int oldsize, |
| 155 search(T I,const unsigned char *old,int oldsize, | 173 const unsigned char *newbuf, int newsize, int *pos) { |
| 156 const unsigned char *newbuf,int newsize,int st,int en,int *pos) | 174 int lo = 0; |
| 157 { | 175 int hi = oldsize; |
| 158 int x,y; | 176 while (hi - lo >= 2) { |
| 159 | 177 int mid = (lo + hi) / 2; |
| 160 if(en-st<2) { | 178 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) { |
| 161 x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize); | 179 lo = mid; |
| 162 y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize); | |
| 163 | |
| 164 if(x>y) { | |
| 165 *pos=I[st]; | |
| 166 return x; | |
| 167 } else { | 180 } else { |
| 168 *pos=I[en]; | 181 hi = mid; |
| 169 return y; | |
| 170 } | 182 } |
| 171 } | 183 } |
| 172 | 184 |
| 173 x=st+(en-st)/2; | 185 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize); |
| 174 if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) { | 186 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize); |
| 175 return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos); | 187 if(x > y) { |
| 176 } else { | 188 *pos = I[lo]; |
| 177 return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos); | 189 return x; |
| 178 } | 190 } |
| 191 *pos = I[hi]; |
| 192 return y; |
| 179 } | 193 } |
| 180 | 194 |
| 181 // End of 'verbatim' code. | 195 // End of 'verbatim' code. |
| 182 // ------------------------------------------------------------------------ | 196 // ------------------------------------------------------------------------ |
| 183 | 197 |
| 184 } // namespace qsuf | 198 } // namespace qsuf |
| 185 } // namespace courgette | 199 } // namespace courgette |
| OLD | NEW |