Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(222)

Side by Side Diff: courgette/third_party/qsufsort.h

Issue 1288703002: [Courgette] Optimize QSufSort to speed up courgette-gen. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix comments and spacing. Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « courgette/third_party/bsdiff_create.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 qsufsort.h -- Suffix array generation. 2 qsufsort.h -- Suffix array generation.
3 3
4 Copyright 2003 Colin Percival 4 Copyright 2003 Colin Percival
5 5
6 For the terms under which this work may be distributed, please see 6 For the terms under which this work may be distributed, please see
7 the adjoining file "LICENSE". 7 the adjoining file "LICENSE".
8 8
9 ChangeLog: 9 ChangeLog:
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit 10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
11 values throughout. 11 values throughout.
12 --Benjamin Smedberg <benjamin@smedbergs.us> 12 --Benjamin Smedberg <benjamin@smedbergs.us>
13 2010-05-26 - Use a paged array for V and I. The address space may be too 13 2010-05-26 - Use a paged array for V and I. The address space may be too
14 fragmented for these big arrays to be contiguous. 14 fragmented for these big arrays to be contiguous.
15 --Stephen Adams <sra@chromium.org> 15 --Stephen Adams <sra@chromium.org>
16 2015-08-03 - Extrat qsufsort to a separate file as template. 16 2015-08-03 - Extrat qsufsort to a separate file as template.
17 --Samuel Huang <huangs@chromium.org> 17 --Samuel Huang <huangs@chromium.org>
18 2015-08-19 - Optimize split() and search(), add comments.
19 --Samuel Huang <huangs@chromium.org>
18 */ 20 */
19 21
20 #include <algorithm> 22 #include <algorithm>
21 #include <cstring> 23 #include <cstring>
22 24
23 namespace courgette { 25 namespace courgette {
24 namespace qsuf { 26 namespace qsuf {
25 27
26 // ------------------------------------------------------------------------ 28 // ------------------------------------------------------------------------
27 // 29 //
28 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the 30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the
29 // code formatting and variable names. The changes from the original are: 31 // code formatting and variable names. The changes from the original are:
30 // (1) replacing tabs with spaces, 32 // (1) replacing tabs with spaces,
31 // (2) indentation, 33 // (2) indentation,
32 // (3) using 'const', 34 // (3) using 'const',
33 // (4) changing the V and I parameters from int* to template <typename T>. 35 // (4) changing the V and I parameters from int* to template <typename T>.
36 // (5) optimizing split() and search(); fix styles.
34 // 37 //
35 // The code appears to be a rewritten version of the suffix array algorithm 38 // The code appears to be a rewritten version of the suffix array algorithm
36 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko 39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
37 // Sadakane, special cased for bytes. 40 // Sadakane, special cased for bytes.
38 41
39 template <typename T> 42 template <typename T> static void split(T I, T V, int start, int end, int h) {
40 static void 43 // For small interval, apply selection sort.
41 split(T I,T V,int start,int len,int h) 44 if (end - start < 6) {
42 { 45 for (int i = start; i < end; ) {
43 int i,j,k,x,tmp,jj,kk; 46 int skip = 1;
47 int best = V[I[i] + h];
48 for (int j = i + 1; j < end; j++) {
49 int cur = V[I[j] + h];
50 if (best > cur) {
51 best = cur;
52 int tmp = I[i];
53 I[i] = I[j];
54 I[j] = tmp;
55 skip = 1;
56 } else if (best == cur) {
57 int tmp = I[i + skip];
58 I[i + skip] = I[j];
59 I[j] = tmp;
60 ++skip;
61 }
62 }
63 if (skip == 1) {
64 V[I[i]] = i;
65 I[i] = -1;
66 } else {
67 for (int j = i, jend = i + skip; j < jend; j++)
68 V[I[j]] = jend - 1;
69 }
70 i += skip;
71 }
72 return;
73 }
44 74
45 if(len<16) { 75 // Split [start, end) into 3 intervals:
46 for(k=start;k<start+len;k+=j) { 76 // [start, j) with secondary keys < pivot,
47 j=1;x=V[I[k]+h]; 77 // [j, k) with secondary keys == pivot,
48 for(i=1;k+i<start+len;i++) { 78 // [k, end) with secondary keys > pivot.
49 if(V[I[k+i]+h]<x) { 79 int pivot = V[I[(start + end) / 2] + h];
50 x=V[I[k+i]+h]; 80 int j = start;
51 j=0; 81 int k = end;
52 }; 82 for (int i = start; i < k; ) {
53 if(V[I[k+i]+h]==x) { 83 int cur = V[I[i] + h];
54 tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp; 84 if (cur < pivot) {
55 j++; 85 if (i != j) {
56 }; 86 int tmp = I[i];
57 }; 87 I[i] = I[j];
58 for(i=0;i<j;i++) V[I[k+i]]=k+j-1; 88 I[j] = tmp;
59 if(j==1) I[k]=-1; 89 }
60 }; 90 ++i;
61 return; 91 ++j;
62 }; 92 } else if (cur > pivot) {
93 --k;
94 int tmp = I[i];
95 I[i] = I[k];
96 I[k] = tmp;
97 } else {
98 ++i;
99 }
100 }
63 101
64 x=V[I[start+len/2]+h]; 102 // Recurse on the "< pivot" piece.
65 jj=0;kk=0; 103 if (start < j)
66 for(i=start;i<start+len;i++) { 104 split<T>(I, V, start, j, h);
67 if(V[I[i]+h]<x) jj++;
68 if(V[I[i]+h]==x) kk++;
69 };
70 jj+=start;kk+=jj;
71 105
72 i=start;j=0;k=0; 106 // Update the "== pivot" piece.
73 while(i<jj) { 107 if (j == k - 1) {
74 if(V[I[i]+h]<x) { 108 V[I[j]] = j;
75 i++; 109 I[j] = -1;
76 } else if(V[I[i]+h]==x) { 110 } else {
77 tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp; 111 for (int i = j; i < k; ++i)
78 j++; 112 V[I[i]] = k - 1;
79 } else { 113 }
80 tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
81 k++;
82 };
83 };
84 114
85 while(jj+j<kk) { 115 // Recurse on the "> pivot" piece.
86 if(V[I[jj+j]+h]==x) { 116 if (k < end)
87 j++; 117 split<T>(I, V, k, end, h);
88 } else {
89 tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
90 k++;
91 };
92 };
93
94 if(jj>start) split<T>(I,V,start,jj-start,h);
95
96 for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;
97 if(jj==kk-1) I[jj]=-1;
98
99 if(start+len>kk) split<T>(I,V,kk,start+len-kk,h);
100 } 118 }
101 119
102 template <class T> 120 template <class T>
103 static void 121 static void
104 qsufsort(T I, T V,const unsigned char *old,int oldsize) 122 qsufsort(T I, T V,const unsigned char *old,int oldsize)
105 { 123 {
106 int buckets[256]; 124 int buckets[256];
107 int i,h,len; 125 int i,h,len;
108 126
109 for(i=0;i<256;i++) buckets[i]=0; 127 for(i=0;i<256;i++) buckets[i]=0;
(...skipping 11 matching lines...) Expand all
121 139
122 for(h=1;I[0]!=-(oldsize+1);h+=h) { 140 for(h=1;I[0]!=-(oldsize+1);h+=h) {
123 len=0; 141 len=0;
124 for(i=0;i<oldsize+1;) { 142 for(i=0;i<oldsize+1;) {
125 if(I[i]<0) { 143 if(I[i]<0) {
126 len-=I[i]; 144 len-=I[i];
127 i-=I[i]; 145 i-=I[i];
128 } else { 146 } else {
129 if(len) I[i-len]=-len; 147 if(len) I[i-len]=-len;
130 len=V[I[i]]+1-i; 148 len=V[I[i]]+1-i;
131 split<T>(I,V,i,len,h); 149 split<T>(I,V,i,i+len,h);
132 i+=len; 150 i+=len;
133 len=0; 151 len=0;
134 }; 152 };
135 }; 153 };
136 if(len) I[i-len]=-len; 154 if(len) I[i-len]=-len;
137 }; 155 };
138 156
139 for(i=0;i<oldsize+1;i++) I[V[i]]=i; 157 for(i=0;i<oldsize+1;i++) I[V[i]]=i;
140 } 158 }
141 159
142 static int 160 static int
143 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne wsize) 161 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int ne wsize)
144 { 162 {
145 int i; 163 int i;
146 164
147 for(i=0;(i<oldsize)&&(i<newsize);i++) 165 for(i=0;(i<oldsize)&&(i<newsize);i++)
148 if(old[i]!=newbuf[i]) break; 166 if(old[i]!=newbuf[i]) break;
149 167
150 return i; 168 return i;
151 } 169 }
152 170
153 template <class T> 171 template <class T>
154 static int 172 static int search(T I, const unsigned char *old, int oldsize,
155 search(T I,const unsigned char *old,int oldsize, 173 const unsigned char *newbuf, int newsize, int *pos) {
156 const unsigned char *newbuf,int newsize,int st,int en,int *pos) 174 int lo = 0;
157 { 175 int hi = oldsize;
158 int x,y; 176 while (hi - lo >= 2) {
159 177 int mid = (lo + hi) / 2;
160 if(en-st<2) { 178 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) {
161 x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize); 179 lo = mid;
162 y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);
163
164 if(x>y) {
165 *pos=I[st];
166 return x;
167 } else { 180 } else {
168 *pos=I[en]; 181 hi = mid;
169 return y;
170 } 182 }
171 } 183 }
172 184
173 x=st+(en-st)/2; 185 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize);
174 if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) { 186 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize);
175 return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos); 187 if(x > y) {
176 } else { 188 *pos = I[lo];
177 return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos); 189 return x;
178 } 190 }
191 *pos = I[hi];
192 return y;
179 } 193 }
180 194
181 // End of 'verbatim' code. 195 // End of 'verbatim' code.
182 // ------------------------------------------------------------------------ 196 // ------------------------------------------------------------------------
183 197
184 } // namespace qsuf 198 } // namespace qsuf
185 } // namespace courgette 199 } // namespace courgette
OLDNEW
« no previous file with comments | « courgette/third_party/bsdiff_create.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698