courgette/third_party/qsufsort.h - Issue 1272453003: [Courgette] Extract qsufsort module and add tests.

Side by Side Diff: courgette/third_party/qsufsort.h

Issue 1272453003: [Courgette] Extract qsufsort module and add tests. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Using int, but with explicit cast from ::strlen(). Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 bsdiff.c -- Binary patch generator.	2 qsufsort.h -- Suffix array generation.

3	3

4 Copyright 2003 Colin Percival	4 Copyright 2003 Colin Percival

5	5

6 For the terms under which this work may be distributed, please see	6 For the terms under which this work may be distributed, please see

7 the adjoining file "LICENSE".	7 the adjoining file "LICENSE".

8	8

9 ChangeLog:	9 ChangeLog:

10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit	10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit

11 values throughout.	11 values throughout.

12 --Benjamin Smedberg <benjamin@smedbergs.us>	12 --Benjamin Smedberg <benjamin@smedbergs.us>

13 2005-05-18 - Use the same CRC algorithm as bzip2, and leverage the CRC table

14 provided by libbz2.

15 --Darin Fisher <darin@meer.net>

16 2007-11-14 - Changed to use Crc from Lzma library instead of Bzip library

17 --Rahul Kuchhal

18 2009-03-31 - Change to use Streams. Added lots of comments.

19 --Stephen Adams <sra@chromium.org>

20 2010-05-26 - Use a paged array for V and I. The address space may be too	13 2010-05-26 - Use a paged array for V and I. The address space may be too

21 fragmented for these big arrays to be contiguous.	14 fragmented for these big arrays to be contiguous.

22 --Stephen Adams <sra@chromium.org>	15 --Stephen Adams <sra@chromium.org>

	16 2015-08-03 - Extrat qsufsort to a separate file as template.

	17 --Samuel Huang <huangs@chromium.org>

23 */	18 */

24	19

25 #include "courgette/third_party/bsdiff.h"

26

27 #include <stdlib.h>

28 #include <algorithm>	20 #include <algorithm>

29	21 #include <cstring>

30 #include "base/logging.h"

31 #include "base/memory/scoped_ptr.h"

32 #include "base/strings/string_util.h"

33 #include "base/time/time.h"

34

35 #include "courgette/crc.h"

36 #include "courgette/streams.h"

37 #include "courgette/third_party/paged_array.h"

38	22

39 namespace courgette {	23 namespace courgette {

	24 namespace qsuf {

40	25

41 // ------------------------------------------------------------------------	26 // ------------------------------------------------------------------------

42 //	27 //

43 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the	28 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the

44 // code formatting and variable names. The changes from the original are (1)	29 // code formatting and variable names. The changes from the original are:

45 // replacing tabs with spaces, (2) indentation, (3) using 'const', and (4)	30 // (1) replacing tabs with spaces,

46 // changing the V and I parameters from int* to PagedArray<int>&.	31 // (2) indentation,

	32 // (3) using 'const',

	33 // (4) changing the V and I parameters from int* to template <typename T>.

47 //	34 //

48 // The code appears to be a rewritten version of the suffix array algorithm	35 // The code appears to be a rewritten version of the suffix array algorithm

49 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko	36 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko

50 // Sadakane, special cased for bytes.	37 // Sadakane, special cased for bytes.

51	38

	39 template <typename T>

52 static void	40 static void

53 split(PagedArray<int>& I,PagedArray<int>& V,int start,int len,int h)	41 split(T I,T V,int start,int len,int h)

54 {	42 {

55 int i,j,k,x,tmp,jj,kk;	43 int i,j,k,x,tmp,jj,kk;

56	44

57 if(len<16) {	45 if(len<16) {

58 for(k=start;k<start+len;k+=j) {	46 for(k=start;k<start+len;k+=j) {

59 j=1;x=V[I[k]+h];	47 j=1;x=V[I[k]+h];

60 for(i=1;k+i<start+len;i++) {	48 for(i=1;k+i<start+len;i++) {

61 if(V[I[k+i]+h]<x) {	49 if(V[I[k+i]+h]<x) {

62 x=V[I[k+i]+h];	50 x=V[I[k+i]+h];

63 j=0;	51 j=0;

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
96	84

97 while(jj+j<kk) {	85 while(jj+j<kk) {

98 if(V[I[jj+j]+h]==x) {	86 if(V[I[jj+j]+h]==x) {

99 j++;	87 j++;

100 } else {	88 } else {

101 tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;	89 tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;

102 k++;	90 k++;

103 };	91 };

104 };	92 };

105	93

106 if(jj>start) split(I,V,start,jj-start,h);	94 if(jj>start) split<T>(I,V,start,jj-start,h);

107	95

108 for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;	96 for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;

109 if(jj==kk-1) I[jj]=-1;	97 if(jj==kk-1) I[jj]=-1;

110	98

111 if(start+len>kk) split(I,V,kk,start+len-kk,h);	99 if(start+len>kk) split<T>(I,V,kk,start+len-kk,h);

112 }	100 }

113	101

	102 template <class T>

114 static void	103 static void

115 qsufsort(PagedArray<int>& I, PagedArray<int>& V,const unsigned char *old,int old size)	104 qsufsort(T I, T V,const unsigned char *old,int oldsize)

116 {	105 {

117 int buckets[256];	106 int buckets[256];

118 int i,h,len;	107 int i,h,len;

119	108

120 for(i=0;i<256;i++) buckets[i]=0;	109 for(i=0;i<256;i++) buckets[i]=0;

121 for(i=0;i<oldsize;i++) buckets[old[i]]++;	110 for(i=0;i<oldsize;i++) buckets[old[i]]++;

122 for(i=1;i<256;i++) buckets[i]+=buckets[i-1];	111 for(i=1;i<256;i++) buckets[i]+=buckets[i-1];

123 for(i=255;i>0;i--) buckets[i]=buckets[i-1];	112 for(i=255;i>0;i--) buckets[i]=buckets[i-1];

124 buckets[0]=0;	113 buckets[0]=0;

125	114

126 for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;	115 for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;

127 I[0]=oldsize;	116 I[0]=oldsize;

128 for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];	117 for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];

129 V[oldsize]=0;	118 V[oldsize]=0;

130 for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;	119 for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;

131 I[0]=-1;	120 I[0]=-1;

132	121

133 for(h=1;I[0]!=-(oldsize+1);h+=h) {	122 for(h=1;I[0]!=-(oldsize+1);h+=h) {

134 len=0;	123 len=0;

135 for(i=0;i<oldsize+1;) {	124 for(i=0;i<oldsize+1;) {

136 if(I[i]<0) {	125 if(I[i]<0) {

137 len-=I[i];	126 len-=I[i];

138 i-=I[i];	127 i-=I[i];

139 } else {	128 } else {

140 if(len) I[i-len]=-len;	129 if(len) I[i-len]=-len;

141 len=V[I[i]]+1-i;	130 len=V[I[i]]+1-i;

142 split(I,V,i,len,h);	131 split<T>(I,V,i,len,h);

143 i+=len;	132 i+=len;

144 len=0;	133 len=0;

145 };	134 };

146 };	135 };

147 if(len) I[i-len]=-len;	136 if(len) I[i-len]=-len;

148 };	137 };

149	138

150 for(i=0;i<oldsize+1;i++) I[V[i]]=i;	139 for(i=0;i<oldsize+1;i++) I[V[i]]=i;

151 }	140 }

152	141

153 static int	142 static int

154 matchlen(const unsigned char old,int oldsize,const unsigned char newbuf,int ne wsize)	143 matchlen(const unsigned char old,int oldsize,const unsigned char newbuf,int ne wsize)

155 {	144 {

156 int i;	145 int i;

157	146

158 for(i=0;(i<oldsize)&&(i<newsize);i++)	147 for(i=0;(i<oldsize)&&(i<newsize);i++)

159 if(old[i]!=newbuf[i]) break;	148 if(old[i]!=newbuf[i]) break;

160	149

161 return i;	150 return i;

162 }	151 }

163	152

	153 template <class T>

164 static int	154 static int

165 search(PagedArray<int>& I,const unsigned char *old,int oldsize,	155 search(T I,const unsigned char *old,int oldsize,

166 const unsigned char newbuf,int newsize,int st,int en,int pos)	156 const unsigned char newbuf,int newsize,int st,int en,int pos)

167 {	157 {

168 int x,y;	158 int x,y;

169	159

170 if(en-st<2) {	160 if(en-st<2) {

171 x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);	161 x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize);

172 y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);	162 y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize);

173	163

174 if(x>y) {	164 if(x>y) {

175 *pos=I[st];	165 *pos=I[st];

176 return x;	166 return x;

177 } else {	167 } else {

178 *pos=I[en];	168 *pos=I[en];

179 return y;	169 return y;

180 }	170 }

181 }	171 }

182	172

183 x=st+(en-st)/2;	173 x=st+(en-st)/2;

184 if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {	174 if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) {

185 return search(I,old,oldsize,newbuf,newsize,x,en,pos);	175 return search<T>(I,old,oldsize,newbuf,newsize,x,en,pos);

186 } else {	176 } else {

187 return search(I,old,oldsize,newbuf,newsize,st,x,pos);	177 return search<T>(I,old,oldsize,newbuf,newsize,st,x,pos);

188 }	178 }

189 }	179 }

190	180

191 // End of 'verbatim' code.	181 // End of 'verbatim' code.

192 // ------------------------------------------------------------------------	182 // ------------------------------------------------------------------------

193	183

194 static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {	184 } // namespace qsuf

195 bool ok = stream->Write(header->tag, sizeof(header->tag));	185 } // namespace courgette

196 ok &= stream->WriteVarint32(header->slen);

197 ok &= stream->WriteVarint32(header->scrc32);

198 ok &= stream->WriteVarint32(header->dlen);

199 return ok;

200 }

201

202 BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,

203 SourceStream* new_stream,

204 SinkStream* patch_stream)

205 {

206 base::Time start_bsdiff_time = base::Time::Now();

207 VLOG(1) << "Start bsdiff";

208 size_t initial_patch_stream_length = patch_stream->Length();

209

210 SinkStreamSet patch_streams;

211 SinkStream* control_stream_copy_counts = patch_streams.stream(0);

212 SinkStream* control_stream_extra_counts = patch_streams.stream(1);

213 SinkStream* control_stream_seeks = patch_streams.stream(2);

214 SinkStream* diff_skips = patch_streams.stream(3);

215 SinkStream* diff_bytes = patch_streams.stream(4);

216 SinkStream* extra_bytes = patch_streams.stream(5);

217

218 const uint8* old = old_stream->Buffer();

219 const int oldsize = static_cast<int>(old_stream->Remaining());

220

221 uint32 pending_diff_zeros = 0;

222

223 PagedArray<int> I;

224 PagedArray<int> V;

225

226 if (!I.Allocate(oldsize + 1)) {

227 LOG(ERROR) << "Could not allocate I[], " << ((oldsize + 1) * sizeof(int))

228 << " bytes";

229 return MEM_ERROR;

230 }

231

232 if (!V.Allocate(oldsize + 1)) {

233 LOG(ERROR) << "Could not allocate V[], " << ((oldsize + 1) * sizeof(int))

234 << " bytes";

235 return MEM_ERROR;

236 }

237

238 base::Time q_start_time = base::Time::Now();

239 qsufsort(I, V, old, oldsize);

240 VLOG(1) << " done qsufsort "

241 << (base::Time::Now() - q_start_time).InSecondsF();

242 V.clear();

243

244 const uint8* newbuf = new_stream->Buffer();

245 const int newsize = static_cast<int>(new_stream->Remaining());

246

247 int control_length = 0;

248 int diff_bytes_length = 0;

249 int diff_bytes_nonzero = 0;

250 int extra_bytes_length = 0;

251

252 // The patch format is a sequence of triples <copy,extra,seek> where 'copy' is

253 // the number of bytes to copy from the old file (possibly with mistakes),

254 // 'extra' is the number of bytes to copy from a stream of fresh bytes, and

255 // 'seek' is an offset to move to the position to copy for the next triple.

256 //

257 // The invariant at the top of this loop is that we are committed to emitting

258 // a triple for the part of \|newbuf\| surrounding a 'seed' match near

259 // \|lastscan\|. We are searching for a second match that will be the 'seed' of

260 // the next triple. As we scan through \|newbuf\|, one of four things can

261 // happen at the current position \|scan\|:

262 //

263 // 1. We find a nice match that appears to be consistent with the current

264 // seed. Continue scanning. It is likely that this match will become

265 // part of the 'copy'.

266 //

267 // 2. We find match which does much better than extending the current seed

268 // old match. Emit a triple for the current seed and take this match as

269 // the new seed for a new triple. By 'much better' we remove 8 mismatched

270 // bytes by taking the new seed.

271 //

272 // 3. There is not a good match. Continue scanning. These bytes will likely

273 // become part of the 'extra'.

274 //

275 // 4. There is no match because we reached the end of the input, \|newbuf\|.

276

277 // This is how the loop advances through the bytes of \|newbuf\|:

278 //

279 // ...012345678901234567890123456789...

280 // ssssssssss Seed at \|lastscan\|

281 // xxyyyxxyyxy \|scan\| forward, cases (3)(x) & (1)(y)

282 // mmmmmmmm New match will start new seed case (2).

283 // fffffffffffffff \|lenf\| = scan forward from \|lastscan\|

284 // bbbb \|lenb\| = scan back from new seed \|scan\|.

285 // ddddddddddddddd Emit diff bytes for the 'copy'.

286 // xx Emit extra bytes.

287 // ssssssssssss \|lastscan = scan - lenb\| is new seed.

288 // x Cases (1) and (3) ....

289

290

291 int lastscan = 0, lastpos = 0, lastoffset = 0;

292

293 int scan = 0;

294 int match_length = 0;

295

296 while (scan < newsize) {

297 int pos = 0;

298 int oldscore = 0; // Count of how many bytes of the current match at \|scan\|

299 // extend the match at \|lastscan\|.

300

301 scan += match_length;

302 for (int scsc = scan; scan < newsize; ++scan) {

303 match_length = search(I, old, oldsize,

304 newbuf + scan, newsize - scan,

305 0, oldsize, &pos);

306

307 for ( ; scsc < scan + match_length ; scsc++)

308 if ((scsc + lastoffset < oldsize) &&

309 (old[scsc + lastoffset] == newbuf[scsc]))

310 oldscore++;

311

312 if ((match_length == oldscore) && (match_length != 0))

313 break; // Good continuing match, case (1)

314 if (match_length > oldscore + 8)

315 break; // New seed match, case (2)

316

317 if ((scan + lastoffset < oldsize) &&

318 (old[scan + lastoffset] == newbuf[scan]))

319 oldscore--;

320 // Case (3) continues in this loop until we fall out of the loop (4).

321 }

322

323 if ((match_length != oldscore) \|\| (scan == newsize)) { // Cases (2) and (4)

324 // This next chunk of code finds the boundary between the bytes to be

325 // copied as part of the current triple, and the bytes to be copied as

326 // part of the next triple. The \|lastscan\| match is extended forwards as

327 // far as possible provided doing to does not add too many mistakes. The

328 // \|scan\| match is extended backwards in a similar way.

329

330 // Extend the current match (if any) backwards. \|lenb\| is the maximal

331 // extension for which less than half the byte positions in the extension

332 // are wrong.

333 int lenb = 0;

334 if (scan < newsize) { // i.e. not case (4); there is a match to extend.

335 int score = 0, Sb = 0;

336 for (int i = 1; (scan >= lastscan + i) && (pos >= i); i++) {

337 if (old[pos - i] == newbuf[scan - i]) score++;

338 if (score2 - i > Sb2 - lenb) { Sb = score; lenb = i; }

339 }

340 }

341

342 // Extend the lastscan match forward; \|lenf\| is the maximal extension for

343 // which less than half of the byte positions in entire lastscan match are

344 // wrong. There is a subtle point here: \|lastscan\| points to before the

345 // seed match by \|lenb\| bytes from the previous iteration. This is why

346 // the loop measures the total number of mistakes in the the match, not

347 // just the from the match.

348 int lenf = 0;

349 {

350 int score = 0, Sf = 0;

351 for (int i = 0; (lastscan + i < scan) && (lastpos + i < oldsize); ) {

352 if (old[lastpos + i] == newbuf[lastscan + i]) score++;

353 i++;

354 if (score2 - i > Sf2 - lenf) { Sf = score; lenf = i; }

355 }

356 }

357

358 // If the extended scans overlap, pick a position in the overlap region

359 // that maximizes the exact matching bytes.

360 if (lastscan + lenf > scan - lenb) {

361 int overlap = (lastscan + lenf) - (scan - lenb);

362 int score = 0;

363 int Ss = 0, lens = 0;

364 for (int i = 0; i < overlap; i++) {

365 if (newbuf[lastscan + lenf - overlap + i] ==

366 old[lastpos + lenf - overlap + i]) score++;

367 if (newbuf[scan - lenb + i] == old[pos - lenb + i]) score--;

368 if (score > Ss) { Ss = score; lens = i + 1; }

369 }

370

371 lenf += lens - overlap;

372 lenb -= lens;

373 };

374

375 for (int i = 0; i < lenf; i++) {

376 uint8 diff_byte = newbuf[lastscan + i] - old[lastpos + i];

377 if (diff_byte) {

378 ++diff_bytes_nonzero;

379 if (!diff_skips->WriteVarint32(pending_diff_zeros))

380 return MEM_ERROR;

381 pending_diff_zeros = 0;

382 if (!diff_bytes->Write(&diff_byte, 1))

383 return MEM_ERROR;

384 } else {

385 ++pending_diff_zeros;

386 }

387 }

388 int gap = (scan - lenb) - (lastscan + lenf);

389 for (int i = 0; i < gap; i++) {

390 if (!extra_bytes->Write(&newbuf[lastscan + lenf + i], 1))

391 return MEM_ERROR;

392 }

393

394 diff_bytes_length += lenf;

395 extra_bytes_length += gap;

396

397 uint32 copy_count = lenf;

398 uint32 extra_count = gap;

399 int32 seek_adjustment = ((pos - lenb) - (lastpos + lenf));

400

401 if (!control_stream_copy_counts->WriteVarint32(copy_count) \|\|

402 !control_stream_extra_counts->WriteVarint32(extra_count) \|\|

403 !control_stream_seeks->WriteVarint32Signed(seek_adjustment)) {

404 return MEM_ERROR;

405 }

406

407 ++control_length;

408 #ifdef DEBUG_bsmedberg

409 VLOG(1) << StringPrintf("Writing a block: copy: %-8u extra: %-8u seek: "

410 "%+-9d", copy_count, extra_count,

411 seek_adjustment);

412 #endif

413

414 lastscan = scan - lenb; // Include the backward extension in seed.

415 lastpos = pos - lenb; // ditto.

416 lastoffset = lastpos - lastscan;

417 }

418 }

419

420 if (!diff_skips->WriteVarint32(pending_diff_zeros))

421 return MEM_ERROR;

422

423 I.clear();

424

425 MBSPatchHeader header;

426 // The string will have a null terminator that we don't use, hence '-1'.

427 static_assert(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header.tag),

428 "MBS_PATCH_HEADER_TAG must match header field size");

429 memcpy(header.tag, MBS_PATCH_HEADER_TAG, sizeof(header.tag));

430 header.slen = oldsize;

431 header.scrc32 = CalculateCrc(old, oldsize);

432 header.dlen = newsize;

433

434 if (!WriteHeader(patch_stream, &header))

435 return MEM_ERROR;

436

437 size_t diff_skips_length = diff_skips->Length();

438 if (!patch_streams.CopyTo(patch_stream))

439 return MEM_ERROR;

440

441 VLOG(1) << "Control tuples: " << control_length

442 << " copy bytes: " << diff_bytes_length

443 << " mistakes: " << diff_bytes_nonzero

444 << " (skips: " << diff_skips_length << ")"

445 << " extra bytes: " << extra_bytes_length

446 << "\nUncompressed bsdiff patch size "

447 << patch_stream->Length() - initial_patch_stream_length

448 << "\nEnd bsdiff "

449 << (base::Time::Now() - start_bsdiff_time).InSecondsF();

450

451 return OK;

452 }

453

454 } // namespace

OLD	NEW

« no previous file with comments | « courgette/third_party/bsdiff_create.cc ('k') | courgette/third_party/qsufsort_unittest.cc » ('j') | no next file with comments »