courgette/third_party/bsdiff_create.cc - Issue 1961963003: Move //courgette/third_party to subfolder.

Side by Side Diff: courgette/third_party/bsdiff_create.cc

Issue 1961963003: Move //courgette/third_party to subfolder. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fixes according to comments Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 bsdiff.c -- Binary patch generator.

3

4 Copyright 2003 Colin Percival

5

6 For the terms under which this work may be distributed, please see

7 the adjoining file "LICENSE".

8

9 ChangeLog:

10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit

11 values throughout.

12 --Benjamin Smedberg <benjamin@smedbergs.us>

13 2005-05-18 - Use the same CRC algorithm as bzip2, and leverage the CRC table

14 provided by libbz2.

15 --Darin Fisher <darin@meer.net>

16 2007-11-14 - Changed to use Crc from Lzma library instead of Bzip library

17 --Rahul Kuchhal

18 2009-03-31 - Change to use Streams. Added lots of comments.

19 --Stephen Adams <sra@chromium.org>

20 2010-05-26 - Use a paged array for V and I. The address space may be too

21 fragmented for these big arrays to be contiguous.

22 --Stephen Adams <sra@chromium.org>

23 2015-08-03 - Extract qsufsort portion to a separate file.

24 --Samuel Huang <huangs@chromium.org>

25 2015-08-12 - Interface change to qsufsort search().

26 --Samuel Huang <huangs@chromium.org>

27 */

28

29 #include "courgette/third_party/bsdiff.h"

30

31 #include <stddef.h>

32 #include <stdint.h>

33 #include <stdlib.h>

34 #include <algorithm>

35

36 #include "base/logging.h"

37 #include "base/strings/string_util.h"

38 #include "base/time/time.h"

39

40 #include "courgette/crc.h"

41 #include "courgette/streams.h"

42 #include "courgette/third_party/paged_array.h"

43 #include "courgette/third_party/qsufsort.h"

44

45 namespace courgette {

46

47 static CheckBool WriteHeader(SinkStream* stream, MBSPatchHeader* header) {

48 bool ok = stream->Write(header->tag, sizeof(header->tag));

49 ok &= stream->WriteVarint32(header->slen);

50 ok &= stream->WriteVarint32(header->scrc32);

51 ok &= stream->WriteVarint32(header->dlen);

52 return ok;

53 }

54

55 BSDiffStatus CreateBinaryPatch(SourceStream* old_stream,

56 SourceStream* new_stream,

57 SinkStream* patch_stream)

58 {

59 base::Time start_bsdiff_time = base::Time::Now();

60 VLOG(1) << "Start bsdiff";

61 size_t initial_patch_stream_length = patch_stream->Length();

62

63 SinkStreamSet patch_streams;

64 SinkStream* control_stream_copy_counts = patch_streams.stream(0);

65 SinkStream* control_stream_extra_counts = patch_streams.stream(1);

66 SinkStream* control_stream_seeks = patch_streams.stream(2);

67 SinkStream* diff_skips = patch_streams.stream(3);

68 SinkStream* diff_bytes = patch_streams.stream(4);

69 SinkStream* extra_bytes = patch_streams.stream(5);

70

71 const uint8_t* old = old_stream->Buffer();

72 const int oldsize = static_cast<int>(old_stream->Remaining());

73

74 uint32_t pending_diff_zeros = 0;

75

76 PagedArray<int> I;

77 PagedArray<int> V;

78

79 if (!I.Allocate(oldsize + 1)) {

80 LOG(ERROR) << "Could not allocate I[], " << ((oldsize + 1) * sizeof(int))

81 << " bytes";

82 return MEM_ERROR;

83 }

84

85 if (!V.Allocate(oldsize + 1)) {

86 LOG(ERROR) << "Could not allocate V[], " << ((oldsize + 1) * sizeof(int))

87 << " bytes";

88 return MEM_ERROR;

89 }

90

91 base::Time q_start_time = base::Time::Now();

92 qsuf::qsufsort<PagedArray<int>&>(I, V, old, oldsize);

93 VLOG(1) << " done qsufsort "

94 << (base::Time::Now() - q_start_time).InSecondsF();

95 V.clear();

96

97 const uint8_t* newbuf = new_stream->Buffer();

98 const int newsize = static_cast<int>(new_stream->Remaining());

99

100 int control_length = 0;

101 int diff_bytes_length = 0;

102 int diff_bytes_nonzero = 0;

103 int extra_bytes_length = 0;

104

105 // The patch format is a sequence of triples <copy,extra,seek> where 'copy' is

106 // the number of bytes to copy from the old file (possibly with mistakes),

107 // 'extra' is the number of bytes to copy from a stream of fresh bytes, and

108 // 'seek' is an offset to move to the position to copy for the next triple.

109 //

110 // The invariant at the top of this loop is that we are committed to emitting

111 // a triple for the part of \|newbuf\| surrounding a 'seed' match near

112 // \|lastscan\|. We are searching for a second match that will be the 'seed' of

113 // the next triple. As we scan through \|newbuf\|, one of four things can

114 // happen at the current position \|scan\|:

115 //

116 // 1. We find a nice match that appears to be consistent with the current

117 // seed. Continue scanning. It is likely that this match will become

118 // part of the 'copy'.

119 //

120 // 2. We find match which does much better than extending the current seed

121 // old match. Emit a triple for the current seed and take this match as

122 // the new seed for a new triple. By 'much better' we remove 8 mismatched

123 // bytes by taking the new seed.

124 //

125 // 3. There is not a good match. Continue scanning. These bytes will likely

126 // become part of the 'extra'.

127 //

128 // 4. There is no match because we reached the end of the input, \|newbuf\|.

129

130 // This is how the loop advances through the bytes of \|newbuf\|:

131 //

132 // ...012345678901234567890123456789...

133 // ssssssssss Seed at \|lastscan\|

134 // xxyyyxxyyxy \|scan\| forward, cases (3)(x) & (1)(y)

135 // mmmmmmmm New match will start new seed case (2).

136 // fffffffffffffff \|lenf\| = scan forward from \|lastscan\|

137 // bbbb \|lenb\| = scan back from new seed \|scan\|.

138 // ddddddddddddddd Emit diff bytes for the 'copy'.

139 // xx Emit extra bytes.

140 // ssssssssssss \|lastscan = scan - lenb\| is new seed.

141 // x Cases (1) and (3) ....

142

143

144 int lastscan = 0, lastpos = 0, lastoffset = 0;

145

146 int scan = 0;

147 int match_length = 0;

148

149 while (scan < newsize) {

150 int pos = 0;

151 int oldscore = 0; // Count of how many bytes of the current match at \|scan\|

152 // extend the match at \|lastscan\|.

153

154 scan += match_length;

155 for (int scsc = scan; scan < newsize; ++scan) {

156 match_length = qsuf::search<PagedArray<int>&>(

157 I, old, oldsize, newbuf + scan, newsize - scan, &pos);

158

159 for ( ; scsc < scan + match_length ; scsc++)

160 if ((scsc + lastoffset < oldsize) &&

161 (old[scsc + lastoffset] == newbuf[scsc]))

162 oldscore++;

163

164 if ((match_length == oldscore) && (match_length != 0))

165 break; // Good continuing match, case (1)

166 if (match_length > oldscore + 8)

167 break; // New seed match, case (2)

168

169 if ((scan + lastoffset < oldsize) &&

170 (old[scan + lastoffset] == newbuf[scan]))

171 oldscore--;

172 // Case (3) continues in this loop until we fall out of the loop (4).

173 }

174

175 if ((match_length != oldscore) \|\| (scan == newsize)) { // Cases (2) and (4)

176 // This next chunk of code finds the boundary between the bytes to be

177 // copied as part of the current triple, and the bytes to be copied as

178 // part of the next triple. The \|lastscan\| match is extended forwards as

179 // far as possible provided doing to does not add too many mistakes. The

180 // \|scan\| match is extended backwards in a similar way.

181

182 // Extend the current match (if any) backwards. \|lenb\| is the maximal

183 // extension for which less than half the byte positions in the extension

184 // are wrong.

185 int lenb = 0;

186 if (scan < newsize) { // i.e. not case (4); there is a match to extend.

187 int score = 0, Sb = 0;

188 for (int i = 1; (scan >= lastscan + i) && (pos >= i); i++) {

189 if (old[pos - i] == newbuf[scan - i]) score++;

190 if (score2 - i > Sb2 - lenb) { Sb = score; lenb = i; }

191 }

192 }

193

194 // Extend the lastscan match forward; \|lenf\| is the maximal extension for

195 // which less than half of the byte positions in entire lastscan match are

196 // wrong. There is a subtle point here: \|lastscan\| points to before the

197 // seed match by \|lenb\| bytes from the previous iteration. This is why

198 // the loop measures the total number of mistakes in the the match, not

199 // just the from the match.

200 int lenf = 0;

201 {

202 int score = 0, Sf = 0;

203 for (int i = 0; (lastscan + i < scan) && (lastpos + i < oldsize); ) {

204 if (old[lastpos + i] == newbuf[lastscan + i]) score++;

205 i++;

206 if (score2 - i > Sf2 - lenf) { Sf = score; lenf = i; }

207 }

208 }

209

210 // If the extended scans overlap, pick a position in the overlap region

211 // that maximizes the exact matching bytes.

212 if (lastscan + lenf > scan - lenb) {

213 int overlap = (lastscan + lenf) - (scan - lenb);

214 int score = 0;

215 int Ss = 0, lens = 0;

216 for (int i = 0; i < overlap; i++) {

217 if (newbuf[lastscan + lenf - overlap + i] ==

218 old[lastpos + lenf - overlap + i]) score++;

219 if (newbuf[scan - lenb + i] == old[pos - lenb + i]) score--;

220 if (score > Ss) { Ss = score; lens = i + 1; }

221 }

222

223 lenf += lens - overlap;

224 lenb -= lens;

225 };

226

227 for (int i = 0; i < lenf; i++) {

228 uint8_t diff_byte = newbuf[lastscan + i] - old[lastpos + i];

229 if (diff_byte) {

230 ++diff_bytes_nonzero;

231 if (!diff_skips->WriteVarint32(pending_diff_zeros))

232 return MEM_ERROR;

233 pending_diff_zeros = 0;

234 if (!diff_bytes->Write(&diff_byte, 1))

235 return MEM_ERROR;

236 } else {

237 ++pending_diff_zeros;

238 }

239 }

240 int gap = (scan - lenb) - (lastscan + lenf);

241 for (int i = 0; i < gap; i++) {

242 if (!extra_bytes->Write(&newbuf[lastscan + lenf + i], 1))

243 return MEM_ERROR;

244 }

245

246 diff_bytes_length += lenf;

247 extra_bytes_length += gap;

248

249 uint32_t copy_count = lenf;

250 uint32_t extra_count = gap;

251 int32_t seek_adjustment = ((pos - lenb) - (lastpos + lenf));

252

253 if (!control_stream_copy_counts->WriteVarint32(copy_count) \|\|

254 !control_stream_extra_counts->WriteVarint32(extra_count) \|\|

255 !control_stream_seeks->WriteVarint32Signed(seek_adjustment)) {

256 return MEM_ERROR;

257 }

258

259 ++control_length;

260 #ifdef DEBUG_bsmedberg

261 VLOG(1) << StringPrintf("Writing a block: copy: %-8u extra: %-8u seek: "

262 "%+-9d", copy_count, extra_count,

263 seek_adjustment);

264 #endif

265

266 lastscan = scan - lenb; // Include the backward extension in seed.

267 lastpos = pos - lenb; // ditto.

268 lastoffset = lastpos - lastscan;

269 }

270 }

271

272 if (!diff_skips->WriteVarint32(pending_diff_zeros))

273 return MEM_ERROR;

274

275 I.clear();

276

277 MBSPatchHeader header;

278 // The string will have a null terminator that we don't use, hence '-1'.

279 static_assert(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header.tag),

280 "MBS_PATCH_HEADER_TAG must match header field size");

281 memcpy(header.tag, MBS_PATCH_HEADER_TAG, sizeof(header.tag));

282 header.slen = oldsize;

283 header.scrc32 = CalculateCrc(old, oldsize);

284 header.dlen = newsize;

285

286 if (!WriteHeader(patch_stream, &header))

287 return MEM_ERROR;

288

289 size_t diff_skips_length = diff_skips->Length();

290 if (!patch_streams.CopyTo(patch_stream))

291 return MEM_ERROR;

292

293 VLOG(1) << "Control tuples: " << control_length

294 << " copy bytes: " << diff_bytes_length

295 << " mistakes: " << diff_bytes_nonzero

296 << " (skips: " << diff_skips_length << ")"

297 << " extra bytes: " << extra_bytes_length

298 << "\nUncompressed bsdiff patch size "

299 << patch_stream->Length() - initial_patch_stream_length

300 << "\nEnd bsdiff "

301 << (base::Time::Now() - start_bsdiff_time).InSecondsF();

302

303 return OK;

304 }

305

306 } // namespace courgette

OLD	NEW

« no previous file with comments | « courgette/third_party/bsdiff_apply.cc ('k') | courgette/third_party/paged_array.h » ('j') | no next file with comments »