Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(672)

Side by Side Diff: third_party/zlib/simd.patch

Issue 2084863002: Update Zlib to version 1.2.8 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/zlib/mozzconf.h ('k') | third_party/zlib/trees.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 diff --git a/third_party/zlib/crc32.c b/third_party/zlib/crc32.c
2 index 979a719..09228ed 100644
3 --- a/third_party/zlib/crc32.c
4 +++ b/third_party/zlib/crc32.c
5 @@ -28,6 +28,8 @@
6 # endif /* !DYNAMIC_CRC_TABLE */
7 #endif /* MAKECRCH */
8
9 +#include "deflate.h"
10 +#include "x86.h"
11 #include "zutil.h" /* for STDC and FAR definitions */
12
13 #define local static
14 @@ -423,3 +425,28 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
15 {
16 return crc32_combine_(crc1, crc2, len2);
17 }
18 +
19 +ZLIB_INTERNAL void crc_reset(deflate_state *const s)
20 +{
21 + if (x86_cpu_enable_simd) {
22 + crc_fold_init(s);
23 + return;
24 + }
25 + s->strm->adler = crc32(0L, Z_NULL, 0);
26 +}
27 +
28 +ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
29 +{
30 + if (x86_cpu_enable_simd)
31 + s->strm->adler = crc_fold_512to32(s);
32 +}
33 +
34 +ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
35 +{
36 + if (x86_cpu_enable_simd) {
37 + crc_fold_copy(strm->state, dst, strm->next_in, size);
38 + return;
39 + }
40 + zmemcpy(dst, strm->next_in, size);
41 + strm->adler = crc32(strm->adler, dst, size);
42 +}
43 diff --git a/third_party/zlib/crc_folding.c b/third_party/zlib/crc_folding.c
44 new file mode 100644
45 index 0000000..48d7774
46 --- /dev/null
47 +++ b/third_party/zlib/crc_folding.c
48 @@ -0,0 +1,493 @@
49 +/*
50 + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
51 + * instruction.
52 + *
53 + * A white paper describing this algorithm can be found at:
54 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fas t-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
55 + *
56 + * Copyright (C) 2013 Intel Corporation. All rights reserved.
57 + * Authors:
58 + * Wajdi Feghali <wajdi.k.feghali@intel.com>
59 + * Jim Guilford <james.guilford@intel.com>
60 + * Vinodh Gopal <vinodh.gopal@intel.com>
61 + * Erdinc Ozturk <erdinc.ozturk@intel.com>
62 + * Jim Kukunas <james.t.kukunas@linux.intel.com>
63 + *
64 + * For conditions of distribution and use, see copyright notice in zlib.h
65 + */
66 +
67 +#include "deflate.h"
68 +
69 +#include <inttypes.h>
70 +#include <emmintrin.h>
71 +#include <immintrin.h>
72 +#include <wmmintrin.h>
73 +
74 +#define CRC_LOAD(s) \
75 + do { \
76 + __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
77 + __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
78 + __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
79 + __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
80 + __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
81 +
82 +#define CRC_SAVE(s) \
83 + _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
84 + _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
85 + _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
86 + _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
87 + _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
88 + } while (0);
89 +
90 +ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
91 +{
92 + CRC_LOAD(s)
93 +
94 + xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
95 + xmm_crc1 = _mm_setzero_si128();
96 + xmm_crc2 = _mm_setzero_si128();
97 + xmm_crc3 = _mm_setzero_si128();
98 +
99 + CRC_SAVE(s)
100 +
101 + s->strm->adler = 0;
102 +}
103 +
104 +local void fold_1(deflate_state *const s,
105 + __m128i *xmm_crc0, __m128i *xmm_crc1,
106 + __m128i *xmm_crc2, __m128i *xmm_crc3)
107 +{
108 + const __m128i xmm_fold4 = _mm_set_epi32(
109 + 0x00000001, 0x54442bd4,
110 + 0x00000001, 0xc6e41596);
111 +
112 + __m128i x_tmp3;
113 + __m128 ps_crc0, ps_crc3, ps_res;
114 +
115 + x_tmp3 = *xmm_crc3;
116 +
117 + *xmm_crc3 = *xmm_crc0;
118 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
119 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
120 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
121 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
122 + ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
123 +
124 + *xmm_crc0 = *xmm_crc1;
125 + *xmm_crc1 = *xmm_crc2;
126 + *xmm_crc2 = x_tmp3;
127 + *xmm_crc3 = _mm_castps_si128(ps_res);
128 +}
129 +
130 +local void fold_2(deflate_state *const s,
131 + __m128i *xmm_crc0, __m128i *xmm_crc1,
132 + __m128i *xmm_crc2, __m128i *xmm_crc3)
133 +{
134 + const __m128i xmm_fold4 = _mm_set_epi32(
135 + 0x00000001, 0x54442bd4,
136 + 0x00000001, 0xc6e41596);
137 +
138 + __m128i x_tmp3, x_tmp2;
139 + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
140 +
141 + x_tmp3 = *xmm_crc3;
142 + x_tmp2 = *xmm_crc2;
143 +
144 + *xmm_crc3 = *xmm_crc1;
145 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
146 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
147 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
148 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
149 + ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
150 +
151 + *xmm_crc2 = *xmm_crc0;
152 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
153 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
154 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
155 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
156 + ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
157 +
158 + *xmm_crc0 = x_tmp2;
159 + *xmm_crc1 = x_tmp3;
160 + *xmm_crc2 = _mm_castps_si128(ps_res20);
161 + *xmm_crc3 = _mm_castps_si128(ps_res31);
162 +}
163 +
164 +local void fold_3(deflate_state *const s,
165 + __m128i *xmm_crc0, __m128i *xmm_crc1,
166 + __m128i *xmm_crc2, __m128i *xmm_crc3)
167 +{
168 + const __m128i xmm_fold4 = _mm_set_epi32(
169 + 0x00000001, 0x54442bd4,
170 + 0x00000001, 0xc6e41596);
171 +
172 + __m128i x_tmp3;
173 + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
174 +
175 + x_tmp3 = *xmm_crc3;
176 +
177 + *xmm_crc3 = *xmm_crc2;
178 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
179 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
180 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
181 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
182 + ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
183 +
184 + *xmm_crc2 = *xmm_crc1;
185 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
186 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
187 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
188 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
189 + ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
190 +
191 + *xmm_crc1 = *xmm_crc0;
192 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
193 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
194 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
195 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
196 + ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
197 +
198 + *xmm_crc0 = x_tmp3;
199 + *xmm_crc1 = _mm_castps_si128(ps_res10);
200 + *xmm_crc2 = _mm_castps_si128(ps_res21);
201 + *xmm_crc3 = _mm_castps_si128(ps_res32);
202 +}
203 +
204 +local void fold_4(deflate_state *const s,
205 + __m128i *xmm_crc0, __m128i *xmm_crc1,
206 + __m128i *xmm_crc2, __m128i *xmm_crc3)
207 +{
208 + const __m128i xmm_fold4 = _mm_set_epi32(
209 + 0x00000001, 0x54442bd4,
210 + 0x00000001, 0xc6e41596);
211 +
212 + __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
213 + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
214 + __m128 ps_t0, ps_t1, ps_t2, ps_t3;
215 + __m128 ps_res0, ps_res1, ps_res2, ps_res3;
216 +
217 + x_tmp0 = *xmm_crc0;
218 + x_tmp1 = *xmm_crc1;
219 + x_tmp2 = *xmm_crc2;
220 + x_tmp3 = *xmm_crc3;
221 +
222 + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
223 + x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
224 + ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
225 + ps_t0 = _mm_castsi128_ps(x_tmp0);
226 + ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
227 +
228 + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
229 + x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
230 + ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
231 + ps_t1 = _mm_castsi128_ps(x_tmp1);
232 + ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
233 +
234 + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
235 + x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
236 + ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
237 + ps_t2 = _mm_castsi128_ps(x_tmp2);
238 + ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
239 +
240 + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
241 + x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
242 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
243 + ps_t3 = _mm_castsi128_ps(x_tmp3);
244 + ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
245 +
246 + *xmm_crc0 = _mm_castps_si128(ps_res0);
247 + *xmm_crc1 = _mm_castps_si128(ps_res1);
248 + *xmm_crc2 = _mm_castps_si128(ps_res2);
249 + *xmm_crc3 = _mm_castps_si128(ps_res3);
250 +}
251 +
252 +local const unsigned zalign(32) pshufb_shf_table[60] = {
253 + 0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
254 + 0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
255 + 0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
256 + 0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
257 + 0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
258 + 0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
259 + 0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
260 + 0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
261 + 0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
262 + 0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
263 + 0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
264 + 0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
265 + 0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
266 + 0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
267 + 0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
268 +};
269 +
270 +local void partial_fold(deflate_state *const s, const size_t len,
271 + __m128i *xmm_crc0, __m128i *xmm_crc1,
272 + __m128i *xmm_crc2, __m128i *xmm_crc3,
273 + __m128i *xmm_crc_part)
274 +{
275 +
276 + const __m128i xmm_fold4 = _mm_set_epi32(
277 + 0x00000001, 0x54442bd4,
278 + 0x00000001, 0xc6e41596);
279 + const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
280 +
281 + __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
282 + __m128i xmm_a0_0, xmm_a0_1;
283 + __m128 ps_crc3, psa0_0, psa0_1, ps_res;
284 +
285 + xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
286 + xmm_shr = xmm_shl;
287 + xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
288 +
289 + xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
290 +
291 + *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
292 + xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
293 + *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
294 +
295 + *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
296 + xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
297 + *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
298 +
299 + *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
300 + xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
301 + *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
302 +
303 + *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
304 + *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
305 + *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
306 +
307 + xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
308 + xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
309 +
310 + ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
311 + psa0_0 = _mm_castsi128_ps(xmm_a0_0);
312 + psa0_1 = _mm_castsi128_ps(xmm_a0_1);
313 +
314 + ps_res = _mm_xor_ps(ps_crc3, psa0_0);
315 + ps_res = _mm_xor_ps(ps_res, psa0_1);
316 +
317 + *xmm_crc3 = _mm_castps_si128(ps_res);
318 +}
319 +
320 +ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
321 + unsigned char *dst, const unsigned char *src, long len)
322 +{
323 + unsigned long algn_diff;
324 + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
325 +
326 + CRC_LOAD(s)
327 +
328 + if (len < 16) {
329 + if (len == 0)
330 + return;
331 + goto partial;
332 + }
333 +
334 + algn_diff = 0 - (uintptr_t)src & 0xF;
335 + if (algn_diff) {
336 + xmm_crc_part = _mm_loadu_si128((__m128i *)src);
337 + _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
338 +
339 + dst += algn_diff;
340 + src += algn_diff;
341 + len -= algn_diff;
342 +
343 + partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
344 + &xmm_crc_part);
345 + }
346 +
347 + while ((len -= 64) >= 0) {
348 + xmm_t0 = _mm_load_si128((__m128i *)src);
349 + xmm_t1 = _mm_load_si128((__m128i *)src + 1);
350 + xmm_t2 = _mm_load_si128((__m128i *)src + 2);
351 + xmm_t3 = _mm_load_si128((__m128i *)src + 3);
352 +
353 + fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
354 +
355 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
356 + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
357 + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
358 + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
359 +
360 + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
361 + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
362 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
363 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
364 +
365 + src += 64;
366 + dst += 64;
367 + }
368 +
369 + /*
370 + * len = num bytes left - 64
371 + */
372 + if (len + 16 >= 0) {
373 + len += 16;
374 +
375 + xmm_t0 = _mm_load_si128((__m128i *)src);
376 + xmm_t1 = _mm_load_si128((__m128i *)src + 1);
377 + xmm_t2 = _mm_load_si128((__m128i *)src + 2);
378 +
379 + fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
380 +
381 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
382 + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
383 + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
384 +
385 + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
386 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
387 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
388 +
389 + if (len == 0)
390 + goto done;
391 +
392 + dst += 48;
393 + src += 48;
394 + } else if (len + 32 >= 0) {
395 + len += 32;
396 +
397 + xmm_t0 = _mm_load_si128((__m128i *)src);
398 + xmm_t1 = _mm_load_si128((__m128i *)src + 1);
399 +
400 + fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
401 +
402 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
403 + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
404 +
405 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
406 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
407 +
408 + if (len == 0)
409 + goto done;
410 +
411 + dst += 32;
412 + src += 32;
413 + } else if (len + 48 >= 0) {
414 + len += 48;
415 +
416 + xmm_t0 = _mm_load_si128((__m128i *)src);
417 +
418 + fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419 +
420 + _mm_storeu_si128((__m128i *)dst, xmm_t0);
421 +
422 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
423 +
424 + if (len == 0)
425 + goto done;
426 +
427 + dst += 16;
428 + src += 16;
429 + } else {
430 + len += 64;
431 + if (len == 0)
432 + goto done;
433 + }
434 +
435 +partial:
436 +
437 +#if defined(_MSC_VER)
438 + /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
439 + {
440 + int32_t parts[4] = {0, 0, 0, 0};
441 + memcpy(&parts, src, len);
442 + xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
443 + }
444 +#else
445 + {
446 + int64_t parts[2] = {0, 0};
447 + memcpy(&parts, src, len);
448 + xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
449 + }
450 +#endif
451 +
452 + _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
453 + partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
454 + &xmm_crc_part);
455 +done:
456 + CRC_SAVE(s)
457 +}
458 +
459 +local const unsigned zalign(16) crc_k[] = {
460 + 0xccaa009e, 0x00000000, /* rk1 */
461 + 0x751997d0, 0x00000001, /* rk2 */
462 + 0xccaa009e, 0x00000000, /* rk5 */
463 + 0x63cd6124, 0x00000001, /* rk6 */
464 + 0xf7011640, 0x00000001, /* rk7 */
465 + 0xdb710640, 0x00000001 /* rk8 */
466 +};
467 +
468 +local const unsigned zalign(16) crc_mask[4] = {
469 + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
470 +};
471 +
472 +local const unsigned zalign(16) crc_mask2[4] = {
473 + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
474 +};
475 +
476 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
477 +{
478 + const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
479 + const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
480 +
481 + unsigned crc;
482 + __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
483 +
484 + CRC_LOAD(s)
485 +
486 + /*
487 + * k1
488 + */
489 + crc_fold = _mm_load_si128((__m128i *)crc_k);
490 +
491 + x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
492 + xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
493 + xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
494 + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
495 +
496 + x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
497 + xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
498 + xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
499 + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
500 +
501 + x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
502 + xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
503 + xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
504 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
505 +
506 + /*
507 + * k5
508 + */
509 + crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
510 +
511 + xmm_crc0 = xmm_crc3;
512 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
513 + xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
514 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
515 +
516 + xmm_crc0 = xmm_crc3;
517 + xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
518 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
519 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
520 + xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
521 +
522 + /*
523 + * k7
524 + */
525 + xmm_crc1 = xmm_crc3;
526 + xmm_crc2 = xmm_crc3;
527 + crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
528 +
529 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
530 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
531 + xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
532 +
533 + xmm_crc2 = xmm_crc3;
534 + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
535 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
536 + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
537 +
538 + crc = _mm_extract_epi32(xmm_crc3, 2);
539 + return ~crc;
540 + CRC_SAVE(s)
541 +}
542 diff --git a/third_party/zlib/deflate.c b/third_party/zlib/deflate.c
543 index 7c95b30..59645eb 100644
544 --- a/third_party/zlib/deflate.c
545 +++ b/third_party/zlib/deflate.c
546 @@ -48,8 +48,9 @@
547 */
548
549 /* @(#) $Id$ */
550 -
551 +#include <assert.h>
552 #include "deflate.h"
553 +#include "x86.h"
554
555 const char deflate_copyright[] =
556 " deflate 1.2.8 Copyright 1995-2013 Jean-loup Gailly and Mark Adler ";
557 @@ -85,7 +86,7 @@ local block_state deflate_huff OF((deflate_state *s, int flu sh));
558 local void lm_init OF((deflate_state *s));
559 local void putShortMSB OF((deflate_state *s, uInt b));
560 local void flush_pending OF((z_streamp strm));
561 -local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
562 +
563 #ifdef ASMV
564 void match_init OF((void)); /* asm code initialization */
565 uInt longest_match OF((deflate_state *s, IPos cur_match, int clas));
566 @@ -98,6 +99,23 @@ local void check_match OF((deflate_state *s, IPos start, IPo s match,
567 int length));
568 #endif
569
570 +/* For fill_window_sse.c to use */
571 +ZLIB_INTERNAL int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
572 +
573 +/* From crc32.c */
574 +extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
575 +extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
576 +extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
577 +
578 +#ifdef _MSC_VER
579 +#define INLINE __inline
580 +#else
581 +#define INLINE inline
582 +#endif
583 +
584 +/* Inline optimisation */
585 +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
586 +
587 /* ===========================================================================
588 * Local data
589 */
590 @@ -167,7 +185,6 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compile rs */
591 */
592 #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
593
594 -
595 /* ===========================================================================
596 * Insert string str in the dictionary and set match_head to the previous head
597 * of the hash chain (the most recent string with same hash key). Return
598 @@ -178,17 +195,28 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compi lers */
599 * input characters and the first MIN_MATCH bytes of str are valid
600 * (except for the last MIN_MATCH-1 bytes of the input file).
601 */
602 +local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
603 +{
604 + Pos ret;
605 +
606 + UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]);
607 #ifdef FASTEST
608 -#define INSERT_STRING(s, str, match_head) \
609 - (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
610 - match_head = s->head[s->ins_h], \
611 - s->head[s->ins_h] = (Pos)(str))
612 + ret = s->head[s->ins_h];
613 #else
614 -#define INSERT_STRING(s, str, match_head) \
615 - (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
616 - match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
617 - s->head[s->ins_h] = (Pos)(str))
618 + ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
619 #endif
620 + s->head[s->ins_h] = str;
621 +
622 + return ret;
623 +}
624 +
625 +local INLINE Pos insert_string(deflate_state *const s, const Pos str)
626 +{
627 + if (x86_cpu_enable_simd)
628 + return insert_string_sse(s, str);
629 + return insert_string_c(s, str);
630 +}
631 +
632
633 /* ===========================================================================
634 * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
635 @@ -222,6 +250,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, m emLevel, strategy,
636 const char *version;
637 int stream_size;
638 {
639 + unsigned window_padding = 8;
640 deflate_state *s;
641 int wrap = 1;
642 static const char my_version[] = ZLIB_VERSION;
643 @@ -231,6 +260,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, m emLevel, strategy,
644 * output size for (length,distance) codes is <= 24 bits.
645 */
646
647 + x86_check_features();
648 +
649 if (version == Z_NULL || version[0] != my_version[0] ||
650 stream_size != sizeof(z_stream)) {
651 return Z_VERSION_ERROR;
652 @@ -286,12 +317,17 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
653 s->w_size = 1 << s->w_bits;
654 s->w_mask = s->w_size - 1;
655
656 - s->hash_bits = memLevel + 7;
657 + if (x86_cpu_enable_simd) {
658 + s->hash_bits = 15;
659 + } else {
660 + s->hash_bits = memLevel + 7;
661 + }
662 +
663 s->hash_size = 1 << s->hash_bits;
664 s->hash_mask = s->hash_size - 1;
665 s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
666
667 - s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
668 + s->window = (Bytef *) ZALLOC(strm, s->w_size + window_padding, 2*sizeof(Byt e));
669 s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
670 s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
671 s->class_bitmap = NULL;
672 @@ -369,11 +405,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLen gth)
673 str = s->strstart;
674 n = s->lookahead - (MIN_MATCH-1);
675 do {
676 - UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
677 -#ifndef FASTEST
678 - s->prev[str & s->w_mask] = s->head[s->ins_h];
679 -#endif
680 - s->head[s->ins_h] = (Pos)str;
681 + insert_string(s, str);
682 str++;
683 } while (--n);
684 s->strstart = str;
685 @@ -696,7 +728,7 @@ int ZEXPORT deflate (strm, flush)
686 if (s->status == INIT_STATE) {
687 #ifdef GZIP
688 if (s->wrap == 2) {
689 - strm->adler = crc32(0L, Z_NULL, 0);
690 + crc_reset(s);
691 put_byte(s, 31);
692 put_byte(s, 139);
693 put_byte(s, 8);
694 @@ -975,6 +1007,7 @@ int ZEXPORT deflate (strm, flush)
695 /* Write the trailer */
696 #ifdef GZIP
697 if (s->wrap == 2) {
698 + crc_finalize(s);
699 put_byte(s, (Byte)(strm->adler & 0xff));
700 put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
701 put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
702 @@ -1097,7 +1130,7 @@ int ZEXPORT deflateCopy (dest, source)
703 * allocating a large strm->next_in buffer and copying from it.
704 * (See also flush_pending()).
705 */
706 -local int read_buf(strm, buf, size)
707 +ZLIB_INTERNAL int read_buf(strm, buf, size)
708 z_streamp strm;
709 Bytef *buf;
710 unsigned size;
711 @@ -1109,15 +1142,16 @@ local int read_buf(strm, buf, size)
712
713 strm->avail_in -= len;
714
715 - zmemcpy(buf, strm->next_in, len);
716 - if (strm->state->wrap == 1) {
717 - strm->adler = adler32(strm->adler, buf, len);
718 - }
719 #ifdef GZIP
720 - else if (strm->state->wrap == 2) {
721 - strm->adler = crc32(strm->adler, buf, len);
722 - }
723 + if (strm->state->wrap == 2)
724 + copy_with_crc(strm, buf, len);
725 + else
726 #endif
727 + {
728 + zmemcpy(buf, strm->next_in, len);
729 + if (strm->state->wrap == 1)
730 + strm->adler = adler32(strm->adler, buf, len);
731 + }
732 strm->next_in += len;
733 strm->total_in += len;
734
735 @@ -1530,7 +1564,19 @@ local void check_match(s, start, match, length)
736 * performed for at least two bytes (required for the zip translate_eol
737 * option -- not supported here).
738 */
739 -local void fill_window(s)
740 +local void fill_window_c(deflate_state *s);
741 +
742 +local void fill_window(deflate_state *s)
743 +{
744 + if (x86_cpu_enable_simd) {
745 + fill_window_sse(s);
746 + return;
747 + }
748 +
749 + fill_window_c(s);
750 +}
751 +
752 +local void fill_window_c(s)
753 deflate_state *s;
754 {
755 register unsigned n, m;
756 @@ -1818,7 +1864,7 @@ local block_state deflate_fast(s, flush, clas)
757 */
758 hash_head = NIL;
759 if (s->lookahead >= MIN_MATCH) {
760 - INSERT_STRING(s, s->strstart, hash_head);
761 + hash_head = insert_string(s, s->strstart);
762 }
763
764 /* Find the longest match, discarding those <= prev_length.
765 @@ -1849,7 +1895,7 @@ local block_state deflate_fast(s, flush, clas)
766 s->match_length--; /* string at strstart already in table */
767 do {
768 s->strstart++;
769 - INSERT_STRING(s, s->strstart, hash_head);
770 + hash_head = insert_string(s, s->strstart);
771 /* strstart never exceeds WSIZE-MAX_MATCH, so there are
772 * always MIN_MATCH bytes ahead.
773 */
774 @@ -1934,7 +1980,7 @@ local block_state deflate_slow(s, flush, clas)
775 */
776 hash_head = NIL;
777 if (s->lookahead >= MIN_MATCH) {
778 - INSERT_STRING(s, s->strstart, hash_head);
779 + hash_head = insert_string(s, s->strstart);
780 }
781
782 /* Find the longest match, discarding those <= prev_length.
783 @@ -2003,7 +2049,7 @@ local block_state deflate_slow(s, flush, clas)
784 s->prev_length -= 2;
785 do {
786 if (++s->strstart <= max_insert) {
787 - INSERT_STRING(s, s->strstart, hash_head);
788 + hash_head = insert_string(s, s->strstart);
789 }
790 } while (--s->prev_length != 0);
791 s->match_available = 0;
792 @@ -2163,3 +2209,37 @@ local block_state deflate_huff(s, flush)
793 FLUSH_BLOCK(s, 0);
794 return block_done;
795 }
796 +
797 +/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will
798 + * use intrinsic without extra params
799 + */
800 +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
801 +{
802 + Pos ret;
803 + unsigned *ip, val, h = 0;
804 +
805 + ip = (unsigned *)&s->window[str];
806 + val = *ip;
807 +
808 + if (s->level >= 6)
809 + val &= 0xFFFFFF;
810 +
811 +/* Windows clang should use inline asm */
812 +#if defined(_MSC_VER) && !defined(__clang__)
813 + h = _mm_crc32_u32(h, val);
814 +#elif defined(__i386__) || defined(__amd64__)
815 + __asm__ __volatile__ (
816 + "crc32 %1,%0\n\t"
817 + : "+r" (h)
818 + : "r" (val)
819 + );
820 +#else
821 + /* This should never happen */
822 + assert(0);
823 +#endif
824 +
825 + ret = s->head[h & s->hash_mask];
826 + s->head[h & s->hash_mask] = str;
827 + s->prev[str & s->w_mask] = ret;
828 + return ret;
829 +}
830 diff --git a/third_party/zlib/deflate.h b/third_party/zlib/deflate.h
831 index c795034..c61e4ab 100644
832 --- a/third_party/zlib/deflate.h
833 +++ b/third_party/zlib/deflate.h
834 @@ -109,7 +109,7 @@ typedef struct internal_state {
835 uInt gzindex; /* where in extra, name, or comment */
836 Byte method; /* can only be DEFLATED */
837 int last_flush; /* value of flush param for previous deflate call */
838 -
839 + unsigned zalign(16) crc0[4 * 5];
840 /* used by deflate.c: */
841
842 uInt w_size; /* LZ77 window size (32K by default) */
843 @@ -348,4 +348,14 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, c harf *buf,
844 flush = _tr_tally(s, distance, length)
845 #endif
846
847 +/* Functions that are SIMD optimised on x86 */
848 +void ZLIB_INTERNAL crc_fold_init(deflate_state* const s);
849 +void ZLIB_INTERNAL crc_fold_copy(deflate_state* const s,
850 + unsigned char* dst,
851 + const unsigned char* src,
852 + long len);
853 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
854 +
855 +void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
856 +
857 #endif /* DEFLATE_H */
858 diff --git a/third_party/zlib/fill_window_sse.c b/third_party/zlib/fill_window_s se.c
859 new file mode 100644
860 index 0000000..949ccce
861 --- /dev/null
862 +++ b/third_party/zlib/fill_window_sse.c
863 @@ -0,0 +1,175 @@
864 +/*
865 + * Fill Window with SSE2-optimized hash shifting
866 + *
867 + * Copyright (C) 2013 Intel Corporation
868 + * Authors:
869 + * Arjan van de Ven <arjan@linux.intel.com>
870 + * Jim Kukunas <james.t.kukunas@linux.intel.com>
871 + *
872 + * For conditions of distribution and use, see copyright notice in zlib.h
873 + */
874 +
875 +#include <immintrin.h>
876 +#include "deflate.h"
877 +
878 +#define UPDATE_HASH(s,h,i) \
879 + {\
880 + if (s->level < 6) { \
881 + h = (3483 * (s->window[i]) +\
882 + 23081* (s->window[i+1]) +\
883 + 6954 * (s->window[i+2]) +\
884 + 20947* (s->window[i+3])) & s->hash_mask;\
885 + } else {\
886 + h = (25881* (s->window[i]) +\
887 + 24674* (s->window[i+1]) +\
888 + 25811* (s->window[i+2])) & s->hash_mask;\
889 + }\
890 + }\
891 +
892 +extern int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
893 +
894 +void fill_window_sse(deflate_state *s)
895 +{
896 + const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
897 +
898 + register unsigned n;
899 + register Posf *p;
900 + unsigned more; /* Amount of free space at the end of the window. */
901 + uInt wsize = s->w_size;
902 +
903 + Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
904 +
905 + do {
906 + more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
907 +
908 + /* Deal with !@#$% 64K limit: */
909 + if (sizeof(int) <= 2) {
910 + if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
911 + more = wsize;
912 +
913 + } else if (more == (unsigned)(-1)) {
914 + /* Very unlikely, but possible on 16 bit machine if
915 + * strstart == 0 && lookahead == 1 (input done a byte at time)
916 + */
917 + more--;
918 + }
919 + }
920 +
921 + /* If the window is almost full and there is insufficient lookahead,
922 + * move the upper half to the lower one to make room in the upper half.
923 + */
924 + if (s->strstart >= wsize+MAX_DIST(s)) {
925 +
926 + zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
927 + s->match_start -= wsize;
928 + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
929 + s->block_start -= (long) wsize;
930 +
931 + /* Slide the hash table (could be avoided with 32 bit values
932 + at the expense of memory usage). We slide even when level == 0
933 + to keep the hash table consistent if we switch back to level > 0
934 + later. (Using level 0 permanently is not an optimal usage of
935 + zlib, so we don't care about this pathological case.)
936 + */
937 + n = s->hash_size;
938 + p = &s->head[n];
939 + p -= 8;
940 + do {
941 + __m128i value, result;
942 +
943 + value = _mm_loadu_si128((__m128i *)p);
944 + result = _mm_subs_epu16(value, xmm_wsize);
945 + _mm_storeu_si128((__m128i *)p, result);
946 +
947 + p -= 8;
948 + n -= 8;
949 + } while (n > 0);
950 +
951 + n = wsize;
952 +#ifndef FASTEST
953 + p = &s->prev[n];
954 + p -= 8;
955 + do {
956 + __m128i value, result;
957 +
958 + value = _mm_loadu_si128((__m128i *)p);
959 + result = _mm_subs_epu16(value, xmm_wsize);
960 + _mm_storeu_si128((__m128i *)p, result);
961 +
962 + p -= 8;
963 + n -= 8;
964 + } while (n > 0);
965 +#endif
966 + more += wsize;
967 + }
968 + if (s->strm->avail_in == 0) break;
969 +
970 + /* If there was no sliding:
971 + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
972 + * more == window_size - lookahead - strstart
973 + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
974 + * => more >= window_size - 2*WSIZE + 2
975 + * In the BIG_MEM or MMAP case (not yet supported),
976 + * window_size == input_size + MIN_LOOKAHEAD &&
977 + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
978 + * Otherwise, window_size == 2*WSIZE so more >= 2.
979 + * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
980 + */
981 + Assert(more >= 2, "more < 2");
982 +
983 + n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
984 + s->lookahead += n;
985 +
986 + /* Initialize the hash value now that we have some input: */
987 + if (s->lookahead >= MIN_MATCH) {
988 + uInt str = s->strstart;
989 + s->ins_h = s->window[str];
990 + if (str >= 1)
991 + UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
992 +#if MIN_MATCH != 3
993 + Call UPDATE_HASH() MIN_MATCH-3 more times
994 +#endif
995 + }
996 + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
997 + * but this is not important since only literal bytes will be emitted.
998 + */
999 +
1000 + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
1001 +
1002 + /* If the WIN_INIT bytes after the end of the current data have never been
1003 + * written, then zero those bytes in order to avoid memory check reports of
1004 + * the use of uninitialized (or uninitialised as Julian writes) bytes by
1005 + * the longest match routines. Update the high water mark for the next
1006 + * time through here. WIN_INIT is set to MAX_MATCH since the longest match
1007 + * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
1008 + */
1009 + if (s->high_water < s->window_size) {
1010 + ulg curr = s->strstart + (ulg)(s->lookahead);
1011 + ulg init;
1012 +
1013 + if (s->high_water < curr) {
1014 + /* Previous high water mark below current data -- zero WIN_INIT
1015 + * bytes or up to end of window, whichever is less.
1016 + */
1017 + init = s->window_size - curr;
1018 + if (init > WIN_INIT)
1019 + init = WIN_INIT;
1020 + zmemzero(s->window + curr, (unsigned)init);
1021 + s->high_water = curr + init;
1022 + }
1023 + else if (s->high_water < (ulg)curr + WIN_INIT) {
1024 + /* High water mark at or above current data, but below current data
1025 + * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
1026 + * to end of window, whichever is less.
1027 + */
1028 + init = (ulg)curr + WIN_INIT - s->high_water;
1029 + if (init > s->window_size - s->high_water)
1030 + init = s->window_size - s->high_water;
1031 + zmemzero(s->window + s->high_water, (unsigned)init);
1032 + s->high_water += init;
1033 + }
1034 + }
1035 +
1036 + Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
1037 + "not enough room for search");
1038 +}
1039 diff --git a/third_party/zlib/simd_stub.c b/third_party/zlib/simd_stub.c
1040 new file mode 100644
1041 index 0000000..796f1f6
1042 --- /dev/null
1043 +++ b/third_party/zlib/simd_stub.c
1044 @@ -0,0 +1,35 @@
1045 +/* simd_stub.c -- stub implementations
1046 +* Copyright (C) 2014 Intel Corporation
1047 +* For conditions of distribution and use, see copyright notice in zlib.h
1048 +*/
1049 +#include <assert.h>
1050 +
1051 +#include "deflate.h"
1052 +#include "x86.h"
1053 +
1054 +int x86_cpu_enable_simd = 0;
1055 +
1056 +void ZLIB_INTERNAL crc_fold_init(deflate_state *const s) {
1057 + assert(0);
1058 +}
1059 +
1060 +void ZLIB_INTERNAL crc_fold_copy(deflate_state *const s,
1061 + unsigned char *dst,
1062 + const unsigned char *src,
1063 + long len) {
1064 + assert(0);
1065 +}
1066 +
1067 +unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
1068 + assert(0);
1069 + return 0;
1070 +}
1071 +
1072 +void ZLIB_INTERNAL fill_window_sse(deflate_state *s)
1073 +{
1074 + assert(0);
1075 +}
1076 +
1077 +void x86_check_features(void)
1078 +{
1079 +}
1080 diff --git a/third_party/zlib/x86.c b/third_party/zlib/x86.c
1081 new file mode 100644
1082 index 0000000..e6532fd
1083 --- /dev/null
1084 +++ b/third_party/zlib/x86.c
1085 @@ -0,0 +1,91 @@
1086 +/*
1087 + * x86 feature check
1088 + *
1089 + * Copyright (C) 2013 Intel Corporation. All rights reserved.
1090 + * Author:
1091 + * Jim Kukunas
1092 + *
1093 + * For conditions of distribution and use, see copyright notice in zlib.h
1094 + */
1095 +
1096 +#include "x86.h"
1097 +
1098 +int x86_cpu_enable_simd = 0;
1099 +
1100 +#ifndef _MSC_VER
1101 +#include <pthread.h>
1102 +
1103 +pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
1104 +static void _x86_check_features(void);
1105 +
1106 +void x86_check_features(void)
1107 +{
1108 + pthread_once(&cpu_check_inited_once, _x86_check_features);
1109 +}
1110 +
1111 +static void _x86_check_features(void)
1112 +{
1113 + int x86_cpu_has_sse2;
1114 + int x86_cpu_has_sse42;
1115 + int x86_cpu_has_pclmulqdq;
1116 + unsigned eax, ebx, ecx, edx;
1117 +
1118 + eax = 1;
1119 +#ifdef __i386__
1120 + __asm__ __volatile__ (
1121 + "xchg %%ebx, %1\n\t"
1122 + "cpuid\n\t"
1123 + "xchg %1, %%ebx\n\t"
1124 + : "+a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
1125 + );
1126 +#else
1127 + __asm__ __volatile__ (
1128 + "cpuid\n\t"
1129 + : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
1130 + );
1131 +#endif /* (__i386__) */
1132 +
1133 + x86_cpu_has_sse2 = edx & 0x4000000;
1134 + x86_cpu_has_sse42 = ecx & 0x100000;
1135 + x86_cpu_has_pclmulqdq = ecx & 0x2;
1136 +
1137 + x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1138 + x86_cpu_has_sse42 &&
1139 + x86_cpu_has_pclmulqdq;
1140 +}
1141 +#else
1142 +#include <intrin.h>
1143 +#include <windows.h>
1144 +
1145 +static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1146 + PVOID param,
1147 + PVOID *context);
1148 +static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;
1149 +
1150 +void x86_check_features(void)
1151 +{
1152 + InitOnceExecuteOnce(&cpu_check_inited_once, _x86_check_features,
1153 + NULL, NULL);
1154 +}
1155 +
1156 +static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1157 + PVOID param,
1158 + PVOID *context)
1159 +{
1160 + int x86_cpu_has_sse2;
1161 + int x86_cpu_has_sse42;
1162 + int x86_cpu_has_pclmulqdq;
1163 + int regs[4];
1164 +
1165 + __cpuid(regs, 1);
1166 +
1167 + x86_cpu_has_sse2 = regs[3] & 0x4000000;
1168 + x86_cpu_has_sse42= regs[2] & 0x100000;
1169 + x86_cpu_has_pclmulqdq = regs[2] & 0x2;
1170 +
1171 + x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1172 + x86_cpu_has_sse42 &&
1173 + x86_cpu_has_pclmulqdq;
1174 + return TRUE;
1175 +}
1176 +#endif /* _MSC_VER */
1177 diff --git a/third_party/zlib/x86.h b/third_party/zlib/x86.h
1178 new file mode 100644
1179 index 0000000..ac3d180
1180 --- /dev/null
1181 +++ b/third_party/zlib/x86.h
1182 @@ -0,0 +1,13 @@
1183 +/* x86.h -- check for x86 CPU features
1184 +* Copyright (C) 2013 Intel Corporation Jim Kukunas
1185 +* For conditions of distribution and use, see copyright notice in zlib.h
1186 +*/
1187 +
1188 +#ifndef X86_H
1189 +#define X86_H
1190 +
1191 +extern int x86_cpu_enable_simd;
1192 +
1193 +void x86_check_features(void);
1194 +
1195 +#endif /* X86_H */
1196 --
1197 2.7.4
1198
OLDNEW
« no previous file with comments | « third_party/zlib/mozzconf.h ('k') | third_party/zlib/trees.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698