Chromium Code Reviews

Side by Side Diff: third_party/zlib/crc_folding.c

Issue 552123005: Integrate SIMD optimisations for zlib (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix crc_fold_copy to work with inputs where len % 16 > 0 Created 6 years, 2 months ago
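For orientation, here is a minimal sketch of how the three entry points defined in this file fit together. The wrapper name crc_while_copying and the way the deflate_state is obtained are illustrative assumptions, not part of this patch:

    /* Hypothetical caller: checksum a buffer while copying it.  Assumes `s`
     * is a deflate_state that deflate itself has set up. */
    static unsigned crc_while_copying(deflate_state *s, unsigned char *dst,
                                      const unsigned char *src, long len)
    {
        crc_fold_init(s);                 /* reset the folding accumulators */
        crc_fold_copy(s, dst, src, len);  /* copy and fold; with this patch
                                             len need not be a multiple of 16 */
        return crc_fold_512to32(s);       /* reduce the state to the CRC-32 */
    }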
1 /*
2 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
3 * instruction.
4 *
5 * A white paper describing this algorithm can be found at:
6 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
7 *
8 * Copyright (C) 2013 Intel Corporation. All rights reserved.
9 * Authors:
10 * Wajdi Feghali <wajdi.k.feghali@intel.com>
11 * Jim Guilford <james.guilford@intel.com>
12 * Vinodh Gopal <vinodh.gopal@intel.com>
13 * Erdinc Ozturk <erdinc.ozturk@intel.com>
14 * Jim Kukunas <james.t.kukunas@linux.intel.com>
15 *
16 * For conditions of distribution and use, see copyright notice in zlib.h
17 */
18
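/*
 * Overview of the scheme used below: the folding state lives in
 * deflate_state (s->crc0) as four 128-bit accumulators plus one scratch
 * register for partial blocks.  Input is folded into the accumulators
 * with carry-less multiplication, 64 bytes at a time in the main loop
 * (smaller tails are handled separately), and only crc_fold_512to32()
 * reduces the 512-bit state to the final 32-bit CRC.
 */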
19 #include "deflate.h"
20
21 #include <inttypes.h>
22 #include <immintrin.h>
23 #include <wmmintrin.h>
24
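/*
 * CRC_LOAD opens a do { } scope and loads the five 128-bit words of the
 * folding state from s->crc0 into local variables (xmm_crc0..xmm_crc3 and
 * xmm_crc_part); CRC_SAVE stores them back and closes the scope with
 * "} while (0);".  The two macros must be used as a matched pair within a
 * single function.
 */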
25 #define CRC_LOAD(s) \
26 do { \
27 __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
28 __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
29 __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
30 __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
31 __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
32
33 #define CRC_SAVE(s) \
34 _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
35 _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
36 _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
37 _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
38 _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
39 } while (0);
40
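/*
 * crc_fold_init: reset the folding state.  xmm_crc0 is seeded with the
 * initial constant, the other accumulators are zeroed, and the running
 * checksum kept in strm->adler is cleared.
 */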
41 ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
42 {
43 CRC_LOAD(s)
44
45 xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
46 xmm_crc1 = _mm_setzero_si128();
47 xmm_crc2 = _mm_setzero_si128();
48 xmm_crc3 = _mm_setzero_si128();
49
50 CRC_SAVE(s)
51
52 s->strm->adler = 0;
53 }
54
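/*
 * fold_1: advance the state by one 16-byte block.  The accumulators are
 * rotated one lane and the displaced lane (xmm_crc0) is folded forward
 * with two PCLMULQDQ multiplies against the fold-by-4 constants, their
 * halves XORed into the new xmm_crc3; the caller then XORs the next 16
 * bytes of input into xmm_crc3.
 */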
55 local void fold_1(deflate_state *const s,
56 __m128i *xmm_crc0, __m128i *xmm_crc1,
57 __m128i *xmm_crc2, __m128i *xmm_crc3)
58 {
59 const __m128i xmm_fold4 = _mm_set_epi32(
60 0x00000001, 0x54442bd4,
61 0x00000001, 0xc6e41596);
62
63 __m128i x_tmp3;
64 __m128 ps_crc0, ps_crc3, ps_res;
65
66 x_tmp3 = *xmm_crc3;
67
68 *xmm_crc3 = *xmm_crc0;
69 *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
70 *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
71 ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
72 ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
73 ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
74
75 *xmm_crc0 = *xmm_crc1;
76 *xmm_crc1 = *xmm_crc2;
77 *xmm_crc2 = x_tmp3;
78 *xmm_crc3 = _mm_castps_si128(ps_res);
79 }
80
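/*
 * fold_2: same idea as fold_1, but for 32 bytes of input: rotate by two
 * lanes and fold the two displaced lanes; the caller XORs the next 32
 * bytes into xmm_crc2 and xmm_crc3.
 */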
81 local void fold_2(deflate_state *const s,
82 __m128i *xmm_crc0, __m128i *xmm_crc1,
83 __m128i *xmm_crc2, __m128i *xmm_crc3)
84 {
85 const __m128i xmm_fold4 = _mm_set_epi32(
86 0x00000001, 0x54442bd4,
87 0x00000001, 0xc6e41596);
88
89 __m128i x_tmp3, x_tmp2;
90 __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
91
92 x_tmp3 = *xmm_crc3;
93 x_tmp2 = *xmm_crc2;
94
95 *xmm_crc3 = *xmm_crc1;
96 *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
97 *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
98 ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
99 ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
100 ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
101
102 *xmm_crc2 = *xmm_crc0;
103 *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
104 *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
105 ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
106 ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
107 ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
108
109 *xmm_crc0 = x_tmp2;
110 *xmm_crc1 = x_tmp3;
111 *xmm_crc2 = _mm_castps_si128(ps_res20);
112 *xmm_crc3 = _mm_castps_si128(ps_res31);
113 }
114
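/*
 * fold_3: rotate by three lanes and fold the three displaced lanes; the
 * caller XORs the next 48 bytes into xmm_crc1..xmm_crc3.
 */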
115 local void fold_3(deflate_state *const s,
116 __m128i *xmm_crc0, __m128i *xmm_crc1,
117 __m128i *xmm_crc2, __m128i *xmm_crc3)
118 {
119 const __m128i xmm_fold4 = _mm_set_epi32(
120 0x00000001, 0x54442bd4,
121 0x00000001, 0xc6e41596);
122
123 __m128i x_tmp3;
124 __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
125
126 x_tmp3 = *xmm_crc3;
127
128 *xmm_crc3 = *xmm_crc2;
129 *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
130 *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
131 ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
132 ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
133 ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
134
135 *xmm_crc2 = *xmm_crc1;
136 *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
137 *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
138 ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
139 ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
140 ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
141
142 *xmm_crc1 = *xmm_crc0;
143 *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
144 *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
145 ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
146 ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
147 ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
148
149 *xmm_crc0 = x_tmp3;
150 *xmm_crc1 = _mm_castps_si128(ps_res10);
151 *xmm_crc2 = _mm_castps_si128(ps_res21);
152 *xmm_crc3 = _mm_castps_si128(ps_res32);
153 }
154
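/*
 * fold_4: fold all four accumulators forward in place.  Used by the main
 * 64-byte loop in crc_fold_copy(), which then XORs the next 64 bytes of
 * input into xmm_crc0..xmm_crc3.
 */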
155 local void fold_4(deflate_state *const s,
156 __m128i *xmm_crc0, __m128i *xmm_crc1,
157 __m128i *xmm_crc2, __m128i *xmm_crc3)
158 {
159 const __m128i xmm_fold4 = _mm_set_epi32(
160 0x00000001, 0x54442bd4,
161 0x00000001, 0xc6e41596);
162
163 __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
164 __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
165 __m128 ps_t0, ps_t1, ps_t2, ps_t3;
166 __m128 ps_res0, ps_res1, ps_res2, ps_res3;
167
168 x_tmp0 = *xmm_crc0;
169 x_tmp1 = *xmm_crc1;
170 x_tmp2 = *xmm_crc2;
171 x_tmp3 = *xmm_crc3;
172
173 *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
174 x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
175 ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
176 ps_t0 = _mm_castsi128_ps(x_tmp0);
177 ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
178
179 *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
180 x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
181 ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
182 ps_t1 = _mm_castsi128_ps(x_tmp1);
183 ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
184
185 *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
186 x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
187 ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
188 ps_t2 = _mm_castsi128_ps(x_tmp2);
189 ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
190
191 *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
192 x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
193 ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
194 ps_t3 = _mm_castsi128_ps(x_tmp3);
195 ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
196
197 *xmm_crc0 = _mm_castps_si128(ps_res0);
198 *xmm_crc1 = _mm_castps_si128(ps_res1);
199 *xmm_crc2 = _mm_castps_si128(ps_res2);
200 *xmm_crc3 = _mm_castps_si128(ps_res3);
201 }
202
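/*
 * PSHUFB masks used by partial_fold(), indexed by (len - 1): entry n
 * shifts a 128-bit register left by 15 - n bytes, and XORing the mask
 * with 0x80 in every byte gives the complementary right shift by n + 1
 * bytes (PSHUFB zeroes any byte whose mask has the high bit set).
 */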
203 local const unsigned zalign(32) pshufb_shf_table[60] = {
204 0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
205 0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 2)/shr2 */
206 0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 3)/shr3 */
207 0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
208 0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
209 0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
210 0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
211 0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
212 0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
213 0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
214 0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
215 0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
216 0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
217 0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
218 0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
219 };
220
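/*
 * partial_fold: fold in a partial block of len (1..15) bytes held in
 * *xmm_crc_part.  The 512-bit state is shifted down by len bytes using
 * the PSHUFB masks above, the new bytes are merged into the top lane,
 * and the bytes shifted out of the bottom lane are folded back into
 * xmm_crc3 with two PCLMULQDQ multiplies.
 */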
221 local void partial_fold(deflate_state *const s, const size_t len,
222 __m128i *xmm_crc0, __m128i *xmm_crc1,
223 __m128i *xmm_crc2, __m128i *xmm_crc3,
224 __m128i *xmm_crc_part)
225 {
226
227 const __m128i xmm_fold4 = _mm_set_epi32(
228 0x00000001, 0x54442bd4,
229 0x00000001, 0xc6e41596);
230 const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
231
232 __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
233 __m128i xmm_a0_0, xmm_a0_1;
234 __m128 ps_crc3, psa0_0, psa0_1, ps_res;
235
236 xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
237 xmm_shr = xmm_shl;
238 xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
239
240 xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
241
242 *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
243 xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
244 *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
245
246 *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
247 xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
248 *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
249
250 *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
251 xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
252 *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
253
254 *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
255 *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
256 *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
257
258 xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
259 xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
260
261 ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
262 psa0_0 = _mm_castsi128_ps(xmm_a0_0);
263 psa0_1 = _mm_castsi128_ps(xmm_a0_1);
264
265 ps_res = _mm_xor_ps(ps_crc3, psa0_0);
266 ps_res = _mm_xor_ps(ps_res, psa0_1);
267
268 *xmm_crc3 = _mm_castps_si128(ps_res);
269 }
270
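/*
 * crc_fold_copy: copy len bytes from src to dst while folding them into
 * the CRC state.  Any unaligned prefix is absorbed with partial_fold so
 * the main loop can use aligned loads from src; full 64-byte blocks go
 * through fold_4, the 48/32/16-byte tails through fold_3/fold_2/fold_1,
 * and a final remainder of 1..15 bytes through partial_fold again.
 */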
271 ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
272 unsigned char *dst, const unsigned char *src, long len)
273 {
274 unsigned long algn_diff;
275 __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
276
277 CRC_LOAD(s)
278
279 if (len < 16) {
280 if (len == 0)
281 return;
282 goto partial;
283 }
284
285 algn_diff = (0 - (unsigned long)src) & 0xF;
286 if (algn_diff) {
287 xmm_crc_part = _mm_loadu_si128((__m128i *)src);
288 _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
289
290 dst += algn_diff;
291 src += algn_diff;
292 len -= algn_diff;
293
294 partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
295 &xmm_crc_part);
296 }
297
298 while ((len -= 64) >= 0) {
299 xmm_t0 = _mm_load_si128((__m128i *)src);
300 xmm_t1 = _mm_load_si128((__m128i *)src + 1);
301 xmm_t2 = _mm_load_si128((__m128i *)src + 2);
302 xmm_t3 = _mm_load_si128((__m128i *)src + 3);
303
304 fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
305
306 _mm_storeu_si128((__m128i *)dst, xmm_t0);
307 _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
308 _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
309 _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
310
311 xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
312 xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
313 xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
314 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
315
316 src += 64;
317 dst += 64;
318 }
319
320 /*
321 * len = num bytes left - 64
322 */
323 if (len + 16 >= 0) {
324 len += 16;
325
326 xmm_t0 = _mm_load_si128((__m128i *)src);
327 xmm_t1 = _mm_load_si128((__m128i *)src + 1);
328 xmm_t2 = _mm_load_si128((__m128i *)src + 2);
329
330 fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
331
332 _mm_storeu_si128((__m128i *)dst, xmm_t0);
333 _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
334 _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
335
336 xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
337 xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
338 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
339
340 if (len == 0)
341 goto done;
342
343 dst += 48;
344 src += 48;
345 } else if (len + 32 >= 0) {
346 len += 32;
347
348 xmm_t0 = _mm_load_si128((__m128i *)src);
349 xmm_t1 = _mm_load_si128((__m128i *)src + 1);
350
351 fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
352
353 _mm_storeu_si128((__m128i *)dst, xmm_t0);
354 _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
355
356 xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
357 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
358
359 if (len == 0)
360 goto done;
361
362 dst += 32;
363 src += 32;
364 } else if (len + 48 >= 0) {
365 len += 48;
366
367 xmm_t0 = _mm_load_si128((__m128i *)src);
368
369 fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
370
371 _mm_storeu_si128((__m128i *)dst, xmm_t0);
372
373 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
374
375 if (len == 0)
376 goto done;
377
378 dst += 16;
379 src += 16;
380 } else {
381 len += 64;
382 if (len == 0)
383 goto done;
384 }
385
386 partial:
387 memcpy(&xmm_crc_part, src, len);
388
389 _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
390 partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
391 &xmm_crc_part);
392 done:
393 CRC_SAVE(s)
394 }
395
396 local const unsigned zalign(16) crc_k[] = {
397 0xccaa009e, 0x00000000, /* rk1 */
398 0x751997d0, 0x00000001, /* rk2 */
399 0xccaa009e, 0x00000000, /* rk5 */
400 0x63cd6124, 0x00000001, /* rk6 */
401 0xf7011640, 0x00000001, /* rk7 */
402 0xdb710640, 0x00000001 /* rk8 */
403 };
404
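/*
 * Reduction constants for crc_fold_512to32(), labelled as in the white
 * paper referenced above: rk1/rk2 fold the four accumulators down to a
 * single 128-bit value, rk5/rk6 shrink that remainder further, and
 * rk7/rk8 drive the final Barrett-style reduction to 32 bits.
 */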
405 local const unsigned zalign(16) crc_mask[4] = {
406 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
407 };
408
409 local const unsigned zalign(16) crc_mask2[4] = {
410 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
411 };
412
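/*
 * crc_fold_512to32: reduce the 512-bit folded state in s->crc0 to the
 * final CRC-32 value and return its bitwise complement.
 */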
413 unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
414 {
415 const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
416 const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
417
418 unsigned crc;
419 __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
420
421 CRC_LOAD(s)
422
423 /*
424 * k1
425 */
426 crc_fold = _mm_load_si128((__m128i *)crc_k);
427
428 x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
429 xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
430 xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
431 xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
432
433 x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
434 xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
435 xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
436 xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
437
438 x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
439 xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
440 xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
441 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
442
443 /*
444 * k5
445 */
446 crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
447
448 xmm_crc0 = xmm_crc3;
449 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
450 xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
451 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
452
453 xmm_crc0 = xmm_crc3;
454 xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
455 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
456 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
457 xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
458
459 /*
460 * k7
461 */
462 xmm_crc1 = xmm_crc3;
463 xmm_crc2 = xmm_crc3;
464 crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
465
466 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
467 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
468 xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
469
470 xmm_crc2 = xmm_crc3;
471 xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
472 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
473 xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
474
475 crc = _mm_extract_epi32(xmm_crc3, 2);
476 return ~crc;
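/* Note: CRC_SAVE below is never executed because of the return above;
   its expansion is still needed to supply the "} while (0);" that closes
   the scope opened by CRC_LOAD at the top of the function. */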
477 CRC_SAVE(s)
478 }