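# ChaCha20 for x86_64: scalar, SSSE3 (1x and 4x) and AVX2 (8x) code
# paths, selected at run time from OPENSSL_ia32cap_P. Register roles at
# entry to ChaCha20_ctr32 (SysV AMD64 ABI), as used below: %rdi = output,
# %rsi = input, %rdx = length in bytes, %rcx = 32-byte key,
# %r8 = 16-byte counter/nonce block (state words 12..15).
# Returns are emitted as .byte 0xf3,0xc3, i.e. rep ret.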
#if defined(__x86_64__)
.text

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
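# The .Lsigma bytes are two NUL-terminated ASCII strings: "expand 32-byte k"
# (the ChaCha constants) and the banner
# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".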
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
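# Scalar fallback. OPENSSL_ia32cap_P+4 holds CPUID(1).ECX; bit 9 ($512)
# is SSSE3. Without it, the 16-word state is kept in GPRs: words 0-7 in
# %eax..%edx/%r8d..%r11d, words 12-15 in %r12d..%r15d, and words 8-11
# rotating through %esi/%edi and 32..44(%rsp).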
cmpq $0,%rdx
je .Lno_data
movq OPENSSL_ia32cap_P+4(%rip),%r10
testl $512,%r10d
jnz .LChaCha20_ssse3

pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $64+24,%rsp


movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
movdqu (%r8),%xmm3
movdqa .Lone(%rip),%xmm4


movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
movq %rdx,%rbp
jmp .Loop_outer

.align 32
.Loop_outer:
movl $0x61707865,%eax
movl $0x3320646e,%ebx
movl $0x79622d32,%ecx
movl $0x6b206574,%edx
movl 16(%rsp),%r8d
movl 20(%rsp),%r9d
movl 24(%rsp),%r10d
movl 28(%rsp),%r11d
movd %xmm3,%r12d
movl 52(%rsp),%r13d
movl 56(%rsp),%r14d
movl 60(%rsp),%r15d

movq %rbp,64+0(%rsp)
movl $10,%ebp
movq %rsi,64+8(%rsp)
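# The .byte sequence below encodes movq %xmm2,%rsi (66 REX.W 0F 7E /r),
# emitted as raw bytes in the usual perlasm manner.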
.byte 102,72,15,126,214
movq %rdi,64+16(%rsp)
movq %rsi,%rdi
shrq $32,%rdi
jmp .Loop

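# One scalar double round per iteration: each quarter round is the
# add/xor/rotate pattern with ChaCha rotation counts 16, 12, 8, 7;
# %ebp counts 10 iterations, i.e. 20 rounds.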
.align 32
.Loop:
addl %r8d,%eax
xorl %eax,%r12d
roll $16,%r12d
addl %r9d,%ebx
xorl %ebx,%r13d
roll $16,%r13d
addl %r12d,%esi
xorl %esi,%r8d
roll $12,%r8d
addl %r13d,%edi
xorl %edi,%r9d
roll $12,%r9d
addl %r8d,%eax
xorl %eax,%r12d
roll $8,%r12d
addl %r9d,%ebx
xorl %ebx,%r13d
roll $8,%r13d
addl %r12d,%esi
xorl %esi,%r8d
roll $7,%r8d
addl %r13d,%edi
xorl %edi,%r9d
roll $7,%r9d
movl %esi,32(%rsp)
movl %edi,36(%rsp)
movl 40(%rsp),%esi
movl 44(%rsp),%edi
addl %r10d,%ecx
xorl %ecx,%r14d
roll $16,%r14d
addl %r11d,%edx
xorl %edx,%r15d
roll $16,%r15d
addl %r14d,%esi
xorl %esi,%r10d
roll $12,%r10d
addl %r15d,%edi
xorl %edi,%r11d
roll $12,%r11d
addl %r10d,%ecx
xorl %ecx,%r14d
roll $8,%r14d
addl %r11d,%edx
xorl %edx,%r15d
roll $8,%r15d
addl %r14d,%esi
xorl %esi,%r10d
roll $7,%r10d
addl %r15d,%edi
xorl %edi,%r11d
roll $7,%r11d
addl %r9d,%eax
xorl %eax,%r15d
roll $16,%r15d
addl %r10d,%ebx
xorl %ebx,%r12d
roll $16,%r12d
addl %r15d,%esi
xorl %esi,%r9d
roll $12,%r9d
addl %r12d,%edi
xorl %edi,%r10d
roll $12,%r10d
addl %r9d,%eax
xorl %eax,%r15d
roll $8,%r15d
addl %r10d,%ebx
xorl %ebx,%r12d
roll $8,%r12d
addl %r15d,%esi
xorl %esi,%r9d
roll $7,%r9d
addl %r12d,%edi
xorl %edi,%r10d
roll $7,%r10d
movl %esi,40(%rsp)
movl %edi,44(%rsp)
movl 32(%rsp),%esi
movl 36(%rsp),%edi
addl %r11d,%ecx
xorl %ecx,%r13d
roll $16,%r13d
addl %r8d,%edx
xorl %edx,%r14d
roll $16,%r14d
addl %r13d,%esi
xorl %esi,%r11d
roll $12,%r11d
addl %r14d,%edi
xorl %edi,%r8d
roll $12,%r8d
addl %r11d,%ecx
xorl %ecx,%r13d
roll $8,%r13d
addl %r8d,%edx
xorl %edx,%r14d
roll $8,%r14d
addl %r13d,%esi
xorl %esi,%r11d
roll $7,%r11d
addl %r14d,%edi
xorl %edi,%r8d
roll $7,%r8d
decl %ebp
jnz .Loop
movl %edi,36(%rsp)
movl %esi,32(%rsp)
movq 64(%rsp),%rbp
movdqa %xmm2,%xmm1
movq 64+8(%rsp),%rsi
paddd %xmm4,%xmm3
movq 64+16(%rsp),%rdi

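# Feed-forward: add the input state back into the working state (sigma
# as immediates, key words from 16..28(%rsp) and 48..60(%rsp), words
# 8-11 via paddd into %xmm1), then XOR a full 64-byte block if one remains.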
addl $0x61707865,%eax
addl $0x3320646e,%ebx
addl $0x79622d32,%ecx
addl $0x6b206574,%edx
addl 16(%rsp),%r8d
addl 20(%rsp),%r9d
addl 24(%rsp),%r10d
addl 28(%rsp),%r11d
addl 48(%rsp),%r12d
addl 52(%rsp),%r13d
addl 56(%rsp),%r14d
addl 60(%rsp),%r15d
paddd 32(%rsp),%xmm1

cmpq $64,%rbp
jb .Ltail

xorl 0(%rsi),%eax
xorl 4(%rsi),%ebx
xorl 8(%rsi),%ecx
xorl 12(%rsi),%edx
xorl 16(%rsi),%r8d
xorl 20(%rsi),%r9d
xorl 24(%rsi),%r10d
xorl 28(%rsi),%r11d
movdqu 32(%rsi),%xmm0
xorl 48(%rsi),%r12d
xorl 52(%rsi),%r13d
xorl 56(%rsi),%r14d
xorl 60(%rsi),%r15d
leaq 64(%rsi),%rsi
pxor %xmm1,%xmm0

movdqa %xmm2,32(%rsp)
movd %xmm3,48(%rsp)

movl %eax,0(%rdi)
movl %ebx,4(%rdi)
movl %ecx,8(%rdi)
movl %edx,12(%rdi)
movl %r8d,16(%rdi)
movl %r9d,20(%rdi)
movl %r10d,24(%rdi)
movl %r11d,28(%rdi)
movdqu %xmm0,32(%rdi)
movl %r12d,48(%rdi)
movl %r13d,52(%rdi)
movl %r14d,56(%rdi)
movl %r15d,60(%rdi)
leaq 64(%rdi),%rdi

subq $64,%rbp
jnz .Loop_outer

jmp .Ldone

.align 16
.Ltail:
movl %eax,0(%rsp)
movl %ebx,4(%rsp)
xorq %rbx,%rbx
movl %ecx,8(%rsp)
movl %edx,12(%rsp)
movl %r8d,16(%rsp)
movl %r9d,20(%rsp)
movl %r10d,24(%rsp)
movl %r11d,28(%rsp)
movdqa %xmm1,32(%rsp)
movl %r12d,48(%rsp)
movl %r13d,52(%rsp)
movl %r14d,56(%rsp)
movl %r15d,60(%rsp)

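# Partial final block: the keystream was spilled to 0..63(%rsp) above
# and is XORed into the output one byte at a time (%rbx is the index,
# %rbp the remaining byte count).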
.Loop_tail:
movzbl (%rsi,%rbx,1),%eax
movzbl (%rsp,%rbx,1),%edx
leaq 1(%rbx),%rbx
xorl %edx,%eax
movb %al,-1(%rdi,%rbx,1)
decq %rbp
jnz .Loop_tail

.Ldone:
addq $64+24,%rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
.Lno_data:
.byte 0xf3,0xc3
.size ChaCha20_ctr32,.-ChaCha20_ctr32
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
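# One-block SSSE3 path: the state is held as four row vectors in
# %xmm0..%xmm3; inputs over 128 bytes go to the 4-way code instead.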
cmpq $128,%rdx
ja .LChaCha20_4x

.Ldo_sse3_after_all:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15

subq $64+24,%rsp
movdqa .Lsigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
movdqu (%r8),%xmm3
movdqa .Lrot16(%rip),%xmm6
movdqa .Lrot24(%rip),%xmm7

movdqa %xmm0,0(%rsp)
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
movl $10,%ebp
jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
movdqa .Lone(%rip),%xmm3
movdqa 0(%rsp),%xmm0
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
movl $10,%ebp
movdqa %xmm3,48(%rsp)
jmp .Loop_ssse3

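# SSSE3 double round. The 16- and 24-bit rotations are byte shuffles
# through the .Lrot16/.Lrot24 masks: the .byte sequences 102,15,56,0,222
# and 102,15,56,0,223 encode pshufb %xmm6,%xmm3 and pshufb %xmm7,%xmm3.
# The 12- and 7-bit rotations use pslld/psrld/por, and the pshufd
# triples rotate rows to diagonalize and then re-align the state.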
.align 32
.Loop_ssse3:
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,222
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $20,%xmm1
pslld $12,%xmm4
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,223
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $25,%xmm1
pslld $7,%xmm4
por %xmm4,%xmm1
pshufd $78,%xmm2,%xmm2
pshufd $57,%xmm1,%xmm1
pshufd $147,%xmm3,%xmm3
nop
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,222
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $20,%xmm1
pslld $12,%xmm4
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,223
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $25,%xmm1
pslld $7,%xmm4
por %xmm4,%xmm1
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
decl %ebp
jnz .Loop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
paddd 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3

cmpq $64,%rdx
jb .Ltail_ssse3

movdqu 0(%rsi),%xmm4
movdqu 16(%rsi),%xmm5
pxor %xmm4,%xmm0
movdqu 32(%rsi),%xmm4
pxor %xmm5,%xmm1
movdqu 48(%rsi),%xmm5
leaq 64(%rsi),%rsi
pxor %xmm4,%xmm2
pxor %xmm5,%xmm3

movdqu %xmm0,0(%rdi)
movdqu %xmm1,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
leaq 64(%rdi),%rdi

subq $64,%rdx
jnz .Loop_outer_ssse3

jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
movdqa %xmm0,0(%rsp)
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
xorq %rbx,%rbx

.Loop_tail_ssse3:
movzbl (%rsi,%rbx,1),%eax
movzbl (%rsp,%rbx,1),%ecx
leaq 1(%rbx),%rbx
xorl %ecx,%eax
movb %al,-1(%rdi,%rbx,1)
decq %rdx
jnz .Loop_tail_ssse3

.Ldone_ssse3:
addq $64+24,%rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
.byte 0xf3,0xc3
.size ChaCha20_ssse3,.-ChaCha20_ssse3
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
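# 4-way path. %r10 still holds OPENSSL_ia32cap_P words 1-2: after
# shr $32 its bit 5 is the AVX2 flag (CPUID(7).EBX bit 5), routing to
# the 8-way code. $71303168 is 1<<26|1<<22, isolating the XSAVE and
# MOVBE bits of CPUID(1).ECX; MOVBE without XSAVE presumably serves as
# an Atom-class heuristic, for which buffers of <=192 bytes are better
# served by the 1x SSSE3 path.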
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
jnz .LChaCha20_8x
cmpq $192,%rdx
ja .Lproceed4x

andq $71303168,%r11
cmpq $4194304,%r11
je .Ldo_sse3_after_all

.Lproceed4x:
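# Four blocks in parallel, structure-of-arrays: each xmm register holds
# one 32-bit state word across the four blocks. pshufd broadcasts every
# input word, and .Linc turns the broadcast counter into n,n+1,n+2,n+3;
# the broadcast state is parked on the stack (64(%rsp) up, addressed
# via %rcx = %rsp+256) for reloading in each outer iteration.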
leaq -120(%rsp),%r11
subq $0x148+0,%rsp
movdqa .Lsigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
movdqu (%r8),%xmm3
leaq 256(%rsp),%rcx
leaq .Lrot16(%rip),%r10
leaq .Lrot24(%rip),%r11

pshufd $0x00,%xmm11,%xmm8
pshufd $0x55,%xmm11,%xmm9
movdqa %xmm8,64(%rsp)
pshufd $0xaa,%xmm11,%xmm10
movdqa %xmm9,80(%rsp)
pshufd $0xff,%xmm11,%xmm11
movdqa %xmm10,96(%rsp)
movdqa %xmm11,112(%rsp)

pshufd $0x00,%xmm15,%xmm12
pshufd $0x55,%xmm15,%xmm13
movdqa %xmm12,128-256(%rcx)
pshufd $0xaa,%xmm15,%xmm14
movdqa %xmm13,144-256(%rcx)
pshufd $0xff,%xmm15,%xmm15
movdqa %xmm14,160-256(%rcx)
movdqa %xmm15,176-256(%rcx)

pshufd $0x00,%xmm7,%xmm4
pshufd $0x55,%xmm7,%xmm5
movdqa %xmm4,192-256(%rcx)
pshufd $0xaa,%xmm7,%xmm6
movdqa %xmm5,208-256(%rcx)
pshufd $0xff,%xmm7,%xmm7
movdqa %xmm6,224-256(%rcx)
movdqa %xmm7,240-256(%rcx)

pshufd $0x00,%xmm3,%xmm0
pshufd $0x55,%xmm3,%xmm1
paddd .Linc(%rip),%xmm0
pshufd $0xaa,%xmm3,%xmm2
movdqa %xmm1,272-256(%rcx)
pshufd $0xff,%xmm3,%xmm3
movdqa %xmm2,288-256(%rcx)
movdqa %xmm3,304-256(%rcx)

jmp .Loop_enter4x

.align 32
.Loop_outer4x:
movdqa 64(%rsp),%xmm8
movdqa 80(%rsp),%xmm9
movdqa 96(%rsp),%xmm10
movdqa 112(%rsp),%xmm11
movdqa 128-256(%rcx),%xmm12
movdqa 144-256(%rcx),%xmm13
movdqa 160-256(%rcx),%xmm14
movdqa 176-256(%rcx),%xmm15
movdqa 192-256(%rcx),%xmm4
movdqa 208-256(%rcx),%xmm5
movdqa 224-256(%rcx),%xmm6
movdqa 240-256(%rcx),%xmm7
movdqa 256-256(%rcx),%xmm0
movdqa 272-256(%rcx),%xmm1
movdqa 288-256(%rcx),%xmm2
movdqa 304-256(%rcx),%xmm3
paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
movdqa %xmm6,32(%rsp)
movdqa %xmm7,48(%rsp)
movdqa (%r10),%xmm7
movl $10,%eax
movdqa %xmm0,256-256(%rcx)
jmp .Loop4x

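# 4x double round: quarter rounds for two column pairs are interleaved,
# and with only 16 xmm registers two state words at a time are spilled
# to 0..63(%rsp) while %xmm6/%xmm7 double as scratch and as the
# .Lrot16/.Lrot24 pshufb masks (reloaded from (%r10)/(%r11)).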
.align 32
.Loop4x:
paddd %xmm12,%xmm8
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
pxor %xmm5,%xmm13
movdqa %xmm12,%xmm6
pslld $12,%xmm12
psrld $20,%xmm6
movdqa %xmm13,%xmm7
pslld $12,%xmm13
por %xmm6,%xmm12
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm13
paddd %xmm12,%xmm8
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
pxor %xmm5,%xmm13
movdqa %xmm12,%xmm7
pslld $7,%xmm12
psrld $25,%xmm7
movdqa %xmm13,%xmm6
pslld $7,%xmm13
por %xmm7,%xmm12
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm13
movdqa %xmm4,0(%rsp)
movdqa %xmm5,16(%rsp)
movdqa 32(%rsp),%xmm4
movdqa 48(%rsp),%xmm5
paddd %xmm14,%xmm10
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
pxor %xmm5,%xmm15
movdqa %xmm14,%xmm6
pslld $12,%xmm14
psrld $20,%xmm6
movdqa %xmm15,%xmm7
pslld $12,%xmm15
por %xmm6,%xmm14
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm15
paddd %xmm14,%xmm10
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
pxor %xmm5,%xmm15
movdqa %xmm14,%xmm7
pslld $7,%xmm14
psrld $25,%xmm7
movdqa %xmm15,%xmm6
pslld $7,%xmm15
por %xmm7,%xmm14
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm15
paddd %xmm13,%xmm8
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
pxor %xmm5,%xmm14
movdqa %xmm13,%xmm6
pslld $12,%xmm13
psrld $20,%xmm6
movdqa %xmm14,%xmm7
pslld $12,%xmm14
por %xmm6,%xmm13
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm14
paddd %xmm13,%xmm8
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
pxor %xmm5,%xmm14
movdqa %xmm13,%xmm7
pslld $7,%xmm13
psrld $25,%xmm7
movdqa %xmm14,%xmm6
pslld $7,%xmm14
por %xmm7,%xmm13
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm14
movdqa %xmm4,32(%rsp)
movdqa %xmm5,48(%rsp)
movdqa 0(%rsp),%xmm4
movdqa 16(%rsp),%xmm5
paddd %xmm15,%xmm10
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
pxor %xmm5,%xmm12
movdqa %xmm15,%xmm6
pslld $12,%xmm15
psrld $20,%xmm6
movdqa %xmm12,%xmm7
pslld $12,%xmm12
por %xmm6,%xmm15
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm12
paddd %xmm15,%xmm10
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
pxor %xmm5,%xmm12
movdqa %xmm15,%xmm7
pslld $7,%xmm15
psrld $25,%xmm7
movdqa %xmm12,%xmm6
pslld $7,%xmm12
por %xmm7,%xmm15
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm12
decl %eax
jnz .Loop4x

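# Feed-forward, then a 4x4 transpose (punpck{l,h}dq / punpck{l,h}qdq)
# per register group to turn the word-sliced state back into four
# consecutive 64-byte keystream blocks.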
paddd 64(%rsp),%xmm8
paddd 80(%rsp),%xmm9
paddd 96(%rsp),%xmm10
paddd 112(%rsp),%xmm11

movdqa %xmm8,%xmm6
punpckldq %xmm9,%xmm8
movdqa %xmm10,%xmm7
punpckldq %xmm11,%xmm10
punpckhdq %xmm9,%xmm6
punpckhdq %xmm11,%xmm7
movdqa %xmm8,%xmm9
punpcklqdq %xmm10,%xmm8
movdqa %xmm6,%xmm11
punpcklqdq %xmm7,%xmm6
punpckhqdq %xmm10,%xmm9
punpckhqdq %xmm7,%xmm11
paddd 128-256(%rcx),%xmm12
paddd 144-256(%rcx),%xmm13
paddd 160-256(%rcx),%xmm14
paddd 176-256(%rcx),%xmm15

movdqa %xmm8,0(%rsp)
movdqa %xmm9,16(%rsp)
movdqa 32(%rsp),%xmm8
movdqa 48(%rsp),%xmm9

movdqa %xmm12,%xmm10
punpckldq %xmm13,%xmm12
movdqa %xmm14,%xmm7
punpckldq %xmm15,%xmm14
punpckhdq %xmm13,%xmm10
punpckhdq %xmm15,%xmm7
movdqa %xmm12,%xmm13
punpcklqdq %xmm14,%xmm12
movdqa %xmm10,%xmm15
punpcklqdq %xmm7,%xmm10
punpckhqdq %xmm14,%xmm13
punpckhqdq %xmm7,%xmm15
paddd 192-256(%rcx),%xmm4
paddd 208-256(%rcx),%xmm5
paddd 224-256(%rcx),%xmm8
paddd 240-256(%rcx),%xmm9

movdqa %xmm6,32(%rsp)
movdqa %xmm11,48(%rsp)

movdqa %xmm4,%xmm14
punpckldq %xmm5,%xmm4
movdqa %xmm8,%xmm7
punpckldq %xmm9,%xmm8
punpckhdq %xmm5,%xmm14
punpckhdq %xmm9,%xmm7
movdqa %xmm4,%xmm5
punpcklqdq %xmm8,%xmm4
movdqa %xmm14,%xmm9
punpcklqdq %xmm7,%xmm14
punpckhqdq %xmm8,%xmm5
punpckhqdq %xmm7,%xmm9
paddd 256-256(%rcx),%xmm0
paddd 272-256(%rcx),%xmm1
paddd 288-256(%rcx),%xmm2
paddd 304-256(%rcx),%xmm3

movdqa %xmm0,%xmm8
punpckldq %xmm1,%xmm0
movdqa %xmm2,%xmm7
punpckldq %xmm3,%xmm2
punpckhdq %xmm1,%xmm8
punpckhdq %xmm3,%xmm7
movdqa %xmm0,%xmm1
punpcklqdq %xmm2,%xmm0
movdqa %xmm8,%xmm3
punpcklqdq %xmm7,%xmm8
punpckhqdq %xmm2,%xmm1
punpckhqdq %xmm7,%xmm3
cmpq $256,%rdx
jb .Ltail4x

movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7

movdqu %xmm6,64(%rdi)
movdqu 0(%rsi),%xmm6
movdqu %xmm11,80(%rdi)
movdqu 16(%rsi),%xmm11
movdqu %xmm2,96(%rdi)
movdqu 32(%rsi),%xmm2
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
movdqu 48(%rsi),%xmm7
pxor 32(%rsp),%xmm6
pxor %xmm10,%xmm11
pxor %xmm14,%xmm2
pxor %xmm8,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 48(%rsp),%xmm6
pxor %xmm15,%xmm11
pxor %xmm9,%xmm2
pxor %xmm3,%xmm7
movdqu %xmm6,64(%rdi)
movdqu %xmm11,80(%rdi)
movdqu %xmm2,96(%rdi)
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi

subq $256,%rdx
jnz .Loop_outer4x

jmp .Ldone4x

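# Tail handling for the last 1..255 bytes: copy out however many whole
# 64-byte blocks remain, stage the next keystream block at 0..63(%rsp),
# and finish in the byte-at-a-time loop.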
.Ltail4x:
cmpq $192,%rdx
jae .L192_or_more4x
cmpq $128,%rdx
jae .L128_or_more4x
cmpq $64,%rdx
jae .L64_or_more4x


xorq %r10,%r10

movdqa %xmm12,16(%rsp)
movdqa %xmm4,32(%rsp)
movdqa %xmm0,48(%rsp)
jmp .Loop_tail4x

.align 32
.L64_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7
movdqu %xmm6,0(%rdi)
movdqu %xmm11,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm7,48(%rdi)
je .Ldone4x

movdqa 16(%rsp),%xmm6
leaq 64(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm13,16(%rsp)
leaq 64(%rdi),%rdi
movdqa %xmm5,32(%rsp)
subq $64,%rdx
movdqa %xmm1,48(%rsp)
jmp .Loop_tail4x

.align 32
.L128_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7
movdqu %xmm6,64(%rdi)
movdqu %xmm11,80(%rdi)
movdqu %xmm2,96(%rdi)
movdqu %xmm7,112(%rdi)
je .Ldone4x

movdqa 32(%rsp),%xmm6
leaq 128(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm10,16(%rsp)
leaq 128(%rdi),%rdi
movdqa %xmm14,32(%rsp)
subq $128,%rdx
movdqa %xmm8,48(%rsp)
jmp .Loop_tail4x

.align 32
.L192_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7

movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7

movdqu %xmm6,64(%rdi)
movdqu 0(%rsi),%xmm6
movdqu %xmm11,80(%rdi)
movdqu 16(%rsi),%xmm11
movdqu %xmm2,96(%rdi)
movdqu 32(%rsi),%xmm2
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
movdqu 48(%rsi),%xmm7
pxor 32(%rsp),%xmm6
pxor %xmm10,%xmm11
pxor %xmm14,%xmm2
pxor %xmm8,%xmm7
movdqu %xmm6,0(%rdi)
movdqu %xmm11,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm7,48(%rdi)
je .Ldone4x

movdqa 48(%rsp),%xmm6
leaq 64(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm15,16(%rsp)
leaq 64(%rdi),%rdi
movdqa %xmm9,32(%rsp)
subq $192,%rdx
movdqa %xmm3,48(%rsp)

.Loop_tail4x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)
decq %rdx
jnz .Loop_tail4x

.Ldone4x:
addq $0x148+0,%rsp
.byte 0xf3,0xc3
.size ChaCha20_4x,.-ChaCha20_4x
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
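# 8-way AVX2 path: the same structure-of-arrays scheme with ymm
# registers, eight blocks per iteration, counters from .Lincy
# (0,2,4,6,1,3,5,7 to match the vbroadcasti128 lane layout). The frame
# is aligned to 32 bytes by hand, the caller's %rsp is saved at
# 640(%rsp), and vzeroupper avoids AVX/SSE transition penalties.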
movq %rsp,%r10
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
movq %r10,640(%rsp)

vbroadcasti128 .Lsigma(%rip),%ymm11
vbroadcasti128 (%rcx),%ymm3
vbroadcasti128 16(%rcx),%ymm15
vbroadcasti128 (%r8),%ymm7
leaq 256(%rsp),%rcx
leaq 512(%rsp),%rax
leaq .Lrot16(%rip),%r10
leaq .Lrot24(%rip),%r11

vpshufd $0x00,%ymm11,%ymm8
vpshufd $0x55,%ymm11,%ymm9
vmovdqa %ymm8,128-256(%rcx)
vpshufd $0xaa,%ymm11,%ymm10
vmovdqa %ymm9,160-256(%rcx)
vpshufd $0xff,%ymm11,%ymm11
vmovdqa %ymm10,192-256(%rcx)
vmovdqa %ymm11,224-256(%rcx)

vpshufd $0x00,%ymm3,%ymm0
vpshufd $0x55,%ymm3,%ymm1
vmovdqa %ymm0,256-256(%rcx)
vpshufd $0xaa,%ymm3,%ymm2
vmovdqa %ymm1,288-256(%rcx)
vpshufd $0xff,%ymm3,%ymm3
vmovdqa %ymm2,320-256(%rcx)
vmovdqa %ymm3,352-256(%rcx)

vpshufd $0x00,%ymm15,%ymm12
vpshufd $0x55,%ymm15,%ymm13
vmovdqa %ymm12,384-512(%rax)
vpshufd $0xaa,%ymm15,%ymm14
vmovdqa %ymm13,416-512(%rax)
vpshufd $0xff,%ymm15,%ymm15
vmovdqa %ymm14,448-512(%rax)
vmovdqa %ymm15,480-512(%rax)

vpshufd $0x00,%ymm7,%ymm4
vpshufd $0x55,%ymm7,%ymm5
vpaddd .Lincy(%rip),%ymm4,%ymm4
vpshufd $0xaa,%ymm7,%ymm6
vmovdqa %ymm5,544-512(%rax)
vpshufd $0xff,%ymm7,%ymm7
vmovdqa %ymm6,576-512(%rax)
vmovdqa %ymm7,608-512(%rax)

jmp .Loop_enter8x

.align 32
.Loop_outer8x:
vmovdqa 128-256(%rcx),%ymm8
vmovdqa 160-256(%rcx),%ymm9
vmovdqa 192-256(%rcx),%ymm10
vmovdqa 224-256(%rcx),%ymm11
vmovdqa 256-256(%rcx),%ymm0
vmovdqa 288-256(%rcx),%ymm1
vmovdqa 320-256(%rcx),%ymm2
vmovdqa 352-256(%rcx),%ymm3
vmovdqa 384-512(%rax),%ymm12
vmovdqa 416-512(%rax),%ymm13
vmovdqa 448-512(%rax),%ymm14
vmovdqa 480-512(%rax),%ymm15
vmovdqa 512-512(%rax),%ymm4
vmovdqa 544-512(%rax),%ymm5
vmovdqa 576-512(%rax),%ymm6
vmovdqa 608-512(%rax),%ymm7
vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
vmovdqa %ymm14,64(%rsp)
vmovdqa %ymm15,96(%rsp)
vbroadcasti128 (%r10),%ymm15
vmovdqa %ymm4,512-512(%rax)
movl $10,%eax
jmp .Loop8x

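# AVX2 double round, three-operand form: vpshufb does the 16/24-bit
# rotations, vpslld/vpsrld/vpor the 12/7-bit ones. As in the 4x code,
# two of the sixteen state ymm registers live at 64(%rsp)/96(%rsp)
# while %ymm14/%ymm15 serve as rotate masks and scratch.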
.align 32
.Loop8x:
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $12,%ymm0,%ymm14
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $12,%ymm1,%ymm15
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $7,%ymm0,%ymm15
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $7,%ymm1,%ymm14
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vmovdqa %ymm12,0(%rsp)
vmovdqa %ymm13,32(%rsp)
vmovdqa 64(%rsp),%ymm12
vmovdqa 96(%rsp),%ymm13
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $12,%ymm2,%ymm14
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $12,%ymm3,%ymm15
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $7,%ymm2,%ymm15
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $7,%ymm3,%ymm14
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $12,%ymm1,%ymm14
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $12,%ymm2,%ymm15
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $7,%ymm1,%ymm15
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $7,%ymm2,%ymm14
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vmovdqa %ymm12,64(%rsp)
vmovdqa %ymm13,96(%rsp)
vmovdqa 0(%rsp),%ymm12
vmovdqa 32(%rsp),%ymm13
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $12,%ymm3,%ymm14
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $12,%ymm0,%ymm15
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $7,%ymm3,%ymm15
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $7,%ymm0,%ymm14
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
decl %eax
jnz .Loop8x

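# Feed-forward and transpose back to byte order: vpunpck{l,h}{dq,qdq}
# transposes within 128-bit lanes, then vperm2i128 swaps lane halves so
# each ymm register holds 32 contiguous keystream bytes.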
leaq 512(%rsp),%rax
vpaddd 128-256(%rcx),%ymm8,%ymm8
vpaddd 160-256(%rcx),%ymm9,%ymm9
vpaddd 192-256(%rcx),%ymm10,%ymm10
vpaddd 224-256(%rcx),%ymm11,%ymm11

vpunpckldq %ymm9,%ymm8,%ymm14
vpunpckldq %ymm11,%ymm10,%ymm15
vpunpckhdq %ymm9,%ymm8,%ymm8
vpunpckhdq %ymm11,%ymm10,%ymm10
vpunpcklqdq %ymm15,%ymm14,%ymm9
vpunpckhqdq %ymm15,%ymm14,%ymm14
vpunpcklqdq %ymm10,%ymm8,%ymm11
vpunpckhqdq %ymm10,%ymm8,%ymm8
vpaddd 256-256(%rcx),%ymm0,%ymm0
vpaddd 288-256(%rcx),%ymm1,%ymm1
vpaddd 320-256(%rcx),%ymm2,%ymm2
vpaddd 352-256(%rcx),%ymm3,%ymm3

vpunpckldq %ymm1,%ymm0,%ymm10
vpunpckldq %ymm3,%ymm2,%ymm15
vpunpckhdq %ymm1,%ymm0,%ymm0
vpunpckhdq %ymm3,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm10,%ymm1
vpunpckhqdq %ymm15,%ymm10,%ymm10
vpunpcklqdq %ymm2,%ymm0,%ymm3
vpunpckhqdq %ymm2,%ymm0,%ymm0
vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
vmovdqa %ymm15,0(%rsp)
vmovdqa %ymm9,32(%rsp)
vmovdqa 64(%rsp),%ymm15
vmovdqa 96(%rsp),%ymm9

vpaddd 384-512(%rax),%ymm12,%ymm12
vpaddd 416-512(%rax),%ymm13,%ymm13
vpaddd 448-512(%rax),%ymm15,%ymm15
vpaddd 480-512(%rax),%ymm9,%ymm9

vpunpckldq %ymm13,%ymm12,%ymm2
vpunpckldq %ymm9,%ymm15,%ymm8
vpunpckhdq %ymm13,%ymm12,%ymm12
vpunpckhdq %ymm9,%ymm15,%ymm15
vpunpcklqdq %ymm8,%ymm2,%ymm13
vpunpckhqdq %ymm8,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm12,%ymm9
vpunpckhqdq %ymm15,%ymm12,%ymm12
vpaddd 512-512(%rax),%ymm4,%ymm4
vpaddd 544-512(%rax),%ymm5,%ymm5
vpaddd 576-512(%rax),%ymm6,%ymm6
vpaddd 608-512(%rax),%ymm7,%ymm7

vpunpckldq %ymm5,%ymm4,%ymm15
vpunpckldq %ymm7,%ymm6,%ymm8
vpunpckhdq %ymm5,%ymm4,%ymm4
vpunpckhdq %ymm7,%ymm6,%ymm6
vpunpcklqdq %ymm8,%ymm15,%ymm5
vpunpckhqdq %ymm8,%ymm15,%ymm15
vpunpcklqdq %ymm6,%ymm4,%ymm7
vpunpckhqdq %ymm6,%ymm4,%ymm4
vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
vmovdqa 0(%rsp),%ymm6
vmovdqa 32(%rsp),%ymm12

cmpq $512,%rdx
jb .Ltail8x

vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
leaq 128(%rsi),%rsi
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
leaq 128(%rdi),%rdi

vpxor 0(%rsi),%ymm12,%ymm12
vpxor 32(%rsi),%ymm13,%ymm13
vpxor 64(%rsi),%ymm10,%ymm10
vpxor 96(%rsi),%ymm15,%ymm15
leaq 128(%rsi),%rsi
vmovdqu %ymm12,0(%rdi)
vmovdqu %ymm13,32(%rdi)
vmovdqu %ymm10,64(%rdi)
vmovdqu %ymm15,96(%rdi)
leaq 128(%rdi),%rdi

vpxor 0(%rsi),%ymm14,%ymm14
vpxor 32(%rsi),%ymm2,%ymm2
vpxor 64(%rsi),%ymm3,%ymm3
vpxor 96(%rsi),%ymm7,%ymm7
leaq 128(%rsi),%rsi
vmovdqu %ymm14,0(%rdi)
vmovdqu %ymm2,32(%rdi)
vmovdqu %ymm3,64(%rdi)
vmovdqu %ymm7,96(%rdi)
leaq 128(%rdi),%rdi

vpxor 0(%rsi),%ymm11,%ymm11
vpxor 32(%rsi),%ymm9,%ymm9
vpxor 64(%rsi),%ymm0,%ymm0
vpxor 96(%rsi),%ymm4,%ymm4
leaq 128(%rsi),%rsi
vmovdqu %ymm11,0(%rdi)
vmovdqu %ymm9,32(%rdi)
vmovdqu %ymm0,64(%rdi)
vmovdqu %ymm4,96(%rdi)
leaq 128(%rdi),%rdi

subq $512,%rdx
jnz .Loop_outer8x

jmp .Ldone8x

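# 8x tail cascade in 64-byte steps: write out the whole blocks that
# remain, stage the next 64 bytes of keystream at 0(%rsp)/32(%rsp), and
# fall into the byte loop for the final partial block.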
.Ltail8x:
cmpq $448,%rdx
jae .L448_or_more8x
cmpq $384,%rdx
jae .L384_or_more8x
cmpq $320,%rdx
jae .L320_or_more8x
cmpq $256,%rdx
jae .L256_or_more8x
cmpq $192,%rdx
jae .L192_or_more8x
cmpq $128,%rdx
jae .L128_or_more8x
cmpq $64,%rdx
jae .L64_or_more8x

xorq %r10,%r10
vmovdqa %ymm6,0(%rsp)
vmovdqa %ymm8,32(%rsp)
jmp .Loop_tail8x

.align 32
.L64_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
je .Ldone8x

leaq 64(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm1,0(%rsp)
leaq 64(%rdi),%rdi
subq $64,%rdx
vmovdqa %ymm5,32(%rsp)
jmp .Loop_tail8x

.align 32
.L128_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
je .Ldone8x

leaq 128(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm12,0(%rsp)
leaq 128(%rdi),%rdi
subq $128,%rdx
vmovdqa %ymm13,32(%rsp)
jmp .Loop_tail8x

.align 32
.L192_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
je .Ldone8x

leaq 192(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm10,0(%rsp)
leaq 192(%rdi),%rdi
subq $192,%rdx
vmovdqa %ymm15,32(%rsp)
jmp .Loop_tail8x

.align 32
.L256_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
je .Ldone8x

leaq 256(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm14,0(%rsp)
leaq 256(%rdi),%rdi
subq $256,%rdx
vmovdqa %ymm2,32(%rsp)
jmp .Loop_tail8x

.align 32
.L320_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
je .Ldone8x

leaq 320(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm3,0(%rsp)
leaq 320(%rdi),%rdi
subq $320,%rdx
vmovdqa %ymm7,32(%rsp)
jmp .Loop_tail8x

.align 32
.L384_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
je .Ldone8x

leaq 384(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm11,0(%rsp)
leaq 384(%rdi),%rdi
subq $384,%rdx
vmovdqa %ymm9,32(%rsp)
jmp .Loop_tail8x

.align 32
.L448_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vpxor 384(%rsi),%ymm11,%ymm11
vpxor 416(%rsi),%ymm9,%ymm9
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
vmovdqu %ymm11,384(%rdi)
vmovdqu %ymm9,416(%rdi)
je .Ldone8x

leaq 448(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm0,0(%rsp)
leaq 448(%rdi),%rdi
subq $448,%rdx
vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)
decq %rdx
jnz .Loop_tail8x

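# vzeroall clears all vector registers on the way out, wiping keystream
# and expanded key material, before restoring the caller's %rsp.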
.Ldone8x:
vzeroall
movq 640(%rsp),%rsp
.byte 0xf3,0xc3
.size ChaCha20_8x,.-ChaCha20_8x
#endif