OLD | NEW |
| (Empty) |
1 default rel | |
2 %define XMMWORD | |
3 %define YMMWORD | |
4 %define ZMMWORD | |
5 section .text code align=64 | |
6 | |
7 | |
8 EXTERN OPENSSL_ia32cap_P | |
9 | |
; ---------------------------------------------------------------------
; Constant pool used by the ChaCha20 code paths in this file.
; The pool starts on a 64-byte boundary; the 16-byte entries are read
; with aligned loads (movdqa) by the code below.
; ---------------------------------------------------------------------
10 ALIGN 64 | |
; Four zero dwords (not referenced in the visible portion of this file).
11 $L$zero: | |
12 DD 0,0,0,0 | |
; Added with paddd to the state row loaded from [r8] to advance its
; first dword by one per 64-byte block (scalar and single-block SSSE3 paths).
13 $L$one: | |
14 DD 1,0,0,0 | |
; Per-lane offsets 0..3 added to the broadcast first dword of the [r8]
; row when the 4-way path sets up four consecutive blocks.
15 $L$inc: | |
16 DD 0,1,2,3 | |
; Added to the per-lane counters at the top of every further 4-way outer iteration.
17 $L$four: | |
18 DD 4,4,4,4 | |
; Per-lane offsets and per-iteration step used by the 8-way (ymm) path.
19 $L$incy: | |
20 DD 0,2,4,6,1,3,5,7 | |
21 $L$eight: | |
22 DD 8,8,8,8,8,8,8,8 | |
; pshufb byte-shuffle mask: rotates every 32-bit lane left by 16 bits.
23 $L$rot16: | |
24 DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd | |
; pshufb byte-shuffle mask: rotates every 32-bit lane left by 8 bits
; (equivalently right by 24).
25 $L$rot24: | |
26 DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe | |
; ASCII "expand 32-byte k": the 16-byte constant that forms state row 0.
27 $L$sigma: | |
28 DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 | |
29 DB 0 | |
; NUL-terminated ASCII credit string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
30 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 | |
31 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 | |
32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 | |
33 DB 108,46,111,114,103,62,0 | |
; ---------------------------------------------------------------------
; ChaCha20_ctr32 -- exported entry point, dispatcher and scalar code path.
;
; ABI: Win64. Arguments arrive in rcx, rdx, r8, r9 and [40+rsp]; the
; prologue parks the caller's rdi/rsi in the home slots at [8+rsp] and
; [16+rsp] and then remaps the arguments to:
;   rdi = output pointer (written 64 bytes at a time, then bytewise)
;   rsi = input pointer  (read and XORed with the keystream)
;   rdx = byte count     (0 returns immediately)
;   rcx -> 32 bytes loaded as state rows 1 and 2 (presumably the key)
;   r8  -> 16 bytes loaded as state row 3 (counter/nonce block)
;
; Dispatch: if bit 9 of the dword at OPENSSL_ia32cap_P+4 is set, control
; transfers to $L$ChaCha20_ssse3 with r10 still holding the loaded
; capability qword; otherwise the scalar loop below runs.
;
; Scalar register roles inside the round loop:
;   eax,ebx,ecx,edx = state words 0-3,  r8d-r11d = words 4-7,
;   esi,edi         = two of words 8-11 (the other two live at
;                     [32..47+rsp] and are swapped in and out),
;   r12d-r15d       = words 12-15, ebp = double-round counter.
; Stack frame (after 6 pushes and sub rsp,64+24):
;   [0..63+rsp]  keystream scratch (tail only); [16..63+rsp] input rows 1-3
;   [64+rsp] remaining length, [72+rsp] input ptr, [80+rsp] output ptr
; ---------------------------------------------------------------------
34 global ChaCha20_ctr32 | |
35 | |
36 ALIGN 64 | |
37 ChaCha20_ctr32: | |
38 mov QWORD[8+rsp],rdi ;WIN64 prologue | |
39 mov QWORD[16+rsp],rsi | |
40 mov rax,rsp | |
41 $L$SEH_begin_ChaCha20_ctr32: | |
42 mov rdi,rcx | |
43 mov rsi,rdx | |
44 mov rdx,r8 | |
45 mov rcx,r9 | |
46 mov r8,QWORD[40+rsp] | |
47 | |
48 | |
; Nothing to do for a zero-length request.
49 cmp rdx,0 | |
50 je NEAR $L$no_data | |
; Capability check: bit 9 selects the SIMD paths (presumably the SSSE3
; feature flag, given the branch target -- confirm against the
; OPENSSL_ia32cap_P layout).
51 mov r10,QWORD[((OPENSSL_ia32cap_P+4))] | |
52 test r10d,512 | |
53 jnz NEAR $L$ChaCha20_ssse3 | |
54 | |
; Scalar path: save the callee-saved GPRs that the round loop uses.
55 push rbx | |
56 push rbp | |
57 push r12 | |
58 push r13 | |
59 push r14 | |
60 push r15 | |
61 sub rsp,64+24 | |
62 | |
63 | |
; Load rows 1-3 of the input state; xmm4 = counter increment.
64 movdqu xmm1,XMMWORD[rcx] | |
65 movdqu xmm2,XMMWORD[16+rcx] | |
66 movdqu xmm3,XMMWORD[r8] | |
67 movdqa xmm4,XMMWORD[$L$one] | |
68 | |
69 | |
; Keep a copy of rows 1-3 on the stack for reloads and the feed-forward.
70 movdqa XMMWORD[16+rsp],xmm1 | |
71 movdqa XMMWORD[32+rsp],xmm2 | |
72 movdqa XMMWORD[48+rsp],xmm3 | |
; rbp = bytes remaining (rdx is needed as a state register below).
73 mov rbp,rdx | |
74 jmp NEAR $L$oop_outer | |
75 | |
76 ALIGN 32 | |
; ---- one iteration per 64-byte block --------------------------------
77 $L$oop_outer: | |
; Row 0: the four little-endian dwords of "expand 32-byte k".
78 mov eax,0x61707865 | |
79 mov ebx,0x3320646e | |
80 mov ecx,0x79622d32 | |
81 mov edx,0x6b206574 | |
; Row 1 from the stack copy.
82 mov r8d,DWORD[16+rsp] | |
83 mov r9d,DWORD[20+rsp] | |
84 mov r10d,DWORD[24+rsp] | |
85 mov r11d,DWORD[28+rsp] | |
; Row 3: word 12 (the block counter) comes from xmm3, which is advanced
; per block; words 13-15 come from the stack copy.
86 movd r12d,xmm3 | |
87 mov r13d,DWORD[52+rsp] | |
88 mov r14d,DWORD[56+rsp] | |
89 mov r15d,DWORD[60+rsp] | |
90 | |
; Spill length and both pointers: rbp, rsi and rdi are reused as the
; round counter and as state words 8/9 inside the loop.
91 mov QWORD[((64+0))+rsp],rbp | |
92 mov ebp,10 | |
93 mov QWORD[((64+8))+rsp],rsi | |
; Hand-encoded "movq rsi,xmm2": rsi = state words 8 (low) and 9 (high).
94 DB 102,72,15,126,214 | |
95 mov QWORD[((64+16))+rsp],rdi | |
; edi = word 9; esi keeps word 8 in its low half.
96 mov rdi,rsi | |
97 shr rdi,32 | |
98 jmp NEAR $L$oop | |
99 | |
100 ALIGN 32 | |
; ---- double round: column round followed by diagonal round; ebp counts
; ---- 10 iterations. Rotation amounts per quarter round: 16, 12, 8, 7.
101 $L$oop: | |
; Column quarter rounds on (0,4,8,12) and (1,5,9,13), interleaved.
102 add eax,r8d | |
103 xor r12d,eax | |
104 rol r12d,16 | |
105 add ebx,r9d | |
106 xor r13d,ebx | |
107 rol r13d,16 | |
108 add esi,r12d | |
109 xor r8d,esi | |
110 rol r8d,12 | |
111 add edi,r13d | |
112 xor r9d,edi | |
113 rol r9d,12 | |
114 add eax,r8d | |
115 xor r12d,eax | |
116 rol r12d,8 | |
117 add ebx,r9d | |
118 xor r13d,ebx | |
119 rol r13d,8 | |
120 add esi,r12d | |
121 xor r8d,esi | |
122 rol r8d,7 | |
123 add edi,r13d | |
124 xor r9d,edi | |
125 rol r9d,7 | |
; Swap working words: park words 8/9, fetch words 10/11.
126 mov DWORD[32+rsp],esi | |
127 mov DWORD[36+rsp],edi | |
128 mov esi,DWORD[40+rsp] | |
129 mov edi,DWORD[44+rsp] | |
; Column quarter rounds on (2,6,10,14) and (3,7,11,15).
130 add ecx,r10d | |
131 xor r14d,ecx | |
132 rol r14d,16 | |
133 add edx,r11d | |
134 xor r15d,edx | |
135 rol r15d,16 | |
136 add esi,r14d | |
137 xor r10d,esi | |
138 rol r10d,12 | |
139 add edi,r15d | |
140 xor r11d,edi | |
141 rol r11d,12 | |
142 add ecx,r10d | |
143 xor r14d,ecx | |
144 rol r14d,8 | |
145 add edx,r11d | |
146 xor r15d,edx | |
147 rol r15d,8 | |
148 add esi,r14d | |
149 xor r10d,esi | |
150 rol r10d,7 | |
151 add edi,r15d | |
152 xor r11d,edi | |
153 rol r11d,7 | |
; Diagonal quarter rounds on (0,5,10,15) and (1,6,11,12); esi/edi still
; hold words 10/11 here.
154 add eax,r9d | |
155 xor r15d,eax | |
156 rol r15d,16 | |
157 add ebx,r10d | |
158 xor r12d,ebx | |
159 rol r12d,16 | |
160 add esi,r15d | |
161 xor r9d,esi | |
162 rol r9d,12 | |
163 add edi,r12d | |
164 xor r10d,edi | |
165 rol r10d,12 | |
166 add eax,r9d | |
167 xor r15d,eax | |
168 rol r15d,8 | |
169 add ebx,r10d | |
170 xor r12d,ebx | |
171 rol r12d,8 | |
172 add esi,r15d | |
173 xor r9d,esi | |
174 rol r9d,7 | |
175 add edi,r12d | |
176 xor r10d,edi | |
177 rol r10d,7 | |
; Swap back: park words 10/11, fetch words 8/9.
178 mov DWORD[40+rsp],esi | |
179 mov DWORD[44+rsp],edi | |
180 mov esi,DWORD[32+rsp] | |
181 mov edi,DWORD[36+rsp] | |
; Diagonal quarter rounds on (2,7,8,13) and (3,4,9,14).
182 add ecx,r11d | |
183 xor r13d,ecx | |
184 rol r13d,16 | |
185 add edx,r8d | |
186 xor r14d,edx | |
187 rol r14d,16 | |
188 add esi,r13d | |
189 xor r11d,esi | |
190 rol r11d,12 | |
191 add edi,r14d | |
192 xor r8d,edi | |
193 rol r8d,12 | |
194 add ecx,r11d | |
195 xor r13d,ecx | |
196 rol r13d,8 | |
197 add edx,r8d | |
198 xor r14d,edx | |
199 rol r14d,8 | |
200 add esi,r13d | |
201 xor r11d,esi | |
202 rol r11d,7 | |
203 add edi,r14d | |
204 xor r8d,edi | |
205 rol r8d,7 | |
206 dec ebp | |
207 jnz NEAR $L$oop | |
; Rounds done. Flush words 8/9 so that [32..47+rsp] holds all of working
; row 2, then restore length and pointers.
208 mov DWORD[36+rsp],edi | |
209 mov DWORD[32+rsp],esi | |
210 mov rbp,QWORD[64+rsp] | |
; xmm1 = original row 2 (xmm2 is never modified); the working row 2 is
; added in below. xmm3 = counter row advanced for the next block.
211 movdqa xmm1,xmm2 | |
212 mov rsi,QWORD[((64+8))+rsp] | |
213 paddd xmm3,xmm4 | |
214 mov rdi,QWORD[((64+16))+rsp] | |
215 | |
; Feed-forward: add the input state to the permuted state.
216 add eax,0x61707865 | |
217 add ebx,0x3320646e | |
218 add ecx,0x79622d32 | |
219 add edx,0x6b206574 | |
220 add r8d,DWORD[16+rsp] | |
221 add r9d,DWORD[20+rsp] | |
222 add r10d,DWORD[24+rsp] | |
223 add r11d,DWORD[28+rsp] | |
; [48+rsp] still holds this block's counter; it is updated further down.
224 add r12d,DWORD[48+rsp] | |
225 add r13d,DWORD[52+rsp] | |
226 add r14d,DWORD[56+rsp] | |
227 add r15d,DWORD[60+rsp] | |
228 paddd xmm1,XMMWORD[32+rsp] | |
229 | |
; Fewer than 64 bytes left: finish bytewise.
230 cmp rbp,64 | |
231 jb NEAR $L$tail | |
232 | |
; Full block: XOR the keystream with 64 input bytes.
233 xor eax,DWORD[rsi] | |
234 xor ebx,DWORD[4+rsi] | |
235 xor ecx,DWORD[8+rsi] | |
236 xor edx,DWORD[12+rsi] | |
237 xor r8d,DWORD[16+rsi] | |
238 xor r9d,DWORD[20+rsi] | |
239 xor r10d,DWORD[24+rsi] | |
240 xor r11d,DWORD[28+rsi] | |
241 movdqu xmm0,XMMWORD[32+rsi] | |
242 xor r12d,DWORD[48+rsi] | |
243 xor r13d,DWORD[52+rsi] | |
244 xor r14d,DWORD[56+rsi] | |
245 xor r15d,DWORD[60+rsi] | |
246 lea rsi,[64+rsi] | |
247 pxor xmm0,xmm1 | |
248 | |
; Re-arm the stack copy for the next block: original row 2 and the
; advanced counter word.
249 movdqa XMMWORD[32+rsp],xmm2 | |
250 movd DWORD[48+rsp],xmm3 | |
251 | |
; Store 64 output bytes.
252 mov DWORD[rdi],eax | |
253 mov DWORD[4+rdi],ebx | |
254 mov DWORD[8+rdi],ecx | |
255 mov DWORD[12+rdi],edx | |
256 mov DWORD[16+rdi],r8d | |
257 mov DWORD[20+rdi],r9d | |
258 mov DWORD[24+rdi],r10d | |
259 mov DWORD[28+rdi],r11d | |
260 movdqu XMMWORD[32+rdi],xmm0 | |
261 mov DWORD[48+rdi],r12d | |
262 mov DWORD[52+rdi],r13d | |
263 mov DWORD[56+rdi],r14d | |
264 mov DWORD[60+rdi],r15d | |
265 lea rdi,[64+rdi] | |
266 | |
267 sub rbp,64 | |
268 jnz NEAR $L$oop_outer | |
269 | |
270 jmp NEAR $L$done | |
271 | |
272 ALIGN 16 | |
; ---- final partial block: spill the 64 keystream bytes to [0..63+rsp]
; ---- (this overwrites the saved input rows, which are no longer needed).
273 $L$tail: | |
274 mov DWORD[rsp],eax | |
275 mov DWORD[4+rsp],ebx | |
; rbx = byte index for the loop below (ebx was stored just above).
276 xor rbx,rbx | |
277 mov DWORD[8+rsp],ecx | |
278 mov DWORD[12+rsp],edx | |
279 mov DWORD[16+rsp],r8d | |
280 mov DWORD[20+rsp],r9d | |
281 mov DWORD[24+rsp],r10d | |
282 mov DWORD[28+rsp],r11d | |
283 movdqa XMMWORD[32+rsp],xmm1 | |
284 mov DWORD[48+rsp],r12d | |
285 mov DWORD[52+rsp],r13d | |
286 mov DWORD[56+rsp],r14d | |
287 mov DWORD[60+rsp],r15d | |
288 | |
; out[i] = in[i] XOR keystream[i] for the rbp (1..63) remaining bytes.
289 $L$oop_tail: | |
290 movzx eax,BYTE[rbx*1+rsi] | |
291 movzx edx,BYTE[rbx*1+rsp] | |
292 lea rbx,[1+rbx] | |
293 xor eax,edx | |
294 mov BYTE[((-1))+rbx*1+rdi],al | |
295 dec rbp | |
296 jnz NEAR $L$oop_tail | |
297 | |
; Release the frame and restore the callee-saved GPRs.
298 $L$done: | |
299 add rsp,64+24 | |
300 pop r15 | |
301 pop r14 | |
302 pop r13 | |
303 pop r12 | |
304 pop rbp | |
305 pop rbx | |
; Reload the caller's rdi/rsi from the home slots written by the prologue.
306 $L$no_data: | |
307 mov rdi,QWORD[8+rsp] ;WIN64 epilogue | |
308 mov rsi,QWORD[16+rsp] | |
309 DB 0F3h,0C3h ;repret | |
310 $L$SEH_end_ChaCha20_ctr32: | |
311 | |
; ---------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per iteration using xmm registers.
;
; Two ways in:
;   * the ChaCha20_ssse3 label, with its own Win64 prologue (same
;     argument remapping as ChaCha20_ctr32);
;   * the inner $L$ChaCha20_ssse3 label, jumped to from ChaCha20_ctr32
;     with rdi = output, rsi = input, rdx = byte count, rcx -> 32 bytes
;     for rows 1-2, r8 -> 16 bytes for row 3 already set up.
; Requests longer than 128 bytes are forwarded to $L$ChaCha20_4x, which
; may jump back to $L$do_sse3_after_all.
;
; Register roles: xmm0-xmm3 = state rows 0-3, xmm4/xmm5 = scratch,
; xmm6 = $L$rot16 mask, xmm7 = $L$rot24 mask, ebp = double-round counter,
; rbx = byte index in the tail loop.
; Stack frame (after 6 pushes and sub rsp,64+72):
;   [0..63+rsp]   input state rows 0-3 (keystream scratch in the tail)
;   [96+rsp], [112+rsp]  saved xmm6, xmm7 (callee-saved under Win64)
; ---------------------------------------------------------------------
312 ALIGN 32 | |
313 ChaCha20_ssse3: | |
314 mov QWORD[8+rsp],rdi ;WIN64 prologue | |
315 mov QWORD[16+rsp],rsi | |
316 mov rax,rsp | |
317 $L$SEH_begin_ChaCha20_ssse3: | |
318 mov rdi,rcx | |
319 mov rsi,rdx | |
320 mov rdx,r8 | |
321 mov rcx,r9 | |
322 mov r8,QWORD[40+rsp] | |
323 | |
324 | |
325 $L$ChaCha20_ssse3: | |
; More than two blocks of data: let the 4-way path decide.
326 cmp rdx,128 | |
327 ja NEAR $L$ChaCha20_4x | |
328 | |
329 $L$do_sse3_after_all: | |
330 push rbx | |
331 push rbp | |
332 push r12 | |
333 push r13 | |
334 push r14 | |
335 push r15 | |
336 | |
337 sub rsp,64+72 | |
; Preserve xmm6/xmm7 before they are loaded with the rotate masks.
338 movaps XMMWORD[(64+32)+rsp],xmm6 | |
339 movaps XMMWORD[(64+48)+rsp],xmm7 | |
; Assemble the input state: row 0 = sigma, rows 1-2 from [rcx], row 3 from [r8].
340 movdqa xmm0,XMMWORD[$L$sigma] | |
341 movdqu xmm1,XMMWORD[rcx] | |
342 movdqu xmm2,XMMWORD[16+rcx] | |
343 movdqu xmm3,XMMWORD[r8] | |
344 movdqa xmm6,XMMWORD[$L$rot16] | |
345 movdqa xmm7,XMMWORD[$L$rot24] | |
346 | |
; Keep the input state on the stack for the feed-forward and for reloads.
347 movdqa XMMWORD[rsp],xmm0 | |
348 movdqa XMMWORD[16+rsp],xmm1 | |
349 movdqa XMMWORD[32+rsp],xmm2 | |
350 movdqa XMMWORD[48+rsp],xmm3 | |
351 mov ebp,10 | |
352 jmp NEAR $L$oop_ssse3 | |
353 | |
354 ALIGN 32 | |
; ---- subsequent blocks: reload the state and advance the counter -----
355 $L$oop_outer_ssse3: | |
356 movdqa xmm3,XMMWORD[$L$one] | |
357 movdqa xmm0,XMMWORD[rsp] | |
358 movdqa xmm1,XMMWORD[16+rsp] | |
359 movdqa xmm2,XMMWORD[32+rsp] | |
; xmm3 = stored row 3 with its first dword incremented; written back so
; the feed-forward uses the same value.
360 paddd xmm3,XMMWORD[48+rsp] | |
361 mov ebp,10 | |
362 movdqa XMMWORD[48+rsp],xmm3 | |
363 jmp NEAR $L$oop_ssse3 | |
364 | |
365 ALIGN 32 | |
; ---- double round, 10 iterations. Rotations by 16 and 8 use pshufb
; ---- (hand-encoded as DB bytes); rotations by 12 and 7 use shift+or.
366 $L$oop_ssse3: | |
; First half: quarter round applied to all four columns at once.
367 paddd xmm0,xmm1 | |
368 pxor xmm3,xmm0 | |
; pshufb xmm3,xmm6  (rotate left 16)
369 DB 102,15,56,0,222 | |
370 paddd xmm2,xmm3 | |
371 pxor xmm1,xmm2 | |
; xmm1 = rotate left 12
372 movdqa xmm4,xmm1 | |
373 psrld xmm1,20 | |
374 pslld xmm4,12 | |
375 por xmm1,xmm4 | |
376 paddd xmm0,xmm1 | |
377 pxor xmm3,xmm0 | |
; pshufb xmm3,xmm7  (rotate left 8)
378 DB 102,15,56,0,223 | |
379 paddd xmm2,xmm3 | |
380 pxor xmm1,xmm2 | |
; xmm1 = rotate left 7
381 movdqa xmm4,xmm1 | |
382 psrld xmm1,25 | |
383 pslld xmm4,7 | |
384 por xmm1,xmm4 | |
; Rotate the dword lanes of rows 1-3 by one, two and three positions so
; that the diagonals line up as columns.
385 pshufd xmm2,xmm2,78 | |
386 pshufd xmm1,xmm1,57 | |
387 pshufd xmm3,xmm3,147 | |
388 nop | |
; Second half: the same quarter round, now acting on the diagonals.
389 paddd xmm0,xmm1 | |
390 pxor xmm3,xmm0 | |
; pshufb xmm3,xmm6  (rotate left 16)
391 DB 102,15,56,0,222 | |
392 paddd xmm2,xmm3 | |
393 pxor xmm1,xmm2 | |
394 movdqa xmm4,xmm1 | |
395 psrld xmm1,20 | |
396 pslld xmm4,12 | |
397 por xmm1,xmm4 | |
398 paddd xmm0,xmm1 | |
399 pxor xmm3,xmm0 | |
; pshufb xmm3,xmm7  (rotate left 8)
400 DB 102,15,56,0,223 | |
401 paddd xmm2,xmm3 | |
402 pxor xmm1,xmm2 | |
403 movdqa xmm4,xmm1 | |
404 psrld xmm1,25 | |
405 pslld xmm4,7 | |
406 por xmm1,xmm4 | |
; Undo the lane rotation (inverse shuffles of the ones above).
407 pshufd xmm2,xmm2,78 | |
408 pshufd xmm1,xmm1,147 | |
409 pshufd xmm3,xmm3,57 | |
410 dec ebp | |
411 jnz NEAR $L$oop_ssse3 | |
; Feed-forward: add the input state kept on the stack.
412 paddd xmm0,XMMWORD[rsp] | |
413 paddd xmm1,XMMWORD[16+rsp] | |
414 paddd xmm2,XMMWORD[32+rsp] | |
415 paddd xmm3,XMMWORD[48+rsp] | |
416 | |
417 cmp rdx,64 | |
418 jb NEAR $L$tail_ssse3 | |
419 | |
; Full block: XOR 64 input bytes with the keystream and store them.
420 movdqu xmm4,XMMWORD[rsi] | |
421 movdqu xmm5,XMMWORD[16+rsi] | |
422 pxor xmm0,xmm4 | |
423 movdqu xmm4,XMMWORD[32+rsi] | |
424 pxor xmm1,xmm5 | |
425 movdqu xmm5,XMMWORD[48+rsi] | |
426 lea rsi,[64+rsi] | |
427 pxor xmm2,xmm4 | |
428 pxor xmm3,xmm5 | |
429 | |
430 movdqu XMMWORD[rdi],xmm0 | |
431 movdqu XMMWORD[16+rdi],xmm1 | |
432 movdqu XMMWORD[32+rdi],xmm2 | |
433 movdqu XMMWORD[48+rdi],xmm3 | |
434 lea rdi,[64+rdi] | |
435 | |
436 sub rdx,64 | |
437 jnz NEAR $L$oop_outer_ssse3 | |
438 | |
439 jmp NEAR $L$done_ssse3 | |
440 | |
441 ALIGN 16 | |
; ---- final partial block: spill the keystream over the saved state ----
442 $L$tail_ssse3: | |
443 movdqa XMMWORD[rsp],xmm0 | |
444 movdqa XMMWORD[16+rsp],xmm1 | |
445 movdqa XMMWORD[32+rsp],xmm2 | |
446 movdqa XMMWORD[48+rsp],xmm3 | |
447 xor rbx,rbx | |
448 | |
; out[i] = in[i] XOR keystream[i] for the rdx (1..63) remaining bytes.
449 $L$oop_tail_ssse3: | |
450 movzx eax,BYTE[rbx*1+rsi] | |
451 movzx ecx,BYTE[rbx*1+rsp] | |
452 lea rbx,[1+rbx] | |
453 xor eax,ecx | |
454 mov BYTE[((-1))+rbx*1+rdi],al | |
455 dec rdx | |
456 jnz NEAR $L$oop_tail_ssse3 | |
457 | |
; Restore xmm6/xmm7, release the frame, restore the GPRs.
458 $L$done_ssse3: | |
459 movaps xmm6,XMMWORD[((64+32))+rsp] | |
460 movaps xmm7,XMMWORD[((64+48))+rsp] | |
461 add rsp,64+72 | |
462 pop r15 | |
463 pop r14 | |
464 pop r13 | |
465 pop r12 | |
466 pop rbp | |
467 pop rbx | |
468 mov rdi,QWORD[8+rsp] ;WIN64 epilogue | |
469 mov rsi,QWORD[16+rsp] | |
470 DB 0F3h,0C3h ;repret | |
471 $L$SEH_end_ChaCha20_ssse3: | |
472 | |
; ---------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks (256 bytes) per outer iteration.
;
; Normally reached through the inner $L$ChaCha20_4x label from
; $L$ChaCha20_ssse3 with rdi = output, rsi = input, rdx = byte count,
; rcx -> 32 bytes for rows 1-2, r8 -> 16 bytes for row 3, and r10 = the
; capability qword loaded by ChaCha20_ctr32.
; NOTE(review): the prologue at the ChaCha20_4x label itself does not
; load r10 before the dispatch below reads it -- confirm nothing calls
; that label directly.
;
; Data layout: each xmm register holds ONE state word for FOUR blocks
; (one block per 32-bit lane):
;   xmm8-xmm11  = words 0-3     xmm12-xmm15 = words 4-7
;   xmm4,xmm5   = two of words 8-11 (the other two live in stack scratch)
;   xmm0-xmm3   = words 12-15   xmm6,xmm7   = scratch / rotate masks
;   r10 -> $L$rot16, r11 -> $L$rot24, eax = double-round counter
; Stack frame (sub rsp,0x148+160, no pushes):
;   [0..63+rsp]      scratch for spilled words / transposed output
;   [64..127+rsp]    broadcast row 0
;   [128..319+rsp]   broadcast rows 1-3, addressed via rcx = rsp+256
;   [320..479+rsp]   saved xmm6-xmm15 (callee-saved under Win64)
; ---------------------------------------------------------------------
473 ALIGN 32 | |
474 ChaCha20_4x: | |
475 mov QWORD[8+rsp],rdi ;WIN64 prologue | |
476 mov QWORD[16+rsp],rsi | |
477 mov rax,rsp | |
478 $L$SEH_begin_ChaCha20_4x: | |
479 mov rdi,rcx | |
480 mov rsi,rdx | |
481 mov rdx,r8 | |
482 mov rcx,r9 | |
483 mov r8,QWORD[40+rsp] | |
484 | |
485 | |
486 $L$ChaCha20_4x: | |
; r10 >> 32 exposes the dword at OPENSSL_ia32cap_P+8; bit 5 of it routes
; to the 8-way path (presumably the AVX2 flag -- confirm).
487 mov r11,r10 | |
488 shr r10,32 | |
489 test r10,32 | |
490 jnz NEAR $L$ChaCha20_8x | |
; Inputs longer than 192 bytes always take the 4-way path.
491 cmp rdx,192 | |
492 ja NEAR $L$proceed4x | |
493 | |
; For 129..192 bytes: if, of capability bits 22 and 26, only bit 22 is
; set, fall back to the single-block SSSE3 loop.
; NOTE(review): upstream describes this mask as a check for Atom-class
; CPUs -- confirm.
494 and r11,71303168 | |
495 cmp r11,4194304 | |
496 je NEAR $L$do_sse3_after_all | |
497 | |
498 $L$proceed4x: | |
; r11 = anchor for the xmm save area, computed from the pre-adjustment
; rsp; the ten stores below land at [320..479] of the new frame.
499 lea r11,[((-120))+rsp] | |
500 sub rsp,0x148+160 | |
501 movaps XMMWORD[(-48)+r11],xmm6 | |
502 movaps XMMWORD[(-32)+r11],xmm7 | |
503 movaps XMMWORD[(-16)+r11],xmm8 | |
504 movaps XMMWORD[r11],xmm9 | |
505 movaps XMMWORD[16+r11],xmm10 | |
506 movaps XMMWORD[32+r11],xmm11 | |
507 movaps XMMWORD[48+r11],xmm12 | |
508 movaps XMMWORD[64+r11],xmm13 | |
509 movaps XMMWORD[80+r11],xmm14 | |
510 movaps XMMWORD[96+r11],xmm15 | |
; Load the four state rows, then re-point rcx/r10/r11 at the stack state
; area and the two rotate masks.
511 movdqa xmm11,XMMWORD[$L$sigma] | |
512 movdqu xmm15,XMMWORD[rcx] | |
513 movdqu xmm7,XMMWORD[16+rcx] | |
514 movdqu xmm3,XMMWORD[r8] | |
515 lea rcx,[256+rsp] | |
516 lea r10,[$L$rot16] | |
517 lea r11,[$L$rot24] | |
518 | |
; Broadcast each dword of row 0 across a full register and save it.
519 pshufd xmm8,xmm11,0x00 | |
520 pshufd xmm9,xmm11,0x55 | |
521 movdqa XMMWORD[64+rsp],xmm8 | |
522 pshufd xmm10,xmm11,0xaa | |
523 movdqa XMMWORD[80+rsp],xmm9 | |
524 pshufd xmm11,xmm11,0xff | |
525 movdqa XMMWORD[96+rsp],xmm10 | |
526 movdqa XMMWORD[112+rsp],xmm11 | |
527 | |
; Same for row 1.
528 pshufd xmm12,xmm15,0x00 | |
529 pshufd xmm13,xmm15,0x55 | |
530 movdqa XMMWORD[(128-256)+rcx],xmm12 | |
531 pshufd xmm14,xmm15,0xaa | |
532 movdqa XMMWORD[(144-256)+rcx],xmm13 | |
533 pshufd xmm15,xmm15,0xff | |
534 movdqa XMMWORD[(160-256)+rcx],xmm14 | |
535 movdqa XMMWORD[(176-256)+rcx],xmm15 | |
536 | |
; Same for row 2.
537 pshufd xmm4,xmm7,0x00 | |
538 pshufd xmm5,xmm7,0x55 | |
539 movdqa XMMWORD[(192-256)+rcx],xmm4 | |
540 pshufd xmm6,xmm7,0xaa | |
541 movdqa XMMWORD[(208-256)+rcx],xmm5 | |
542 pshufd xmm7,xmm7,0xff | |
543 movdqa XMMWORD[(224-256)+rcx],xmm6 | |
544 movdqa XMMWORD[(240-256)+rcx],xmm7 | |
545 | |
; Row 3: word 12 gets per-lane offsets 0..3 so the four lanes work on
; consecutive blocks; it is stored at $L$oop_enter4x.
546 pshufd xmm0,xmm3,0x00 | |
547 pshufd xmm1,xmm3,0x55 | |
548 paddd xmm0,XMMWORD[$L$inc] | |
549 pshufd xmm2,xmm3,0xaa | |
550 movdqa XMMWORD[(272-256)+rcx],xmm1 | |
551 pshufd xmm3,xmm3,0xff | |
552 movdqa XMMWORD[(288-256)+rcx],xmm2 | |
553 movdqa XMMWORD[(304-256)+rcx],xmm3 | |
554 | |
555 jmp NEAR $L$oop_enter4x | |
556 | |
557 ALIGN 32 | |
; ---- subsequent groups of four blocks: reload the broadcast state and
; ---- advance every lane's counter by 4.
558 $L$oop_outer4x: | |
559 movdqa xmm8,XMMWORD[64+rsp] | |
560 movdqa xmm9,XMMWORD[80+rsp] | |
561 movdqa xmm10,XMMWORD[96+rsp] | |
562 movdqa xmm11,XMMWORD[112+rsp] | |
563 movdqa xmm12,XMMWORD[((128-256))+rcx] | |
564 movdqa xmm13,XMMWORD[((144-256))+rcx] | |
565 movdqa xmm14,XMMWORD[((160-256))+rcx] | |
566 movdqa xmm15,XMMWORD[((176-256))+rcx] | |
567 movdqa xmm4,XMMWORD[((192-256))+rcx] | |
568 movdqa xmm5,XMMWORD[((208-256))+rcx] | |
569 movdqa xmm6,XMMWORD[((224-256))+rcx] | |
570 movdqa xmm7,XMMWORD[((240-256))+rcx] | |
571 movdqa xmm0,XMMWORD[((256-256))+rcx] | |
572 movdqa xmm1,XMMWORD[((272-256))+rcx] | |
573 movdqa xmm2,XMMWORD[((288-256))+rcx] | |
574 movdqa xmm3,XMMWORD[((304-256))+rcx] | |
575 paddd xmm0,XMMWORD[$L$four] | |
576 | |
577 $L$oop_enter4x: | |
; Park words 10/11 in scratch, load the rot16 mask into xmm7, and save
; the current per-lane counters for the feed-forward.
578 movdqa XMMWORD[32+rsp],xmm6 | |
579 movdqa XMMWORD[48+rsp],xmm7 | |
580 movdqa xmm7,XMMWORD[r10] | |
581 mov eax,10 | |
582 movdqa XMMWORD[(256-256)+rcx],xmm0 | |
583 jmp NEAR $L$oop4x | |
584 | |
585 ALIGN 32 | |
; ---- double round, 10 iterations. xmm7 holds rot16 and xmm6 holds rot24
; ---- only while needed; both double as temporaries for the shift-based
; ---- rotations, hence the repeated mask reloads from [r10]/[r11].
586 $L$oop4x: | |
; Column quarter rounds on (0,4,8,12) and (1,5,9,13).
587 paddd xmm8,xmm12 | |
588 paddd xmm9,xmm13 | |
589 pxor xmm0,xmm8 | |
590 pxor xmm1,xmm9 | |
; pshufb xmm0,xmm7 / pshufb xmm1,xmm7  (rotate left 16)
591 DB 102,15,56,0,199 | |
592 DB 102,15,56,0,207 | |
593 paddd xmm4,xmm0 | |
594 paddd xmm5,xmm1 | |
595 pxor xmm12,xmm4 | |
596 pxor xmm13,xmm5 | |
; rotate left 12 via shift/shift/or
597 movdqa xmm6,xmm12 | |
598 pslld xmm12,12 | |
599 psrld xmm6,20 | |
600 movdqa xmm7,xmm13 | |
601 pslld xmm13,12 | |
602 por xmm12,xmm6 | |
603 psrld xmm7,20 | |
604 movdqa xmm6,XMMWORD[r11] | |
605 por xmm13,xmm7 | |
606 paddd xmm8,xmm12 | |
607 paddd xmm9,xmm13 | |
608 pxor xmm0,xmm8 | |
609 pxor xmm1,xmm9 | |
; pshufb xmm0,xmm6 / pshufb xmm1,xmm6  (rotate left 8)
610 DB 102,15,56,0,198 | |
611 DB 102,15,56,0,206 | |
612 paddd xmm4,xmm0 | |
613 paddd xmm5,xmm1 | |
614 pxor xmm12,xmm4 | |
615 pxor xmm13,xmm5 | |
; rotate left 7
616 movdqa xmm7,xmm12 | |
617 pslld xmm12,7 | |
618 psrld xmm7,25 | |
619 movdqa xmm6,xmm13 | |
620 pslld xmm13,7 | |
621 por xmm12,xmm7 | |
622 psrld xmm6,25 | |
623 movdqa xmm7,XMMWORD[r10] | |
624 por xmm13,xmm6 | |
; Swap working words: park words 8/9, fetch words 10/11.
625 movdqa XMMWORD[rsp],xmm4 | |
626 movdqa XMMWORD[16+rsp],xmm5 | |
627 movdqa xmm4,XMMWORD[32+rsp] | |
628 movdqa xmm5,XMMWORD[48+rsp] | |
; Column quarter rounds on (2,6,10,14) and (3,7,11,15).
629 paddd xmm10,xmm14 | |
630 paddd xmm11,xmm15 | |
631 pxor xmm2,xmm10 | |
632 pxor xmm3,xmm11 | |
; pshufb xmm2,xmm7 / pshufb xmm3,xmm7  (rotate left 16)
633 DB 102,15,56,0,215 | |
634 DB 102,15,56,0,223 | |
635 paddd xmm4,xmm2 | |
636 paddd xmm5,xmm3 | |
637 pxor xmm14,xmm4 | |
638 pxor xmm15,xmm5 | |
639 movdqa xmm6,xmm14 | |
640 pslld xmm14,12 | |
641 psrld xmm6,20 | |
642 movdqa xmm7,xmm15 | |
643 pslld xmm15,12 | |
644 por xmm14,xmm6 | |
645 psrld xmm7,20 | |
646 movdqa xmm6,XMMWORD[r11] | |
647 por xmm15,xmm7 | |
648 paddd xmm10,xmm14 | |
649 paddd xmm11,xmm15 | |
650 pxor xmm2,xmm10 | |
651 pxor xmm3,xmm11 | |
; pshufb xmm2,xmm6 / pshufb xmm3,xmm6  (rotate left 8)
652 DB 102,15,56,0,214 | |
653 DB 102,15,56,0,222 | |
654 paddd xmm4,xmm2 | |
655 paddd xmm5,xmm3 | |
656 pxor xmm14,xmm4 | |
657 pxor xmm15,xmm5 | |
658 movdqa xmm7,xmm14 | |
659 pslld xmm14,7 | |
660 psrld xmm7,25 | |
661 movdqa xmm6,xmm15 | |
662 pslld xmm15,7 | |
663 por xmm14,xmm7 | |
664 psrld xmm6,25 | |
665 movdqa xmm7,XMMWORD[r10] | |
666 por xmm15,xmm6 | |
; Diagonal quarter rounds on (0,5,10,15) and (1,6,11,12); xmm4/xmm5 still
; hold words 10/11.
667 paddd xmm8,xmm13 | |
668 paddd xmm9,xmm14 | |
669 pxor xmm3,xmm8 | |
670 pxor xmm0,xmm9 | |
; pshufb xmm3,xmm7 / pshufb xmm0,xmm7  (rotate left 16)
671 DB 102,15,56,0,223 | |
672 DB 102,15,56,0,199 | |
673 paddd xmm4,xmm3 | |
674 paddd xmm5,xmm0 | |
675 pxor xmm13,xmm4 | |
676 pxor xmm14,xmm5 | |
677 movdqa xmm6,xmm13 | |
678 pslld xmm13,12 | |
679 psrld xmm6,20 | |
680 movdqa xmm7,xmm14 | |
681 pslld xmm14,12 | |
682 por xmm13,xmm6 | |
683 psrld xmm7,20 | |
684 movdqa xmm6,XMMWORD[r11] | |
685 por xmm14,xmm7 | |
686 paddd xmm8,xmm13 | |
687 paddd xmm9,xmm14 | |
688 pxor xmm3,xmm8 | |
689 pxor xmm0,xmm9 | |
; pshufb xmm3,xmm6 / pshufb xmm0,xmm6  (rotate left 8)
690 DB 102,15,56,0,222 | |
691 DB 102,15,56,0,198 | |
692 paddd xmm4,xmm3 | |
693 paddd xmm5,xmm0 | |
694 pxor xmm13,xmm4 | |
695 pxor xmm14,xmm5 | |
696 movdqa xmm7,xmm13 | |
697 pslld xmm13,7 | |
698 psrld xmm7,25 | |
699 movdqa xmm6,xmm14 | |
700 pslld xmm14,7 | |
701 por xmm13,xmm7 | |
702 psrld xmm6,25 | |
703 movdqa xmm7,XMMWORD[r10] | |
704 por xmm14,xmm6 | |
; Swap back: park words 10/11, fetch words 8/9.
705 movdqa XMMWORD[32+rsp],xmm4 | |
706 movdqa XMMWORD[48+rsp],xmm5 | |
707 movdqa xmm4,XMMWORD[rsp] | |
708 movdqa xmm5,XMMWORD[16+rsp] | |
; Diagonal quarter rounds on (2,7,8,13) and (3,4,9,14).
709 paddd xmm10,xmm15 | |
710 paddd xmm11,xmm12 | |
711 pxor xmm1,xmm10 | |
712 pxor xmm2,xmm11 | |
; pshufb xmm1,xmm7 / pshufb xmm2,xmm7  (rotate left 16)
713 DB 102,15,56,0,207 | |
714 DB 102,15,56,0,215 | |
715 paddd xmm4,xmm1 | |
716 paddd xmm5,xmm2 | |
717 pxor xmm15,xmm4 | |
718 pxor xmm12,xmm5 | |
719 movdqa xmm6,xmm15 | |
720 pslld xmm15,12 | |
721 psrld xmm6,20 | |
722 movdqa xmm7,xmm12 | |
723 pslld xmm12,12 | |
724 por xmm15,xmm6 | |
725 psrld xmm7,20 | |
726 movdqa xmm6,XMMWORD[r11] | |
727 por xmm12,xmm7 | |
728 paddd xmm10,xmm15 | |
729 paddd xmm11,xmm12 | |
730 pxor xmm1,xmm10 | |
731 pxor xmm2,xmm11 | |
; pshufb xmm1,xmm6 / pshufb xmm2,xmm6  (rotate left 8)
732 DB 102,15,56,0,206 | |
733 DB 102,15,56,0,214 | |
734 paddd xmm4,xmm1 | |
735 paddd xmm5,xmm2 | |
736 pxor xmm15,xmm4 | |
737 pxor xmm12,xmm5 | |
738 movdqa xmm7,xmm15 | |
739 pslld xmm15,7 | |
740 psrld xmm7,25 | |
741 movdqa xmm6,xmm12 | |
742 pslld xmm12,7 | |
743 por xmm15,xmm7 | |
744 psrld xmm6,25 | |
745 movdqa xmm7,XMMWORD[r10] | |
746 por xmm12,xmm6 | |
747 dec eax | |
748 jnz NEAR $L$oop4x | |
749 | |
; ---- feed-forward plus 4x4 dword transpose, one row at a time. The
; ---- punpck sequences turn "one word x four blocks" registers into
; ---- "four consecutive words of one block" registers.
; Row 0 -> xmm8, xmm9, xmm6, xmm11 (blocks 0..3).
750 paddd xmm8,XMMWORD[64+rsp] | |
751 paddd xmm9,XMMWORD[80+rsp] | |
752 paddd xmm10,XMMWORD[96+rsp] | |
753 paddd xmm11,XMMWORD[112+rsp] | |
754 | |
755 movdqa xmm6,xmm8 | |
756 punpckldq xmm8,xmm9 | |
757 movdqa xmm7,xmm10 | |
758 punpckldq xmm10,xmm11 | |
759 punpckhdq xmm6,xmm9 | |
760 punpckhdq xmm7,xmm11 | |
761 movdqa xmm9,xmm8 | |
762 punpcklqdq xmm8,xmm10 | |
763 movdqa xmm11,xmm6 | |
764 punpcklqdq xmm6,xmm7 | |
765 punpckhqdq xmm9,xmm10 | |
766 punpckhqdq xmm11,xmm7 | |
; Row 1 feed-forward.
767 paddd xmm12,XMMWORD[((128-256))+rcx] | |
768 paddd xmm13,XMMWORD[((144-256))+rcx] | |
769 paddd xmm14,XMMWORD[((160-256))+rcx] | |
770 paddd xmm15,XMMWORD[((176-256))+rcx] | |
771 | |
; Spill row-0 output for blocks 0/1 and fetch the parked words 10/11.
772 movdqa XMMWORD[rsp],xmm8 | |
773 movdqa XMMWORD[16+rsp],xmm9 | |
774 movdqa xmm8,XMMWORD[32+rsp] | |
775 movdqa xmm9,XMMWORD[48+rsp] | |
776 | |
; Row 1 -> xmm12, xmm13, xmm10, xmm15 (blocks 0..3).
777 movdqa xmm10,xmm12 | |
778 punpckldq xmm12,xmm13 | |
779 movdqa xmm7,xmm14 | |
780 punpckldq xmm14,xmm15 | |
781 punpckhdq xmm10,xmm13 | |
782 punpckhdq xmm7,xmm15 | |
783 movdqa xmm13,xmm12 | |
784 punpcklqdq xmm12,xmm14 | |
785 movdqa xmm15,xmm10 | |
786 punpcklqdq xmm10,xmm7 | |
787 punpckhqdq xmm13,xmm14 | |
788 punpckhqdq xmm15,xmm7 | |
; Row 2 feed-forward.
789 paddd xmm4,XMMWORD[((192-256))+rcx] | |
790 paddd xmm5,XMMWORD[((208-256))+rcx] | |
791 paddd xmm8,XMMWORD[((224-256))+rcx] | |
792 paddd xmm9,XMMWORD[((240-256))+rcx] | |
793 | |
; Spill row-0 output for blocks 2/3.
794 movdqa XMMWORD[32+rsp],xmm6 | |
795 movdqa XMMWORD[48+rsp],xmm11 | |
796 | |
; Row 2 -> xmm4, xmm5, xmm14, xmm9 (blocks 0..3).
797 movdqa xmm14,xmm4 | |
798 punpckldq xmm4,xmm5 | |
799 movdqa xmm7,xmm8 | |
800 punpckldq xmm8,xmm9 | |
801 punpckhdq xmm14,xmm5 | |
802 punpckhdq xmm7,xmm9 | |
803 movdqa xmm5,xmm4 | |
804 punpcklqdq xmm4,xmm8 | |
805 movdqa xmm9,xmm14 | |
806 punpcklqdq xmm14,xmm7 | |
807 punpckhqdq xmm5,xmm8 | |
808 punpckhqdq xmm9,xmm7 | |
; Row 3 feed-forward (the saved per-lane counters sit at [rcx]).
809 paddd xmm0,XMMWORD[((256-256))+rcx] | |
810 paddd xmm1,XMMWORD[((272-256))+rcx] | |
811 paddd xmm2,XMMWORD[((288-256))+rcx] | |
812 paddd xmm3,XMMWORD[((304-256))+rcx] | |
813 | |
; Row 3 -> xmm0, xmm1, xmm8, xmm3 (blocks 0..3).
814 movdqa xmm8,xmm0 | |
815 punpckldq xmm0,xmm1 | |
816 movdqa xmm7,xmm2 | |
817 punpckldq xmm2,xmm3 | |
818 punpckhdq xmm8,xmm1 | |
819 punpckhdq xmm7,xmm3 | |
820 movdqa xmm1,xmm0 | |
821 punpcklqdq xmm0,xmm2 | |
822 movdqa xmm3,xmm8 | |
823 punpcklqdq xmm8,xmm7 | |
824 punpckhqdq xmm1,xmm2 | |
825 punpckhqdq xmm3,xmm7 | |
; Keystream for block k is now: [16*k+rsp] (row 0) followed by the row
; 1-3 registers listed above. Fewer than 256 bytes left -> tail handling.
826 cmp rdx,64*4 | |
827 jb NEAR $L$tail4x | |
828 | |
; Full 256 bytes: XOR and store block 0 ...
829 movdqu xmm6,XMMWORD[rsi] | |
830 movdqu xmm11,XMMWORD[16+rsi] | |
831 movdqu xmm2,XMMWORD[32+rsi] | |
832 movdqu xmm7,XMMWORD[48+rsi] | |
833 pxor xmm6,XMMWORD[rsp] | |
834 pxor xmm11,xmm12 | |
835 pxor xmm2,xmm4 | |
836 pxor xmm7,xmm0 | |
837 | |
; ... block 1 (loads for the next block are interleaved with the stores) ...
838 movdqu XMMWORD[rdi],xmm6 | |
839 movdqu xmm6,XMMWORD[64+rsi] | |
840 movdqu XMMWORD[16+rdi],xmm11 | |
841 movdqu xmm11,XMMWORD[80+rsi] | |
842 movdqu XMMWORD[32+rdi],xmm2 | |
843 movdqu xmm2,XMMWORD[96+rsi] | |
844 movdqu XMMWORD[48+rdi],xmm7 | |
845 movdqu xmm7,XMMWORD[112+rsi] | |
846 lea rsi,[128+rsi] | |
847 pxor xmm6,XMMWORD[16+rsp] | |
848 pxor xmm11,xmm13 | |
849 pxor xmm2,xmm5 | |
850 pxor xmm7,xmm1 | |
851 | |
; ... block 2 ...
852 movdqu XMMWORD[64+rdi],xmm6 | |
853 movdqu xmm6,XMMWORD[rsi] | |
854 movdqu XMMWORD[80+rdi],xmm11 | |
855 movdqu xmm11,XMMWORD[16+rsi] | |
856 movdqu XMMWORD[96+rdi],xmm2 | |
857 movdqu xmm2,XMMWORD[32+rsi] | |
858 movdqu XMMWORD[112+rdi],xmm7 | |
859 lea rdi,[128+rdi] | |
860 movdqu xmm7,XMMWORD[48+rsi] | |
861 pxor xmm6,XMMWORD[32+rsp] | |
862 pxor xmm11,xmm10 | |
863 pxor xmm2,xmm14 | |
864 pxor xmm7,xmm8 | |
865 | |
; ... block 3.
866 movdqu XMMWORD[rdi],xmm6 | |
867 movdqu xmm6,XMMWORD[64+rsi] | |
868 movdqu XMMWORD[16+rdi],xmm11 | |
869 movdqu xmm11,XMMWORD[80+rsi] | |
870 movdqu XMMWORD[32+rdi],xmm2 | |
871 movdqu xmm2,XMMWORD[96+rsi] | |
872 movdqu XMMWORD[48+rdi],xmm7 | |
873 movdqu xmm7,XMMWORD[112+rsi] | |
874 lea rsi,[128+rsi] | |
875 pxor xmm6,XMMWORD[48+rsp] | |
876 pxor xmm11,xmm15 | |
877 pxor xmm2,xmm9 | |
878 pxor xmm7,xmm3 | |
879 movdqu XMMWORD[64+rdi],xmm6 | |
880 movdqu XMMWORD[80+rdi],xmm11 | |
881 movdqu XMMWORD[96+rdi],xmm2 | |
882 movdqu XMMWORD[112+rdi],xmm7 | |
883 lea rdi,[128+rdi] | |
884 | |
885 sub rdx,64*4 | |
886 jnz NEAR $L$oop_outer4x | |
887 | |
888 jmp NEAR $L$done4x | |
889 | |
; ---- tail: 1..255 bytes remain. Whole blocks are handled by the
; ---- N_or_more handlers; the next unused keystream block is then
; ---- assembled at [0..63+rsp] for the bytewise loop.
890 $L$tail4x: | |
891 cmp rdx,192 | |
892 jae NEAR $L$192_or_more4x | |
893 cmp rdx,128 | |
894 jae NEAR $L$128_or_more4x | |
895 cmp rdx,64 | |
896 jae NEAR $L$64_or_more4x | |
897 | |
898 | |
; Fewer than 64 bytes: block 0's row 0 is already at [rsp]; append rows 1-3.
899 xor r10,r10 | |
900 | |
901 movdqa XMMWORD[16+rsp],xmm12 | |
902 movdqa XMMWORD[32+rsp],xmm4 | |
903 movdqa XMMWORD[48+rsp],xmm0 | |
904 jmp NEAR $L$oop_tail4x | |
905 | |
906 ALIGN 32 | |
; 64..127 bytes: emit block 0, then stage block 1 for the byte loop.
907 $L$64_or_more4x: | |
908 movdqu xmm6,XMMWORD[rsi] | |
909 movdqu xmm11,XMMWORD[16+rsi] | |
910 movdqu xmm2,XMMWORD[32+rsi] | |
911 movdqu xmm7,XMMWORD[48+rsi] | |
912 pxor xmm6,XMMWORD[rsp] | |
913 pxor xmm11,xmm12 | |
914 pxor xmm2,xmm4 | |
915 pxor xmm7,xmm0 | |
916 movdqu XMMWORD[rdi],xmm6 | |
917 movdqu XMMWORD[16+rdi],xmm11 | |
918 movdqu XMMWORD[32+rdi],xmm2 | |
919 movdqu XMMWORD[48+rdi],xmm7 | |
; ZF is still the result of "cmp rdx,64" in $L$tail4x (no instruction in
; between modifies flags): exactly 64 bytes means we are done.
920 je NEAR $L$done4x | |
921 | |
922 movdqa xmm6,XMMWORD[16+rsp] | |
923 lea rsi,[64+rsi] | |
924 xor r10,r10 | |
925 movdqa XMMWORD[rsp],xmm6 | |
926 movdqa XMMWORD[16+rsp],xmm13 | |
927 lea rdi,[64+rdi] | |
928 movdqa XMMWORD[32+rsp],xmm5 | |
929 sub rdx,64 | |
930 movdqa XMMWORD[48+rsp],xmm1 | |
931 jmp NEAR $L$oop_tail4x | |
932 | |
933 ALIGN 32 | |
; 128..191 bytes: emit blocks 0-1, then stage block 2 for the byte loop.
934 $L$128_or_more4x: | |
935 movdqu xmm6,XMMWORD[rsi] | |
936 movdqu xmm11,XMMWORD[16+rsi] | |
937 movdqu xmm2,XMMWORD[32+rsi] | |
938 movdqu xmm7,XMMWORD[48+rsi] | |
939 pxor xmm6,XMMWORD[rsp] | |
940 pxor xmm11,xmm12 | |
941 pxor xmm2,xmm4 | |
942 pxor xmm7,xmm0 | |
943 | |
944 movdqu XMMWORD[rdi],xmm6 | |
945 movdqu xmm6,XMMWORD[64+rsi] | |
946 movdqu XMMWORD[16+rdi],xmm11 | |
947 movdqu xmm11,XMMWORD[80+rsi] | |
948 movdqu XMMWORD[32+rdi],xmm2 | |
949 movdqu xmm2,XMMWORD[96+rsi] | |
950 movdqu XMMWORD[48+rdi],xmm7 | |
951 movdqu xmm7,XMMWORD[112+rsi] | |
952 pxor xmm6,XMMWORD[16+rsp] | |
953 pxor xmm11,xmm13 | |
954 pxor xmm2,xmm5 | |
955 pxor xmm7,xmm1 | |
956 movdqu XMMWORD[64+rdi],xmm6 | |
957 movdqu XMMWORD[80+rdi],xmm11 | |
958 movdqu XMMWORD[96+rdi],xmm2 | |
959 movdqu XMMWORD[112+rdi],xmm7 | |
; ZF still comes from "cmp rdx,128": exactly 128 bytes means done.
960 je NEAR $L$done4x | |
961 | |
962 movdqa xmm6,XMMWORD[32+rsp] | |
963 lea rsi,[128+rsi] | |
964 xor r10,r10 | |
965 movdqa XMMWORD[rsp],xmm6 | |
966 movdqa XMMWORD[16+rsp],xmm10 | |
967 lea rdi,[128+rdi] | |
968 movdqa XMMWORD[32+rsp],xmm14 | |
969 sub rdx,128 | |
970 movdqa XMMWORD[48+rsp],xmm8 | |
971 jmp NEAR $L$oop_tail4x | |
972 | |
973 ALIGN 32 | |
; 192..255 bytes: emit blocks 0-2, then stage block 3 for the byte loop.
974 $L$192_or_more4x: | |
975 movdqu xmm6,XMMWORD[rsi] | |
976 movdqu xmm11,XMMWORD[16+rsi] | |
977 movdqu xmm2,XMMWORD[32+rsi] | |
978 movdqu xmm7,XMMWORD[48+rsi] | |
979 pxor xmm6,XMMWORD[rsp] | |
980 pxor xmm11,xmm12 | |
981 pxor xmm2,xmm4 | |
982 pxor xmm7,xmm0 | |
983 | |
984 movdqu XMMWORD[rdi],xmm6 | |
985 movdqu xmm6,XMMWORD[64+rsi] | |
986 movdqu XMMWORD[16+rdi],xmm11 | |
987 movdqu xmm11,XMMWORD[80+rsi] | |
988 movdqu XMMWORD[32+rdi],xmm2 | |
989 movdqu xmm2,XMMWORD[96+rsi] | |
990 movdqu XMMWORD[48+rdi],xmm7 | |
991 movdqu xmm7,XMMWORD[112+rsi] | |
992 lea rsi,[128+rsi] | |
993 pxor xmm6,XMMWORD[16+rsp] | |
994 pxor xmm11,xmm13 | |
995 pxor xmm2,xmm5 | |
996 pxor xmm7,xmm1 | |
997 | |
998 movdqu XMMWORD[64+rdi],xmm6 | |
999 movdqu xmm6,XMMWORD[rsi] | |
1000 movdqu XMMWORD[80+rdi],xmm11 | |
1001 movdqu xmm11,XMMWORD[16+rsi] | |
1002 movdqu XMMWORD[96+rdi],xmm2 | |
1003 movdqu xmm2,XMMWORD[32+rsi] | |
1004 movdqu XMMWORD[112+rdi],xmm7 | |
1005 lea rdi,[128+rdi] | |
1006 movdqu xmm7,XMMWORD[48+rsi] | |
1007 pxor xmm6,XMMWORD[32+rsp] | |
1008 pxor xmm11,xmm10 | |
1009 pxor xmm2,xmm14 | |
1010 pxor xmm7,xmm8 | |
1011 movdqu XMMWORD[rdi],xmm6 | |
1012 movdqu XMMWORD[16+rdi],xmm11 | |
1013 movdqu XMMWORD[32+rdi],xmm2 | |
1014 movdqu XMMWORD[48+rdi],xmm7 | |
; ZF still comes from "cmp rdx,192" (lea and the SSE moves leave flags
; untouched): exactly 192 bytes means done.
1015 je NEAR $L$done4x | |
1016 | |
1017 movdqa xmm6,XMMWORD[48+rsp] | |
1018 lea rsi,[64+rsi] | |
1019 xor r10,r10 | |
1020 movdqa XMMWORD[rsp],xmm6 | |
1021 movdqa XMMWORD[16+rsp],xmm15 | |
1022 lea rdi,[64+rdi] | |
1023 movdqa XMMWORD[32+rsp],xmm9 | |
1024 sub rdx,192 | |
1025 movdqa XMMWORD[48+rsp],xmm3 | |
1026 | |
; out[i] = in[i] XOR keystream[i] for the rdx (1..63) remaining bytes;
; r10 is the byte index.
1027 $L$oop_tail4x: | |
1028 movzx eax,BYTE[r10*1+rsi] | |
1029 movzx ecx,BYTE[r10*1+rsp] | |
1030 lea r10,[1+r10] | |
1031 xor eax,ecx | |
1032 mov BYTE[((-1))+r10*1+rdi],al | |
1033 dec rdx | |
1034 jnz NEAR $L$oop_tail4x | |
1035 | |
; Restore xmm6-xmm15 from the save area (r11 is re-derived from rsp
; because it was reused as the rot24 pointer), then release the frame.
1036 $L$done4x: | |
1037 lea r11,[((320+48))+rsp] | |
1038 movaps xmm6,XMMWORD[((-48))+r11] | |
1039 movaps xmm7,XMMWORD[((-32))+r11] | |
1040 movaps xmm8,XMMWORD[((-16))+r11] | |
1041 movaps xmm9,XMMWORD[r11] | |
1042 movaps xmm10,XMMWORD[16+r11] | |
1043 movaps xmm11,XMMWORD[32+r11] | |
1044 movaps xmm12,XMMWORD[48+r11] | |
1045 movaps xmm13,XMMWORD[64+r11] | |
1046 movaps xmm14,XMMWORD[80+r11] | |
1047 movaps xmm15,XMMWORD[96+r11] | |
1048 add rsp,0x148+160 | |
1049 mov rdi,QWORD[8+rsp] ;WIN64 epilogue | |
1050 mov rsi,QWORD[16+rsp] | |
1051 DB 0F3h,0C3h ;repret | |
1052 $L$SEH_end_ChaCha20_4x: | |
1053 | |
1054 ALIGN 32 | |
1055 ChaCha20_8x: | |
1056 mov QWORD[8+rsp],rdi ;WIN64 prologue | |
1057 mov QWORD[16+rsp],rsi | |
1058 mov rax,rsp | |
1059 $L$SEH_begin_ChaCha20_8x: | |
1060 mov rdi,rcx | |
1061 mov rsi,rdx | |
1062 mov rdx,r8 | |
1063 mov rcx,r9 | |
1064 mov r8,QWORD[40+rsp] | |
1065 | |
1066 | |
1067 $L$ChaCha20_8x: | |
1068 mov r10,rsp | |
1069 sub rsp,0x280+176 | |
1070 and rsp,-32 | |
1071 lea r11,[((656+48))+rsp] | |
1072 movaps XMMWORD[(-48)+r11],xmm6 | |
1073 movaps XMMWORD[(-32)+r11],xmm7 | |
1074 movaps XMMWORD[(-16)+r11],xmm8 | |
1075 movaps XMMWORD[r11],xmm9 | |
1076 movaps XMMWORD[16+r11],xmm10 | |
1077 movaps XMMWORD[32+r11],xmm11 | |
1078 movaps XMMWORD[48+r11],xmm12 | |
1079 movaps XMMWORD[64+r11],xmm13 | |
1080 movaps XMMWORD[80+r11],xmm14 | |
1081 movaps XMMWORD[96+r11],xmm15 | |
1082 vzeroupper | |
1083 mov QWORD[640+rsp],r10 | |
1084 | |
1085 | |
1086 | |
1087 | |
1088 | |
1089 | |
1090 | |
1091 | |
1092 | |
1093 | |
1094 vbroadcasti128 ymm11,XMMWORD[$L$sigma] | |
1095 vbroadcasti128 ymm3,XMMWORD[rcx] | |
1096 vbroadcasti128 ymm15,XMMWORD[16+rcx] | |
1097 vbroadcasti128 ymm7,XMMWORD[r8] | |
1098 lea rcx,[256+rsp] | |
1099 lea rax,[512+rsp] | |
1100 lea r10,[$L$rot16] | |
1101 lea r11,[$L$rot24] | |
1102 | |
1103 vpshufd ymm8,ymm11,0x00 | |
1104 vpshufd ymm9,ymm11,0x55 | |
1105 vmovdqa YMMWORD[(128-256)+rcx],ymm8 | |
1106 vpshufd ymm10,ymm11,0xaa | |
1107 vmovdqa YMMWORD[(160-256)+rcx],ymm9 | |
1108 vpshufd ymm11,ymm11,0xff | |
1109 vmovdqa YMMWORD[(192-256)+rcx],ymm10 | |
1110 vmovdqa YMMWORD[(224-256)+rcx],ymm11 | |
1111 | |
1112 vpshufd ymm0,ymm3,0x00 | |
1113 vpshufd ymm1,ymm3,0x55 | |
1114 vmovdqa YMMWORD[(256-256)+rcx],ymm0 | |
1115 vpshufd ymm2,ymm3,0xaa | |
1116 vmovdqa YMMWORD[(288-256)+rcx],ymm1 | |
1117 vpshufd ymm3,ymm3,0xff | |
1118 vmovdqa YMMWORD[(320-256)+rcx],ymm2 | |
1119 vmovdqa YMMWORD[(352-256)+rcx],ymm3 | |
1120 | |
1121 vpshufd ymm12,ymm15,0x00 | |
1122 vpshufd ymm13,ymm15,0x55 | |
1123 vmovdqa YMMWORD[(384-512)+rax],ymm12 | |
1124 vpshufd ymm14,ymm15,0xaa | |
1125 vmovdqa YMMWORD[(416-512)+rax],ymm13 | |
1126 vpshufd ymm15,ymm15,0xff | |
1127 vmovdqa YMMWORD[(448-512)+rax],ymm14 | |
1128 vmovdqa YMMWORD[(480-512)+rax],ymm15 | |
1129 | |
1130 vpshufd ymm4,ymm7,0x00 | |
1131 vpshufd ymm5,ymm7,0x55 | |
1132 vpaddd ymm4,ymm4,YMMWORD[$L$incy] | |
1133 vpshufd ymm6,ymm7,0xaa | |
1134 vmovdqa YMMWORD[(544-512)+rax],ymm5 | |
1135 vpshufd ymm7,ymm7,0xff | |
1136 vmovdqa YMMWORD[(576-512)+rax],ymm6 | |
1137 vmovdqa YMMWORD[(608-512)+rax],ymm7 | |
1138 | |
1139 jmp NEAR $L$oop_enter8x | |
1140 | |
1141 ALIGN 32 | |
1142 $L$oop_outer8x: | |
1143 vmovdqa ymm8,YMMWORD[((128-256))+rcx] | |
1144 vmovdqa ymm9,YMMWORD[((160-256))+rcx] | |
1145 vmovdqa ymm10,YMMWORD[((192-256))+rcx] | |
1146 vmovdqa ymm11,YMMWORD[((224-256))+rcx] | |
1147 vmovdqa ymm0,YMMWORD[((256-256))+rcx] | |
1148 vmovdqa ymm1,YMMWORD[((288-256))+rcx] | |
1149 vmovdqa ymm2,YMMWORD[((320-256))+rcx] | |
1150 vmovdqa ymm3,YMMWORD[((352-256))+rcx] | |
1151 vmovdqa ymm12,YMMWORD[((384-512))+rax] | |
1152 vmovdqa ymm13,YMMWORD[((416-512))+rax] | |
1153 vmovdqa ymm14,YMMWORD[((448-512))+rax] | |
1154 vmovdqa ymm15,YMMWORD[((480-512))+rax] | |
1155 vmovdqa ymm4,YMMWORD[((512-512))+rax] | |
1156 vmovdqa ymm5,YMMWORD[((544-512))+rax] | |
1157 vmovdqa ymm6,YMMWORD[((576-512))+rax] | |
1158 vmovdqa ymm7,YMMWORD[((608-512))+rax] | |
1159 vpaddd ymm4,ymm4,YMMWORD[$L$eight] | |
1160 | |
1161 $L$oop_enter8x: | |
1162 vmovdqa YMMWORD[64+rsp],ymm14 | |
1163 vmovdqa YMMWORD[96+rsp],ymm15 | |
1164 vbroadcasti128 ymm15,XMMWORD[r10] | |
1165 vmovdqa YMMWORD[(512-512)+rax],ymm4 | |
1166 mov eax,10 | |
1167 jmp NEAR $L$oop8x | |
1168 | |
1169 ALIGN 32 | |
; Round loop, executed eax (=10) times. Each pass runs eight
; add / xor / rotate sequences in the ChaCha quarter-round pattern, two at
; a time with their instructions interleaved.
;
; Register roles as used below:
;   ymm8-11   "a" words, ymm0-3 "b" words, ymm4-7 "d" words
;   "c" words: two live in ymm12/ymm13, the other two are parked on the
;              stack ([rsp],[32+rsp] or [64+rsp],[96+rsp]) and swapped in
;   ymm14/ymm15 alternate between shift temporaries and vpshufb masks, so
;              the masks are re-broadcast from [r10] / [r11] after each
;              clobber.
; The vpshufb steps presumably rotate each dword by 16 ([r10] -> $L$rot16)
; and by 8 ([r11] -> $L$rot24); the r10/r11 setup is outside this chunk.
; The vpslld/vpsrld/vpor triples are left-rotates by 12 and by 7.
1170 $L$oop8x: | |
; --- first pair: (ymm8,ymm0,ymm12,ymm4) and (ymm9,ymm1,ymm13,ymm5) ---
1171 vpaddd ymm8,ymm8,ymm0 | |
1172 vpxor ymm4,ymm8,ymm4 | |
1173 vpshufb ymm4,ymm4,ymm15 | |
1174 vpaddd ymm9,ymm9,ymm1 | |
1175 vpxor ymm5,ymm9,ymm5 | |
1176 vpshufb ymm5,ymm5,ymm15 | |
1177 vpaddd ymm12,ymm12,ymm4 | |
1178 vpxor ymm0,ymm12,ymm0 | |
1179 vpslld ymm14,ymm0,12 | |
1180 vpsrld ymm0,ymm0,20 | |
1181 vpor ymm0,ymm14,ymm0 | |
1182 vbroadcasti128 ymm14,XMMWORD[r11] | |
1183 vpaddd ymm13,ymm13,ymm5 | |
1184 vpxor ymm1,ymm13,ymm1 | |
1185 vpslld ymm15,ymm1,12 | |
1186 vpsrld ymm1,ymm1,20 | |
1187 vpor ymm1,ymm15,ymm1 | |
1188 vpaddd ymm8,ymm8,ymm0 | |
1189 vpxor ymm4,ymm8,ymm4 | |
1190 vpshufb ymm4,ymm4,ymm14 | |
1191 vpaddd ymm9,ymm9,ymm1 | |
1192 vpxor ymm5,ymm9,ymm5 | |
1193 vpshufb ymm5,ymm5,ymm14 | |
1194 vpaddd ymm12,ymm12,ymm4 | |
1195 vpxor ymm0,ymm12,ymm0 | |
1196 vpslld ymm15,ymm0,7 | |
1197 vpsrld ymm0,ymm0,25 | |
1198 vpor ymm0,ymm15,ymm0 | |
1199 vbroadcasti128 ymm15,XMMWORD[r10] | |
1200 vpaddd ymm13,ymm13,ymm5 | |
1201 vpxor ymm1,ymm13,ymm1 | |
1202 vpslld ymm14,ymm1,7 | |
1203 vpsrld ymm1,ymm1,25 | |
1204 vpor ymm1,ymm14,ymm1 | |
; Swap the "c" words: park the two just updated at [rsp],[32+rsp] and bring
; in the two that were spilled at [64+rsp],[96+rsp].
1205 vmovdqa YMMWORD[rsp],ymm12 | |
1206 vmovdqa YMMWORD[32+rsp],ymm13 | |
1207 vmovdqa ymm12,YMMWORD[64+rsp] | |
1208 vmovdqa ymm13,YMMWORD[96+rsp] | |
; --- second pair: (ymm10,ymm2,ymm12,ymm6) and (ymm11,ymm3,ymm13,ymm7) ---
1209 vpaddd ymm10,ymm10,ymm2 | |
1210 vpxor ymm6,ymm10,ymm6 | |
1211 vpshufb ymm6,ymm6,ymm15 | |
1212 vpaddd ymm11,ymm11,ymm3 | |
1213 vpxor ymm7,ymm11,ymm7 | |
1214 vpshufb ymm7,ymm7,ymm15 | |
1215 vpaddd ymm12,ymm12,ymm6 | |
1216 vpxor ymm2,ymm12,ymm2 | |
1217 vpslld ymm14,ymm2,12 | |
1218 vpsrld ymm2,ymm2,20 | |
1219 vpor ymm2,ymm14,ymm2 | |
1220 vbroadcasti128 ymm14,XMMWORD[r11] | |
1221 vpaddd ymm13,ymm13,ymm7 | |
1222 vpxor ymm3,ymm13,ymm3 | |
1223 vpslld ymm15,ymm3,12 | |
1224 vpsrld ymm3,ymm3,20 | |
1225 vpor ymm3,ymm15,ymm3 | |
1226 vpaddd ymm10,ymm10,ymm2 | |
1227 vpxor ymm6,ymm10,ymm6 | |
1228 vpshufb ymm6,ymm6,ymm14 | |
1229 vpaddd ymm11,ymm11,ymm3 | |
1230 vpxor ymm7,ymm11,ymm7 | |
1231 vpshufb ymm7,ymm7,ymm14 | |
1232 vpaddd ymm12,ymm12,ymm6 | |
1233 vpxor ymm2,ymm12,ymm2 | |
1234 vpslld ymm15,ymm2,7 | |
1235 vpsrld ymm2,ymm2,25 | |
1236 vpor ymm2,ymm15,ymm2 | |
1237 vbroadcasti128 ymm15,XMMWORD[r10] | |
1238 vpaddd ymm13,ymm13,ymm7 | |
1239 vpxor ymm3,ymm13,ymm3 | |
1240 vpslld ymm14,ymm3,7 | |
1241 vpsrld ymm3,ymm3,25 | |
1242 vpor ymm3,ymm14,ymm3 | |
; --- third pair, with the b/c/d operands shifted relative to the "a"
; words: (ymm8,ymm1,ymm12,ymm7) and (ymm9,ymm2,ymm13,ymm4). ymm12/ymm13
; still hold the "c" words loaded from [64+rsp],[96+rsp]. ---
1243 vpaddd ymm8,ymm8,ymm1 | |
1244 vpxor ymm7,ymm8,ymm7 | |
1245 vpshufb ymm7,ymm7,ymm15 | |
1246 vpaddd ymm9,ymm9,ymm2 | |
1247 vpxor ymm4,ymm9,ymm4 | |
1248 vpshufb ymm4,ymm4,ymm15 | |
1249 vpaddd ymm12,ymm12,ymm7 | |
1250 vpxor ymm1,ymm12,ymm1 | |
1251 vpslld ymm14,ymm1,12 | |
1252 vpsrld ymm1,ymm1,20 | |
1253 vpor ymm1,ymm14,ymm1 | |
1254 vbroadcasti128 ymm14,XMMWORD[r11] | |
1255 vpaddd ymm13,ymm13,ymm4 | |
1256 vpxor ymm2,ymm13,ymm2 | |
1257 vpslld ymm15,ymm2,12 | |
1258 vpsrld ymm2,ymm2,20 | |
1259 vpor ymm2,ymm15,ymm2 | |
1260 vpaddd ymm8,ymm8,ymm1 | |
1261 vpxor ymm7,ymm8,ymm7 | |
1262 vpshufb ymm7,ymm7,ymm14 | |
1263 vpaddd ymm9,ymm9,ymm2 | |
1264 vpxor ymm4,ymm9,ymm4 | |
1265 vpshufb ymm4,ymm4,ymm14 | |
1266 vpaddd ymm12,ymm12,ymm7 | |
1267 vpxor ymm1,ymm12,ymm1 | |
1268 vpslld ymm15,ymm1,7 | |
1269 vpsrld ymm1,ymm1,25 | |
1270 vpor ymm1,ymm15,ymm1 | |
1271 vbroadcasti128 ymm15,XMMWORD[r10] | |
1272 vpaddd ymm13,ymm13,ymm4 | |
1273 vpxor ymm2,ymm13,ymm2 | |
1274 vpslld ymm14,ymm2,7 | |
1275 vpsrld ymm2,ymm2,25 | |
1276 vpor ymm2,ymm14,ymm2 | |
; Swap the "c" words back: these two return to [64+rsp],[96+rsp] and the
; pair parked at [rsp],[32+rsp] is reloaded.
1277 vmovdqa YMMWORD[64+rsp],ymm12 | |
1278 vmovdqa YMMWORD[96+rsp],ymm13 | |
1279 vmovdqa ymm12,YMMWORD[rsp] | |
1280 vmovdqa ymm13,YMMWORD[32+rsp] | |
; --- fourth pair: (ymm10,ymm3,ymm12,ymm5) and (ymm11,ymm0,ymm13,ymm6) ---
1281 vpaddd ymm10,ymm10,ymm3 | |
1282 vpxor ymm5,ymm10,ymm5 | |
1283 vpshufb ymm5,ymm5,ymm15 | |
1284 vpaddd ymm11,ymm11,ymm0 | |
1285 vpxor ymm6,ymm11,ymm6 | |
1286 vpshufb ymm6,ymm6,ymm15 | |
1287 vpaddd ymm12,ymm12,ymm5 | |
1288 vpxor ymm3,ymm12,ymm3 | |
1289 vpslld ymm14,ymm3,12 | |
1290 vpsrld ymm3,ymm3,20 | |
1291 vpor ymm3,ymm14,ymm3 | |
1292 vbroadcasti128 ymm14,XMMWORD[r11] | |
1293 vpaddd ymm13,ymm13,ymm6 | |
1294 vpxor ymm0,ymm13,ymm0 | |
1295 vpslld ymm15,ymm0,12 | |
1296 vpsrld ymm0,ymm0,20 | |
1297 vpor ymm0,ymm15,ymm0 | |
1298 vpaddd ymm10,ymm10,ymm3 | |
1299 vpxor ymm5,ymm10,ymm5 | |
1300 vpshufb ymm5,ymm5,ymm14 | |
1301 vpaddd ymm11,ymm11,ymm0 | |
1302 vpxor ymm6,ymm11,ymm6 | |
1303 vpshufb ymm6,ymm6,ymm14 | |
1304 vpaddd ymm12,ymm12,ymm5 | |
1305 vpxor ymm3,ymm12,ymm3 | |
1306 vpslld ymm15,ymm3,7 | |
1307 vpsrld ymm3,ymm3,25 | |
1308 vpor ymm3,ymm15,ymm3 | |
1309 vbroadcasti128 ymm15,XMMWORD[r10] | |
1310 vpaddd ymm13,ymm13,ymm6 | |
1311 vpxor ymm0,ymm13,ymm0 | |
1312 vpslld ymm14,ymm0,7 | |
1313 vpsrld ymm0,ymm0,25 | |
1314 vpor ymm0,ymm14,ymm0 | |
; Loop state at this point matches the loop entry: ymm12/ymm13 hold the
; first two "c" words, the other two sit at [64+rsp],[96+rsp], and ymm15
; holds the [r10] mask again.
1315 dec eax | |
1316 jnz NEAR $L$oop8x | |
1317 | |
; Post-round processing for one batch (fall-through from $L$oop8x).
; eax was the round counter, so rax is first re-pointed at rsp+512, the
; base used for the (N-512)+rax scratch slots.
1318 lea rax,[512+rsp] | |
; Add the saved pre-round vectors back into ymm8-11.
1319 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] | |
1320 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] | |
1321 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] | |
1322 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] | |
1323 | |
; Interleave the four registers at dword and then qword granularity, using
; ymm14/ymm15 as temporaries. This is the first step of re-laying the data
; out into the byte order that is XORed against the input further down.
1324 vpunpckldq ymm14,ymm8,ymm9 | |
1325 vpunpckldq ymm15,ymm10,ymm11 | |
1326 vpunpckhdq ymm8,ymm8,ymm9 | |
1327 vpunpckhdq ymm10,ymm10,ymm11 | |
1328 vpunpcklqdq ymm9,ymm14,ymm15 | |
1329 vpunpckhqdq ymm14,ymm14,ymm15 | |
1330 vpunpcklqdq ymm11,ymm8,ymm10 | |
1331 vpunpckhqdq ymm8,ymm8,ymm10 | |
; Same feed-forward add and interleave for ymm0-3.
1332 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] | |
1333 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] | |
1334 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] | |
1335 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] | |
1336 | |
1337 vpunpckldq ymm10,ymm0,ymm1 | |
1338 vpunpckldq ymm15,ymm2,ymm3 | |
1339 vpunpckhdq ymm0,ymm0,ymm1 | |
1340 vpunpckhdq ymm2,ymm2,ymm3 | |
1341 vpunpcklqdq ymm1,ymm10,ymm15 | |
1342 vpunpckhqdq ymm10,ymm10,ymm15 | |
1343 vpunpcklqdq ymm3,ymm0,ymm2 | |
1344 vpunpckhqdq ymm0,ymm0,ymm2 | |
; Recombine 128-bit halves of the two groups: selector 0x20 takes the low
; halves of both sources, 0x31 takes the high halves.
1345 vperm2i128 ymm15,ymm9,ymm1,0x20 | |
1346 vperm2i128 ymm1,ymm9,ymm1,0x31 | |
1347 vperm2i128 ymm9,ymm14,ymm10,0x20 | |
1348 vperm2i128 ymm10,ymm14,ymm10,0x31 | |
1349 vperm2i128 ymm14,ymm11,ymm3,0x20 | |
1350 vperm2i128 ymm3,ymm11,ymm3,0x31 | |
1351 vperm2i128 ymm11,ymm8,ymm0,0x20 | |
1352 vperm2i128 ymm0,ymm8,ymm0,0x31 | |
; Park two finished vectors at [rsp],[32+rsp] so that ymm15/ymm9 can take
; the two "c" words the round loop left at [64+rsp],[96+rsp].
1353 vmovdqa YMMWORD[rsp],ymm15 | |
1354 vmovdqa YMMWORD[32+rsp],ymm9 | |
1355 vmovdqa ymm15,YMMWORD[64+rsp] | |
1356 vmovdqa ymm9,YMMWORD[96+rsp] | |
1357 | |
; Feed-forward add and interleave for the "c" words (ymm12,ymm13,ymm15,ymm9).
1358 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] | |
1359 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] | |
1360 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] | |
1361 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] | |
1362 | |
1363 vpunpckldq ymm2,ymm12,ymm13 | |
1364 vpunpckldq ymm8,ymm15,ymm9 | |
1365 vpunpckhdq ymm12,ymm12,ymm13 | |
1366 vpunpckhdq ymm15,ymm15,ymm9 | |
1367 vpunpcklqdq ymm13,ymm2,ymm8 | |
1368 vpunpckhqdq ymm2,ymm2,ymm8 | |
1369 vpunpcklqdq ymm9,ymm12,ymm15 | |
1370 vpunpckhqdq ymm12,ymm12,ymm15 | |
; Feed-forward add and interleave for ymm4-7. The [rax+0] slot holds the
; ymm4 value stored at $L$oop_enter8x for this batch.
1371 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] | |
1372 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] | |
1373 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] | |
1374 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] | |
1375 | |
1376 vpunpckldq ymm15,ymm4,ymm5 | |
1377 vpunpckldq ymm8,ymm6,ymm7 | |
1378 vpunpckhdq ymm4,ymm4,ymm5 | |
1379 vpunpckhdq ymm6,ymm6,ymm7 | |
1380 vpunpcklqdq ymm5,ymm15,ymm8 | |
1381 vpunpckhqdq ymm15,ymm15,ymm8 | |
1382 vpunpcklqdq ymm7,ymm4,ymm6 | |
1383 vpunpckhqdq ymm4,ymm4,ymm6 | |
1384 vperm2i128 ymm8,ymm13,ymm5,0x20 | |
1385 vperm2i128 ymm5,ymm13,ymm5,0x31 | |
1386 vperm2i128 ymm13,ymm2,ymm15,0x20 | |
1387 vperm2i128 ymm15,ymm2,ymm15,0x31 | |
1388 vperm2i128 ymm2,ymm9,ymm7,0x20 | |
1389 vperm2i128 ymm7,ymm9,ymm7,0x31 | |
1390 vperm2i128 ymm9,ymm12,ymm4,0x20 | |
1391 vperm2i128 ymm4,ymm12,ymm4,0x31 | |
; Bring back the two vectors parked at [rsp],[32+rsp]. Output order from
; here on is ymm6,8,1,5, 12,13,10,15, 14,2,3,7, 11,9,0,4 (32 bytes each).
1392 vmovdqa ymm6,YMMWORD[rsp] | |
1393 vmovdqa ymm12,YMMWORD[32+rsp] | |
1394 | |
; Fewer than 512 bytes left (unsigned compare): take the partial-batch path.
1395 cmp rdx,64*8 | |
1396 jb NEAR $L$tail8x | |
1397 | |
; Full batch: XOR 512 input bytes at rsi with the sixteen vectors and store
; them at rdi, 128 bytes per group, advancing both pointers after each
; group. Unaligned loads/stores (memory-operand vpxor, vmovdqu) are used
; for the caller's buffers.
1398 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1399 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1400 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1401 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1402 lea rsi,[128+rsi] | |
1403 vmovdqu YMMWORD[rdi],ymm6 | |
1404 vmovdqu YMMWORD[32+rdi],ymm8 | |
1405 vmovdqu YMMWORD[64+rdi],ymm1 | |
1406 vmovdqu YMMWORD[96+rdi],ymm5 | |
1407 lea rdi,[128+rdi] | |
1408 | |
1409 vpxor ymm12,ymm12,YMMWORD[rsi] | |
1410 vpxor ymm13,ymm13,YMMWORD[32+rsi] | |
1411 vpxor ymm10,ymm10,YMMWORD[64+rsi] | |
1412 vpxor ymm15,ymm15,YMMWORD[96+rsi] | |
1413 lea rsi,[128+rsi] | |
1414 vmovdqu YMMWORD[rdi],ymm12 | |
1415 vmovdqu YMMWORD[32+rdi],ymm13 | |
1416 vmovdqu YMMWORD[64+rdi],ymm10 | |
1417 vmovdqu YMMWORD[96+rdi],ymm15 | |
1418 lea rdi,[128+rdi] | |
1419 | |
1420 vpxor ymm14,ymm14,YMMWORD[rsi] | |
1421 vpxor ymm2,ymm2,YMMWORD[32+rsi] | |
1422 vpxor ymm3,ymm3,YMMWORD[64+rsi] | |
1423 vpxor ymm7,ymm7,YMMWORD[96+rsi] | |
1424 lea rsi,[128+rsi] | |
1425 vmovdqu YMMWORD[rdi],ymm14 | |
1426 vmovdqu YMMWORD[32+rdi],ymm2 | |
1427 vmovdqu YMMWORD[64+rdi],ymm3 | |
1428 vmovdqu YMMWORD[96+rdi],ymm7 | |
1429 lea rdi,[128+rdi] | |
1430 | |
1431 vpxor ymm11,ymm11,YMMWORD[rsi] | |
1432 vpxor ymm9,ymm9,YMMWORD[32+rsi] | |
1433 vpxor ymm0,ymm0,YMMWORD[64+rsi] | |
1434 vpxor ymm4,ymm4,YMMWORD[96+rsi] | |
1435 lea rsi,[128+rsi] | |
1436 vmovdqu YMMWORD[rdi],ymm11 | |
1437 vmovdqu YMMWORD[32+rdi],ymm9 | |
1438 vmovdqu YMMWORD[64+rdi],ymm0 | |
1439 vmovdqu YMMWORD[96+rdi],ymm4 | |
1440 lea rdi,[128+rdi] | |
1441 | |
; rdx = bytes remaining; start another batch unless exactly zero are left.
1442 sub rdx,64*8 | |
1443 jnz NEAR $L$oop_outer8x | |
1444 | |
1445 jmp NEAR $L$done8x | |
1446 | |
; Partial final batch: entered with rdx < 512 (the `jb` after `cmp rdx,64*8`).
; Dispatches, largest first, to the handler for the number of whole 64-byte
; pieces remaining. All comparisons are unsigned (jae). Each handler relies
; on the flags of the `cmp` that selected it to detect an exact multiple.
1447 $L$tail8x: | |
1448 cmp rdx,448 | |
1449 jae NEAR $L$448_or_more8x | |
1450 cmp rdx,384 | |
1451 jae NEAR $L$384_or_more8x | |
1452 cmp rdx,320 | |
1453 jae NEAR $L$320_or_more8x | |
1454 cmp rdx,256 | |
1455 jae NEAR $L$256_or_more8x | |
1456 cmp rdx,192 | |
1457 jae NEAR $L$192_or_more8x | |
1458 cmp rdx,128 | |
1459 jae NEAR $L$128_or_more8x | |
1460 cmp rdx,64 | |
1461 jae NEAR $L$64_or_more8x | |
1462 | |
; Fewer than 64 bytes remain: r10 = 0 is the byte index for $L$oop_tail8x,
; and the first 64 generated bytes (ymm6, ymm8) are staged at [rsp] for it.
1463 xor r10,r10 | |
1464 vmovdqa YMMWORD[rsp],ymm6 | |
1465 vmovdqa YMMWORD[32+rsp],ymm8 | |
1466 jmp NEAR $L$oop_tail8x | |
1467 | |
1468 ALIGN 32 | |
; Tail handler for 64 <= rdx < 128: XOR and store the first 64 bytes, then
; hand any remainder to the byte loop.
1469 $L$64_or_more8x: | |
1470 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1471 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1472 vmovdqu YMMWORD[rdi],ymm6 | |
1473 vmovdqu YMMWORD[32+rdi],ymm8 | |
; ZF still comes from the `cmp rdx,64` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1474 je NEAR $L$done8x | |
1475 | |
; Advance past the 64 bytes done, zero the byte index r10, and stage the
; next 64 generated bytes (ymm1, ymm5) at [rsp] for $L$oop_tail8x.
1476 lea rsi,[64+rsi] | |
1477 xor r10,r10 | |
1478 vmovdqa YMMWORD[rsp],ymm1 | |
1479 lea rdi,[64+rdi] | |
1480 sub rdx,64 | |
1481 vmovdqa YMMWORD[32+rsp],ymm5 | |
1482 jmp NEAR $L$oop_tail8x | |
1483 | |
1484 ALIGN 32 | |
; Tail handler for 128 <= rdx < 192: XOR and store the first 128 bytes,
; then hand any remainder to the byte loop.
1485 $L$128_or_more8x: | |
1486 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1487 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1488 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1489 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1490 vmovdqu YMMWORD[rdi],ymm6 | |
1491 vmovdqu YMMWORD[32+rdi],ymm8 | |
1492 vmovdqu YMMWORD[64+rdi],ymm1 | |
1493 vmovdqu YMMWORD[96+rdi],ymm5 | |
; ZF still comes from the `cmp rdx,128` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1494 je NEAR $L$done8x | |
1495 | |
; Advance past the 128 bytes done, zero the byte index r10, and stage the
; next 64 generated bytes (ymm12, ymm13) at [rsp] for $L$oop_tail8x.
1496 lea rsi,[128+rsi] | |
1497 xor r10,r10 | |
1498 vmovdqa YMMWORD[rsp],ymm12 | |
1499 lea rdi,[128+rdi] | |
1500 sub rdx,128 | |
1501 vmovdqa YMMWORD[32+rsp],ymm13 | |
1502 jmp NEAR $L$oop_tail8x | |
1503 | |
1504 ALIGN 32 | |
; Tail handler for 192 <= rdx < 256: XOR and store the first 192 bytes,
; then hand any remainder to the byte loop.
1505 $L$192_or_more8x: | |
1506 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1507 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1508 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1509 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1510 vpxor ymm12,ymm12,YMMWORD[128+rsi] | |
1511 vpxor ymm13,ymm13,YMMWORD[160+rsi] | |
1512 vmovdqu YMMWORD[rdi],ymm6 | |
1513 vmovdqu YMMWORD[32+rdi],ymm8 | |
1514 vmovdqu YMMWORD[64+rdi],ymm1 | |
1515 vmovdqu YMMWORD[96+rdi],ymm5 | |
1516 vmovdqu YMMWORD[128+rdi],ymm12 | |
1517 vmovdqu YMMWORD[160+rdi],ymm13 | |
; ZF still comes from the `cmp rdx,192` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1518 je NEAR $L$done8x | |
1519 | |
; Advance past the 192 bytes done, zero the byte index r10, and stage the
; next 64 generated bytes (ymm10, ymm15) at [rsp] for $L$oop_tail8x.
1520 lea rsi,[192+rsi] | |
1521 xor r10,r10 | |
1522 vmovdqa YMMWORD[rsp],ymm10 | |
1523 lea rdi,[192+rdi] | |
1524 sub rdx,192 | |
1525 vmovdqa YMMWORD[32+rsp],ymm15 | |
1526 jmp NEAR $L$oop_tail8x | |
1527 | |
1528 ALIGN 32 | |
; Tail handler for 256 <= rdx < 320: XOR and store the first 256 bytes,
; then hand any remainder to the byte loop.
1529 $L$256_or_more8x: | |
1530 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1531 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1532 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1533 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1534 vpxor ymm12,ymm12,YMMWORD[128+rsi] | |
1535 vpxor ymm13,ymm13,YMMWORD[160+rsi] | |
1536 vpxor ymm10,ymm10,YMMWORD[192+rsi] | |
1537 vpxor ymm15,ymm15,YMMWORD[224+rsi] | |
1538 vmovdqu YMMWORD[rdi],ymm6 | |
1539 vmovdqu YMMWORD[32+rdi],ymm8 | |
1540 vmovdqu YMMWORD[64+rdi],ymm1 | |
1541 vmovdqu YMMWORD[96+rdi],ymm5 | |
1542 vmovdqu YMMWORD[128+rdi],ymm12 | |
1543 vmovdqu YMMWORD[160+rdi],ymm13 | |
1544 vmovdqu YMMWORD[192+rdi],ymm10 | |
1545 vmovdqu YMMWORD[224+rdi],ymm15 | |
; ZF still comes from the `cmp rdx,256` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1546 je NEAR $L$done8x | |
1547 | |
; Advance past the 256 bytes done, zero the byte index r10, and stage the
; next 64 generated bytes (ymm14, ymm2) at [rsp] for $L$oop_tail8x.
1548 lea rsi,[256+rsi] | |
1549 xor r10,r10 | |
1550 vmovdqa YMMWORD[rsp],ymm14 | |
1551 lea rdi,[256+rdi] | |
1552 sub rdx,256 | |
1553 vmovdqa YMMWORD[32+rsp],ymm2 | |
1554 jmp NEAR $L$oop_tail8x | |
1555 | |
1556 ALIGN 32 | |
; Tail handler for 320 <= rdx < 384: XOR and store the first 320 bytes,
; then hand any remainder to the byte loop.
1557 $L$320_or_more8x: | |
1558 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1559 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1560 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1561 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1562 vpxor ymm12,ymm12,YMMWORD[128+rsi] | |
1563 vpxor ymm13,ymm13,YMMWORD[160+rsi] | |
1564 vpxor ymm10,ymm10,YMMWORD[192+rsi] | |
1565 vpxor ymm15,ymm15,YMMWORD[224+rsi] | |
1566 vpxor ymm14,ymm14,YMMWORD[256+rsi] | |
1567 vpxor ymm2,ymm2,YMMWORD[288+rsi] | |
1568 vmovdqu YMMWORD[rdi],ymm6 | |
1569 vmovdqu YMMWORD[32+rdi],ymm8 | |
1570 vmovdqu YMMWORD[64+rdi],ymm1 | |
1571 vmovdqu YMMWORD[96+rdi],ymm5 | |
1572 vmovdqu YMMWORD[128+rdi],ymm12 | |
1573 vmovdqu YMMWORD[160+rdi],ymm13 | |
1574 vmovdqu YMMWORD[192+rdi],ymm10 | |
1575 vmovdqu YMMWORD[224+rdi],ymm15 | |
1576 vmovdqu YMMWORD[256+rdi],ymm14 | |
1577 vmovdqu YMMWORD[288+rdi],ymm2 | |
; ZF still comes from the `cmp rdx,320` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1578 je NEAR $L$done8x | |
1579 | |
; Advance past the 320 bytes done, zero the byte index r10, and stage the
; next 64 generated bytes (ymm3, ymm7) at [rsp] for $L$oop_tail8x.
1580 lea rsi,[320+rsi] | |
1581 xor r10,r10 | |
1582 vmovdqa YMMWORD[rsp],ymm3 | |
1583 lea rdi,[320+rdi] | |
1584 sub rdx,320 | |
1585 vmovdqa YMMWORD[32+rsp],ymm7 | |
1586 jmp NEAR $L$oop_tail8x | |
1587 | |
1588 ALIGN 32 | |
; Tail handler for 384 <= rdx < 448: XOR and store the first 384 bytes,
; then hand any remainder to the byte loop.
1589 $L$384_or_more8x: | |
1590 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1591 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1592 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1593 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1594 vpxor ymm12,ymm12,YMMWORD[128+rsi] | |
1595 vpxor ymm13,ymm13,YMMWORD[160+rsi] | |
1596 vpxor ymm10,ymm10,YMMWORD[192+rsi] | |
1597 vpxor ymm15,ymm15,YMMWORD[224+rsi] | |
1598 vpxor ymm14,ymm14,YMMWORD[256+rsi] | |
1599 vpxor ymm2,ymm2,YMMWORD[288+rsi] | |
1600 vpxor ymm3,ymm3,YMMWORD[320+rsi] | |
1601 vpxor ymm7,ymm7,YMMWORD[352+rsi] | |
1602 vmovdqu YMMWORD[rdi],ymm6 | |
1603 vmovdqu YMMWORD[32+rdi],ymm8 | |
1604 vmovdqu YMMWORD[64+rdi],ymm1 | |
1605 vmovdqu YMMWORD[96+rdi],ymm5 | |
1606 vmovdqu YMMWORD[128+rdi],ymm12 | |
1607 vmovdqu YMMWORD[160+rdi],ymm13 | |
1608 vmovdqu YMMWORD[192+rdi],ymm10 | |
1609 vmovdqu YMMWORD[224+rdi],ymm15 | |
1610 vmovdqu YMMWORD[256+rdi],ymm14 | |
1611 vmovdqu YMMWORD[288+rdi],ymm2 | |
1612 vmovdqu YMMWORD[320+rdi],ymm3 | |
1613 vmovdqu YMMWORD[352+rdi],ymm7 | |
; ZF still comes from the `cmp rdx,384` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1614 je NEAR $L$done8x | |
1615 | |
; Advance past the 384 bytes done, zero the byte index r10, and stage the
; next 64 generated bytes (ymm11, ymm9) at [rsp] for $L$oop_tail8x.
1616 lea rsi,[384+rsi] | |
1617 xor r10,r10 | |
1618 vmovdqa YMMWORD[rsp],ymm11 | |
1619 lea rdi,[384+rdi] | |
1620 sub rdx,384 | |
1621 vmovdqa YMMWORD[32+rsp],ymm9 | |
1622 jmp NEAR $L$oop_tail8x | |
1623 | |
1624 ALIGN 32 | |
; Tail handler for 448 <= rdx < 512: XOR and store the first 448 bytes,
; then fall through into the byte loop for any remainder.
1625 $L$448_or_more8x: | |
1626 vpxor ymm6,ymm6,YMMWORD[rsi] | |
1627 vpxor ymm8,ymm8,YMMWORD[32+rsi] | |
1628 vpxor ymm1,ymm1,YMMWORD[64+rsi] | |
1629 vpxor ymm5,ymm5,YMMWORD[96+rsi] | |
1630 vpxor ymm12,ymm12,YMMWORD[128+rsi] | |
1631 vpxor ymm13,ymm13,YMMWORD[160+rsi] | |
1632 vpxor ymm10,ymm10,YMMWORD[192+rsi] | |
1633 vpxor ymm15,ymm15,YMMWORD[224+rsi] | |
1634 vpxor ymm14,ymm14,YMMWORD[256+rsi] | |
1635 vpxor ymm2,ymm2,YMMWORD[288+rsi] | |
1636 vpxor ymm3,ymm3,YMMWORD[320+rsi] | |
1637 vpxor ymm7,ymm7,YMMWORD[352+rsi] | |
1638 vpxor ymm11,ymm11,YMMWORD[384+rsi] | |
1639 vpxor ymm9,ymm9,YMMWORD[416+rsi] | |
1640 vmovdqu YMMWORD[rdi],ymm6 | |
1641 vmovdqu YMMWORD[32+rdi],ymm8 | |
1642 vmovdqu YMMWORD[64+rdi],ymm1 | |
1643 vmovdqu YMMWORD[96+rdi],ymm5 | |
1644 vmovdqu YMMWORD[128+rdi],ymm12 | |
1645 vmovdqu YMMWORD[160+rdi],ymm13 | |
1646 vmovdqu YMMWORD[192+rdi],ymm10 | |
1647 vmovdqu YMMWORD[224+rdi],ymm15 | |
1648 vmovdqu YMMWORD[256+rdi],ymm14 | |
1649 vmovdqu YMMWORD[288+rdi],ymm2 | |
1650 vmovdqu YMMWORD[320+rdi],ymm3 | |
1651 vmovdqu YMMWORD[352+rdi],ymm7 | |
1652 vmovdqu YMMWORD[384+rdi],ymm11 | |
1653 vmovdqu YMMWORD[416+rdi],ymm9 | |
; ZF still comes from the `cmp rdx,448` in $L$tail8x, because the vector
; instructions above do not write flags. Equal means nothing is left.
1654 je NEAR $L$done8x | |
1655 | |
; Advance past the 448 bytes done, zero the byte index r10, and stage the
; last 64 generated bytes (ymm0, ymm4) at [rsp]. There is no jump here:
; execution falls straight through into $L$oop_tail8x.
1656 lea rsi,[448+rsi] | |
1657 xor r10,r10 | |
1658 vmovdqa YMMWORD[rsp],ymm0 | |
1659 lea rdi,[448+rdi] | |
1660 sub rdx,448 | |
1661 vmovdqa YMMWORD[32+rsp],ymm4 | |
1662 | |
; Byte-at-a-time tail: for r10 = 0, 1, ... writes
;   rdi[r10] = rsi[r10] XOR rsp[r10]
; where [rsp] holds the bytes the tail handlers staged there.
; rdx is the byte count. The test is at the bottom, so the body always runs
; at least once; every visible entry path arrives with rdx != 0.
; Clobbers eax, ecx, r10, rdx and flags.
1663 $L$oop_tail8x: | |
1664 movzx eax,BYTE[r10*1+rsi] | |
1665 movzx ecx,BYTE[r10*1+rsp] | |
; r10 is incremented before the store, hence the -1 displacement below.
1666 lea r10,[1+r10] | |
1667 xor eax,ecx | |
1668 mov BYTE[((-1))+r10*1+rdi],al | |
1669 dec rdx | |
1670 jnz NEAR $L$oop_tail8x | |
1671 | |
; Common exit path.
1672 $L$done8x: | |
; Zero every ymm register before returning.
1673 vzeroall | |
; Restore xmm6-xmm15 (callee-saved in the Microsoft x64 ABI) from the save
; area at rsp+656 .. rsp+815. r11 = rsp+704, so the -48..+96 displacements
; cover ten consecutive 16-byte slots. movaps requires those slots to be
; 16-byte aligned.
1674 lea r11,[((656+48))+rsp] | |
1675 movaps xmm6,XMMWORD[((-48))+r11] | |
1676 movaps xmm7,XMMWORD[((-32))+r11] | |
1677 movaps xmm8,XMMWORD[((-16))+r11] | |
1678 movaps xmm9,XMMWORD[r11] | |
1679 movaps xmm10,XMMWORD[16+r11] | |
1680 movaps xmm11,XMMWORD[32+r11] | |
1681 movaps xmm12,XMMWORD[48+r11] | |
1682 movaps xmm13,XMMWORD[64+r11] | |
1683 movaps xmm14,XMMWORD[80+r11] | |
1684 movaps xmm15,XMMWORD[96+r11] | |
; Reload rsp from the slot at [640+rsp]; presumably the prologue stored the
; entry rsp there (the prologue is outside this chunk -- confirm).
1685 mov rsp,QWORD[640+rsp] | |
; With rsp restored, reload rdi/rsi from [8+rsp] and [16+rsp].
1686 mov rdi,QWORD[8+rsp] ;WIN64 epilogue | |
1687 mov rsi,QWORD[16+rsp] | |
; F3 C3 is the two-byte `rep ret` encoding.
1688 DB 0F3h,0C3h ;repret | |
1689 $L$SEH_end_ChaCha20_8x: | |
OLD | NEW |