Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(775)

Side by Side Diff: third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm

Issue 2219933002: Land BoringSSL roll on master (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
; Generated NASM (Intel syntax) source for Win64. `default rel` makes bare
; symbol references such as [$L$one] RIP-relative.
1 default rel
; The size keywords are defined as empty, so XMMWORD[...], YMMWORD[...] and
; ZMMWORD[...] reduce to plain memory operands.
2 %define XMMWORD
3 %define YMMWORD
4 %define ZMMWORD
5 section .text code align=64
6
7
; Capability words defined in another module; ChaCha20_ctr32 reads the 64 bits
; at offset 4 to choose between the scalar, SSSE3, 4x and 8x code paths.
8 EXTERN OPENSSL_ia32cap_P
9
; Constant tables, emitted into .text ahead of the code.
10 ALIGN 64
; All-zero vector; not referenced by the code visible in this listing.
11 $L$zero:
12 DD 0,0,0,0
; {1,0,0,0}: added to state row 3 by the scalar and SSSE3 paths, i.e. it
; increments only the first dword (the block counter) per 64-byte block.
13 $L$one:
14 DD 1,0,0,0
; Per-lane offsets 0..3 added to the broadcast counter word by the 4x path.
15 $L$inc:
16 DD 0,1,2,3
; Added to the 4x counter vector on every further outer iteration.
17 $L$four:
18 DD 4,4,4,4
; Per-lane counter offsets and per-iteration step used by the 8x (ymm) path.
19 $L$incy:
20 DD 0,2,4,6,1,3,5,7
21 $L$eight:
22 DD 8,8,8,8,8,8,8,8
; pshufb mask: within each dword, bytes [b0,b1,b2,b3] -> [b2,b3,b0,b1],
; i.e. rotate every 32-bit lane by 16 bits.
23 $L$rot16:
24 DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb mask: bytes [b0,b1,b2,b3] -> [b3,b0,b1,b2], i.e. rotate every
; 32-bit lane left by 8 bits (equivalently right by 24).
25 $L$rot24:
26 DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; State row 0: the 16 ASCII bytes "expand 32-byte k".
27 $L$sigma:
28 DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
29 DB 0
; NUL-terminated ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
30 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
31 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
33 DB 108,46,111,114,103,62,0
;-----------------------------------------------------------------------
; ChaCha20_ctr32 -- exported entry point (Win64 calling convention).
; The four Win64 register arguments (rcx, rdx, r8, r9) and the fifth,
; stack-passed argument at [40+rsp] are moved into rdi, rsi, rdx, rcx, r8.
; As used by the code below:
;   rdi = output pointer, rsi = input pointer, rdx = byte count,
;   rcx -> 32 bytes loaded as state rows 1-2 (presumably the key),
;   r8  -> 16 bytes loaded as state row 3 (presumably counter + nonce).
; Returns immediately when rdx == 0, branches to the SSSE3 code when the
; capability bit tested below is set, otherwise sets up the integer path.
;-----------------------------------------------------------------------
34 global ChaCha20_ctr32
35
36 ALIGN 64
37 ChaCha20_ctr32:
; rdi/rsi are callee-saved on Win64; they are parked in the caller-provided
; home slots above the return address and reloaded in the epilogue.
38 mov QWORD[8+rsp],rdi ;WIN64 prologue
39 mov QWORD[16+rsp],rsi
40 mov rax,rsp
41 $L$SEH_begin_ChaCha20_ctr32:
42 mov rdi,rcx
43 mov rsi,rdx
44 mov rdx,r8
45 mov rcx,r9
46 mov r8,QWORD[40+rsp]
47
48
; Nothing to do for a zero-length request.
49 cmp rdx,0
50 je NEAR $L$no_data
; r10 = 64 bits starting at OPENSSL_ia32cap_P+4. Bit 9 of the low dword
; selects the SSSE3 path (flag meaning inferred from the branch target).
; r10 is left intact because $L$ChaCha20_4x inspects it again.
51 mov r10,QWORD[((OPENSSL_ia32cap_P+4))]
52 test r10d,512
53 jnz NEAR $L$ChaCha20_ssse3
54
; Integer-only path: save the callee-saved GPRs it uses.
55 push rbx
56 push rbp
57 push r12
58 push r13
59 push r14
60 push r15
; Frame: [0..63+rsp] 64-byte state/keystream block, [64..87+rsp] three
; qword slots for the remaining length, input pointer and output pointer.
; Six pushes (48 bytes) plus 88 bytes leave rsp 16-byte aligned, assuming
; the ABI entry condition rsp % 16 == 8; the movdqa stores below need that.
61 sub rsp,64+24
62
63
; xmm1/xmm2 = state rows 1-2 from [rcx], xmm3 = row 3 from [r8],
; xmm4 = {1,0,0,0} used to bump the counter after each block.
64 movdqu xmm1,XMMWORD[rcx]
65 movdqu xmm2,XMMWORD[16+rcx]
66 movdqu xmm3,XMMWORD[r8]
67 movdqa xmm4,XMMWORD[$L$one]
68
69
; Keep a copy of rows 1-3 on the stack; row 0 is rebuilt from immediates.
70 movdqa XMMWORD[16+rsp],xmm1
71 movdqa XMMWORD[32+rsp],xmm2
72 movdqa XMMWORD[48+rsp],xmm3
; rbp = bytes remaining.
73 mov rbp,rdx
74 jmp NEAR $L$oop_outer
75
; $L$oop_outer: load the 16 state words into registers for one 64-byte block.
;   words 0-3   -> eax, ebx, ecx, edx (the constants, as immediates)
;   words 4-7   -> r8d-r11d           (from [16..28+rsp])
;   words 8-9   -> esi, edi           (low qword of xmm2)
;   words 10-11 -> stay at [40+rsp]/[44+rsp] until needed
;   words 12-15 -> r12d-r15d          (word 12 = low dword of xmm3)
76 ALIGN 32
77 $L$oop_outer:
78 mov eax,0x61707865
79 mov ebx,0x3320646e
80 mov ecx,0x79622d32
81 mov edx,0x6b206574
82 mov r8d,DWORD[16+rsp]
83 mov r9d,DWORD[20+rsp]
84 mov r10d,DWORD[24+rsp]
85 mov r11d,DWORD[28+rsp]
86 movd r12d,xmm3
87 mov r13d,DWORD[52+rsp]
88 mov r14d,DWORD[56+rsp]
89 mov r15d,DWORD[60+rsp]
90
; rbp, rsi and rdi are about to be reused as the round counter and as state
; words 8/9, so the length and both pointers are spilled to [64..87+rsp].
91 mov QWORD[((64+0))+rsp],rbp
; ebp = 10 loop iterations, each performing one double round.
92 mov ebp,10
93 mov QWORD[((64+8))+rsp],rsi
; Hand-encoded `movq rsi,xmm2` (66 48 0F 7E D6): rsi = words 8 and 9.
94 DB 102,72,15,126,214
95 mov QWORD[((64+16))+rsp],rdi
; Split the qword: esi = word 8, edi = word 9.
96 mov rdi,rsi
97 shr rdi,32
98 jmp NEAR $L$oop
99
; $L$oop: one double round (four column + four diagonal quarter rounds) per
; iteration; ebp counts the iterations down to zero.
; Register map: words 0-3 = eax,ebx,ecx,edx; words 4-7 = r8d-r11d;
; words 12-15 = r12d-r15d. Only two of words 8-11 are in registers at a time
; (esi, edi); the other pair is parked at [32+rsp]/[36+rsp] (words 8,9) or
; [40+rsp]/[44+rsp] (words 10,11) and swapped in halfway through each half.
; Every quarter round is add/xor/rol with rotate counts 16, 12, 8, 7, and two
; quarter rounds are interleaved instruction by instruction.
100 ALIGN 32
101 $L$oop:
; Column quarter rounds (0,4,8,12) and (1,5,9,13); esi/edi = words 8,9.
102 add eax,r8d
103 xor r12d,eax
104 rol r12d,16
105 add ebx,r9d
106 xor r13d,ebx
107 rol r13d,16
108 add esi,r12d
109 xor r8d,esi
110 rol r8d,12
111 add edi,r13d
112 xor r9d,edi
113 rol r9d,12
114 add eax,r8d
115 xor r12d,eax
116 rol r12d,8
117 add ebx,r9d
118 xor r13d,ebx
119 rol r13d,8
120 add esi,r12d
121 xor r8d,esi
122 rol r8d,7
123 add edi,r13d
124 xor r9d,edi
125 rol r9d,7
; Park words 8,9 and bring words 10,11 into esi/edi.
126 mov DWORD[32+rsp],esi
127 mov DWORD[36+rsp],edi
128 mov esi,DWORD[40+rsp]
129 mov edi,DWORD[44+rsp]
; Column quarter rounds (2,6,10,14) and (3,7,11,15).
130 add ecx,r10d
131 xor r14d,ecx
132 rol r14d,16
133 add edx,r11d
134 xor r15d,edx
135 rol r15d,16
136 add esi,r14d
137 xor r10d,esi
138 rol r10d,12
139 add edi,r15d
140 xor r11d,edi
141 rol r11d,12
142 add ecx,r10d
143 xor r14d,ecx
144 rol r14d,8
145 add edx,r11d
146 xor r15d,edx
147 rol r15d,8
148 add esi,r14d
149 xor r10d,esi
150 rol r10d,7
151 add edi,r15d
152 xor r11d,edi
153 rol r11d,7
; Diagonal quarter rounds (0,5,10,15) and (1,6,11,12); esi/edi still hold
; words 10,11 from the column half.
154 add eax,r9d
155 xor r15d,eax
156 rol r15d,16
157 add ebx,r10d
158 xor r12d,ebx
159 rol r12d,16
160 add esi,r15d
161 xor r9d,esi
162 rol r9d,12
163 add edi,r12d
164 xor r10d,edi
165 rol r10d,12
166 add eax,r9d
167 xor r15d,eax
168 rol r15d,8
169 add ebx,r10d
170 xor r12d,ebx
171 rol r12d,8
172 add esi,r15d
173 xor r9d,esi
174 rol r9d,7
175 add edi,r12d
176 xor r10d,edi
177 rol r10d,7
; Park words 10,11 and bring words 8,9 back into esi/edi.
178 mov DWORD[40+rsp],esi
179 mov DWORD[44+rsp],edi
180 mov esi,DWORD[32+rsp]
181 mov edi,DWORD[36+rsp]
; Diagonal quarter rounds (2,7,8,13) and (3,4,9,14).
182 add ecx,r11d
183 xor r13d,ecx
184 rol r13d,16
185 add edx,r8d
186 xor r14d,edx
187 rol r14d,16
188 add esi,r13d
189 xor r11d,esi
190 rol r11d,12
191 add edi,r14d
192 xor r8d,edi
193 rol r8d,12
194 add ecx,r11d
195 xor r13d,ecx
196 rol r13d,8
197 add edx,r8d
198 xor r14d,edx
199 rol r14d,8
200 add esi,r13d
201 xor r11d,esi
202 rol r11d,7
203 add edi,r14d
204 xor r8d,edi
205 rol r8d,7
; esi/edi hold words 8,9 again, matching the state expected at the loop top.
206 dec ebp
207 jnz NEAR $L$oop
; Rounds finished for this block. Flush words 8,9 so [32..47+rsp] holds the
; permuted words 8-11, then recover the length and pointers that were spilled
; before the round loop.
208 mov DWORD[36+rsp],edi
209 mov DWORD[32+rsp],esi
210 mov rbp,QWORD[64+rsp]
; xmm1 = input words 8-11 (xmm2 is the untouched copy of that row).
211 movdqa xmm1,xmm2
212 mov rsi,QWORD[((64+8))+rsp]
; Advance the counter row for the next block (xmm4 holds $L$one); the stack
; copy at [48+rsp] still holds this block's counter for the add below.
213 paddd xmm3,xmm4
214 mov rdi,QWORD[((64+16))+rsp]
215
; Feed-forward: add the input state to the permuted state, word by word.
216 add eax,0x61707865
217 add ebx,0x3320646e
218 add ecx,0x79622d32
219 add edx,0x6b206574
220 add r8d,DWORD[16+rsp]
221 add r9d,DWORD[20+rsp]
222 add r10d,DWORD[24+rsp]
223 add r11d,DWORD[28+rsp]
224 add r12d,DWORD[48+rsp]
225 add r13d,DWORD[52+rsp]
226 add r14d,DWORD[56+rsp]
227 add r15d,DWORD[60+rsp]
; Words 8-11 are done as one vector add: xmm1 = input + permuted.
228 paddd xmm1,XMMWORD[32+rsp]
229
; Fewer than 64 bytes left: finish byte by byte at $L$tail.
230 cmp rbp,64
231 jb NEAR $L$tail
232
; Full block: XOR the 64 keystream bytes with the input at [rsi].
233 xor eax,DWORD[rsi]
234 xor ebx,DWORD[4+rsi]
235 xor ecx,DWORD[8+rsi]
236 xor edx,DWORD[12+rsi]
237 xor r8d,DWORD[16+rsi]
238 xor r9d,DWORD[20+rsi]
239 xor r10d,DWORD[24+rsi]
240 xor r11d,DWORD[28+rsi]
241 movdqu xmm0,XMMWORD[32+rsi]
242 xor r12d,DWORD[48+rsi]
243 xor r13d,DWORD[52+rsi]
244 xor r14d,DWORD[56+rsi]
245 xor r15d,DWORD[60+rsi]
246 lea rsi,[64+rsi]
247 pxor xmm0,xmm1
248
; Re-arm the stack copy for the next block: restore input words 8-11 (the
; slot was used as scratch during the rounds) and store the bumped counter.
249 movdqa XMMWORD[32+rsp],xmm2
250 movd DWORD[48+rsp],xmm3
251
; Write the 64 result bytes to [rdi].
252 mov DWORD[rdi],eax
253 mov DWORD[4+rdi],ebx
254 mov DWORD[8+rdi],ecx
255 mov DWORD[12+rdi],edx
256 mov DWORD[16+rdi],r8d
257 mov DWORD[20+rdi],r9d
258 mov DWORD[24+rdi],r10d
259 mov DWORD[28+rdi],r11d
260 movdqu XMMWORD[32+rdi],xmm0
261 mov DWORD[48+rdi],r12d
262 mov DWORD[52+rdi],r13d
263 mov DWORD[56+rdi],r14d
264 mov DWORD[60+rdi],r15d
265 lea rdi,[64+rdi]
266
; Loop while bytes remain; a length that was an exact multiple of 64 ends here.
267 sub rbp,64
268 jnz NEAR $L$oop_outer
269
270 jmp NEAR $L$done
271
; $L$tail: fewer than 64 bytes remain (rbp = count, non-zero on every path
; that reaches here). Spill the whole 64-byte keystream block to [0..63+rsp]
; so it can be indexed by byte. This overwrites the saved state rows, which
; is fine because no further block is generated.
272 ALIGN 16
273 $L$tail:
274 mov DWORD[rsp],eax
275 mov DWORD[4+rsp],ebx
; ebx has just been stored, so rbx can become the byte index (starts at 0).
276 xor rbx,rbx
277 mov DWORD[8+rsp],ecx
278 mov DWORD[12+rsp],edx
279 mov DWORD[16+rsp],r8d
280 mov DWORD[20+rsp],r9d
281 mov DWORD[24+rsp],r10d
282 mov DWORD[28+rsp],r11d
283 movdqa XMMWORD[32+rsp],xmm1
284 mov DWORD[48+rsp],r12d
285 mov DWORD[52+rsp],r13d
286 mov DWORD[56+rsp],r14d
287 mov DWORD[60+rsp],r15d
288
; out[i] = in[i] ^ keystream[i] for the remaining rbp bytes. rbx is
; incremented before the store, hence the -1 displacement.
289 $L$oop_tail:
290 movzx eax,BYTE[rbx*1+rsi]
291 movzx edx,BYTE[rbx*1+rsp]
292 lea rbx,[1+rbx]
293 xor eax,edx
294 mov BYTE[((-1))+rbx*1+rdi],al
295 dec rbp
296 jnz NEAR $L$oop_tail
297
; Release the frame and restore the callee-saved GPRs in reverse push order.
298 $L$done:
299 add rsp,64+24
300 pop r15
301 pop r14
302 pop r13
303 pop r12
304 pop rbp
305 pop rbx
; Also the target of the zero-length early exit, taken before any push.
; rsp is back at its entry value, so the home slots hold the caller's
; rdi/rsi.
306 $L$no_data:
307 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
308 mov rsi,QWORD[16+rsp]
; Two-byte `rep ret` (F3 C3).
309 DB 0F3h,0C3h ;repret
310 $L$SEH_end_ChaCha20_ctr32:
311
;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block at a time, whole state in xmm0-xmm3.
; The ChaCha20_ssse3 symbol is not declared global in this file. The code
; visible here enters at $L$ChaCha20_ssse3 (from ChaCha20_ctr32, arguments
; already in rdi/rsi/rdx/rcx/r8) or at $L$do_sse3_after_all (from
; $L$ChaCha20_4x).
;   rdi = output, rsi = input, rdx = byte count (non-zero),
;   rcx -> state rows 1-2, r8 -> state row 3.
; Uses xmm0-xmm7; xmm6/xmm7 (callee-saved on Win64) are saved in the frame.
;-----------------------------------------------------------------------
312 ALIGN 32
313 ChaCha20_ssse3:
314 mov QWORD[8+rsp],rdi ;WIN64 prologue
315 mov QWORD[16+rsp],rsi
316 mov rax,rsp
317 $L$SEH_begin_ChaCha20_ssse3:
318 mov rdi,rcx
319 mov rsi,rdx
320 mov rdx,r8
321 mov rcx,r9
322 mov r8,QWORD[40+rsp]
323
324
; More than 128 bytes: hand over to the 4-block code.
325 $L$ChaCha20_ssse3:
326 cmp rdx,128
327 ja NEAR $L$ChaCha20_4x
328
329 $L$do_sse3_after_all:
330 push rbx
331 push rbp
332 push r12
333 push r13
334 push r14
335 push r15
336
; Frame: [0..63+rsp] input state (later the tail keystream),
; [96+rsp] and [112+rsp] saved xmm6/xmm7. Six pushes plus 136 bytes keep rsp
; 16-byte aligned (assuming rsp % 16 == 8 on entry) for movaps/movdqa.
337 sub rsp,64+72
338 movaps XMMWORD[(64+32)+rsp],xmm6
339 movaps XMMWORD[(64+48)+rsp],xmm7
; xmm0-xmm3 = state rows 0-3; xmm6/xmm7 = pshufb masks for the 16-bit and
; 8-bit lane rotations.
340 movdqa xmm0,XMMWORD[$L$sigma]
341 movdqu xmm1,XMMWORD[rcx]
342 movdqu xmm2,XMMWORD[16+rcx]
343 movdqu xmm3,XMMWORD[r8]
344 movdqa xmm6,XMMWORD[$L$rot16]
345 movdqa xmm7,XMMWORD[$L$rot24]
346
; Keep the input state on the stack for the feed-forward add.
347 movdqa XMMWORD[rsp],xmm0
348 movdqa XMMWORD[16+rsp],xmm1
349 movdqa XMMWORD[32+rsp],xmm2
350 movdqa XMMWORD[48+rsp],xmm3
; ebp = 10 double rounds.
351 mov ebp,10
352 jmp NEAR $L$oop_ssse3
353
; Subsequent blocks: reload rows 0-2 and advance row 3 by {1,0,0,0}, writing
; the new counter row back to the stack copy.
354 ALIGN 32
355 $L$oop_outer_ssse3:
356 movdqa xmm3,XMMWORD[$L$one]
357 movdqa xmm0,XMMWORD[rsp]
358 movdqa xmm1,XMMWORD[16+rsp]
359 movdqa xmm2,XMMWORD[32+rsp]
360 paddd xmm3,XMMWORD[48+rsp]
361 mov ebp,10
362 movdqa XMMWORD[48+rsp],xmm3
363 jmp NEAR $L$oop_ssse3
364
; One double round. Rows are a=xmm0, b=xmm1, c=xmm2, d=xmm3, so a single
; pass performs four quarter rounds in parallel, one per 32-bit lane.
365 ALIGN 32
366 $L$oop_ssse3:
; Column round: a+=b; d^=a; d rotated by 16 via pshufb.
367 paddd xmm0,xmm1
368 pxor xmm3,xmm0
; Hand-encoded `pshufb xmm3,xmm6` (66 0F 38 00 DE).
369 DB 102,15,56,0,222
; c+=d; b^=c; b = rol(b,12) built from a shift pair with xmm4 as scratch.
370 paddd xmm2,xmm3
371 pxor xmm1,xmm2
372 movdqa xmm4,xmm1
373 psrld xmm1,20
374 pslld xmm4,12
375 por xmm1,xmm4
; a+=b; d^=a; d rotated left by 8 via pshufb.
376 paddd xmm0,xmm1
377 pxor xmm3,xmm0
; Hand-encoded `pshufb xmm3,xmm7` (66 0F 38 00 DF).
378 DB 102,15,56,0,223
; c+=d; b^=c; b = rol(b,7).
379 paddd xmm2,xmm3
380 pxor xmm1,xmm2
381 movdqa xmm4,xmm1
382 psrld xmm1,25
383 pslld xmm4,7
384 por xmm1,xmm4
; Rotate rows b, c, d by 1, 2, 3 lanes so the diagonals line up as columns.
385 pshufd xmm2,xmm2,78
386 pshufd xmm1,xmm1,57
387 pshufd xmm3,xmm3,147
388 nop
; Diagonal round: same sequence as above on the re-aligned rows.
389 paddd xmm0,xmm1
390 pxor xmm3,xmm0
391 DB 102,15,56,0,222
392 paddd xmm2,xmm3
393 pxor xmm1,xmm2
394 movdqa xmm4,xmm1
395 psrld xmm1,20
396 pslld xmm4,12
397 por xmm1,xmm4
398 paddd xmm0,xmm1
399 pxor xmm3,xmm0
400 DB 102,15,56,0,223
401 paddd xmm2,xmm3
402 pxor xmm1,xmm2
403 movdqa xmm4,xmm1
404 psrld xmm1,25
405 pslld xmm4,7
406 por xmm1,xmm4
; Undo the lane rotation (b and d use the opposite shuffles this time).
407 pshufd xmm2,xmm2,78
408 pshufd xmm1,xmm1,147
409 pshufd xmm3,xmm3,57
410 dec ebp
411 jnz NEAR $L$oop_ssse3
; Feed-forward: add the input state saved on the stack.
412 paddd xmm0,XMMWORD[rsp]
413 paddd xmm1,XMMWORD[16+rsp]
414 paddd xmm2,XMMWORD[32+rsp]
415 paddd xmm3,XMMWORD[48+rsp]
416
; Fewer than 64 bytes left: finish byte by byte.
417 cmp rdx,64
418 jb NEAR $L$tail_ssse3
419
; Full block: XOR 64 input bytes with the keystream and store them.
420 movdqu xmm4,XMMWORD[rsi]
421 movdqu xmm5,XMMWORD[16+rsi]
422 pxor xmm0,xmm4
423 movdqu xmm4,XMMWORD[32+rsi]
424 pxor xmm1,xmm5
425 movdqu xmm5,XMMWORD[48+rsi]
426 lea rsi,[64+rsi]
427 pxor xmm2,xmm4
428 pxor xmm3,xmm5
429
430 movdqu XMMWORD[rdi],xmm0
431 movdqu XMMWORD[16+rdi],xmm1
432 movdqu XMMWORD[32+rdi],xmm2
433 movdqu XMMWORD[48+rdi],xmm3
434 lea rdi,[64+rdi]
435
436 sub rdx,64
437 jnz NEAR $L$oop_outer_ssse3
438
439 jmp NEAR $L$done_ssse3
440
; Partial final block: spill the keystream over the saved state (no further
; block follows) and XOR the remaining rdx bytes one at a time, rbx = index.
441 ALIGN 16
442 $L$tail_ssse3:
443 movdqa XMMWORD[rsp],xmm0
444 movdqa XMMWORD[16+rsp],xmm1
445 movdqa XMMWORD[32+rsp],xmm2
446 movdqa XMMWORD[48+rsp],xmm3
447 xor rbx,rbx
448
; rbx is incremented before the store, hence the -1 displacement.
449 $L$oop_tail_ssse3:
450 movzx eax,BYTE[rbx*1+rsi]
451 movzx ecx,BYTE[rbx*1+rsp]
452 lea rbx,[1+rbx]
453 xor eax,ecx
454 mov BYTE[((-1))+rbx*1+rdi],al
455 dec rdx
456 jnz NEAR $L$oop_tail_ssse3
457
; Epilogue: restore xmm6/xmm7, release the frame, pop the GPRs in reverse
; push order, then reload the caller's rdi/rsi from the home slots.
458 $L$done_ssse3:
459 movaps xmm6,XMMWORD[((64+32))+rsp]
460 movaps xmm7,XMMWORD[((64+48))+rsp]
461 add rsp,64+72
462 pop r15
463 pop r14
464 pop r13
465 pop r12
466 pop rbp
467 pop rbx
468 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
469 mov rsi,QWORD[16+rsp]
470 DB 0F3h,0C3h ;repret
471 $L$SEH_end_ChaCha20_ssse3:
472
;-----------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per iteration. Each xmm register holds
; the same state word of four consecutive blocks, one block per lane.
; The ChaCha20_4x symbol is not declared global in this file; the code
; visible here enters at $L$ChaCha20_4x from $L$ChaCha20_ssse3 with
;   rdi = output, rsi = input, rdx = byte count,
;   rcx -> state rows 1-2, r8 -> state row 3,
;   r10 = the 64 bits read from OPENSSL_ia32cap_P+4 in ChaCha20_ctr32.
; No GPRs are pushed on this path; it uses rax, rcx, r10 and r11 as scratch
; and saves/restores xmm6-xmm15 (callee-saved on Win64) in its frame.
;-----------------------------------------------------------------------
473 ALIGN 32
474 ChaCha20_4x:
475 mov QWORD[8+rsp],rdi ;WIN64 prologue
476 mov QWORD[16+rsp],rsi
477 mov rax,rsp
478 $L$SEH_begin_ChaCha20_4x:
479 mov rdi,rcx
480 mov rsi,rdx
481 mov rdx,r8
482 mov rcx,r9
483 mov r8,QWORD[40+rsp]
484
485
486 $L$ChaCha20_4x:
; Bit 5 of the upper capability dword selects the 8x (ymm) path.
487 mov r11,r10
488 shr r10,32
489 test r10,32
490 jnz NEAR $L$ChaCha20_8x
; Inputs longer than 192 bytes always use the 4x code.
491 cmp rdx,192
492 ja NEAR $L$proceed4x
493
; For 129..192 bytes: if, of capability bits 22 and 26 (mask 0x4400000),
; only bit 22 is set, go back to the one-block SSSE3 code instead.
; NOTE(review): presumably a CPU-model heuristic inherited from upstream;
; the bit meanings are not visible in this file.
494 and r11,71303168
495 cmp r11,4194304
496 je NEAR $L$do_sse3_after_all
497
498 $L$proceed4x:
; Frame of 0x148+160 bytes, relative to the new rsp:
;   [0..63]    scratch (four vectors)
;   [64..127]  row 0, one broadcast vector per word
;   [128..255] rows 1-2 broadcast (addressed through rcx = rsp+256)
;   [256..319] row 3 broadcast, lane counters in the first vector
;   [320..479] saved xmm6-xmm15
; r11 = old rsp - 120, so [-48+r11]..[96+r11] is exactly the [320..479] area.
; Assuming rsp % 16 == 8 on entry, both r11 and the new rsp are 16-byte
; aligned, as the movaps/movdqa accesses require.
499 lea r11,[((-120))+rsp]
500 sub rsp,0x148+160
501 movaps XMMWORD[(-48)+r11],xmm6
502 movaps XMMWORD[(-32)+r11],xmm7
503 movaps XMMWORD[(-16)+r11],xmm8
504 movaps XMMWORD[r11],xmm9
505 movaps XMMWORD[16+r11],xmm10
506 movaps XMMWORD[32+r11],xmm11
507 movaps XMMWORD[48+r11],xmm12
508 movaps XMMWORD[64+r11],xmm13
509 movaps XMMWORD[80+r11],xmm14
510 movaps XMMWORD[96+r11],xmm15
; Load the four state rows, then repurpose rcx as the frame base and
; r10/r11 as pointers to the two pshufb rotation masks.
511 movdqa xmm11,XMMWORD[$L$sigma]
512 movdqu xmm15,XMMWORD[rcx]
513 movdqu xmm7,XMMWORD[16+rcx]
514 movdqu xmm3,XMMWORD[r8]
515 lea rcx,[256+rsp]
516 lea r10,[$L$rot16]
517 lea r11,[$L$rot24]
518
; Row 0: broadcast each word to all four lanes (xmm8-xmm11 = words 0-3) and
; keep a copy in the frame for the feed-forward add.
519 pshufd xmm8,xmm11,0x00
520 pshufd xmm9,xmm11,0x55
521 movdqa XMMWORD[64+rsp],xmm8
522 pshufd xmm10,xmm11,0xaa
523 movdqa XMMWORD[80+rsp],xmm9
524 pshufd xmm11,xmm11,0xff
525 movdqa XMMWORD[96+rsp],xmm10
526 movdqa XMMWORD[112+rsp],xmm11
527
; Row 1: xmm12-xmm15 = words 4-7.
528 pshufd xmm12,xmm15,0x00
529 pshufd xmm13,xmm15,0x55
530 movdqa XMMWORD[(128-256)+rcx],xmm12
531 pshufd xmm14,xmm15,0xaa
532 movdqa XMMWORD[(144-256)+rcx],xmm13
533 pshufd xmm15,xmm15,0xff
534 movdqa XMMWORD[(160-256)+rcx],xmm14
535 movdqa XMMWORD[(176-256)+rcx],xmm15
536
; Row 2: xmm4-xmm7 = words 8-11.
537 pshufd xmm4,xmm7,0x00
538 pshufd xmm5,xmm7,0x55
539 movdqa XMMWORD[(192-256)+rcx],xmm4
540 pshufd xmm6,xmm7,0xaa
541 movdqa XMMWORD[(208-256)+rcx],xmm5
542 pshufd xmm7,xmm7,0xff
543 movdqa XMMWORD[(224-256)+rcx],xmm6
544 movdqa XMMWORD[(240-256)+rcx],xmm7
545
; Row 3: xmm0-xmm3 = words 12-15. Word 12 gets per-lane offsets 0..3 so the
; four lanes work on four consecutive blocks; that vector is written to
; [(256-256)+rcx] at $L$oop_enter4x rather than here.
546 pshufd xmm0,xmm3,0x00
547 pshufd xmm1,xmm3,0x55
548 paddd xmm0,XMMWORD[$L$inc]
549 pshufd xmm2,xmm3,0xaa
550 movdqa XMMWORD[(272-256)+rcx],xmm1
551 pshufd xmm3,xmm3,0xff
552 movdqa XMMWORD[(288-256)+rcx],xmm2
553 movdqa XMMWORD[(304-256)+rcx],xmm3
554
555 jmp NEAR $L$oop_enter4x
556
; $L$oop_outer4x: start of every 256-byte iteration after the first. Reloads
; all sixteen broadcast state vectors from the frame (words 0-3 -> xmm8-11,
; 4-7 -> xmm12-15, 8-11 -> xmm4-7, 12-15 -> xmm0-3) and advances the four
; lane counters by $L$four.
557 ALIGN 32
558 $L$oop_outer4x:
559 movdqa xmm8,XMMWORD[64+rsp]
560 movdqa xmm9,XMMWORD[80+rsp]
561 movdqa xmm10,XMMWORD[96+rsp]
562 movdqa xmm11,XMMWORD[112+rsp]
563 movdqa xmm12,XMMWORD[((128-256))+rcx]
564 movdqa xmm13,XMMWORD[((144-256))+rcx]
565 movdqa xmm14,XMMWORD[((160-256))+rcx]
566 movdqa xmm15,XMMWORD[((176-256))+rcx]
567 movdqa xmm4,XMMWORD[((192-256))+rcx]
568 movdqa xmm5,XMMWORD[((208-256))+rcx]
569 movdqa xmm6,XMMWORD[((224-256))+rcx]
570 movdqa xmm7,XMMWORD[((240-256))+rcx]
571 movdqa xmm0,XMMWORD[((256-256))+rcx]
572 movdqa xmm1,XMMWORD[((272-256))+rcx]
573 movdqa xmm2,XMMWORD[((288-256))+rcx]
574 movdqa xmm3,XMMWORD[((304-256))+rcx]
575 paddd xmm0,XMMWORD[$L$four]
576
; $L$oop_enter4x: shared entry, also reached from the function prologue.
; Sixteen state vectors plus temporaries do not fit in xmm0-xmm15, so words
; 10 and 11 (xmm6, xmm7) are parked at [32+rsp]/[48+rsp]; xmm6 and xmm7 then
; serve as rotate temporaries and mask registers inside the round loop.
577 $L$oop_enter4x:
578 movdqa XMMWORD[32+rsp],xmm6
579 movdqa XMMWORD[48+rsp],xmm7
; xmm7 = rotate-by-16 mask (r10 points at $L$rot16).
580 movdqa xmm7,XMMWORD[r10]
; eax = 10 double rounds.
581 mov eax,10
; Record the lane counters used for this iteration; they are needed for the
; feed-forward add and as the base for the next iteration's increment.
582 movdqa XMMWORD[(256-256)+rcx],xmm0
583 jmp NEAR $L$oop4x
584
; $L$oop4x: one double round for four blocks at once; eax counts iterations.
; Vector map: words 0-3 = xmm8-xmm11, words 4-7 = xmm12-xmm15,
; words 12-15 = xmm0-xmm3. xmm4/xmm5 hold either words 8,9 or words 10,11;
; the other pair is parked at [rsp]/[16+rsp] (8,9) or [32+rsp]/[48+rsp]
; (10,11). xmm6/xmm7 are rotate temporaries, so the rot16 mask is reloaded
; from [r10] into xmm7 and the rot24 mask from [r11] into xmm6 whenever
; needed. Rotations by 16 and 8 use pshufb; rotations by 12 and 7 use
; pslld/psrld/por. Two quarter rounds are interleaved throughout.
585 ALIGN 32
586 $L$oop4x:
; Column quarter rounds (0,4,8,12) and (1,5,9,13); xmm4/xmm5 = words 8,9.
587 paddd xmm8,xmm12
588 paddd xmm9,xmm13
589 pxor xmm0,xmm8
590 pxor xmm1,xmm9
; Hand-encoded `pshufb xmm0,xmm7` and `pshufb xmm1,xmm7` (rotate by 16).
591 DB 102,15,56,0,199
592 DB 102,15,56,0,207
593 paddd xmm4,xmm0
594 paddd xmm5,xmm1
595 pxor xmm12,xmm4
596 pxor xmm13,xmm5
597 movdqa xmm6,xmm12
598 pslld xmm12,12
599 psrld xmm6,20
600 movdqa xmm7,xmm13
601 pslld xmm13,12
602 por xmm12,xmm6
603 psrld xmm7,20
604 movdqa xmm6,XMMWORD[r11]
605 por xmm13,xmm7
606 paddd xmm8,xmm12
607 paddd xmm9,xmm13
608 pxor xmm0,xmm8
609 pxor xmm1,xmm9
; Hand-encoded `pshufb xmm0,xmm6` and `pshufb xmm1,xmm6` (rotate left by 8).
610 DB 102,15,56,0,198
611 DB 102,15,56,0,206
612 paddd xmm4,xmm0
613 paddd xmm5,xmm1
614 pxor xmm12,xmm4
615 pxor xmm13,xmm5
616 movdqa xmm7,xmm12
617 pslld xmm12,7
618 psrld xmm7,25
619 movdqa xmm6,xmm13
620 pslld xmm13,7
621 por xmm12,xmm7
622 psrld xmm6,25
623 movdqa xmm7,XMMWORD[r10]
624 por xmm13,xmm6
; Park words 8,9 and bring words 10,11 into xmm4/xmm5.
625 movdqa XMMWORD[rsp],xmm4
626 movdqa XMMWORD[16+rsp],xmm5
627 movdqa xmm4,XMMWORD[32+rsp]
628 movdqa xmm5,XMMWORD[48+rsp]
; Column quarter rounds (2,6,10,14) and (3,7,11,15).
629 paddd xmm10,xmm14
630 paddd xmm11,xmm15
631 pxor xmm2,xmm10
632 pxor xmm3,xmm11
; Hand-encoded `pshufb xmm2,xmm7` and `pshufb xmm3,xmm7`.
633 DB 102,15,56,0,215
634 DB 102,15,56,0,223
635 paddd xmm4,xmm2
636 paddd xmm5,xmm3
637 pxor xmm14,xmm4
638 pxor xmm15,xmm5
639 movdqa xmm6,xmm14
640 pslld xmm14,12
641 psrld xmm6,20
642 movdqa xmm7,xmm15
643 pslld xmm15,12
644 por xmm14,xmm6
645 psrld xmm7,20
646 movdqa xmm6,XMMWORD[r11]
647 por xmm15,xmm7
648 paddd xmm10,xmm14
649 paddd xmm11,xmm15
650 pxor xmm2,xmm10
651 pxor xmm3,xmm11
; Hand-encoded `pshufb xmm2,xmm6` and `pshufb xmm3,xmm6`.
652 DB 102,15,56,0,214
653 DB 102,15,56,0,222
654 paddd xmm4,xmm2
655 paddd xmm5,xmm3
656 pxor xmm14,xmm4
657 pxor xmm15,xmm5
658 movdqa xmm7,xmm14
659 pslld xmm14,7
660 psrld xmm7,25
661 movdqa xmm6,xmm15
662 pslld xmm15,7
663 por xmm14,xmm7
664 psrld xmm6,25
665 movdqa xmm7,XMMWORD[r10]
666 por xmm15,xmm6
; Diagonal quarter rounds (0,5,10,15) and (1,6,11,12); xmm4/xmm5 still hold
; words 10,11.
667 paddd xmm8,xmm13
668 paddd xmm9,xmm14
669 pxor xmm3,xmm8
670 pxor xmm0,xmm9
; Hand-encoded `pshufb xmm3,xmm7` and `pshufb xmm0,xmm7`.
671 DB 102,15,56,0,223
672 DB 102,15,56,0,199
673 paddd xmm4,xmm3
674 paddd xmm5,xmm0
675 pxor xmm13,xmm4
676 pxor xmm14,xmm5
677 movdqa xmm6,xmm13
678 pslld xmm13,12
679 psrld xmm6,20
680 movdqa xmm7,xmm14
681 pslld xmm14,12
682 por xmm13,xmm6
683 psrld xmm7,20
684 movdqa xmm6,XMMWORD[r11]
685 por xmm14,xmm7
686 paddd xmm8,xmm13
687 paddd xmm9,xmm14
688 pxor xmm3,xmm8
689 pxor xmm0,xmm9
; Hand-encoded `pshufb xmm3,xmm6` and `pshufb xmm0,xmm6`.
690 DB 102,15,56,0,222
691 DB 102,15,56,0,198
692 paddd xmm4,xmm3
693 paddd xmm5,xmm0
694 pxor xmm13,xmm4
695 pxor xmm14,xmm5
696 movdqa xmm7,xmm13
697 pslld xmm13,7
698 psrld xmm7,25
699 movdqa xmm6,xmm14
700 pslld xmm14,7
701 por xmm13,xmm7
702 psrld xmm6,25
703 movdqa xmm7,XMMWORD[r10]
704 por xmm14,xmm6
; Park words 10,11 and bring words 8,9 back into xmm4/xmm5.
705 movdqa XMMWORD[32+rsp],xmm4
706 movdqa XMMWORD[48+rsp],xmm5
707 movdqa xmm4,XMMWORD[rsp]
708 movdqa xmm5,XMMWORD[16+rsp]
; Diagonal quarter rounds (2,7,8,13) and (3,4,9,14).
709 paddd xmm10,xmm15
710 paddd xmm11,xmm12
711 pxor xmm1,xmm10
712 pxor xmm2,xmm11
; Hand-encoded `pshufb xmm1,xmm7` and `pshufb xmm2,xmm7`.
713 DB 102,15,56,0,207
714 DB 102,15,56,0,215
715 paddd xmm4,xmm1
716 paddd xmm5,xmm2
717 pxor xmm15,xmm4
718 pxor xmm12,xmm5
719 movdqa xmm6,xmm15
720 pslld xmm15,12
721 psrld xmm6,20
722 movdqa xmm7,xmm12
723 pslld xmm12,12
724 por xmm15,xmm6
725 psrld xmm7,20
726 movdqa xmm6,XMMWORD[r11]
727 por xmm12,xmm7
728 paddd xmm10,xmm15
729 paddd xmm11,xmm12
730 pxor xmm1,xmm10
731 pxor xmm2,xmm11
; Hand-encoded `pshufb xmm1,xmm6` and `pshufb xmm2,xmm6`.
732 DB 102,15,56,0,206
733 DB 102,15,56,0,214
734 paddd xmm4,xmm1
735 paddd xmm5,xmm2
736 pxor xmm15,xmm4
737 pxor xmm12,xmm5
738 movdqa xmm7,xmm15
739 pslld xmm15,7
740 psrld xmm7,25
741 movdqa xmm6,xmm12
742 pslld xmm12,7
743 por xmm15,xmm7
744 psrld xmm6,25
; xmm7 = rot16 mask again, ready for the top of the loop.
745 movdqa xmm7,XMMWORD[r10]
746 por xmm12,xmm6
; On exit xmm4/xmm5 = words 8,9 and [32+rsp]/[48+rsp] = words 10,11.
747 dec eax
748 jnz NEAR $L$oop4x
749
; Rounds finished. For each state row: add the saved input words
; (feed-forward), then transpose the 4x4 dword matrix with punpck*dq /
; punpck*qdq so that each register holds 16 contiguous keystream bytes of a
; single block instead of one word of four blocks.
; Row 0 (words 0-3).
750 paddd xmm8,XMMWORD[64+rsp]
751 paddd xmm9,XMMWORD[80+rsp]
752 paddd xmm10,XMMWORD[96+rsp]
753 paddd xmm11,XMMWORD[112+rsp]
754
; After this transpose: xmm8, xmm9, xmm6, xmm11 = row 0 of blocks 0,1,2,3.
755 movdqa xmm6,xmm8
756 punpckldq xmm8,xmm9
757 movdqa xmm7,xmm10
758 punpckldq xmm10,xmm11
759 punpckhdq xmm6,xmm9
760 punpckhdq xmm7,xmm11
761 movdqa xmm9,xmm8
762 punpcklqdq xmm8,xmm10
763 movdqa xmm11,xmm6
764 punpcklqdq xmm6,xmm7
765 punpckhqdq xmm9,xmm10
766 punpckhqdq xmm11,xmm7
; Row 1 (words 4-7) feed-forward.
767 paddd xmm12,XMMWORD[((128-256))+rcx]
768 paddd xmm13,XMMWORD[((144-256))+rcx]
769 paddd xmm14,XMMWORD[((160-256))+rcx]
770 paddd xmm15,XMMWORD[((176-256))+rcx]
771
; Park row 0 of blocks 0,1 at [rsp]/[16+rsp] and pick up words 10,11, which
; the round loop left at [32+rsp]/[48+rsp], into xmm8/xmm9.
772 movdqa XMMWORD[rsp],xmm8
773 movdqa XMMWORD[16+rsp],xmm9
774 movdqa xmm8,XMMWORD[32+rsp]
775 movdqa xmm9,XMMWORD[48+rsp]
776
; After this transpose: xmm12, xmm13, xmm10, xmm15 = row 1 of blocks 0,1,2,3.
777 movdqa xmm10,xmm12
778 punpckldq xmm12,xmm13
779 movdqa xmm7,xmm14
780 punpckldq xmm14,xmm15
781 punpckhdq xmm10,xmm13
782 punpckhdq xmm7,xmm15
783 movdqa xmm13,xmm12
784 punpcklqdq xmm12,xmm14
785 movdqa xmm15,xmm10
786 punpcklqdq xmm10,xmm7
787 punpckhqdq xmm13,xmm14
788 punpckhqdq xmm15,xmm7
; Row 2 (words 8-11) feed-forward; words 8,9 = xmm4,xmm5, 10,11 = xmm8,xmm9.
789 paddd xmm4,XMMWORD[((192-256))+rcx]
790 paddd xmm5,XMMWORD[((208-256))+rcx]
791 paddd xmm8,XMMWORD[((224-256))+rcx]
792 paddd xmm9,XMMWORD[((240-256))+rcx]
793
; Park row 0 of blocks 2,3, so [rsp+16*k] now holds row 0 of block k.
794 movdqa XMMWORD[32+rsp],xmm6
795 movdqa XMMWORD[48+rsp],xmm11
796
; After this transpose: xmm4, xmm5, xmm14, xmm9 = row 2 of blocks 0,1,2,3.
797 movdqa xmm14,xmm4
798 punpckldq xmm4,xmm5
799 movdqa xmm7,xmm8
800 punpckldq xmm8,xmm9
801 punpckhdq xmm14,xmm5
802 punpckhdq xmm7,xmm9
803 movdqa xmm5,xmm4
804 punpcklqdq xmm4,xmm8
805 movdqa xmm9,xmm14
806 punpcklqdq xmm14,xmm7
807 punpckhqdq xmm5,xmm8
808 punpckhqdq xmm9,xmm7
; Row 3 (words 12-15) feed-forward; the first vector holds the lane counters.
809 paddd xmm0,XMMWORD[((256-256))+rcx]
810 paddd xmm1,XMMWORD[((272-256))+rcx]
811 paddd xmm2,XMMWORD[((288-256))+rcx]
812 paddd xmm3,XMMWORD[((304-256))+rcx]
813
; After this transpose: xmm0, xmm1, xmm8, xmm3 = row 3 of blocks 0,1,2,3.
814 movdqa xmm8,xmm0
815 punpckldq xmm0,xmm1
816 movdqa xmm7,xmm2
817 punpckldq xmm2,xmm3
818 punpckhdq xmm8,xmm1
819 punpckhdq xmm7,xmm3
820 movdqa xmm1,xmm0
821 punpcklqdq xmm0,xmm2
822 movdqa xmm3,xmm8
823 punpcklqdq xmm8,xmm7
824 punpckhqdq xmm1,xmm2
825 punpckhqdq xmm3,xmm7
; Fewer than 256 bytes left: handle the partial group at $L$tail4x.
826 cmp rdx,64*4
827 jb NEAR $L$tail4x
828
; Full 256 bytes. xmm6, xmm11, xmm2, xmm7 are free now and carry the input
; data; loads for the next block are interleaved with stores of the previous.
; Block 0: keystream = [rsp], xmm12, xmm4, xmm0.
829 movdqu xmm6,XMMWORD[rsi]
830 movdqu xmm11,XMMWORD[16+rsi]
831 movdqu xmm2,XMMWORD[32+rsi]
832 movdqu xmm7,XMMWORD[48+rsi]
833 pxor xmm6,XMMWORD[rsp]
834 pxor xmm11,xmm12
835 pxor xmm2,xmm4
836 pxor xmm7,xmm0
837
; Block 1: keystream = [16+rsp], xmm13, xmm5, xmm1.
838 movdqu XMMWORD[rdi],xmm6
839 movdqu xmm6,XMMWORD[64+rsi]
840 movdqu XMMWORD[16+rdi],xmm11
841 movdqu xmm11,XMMWORD[80+rsi]
842 movdqu XMMWORD[32+rdi],xmm2
843 movdqu xmm2,XMMWORD[96+rsi]
844 movdqu XMMWORD[48+rdi],xmm7
845 movdqu xmm7,XMMWORD[112+rsi]
846 lea rsi,[128+rsi]
847 pxor xmm6,XMMWORD[16+rsp]
848 pxor xmm11,xmm13
849 pxor xmm2,xmm5
850 pxor xmm7,xmm1
851
; Block 2: keystream = [32+rsp], xmm10, xmm14, xmm8.
852 movdqu XMMWORD[64+rdi],xmm6
853 movdqu xmm6,XMMWORD[rsi]
854 movdqu XMMWORD[80+rdi],xmm11
855 movdqu xmm11,XMMWORD[16+rsi]
856 movdqu XMMWORD[96+rdi],xmm2
857 movdqu xmm2,XMMWORD[32+rsi]
858 movdqu XMMWORD[112+rdi],xmm7
859 lea rdi,[128+rdi]
860 movdqu xmm7,XMMWORD[48+rsi]
861 pxor xmm6,XMMWORD[32+rsp]
862 pxor xmm11,xmm10
863 pxor xmm2,xmm14
864 pxor xmm7,xmm8
865
; Block 3: keystream = [48+rsp], xmm15, xmm9, xmm3.
866 movdqu XMMWORD[rdi],xmm6
867 movdqu xmm6,XMMWORD[64+rsi]
868 movdqu XMMWORD[16+rdi],xmm11
869 movdqu xmm11,XMMWORD[80+rsi]
870 movdqu XMMWORD[32+rdi],xmm2
871 movdqu xmm2,XMMWORD[96+rsi]
872 movdqu XMMWORD[48+rdi],xmm7
873 movdqu xmm7,XMMWORD[112+rsi]
874 lea rsi,[128+rsi]
875 pxor xmm6,XMMWORD[48+rsp]
876 pxor xmm11,xmm15
877 pxor xmm2,xmm9
878 pxor xmm7,xmm3
879 movdqu XMMWORD[64+rdi],xmm6
880 movdqu XMMWORD[80+rdi],xmm11
881 movdqu XMMWORD[96+rdi],xmm2
882 movdqu XMMWORD[112+rdi],xmm7
883 lea rdi,[128+rdi]
884
; Another iteration while bytes remain; an exact multiple of 256 ends here.
885 sub rdx,64*4
886 jnz NEAR $L$oop_outer4x
887
888 jmp NEAR $L$done4x
889
; $L$tail4x: fewer than 256 bytes remain (rdx non-zero). Four blocks of
; keystream are available on entry:
;   block 0 = [rsp],    xmm12, xmm4,  xmm0
;   block 1 = [16+rsp], xmm13, xmm5,  xmm1
;   block 2 = [32+rsp], xmm10, xmm14, xmm8
;   block 3 = [48+rsp], xmm15, xmm9,  xmm3
; Whole 64-byte blocks are XORed with vector code. The keystream block that
; covers the final partial block is then laid out contiguously at
; [0..63+rsp] for the byte loop at $L$oop_tail4x, with r10 as byte index.
890 $L$tail4x:
891 cmp rdx,192
892 jae NEAR $L$192_or_more4x
893 cmp rdx,128
894 jae NEAR $L$128_or_more4x
895 cmp rdx,64
896 jae NEAR $L$64_or_more4x
897
898
; 1..63 bytes: only block 0 is needed; its first row is already at [rsp].
899 xor r10,r10
900
901 movdqa XMMWORD[16+rsp],xmm12
902 movdqa XMMWORD[32+rsp],xmm4
903 movdqa XMMWORD[48+rsp],xmm0
904 jmp NEAR $L$oop_tail4x
905
; 64..127 bytes: emit block 0 in full.
906 ALIGN 32
907 $L$64_or_more4x:
908 movdqu xmm6,XMMWORD[rsi]
909 movdqu xmm11,XMMWORD[16+rsi]
910 movdqu xmm2,XMMWORD[32+rsi]
911 movdqu xmm7,XMMWORD[48+rsi]
912 pxor xmm6,XMMWORD[rsp]
913 pxor xmm11,xmm12
914 pxor xmm2,xmm4
915 pxor xmm7,xmm0
916 movdqu XMMWORD[rdi],xmm6
917 movdqu XMMWORD[16+rdi],xmm11
918 movdqu XMMWORD[32+rdi],xmm2
919 movdqu XMMWORD[48+rdi],xmm7
; ZF still reflects `cmp rdx,64` at $L$tail4x: none of the SSE moves/xors in
; between write the flags. Exactly 64 bytes means we are done.
920 je NEAR $L$done4x
921
; Stage block 1 at [0..63+rsp] and leave rdx = bytes beyond the first block.
922 movdqa xmm6,XMMWORD[16+rsp]
923 lea rsi,[64+rsi]
924 xor r10,r10
925 movdqa XMMWORD[rsp],xmm6
926 movdqa XMMWORD[16+rsp],xmm13
927 lea rdi,[64+rdi]
928 movdqa XMMWORD[32+rsp],xmm5
929 sub rdx,64
930 movdqa XMMWORD[48+rsp],xmm1
931 jmp NEAR $L$oop_tail4x
932
; 128..191 bytes: emit blocks 0 and 1 in full.
933 ALIGN 32
934 $L$128_or_more4x:
935 movdqu xmm6,XMMWORD[rsi]
936 movdqu xmm11,XMMWORD[16+rsi]
937 movdqu xmm2,XMMWORD[32+rsi]
938 movdqu xmm7,XMMWORD[48+rsi]
939 pxor xmm6,XMMWORD[rsp]
940 pxor xmm11,xmm12
941 pxor xmm2,xmm4
942 pxor xmm7,xmm0
943
944 movdqu XMMWORD[rdi],xmm6
945 movdqu xmm6,XMMWORD[64+rsi]
946 movdqu XMMWORD[16+rdi],xmm11
947 movdqu xmm11,XMMWORD[80+rsi]
948 movdqu XMMWORD[32+rdi],xmm2
949 movdqu xmm2,XMMWORD[96+rsi]
950 movdqu XMMWORD[48+rdi],xmm7
951 movdqu xmm7,XMMWORD[112+rsi]
952 pxor xmm6,XMMWORD[16+rsp]
953 pxor xmm11,xmm13
954 pxor xmm2,xmm5
955 pxor xmm7,xmm1
956 movdqu XMMWORD[64+rdi],xmm6
957 movdqu XMMWORD[80+rdi],xmm11
958 movdqu XMMWORD[96+rdi],xmm2
959 movdqu XMMWORD[112+rdi],xmm7
; ZF is still from `cmp rdx,128` at $L$tail4x.
960 je NEAR $L$done4x
961
; Stage block 2 at [0..63+rsp] for the byte loop.
962 movdqa xmm6,XMMWORD[32+rsp]
963 lea rsi,[128+rsi]
964 xor r10,r10
965 movdqa XMMWORD[rsp],xmm6
966 movdqa XMMWORD[16+rsp],xmm10
967 lea rdi,[128+rdi]
968 movdqa XMMWORD[32+rsp],xmm14
969 sub rdx,128
970 movdqa XMMWORD[48+rsp],xmm8
971 jmp NEAR $L$oop_tail4x
972
; 192..255 bytes: emit blocks 0, 1 and 2 in full.
973 ALIGN 32
974 $L$192_or_more4x:
975 movdqu xmm6,XMMWORD[rsi]
976 movdqu xmm11,XMMWORD[16+rsi]
977 movdqu xmm2,XMMWORD[32+rsi]
978 movdqu xmm7,XMMWORD[48+rsi]
979 pxor xmm6,XMMWORD[rsp]
980 pxor xmm11,xmm12
981 pxor xmm2,xmm4
982 pxor xmm7,xmm0
983
984 movdqu XMMWORD[rdi],xmm6
985 movdqu xmm6,XMMWORD[64+rsi]
986 movdqu XMMWORD[16+rdi],xmm11
987 movdqu xmm11,XMMWORD[80+rsi]
988 movdqu XMMWORD[32+rdi],xmm2
989 movdqu xmm2,XMMWORD[96+rsi]
990 movdqu XMMWORD[48+rdi],xmm7
991 movdqu xmm7,XMMWORD[112+rsi]
992 lea rsi,[128+rsi]
993 pxor xmm6,XMMWORD[16+rsp]
994 pxor xmm11,xmm13
995 pxor xmm2,xmm5
996 pxor xmm7,xmm1
997
998 movdqu XMMWORD[64+rdi],xmm6
999 movdqu xmm6,XMMWORD[rsi]
1000 movdqu XMMWORD[80+rdi],xmm11
1001 movdqu xmm11,XMMWORD[16+rsi]
1002 movdqu XMMWORD[96+rdi],xmm2
1003 movdqu xmm2,XMMWORD[32+rsi]
1004 movdqu XMMWORD[112+rdi],xmm7
1005 lea rdi,[128+rdi]
1006 movdqu xmm7,XMMWORD[48+rsi]
1007 pxor xmm6,XMMWORD[32+rsp]
1008 pxor xmm11,xmm10
1009 pxor xmm2,xmm14
1010 pxor xmm7,xmm8
1011 movdqu XMMWORD[rdi],xmm6
1012 movdqu XMMWORD[16+rdi],xmm11
1013 movdqu XMMWORD[32+rdi],xmm2
1014 movdqu XMMWORD[48+rdi],xmm7
; ZF is still from `cmp rdx,192` at $L$tail4x (lea does not write flags
; either).
1015 je NEAR $L$done4x
1016
; Stage block 3 at [0..63+rsp]. rsi/rdi were already advanced by 128 above,
; so only 64 more is added here. Falls through into $L$oop_tail4x.
1017 movdqa xmm6,XMMWORD[48+rsp]
1018 lea rsi,[64+rsi]
1019 xor r10,r10
1020 movdqa XMMWORD[rsp],xmm6
1021 movdqa XMMWORD[16+rsp],xmm15
1022 lea rdi,[64+rdi]
1023 movdqa XMMWORD[32+rsp],xmm9
1024 sub rdx,192
1025 movdqa XMMWORD[48+rsp],xmm3
1026
; $L$oop_tail4x: byte-wise finish of the last partial block.
; out[i] = in[i] ^ keystream[i], with the keystream block at [0..63+rsp],
; r10 = byte index (zeroed by the code that jumps or falls in here) and
; rdx = bytes remaining. r10 is incremented before the store, hence the -1
; displacement.
1027 $L$oop_tail4x:
1028 movzx eax,BYTE[r10*1+rsi]
1029 movzx ecx,BYTE[r10*1+rsp]
1030 lea r10,[1+r10]
1031 xor eax,ecx
1032 mov BYTE[((-1))+r10*1+rdi],al
1033 dec rdx
1034 jnz NEAR $L$oop_tail4x
1035
; $L$done4x: common exit of the 4x path. Restore xmm6-xmm15 from
; [320..479+rsp]: r11 = rsp+368, so [-48+r11] is [320+rsp]. This mirrors the
; save sequence at $L$proceed4x.
1036 $L$done4x:
1037 lea r11,[((320+48))+rsp]
1038 movaps xmm6,XMMWORD[((-48))+r11]
1039 movaps xmm7,XMMWORD[((-32))+r11]
1040 movaps xmm8,XMMWORD[((-16))+r11]
1041 movaps xmm9,XMMWORD[r11]
1042 movaps xmm10,XMMWORD[16+r11]
1043 movaps xmm11,XMMWORD[32+r11]
1044 movaps xmm12,XMMWORD[48+r11]
1045 movaps xmm13,XMMWORD[64+r11]
1046 movaps xmm14,XMMWORD[80+r11]
1047 movaps xmm15,XMMWORD[96+r11]
; Release the frame. No GPRs were pushed on this path, so rsp is back at its
; function-entry value and the home slots hold the caller's rdi/rsi.
1048 add rsp,0x148+160
1049 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1050 mov rsi,QWORD[16+rsp]
; Two-byte `rep ret` (F3 C3).
1051 DB 0F3h,0C3h ;repret
1052 $L$SEH_end_ChaCha20_4x:
1053
1054 ALIGN 32
1055 ChaCha20_8x:
1056 mov QWORD[8+rsp],rdi ;WIN64 prologue
1057 mov QWORD[16+rsp],rsi
1058 mov rax,rsp
1059 $L$SEH_begin_ChaCha20_8x:
1060 mov rdi,rcx
1061 mov rsi,rdx
1062 mov rdx,r8
1063 mov rcx,r9
1064 mov r8,QWORD[40+rsp]
1065
1066
1067 $L$ChaCha20_8x:
1068 mov r10,rsp
1069 sub rsp,0x280+176
1070 and rsp,-32
1071 lea r11,[((656+48))+rsp]
1072 movaps XMMWORD[(-48)+r11],xmm6
1073 movaps XMMWORD[(-32)+r11],xmm7
1074 movaps XMMWORD[(-16)+r11],xmm8
1075 movaps XMMWORD[r11],xmm9
1076 movaps XMMWORD[16+r11],xmm10
1077 movaps XMMWORD[32+r11],xmm11
1078 movaps XMMWORD[48+r11],xmm12
1079 movaps XMMWORD[64+r11],xmm13
1080 movaps XMMWORD[80+r11],xmm14
1081 movaps XMMWORD[96+r11],xmm15
1082 vzeroupper
1083 mov QWORD[640+rsp],r10
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094 vbroadcasti128 ymm11,XMMWORD[$L$sigma]
1095 vbroadcasti128 ymm3,XMMWORD[rcx]
1096 vbroadcasti128 ymm15,XMMWORD[16+rcx]
1097 vbroadcasti128 ymm7,XMMWORD[r8]
1098 lea rcx,[256+rsp]
1099 lea rax,[512+rsp]
1100 lea r10,[$L$rot16]
1101 lea r11,[$L$rot24]
1102
1103 vpshufd ymm8,ymm11,0x00
1104 vpshufd ymm9,ymm11,0x55
1105 vmovdqa YMMWORD[(128-256)+rcx],ymm8
1106 vpshufd ymm10,ymm11,0xaa
1107 vmovdqa YMMWORD[(160-256)+rcx],ymm9
1108 vpshufd ymm11,ymm11,0xff
1109 vmovdqa YMMWORD[(192-256)+rcx],ymm10
1110 vmovdqa YMMWORD[(224-256)+rcx],ymm11
1111
1112 vpshufd ymm0,ymm3,0x00
1113 vpshufd ymm1,ymm3,0x55
1114 vmovdqa YMMWORD[(256-256)+rcx],ymm0
1115 vpshufd ymm2,ymm3,0xaa
1116 vmovdqa YMMWORD[(288-256)+rcx],ymm1
1117 vpshufd ymm3,ymm3,0xff
1118 vmovdqa YMMWORD[(320-256)+rcx],ymm2
1119 vmovdqa YMMWORD[(352-256)+rcx],ymm3
1120
1121 vpshufd ymm12,ymm15,0x00
1122 vpshufd ymm13,ymm15,0x55
1123 vmovdqa YMMWORD[(384-512)+rax],ymm12
1124 vpshufd ymm14,ymm15,0xaa
1125 vmovdqa YMMWORD[(416-512)+rax],ymm13
1126 vpshufd ymm15,ymm15,0xff
1127 vmovdqa YMMWORD[(448-512)+rax],ymm14
1128 vmovdqa YMMWORD[(480-512)+rax],ymm15
1129
1130 vpshufd ymm4,ymm7,0x00
1131 vpshufd ymm5,ymm7,0x55
1132 vpaddd ymm4,ymm4,YMMWORD[$L$incy]
1133 vpshufd ymm6,ymm7,0xaa
1134 vmovdqa YMMWORD[(544-512)+rax],ymm5
1135 vpshufd ymm7,ymm7,0xff
1136 vmovdqa YMMWORD[(576-512)+rax],ymm6
1137 vmovdqa YMMWORD[(608-512)+rax],ymm7
1138
1139 jmp NEAR $L$oop_enter8x
1140
1141 ALIGN 32
1142 $L$oop_outer8x:
1143 vmovdqa ymm8,YMMWORD[((128-256))+rcx]
1144 vmovdqa ymm9,YMMWORD[((160-256))+rcx]
1145 vmovdqa ymm10,YMMWORD[((192-256))+rcx]
1146 vmovdqa ymm11,YMMWORD[((224-256))+rcx]
1147 vmovdqa ymm0,YMMWORD[((256-256))+rcx]
1148 vmovdqa ymm1,YMMWORD[((288-256))+rcx]
1149 vmovdqa ymm2,YMMWORD[((320-256))+rcx]
1150 vmovdqa ymm3,YMMWORD[((352-256))+rcx]
1151 vmovdqa ymm12,YMMWORD[((384-512))+rax]
1152 vmovdqa ymm13,YMMWORD[((416-512))+rax]
1153 vmovdqa ymm14,YMMWORD[((448-512))+rax]
1154 vmovdqa ymm15,YMMWORD[((480-512))+rax]
1155 vmovdqa ymm4,YMMWORD[((512-512))+rax]
1156 vmovdqa ymm5,YMMWORD[((544-512))+rax]
1157 vmovdqa ymm6,YMMWORD[((576-512))+rax]
1158 vmovdqa ymm7,YMMWORD[((608-512))+rax]
1159 vpaddd ymm4,ymm4,YMMWORD[$L$eight]
1160
1161 $L$oop_enter8x:
1162 vmovdqa YMMWORD[64+rsp],ymm14
1163 vmovdqa YMMWORD[96+rsp],ymm15
1164 vbroadcasti128 ymm15,XMMWORD[r10]
1165 vmovdqa YMMWORD[(512-512)+rax],ymm4
1166 mov eax,10
1167 jmp NEAR $L$oop8x
1168
1169 ALIGN 32
1170 $L$oop8x:
1171 vpaddd ymm8,ymm8,ymm0
1172 vpxor ymm4,ymm8,ymm4
1173 vpshufb ymm4,ymm4,ymm15
1174 vpaddd ymm9,ymm9,ymm1
1175 vpxor ymm5,ymm9,ymm5
1176 vpshufb ymm5,ymm5,ymm15
1177 vpaddd ymm12,ymm12,ymm4
1178 vpxor ymm0,ymm12,ymm0
1179 vpslld ymm14,ymm0,12
1180 vpsrld ymm0,ymm0,20
1181 vpor ymm0,ymm14,ymm0
1182 vbroadcasti128 ymm14,XMMWORD[r11]
1183 vpaddd ymm13,ymm13,ymm5
1184 vpxor ymm1,ymm13,ymm1
1185 vpslld ymm15,ymm1,12
1186 vpsrld ymm1,ymm1,20
1187 vpor ymm1,ymm15,ymm1
1188 vpaddd ymm8,ymm8,ymm0
1189 vpxor ymm4,ymm8,ymm4
1190 vpshufb ymm4,ymm4,ymm14
1191 vpaddd ymm9,ymm9,ymm1
1192 vpxor ymm5,ymm9,ymm5
1193 vpshufb ymm5,ymm5,ymm14
1194 vpaddd ymm12,ymm12,ymm4
1195 vpxor ymm0,ymm12,ymm0
1196 vpslld ymm15,ymm0,7
1197 vpsrld ymm0,ymm0,25
1198 vpor ymm0,ymm15,ymm0
1199 vbroadcasti128 ymm15,XMMWORD[r10]
1200 vpaddd ymm13,ymm13,ymm5
1201 vpxor ymm1,ymm13,ymm1
1202 vpslld ymm14,ymm1,7
1203 vpsrld ymm1,ymm1,25
1204 vpor ymm1,ymm14,ymm1
1205 vmovdqa YMMWORD[rsp],ymm12
1206 vmovdqa YMMWORD[32+rsp],ymm13
1207 vmovdqa ymm12,YMMWORD[64+rsp]
1208 vmovdqa ymm13,YMMWORD[96+rsp]
1209 vpaddd ymm10,ymm10,ymm2
1210 vpxor ymm6,ymm10,ymm6
1211 vpshufb ymm6,ymm6,ymm15
1212 vpaddd ymm11,ymm11,ymm3
1213 vpxor ymm7,ymm11,ymm7
1214 vpshufb ymm7,ymm7,ymm15
1215 vpaddd ymm12,ymm12,ymm6
1216 vpxor ymm2,ymm12,ymm2
1217 vpslld ymm14,ymm2,12
1218 vpsrld ymm2,ymm2,20
1219 vpor ymm2,ymm14,ymm2
1220 vbroadcasti128 ymm14,XMMWORD[r11]
1221 vpaddd ymm13,ymm13,ymm7
1222 vpxor ymm3,ymm13,ymm3
1223 vpslld ymm15,ymm3,12
1224 vpsrld ymm3,ymm3,20
1225 vpor ymm3,ymm15,ymm3
1226 vpaddd ymm10,ymm10,ymm2
1227 vpxor ymm6,ymm10,ymm6
1228 vpshufb ymm6,ymm6,ymm14
1229 vpaddd ymm11,ymm11,ymm3
1230 vpxor ymm7,ymm11,ymm7
1231 vpshufb ymm7,ymm7,ymm14
1232 vpaddd ymm12,ymm12,ymm6
1233 vpxor ymm2,ymm12,ymm2
1234 vpslld ymm15,ymm2,7
1235 vpsrld ymm2,ymm2,25
1236 vpor ymm2,ymm15,ymm2
1237 vbroadcasti128 ymm15,XMMWORD[r10]
1238 vpaddd ymm13,ymm13,ymm7
1239 vpxor ymm3,ymm13,ymm3
1240 vpslld ymm14,ymm3,7
1241 vpsrld ymm3,ymm3,25
1242 vpor ymm3,ymm14,ymm3
1243 vpaddd ymm8,ymm8,ymm1
1244 vpxor ymm7,ymm8,ymm7
1245 vpshufb ymm7,ymm7,ymm15
1246 vpaddd ymm9,ymm9,ymm2
1247 vpxor ymm4,ymm9,ymm4
1248 vpshufb ymm4,ymm4,ymm15
1249 vpaddd ymm12,ymm12,ymm7
1250 vpxor ymm1,ymm12,ymm1
1251 vpslld ymm14,ymm1,12
1252 vpsrld ymm1,ymm1,20
1253 vpor ymm1,ymm14,ymm1
1254 vbroadcasti128 ymm14,XMMWORD[r11]
1255 vpaddd ymm13,ymm13,ymm4
1256 vpxor ymm2,ymm13,ymm2
1257 vpslld ymm15,ymm2,12
1258 vpsrld ymm2,ymm2,20
1259 vpor ymm2,ymm15,ymm2
1260 vpaddd ymm8,ymm8,ymm1
1261 vpxor ymm7,ymm8,ymm7
1262 vpshufb ymm7,ymm7,ymm14
1263 vpaddd ymm9,ymm9,ymm2
1264 vpxor ymm4,ymm9,ymm4
1265 vpshufb ymm4,ymm4,ymm14
1266 vpaddd ymm12,ymm12,ymm7
1267 vpxor ymm1,ymm12,ymm1
1268 vpslld ymm15,ymm1,7
1269 vpsrld ymm1,ymm1,25
1270 vpor ymm1,ymm15,ymm1
1271 vbroadcasti128 ymm15,XMMWORD[r10]
1272 vpaddd ymm13,ymm13,ymm4
1273 vpxor ymm2,ymm13,ymm2
1274 vpslld ymm14,ymm2,7
1275 vpsrld ymm2,ymm2,25
1276 vpor ymm2,ymm14,ymm2
1277 vmovdqa YMMWORD[64+rsp],ymm12
1278 vmovdqa YMMWORD[96+rsp],ymm13
1279 vmovdqa ymm12,YMMWORD[rsp]
1280 vmovdqa ymm13,YMMWORD[32+rsp]
1281 vpaddd ymm10,ymm10,ymm3
1282 vpxor ymm5,ymm10,ymm5
1283 vpshufb ymm5,ymm5,ymm15
1284 vpaddd ymm11,ymm11,ymm0
1285 vpxor ymm6,ymm11,ymm6
1286 vpshufb ymm6,ymm6,ymm15
1287 vpaddd ymm12,ymm12,ymm5
1288 vpxor ymm3,ymm12,ymm3
1289 vpslld ymm14,ymm3,12
1290 vpsrld ymm3,ymm3,20
1291 vpor ymm3,ymm14,ymm3
1292 vbroadcasti128 ymm14,XMMWORD[r11]
1293 vpaddd ymm13,ymm13,ymm6
1294 vpxor ymm0,ymm13,ymm0
1295 vpslld ymm15,ymm0,12
1296 vpsrld ymm0,ymm0,20
1297 vpor ymm0,ymm15,ymm0
1298 vpaddd ymm10,ymm10,ymm3
1299 vpxor ymm5,ymm10,ymm5
1300 vpshufb ymm5,ymm5,ymm14
1301 vpaddd ymm11,ymm11,ymm0
1302 vpxor ymm6,ymm11,ymm6
1303 vpshufb ymm6,ymm6,ymm14
1304 vpaddd ymm12,ymm12,ymm5
1305 vpxor ymm3,ymm12,ymm3
1306 vpslld ymm15,ymm3,7
1307 vpsrld ymm3,ymm3,25
1308 vpor ymm3,ymm15,ymm3
1309 vbroadcasti128 ymm15,XMMWORD[r10]
1310 vpaddd ymm13,ymm13,ymm6
1311 vpxor ymm0,ymm13,ymm0
1312 vpslld ymm14,ymm0,7
1313 vpsrld ymm0,ymm0,25
1314 vpor ymm0,ymm14,ymm0
1315 dec eax
1316 jnz NEAR $L$oop8x
1317
1318 lea rax,[512+rsp]
1319 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]
1320 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
1321 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
1322 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
1323
1324 vpunpckldq ymm14,ymm8,ymm9
1325 vpunpckldq ymm15,ymm10,ymm11
1326 vpunpckhdq ymm8,ymm8,ymm9
1327 vpunpckhdq ymm10,ymm10,ymm11
1328 vpunpcklqdq ymm9,ymm14,ymm15
1329 vpunpckhqdq ymm14,ymm14,ymm15
1330 vpunpcklqdq ymm11,ymm8,ymm10
1331 vpunpckhqdq ymm8,ymm8,ymm10
1332 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
1333 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
1334 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
1335 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
1336
1337 vpunpckldq ymm10,ymm0,ymm1
1338 vpunpckldq ymm15,ymm2,ymm3
1339 vpunpckhdq ymm0,ymm0,ymm1
1340 vpunpckhdq ymm2,ymm2,ymm3
1341 vpunpcklqdq ymm1,ymm10,ymm15
1342 vpunpckhqdq ymm10,ymm10,ymm15
1343 vpunpcklqdq ymm3,ymm0,ymm2
1344 vpunpckhqdq ymm0,ymm0,ymm2
1345 vperm2i128 ymm15,ymm9,ymm1,0x20
1346 vperm2i128 ymm1,ymm9,ymm1,0x31
1347 vperm2i128 ymm9,ymm14,ymm10,0x20
1348 vperm2i128 ymm10,ymm14,ymm10,0x31
1349 vperm2i128 ymm14,ymm11,ymm3,0x20
1350 vperm2i128 ymm3,ymm11,ymm3,0x31
1351 vperm2i128 ymm11,ymm8,ymm0,0x20
1352 vperm2i128 ymm0,ymm8,ymm0,0x31
1353 vmovdqa YMMWORD[rsp],ymm15
1354 vmovdqa YMMWORD[32+rsp],ymm9
1355 vmovdqa ymm15,YMMWORD[64+rsp]
1356 vmovdqa ymm9,YMMWORD[96+rsp]
1357
1358 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
1359 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
1360 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
1361 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
1362
1363 vpunpckldq ymm2,ymm12,ymm13
1364 vpunpckldq ymm8,ymm15,ymm9
1365 vpunpckhdq ymm12,ymm12,ymm13
1366 vpunpckhdq ymm15,ymm15,ymm9
1367 vpunpcklqdq ymm13,ymm2,ymm8
1368 vpunpckhqdq ymm2,ymm2,ymm8
1369 vpunpcklqdq ymm9,ymm12,ymm15
1370 vpunpckhqdq ymm12,ymm12,ymm15
1371 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
1372 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
1373 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
1374 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
1375
1376 vpunpckldq ymm15,ymm4,ymm5
1377 vpunpckldq ymm8,ymm6,ymm7
1378 vpunpckhdq ymm4,ymm4,ymm5
1379 vpunpckhdq ymm6,ymm6,ymm7
1380 vpunpcklqdq ymm5,ymm15,ymm8
1381 vpunpckhqdq ymm15,ymm15,ymm8
1382 vpunpcklqdq ymm7,ymm4,ymm6
1383 vpunpckhqdq ymm4,ymm4,ymm6
1384 vperm2i128 ymm8,ymm13,ymm5,0x20
1385 vperm2i128 ymm5,ymm13,ymm5,0x31
1386 vperm2i128 ymm13,ymm2,ymm15,0x20
1387 vperm2i128 ymm15,ymm2,ymm15,0x31
1388 vperm2i128 ymm2,ymm9,ymm7,0x20
1389 vperm2i128 ymm7,ymm9,ymm7,0x31
1390 vperm2i128 ymm9,ymm12,ymm4,0x20
1391 vperm2i128 ymm4,ymm12,ymm4,0x31
1392 vmovdqa ymm6,YMMWORD[rsp]
1393 vmovdqa ymm12,YMMWORD[32+rsp]
1394
1395 cmp rdx,64*8
1396 jb NEAR $L$tail8x
1397
1398 vpxor ymm6,ymm6,YMMWORD[rsi]
1399 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1400 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1401 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1402 lea rsi,[128+rsi]
1403 vmovdqu YMMWORD[rdi],ymm6
1404 vmovdqu YMMWORD[32+rdi],ymm8
1405 vmovdqu YMMWORD[64+rdi],ymm1
1406 vmovdqu YMMWORD[96+rdi],ymm5
1407 lea rdi,[128+rdi]
1408
1409 vpxor ymm12,ymm12,YMMWORD[rsi]
1410 vpxor ymm13,ymm13,YMMWORD[32+rsi]
1411 vpxor ymm10,ymm10,YMMWORD[64+rsi]
1412 vpxor ymm15,ymm15,YMMWORD[96+rsi]
1413 lea rsi,[128+rsi]
1414 vmovdqu YMMWORD[rdi],ymm12
1415 vmovdqu YMMWORD[32+rdi],ymm13
1416 vmovdqu YMMWORD[64+rdi],ymm10
1417 vmovdqu YMMWORD[96+rdi],ymm15
1418 lea rdi,[128+rdi]
1419
1420 vpxor ymm14,ymm14,YMMWORD[rsi]
1421 vpxor ymm2,ymm2,YMMWORD[32+rsi]
1422 vpxor ymm3,ymm3,YMMWORD[64+rsi]
1423 vpxor ymm7,ymm7,YMMWORD[96+rsi]
1424 lea rsi,[128+rsi]
1425 vmovdqu YMMWORD[rdi],ymm14
1426 vmovdqu YMMWORD[32+rdi],ymm2
1427 vmovdqu YMMWORD[64+rdi],ymm3
1428 vmovdqu YMMWORD[96+rdi],ymm7
1429 lea rdi,[128+rdi]
1430
1431 vpxor ymm11,ymm11,YMMWORD[rsi]
1432 vpxor ymm9,ymm9,YMMWORD[32+rsi]
1433 vpxor ymm0,ymm0,YMMWORD[64+rsi]
1434 vpxor ymm4,ymm4,YMMWORD[96+rsi]
1435 lea rsi,[128+rsi]
1436 vmovdqu YMMWORD[rdi],ymm11
1437 vmovdqu YMMWORD[32+rdi],ymm9
1438 vmovdqu YMMWORD[64+rdi],ymm0
1439 vmovdqu YMMWORD[96+rdi],ymm4
1440 lea rdi,[128+rdi]
1441
1442 sub rdx,64*8
1443 jnz NEAR $L$oop_outer8x
1444
1445 jmp NEAR $L$done8x
1446
1447 $L$tail8x:
1448 cmp rdx,448
1449 jae NEAR $L$448_or_more8x
1450 cmp rdx,384
1451 jae NEAR $L$384_or_more8x
1452 cmp rdx,320
1453 jae NEAR $L$320_or_more8x
1454 cmp rdx,256
1455 jae NEAR $L$256_or_more8x
1456 cmp rdx,192
1457 jae NEAR $L$192_or_more8x
1458 cmp rdx,128
1459 jae NEAR $L$128_or_more8x
1460 cmp rdx,64
1461 jae NEAR $L$64_or_more8x
1462
1463 xor r10,r10
1464 vmovdqa YMMWORD[rsp],ymm6
1465 vmovdqa YMMWORD[32+rsp],ymm8
1466 jmp NEAR $L$oop_tail8x
1467
1468 ALIGN 32
1469 $L$64_or_more8x:
1470 vpxor ymm6,ymm6,YMMWORD[rsi]
1471 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1472 vmovdqu YMMWORD[rdi],ymm6
1473 vmovdqu YMMWORD[32+rdi],ymm8
1474 je NEAR $L$done8x
1475
1476 lea rsi,[64+rsi]
1477 xor r10,r10
1478 vmovdqa YMMWORD[rsp],ymm1
1479 lea rdi,[64+rdi]
1480 sub rdx,64
1481 vmovdqa YMMWORD[32+rsp],ymm5
1482 jmp NEAR $L$oop_tail8x
1483
1484 ALIGN 32
1485 $L$128_or_more8x:
1486 vpxor ymm6,ymm6,YMMWORD[rsi]
1487 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1488 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1489 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1490 vmovdqu YMMWORD[rdi],ymm6
1491 vmovdqu YMMWORD[32+rdi],ymm8
1492 vmovdqu YMMWORD[64+rdi],ymm1
1493 vmovdqu YMMWORD[96+rdi],ymm5
1494 je NEAR $L$done8x
1495
1496 lea rsi,[128+rsi]
1497 xor r10,r10
1498 vmovdqa YMMWORD[rsp],ymm12
1499 lea rdi,[128+rdi]
1500 sub rdx,128
1501 vmovdqa YMMWORD[32+rsp],ymm13
1502 jmp NEAR $L$oop_tail8x
1503
1504 ALIGN 32
1505 $L$192_or_more8x:
1506 vpxor ymm6,ymm6,YMMWORD[rsi]
1507 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1508 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1509 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1510 vpxor ymm12,ymm12,YMMWORD[128+rsi]
1511 vpxor ymm13,ymm13,YMMWORD[160+rsi]
1512 vmovdqu YMMWORD[rdi],ymm6
1513 vmovdqu YMMWORD[32+rdi],ymm8
1514 vmovdqu YMMWORD[64+rdi],ymm1
1515 vmovdqu YMMWORD[96+rdi],ymm5
1516 vmovdqu YMMWORD[128+rdi],ymm12
1517 vmovdqu YMMWORD[160+rdi],ymm13
1518 je NEAR $L$done8x
1519
1520 lea rsi,[192+rsi]
1521 xor r10,r10
1522 vmovdqa YMMWORD[rsp],ymm10
1523 lea rdi,[192+rdi]
1524 sub rdx,192
1525 vmovdqa YMMWORD[32+rsp],ymm15
1526 jmp NEAR $L$oop_tail8x
1527
1528 ALIGN 32
1529 $L$256_or_more8x:
1530 vpxor ymm6,ymm6,YMMWORD[rsi]
1531 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1532 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1533 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1534 vpxor ymm12,ymm12,YMMWORD[128+rsi]
1535 vpxor ymm13,ymm13,YMMWORD[160+rsi]
1536 vpxor ymm10,ymm10,YMMWORD[192+rsi]
1537 vpxor ymm15,ymm15,YMMWORD[224+rsi]
1538 vmovdqu YMMWORD[rdi],ymm6
1539 vmovdqu YMMWORD[32+rdi],ymm8
1540 vmovdqu YMMWORD[64+rdi],ymm1
1541 vmovdqu YMMWORD[96+rdi],ymm5
1542 vmovdqu YMMWORD[128+rdi],ymm12
1543 vmovdqu YMMWORD[160+rdi],ymm13
1544 vmovdqu YMMWORD[192+rdi],ymm10
1545 vmovdqu YMMWORD[224+rdi],ymm15
1546 je NEAR $L$done8x
1547
1548 lea rsi,[256+rsi]
1549 xor r10,r10
1550 vmovdqa YMMWORD[rsp],ymm14
1551 lea rdi,[256+rdi]
1552 sub rdx,256
1553 vmovdqa YMMWORD[32+rsp],ymm2
1554 jmp NEAR $L$oop_tail8x
1555
1556 ALIGN 32
1557 $L$320_or_more8x:
1558 vpxor ymm6,ymm6,YMMWORD[rsi]
1559 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1560 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1561 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1562 vpxor ymm12,ymm12,YMMWORD[128+rsi]
1563 vpxor ymm13,ymm13,YMMWORD[160+rsi]
1564 vpxor ymm10,ymm10,YMMWORD[192+rsi]
1565 vpxor ymm15,ymm15,YMMWORD[224+rsi]
1566 vpxor ymm14,ymm14,YMMWORD[256+rsi]
1567 vpxor ymm2,ymm2,YMMWORD[288+rsi]
1568 vmovdqu YMMWORD[rdi],ymm6
1569 vmovdqu YMMWORD[32+rdi],ymm8
1570 vmovdqu YMMWORD[64+rdi],ymm1
1571 vmovdqu YMMWORD[96+rdi],ymm5
1572 vmovdqu YMMWORD[128+rdi],ymm12
1573 vmovdqu YMMWORD[160+rdi],ymm13
1574 vmovdqu YMMWORD[192+rdi],ymm10
1575 vmovdqu YMMWORD[224+rdi],ymm15
1576 vmovdqu YMMWORD[256+rdi],ymm14
1577 vmovdqu YMMWORD[288+rdi],ymm2
1578 je NEAR $L$done8x
1579
1580 lea rsi,[320+rsi]
1581 xor r10,r10
1582 vmovdqa YMMWORD[rsp],ymm3
1583 lea rdi,[320+rdi]
1584 sub rdx,320
1585 vmovdqa YMMWORD[32+rsp],ymm7
1586 jmp NEAR $L$oop_tail8x
1587
1588 ALIGN 32
1589 $L$384_or_more8x:
1590 vpxor ymm6,ymm6,YMMWORD[rsi]
1591 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1592 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1593 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1594 vpxor ymm12,ymm12,YMMWORD[128+rsi]
1595 vpxor ymm13,ymm13,YMMWORD[160+rsi]
1596 vpxor ymm10,ymm10,YMMWORD[192+rsi]
1597 vpxor ymm15,ymm15,YMMWORD[224+rsi]
1598 vpxor ymm14,ymm14,YMMWORD[256+rsi]
1599 vpxor ymm2,ymm2,YMMWORD[288+rsi]
1600 vpxor ymm3,ymm3,YMMWORD[320+rsi]
1601 vpxor ymm7,ymm7,YMMWORD[352+rsi]
1602 vmovdqu YMMWORD[rdi],ymm6
1603 vmovdqu YMMWORD[32+rdi],ymm8
1604 vmovdqu YMMWORD[64+rdi],ymm1
1605 vmovdqu YMMWORD[96+rdi],ymm5
1606 vmovdqu YMMWORD[128+rdi],ymm12
1607 vmovdqu YMMWORD[160+rdi],ymm13
1608 vmovdqu YMMWORD[192+rdi],ymm10
1609 vmovdqu YMMWORD[224+rdi],ymm15
1610 vmovdqu YMMWORD[256+rdi],ymm14
1611 vmovdqu YMMWORD[288+rdi],ymm2
1612 vmovdqu YMMWORD[320+rdi],ymm3
1613 vmovdqu YMMWORD[352+rdi],ymm7
1614 je NEAR $L$done8x
1615
1616 lea rsi,[384+rsi]
1617 xor r10,r10
1618 vmovdqa YMMWORD[rsp],ymm11
1619 lea rdi,[384+rdi]
1620 sub rdx,384
1621 vmovdqa YMMWORD[32+rsp],ymm9
1622 jmp NEAR $L$oop_tail8x
1623
1624 ALIGN 32
1625 $L$448_or_more8x:
1626 vpxor ymm6,ymm6,YMMWORD[rsi]
1627 vpxor ymm8,ymm8,YMMWORD[32+rsi]
1628 vpxor ymm1,ymm1,YMMWORD[64+rsi]
1629 vpxor ymm5,ymm5,YMMWORD[96+rsi]
1630 vpxor ymm12,ymm12,YMMWORD[128+rsi]
1631 vpxor ymm13,ymm13,YMMWORD[160+rsi]
1632 vpxor ymm10,ymm10,YMMWORD[192+rsi]
1633 vpxor ymm15,ymm15,YMMWORD[224+rsi]
1634 vpxor ymm14,ymm14,YMMWORD[256+rsi]
1635 vpxor ymm2,ymm2,YMMWORD[288+rsi]
1636 vpxor ymm3,ymm3,YMMWORD[320+rsi]
1637 vpxor ymm7,ymm7,YMMWORD[352+rsi]
1638 vpxor ymm11,ymm11,YMMWORD[384+rsi]
1639 vpxor ymm9,ymm9,YMMWORD[416+rsi]
1640 vmovdqu YMMWORD[rdi],ymm6
1641 vmovdqu YMMWORD[32+rdi],ymm8
1642 vmovdqu YMMWORD[64+rdi],ymm1
1643 vmovdqu YMMWORD[96+rdi],ymm5
1644 vmovdqu YMMWORD[128+rdi],ymm12
1645 vmovdqu YMMWORD[160+rdi],ymm13
1646 vmovdqu YMMWORD[192+rdi],ymm10
1647 vmovdqu YMMWORD[224+rdi],ymm15
1648 vmovdqu YMMWORD[256+rdi],ymm14
1649 vmovdqu YMMWORD[288+rdi],ymm2
1650 vmovdqu YMMWORD[320+rdi],ymm3
1651 vmovdqu YMMWORD[352+rdi],ymm7
1652 vmovdqu YMMWORD[384+rdi],ymm11
1653 vmovdqu YMMWORD[416+rdi],ymm9
1654 je NEAR $L$done8x
1655
1656 lea rsi,[448+rsi]
1657 xor r10,r10
1658 vmovdqa YMMWORD[rsp],ymm0
1659 lea rdi,[448+rdi]
1660 sub rdx,448
1661 vmovdqa YMMWORD[32+rsp],ymm4
1662
1663 $L$oop_tail8x:
1664 movzx eax,BYTE[r10*1+rsi]
1665 movzx ecx,BYTE[r10*1+rsp]
1666 lea r10,[1+r10]
1667 xor eax,ecx
1668 mov BYTE[((-1))+r10*1+rdi],al
1669 dec rdx
1670 jnz NEAR $L$oop_tail8x
1671
1672 $L$done8x:
1673 vzeroall
1674 lea r11,[((656+48))+rsp]
1675 movaps xmm6,XMMWORD[((-48))+r11]
1676 movaps xmm7,XMMWORD[((-32))+r11]
1677 movaps xmm8,XMMWORD[((-16))+r11]
1678 movaps xmm9,XMMWORD[r11]
1679 movaps xmm10,XMMWORD[16+r11]
1680 movaps xmm11,XMMWORD[32+r11]
1681 movaps xmm12,XMMWORD[48+r11]
1682 movaps xmm13,XMMWORD[64+r11]
1683 movaps xmm14,XMMWORD[80+r11]
1684 movaps xmm15,XMMWORD[96+r11]
1685 mov rsp,QWORD[640+rsp]
1686 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1687 mov rsi,QWORD[16+rsp]
1688 DB 0F3h,0C3h ;repret
1689 $L$SEH_end_ChaCha20_8x:
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698