OLD | NEW |
(Empty) | |
| 1 default rel |
| 2 %define XMMWORD |
| 3 %define YMMWORD |
| 4 %define ZMMWORD |
| 5 section .text code align=64 |
| 6 |
| 7 |
| 8 EXTERN OPENSSL_ia32cap_P |
| 9 |
; ----------------------------------------------------------------------
; Constant tables shared by the code paths in this file.
; ----------------------------------------------------------------------
| 10 ALIGN 64 |
; Four zero dwords. Not referenced anywhere in the visible part of this file.
| 11 $L$zero: |
| 12 DD 0,0,0,0 |
; {1,0,0,0}: added with paddd to the 16-byte block loaded from the fifth
; argument, so its first dword goes up by one per 64-byte block
; (scalar and one-block SSSE3 paths).
| 13 $L$one: |
| 14 DD 1,0,0,0 |
; Per-lane offsets 0..3 added to the broadcast first dword of that block
; when the 4-way path sets up its four parallel lanes.
| 15 $L$inc: |
| 16 DD 0,1,2,3 |
; Step added to the four per-lane values on every 4-way outer iteration.
| 17 $L$four: |
| 18 DD 4,4,4,4 |
; Per-lane offsets for the 8-way (ymm) path; note the interleaved order.
| 19 $L$incy: |
| 20 DD 0,2,4,6,1,3,5,7 |
; Step added to the eight per-lane values on every 8-way outer iteration.
| 21 $L$eight: |
| 22 DD 8,8,8,8,8,8,8,8 |
; pshufb mask: within each dword, bytes are taken in order 2,3,0,1,
; i.e. every 32-bit lane is rotated by 16 bits.
| 23 $L$rot16: |
| 24 DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd |
; pshufb mask: within each dword, bytes are taken in order 3,0,1,2,
; i.e. every 32-bit lane is rotated left by 8 bits (= right by 24).
| 25 $L$rot24: |
| 26 DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe |
; ASCII "expand 32-byte k" followed by a NUL byte. These 16 bytes are loaded
; as the first row of the state by the SIMD paths; the scalar path uses the
; same four little-endian dwords as immediates.
| 27 $L$sigma: |
| 28 DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 |
| 29 DB 0 |
; NUL-terminated ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
| 30 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 |
| 31 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 |
| 32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 |
| 33 DB 108,46,111,114,103,62,0 |
| 34 global ChaCha20_ctr32 |
| 35 |
| 36 ALIGN 64 |
; ----------------------------------------------------------------------
; ChaCha20_ctr32 -- exported entry point, Win64 calling convention.
;
; The prologue saves rdi/rsi in the caller-provided home slots and then
; moves the five arguments into the registers the body works with:
;   rdi = arg1 (rcx)       destination buffer (written 64 bytes at a time)
;   rsi = arg2 (rdx)       source buffer (XORed with the generated block)
;   rdx = arg3 (r8)        length in bytes
;   rcx = arg4 (r9)        pointer to 32 bytes that become state words 4..11
;                          (presumably the key -- confirm against the caller)
;   r8  = arg5 ([40+rsp])  pointer to 16 bytes that become state words 12..15
;                          (presumably counter + nonce -- confirm)
;
; Behaviour visible here:
;   * length == 0            -> return immediately
;   * bit 9 of the dword at OPENSSL_ia32cap_P+4 set
;                            -> jump to $L$ChaCha20_ssse3 (presumably the
;                               SSSE3 feature bit, given the target's name)
;   * otherwise              -> scalar implementation below
;
; Scalar path stack frame (88 bytes after six pushes):
;   [ 0..63]+rsp  64-byte state image; offsets 32..47 double as the spill
;                 area for the two "c" words that are not in registers,
;                 and the whole area holds the final block in $L$tail
;   [64]+rsp      remaining length
;   [72]+rsp      source pointer
;   [80]+rsp      destination pointer
; ----------------------------------------------------------------------
| 37 ChaCha20_ctr32: |
| 38 mov QWORD[8+rsp],rdi ;WIN64 prologue |
| 39 mov QWORD[16+rsp],rsi |
| 40 mov rax,rsp |
| 41 $L$SEH_begin_ChaCha20_ctr32: |
| 42 mov rdi,rcx |
| 43 mov rsi,rdx |
| 44 mov rdx,r8 |
| 45 mov rcx,r9 |
| 46 mov r8,QWORD[40+rsp] |
| 47 |
| 48 |
; Nothing to do for a zero length; otherwise pick an implementation.
; r10 keeps the 64-bit capability value; the SIMD paths reached from here
; test further bits of it.
| 49 cmp rdx,0 |
| 50 je NEAR $L$no_data |
| 51 mov r10,QWORD[((OPENSSL_ia32cap_P+4))] |
| 52 test r10d,512 |
| 53 jnz NEAR $L$ChaCha20_ssse3 |
| 54 |
; Scalar path: save the callee-saved GPRs used below and build the frame.
| 55 push rbx |
| 56 push rbp |
| 57 push r12 |
| 58 push r13 |
| 59 push r14 |
| 60 push r15 |
| 61 sub rsp,64+24 |
| 62 |
| 63 |
; xmm1/xmm2 = the 32 bytes at rcx, xmm3 = the 16 bytes at r8,
; xmm4 = {1,0,0,0} used to bump the first dword of xmm3 per block.
| 64 movdqu xmm1,XMMWORD[rcx] |
| 65 movdqu xmm2,XMMWORD[16+rcx] |
| 66 movdqu xmm3,XMMWORD[r8] |
| 67 movdqa xmm4,XMMWORD[$L$one] |
| 68 |
| 69 |
; Keep a copy of state words 4..15 on the stack; rbp = remaining length.
| 70 movdqa XMMWORD[16+rsp],xmm1 |
| 71 movdqa XMMWORD[32+rsp],xmm2 |
| 72 movdqa XMMWORD[48+rsp],xmm3 |
| 73 mov rbp,rdx |
| 74 jmp NEAR $L$oop_outer |
| 75 |
| 76 ALIGN 32 |
; One pass per 64-byte block. Register allocation for the 16 state words:
;   eax,ebx,ecx,edx  = words 0..3  (the dwords of "expand 32-byte k")
;   r8d..r11d        = words 4..7
;   esi,edi          = two of words 8..11; the other two live at
;                      [32..47]+rsp and are swapped in and out mid-round
;   r12d..r15d       = words 12..15 (word 12 comes from xmm3)
| 77 $L$oop_outer: |
| 78 mov eax,0x61707865 |
| 79 mov ebx,0x3320646e |
| 80 mov ecx,0x79622d32 |
| 81 mov edx,0x6b206574 |
| 82 mov r8d,DWORD[16+rsp] |
| 83 mov r9d,DWORD[20+rsp] |
| 84 mov r10d,DWORD[24+rsp] |
| 85 mov r11d,DWORD[28+rsp] |
| 86 movd r12d,xmm3 |
| 87 mov r13d,DWORD[52+rsp] |
| 88 mov r14d,DWORD[56+rsp] |
| 89 mov r15d,DWORD[60+rsp] |
| 90 |
; rbp, rsi and rdi are needed as working registers inside the round loop,
; so length and both pointers are parked on the stack. ebp becomes the
; iteration counter (10 iterations of two rounds each).
; The DB sequence is a hand-encoded "movq rsi,xmm2" (66 48 0F 7E D6):
; esi = word 8, and rdi = rsi >> 32 gives edi = word 9.
| 91 mov QWORD[((64+0))+rsp],rbp |
| 92 mov ebp,10 |
| 93 mov QWORD[((64+8))+rsp],rsi |
| 94 DB 102,72,15,126,214 |
| 95 mov QWORD[((64+16))+rsp],rdi |
| 96 mov rdi,rsi |
| 97 shr rdi,32 |
| 98 jmp NEAR $L$oop |
| 99 |
| 100 ALIGN 32 |
; Round loop. Each quarter-round is the add / xor / rotate-left sequence
; with rotation counts 16, 12, 8, 7; two quarter-rounds are interleaved.
| 101 $L$oop: |
; Column round, columns (0,4,8,12) and (1,5,9,13).
| 102 add eax,r8d |
| 103 xor r12d,eax |
| 104 rol r12d,16 |
| 105 add ebx,r9d |
| 106 xor r13d,ebx |
| 107 rol r13d,16 |
| 108 add esi,r12d |
| 109 xor r8d,esi |
| 110 rol r8d,12 |
| 111 add edi,r13d |
| 112 xor r9d,edi |
| 113 rol r9d,12 |
| 114 add eax,r8d |
| 115 xor r12d,eax |
| 116 rol r12d,8 |
| 117 add ebx,r9d |
| 118 xor r13d,ebx |
| 119 rol r13d,8 |
| 120 add esi,r12d |
| 121 xor r8d,esi |
| 122 rol r8d,7 |
| 123 add edi,r13d |
| 124 xor r9d,edi |
| 125 rol r9d,7 |
; Swap the "c" words: park words 8,9 and bring in words 10,11.
| 126 mov DWORD[32+rsp],esi |
| 127 mov DWORD[36+rsp],edi |
| 128 mov esi,DWORD[40+rsp] |
| 129 mov edi,DWORD[44+rsp] |
; Column round, columns (2,6,10,14) and (3,7,11,15).
| 130 add ecx,r10d |
| 131 xor r14d,ecx |
| 132 rol r14d,16 |
| 133 add edx,r11d |
| 134 xor r15d,edx |
| 135 rol r15d,16 |
| 136 add esi,r14d |
| 137 xor r10d,esi |
| 138 rol r10d,12 |
| 139 add edi,r15d |
| 140 xor r11d,edi |
| 141 rol r11d,12 |
| 142 add ecx,r10d |
| 143 xor r14d,ecx |
| 144 rol r14d,8 |
| 145 add edx,r11d |
| 146 xor r15d,edx |
| 147 rol r15d,8 |
| 148 add esi,r14d |
| 149 xor r10d,esi |
| 150 rol r10d,7 |
| 151 add edi,r15d |
| 152 xor r11d,edi |
| 153 rol r11d,7 |
; Diagonal round, diagonals (0,5,10,15) and (1,6,11,12);
; esi/edi still hold words 10,11 here.
| 154 add eax,r9d |
| 155 xor r15d,eax |
| 156 rol r15d,16 |
| 157 add ebx,r10d |
| 158 xor r12d,ebx |
| 159 rol r12d,16 |
| 160 add esi,r15d |
| 161 xor r9d,esi |
| 162 rol r9d,12 |
| 163 add edi,r12d |
| 164 xor r10d,edi |
| 165 rol r10d,12 |
| 166 add eax,r9d |
| 167 xor r15d,eax |
| 168 rol r15d,8 |
| 169 add ebx,r10d |
| 170 xor r12d,ebx |
| 171 rol r12d,8 |
| 172 add esi,r15d |
| 173 xor r9d,esi |
| 174 rol r9d,7 |
| 175 add edi,r12d |
| 176 xor r10d,edi |
| 177 rol r10d,7 |
; Swap back: park words 10,11 and bring in words 8,9.
| 178 mov DWORD[40+rsp],esi |
| 179 mov DWORD[44+rsp],edi |
| 180 mov esi,DWORD[32+rsp] |
| 181 mov edi,DWORD[36+rsp] |
; Diagonal round, diagonals (2,7,8,13) and (3,4,9,14).
| 182 add ecx,r11d |
| 183 xor r13d,ecx |
| 184 rol r13d,16 |
| 185 add edx,r8d |
| 186 xor r14d,edx |
| 187 rol r14d,16 |
| 188 add esi,r13d |
| 189 xor r11d,esi |
| 190 rol r11d,12 |
| 191 add edi,r14d |
| 192 xor r8d,edi |
| 193 rol r8d,12 |
| 194 add ecx,r11d |
| 195 xor r13d,ecx |
| 196 rol r13d,8 |
| 197 add edx,r8d |
| 198 xor r14d,edx |
| 199 rol r14d,8 |
| 200 add esi,r13d |
| 201 xor r11d,esi |
| 202 rol r11d,7 |
| 203 add edi,r14d |
| 204 xor r8d,edi |
| 205 rol r8d,7 |
| 206 dec ebp |
| 207 jnz NEAR $L$oop |
; Rounds done. Flush words 8,9 so [32..47]+rsp holds the worked words 8..11,
; reload length and pointers, and advance xmm3's first dword for the next
; block. xmm1 = original words 8..11 (copy of xmm2), completed further down.
| 208 mov DWORD[36+rsp],edi |
| 209 mov DWORD[32+rsp],esi |
| 210 mov rbp,QWORD[64+rsp] |
| 211 movdqa xmm1,xmm2 |
| 212 mov rsi,QWORD[((64+8))+rsp] |
| 213 paddd xmm3,xmm4 |
| 214 mov rdi,QWORD[((64+16))+rsp] |
| 215 |
; Add the input state back to the worked state. [48+rsp] still holds the
; first dword value this block started with (xmm3 was bumped in-register only).
| 216 add eax,0x61707865 |
| 217 add ebx,0x3320646e |
| 218 add ecx,0x79622d32 |
| 219 add edx,0x6b206574 |
| 220 add r8d,DWORD[16+rsp] |
| 221 add r9d,DWORD[20+rsp] |
| 222 add r10d,DWORD[24+rsp] |
| 223 add r11d,DWORD[28+rsp] |
| 224 add r12d,DWORD[48+rsp] |
| 225 add r13d,DWORD[52+rsp] |
| 226 add r14d,DWORD[56+rsp] |
| 227 add r15d,DWORD[60+rsp] |
| 228 paddd xmm1,XMMWORD[32+rsp] |
| 229 |
; Fewer than 64 bytes left: finish byte by byte in $L$tail.
| 230 cmp rbp,64 |
| 231 jb NEAR $L$tail |
| 232 |
; Full block: XOR the 64 generated bytes with the source.
| 233 xor eax,DWORD[rsi] |
| 234 xor ebx,DWORD[4+rsi] |
| 235 xor ecx,DWORD[8+rsi] |
| 236 xor edx,DWORD[12+rsi] |
| 237 xor r8d,DWORD[16+rsi] |
| 238 xor r9d,DWORD[20+rsi] |
| 239 xor r10d,DWORD[24+rsi] |
| 240 xor r11d,DWORD[28+rsi] |
| 241 movdqu xmm0,XMMWORD[32+rsi] |
| 242 xor r12d,DWORD[48+rsi] |
| 243 xor r13d,DWORD[52+rsi] |
| 244 xor r14d,DWORD[56+rsi] |
| 245 xor r15d,DWORD[60+rsi] |
| 246 lea rsi,[64+rsi] |
| 247 pxor xmm0,xmm1 |
| 248 |
; Restore the stack image for the next block: original words 8..11 and the
; incremented first dword of xmm3.
| 249 movdqa XMMWORD[32+rsp],xmm2 |
| 250 movd DWORD[48+rsp],xmm3 |
| 251 |
; Store the 64 result bytes.
| 252 mov DWORD[rdi],eax |
| 253 mov DWORD[4+rdi],ebx |
| 254 mov DWORD[8+rdi],ecx |
| 255 mov DWORD[12+rdi],edx |
| 256 mov DWORD[16+rdi],r8d |
| 257 mov DWORD[20+rdi],r9d |
| 258 mov DWORD[24+rdi],r10d |
| 259 mov DWORD[28+rdi],r11d |
| 260 movdqu XMMWORD[32+rdi],xmm0 |
| 261 mov DWORD[48+rdi],r12d |
| 262 mov DWORD[52+rdi],r13d |
| 263 mov DWORD[56+rdi],r14d |
| 264 mov DWORD[60+rdi],r15d |
| 265 lea rdi,[64+rdi] |
| 266 |
| 267 sub rbp,64 |
| 268 jnz NEAR $L$oop_outer |
| 269 |
| 270 jmp NEAR $L$done |
| 271 |
| 272 ALIGN 16 |
; Partial last block (1..63 bytes): spill the 64 generated bytes to
; [0..63]+rsp, overwriting the saved state, which is no longer needed.
; rbx is cleared to serve as the byte index.
| 273 $L$tail: |
| 274 mov DWORD[rsp],eax |
| 275 mov DWORD[4+rsp],ebx |
| 276 xor rbx,rbx |
| 277 mov DWORD[8+rsp],ecx |
| 278 mov DWORD[12+rsp],edx |
| 279 mov DWORD[16+rsp],r8d |
| 280 mov DWORD[20+rsp],r9d |
| 281 mov DWORD[24+rsp],r10d |
| 282 mov DWORD[28+rsp],r11d |
| 283 movdqa XMMWORD[32+rsp],xmm1 |
| 284 mov DWORD[48+rsp],r12d |
| 285 mov DWORD[52+rsp],r13d |
| 286 mov DWORD[56+rsp],r14d |
| 287 mov DWORD[60+rsp],r15d |
| 288 |
; dst[i] = src[i] ^ block[i] for the rbp remaining bytes.
| 289 $L$oop_tail: |
| 290 movzx eax,BYTE[rbx*1+rsi] |
| 291 movzx edx,BYTE[rbx*1+rsp] |
| 292 lea rbx,[1+rbx] |
| 293 xor eax,edx |
| 294 mov BYTE[((-1))+rbx*1+rdi],al |
| 295 dec rbp |
| 296 jnz NEAR $L$oop_tail |
| 297 |
; Release the frame and restore the callee-saved GPRs.
| 298 $L$done: |
| 299 add rsp,64+24 |
| 300 pop r15 |
| 301 pop r14 |
| 302 pop r13 |
| 303 pop r12 |
| 304 pop rbp |
| 305 pop rbx |
; Common exit: reload rdi/rsi from the home slots written by the prologue.
| 306 $L$no_data: |
| 307 mov rdi,QWORD[8+rsp] ;WIN64 epilogue |
| 308 mov rsi,QWORD[16+rsp] |
| 309 DB 0F3h,0C3h ;repret |
| 310 $L$SEH_end_ChaCha20_ctr32: |
| 311 |
| 312 ALIGN 32 |
; ----------------------------------------------------------------------
; ChaCha20_ssse3 -- one 64-byte block per iteration, the state kept as four
; xmm rows. Uses pshufb for the 16- and 8-bit rotations.
;
; Entry points:
;   ChaCha20_ssse3         Win64 prologue (same argument shuffle as
;                          ChaCha20_ctr32).
;   $L$ChaCha20_ssse3      jumped to from ChaCha20_ctr32's dispatch with
;                          rdi=dst, rsi=src, rdx=len, rcx/r8 = state
;                          pointers and r10 = capability value already set.
;   $L$do_sse3_after_all   jumped to from $L$ChaCha20_4x when it decides
;                          not to use the 4-way code.
;
; Lengths above 128 bytes are handed to $L$ChaCha20_4x.
;
; Stack frame (136 bytes after six pushes):
;   [  0.. 63]+rsp  saved input state (rows 0..3); reused for the final
;                   block in the tail loop
;   [ 96..111]+rsp  saved xmm6   (callee-saved under Win64)
;   [112..127]+rsp  saved xmm7   (callee-saved under Win64)
; ----------------------------------------------------------------------
| 313 ChaCha20_ssse3: |
| 314 mov QWORD[8+rsp],rdi ;WIN64 prologue |
| 315 mov QWORD[16+rsp],rsi |
| 316 mov rax,rsp |
| 317 $L$SEH_begin_ChaCha20_ssse3: |
| 318 mov rdi,rcx |
| 319 mov rsi,rdx |
| 320 mov rdx,r8 |
| 321 mov rcx,r9 |
| 322 mov r8,QWORD[40+rsp] |
| 323 |
| 324 |
| 325 $L$ChaCha20_ssse3: |
| 326 cmp rdx,128 |
| 327 ja NEAR $L$ChaCha20_4x |
| 328 |
; Six GPRs are pushed although only rbx and rbp are modified on this path;
; the matching pops are at $L$done_ssse3.
| 329 $L$do_sse3_after_all: |
| 330 push rbx |
| 331 push rbp |
| 332 push r12 |
| 333 push r13 |
| 334 push r14 |
| 335 push r15 |
| 336 |
| 337 sub rsp,64+72 |
| 338 movaps XMMWORD[(64+32)+rsp],xmm6 |
| 339 movaps XMMWORD[(64+48)+rsp],xmm7 |
; xmm0 = "expand 32-byte k", xmm1/xmm2 = 32 bytes at rcx, xmm3 = 16 bytes
; at r8; xmm6/xmm7 = rotate-by-16 / rotate-left-by-8 shuffle masks.
| 340 movdqa xmm0,XMMWORD[$L$sigma] |
| 341 movdqu xmm1,XMMWORD[rcx] |
| 342 movdqu xmm2,XMMWORD[16+rcx] |
| 343 movdqu xmm3,XMMWORD[r8] |
| 344 movdqa xmm6,XMMWORD[$L$rot16] |
| 345 movdqa xmm7,XMMWORD[$L$rot24] |
| 346 |
; Save the input state; ebp counts 10 loop iterations.
| 347 movdqa XMMWORD[rsp],xmm0 |
| 348 movdqa XMMWORD[16+rsp],xmm1 |
| 349 movdqa XMMWORD[32+rsp],xmm2 |
| 350 movdqa XMMWORD[48+rsp],xmm3 |
| 351 mov ebp,10 |
| 352 jmp NEAR $L$oop_ssse3 |
| 353 |
| 354 ALIGN 32 |
; Start of every block after the first: reload rows 0..2 and add {1,0,0,0}
; to the saved row 3, writing the incremented row back to the stack.
| 355 $L$oop_outer_ssse3: |
| 356 movdqa xmm3,XMMWORD[$L$one] |
| 357 movdqa xmm0,XMMWORD[rsp] |
| 358 movdqa xmm1,XMMWORD[16+rsp] |
| 359 movdqa xmm2,XMMWORD[32+rsp] |
| 360 paddd xmm3,XMMWORD[48+rsp] |
| 361 mov ebp,10 |
| 362 movdqa XMMWORD[48+rsp],xmm3 |
| 363 jmp NEAR $L$oop_ssse3 |
| 364 |
| 365 ALIGN 32 |
; Round loop: each iteration performs the quarter-round on all four columns
; at once, rotates the rows with pshufd so the diagonals line up as columns,
; repeats the quarter-round, and rotates the rows back.
; "DB 102,15,56,0,222" / "...,223" are hand-encoded pshufb xmm3,xmm6 and
; pshufb xmm3,xmm7 (66 0F 38 00 /r). Rotations by 12 and 7 are built from
; a pslld/psrld/por triple with xmm4 as scratch.
| 366 $L$oop_ssse3: |
| 367 paddd xmm0,xmm1 |
| 368 pxor xmm3,xmm0 |
| 369 DB 102,15,56,0,222 |
| 370 paddd xmm2,xmm3 |
| 371 pxor xmm1,xmm2 |
| 372 movdqa xmm4,xmm1 |
| 373 psrld xmm1,20 |
| 374 pslld xmm4,12 |
| 375 por xmm1,xmm4 |
| 376 paddd xmm0,xmm1 |
| 377 pxor xmm3,xmm0 |
| 378 DB 102,15,56,0,223 |
| 379 paddd xmm2,xmm3 |
| 380 pxor xmm1,xmm2 |
| 381 movdqa xmm4,xmm1 |
| 382 psrld xmm1,25 |
| 383 pslld xmm4,7 |
| 384 por xmm1,xmm4 |
; Rotate rows 1..3 by one, two and three lanes.
| 385 pshufd xmm2,xmm2,78 |
| 386 pshufd xmm1,xmm1,57 |
| 387 pshufd xmm3,xmm3,147 |
| 388 nop |
| 389 paddd xmm0,xmm1 |
| 390 pxor xmm3,xmm0 |
| 391 DB 102,15,56,0,222 |
| 392 paddd xmm2,xmm3 |
| 393 pxor xmm1,xmm2 |
| 394 movdqa xmm4,xmm1 |
| 395 psrld xmm1,20 |
| 396 pslld xmm4,12 |
| 397 por xmm1,xmm4 |
| 398 paddd xmm0,xmm1 |
| 399 pxor xmm3,xmm0 |
| 400 DB 102,15,56,0,223 |
| 401 paddd xmm2,xmm3 |
| 402 pxor xmm1,xmm2 |
| 403 movdqa xmm4,xmm1 |
| 404 psrld xmm1,25 |
| 405 pslld xmm4,7 |
| 406 por xmm1,xmm4 |
; Undo the lane rotation (shuffle constants for rows 1 and 3 are swapped).
| 407 pshufd xmm2,xmm2,78 |
| 408 pshufd xmm1,xmm1,147 |
| 409 pshufd xmm3,xmm3,57 |
| 410 dec ebp |
| 411 jnz NEAR $L$oop_ssse3 |
; Add the saved input state to the worked state.
| 412 paddd xmm0,XMMWORD[rsp] |
| 413 paddd xmm1,XMMWORD[16+rsp] |
| 414 paddd xmm2,XMMWORD[32+rsp] |
| 415 paddd xmm3,XMMWORD[48+rsp] |
| 416 |
| 417 cmp rdx,64 |
| 418 jb NEAR $L$tail_ssse3 |
| 419 |
; Full block: XOR 64 source bytes with the block and store.
| 420 movdqu xmm4,XMMWORD[rsi] |
| 421 movdqu xmm5,XMMWORD[16+rsi] |
| 422 pxor xmm0,xmm4 |
| 423 movdqu xmm4,XMMWORD[32+rsi] |
| 424 pxor xmm1,xmm5 |
| 425 movdqu xmm5,XMMWORD[48+rsi] |
| 426 lea rsi,[64+rsi] |
| 427 pxor xmm2,xmm4 |
| 428 pxor xmm3,xmm5 |
| 429 |
| 430 movdqu XMMWORD[rdi],xmm0 |
| 431 movdqu XMMWORD[16+rdi],xmm1 |
| 432 movdqu XMMWORD[32+rdi],xmm2 |
| 433 movdqu XMMWORD[48+rdi],xmm3 |
| 434 lea rdi,[64+rdi] |
| 435 |
| 436 sub rdx,64 |
| 437 jnz NEAR $L$oop_outer_ssse3 |
| 438 |
| 439 jmp NEAR $L$done_ssse3 |
| 440 |
| 441 ALIGN 16 |
; Partial last block: spill the generated block over the saved state and
; XOR byte by byte; rbx = byte index, rdx = bytes remaining.
| 442 $L$tail_ssse3: |
| 443 movdqa XMMWORD[rsp],xmm0 |
| 444 movdqa XMMWORD[16+rsp],xmm1 |
| 445 movdqa XMMWORD[32+rsp],xmm2 |
| 446 movdqa XMMWORD[48+rsp],xmm3 |
| 447 xor rbx,rbx |
| 448 |
| 449 $L$oop_tail_ssse3: |
| 450 movzx eax,BYTE[rbx*1+rsi] |
| 451 movzx ecx,BYTE[rbx*1+rsp] |
| 452 lea rbx,[1+rbx] |
| 453 xor eax,ecx |
| 454 mov BYTE[((-1))+rbx*1+rdi],al |
| 455 dec rdx |
| 456 jnz NEAR $L$oop_tail_ssse3 |
| 457 |
; Restore xmm6/xmm7 and the pushed GPRs, then rdi/rsi from the home slots.
| 458 $L$done_ssse3: |
| 459 movaps xmm6,XMMWORD[((64+32))+rsp] |
| 460 movaps xmm7,XMMWORD[((64+48))+rsp] |
| 461 add rsp,64+72 |
| 462 pop r15 |
| 463 pop r14 |
| 464 pop r13 |
| 465 pop r12 |
| 466 pop rbp |
| 467 pop rbx |
| 468 mov rdi,QWORD[8+rsp] ;WIN64 epilogue |
| 469 mov rsi,QWORD[16+rsp] |
| 470 DB 0F3h,0C3h ;repret |
| 471 $L$SEH_end_ChaCha20_ssse3: |
| 472 |
| 473 ALIGN 32 |
; ----------------------------------------------------------------------
; ChaCha20_4x -- four 64-byte blocks per outer iteration. Every xmm register
; holds ONE state word for four independent blocks (one block per lane).
;
; Entry points:
;   ChaCha20_4x       Win64 prologue (same argument shuffle as
;                     ChaCha20_ctr32).
;   $L$ChaCha20_4x    jumped to from $L$ChaCha20_ssse3 with rdi=dst, rsi=src,
;                     rdx=len (>128), rcx/r8 = state pointers and r10 = the
;                     qword read from OPENSSL_ia32cap_P+4.
;
; Dispatch at $L$ChaCha20_4x:
;   * bit 5 of the upper dword of r10 set -> $L$ChaCha20_8x (presumably the
;     AVX2 feature bit, given the target -- confirm)
;   * len <= 192 and (r10 & 0x04400000) == 0x00400000 -> back to the
;     one-block code at $L$do_sse3_after_all
;     NOTE(review): looks like a CPU-model heuristic on two feature bits
;     (22 and 26); confirm the intent against the generator script.
;
; Stack frame (0x148+160 = 488 bytes, no GPRs pushed; only rax, rcx, r10,
; r11 are used as scratch):
;   [  0.. 63]+rsp  scratch: spilled "c" rows during the rounds, then
;                   transposed output rows / final block for the tail loop
;   [ 64..127]+rsp  words 0..3, each broadcast to four lanes
;   [128..191]+rsp  words 4..7   (addressed as (N-256)+rcx, rcx = rsp+256)
;   [192..255]+rsp  words 8..11
;   [256..271]+rsp  word 12 per lane (first dword at r8 + {0,1,2,3},
;                   advanced by 4 per outer iteration)
;   [272..319]+rsp  words 13..15
;   top 160 bytes   saved xmm6..xmm15 (callee-saved under Win64)
;
; Register allocation inside the round loop:
;   xmm8..xmm11  = words 0..3      xmm12..xmm15 = words 4..7
;   xmm4,xmm5    = two of words 8..11 (the other two are spilled)
;   xmm0..xmm3   = words 12..15
;   xmm6,xmm7    = temporaries / pshufb masks (r10 -> rot16, r11 -> rot24)
; ----------------------------------------------------------------------
| 474 ChaCha20_4x: |
| 475 mov QWORD[8+rsp],rdi ;WIN64 prologue |
| 476 mov QWORD[16+rsp],rsi |
| 477 mov rax,rsp |
| 478 $L$SEH_begin_ChaCha20_4x: |
| 479 mov rdi,rcx |
| 480 mov rsi,rdx |
| 481 mov rdx,r8 |
| 482 mov rcx,r9 |
| 483 mov r8,QWORD[40+rsp] |
| 484 |
| 485 |
| 486 $L$ChaCha20_4x: |
| 487 mov r11,r10 |
| 488 shr r10,32 |
| 489 test r10,32 |
| 490 jnz NEAR $L$ChaCha20_8x |
| 491 cmp rdx,192 |
| 492 ja NEAR $L$proceed4x |
| 493 |
| 494 and r11,71303168 |
| 495 cmp r11,4194304 |
| 496 je NEAR $L$do_sse3_after_all |
| 497 |
; Build the frame. r11 = entry rsp - 120 is the base for the xmm save area,
; which therefore spans entry rsp - 168 .. entry rsp - 9.
| 498 $L$proceed4x: |
| 499 lea r11,[((-120))+rsp] |
| 500 sub rsp,0x148+160 |
| 501 movaps XMMWORD[(-48)+r11],xmm6 |
| 502 movaps XMMWORD[(-32)+r11],xmm7 |
| 503 movaps XMMWORD[(-16)+r11],xmm8 |
| 504 movaps XMMWORD[r11],xmm9 |
| 505 movaps XMMWORD[16+r11],xmm10 |
| 506 movaps XMMWORD[32+r11],xmm11 |
| 507 movaps XMMWORD[48+r11],xmm12 |
| 508 movaps XMMWORD[64+r11],xmm13 |
| 509 movaps XMMWORD[80+r11],xmm14 |
| 510 movaps XMMWORD[96+r11],xmm15 |
; Load the four 16-byte rows of the input state, then repurpose rcx, r10
; and r11 as the frame base and the two shuffle-mask pointers.
| 511 movdqa xmm11,XMMWORD[$L$sigma] |
| 512 movdqu xmm15,XMMWORD[rcx] |
| 513 movdqu xmm7,XMMWORD[16+rcx] |
| 514 movdqu xmm3,XMMWORD[r8] |
| 515 lea rcx,[256+rsp] |
| 516 lea r10,[$L$rot16] |
| 517 lea r11,[$L$rot24] |
| 518 |
; Broadcast each dword of each row to all four lanes and save the result.
| 519 pshufd xmm8,xmm11,0x00 |
| 520 pshufd xmm9,xmm11,0x55 |
| 521 movdqa XMMWORD[64+rsp],xmm8 |
| 522 pshufd xmm10,xmm11,0xaa |
| 523 movdqa XMMWORD[80+rsp],xmm9 |
| 524 pshufd xmm11,xmm11,0xff |
| 525 movdqa XMMWORD[96+rsp],xmm10 |
| 526 movdqa XMMWORD[112+rsp],xmm11 |
| 527 |
| 528 pshufd xmm12,xmm15,0x00 |
| 529 pshufd xmm13,xmm15,0x55 |
| 530 movdqa XMMWORD[(128-256)+rcx],xmm12 |
| 531 pshufd xmm14,xmm15,0xaa |
| 532 movdqa XMMWORD[(144-256)+rcx],xmm13 |
| 533 pshufd xmm15,xmm15,0xff |
| 534 movdqa XMMWORD[(160-256)+rcx],xmm14 |
| 535 movdqa XMMWORD[(176-256)+rcx],xmm15 |
| 536 |
| 537 pshufd xmm4,xmm7,0x00 |
| 538 pshufd xmm5,xmm7,0x55 |
| 539 movdqa XMMWORD[(192-256)+rcx],xmm4 |
| 540 pshufd xmm6,xmm7,0xaa |
| 541 movdqa XMMWORD[(208-256)+rcx],xmm5 |
| 542 pshufd xmm7,xmm7,0xff |
| 543 movdqa XMMWORD[(224-256)+rcx],xmm6 |
| 544 movdqa XMMWORD[(240-256)+rcx],xmm7 |
| 545 |
; Word 12 gets a different value per lane (base + 0,1,2,3); it is stored
; at $L$oop_enter4x rather than here.
| 546 pshufd xmm0,xmm3,0x00 |
| 547 pshufd xmm1,xmm3,0x55 |
| 548 paddd xmm0,XMMWORD[$L$inc] |
| 549 pshufd xmm2,xmm3,0xaa |
| 550 movdqa XMMWORD[(272-256)+rcx],xmm1 |
| 551 pshufd xmm3,xmm3,0xff |
| 552 movdqa XMMWORD[(288-256)+rcx],xmm2 |
| 553 movdqa XMMWORD[(304-256)+rcx],xmm3 |
| 554 |
| 555 jmp NEAR $L$oop_enter4x |
| 556 |
| 557 ALIGN 32 |
; Every later group of four blocks: reload all 16 words from the frame and
; advance the per-lane word 12 by four.
| 558 $L$oop_outer4x: |
| 559 movdqa xmm8,XMMWORD[64+rsp] |
| 560 movdqa xmm9,XMMWORD[80+rsp] |
| 561 movdqa xmm10,XMMWORD[96+rsp] |
| 562 movdqa xmm11,XMMWORD[112+rsp] |
| 563 movdqa xmm12,XMMWORD[((128-256))+rcx] |
| 564 movdqa xmm13,XMMWORD[((144-256))+rcx] |
| 565 movdqa xmm14,XMMWORD[((160-256))+rcx] |
| 566 movdqa xmm15,XMMWORD[((176-256))+rcx] |
| 567 movdqa xmm4,XMMWORD[((192-256))+rcx] |
| 568 movdqa xmm5,XMMWORD[((208-256))+rcx] |
| 569 movdqa xmm6,XMMWORD[((224-256))+rcx] |
| 570 movdqa xmm7,XMMWORD[((240-256))+rcx] |
| 571 movdqa xmm0,XMMWORD[((256-256))+rcx] |
| 572 movdqa xmm1,XMMWORD[((272-256))+rcx] |
| 573 movdqa xmm2,XMMWORD[((288-256))+rcx] |
| 574 movdqa xmm3,XMMWORD[((304-256))+rcx] |
| 575 paddd xmm0,XMMWORD[$L$four] |
| 576 |
; Spill words 10,11 (xmm6/xmm7 are needed as scratch), load the rot16 mask
; into xmm7, record the current word 12 lanes, and run 10 loop iterations.
| 577 $L$oop_enter4x: |
| 578 movdqa XMMWORD[32+rsp],xmm6 |
| 579 movdqa XMMWORD[48+rsp],xmm7 |
| 580 movdqa xmm7,XMMWORD[r10] |
| 581 mov eax,10 |
| 582 movdqa XMMWORD[(256-256)+rcx],xmm0 |
| 583 jmp NEAR $L$oop4x |
| 584 |
| 585 ALIGN 32 |
; Round loop. All "DB 102,15,56,0,NN" lines are hand-encoded
; "pshufb xmmA,xmmB" (66 0F 38 00 /r) with xmm7 = rot16 mask or
; xmm6 = rot24 mask as the source; rotations by 12 and 7 are
; pslld/psrld/por triples. Two quarter-rounds are interleaved throughout.
| 586 $L$oop4x: |
; Column round, columns (0,4,8,12) and (1,5,9,13).
| 587 paddd xmm8,xmm12 |
| 588 paddd xmm9,xmm13 |
| 589 pxor xmm0,xmm8 |
| 590 pxor xmm1,xmm9 |
| 591 DB 102,15,56,0,199 |
| 592 DB 102,15,56,0,207 |
| 593 paddd xmm4,xmm0 |
| 594 paddd xmm5,xmm1 |
| 595 pxor xmm12,xmm4 |
| 596 pxor xmm13,xmm5 |
| 597 movdqa xmm6,xmm12 |
| 598 pslld xmm12,12 |
| 599 psrld xmm6,20 |
| 600 movdqa xmm7,xmm13 |
| 601 pslld xmm13,12 |
| 602 por xmm12,xmm6 |
| 603 psrld xmm7,20 |
| 604 movdqa xmm6,XMMWORD[r11] |
| 605 por xmm13,xmm7 |
| 606 paddd xmm8,xmm12 |
| 607 paddd xmm9,xmm13 |
| 608 pxor xmm0,xmm8 |
| 609 pxor xmm1,xmm9 |
| 610 DB 102,15,56,0,198 |
| 611 DB 102,15,56,0,206 |
| 612 paddd xmm4,xmm0 |
| 613 paddd xmm5,xmm1 |
| 614 pxor xmm12,xmm4 |
| 615 pxor xmm13,xmm5 |
| 616 movdqa xmm7,xmm12 |
| 617 pslld xmm12,7 |
| 618 psrld xmm7,25 |
| 619 movdqa xmm6,xmm13 |
| 620 pslld xmm13,7 |
| 621 por xmm12,xmm7 |
| 622 psrld xmm6,25 |
| 623 movdqa xmm7,XMMWORD[r10] |
| 624 por xmm13,xmm6 |
; Swap the "c" rows: park words 8,9 and bring in words 10,11.
| 625 movdqa XMMWORD[rsp],xmm4 |
| 626 movdqa XMMWORD[16+rsp],xmm5 |
| 627 movdqa xmm4,XMMWORD[32+rsp] |
| 628 movdqa xmm5,XMMWORD[48+rsp] |
; Column round, columns (2,6,10,14) and (3,7,11,15).
| 629 paddd xmm10,xmm14 |
| 630 paddd xmm11,xmm15 |
| 631 pxor xmm2,xmm10 |
| 632 pxor xmm3,xmm11 |
| 633 DB 102,15,56,0,215 |
| 634 DB 102,15,56,0,223 |
| 635 paddd xmm4,xmm2 |
| 636 paddd xmm5,xmm3 |
| 637 pxor xmm14,xmm4 |
| 638 pxor xmm15,xmm5 |
| 639 movdqa xmm6,xmm14 |
| 640 pslld xmm14,12 |
| 641 psrld xmm6,20 |
| 642 movdqa xmm7,xmm15 |
| 643 pslld xmm15,12 |
| 644 por xmm14,xmm6 |
| 645 psrld xmm7,20 |
| 646 movdqa xmm6,XMMWORD[r11] |
| 647 por xmm15,xmm7 |
| 648 paddd xmm10,xmm14 |
| 649 paddd xmm11,xmm15 |
| 650 pxor xmm2,xmm10 |
| 651 pxor xmm3,xmm11 |
| 652 DB 102,15,56,0,214 |
| 653 DB 102,15,56,0,222 |
| 654 paddd xmm4,xmm2 |
| 655 paddd xmm5,xmm3 |
| 656 pxor xmm14,xmm4 |
| 657 pxor xmm15,xmm5 |
| 658 movdqa xmm7,xmm14 |
| 659 pslld xmm14,7 |
| 660 psrld xmm7,25 |
| 661 movdqa xmm6,xmm15 |
| 662 pslld xmm15,7 |
| 663 por xmm14,xmm7 |
| 664 psrld xmm6,25 |
| 665 movdqa xmm7,XMMWORD[r10] |
| 666 por xmm15,xmm6 |
; Diagonal round, diagonals (0,5,10,15) and (1,6,11,12);
; xmm4/xmm5 still hold words 10,11 here.
| 667 paddd xmm8,xmm13 |
| 668 paddd xmm9,xmm14 |
| 669 pxor xmm3,xmm8 |
| 670 pxor xmm0,xmm9 |
| 671 DB 102,15,56,0,223 |
| 672 DB 102,15,56,0,199 |
| 673 paddd xmm4,xmm3 |
| 674 paddd xmm5,xmm0 |
| 675 pxor xmm13,xmm4 |
| 676 pxor xmm14,xmm5 |
| 677 movdqa xmm6,xmm13 |
| 678 pslld xmm13,12 |
| 679 psrld xmm6,20 |
| 680 movdqa xmm7,xmm14 |
| 681 pslld xmm14,12 |
| 682 por xmm13,xmm6 |
| 683 psrld xmm7,20 |
| 684 movdqa xmm6,XMMWORD[r11] |
| 685 por xmm14,xmm7 |
| 686 paddd xmm8,xmm13 |
| 687 paddd xmm9,xmm14 |
| 688 pxor xmm3,xmm8 |
| 689 pxor xmm0,xmm9 |
| 690 DB 102,15,56,0,222 |
| 691 DB 102,15,56,0,198 |
| 692 paddd xmm4,xmm3 |
| 693 paddd xmm5,xmm0 |
| 694 pxor xmm13,xmm4 |
| 695 pxor xmm14,xmm5 |
| 696 movdqa xmm7,xmm13 |
| 697 pslld xmm13,7 |
| 698 psrld xmm7,25 |
| 699 movdqa xmm6,xmm14 |
| 700 pslld xmm14,7 |
| 701 por xmm13,xmm7 |
| 702 psrld xmm6,25 |
| 703 movdqa xmm7,XMMWORD[r10] |
| 704 por xmm14,xmm6 |
; Swap back: park words 10,11 and bring in words 8,9.
| 705 movdqa XMMWORD[32+rsp],xmm4 |
| 706 movdqa XMMWORD[48+rsp],xmm5 |
| 707 movdqa xmm4,XMMWORD[rsp] |
| 708 movdqa xmm5,XMMWORD[16+rsp] |
; Diagonal round, diagonals (2,7,8,13) and (3,4,9,14).
| 709 paddd xmm10,xmm15 |
| 710 paddd xmm11,xmm12 |
| 711 pxor xmm1,xmm10 |
| 712 pxor xmm2,xmm11 |
| 713 DB 102,15,56,0,207 |
| 714 DB 102,15,56,0,215 |
| 715 paddd xmm4,xmm1 |
| 716 paddd xmm5,xmm2 |
| 717 pxor xmm15,xmm4 |
| 718 pxor xmm12,xmm5 |
| 719 movdqa xmm6,xmm15 |
| 720 pslld xmm15,12 |
| 721 psrld xmm6,20 |
| 722 movdqa xmm7,xmm12 |
| 723 pslld xmm12,12 |
| 724 por xmm15,xmm6 |
| 725 psrld xmm7,20 |
| 726 movdqa xmm6,XMMWORD[r11] |
| 727 por xmm12,xmm7 |
| 728 paddd xmm10,xmm15 |
| 729 paddd xmm11,xmm12 |
| 730 pxor xmm1,xmm10 |
| 731 pxor xmm2,xmm11 |
| 732 DB 102,15,56,0,206 |
| 733 DB 102,15,56,0,214 |
| 734 paddd xmm4,xmm1 |
| 735 paddd xmm5,xmm2 |
| 736 pxor xmm15,xmm4 |
| 737 pxor xmm12,xmm5 |
| 738 movdqa xmm7,xmm15 |
| 739 pslld xmm15,7 |
| 740 psrld xmm7,25 |
| 741 movdqa xmm6,xmm12 |
| 742 pslld xmm12,7 |
| 743 por xmm15,xmm7 |
| 744 psrld xmm6,25 |
| 745 movdqa xmm7,XMMWORD[r10] |
| 746 por xmm12,xmm6 |
| 747 dec eax |
| 748 jnz NEAR $L$oop4x |
| 749 |
; Rounds done. For each group of four words: add the saved input words, then
; transpose the 4x4 dword matrix with punpck{l,h}dq / punpck{l,h}qdq so that
; each register ends up holding 16 consecutive output bytes of ONE block.
; Words 0..3 -> blocks 0..3 in [rsp], [16+rsp], [32+rsp], [48+rsp].
| 750 paddd xmm8,XMMWORD[64+rsp] |
| 751 paddd xmm9,XMMWORD[80+rsp] |
| 752 paddd xmm10,XMMWORD[96+rsp] |
| 753 paddd xmm11,XMMWORD[112+rsp] |
| 754 |
| 755 movdqa xmm6,xmm8 |
| 756 punpckldq xmm8,xmm9 |
| 757 movdqa xmm7,xmm10 |
| 758 punpckldq xmm10,xmm11 |
| 759 punpckhdq xmm6,xmm9 |
| 760 punpckhdq xmm7,xmm11 |
| 761 movdqa xmm9,xmm8 |
| 762 punpcklqdq xmm8,xmm10 |
| 763 movdqa xmm11,xmm6 |
| 764 punpcklqdq xmm6,xmm7 |
| 765 punpckhqdq xmm9,xmm10 |
| 766 punpckhqdq xmm11,xmm7 |
| 767 paddd xmm12,XMMWORD[((128-256))+rcx] |
| 768 paddd xmm13,XMMWORD[((144-256))+rcx] |
| 769 paddd xmm14,XMMWORD[((160-256))+rcx] |
| 770 paddd xmm15,XMMWORD[((176-256))+rcx] |
| 771 |
; Park the first two transposed rows and fetch the spilled words 10,11.
| 772 movdqa XMMWORD[rsp],xmm8 |
| 773 movdqa XMMWORD[16+rsp],xmm9 |
| 774 movdqa xmm8,XMMWORD[32+rsp] |
| 775 movdqa xmm9,XMMWORD[48+rsp] |
| 776 |
; Words 4..7 -> blocks 0..3 in xmm12, xmm13, xmm10, xmm15.
| 777 movdqa xmm10,xmm12 |
| 778 punpckldq xmm12,xmm13 |
| 779 movdqa xmm7,xmm14 |
| 780 punpckldq xmm14,xmm15 |
| 781 punpckhdq xmm10,xmm13 |
| 782 punpckhdq xmm7,xmm15 |
| 783 movdqa xmm13,xmm12 |
| 784 punpcklqdq xmm12,xmm14 |
| 785 movdqa xmm15,xmm10 |
| 786 punpcklqdq xmm10,xmm7 |
| 787 punpckhqdq xmm13,xmm14 |
| 788 punpckhqdq xmm15,xmm7 |
| 789 paddd xmm4,XMMWORD[((192-256))+rcx] |
| 790 paddd xmm5,XMMWORD[((208-256))+rcx] |
| 791 paddd xmm8,XMMWORD[((224-256))+rcx] |
| 792 paddd xmm9,XMMWORD[((240-256))+rcx] |
| 793 |
| 794 movdqa XMMWORD[32+rsp],xmm6 |
| 795 movdqa XMMWORD[48+rsp],xmm11 |
| 796 |
; Words 8..11 -> blocks 0..3 in xmm4, xmm5, xmm14, xmm9.
| 797 movdqa xmm14,xmm4 |
| 798 punpckldq xmm4,xmm5 |
| 799 movdqa xmm7,xmm8 |
| 800 punpckldq xmm8,xmm9 |
| 801 punpckhdq xmm14,xmm5 |
| 802 punpckhdq xmm7,xmm9 |
| 803 movdqa xmm5,xmm4 |
| 804 punpcklqdq xmm4,xmm8 |
| 805 movdqa xmm9,xmm14 |
| 806 punpcklqdq xmm14,xmm7 |
| 807 punpckhqdq xmm5,xmm8 |
| 808 punpckhqdq xmm9,xmm7 |
| 809 paddd xmm0,XMMWORD[((256-256))+rcx] |
| 810 paddd xmm1,XMMWORD[((272-256))+rcx] |
| 811 paddd xmm2,XMMWORD[((288-256))+rcx] |
| 812 paddd xmm3,XMMWORD[((304-256))+rcx] |
| 813 |
; Words 12..15 -> blocks 0..3 in xmm0, xmm1, xmm8, xmm3.
| 814 movdqa xmm8,xmm0 |
| 815 punpckldq xmm0,xmm1 |
| 816 movdqa xmm7,xmm2 |
| 817 punpckldq xmm2,xmm3 |
| 818 punpckhdq xmm8,xmm1 |
| 819 punpckhdq xmm7,xmm3 |
| 820 movdqa xmm1,xmm0 |
| 821 punpcklqdq xmm0,xmm2 |
| 822 movdqa xmm3,xmm8 |
| 823 punpcklqdq xmm8,xmm7 |
| 824 punpckhqdq xmm1,xmm2 |
| 825 punpckhqdq xmm3,xmm7 |
; Fewer than 256 bytes left: handle whole blocks plus remainder in the tail.
| 826 cmp rdx,64*4 |
| 827 jb NEAR $L$tail4x |
| 828 |
; Full 256 bytes: XOR source with blocks 0..3 and store, 64 bytes at a time,
; with loads for the next block interleaved with the stores of the current.
| 829 movdqu xmm6,XMMWORD[rsi] |
| 830 movdqu xmm11,XMMWORD[16+rsi] |
| 831 movdqu xmm2,XMMWORD[32+rsi] |
| 832 movdqu xmm7,XMMWORD[48+rsi] |
| 833 pxor xmm6,XMMWORD[rsp] |
| 834 pxor xmm11,xmm12 |
| 835 pxor xmm2,xmm4 |
| 836 pxor xmm7,xmm0 |
| 837 |
| 838 movdqu XMMWORD[rdi],xmm6 |
| 839 movdqu xmm6,XMMWORD[64+rsi] |
| 840 movdqu XMMWORD[16+rdi],xmm11 |
| 841 movdqu xmm11,XMMWORD[80+rsi] |
| 842 movdqu XMMWORD[32+rdi],xmm2 |
| 843 movdqu xmm2,XMMWORD[96+rsi] |
| 844 movdqu XMMWORD[48+rdi],xmm7 |
| 845 movdqu xmm7,XMMWORD[112+rsi] |
| 846 lea rsi,[128+rsi] |
| 847 pxor xmm6,XMMWORD[16+rsp] |
| 848 pxor xmm11,xmm13 |
| 849 pxor xmm2,xmm5 |
| 850 pxor xmm7,xmm1 |
| 851 |
| 852 movdqu XMMWORD[64+rdi],xmm6 |
| 853 movdqu xmm6,XMMWORD[rsi] |
| 854 movdqu XMMWORD[80+rdi],xmm11 |
| 855 movdqu xmm11,XMMWORD[16+rsi] |
| 856 movdqu XMMWORD[96+rdi],xmm2 |
| 857 movdqu xmm2,XMMWORD[32+rsi] |
| 858 movdqu XMMWORD[112+rdi],xmm7 |
| 859 lea rdi,[128+rdi] |
| 860 movdqu xmm7,XMMWORD[48+rsi] |
| 861 pxor xmm6,XMMWORD[32+rsp] |
| 862 pxor xmm11,xmm10 |
| 863 pxor xmm2,xmm14 |
| 864 pxor xmm7,xmm8 |
| 865 |
| 866 movdqu XMMWORD[rdi],xmm6 |
| 867 movdqu xmm6,XMMWORD[64+rsi] |
| 868 movdqu XMMWORD[16+rdi],xmm11 |
| 869 movdqu xmm11,XMMWORD[80+rsi] |
| 870 movdqu XMMWORD[32+rdi],xmm2 |
| 871 movdqu xmm2,XMMWORD[96+rsi] |
| 872 movdqu XMMWORD[48+rdi],xmm7 |
| 873 movdqu xmm7,XMMWORD[112+rsi] |
| 874 lea rsi,[128+rsi] |
| 875 pxor xmm6,XMMWORD[48+rsp] |
| 876 pxor xmm11,xmm15 |
| 877 pxor xmm2,xmm9 |
| 878 pxor xmm7,xmm3 |
| 879 movdqu XMMWORD[64+rdi],xmm6 |
| 880 movdqu XMMWORD[80+rdi],xmm11 |
| 881 movdqu XMMWORD[96+rdi],xmm2 |
| 882 movdqu XMMWORD[112+rdi],xmm7 |
| 883 lea rdi,[128+rdi] |
| 884 |
| 885 sub rdx,64*4 |
| 886 jnz NEAR $L$oop_outer4x |
| 887 |
| 888 jmp NEAR $L$done4x |
| 889 |
; Tail (1..255 bytes left). Dispatch on how many whole 64-byte blocks remain.
; The flags of the cmp that selects a branch are still live at that branch's
; "je $L$done4x": only movdqu / pxor / lea execute in between, and none of
; them writes flags.
| 890 $L$tail4x: |
| 891 cmp rdx,192 |
| 892 jae NEAR $L$192_or_more4x |
| 893 cmp rdx,128 |
| 894 jae NEAR $L$128_or_more4x |
| 895 cmp rdx,64 |
| 896 jae NEAR $L$64_or_more4x |
| 897 |
| 898 |
; Fewer than 64 bytes: complete block 0 in [0..63]+rsp ([rsp] already holds
; its first 16 bytes) and go to the byte loop with index r10 = 0.
| 899 xor r10,r10 |
| 900 |
| 901 movdqa XMMWORD[16+rsp],xmm12 |
| 902 movdqa XMMWORD[32+rsp],xmm4 |
| 903 movdqa XMMWORD[48+rsp],xmm0 |
| 904 jmp NEAR $L$oop_tail4x |
| 905 |
| 906 ALIGN 32 |
; 64..127 bytes: emit block 0; if exactly 64 we are done, otherwise stage
; block 1 in [0..63]+rsp for the byte loop.
| 907 $L$64_or_more4x: |
| 908 movdqu xmm6,XMMWORD[rsi] |
| 909 movdqu xmm11,XMMWORD[16+rsi] |
| 910 movdqu xmm2,XMMWORD[32+rsi] |
| 911 movdqu xmm7,XMMWORD[48+rsi] |
| 912 pxor xmm6,XMMWORD[rsp] |
| 913 pxor xmm11,xmm12 |
| 914 pxor xmm2,xmm4 |
| 915 pxor xmm7,xmm0 |
| 916 movdqu XMMWORD[rdi],xmm6 |
| 917 movdqu XMMWORD[16+rdi],xmm11 |
| 918 movdqu XMMWORD[32+rdi],xmm2 |
| 919 movdqu XMMWORD[48+rdi],xmm7 |
| 920 je NEAR $L$done4x |
| 921 |
| 922 movdqa xmm6,XMMWORD[16+rsp] |
| 923 lea rsi,[64+rsi] |
| 924 xor r10,r10 |
| 925 movdqa XMMWORD[rsp],xmm6 |
| 926 movdqa XMMWORD[16+rsp],xmm13 |
| 927 lea rdi,[64+rdi] |
| 928 movdqa XMMWORD[32+rsp],xmm5 |
| 929 sub rdx,64 |
| 930 movdqa XMMWORD[48+rsp],xmm1 |
| 931 jmp NEAR $L$oop_tail4x |
| 932 |
| 933 ALIGN 32 |
; 128..191 bytes: emit blocks 0 and 1; if exactly 128 we are done, otherwise
; stage block 2 in [0..63]+rsp for the byte loop.
| 934 $L$128_or_more4x: |
| 935 movdqu xmm6,XMMWORD[rsi] |
| 936 movdqu xmm11,XMMWORD[16+rsi] |
| 937 movdqu xmm2,XMMWORD[32+rsi] |
| 938 movdqu xmm7,XMMWORD[48+rsi] |
| 939 pxor xmm6,XMMWORD[rsp] |
| 940 pxor xmm11,xmm12 |
| 941 pxor xmm2,xmm4 |
| 942 pxor xmm7,xmm0 |
| 943 |
| 944 movdqu XMMWORD[rdi],xmm6 |
| 945 movdqu xmm6,XMMWORD[64+rsi] |
| 946 movdqu XMMWORD[16+rdi],xmm11 |
| 947 movdqu xmm11,XMMWORD[80+rsi] |
| 948 movdqu XMMWORD[32+rdi],xmm2 |
| 949 movdqu xmm2,XMMWORD[96+rsi] |
| 950 movdqu XMMWORD[48+rdi],xmm7 |
| 951 movdqu xmm7,XMMWORD[112+rsi] |
| 952 pxor xmm6,XMMWORD[16+rsp] |
| 953 pxor xmm11,xmm13 |
| 954 pxor xmm2,xmm5 |
| 955 pxor xmm7,xmm1 |
| 956 movdqu XMMWORD[64+rdi],xmm6 |
| 957 movdqu XMMWORD[80+rdi],xmm11 |
| 958 movdqu XMMWORD[96+rdi],xmm2 |
| 959 movdqu XMMWORD[112+rdi],xmm7 |
| 960 je NEAR $L$done4x |
| 961 |
| 962 movdqa xmm6,XMMWORD[32+rsp] |
| 963 lea rsi,[128+rsi] |
| 964 xor r10,r10 |
| 965 movdqa XMMWORD[rsp],xmm6 |
| 966 movdqa XMMWORD[16+rsp],xmm10 |
| 967 lea rdi,[128+rdi] |
| 968 movdqa XMMWORD[32+rsp],xmm14 |
| 969 sub rdx,128 |
| 970 movdqa XMMWORD[48+rsp],xmm8 |
| 971 jmp NEAR $L$oop_tail4x |
| 972 |
| 973 ALIGN 32 |
; 192..255 bytes: emit blocks 0, 1 and 2; if exactly 192 we are done,
; otherwise stage block 3 in [0..63]+rsp and fall into the byte loop.
; rsi/rdi were already advanced by 128 above, hence the +64 below.
| 974 $L$192_or_more4x: |
| 975 movdqu xmm6,XMMWORD[rsi] |
| 976 movdqu xmm11,XMMWORD[16+rsi] |
| 977 movdqu xmm2,XMMWORD[32+rsi] |
| 978 movdqu xmm7,XMMWORD[48+rsi] |
| 979 pxor xmm6,XMMWORD[rsp] |
| 980 pxor xmm11,xmm12 |
| 981 pxor xmm2,xmm4 |
| 982 pxor xmm7,xmm0 |
| 983 |
| 984 movdqu XMMWORD[rdi],xmm6 |
| 985 movdqu xmm6,XMMWORD[64+rsi] |
| 986 movdqu XMMWORD[16+rdi],xmm11 |
| 987 movdqu xmm11,XMMWORD[80+rsi] |
| 988 movdqu XMMWORD[32+rdi],xmm2 |
| 989 movdqu xmm2,XMMWORD[96+rsi] |
| 990 movdqu XMMWORD[48+rdi],xmm7 |
| 991 movdqu xmm7,XMMWORD[112+rsi] |
| 992 lea rsi,[128+rsi] |
| 993 pxor xmm6,XMMWORD[16+rsp] |
| 994 pxor xmm11,xmm13 |
| 995 pxor xmm2,xmm5 |
| 996 pxor xmm7,xmm1 |
| 997 |
| 998 movdqu XMMWORD[64+rdi],xmm6 |
| 999 movdqu xmm6,XMMWORD[rsi] |
| 1000 movdqu XMMWORD[80+rdi],xmm11 |
| 1001 movdqu xmm11,XMMWORD[16+rsi] |
| 1002 movdqu XMMWORD[96+rdi],xmm2 |
| 1003 movdqu xmm2,XMMWORD[32+rsi] |
| 1004 movdqu XMMWORD[112+rdi],xmm7 |
| 1005 lea rdi,[128+rdi] |
| 1006 movdqu xmm7,XMMWORD[48+rsi] |
| 1007 pxor xmm6,XMMWORD[32+rsp] |
| 1008 pxor xmm11,xmm10 |
| 1009 pxor xmm2,xmm14 |
| 1010 pxor xmm7,xmm8 |
| 1011 movdqu XMMWORD[rdi],xmm6 |
| 1012 movdqu XMMWORD[16+rdi],xmm11 |
| 1013 movdqu XMMWORD[32+rdi],xmm2 |
| 1014 movdqu XMMWORD[48+rdi],xmm7 |
| 1015 je NEAR $L$done4x |
| 1016 |
| 1017 movdqa xmm6,XMMWORD[48+rsp] |
| 1018 lea rsi,[64+rsi] |
| 1019 xor r10,r10 |
| 1020 movdqa XMMWORD[rsp],xmm6 |
| 1021 movdqa XMMWORD[16+rsp],xmm15 |
| 1022 lea rdi,[64+rdi] |
| 1023 movdqa XMMWORD[32+rsp],xmm9 |
| 1024 sub rdx,192 |
| 1025 movdqa XMMWORD[48+rsp],xmm3 |
| 1026 |
; Byte loop: dst[i] = src[i] ^ staged_block[i]; r10 = index, rdx = count.
| 1027 $L$oop_tail4x: |
| 1028 movzx eax,BYTE[r10*1+rsi] |
| 1029 movzx ecx,BYTE[r10*1+rsp] |
| 1030 lea r10,[1+r10] |
| 1031 xor eax,ecx |
| 1032 mov BYTE[((-1))+r10*1+rdi],al |
| 1033 dec rdx |
| 1034 jnz NEAR $L$oop_tail4x |
| 1035 |
; Restore xmm6..xmm15 (r11 = rsp+368 equals the save-area base used in the
; prologue, entry rsp - 120), release the frame, restore rdi/rsi and return.
| 1036 $L$done4x: |
| 1037 lea r11,[((320+48))+rsp] |
| 1038 movaps xmm6,XMMWORD[((-48))+r11] |
| 1039 movaps xmm7,XMMWORD[((-32))+r11] |
| 1040 movaps xmm8,XMMWORD[((-16))+r11] |
| 1041 movaps xmm9,XMMWORD[r11] |
| 1042 movaps xmm10,XMMWORD[16+r11] |
| 1043 movaps xmm11,XMMWORD[32+r11] |
| 1044 movaps xmm12,XMMWORD[48+r11] |
| 1045 movaps xmm13,XMMWORD[64+r11] |
| 1046 movaps xmm14,XMMWORD[80+r11] |
| 1047 movaps xmm15,XMMWORD[96+r11] |
| 1048 add rsp,0x148+160 |
| 1049 mov rdi,QWORD[8+rsp] ;WIN64 epilogue |
| 1050 mov rsi,QWORD[16+rsp] |
| 1051 DB 0F3h,0C3h ;repret |
| 1052 $L$SEH_end_ChaCha20_4x: |
| 1053 |
| 1054 ALIGN 32 |
| 1055 ChaCha20_8x: |
| 1056 mov QWORD[8+rsp],rdi ;WIN64 prologue |
| 1057 mov QWORD[16+rsp],rsi |
| 1058 mov rax,rsp |
| 1059 $L$SEH_begin_ChaCha20_8x: |
| 1060 mov rdi,rcx |
| 1061 mov rsi,rdx |
| 1062 mov rdx,r8 |
| 1063 mov rcx,r9 |
| 1064 mov r8,QWORD[40+rsp] |
| 1065 |
| 1066 |
| 1067 $L$ChaCha20_8x: |
| 1068 mov r10,rsp |
| 1069 sub rsp,0x280+176 |
| 1070 and rsp,-32 |
| 1071 lea r11,[((656+48))+rsp] |
| 1072 movaps XMMWORD[(-48)+r11],xmm6 |
| 1073 movaps XMMWORD[(-32)+r11],xmm7 |
| 1074 movaps XMMWORD[(-16)+r11],xmm8 |
| 1075 movaps XMMWORD[r11],xmm9 |
| 1076 movaps XMMWORD[16+r11],xmm10 |
| 1077 movaps XMMWORD[32+r11],xmm11 |
| 1078 movaps XMMWORD[48+r11],xmm12 |
| 1079 movaps XMMWORD[64+r11],xmm13 |
| 1080 movaps XMMWORD[80+r11],xmm14 |
| 1081 movaps XMMWORD[96+r11],xmm15 |
| 1082 vzeroupper |
| 1083 mov QWORD[640+rsp],r10 |
| 1084 |
| 1085 |
| 1086 |
| 1087 |
| 1088 |
| 1089 |
| 1090 |
| 1091 |
| 1092 |
| 1093 |
| 1094 vbroadcasti128 ymm11,XMMWORD[$L$sigma] |
| 1095 vbroadcasti128 ymm3,XMMWORD[rcx] |
| 1096 vbroadcasti128 ymm15,XMMWORD[16+rcx] |
| 1097 vbroadcasti128 ymm7,XMMWORD[r8] |
| 1098 lea rcx,[256+rsp] |
| 1099 lea rax,[512+rsp] |
| 1100 lea r10,[$L$rot16] |
| 1101 lea r11,[$L$rot24] |
| 1102 |
| 1103 vpshufd ymm8,ymm11,0x00 |
| 1104 vpshufd ymm9,ymm11,0x55 |
| 1105 vmovdqa YMMWORD[(128-256)+rcx],ymm8 |
| 1106 vpshufd ymm10,ymm11,0xaa |
| 1107 vmovdqa YMMWORD[(160-256)+rcx],ymm9 |
| 1108 vpshufd ymm11,ymm11,0xff |
| 1109 vmovdqa YMMWORD[(192-256)+rcx],ymm10 |
| 1110 vmovdqa YMMWORD[(224-256)+rcx],ymm11 |
| 1111 |
| 1112 vpshufd ymm0,ymm3,0x00 |
| 1113 vpshufd ymm1,ymm3,0x55 |
| 1114 vmovdqa YMMWORD[(256-256)+rcx],ymm0 |
| 1115 vpshufd ymm2,ymm3,0xaa |
| 1116 vmovdqa YMMWORD[(288-256)+rcx],ymm1 |
| 1117 vpshufd ymm3,ymm3,0xff |
| 1118 vmovdqa YMMWORD[(320-256)+rcx],ymm2 |
| 1119 vmovdqa YMMWORD[(352-256)+rcx],ymm3 |
| 1120 |
| 1121 vpshufd ymm12,ymm15,0x00 |
| 1122 vpshufd ymm13,ymm15,0x55 |
| 1123 vmovdqa YMMWORD[(384-512)+rax],ymm12 |
| 1124 vpshufd ymm14,ymm15,0xaa |
| 1125 vmovdqa YMMWORD[(416-512)+rax],ymm13 |
| 1126 vpshufd ymm15,ymm15,0xff |
| 1127 vmovdqa YMMWORD[(448-512)+rax],ymm14 |
| 1128 vmovdqa YMMWORD[(480-512)+rax],ymm15 |
| 1129 |
| 1130 vpshufd ymm4,ymm7,0x00 |
| 1131 vpshufd ymm5,ymm7,0x55 |
| 1132 vpaddd ymm4,ymm4,YMMWORD[$L$incy] |
| 1133 vpshufd ymm6,ymm7,0xaa |
| 1134 vmovdqa YMMWORD[(544-512)+rax],ymm5 |
| 1135 vpshufd ymm7,ymm7,0xff |
| 1136 vmovdqa YMMWORD[(576-512)+rax],ymm6 |
| 1137 vmovdqa YMMWORD[(608-512)+rax],ymm7 |
| 1138 |
| 1139 jmp NEAR $L$oop_enter8x |
| 1140 |
| 1141 ALIGN 32 |
| 1142 $L$oop_outer8x: |
| 1143 vmovdqa ymm8,YMMWORD[((128-256))+rcx] |
| 1144 vmovdqa ymm9,YMMWORD[((160-256))+rcx] |
| 1145 vmovdqa ymm10,YMMWORD[((192-256))+rcx] |
| 1146 vmovdqa ymm11,YMMWORD[((224-256))+rcx] |
| 1147 vmovdqa ymm0,YMMWORD[((256-256))+rcx] |
| 1148 vmovdqa ymm1,YMMWORD[((288-256))+rcx] |
| 1149 vmovdqa ymm2,YMMWORD[((320-256))+rcx] |
| 1150 vmovdqa ymm3,YMMWORD[((352-256))+rcx] |
| 1151 vmovdqa ymm12,YMMWORD[((384-512))+rax] |
| 1152 vmovdqa ymm13,YMMWORD[((416-512))+rax] |
| 1153 vmovdqa ymm14,YMMWORD[((448-512))+rax] |
| 1154 vmovdqa ymm15,YMMWORD[((480-512))+rax] |
| 1155 vmovdqa ymm4,YMMWORD[((512-512))+rax] |
| 1156 vmovdqa ymm5,YMMWORD[((544-512))+rax] |
| 1157 vmovdqa ymm6,YMMWORD[((576-512))+rax] |
| 1158 vmovdqa ymm7,YMMWORD[((608-512))+rax] |
| 1159 vpaddd ymm4,ymm4,YMMWORD[$L$eight] |
| 1160 |
| 1161 $L$oop_enter8x: |
| 1162 vmovdqa YMMWORD[64+rsp],ymm14 |
| 1163 vmovdqa YMMWORD[96+rsp],ymm15 |
| 1164 vbroadcasti128 ymm15,XMMWORD[r10] |
| 1165 vmovdqa YMMWORD[(512-512)+rax],ymm4 |
| 1166 mov eax,10 |
| 1167 jmp NEAR $L$oop8x |
| 1168 |
| 1169 ALIGN 32 |
; Round loop: eax counts 10 iterations, each doing two rounds (a straight
; "column" pairing followed by a rotated "diagonal" pairing), on 32-bit
; lanes of the ymm registers.
;
; Register roles inside the loop:
;   ymm8-ymm11  : row "a"      ymm0-ymm3 : row "b"
;   ymm4-ymm7   : row "d"
;   ymm12/ymm13 : the two row-"c" vectors currently in use; the other two
;                 live on the stack ([rsp],[32+rsp] or [64+rsp],[96+rsp])
;                 and are swapped in halfway through each round.
;   ymm15       : vpshufb mask from [r10] (first rotate of a quarter-round)
;   ymm14       : vpshufb mask from [r11] (third rotate of a quarter-round)
;                 r10/r11 are set before this excerpt; presumably they point
;                 at $L$rot16 and $L$rot24 - confirm.
;   ymm14/ymm15 double as temporaries for the shift-based rotates, which is
;   why the masks are re-broadcast from memory after each such use.
;
; Each quarter-round is: a+=b; d^=a; d=shuffle(d,[r10]);
;                        c+=d; b^=c; b=rotl(b,12)   (vpslld 12 | vpsrld 20)
;                        a+=b; d^=a; d=shuffle(d,[r11]);
;                        c+=d; b^=c; b=rotl(b,7)    (vpslld 7 | vpsrld 25)
; Two quarter-rounds are interleaved instruction-by-instruction.
| 1170 $L$oop8x: |
; --- first round, quarter-rounds (ymm8,ymm0,ymm12,ymm4) and (ymm9,ymm1,ymm13,ymm5)
| 1171 vpaddd ymm8,ymm8,ymm0 |
| 1172 vpxor ymm4,ymm8,ymm4 |
| 1173 vpshufb ymm4,ymm4,ymm15 |
| 1174 vpaddd ymm9,ymm9,ymm1 |
| 1175 vpxor ymm5,ymm9,ymm5 |
| 1176 vpshufb ymm5,ymm5,ymm15 |
| 1177 vpaddd ymm12,ymm12,ymm4 |
| 1178 vpxor ymm0,ymm12,ymm0 |
| 1179 vpslld ymm14,ymm0,12 |
| 1180 vpsrld ymm0,ymm0,20 |
| 1181 vpor ymm0,ymm14,ymm0 |
| 1182 vbroadcasti128 ymm14,XMMWORD[r11] |
| 1183 vpaddd ymm13,ymm13,ymm5 |
| 1184 vpxor ymm1,ymm13,ymm1 |
| 1185 vpslld ymm15,ymm1,12 |
| 1186 vpsrld ymm1,ymm1,20 |
| 1187 vpor ymm1,ymm15,ymm1 |
| 1188 vpaddd ymm8,ymm8,ymm0 |
| 1189 vpxor ymm4,ymm8,ymm4 |
| 1190 vpshufb ymm4,ymm4,ymm14 |
| 1191 vpaddd ymm9,ymm9,ymm1 |
| 1192 vpxor ymm5,ymm9,ymm5 |
| 1193 vpshufb ymm5,ymm5,ymm14 |
| 1194 vpaddd ymm12,ymm12,ymm4 |
| 1195 vpxor ymm0,ymm12,ymm0 |
| 1196 vpslld ymm15,ymm0,7 |
| 1197 vpsrld ymm0,ymm0,25 |
| 1198 vpor ymm0,ymm15,ymm0 |
| 1199 vbroadcasti128 ymm15,XMMWORD[r10] |
| 1200 vpaddd ymm13,ymm13,ymm5 |
| 1201 vpxor ymm1,ymm13,ymm1 |
| 1202 vpslld ymm14,ymm1,7 |
| 1203 vpsrld ymm1,ymm1,25 |
| 1204 vpor ymm1,ymm14,ymm1 |
; swap the row-"c" pair: park the two just updated at [rsp]/[32+rsp] and
; bring in the two that were spilled at [64+rsp]/[96+rsp]
| 1205 vmovdqa YMMWORD[rsp],ymm12 |
| 1206 vmovdqa YMMWORD[32+rsp],ymm13 |
| 1207 vmovdqa ymm12,YMMWORD[64+rsp] |
| 1208 vmovdqa ymm13,YMMWORD[96+rsp] |
; --- first round, quarter-rounds (ymm10,ymm2,ymm12,ymm6) and (ymm11,ymm3,ymm13,ymm7)
| 1209 vpaddd ymm10,ymm10,ymm2 |
| 1210 vpxor ymm6,ymm10,ymm6 |
| 1211 vpshufb ymm6,ymm6,ymm15 |
| 1212 vpaddd ymm11,ymm11,ymm3 |
| 1213 vpxor ymm7,ymm11,ymm7 |
| 1214 vpshufb ymm7,ymm7,ymm15 |
| 1215 vpaddd ymm12,ymm12,ymm6 |
| 1216 vpxor ymm2,ymm12,ymm2 |
| 1217 vpslld ymm14,ymm2,12 |
| 1218 vpsrld ymm2,ymm2,20 |
| 1219 vpor ymm2,ymm14,ymm2 |
| 1220 vbroadcasti128 ymm14,XMMWORD[r11] |
| 1221 vpaddd ymm13,ymm13,ymm7 |
| 1222 vpxor ymm3,ymm13,ymm3 |
| 1223 vpslld ymm15,ymm3,12 |
| 1224 vpsrld ymm3,ymm3,20 |
| 1225 vpor ymm3,ymm15,ymm3 |
| 1226 vpaddd ymm10,ymm10,ymm2 |
| 1227 vpxor ymm6,ymm10,ymm6 |
| 1228 vpshufb ymm6,ymm6,ymm14 |
| 1229 vpaddd ymm11,ymm11,ymm3 |
| 1230 vpxor ymm7,ymm11,ymm7 |
| 1231 vpshufb ymm7,ymm7,ymm14 |
| 1232 vpaddd ymm12,ymm12,ymm6 |
| 1233 vpxor ymm2,ymm12,ymm2 |
| 1234 vpslld ymm15,ymm2,7 |
| 1235 vpsrld ymm2,ymm2,25 |
| 1236 vpor ymm2,ymm15,ymm2 |
| 1237 vbroadcasti128 ymm15,XMMWORD[r10] |
| 1238 vpaddd ymm13,ymm13,ymm7 |
| 1239 vpxor ymm3,ymm13,ymm3 |
| 1240 vpslld ymm14,ymm3,7 |
| 1241 vpsrld ymm3,ymm3,25 |
| 1242 vpor ymm3,ymm14,ymm3 |
; --- second round (rotated pairing), quarter-rounds
;     (ymm8,ymm1,ymm12,ymm7) and (ymm9,ymm2,ymm13,ymm4);
;     ymm12/ymm13 still hold the row-"c" pair loaded from [64+rsp]/[96+rsp]
| 1243 vpaddd ymm8,ymm8,ymm1 |
| 1244 vpxor ymm7,ymm8,ymm7 |
| 1245 vpshufb ymm7,ymm7,ymm15 |
| 1246 vpaddd ymm9,ymm9,ymm2 |
| 1247 vpxor ymm4,ymm9,ymm4 |
| 1248 vpshufb ymm4,ymm4,ymm15 |
| 1249 vpaddd ymm12,ymm12,ymm7 |
| 1250 vpxor ymm1,ymm12,ymm1 |
| 1251 vpslld ymm14,ymm1,12 |
| 1252 vpsrld ymm1,ymm1,20 |
| 1253 vpor ymm1,ymm14,ymm1 |
| 1254 vbroadcasti128 ymm14,XMMWORD[r11] |
| 1255 vpaddd ymm13,ymm13,ymm4 |
| 1256 vpxor ymm2,ymm13,ymm2 |
| 1257 vpslld ymm15,ymm2,12 |
| 1258 vpsrld ymm2,ymm2,20 |
| 1259 vpor ymm2,ymm15,ymm2 |
| 1260 vpaddd ymm8,ymm8,ymm1 |
| 1261 vpxor ymm7,ymm8,ymm7 |
| 1262 vpshufb ymm7,ymm7,ymm14 |
| 1263 vpaddd ymm9,ymm9,ymm2 |
| 1264 vpxor ymm4,ymm9,ymm4 |
| 1265 vpshufb ymm4,ymm4,ymm14 |
| 1266 vpaddd ymm12,ymm12,ymm7 |
| 1267 vpxor ymm1,ymm12,ymm1 |
| 1268 vpslld ymm15,ymm1,7 |
| 1269 vpsrld ymm1,ymm1,25 |
| 1270 vpor ymm1,ymm15,ymm1 |
| 1271 vbroadcasti128 ymm15,XMMWORD[r10] |
| 1272 vpaddd ymm13,ymm13,ymm4 |
| 1273 vpxor ymm2,ymm13,ymm2 |
| 1274 vpslld ymm14,ymm2,7 |
| 1275 vpsrld ymm2,ymm2,25 |
| 1276 vpor ymm2,ymm14,ymm2 |
; swap the row-"c" pair back: this pair returns to [64+rsp]/[96+rsp] and the
; pair parked at [rsp]/[32+rsp] is reloaded
| 1277 vmovdqa YMMWORD[64+rsp],ymm12 |
| 1278 vmovdqa YMMWORD[96+rsp],ymm13 |
| 1279 vmovdqa ymm12,YMMWORD[rsp] |
| 1280 vmovdqa ymm13,YMMWORD[32+rsp] |
; --- second round, quarter-rounds (ymm10,ymm3,ymm12,ymm5) and (ymm11,ymm0,ymm13,ymm6)
| 1281 vpaddd ymm10,ymm10,ymm3 |
| 1282 vpxor ymm5,ymm10,ymm5 |
| 1283 vpshufb ymm5,ymm5,ymm15 |
| 1284 vpaddd ymm11,ymm11,ymm0 |
| 1285 vpxor ymm6,ymm11,ymm6 |
| 1286 vpshufb ymm6,ymm6,ymm15 |
| 1287 vpaddd ymm12,ymm12,ymm5 |
| 1288 vpxor ymm3,ymm12,ymm3 |
| 1289 vpslld ymm14,ymm3,12 |
| 1290 vpsrld ymm3,ymm3,20 |
| 1291 vpor ymm3,ymm14,ymm3 |
| 1292 vbroadcasti128 ymm14,XMMWORD[r11] |
| 1293 vpaddd ymm13,ymm13,ymm6 |
| 1294 vpxor ymm0,ymm13,ymm0 |
| 1295 vpslld ymm15,ymm0,12 |
| 1296 vpsrld ymm0,ymm0,20 |
| 1297 vpor ymm0,ymm15,ymm0 |
| 1298 vpaddd ymm10,ymm10,ymm3 |
| 1299 vpxor ymm5,ymm10,ymm5 |
| 1300 vpshufb ymm5,ymm5,ymm14 |
| 1301 vpaddd ymm11,ymm11,ymm0 |
| 1302 vpxor ymm6,ymm11,ymm6 |
| 1303 vpshufb ymm6,ymm6,ymm14 |
| 1304 vpaddd ymm12,ymm12,ymm5 |
| 1305 vpxor ymm3,ymm12,ymm3 |
| 1306 vpslld ymm15,ymm3,7 |
| 1307 vpsrld ymm3,ymm3,25 |
| 1308 vpor ymm3,ymm15,ymm3 |
| 1309 vbroadcasti128 ymm15,XMMWORD[r10] |
| 1310 vpaddd ymm13,ymm13,ymm6 |
| 1311 vpxor ymm0,ymm13,ymm0 |
| 1312 vpslld ymm14,ymm0,7 |
| 1313 vpsrld ymm0,ymm0,25 |
| 1314 vpor ymm0,ymm14,ymm0 |
; on exit from an iteration ymm15 again holds the [r10] mask and
; ymm12/ymm13 hold the row-"c" pair that was in registers on entry
| 1315 dec eax |
| 1316 jnz NEAR $L$oop8x |
| 1317 |
; After the rounds: add the saved input state back, transpose, and, when at
; least 512 bytes remain, XOR 512 bytes from rsi into rdi.
;
; eax was the round counter, so rax is first rebuilt as rsp+512 to serve as
; the base for the (N-512)+rax state slots again.
| 1318 lea rax,[512+rsp] |
; row "a" += saved state (same slots $L$oop_outer8x reloads from)
| 1319 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] |
| 1320 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] |
| 1321 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] |
| 1322 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] |
| 1323 |
; 4x4 dword transpose of row "a" within each 128-bit lane
| 1324 vpunpckldq ymm14,ymm8,ymm9 |
| 1325 vpunpckldq ymm15,ymm10,ymm11 |
| 1326 vpunpckhdq ymm8,ymm8,ymm9 |
| 1327 vpunpckhdq ymm10,ymm10,ymm11 |
| 1328 vpunpcklqdq ymm9,ymm14,ymm15 |
| 1329 vpunpckhqdq ymm14,ymm14,ymm15 |
| 1330 vpunpcklqdq ymm11,ymm8,ymm10 |
| 1331 vpunpckhqdq ymm8,ymm8,ymm10 |
; row "b" += saved state, then the same in-lane transpose
| 1332 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] |
| 1333 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] |
| 1334 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] |
| 1335 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] |
| 1336 |
| 1337 vpunpckldq ymm10,ymm0,ymm1 |
| 1338 vpunpckldq ymm15,ymm2,ymm3 |
| 1339 vpunpckhdq ymm0,ymm0,ymm1 |
| 1340 vpunpckhdq ymm2,ymm2,ymm3 |
| 1341 vpunpcklqdq ymm1,ymm10,ymm15 |
| 1342 vpunpckhqdq ymm10,ymm10,ymm15 |
| 1343 vpunpcklqdq ymm3,ymm0,ymm2 |
| 1344 vpunpckhqdq ymm0,ymm0,ymm2 |
; combine rows "a" and "b" across the two 128-bit lanes
; (0x20 = low halves of both sources, 0x31 = high halves)
| 1345 vperm2i128 ymm15,ymm9,ymm1,0x20 |
| 1346 vperm2i128 ymm1,ymm9,ymm1,0x31 |
| 1347 vperm2i128 ymm9,ymm14,ymm10,0x20 |
| 1348 vperm2i128 ymm10,ymm14,ymm10,0x31 |
| 1349 vperm2i128 ymm14,ymm11,ymm3,0x20 |
| 1350 vperm2i128 ymm3,ymm11,ymm3,0x31 |
| 1351 vperm2i128 ymm11,ymm8,ymm0,0x20 |
| 1352 vperm2i128 ymm0,ymm8,ymm0,0x31 |
; out of registers: park two finished vectors at [rsp]/[32+rsp] and fetch
; the row-"c" pair the round loop left at [64+rsp]/[96+rsp]
| 1353 vmovdqa YMMWORD[rsp],ymm15 |
| 1354 vmovdqa YMMWORD[32+rsp],ymm9 |
| 1355 vmovdqa ymm15,YMMWORD[64+rsp] |
| 1356 vmovdqa ymm9,YMMWORD[96+rsp] |
| 1357 |
; row "c" += saved state, then in-lane transpose
| 1358 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] |
| 1359 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] |
| 1360 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] |
| 1361 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] |
| 1362 |
| 1363 vpunpckldq ymm2,ymm12,ymm13 |
| 1364 vpunpckldq ymm8,ymm15,ymm9 |
| 1365 vpunpckhdq ymm12,ymm12,ymm13 |
| 1366 vpunpckhdq ymm15,ymm15,ymm9 |
| 1367 vpunpcklqdq ymm13,ymm2,ymm8 |
| 1368 vpunpckhqdq ymm2,ymm2,ymm8 |
| 1369 vpunpcklqdq ymm9,ymm12,ymm15 |
| 1370 vpunpckhqdq ymm12,ymm12,ymm15 |
; row "d" += saved state (slot (512-512)+rax holds the ymm4 value written by
; $L$oop_enter8x for this pass), then in-lane transpose
| 1371 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] |
| 1372 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] |
| 1373 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] |
| 1374 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] |
| 1375 |
| 1376 vpunpckldq ymm15,ymm4,ymm5 |
| 1377 vpunpckldq ymm8,ymm6,ymm7 |
| 1378 vpunpckhdq ymm4,ymm4,ymm5 |
| 1379 vpunpckhdq ymm6,ymm6,ymm7 |
| 1380 vpunpcklqdq ymm5,ymm15,ymm8 |
| 1381 vpunpckhqdq ymm15,ymm15,ymm8 |
| 1382 vpunpcklqdq ymm7,ymm4,ymm6 |
| 1383 vpunpckhqdq ymm4,ymm4,ymm6 |
; combine rows "c" and "d" across lanes, then reload the two parked vectors
| 1384 vperm2i128 ymm8,ymm13,ymm5,0x20 |
| 1385 vperm2i128 ymm5,ymm13,ymm5,0x31 |
| 1386 vperm2i128 ymm13,ymm2,ymm15,0x20 |
| 1387 vperm2i128 ymm15,ymm2,ymm15,0x31 |
| 1388 vperm2i128 ymm2,ymm9,ymm7,0x20 |
| 1389 vperm2i128 ymm7,ymm9,ymm7,0x31 |
| 1390 vperm2i128 ymm9,ymm12,ymm4,0x20 |
| 1391 vperm2i128 ymm4,ymm12,ymm4,0x31 |
| 1392 vmovdqa ymm6,YMMWORD[rsp] |
| 1393 vmovdqa ymm12,YMMWORD[32+rsp] |
| 1394 |
; The 512 output bytes now sit, in order, in:
;   ymm6,ymm8,ymm1,ymm5, ymm12,ymm13,ymm10,ymm15,
;   ymm14,ymm2,ymm3,ymm7, ymm11,ymm9,ymm0,ymm4
; Fewer than 512 bytes left (unsigned compare) -> partial handling.
| 1395 cmp rdx,64*8 |
| 1396 jb NEAR $L$tail8x |
| 1397 |
; full 512 bytes: XOR with input at rsi, store to rdi (unaligned stores),
; advancing both pointers 128 bytes at a time
| 1398 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1399 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1400 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1401 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1402 lea rsi,[128+rsi] |
| 1403 vmovdqu YMMWORD[rdi],ymm6 |
| 1404 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1405 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1406 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1407 lea rdi,[128+rdi] |
| 1408 |
| 1409 vpxor ymm12,ymm12,YMMWORD[rsi] |
| 1410 vpxor ymm13,ymm13,YMMWORD[32+rsi] |
| 1411 vpxor ymm10,ymm10,YMMWORD[64+rsi] |
| 1412 vpxor ymm15,ymm15,YMMWORD[96+rsi] |
| 1413 lea rsi,[128+rsi] |
| 1414 vmovdqu YMMWORD[rdi],ymm12 |
| 1415 vmovdqu YMMWORD[32+rdi],ymm13 |
| 1416 vmovdqu YMMWORD[64+rdi],ymm10 |
| 1417 vmovdqu YMMWORD[96+rdi],ymm15 |
| 1418 lea rdi,[128+rdi] |
| 1419 |
| 1420 vpxor ymm14,ymm14,YMMWORD[rsi] |
| 1421 vpxor ymm2,ymm2,YMMWORD[32+rsi] |
| 1422 vpxor ymm3,ymm3,YMMWORD[64+rsi] |
| 1423 vpxor ymm7,ymm7,YMMWORD[96+rsi] |
| 1424 lea rsi,[128+rsi] |
| 1425 vmovdqu YMMWORD[rdi],ymm14 |
| 1426 vmovdqu YMMWORD[32+rdi],ymm2 |
| 1427 vmovdqu YMMWORD[64+rdi],ymm3 |
| 1428 vmovdqu YMMWORD[96+rdi],ymm7 |
| 1429 lea rdi,[128+rdi] |
| 1430 |
| 1431 vpxor ymm11,ymm11,YMMWORD[rsi] |
| 1432 vpxor ymm9,ymm9,YMMWORD[32+rsi] |
| 1433 vpxor ymm0,ymm0,YMMWORD[64+rsi] |
| 1434 vpxor ymm4,ymm4,YMMWORD[96+rsi] |
| 1435 lea rsi,[128+rsi] |
| 1436 vmovdqu YMMWORD[rdi],ymm11 |
| 1437 vmovdqu YMMWORD[32+rdi],ymm9 |
| 1438 vmovdqu YMMWORD[64+rdi],ymm0 |
| 1439 vmovdqu YMMWORD[96+rdi],ymm4 |
| 1440 lea rdi,[128+rdi] |
| 1441 |
; rdx = bytes remaining; loop for another 8 blocks unless exactly done
| 1442 sub rdx,64*8 |
| 1443 jnz NEAR $L$oop_outer8x |
| 1444 |
| 1445 jmp NEAR $L$done8x |
| 1446 |
; Partial final batch: rdx (bytes remaining) is below 512 here.
; Thresholds are tested from largest to smallest with unsigned compares, so
; control lands in the handler for the largest multiple of 64 that is
; <= rdx. Each handler relies on the flags of the cmp that selected it.
| 1447 $L$tail8x: |
| 1448 cmp rdx,448 |
| 1449 jae NEAR $L$448_or_more8x |
| 1450 cmp rdx,384 |
| 1451 jae NEAR $L$384_or_more8x |
| 1452 cmp rdx,320 |
| 1453 jae NEAR $L$320_or_more8x |
| 1454 cmp rdx,256 |
| 1455 jae NEAR $L$256_or_more8x |
| 1456 cmp rdx,192 |
| 1457 jae NEAR $L$192_or_more8x |
| 1458 cmp rdx,128 |
| 1459 jae NEAR $L$128_or_more8x |
| 1460 cmp rdx,64 |
| 1461 jae NEAR $L$64_or_more8x |
| 1462 |
; fewer than 64 bytes: put the first 64 output bytes (ymm6, ymm8) at
; [rsp]..[rsp+63] and let the byte loop consume them, index r10 = 0
| 1463 xor r10,r10 |
| 1464 vmovdqa YMMWORD[rsp],ymm6 |
| 1465 vmovdqa YMMWORD[32+rsp],ymm8 |
| 1466 jmp NEAR $L$oop_tail8x |
| 1467 |
| 1468 ALIGN 32 |
; Tail handler for 64 <= rdx < 128.
; XORs the first 64 bytes of input at rsi with ymm6/ymm8 and stores them to
; rdi. The je below tests the flags left by `cmp rdx,64` in $L$tail8x (the
; AVX instructions in between do not write RFLAGS): exactly 64 bytes -> done.
| 1469 $L$64_or_more8x: |
| 1470 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1471 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1472 vmovdqu YMMWORD[rdi],ymm6 |
| 1473 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1474 je NEAR $L$done8x |
| 1475 |
; otherwise skip the 64 bytes just written, stage the next 64 output bytes
; (ymm1, ymm5) at [rsp] and finish byte-by-byte with index r10 = 0
| 1476 lea rsi,[64+rsi] |
| 1477 xor r10,r10 |
| 1478 vmovdqa YMMWORD[rsp],ymm1 |
| 1479 lea rdi,[64+rdi] |
| 1480 sub rdx,64 |
| 1481 vmovdqa YMMWORD[32+rsp],ymm5 |
| 1482 jmp NEAR $L$oop_tail8x |
| 1483 |
| 1484 ALIGN 32 |
; Tail handler for 128 <= rdx < 192.
; XORs 128 input bytes with ymm6,ymm8,ymm1,ymm5 and stores them to rdi.
; The je below uses the flags from `cmp rdx,128` in $L$tail8x (untouched by
; the AVX instructions): exactly 128 bytes -> done.
| 1485 $L$128_or_more8x: |
| 1486 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1487 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1488 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1489 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1490 vmovdqu YMMWORD[rdi],ymm6 |
| 1491 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1492 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1493 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1494 je NEAR $L$done8x |
| 1495 |
; otherwise advance past 128 bytes, stage the next 64 output bytes
; (ymm12, ymm13) at [rsp] and finish byte-by-byte with index r10 = 0
| 1496 lea rsi,[128+rsi] |
| 1497 xor r10,r10 |
| 1498 vmovdqa YMMWORD[rsp],ymm12 |
| 1499 lea rdi,[128+rdi] |
| 1500 sub rdx,128 |
| 1501 vmovdqa YMMWORD[32+rsp],ymm13 |
| 1502 jmp NEAR $L$oop_tail8x |
| 1503 |
| 1504 ALIGN 32 |
; Tail handler for 192 <= rdx < 256.
; XORs 192 input bytes with ymm6,ymm8,ymm1,ymm5,ymm12,ymm13 and stores them
; to rdi. The je below uses the flags from `cmp rdx,192` in $L$tail8x
; (untouched by the AVX instructions): exactly 192 bytes -> done.
| 1505 $L$192_or_more8x: |
| 1506 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1507 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1508 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1509 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1510 vpxor ymm12,ymm12,YMMWORD[128+rsi] |
| 1511 vpxor ymm13,ymm13,YMMWORD[160+rsi] |
| 1512 vmovdqu YMMWORD[rdi],ymm6 |
| 1513 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1514 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1515 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1516 vmovdqu YMMWORD[128+rdi],ymm12 |
| 1517 vmovdqu YMMWORD[160+rdi],ymm13 |
| 1518 je NEAR $L$done8x |
| 1519 |
; otherwise advance past 192 bytes, stage the next 64 output bytes
; (ymm10, ymm15) at [rsp] and finish byte-by-byte with index r10 = 0
| 1520 lea rsi,[192+rsi] |
| 1521 xor r10,r10 |
| 1522 vmovdqa YMMWORD[rsp],ymm10 |
| 1523 lea rdi,[192+rdi] |
| 1524 sub rdx,192 |
| 1525 vmovdqa YMMWORD[32+rsp],ymm15 |
| 1526 jmp NEAR $L$oop_tail8x |
| 1527 |
| 1528 ALIGN 32 |
; Tail handler for 256 <= rdx < 320.
; XORs 256 input bytes with ymm6,ymm8,ymm1,ymm5,ymm12,ymm13,ymm10,ymm15 and
; stores them to rdi. The je below uses the flags from `cmp rdx,256` in
; $L$tail8x (untouched by the AVX instructions): exactly 256 bytes -> done.
| 1529 $L$256_or_more8x: |
| 1530 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1531 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1532 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1533 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1534 vpxor ymm12,ymm12,YMMWORD[128+rsi] |
| 1535 vpxor ymm13,ymm13,YMMWORD[160+rsi] |
| 1536 vpxor ymm10,ymm10,YMMWORD[192+rsi] |
| 1537 vpxor ymm15,ymm15,YMMWORD[224+rsi] |
| 1538 vmovdqu YMMWORD[rdi],ymm6 |
| 1539 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1540 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1541 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1542 vmovdqu YMMWORD[128+rdi],ymm12 |
| 1543 vmovdqu YMMWORD[160+rdi],ymm13 |
| 1544 vmovdqu YMMWORD[192+rdi],ymm10 |
| 1545 vmovdqu YMMWORD[224+rdi],ymm15 |
| 1546 je NEAR $L$done8x |
| 1547 |
; otherwise advance past 256 bytes, stage the next 64 output bytes
; (ymm14, ymm2) at [rsp] and finish byte-by-byte with index r10 = 0
| 1548 lea rsi,[256+rsi] |
| 1549 xor r10,r10 |
| 1550 vmovdqa YMMWORD[rsp],ymm14 |
| 1551 lea rdi,[256+rdi] |
| 1552 sub rdx,256 |
| 1553 vmovdqa YMMWORD[32+rsp],ymm2 |
| 1554 jmp NEAR $L$oop_tail8x |
| 1555 |
| 1556 ALIGN 32 |
; Tail handler for 320 <= rdx < 384.
; XORs 320 input bytes with ymm6,ymm8,ymm1,ymm5,ymm12,ymm13,ymm10,ymm15,
; ymm14,ymm2 and stores them to rdi. The je below uses the flags from
; `cmp rdx,320` in $L$tail8x (untouched by the AVX instructions):
; exactly 320 bytes -> done.
| 1557 $L$320_or_more8x: |
| 1558 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1559 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1560 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1561 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1562 vpxor ymm12,ymm12,YMMWORD[128+rsi] |
| 1563 vpxor ymm13,ymm13,YMMWORD[160+rsi] |
| 1564 vpxor ymm10,ymm10,YMMWORD[192+rsi] |
| 1565 vpxor ymm15,ymm15,YMMWORD[224+rsi] |
| 1566 vpxor ymm14,ymm14,YMMWORD[256+rsi] |
| 1567 vpxor ymm2,ymm2,YMMWORD[288+rsi] |
| 1568 vmovdqu YMMWORD[rdi],ymm6 |
| 1569 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1570 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1571 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1572 vmovdqu YMMWORD[128+rdi],ymm12 |
| 1573 vmovdqu YMMWORD[160+rdi],ymm13 |
| 1574 vmovdqu YMMWORD[192+rdi],ymm10 |
| 1575 vmovdqu YMMWORD[224+rdi],ymm15 |
| 1576 vmovdqu YMMWORD[256+rdi],ymm14 |
| 1577 vmovdqu YMMWORD[288+rdi],ymm2 |
| 1578 je NEAR $L$done8x |
| 1579 |
; otherwise advance past 320 bytes, stage the next 64 output bytes
; (ymm3, ymm7) at [rsp] and finish byte-by-byte with index r10 = 0
| 1580 lea rsi,[320+rsi] |
| 1581 xor r10,r10 |
| 1582 vmovdqa YMMWORD[rsp],ymm3 |
| 1583 lea rdi,[320+rdi] |
| 1584 sub rdx,320 |
| 1585 vmovdqa YMMWORD[32+rsp],ymm7 |
| 1586 jmp NEAR $L$oop_tail8x |
| 1587 |
| 1588 ALIGN 32 |
; Tail handler for 384 <= rdx < 448.
; XORs 384 input bytes with ymm6,ymm8,ymm1,ymm5,ymm12,ymm13,ymm10,ymm15,
; ymm14,ymm2,ymm3,ymm7 and stores them to rdi. The je below uses the flags
; from `cmp rdx,384` in $L$tail8x (untouched by the AVX instructions):
; exactly 384 bytes -> done.
| 1589 $L$384_or_more8x: |
| 1590 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1591 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1592 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1593 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1594 vpxor ymm12,ymm12,YMMWORD[128+rsi] |
| 1595 vpxor ymm13,ymm13,YMMWORD[160+rsi] |
| 1596 vpxor ymm10,ymm10,YMMWORD[192+rsi] |
| 1597 vpxor ymm15,ymm15,YMMWORD[224+rsi] |
| 1598 vpxor ymm14,ymm14,YMMWORD[256+rsi] |
| 1599 vpxor ymm2,ymm2,YMMWORD[288+rsi] |
| 1600 vpxor ymm3,ymm3,YMMWORD[320+rsi] |
| 1601 vpxor ymm7,ymm7,YMMWORD[352+rsi] |
| 1602 vmovdqu YMMWORD[rdi],ymm6 |
| 1603 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1604 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1605 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1606 vmovdqu YMMWORD[128+rdi],ymm12 |
| 1607 vmovdqu YMMWORD[160+rdi],ymm13 |
| 1608 vmovdqu YMMWORD[192+rdi],ymm10 |
| 1609 vmovdqu YMMWORD[224+rdi],ymm15 |
| 1610 vmovdqu YMMWORD[256+rdi],ymm14 |
| 1611 vmovdqu YMMWORD[288+rdi],ymm2 |
| 1612 vmovdqu YMMWORD[320+rdi],ymm3 |
| 1613 vmovdqu YMMWORD[352+rdi],ymm7 |
| 1614 je NEAR $L$done8x |
| 1615 |
; otherwise advance past 384 bytes, stage the next 64 output bytes
; (ymm11, ymm9) at [rsp] and finish byte-by-byte with index r10 = 0
| 1616 lea rsi,[384+rsi] |
| 1617 xor r10,r10 |
| 1618 vmovdqa YMMWORD[rsp],ymm11 |
| 1619 lea rdi,[384+rdi] |
| 1620 sub rdx,384 |
| 1621 vmovdqa YMMWORD[32+rsp],ymm9 |
| 1622 jmp NEAR $L$oop_tail8x |
| 1623 |
| 1624 ALIGN 32 |
; Tail handler for 448 <= rdx < 512.
; XORs 448 input bytes with ymm6,ymm8,ymm1,ymm5,ymm12,ymm13,ymm10,ymm15,
; ymm14,ymm2,ymm3,ymm7,ymm11,ymm9 and stores them to rdi. The je below uses
; the flags from `cmp rdx,448` in $L$tail8x (untouched by the AVX
; instructions): exactly 448 bytes -> done.
| 1625 $L$448_or_more8x: |
| 1626 vpxor ymm6,ymm6,YMMWORD[rsi] |
| 1627 vpxor ymm8,ymm8,YMMWORD[32+rsi] |
| 1628 vpxor ymm1,ymm1,YMMWORD[64+rsi] |
| 1629 vpxor ymm5,ymm5,YMMWORD[96+rsi] |
| 1630 vpxor ymm12,ymm12,YMMWORD[128+rsi] |
| 1631 vpxor ymm13,ymm13,YMMWORD[160+rsi] |
| 1632 vpxor ymm10,ymm10,YMMWORD[192+rsi] |
| 1633 vpxor ymm15,ymm15,YMMWORD[224+rsi] |
| 1634 vpxor ymm14,ymm14,YMMWORD[256+rsi] |
| 1635 vpxor ymm2,ymm2,YMMWORD[288+rsi] |
| 1636 vpxor ymm3,ymm3,YMMWORD[320+rsi] |
| 1637 vpxor ymm7,ymm7,YMMWORD[352+rsi] |
| 1638 vpxor ymm11,ymm11,YMMWORD[384+rsi] |
| 1639 vpxor ymm9,ymm9,YMMWORD[416+rsi] |
| 1640 vmovdqu YMMWORD[rdi],ymm6 |
| 1641 vmovdqu YMMWORD[32+rdi],ymm8 |
| 1642 vmovdqu YMMWORD[64+rdi],ymm1 |
| 1643 vmovdqu YMMWORD[96+rdi],ymm5 |
| 1644 vmovdqu YMMWORD[128+rdi],ymm12 |
| 1645 vmovdqu YMMWORD[160+rdi],ymm13 |
| 1646 vmovdqu YMMWORD[192+rdi],ymm10 |
| 1647 vmovdqu YMMWORD[224+rdi],ymm15 |
| 1648 vmovdqu YMMWORD[256+rdi],ymm14 |
| 1649 vmovdqu YMMWORD[288+rdi],ymm2 |
| 1650 vmovdqu YMMWORD[320+rdi],ymm3 |
| 1651 vmovdqu YMMWORD[352+rdi],ymm7 |
| 1652 vmovdqu YMMWORD[384+rdi],ymm11 |
| 1653 vmovdqu YMMWORD[416+rdi],ymm9 |
| 1654 je NEAR $L$done8x |
| 1655 |
; otherwise advance past 448 bytes and stage the last 64 output bytes
; (ymm0, ymm4) at [rsp], index r10 = 0; there is no jump here - execution
; falls through into $L$oop_tail8x
| 1656 lea rsi,[448+rsi] |
| 1657 xor r10,r10 |
| 1658 vmovdqa YMMWORD[rsp],ymm0 |
| 1659 lea rdi,[448+rdi] |
| 1660 sub rdx,448 |
| 1661 vmovdqa YMMWORD[32+rsp],ymm4 |
| 1662 |
; Byte-granular tail: out[i] = in[i] XOR stack[i] for the last rdx bytes.
;   rsi = input, rdi = output, rsp = 64 staged bytes written by the caller
;   path, r10 = byte index (zeroed by every path that enters here),
;   rdx = bytes left.
; The loop is bottom-tested with dec/jnz, so it assumes rdx >= 1 on entry.
; The $L$*_or_more8x handlers guarantee this via their `je $L$done8x`; for the
; under-64 path it presumably holds because callers never reach the tail
; with rdx == 0 - confirm against the code above this excerpt.
; r10 is incremented before the store, hence the -1 displacement.
| 1663 $L$oop_tail8x: |
| 1664 movzx eax,BYTE[r10*1+rsi] |
| 1665 movzx ecx,BYTE[r10*1+rsp] |
| 1666 lea r10,[1+r10] |
| 1667 xor eax,ecx |
| 1668 mov BYTE[((-1))+r10*1+rdi],al |
| 1669 dec rdx |
| 1670 jnz NEAR $L$oop_tail8x |
| 1671 |
; Common exit.
;  - vzeroall clears every ymm register (presumably so no cipher state is
;    left behind in vector registers).
;  - xmm6-xmm15 are then reloaded from the ten 16-byte slots at
;    rsp+656 .. rsp+815 (addressed via r11 = rsp+656+48); these are
;    callee-saved under the Win64 ABI and are presumably stored there by the
;    prologue, which is outside this excerpt.
;  - rsp is restored from the value kept at [640+rsp], after which the
;    rdi/rsi home slots at 8/16+rsp are reloaded (Win64 epilogue).
;  - DB 0F3h,0C3h encodes `rep ret`.
| 1672 $L$done8x: |
| 1673 vzeroall |
| 1674 lea r11,[((656+48))+rsp] |
| 1675 movaps xmm6,XMMWORD[((-48))+r11] |
| 1676 movaps xmm7,XMMWORD[((-32))+r11] |
| 1677 movaps xmm8,XMMWORD[((-16))+r11] |
| 1678 movaps xmm9,XMMWORD[r11] |
| 1679 movaps xmm10,XMMWORD[16+r11] |
| 1680 movaps xmm11,XMMWORD[32+r11] |
| 1681 movaps xmm12,XMMWORD[48+r11] |
| 1682 movaps xmm13,XMMWORD[64+r11] |
| 1683 movaps xmm14,XMMWORD[80+r11] |
| 1684 movaps xmm15,XMMWORD[96+r11] |
| 1685 mov rsp,QWORD[640+rsp] |
| 1686 mov rdi,QWORD[8+rsp] ;WIN64 epilogue |
| 1687 mov rsi,QWORD[16+rsp] |
| 1688 DB 0F3h,0C3h ;repret |
| 1689 $L$SEH_end_ChaCha20_8x: |
OLD | NEW |