Index: third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm |
diff --git a/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm b/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm |
index 284318aae32ceab661f82a85e7a46d3b563b66bf..cd9a6e5d4eddf0764bf1b6385dcc102bde46662f 100644 |
--- a/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm |
+++ b/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm |
@@ -31,49 +31,151 @@ ALIGN 16 |
$L$mul_enter: |
mov r9d,r9d |
mov rax,rsp |
- mov r10d,DWORD[56+rsp] |
+ movd xmm5,DWORD[56+rsp] |
+ lea r10,[$L$inc] |
push rbx |
push rbp |
push r12 |
push r13 |
push r14 |
push r15 |
- lea rsp,[((-40))+rsp] |
- movaps XMMWORD[rsp],xmm6 |
- movaps XMMWORD[16+rsp],xmm7 |
+ |
lea r11,[2+r9] |
neg r11 |
- lea rsp,[r11*8+rsp] |
+ lea rsp,[((-264))+r11*8+rsp] |
and rsp,-1024 |
mov QWORD[8+r9*8+rsp],rax |
$L$mul_body: |
- mov r12,rdx |
- mov r11,r10 |
- shr r10,3 |
- and r11,7 |
- not r10 |
- lea rax,[$L$magic_masks] |
- and r10,3 |
- lea r12,[96+r11*8+r12] |
- movq xmm4,QWORD[r10*8+rax] |
- movq xmm5,QWORD[8+r10*8+rax] |
- movq xmm6,QWORD[16+r10*8+rax] |
- movq xmm7,QWORD[24+r10*8+rax] |
- |
- movq xmm0,QWORD[(((-96)))+r12] |
- movq xmm1,QWORD[((-32))+r12] |
- pand xmm0,xmm4 |
- movq xmm2,QWORD[32+r12] |
- pand xmm1,xmm5 |
- movq xmm3,QWORD[96+r12] |
- pand xmm2,xmm6 |
- por xmm0,xmm1 |
- pand xmm3,xmm7 |
+ lea r12,[128+rdx] |
+ movdqa xmm0,XMMWORD[r10] |
+ movdqa xmm1,XMMWORD[16+r10] |
+ lea r10,[((24-112))+r9*8+rsp] |
+ and r10,-16 |
+ |
+ pshufd xmm5,xmm5,0 |
+ movdqa xmm4,xmm1 |
+ movdqa xmm2,xmm1 |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+DB 0x67 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[112+r10],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[128+r10],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[144+r10],xmm2 |
+ movdqa xmm2,xmm4 |
+ |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[160+r10],xmm3 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[176+r10],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[192+r10],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[208+r10],xmm2 |
+ movdqa xmm2,xmm4 |
+ |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[224+r10],xmm3 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[240+r10],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[256+r10],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[272+r10],xmm2 |
+ movdqa xmm2,xmm4 |
+ |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[288+r10],xmm3 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[304+r10],xmm0 |
+ |
+ paddd xmm3,xmm2 |
+DB 0x67 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[320+r10],xmm1 |
+ |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[336+r10],xmm2 |
+ pand xmm0,XMMWORD[64+r12] |
+ |
+ pand xmm1,XMMWORD[80+r12] |
+ pand xmm2,XMMWORD[96+r12] |
+ movdqa XMMWORD[352+r10],xmm3 |
+ pand xmm3,XMMWORD[112+r12] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ movdqa xmm4,XMMWORD[((-128))+r12] |
+ movdqa xmm5,XMMWORD[((-112))+r12] |
+ movdqa xmm2,XMMWORD[((-96))+r12] |
+ pand xmm4,XMMWORD[112+r10] |
+ movdqa xmm3,XMMWORD[((-80))+r12] |
+ pand xmm5,XMMWORD[128+r10] |
+ por xmm0,xmm4 |
+ pand xmm2,XMMWORD[144+r10] |
+ por xmm1,xmm5 |
+ pand xmm3,XMMWORD[160+r10] |
por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ movdqa xmm4,XMMWORD[((-64))+r12] |
+ movdqa xmm5,XMMWORD[((-48))+r12] |
+ movdqa xmm2,XMMWORD[((-32))+r12] |
+ pand xmm4,XMMWORD[176+r10] |
+ movdqa xmm3,XMMWORD[((-16))+r12] |
+ pand xmm5,XMMWORD[192+r10] |
+ por xmm0,xmm4 |
+ pand xmm2,XMMWORD[208+r10] |
+ por xmm1,xmm5 |
+ pand xmm3,XMMWORD[224+r10] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ movdqa xmm4,XMMWORD[r12] |
+ movdqa xmm5,XMMWORD[16+r12] |
+ movdqa xmm2,XMMWORD[32+r12] |
+ pand xmm4,XMMWORD[240+r10] |
+ movdqa xmm3,XMMWORD[48+r12] |
+ pand xmm5,XMMWORD[256+r10] |
+ por xmm0,xmm4 |
+ pand xmm2,XMMWORD[272+r10] |
+ por xmm1,xmm5 |
+ pand xmm3,XMMWORD[288+r10] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ por xmm0,xmm1 |
+ pshufd xmm1,xmm0,0x4e |
+ por xmm0,xmm1 |
lea r12,[256+r12] |
- por xmm0,xmm3 |
- |
DB 102,72,15,126,195 |
mov r8,QWORD[r8] |
@@ -82,29 +184,14 @@ DB 102,72,15,126,195 |
xor r14,r14 |
xor r15,r15 |
- movq xmm0,QWORD[(((-96)))+r12] |
- movq xmm1,QWORD[((-32))+r12] |
- pand xmm0,xmm4 |
- movq xmm2,QWORD[32+r12] |
- pand xmm1,xmm5 |
- |
mov rbp,r8 |
mul rbx |
mov r10,rax |
mov rax,QWORD[rcx] |
- movq xmm3,QWORD[96+r12] |
- pand xmm2,xmm6 |
- por xmm0,xmm1 |
- pand xmm3,xmm7 |
- |
imul rbp,r10 |
mov r11,rdx |
- por xmm0,xmm2 |
- lea r12,[256+r12] |
- por xmm0,xmm3 |
- |
mul rbp |
add r10,rax |
mov rax,QWORD[8+rsi] |
@@ -137,14 +224,12 @@ $L$1st_enter: |
cmp r15,r9 |
jne NEAR $L$1st |
-DB 102,72,15,126,195 |
add r13,rax |
- mov rax,QWORD[rsi] |
adc rdx,0 |
add r13,r11 |
adc rdx,0 |
- mov QWORD[((-16))+r15*8+rsp],r13 |
+ mov QWORD[((-16))+r9*8+rsp],r13 |
mov r13,rdx |
mov r11,r10 |
@@ -158,33 +243,78 @@ DB 102,72,15,126,195 |
jmp NEAR $L$outer |
ALIGN 16 |
$L$outer: |
+ lea rdx,[((24+128))+r9*8+rsp] |
+ and rdx,-16 |
+ pxor xmm4,xmm4 |
+ pxor xmm5,xmm5 |
+ movdqa xmm0,XMMWORD[((-128))+r12] |
+ movdqa xmm1,XMMWORD[((-112))+r12] |
+ movdqa xmm2,XMMWORD[((-96))+r12] |
+ movdqa xmm3,XMMWORD[((-80))+r12] |
+ pand xmm0,XMMWORD[((-128))+rdx] |
+ pand xmm1,XMMWORD[((-112))+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[((-96))+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[((-80))+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[((-64))+r12] |
+ movdqa xmm1,XMMWORD[((-48))+r12] |
+ movdqa xmm2,XMMWORD[((-32))+r12] |
+ movdqa xmm3,XMMWORD[((-16))+r12] |
+ pand xmm0,XMMWORD[((-64))+rdx] |
+ pand xmm1,XMMWORD[((-48))+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[((-32))+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[((-16))+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[r12] |
+ movdqa xmm1,XMMWORD[16+r12] |
+ movdqa xmm2,XMMWORD[32+r12] |
+ movdqa xmm3,XMMWORD[48+r12] |
+ pand xmm0,XMMWORD[rdx] |
+ pand xmm1,XMMWORD[16+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[32+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[48+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[64+r12] |
+ movdqa xmm1,XMMWORD[80+r12] |
+ movdqa xmm2,XMMWORD[96+r12] |
+ movdqa xmm3,XMMWORD[112+r12] |
+ pand xmm0,XMMWORD[64+rdx] |
+ pand xmm1,XMMWORD[80+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[96+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[112+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ por xmm4,xmm5 |
+ pshufd xmm0,xmm4,0x4e |
+ por xmm0,xmm4 |
+ lea r12,[256+r12] |
+ |
+ mov rax,QWORD[rsi] |
+DB 102,72,15,126,195 |
+ |
xor r15,r15 |
mov rbp,r8 |
mov r10,QWORD[rsp] |
- movq xmm0,QWORD[(((-96)))+r12] |
- movq xmm1,QWORD[((-32))+r12] |
- pand xmm0,xmm4 |
- movq xmm2,QWORD[32+r12] |
- pand xmm1,xmm5 |
- |
mul rbx |
add r10,rax |
mov rax,QWORD[rcx] |
adc rdx,0 |
- movq xmm3,QWORD[96+r12] |
- pand xmm2,xmm6 |
- por xmm0,xmm1 |
- pand xmm3,xmm7 |
- |
imul rbp,r10 |
mov r11,rdx |
- por xmm0,xmm2 |
- lea r12,[256+r12] |
- por xmm0,xmm3 |
- |
mul rbp |
add r10,rax |
mov rax,QWORD[8+rsi] |
@@ -220,15 +350,12 @@ $L$inner_enter: |
cmp r15,r9 |
jne NEAR $L$inner |
-DB 102,72,15,126,195 |
- |
add r13,rax |
- mov rax,QWORD[rsi] |
adc rdx,0 |
add r13,r10 |
- mov r10,QWORD[r15*8+rsp] |
+ mov r10,QWORD[r9*8+rsp] |
adc rdx,0 |
- mov QWORD[((-16))+r15*8+rsp],r13 |
+ mov QWORD[((-16))+r9*8+rsp],r13 |
mov r13,rdx |
xor rdx,rdx |
@@ -274,8 +401,7 @@ $L$copy: |
mov rsi,QWORD[8+r9*8+rsp] |
mov rax,1 |
- movaps xmm6,XMMWORD[((-88))+rsi] |
- movaps xmm7,XMMWORD[((-72))+rsi] |
+ |
mov r15,QWORD[((-48))+rsi] |
mov r14,QWORD[((-40))+rsi] |
mov r13,QWORD[((-32))+rsi] |
@@ -312,13 +438,10 @@ DB 0x67 |
push r13 |
push r14 |
push r15 |
- lea rsp,[((-40))+rsp] |
- movaps XMMWORD[rsp],xmm6 |
- movaps XMMWORD[16+rsp],xmm7 |
+ |
DB 0x67 |
- mov r10d,r9d |
shl r9d,3 |
- shl r10d,3+2 |
+ lea r10,[r9*2+r9] |
neg r9 |
@@ -328,19 +451,21 @@ DB 0x67 |
- lea r11,[((-64))+r9*2+rsp] |
- sub r11,rsi |
+ |
+ |
+ lea r11,[((-320))+r9*2+rsp] |
+ sub r11,rdi |
and r11,4095 |
cmp r10,r11 |
jb NEAR $L$mul4xsp_alt |
sub rsp,r11 |
- lea rsp,[((-64))+r9*2+rsp] |
+ lea rsp,[((-320))+r9*2+rsp] |
jmp NEAR $L$mul4xsp_done |
ALIGN 32 |
$L$mul4xsp_alt: |
- lea r10,[((4096-64))+r9*2] |
- lea rsp,[((-64))+r9*2+rsp] |
+ lea r10,[((4096-320))+r9*2] |
+ lea rsp,[((-320))+r9*2+rsp] |
sub r11,r10 |
mov r10,0 |
cmovc r11,r10 |
@@ -356,8 +481,7 @@ $L$mul4x_body: |
mov rsi,QWORD[40+rsp] |
mov rax,1 |
- movaps xmm6,XMMWORD[((-88))+rsi] |
- movaps xmm7,XMMWORD[((-72))+rsi] |
+ |
mov r15,QWORD[((-48))+rsi] |
mov r14,QWORD[((-40))+rsi] |
mov r13,QWORD[((-32))+rsi] |
@@ -375,47 +499,141 @@ $L$SEH_end_bn_mul4x_mont_gather5: |
ALIGN 32 |
mul4x_internal: |
shl r9,5 |
- mov r10d,DWORD[56+rax] |
- lea r13,[256+r9*1+rdx] |
+ movd xmm5,DWORD[56+rax] |
+ lea rax,[$L$inc] |
+ lea r13,[128+r9*1+rdx] |
shr r9,5 |
- mov r11,r10 |
- shr r10,3 |
- and r11,7 |
- not r10 |
- lea rax,[$L$magic_masks] |
- and r10,3 |
- lea r12,[96+r11*8+rdx] |
- movq xmm4,QWORD[r10*8+rax] |
- movq xmm5,QWORD[8+r10*8+rax] |
- add r11,7 |
- movq xmm6,QWORD[16+r10*8+rax] |
- movq xmm7,QWORD[24+r10*8+rax] |
- and r11,7 |
- |
- movq xmm0,QWORD[(((-96)))+r12] |
- lea r14,[256+r12] |
- movq xmm1,QWORD[((-32))+r12] |
- pand xmm0,xmm4 |
- movq xmm2,QWORD[32+r12] |
- pand xmm1,xmm5 |
- movq xmm3,QWORD[96+r12] |
- pand xmm2,xmm6 |
-DB 0x67 |
- por xmm0,xmm1 |
- movq xmm1,QWORD[((-96))+r14] |
-DB 0x67 |
- pand xmm3,xmm7 |
-DB 0x67 |
- por xmm0,xmm2 |
- movq xmm2,QWORD[((-32))+r14] |
+ movdqa xmm0,XMMWORD[rax] |
+ movdqa xmm1,XMMWORD[16+rax] |
+ lea r10,[((88-112))+r9*1+rsp] |
+ lea r12,[128+rdx] |
+ |
+ pshufd xmm5,xmm5,0 |
+ movdqa xmm4,xmm1 |
+DB 0x67,0x67 |
+ movdqa xmm2,xmm1 |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
DB 0x67 |
- pand xmm1,xmm4 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[112+r10],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[128+r10],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[144+r10],xmm2 |
+ movdqa xmm2,xmm4 |
+ |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[160+r10],xmm3 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[176+r10],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[192+r10],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[208+r10],xmm2 |
+ movdqa xmm2,xmm4 |
+ |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[224+r10],xmm3 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[240+r10],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[256+r10],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[272+r10],xmm2 |
+ movdqa xmm2,xmm4 |
+ |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[288+r10],xmm3 |
+ movdqa xmm3,xmm4 |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[304+r10],xmm0 |
+ |
+ paddd xmm3,xmm2 |
DB 0x67 |
- por xmm0,xmm3 |
- movq xmm3,QWORD[32+r14] |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[320+r10],xmm1 |
+ |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[336+r10],xmm2 |
+ pand xmm0,XMMWORD[64+r12] |
+ pand xmm1,XMMWORD[80+r12] |
+ pand xmm2,XMMWORD[96+r12] |
+ movdqa XMMWORD[352+r10],xmm3 |
+ pand xmm3,XMMWORD[112+r12] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ movdqa xmm4,XMMWORD[((-128))+r12] |
+ movdqa xmm5,XMMWORD[((-112))+r12] |
+ movdqa xmm2,XMMWORD[((-96))+r12] |
+ pand xmm4,XMMWORD[112+r10] |
+ movdqa xmm3,XMMWORD[((-80))+r12] |
+ pand xmm5,XMMWORD[128+r10] |
+ por xmm0,xmm4 |
+ pand xmm2,XMMWORD[144+r10] |
+ por xmm1,xmm5 |
+ pand xmm3,XMMWORD[160+r10] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ movdqa xmm4,XMMWORD[((-64))+r12] |
+ movdqa xmm5,XMMWORD[((-48))+r12] |
+ movdqa xmm2,XMMWORD[((-32))+r12] |
+ pand xmm4,XMMWORD[176+r10] |
+ movdqa xmm3,XMMWORD[((-16))+r12] |
+ pand xmm5,XMMWORD[192+r10] |
+ por xmm0,xmm4 |
+ pand xmm2,XMMWORD[208+r10] |
+ por xmm1,xmm5 |
+ pand xmm3,XMMWORD[224+r10] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ movdqa xmm4,XMMWORD[r12] |
+ movdqa xmm5,XMMWORD[16+r12] |
+ movdqa xmm2,XMMWORD[32+r12] |
+ pand xmm4,XMMWORD[240+r10] |
+ movdqa xmm3,XMMWORD[48+r12] |
+ pand xmm5,XMMWORD[256+r10] |
+ por xmm0,xmm4 |
+ pand xmm2,XMMWORD[272+r10] |
+ por xmm1,xmm5 |
+ pand xmm3,XMMWORD[288+r10] |
+ por xmm0,xmm2 |
+ por xmm1,xmm3 |
+ por xmm0,xmm1 |
+ pshufd xmm1,xmm0,0x4e |
+ por xmm0,xmm1 |
+ lea r12,[256+r12] |
DB 102,72,15,126,195 |
- movq xmm0,QWORD[96+r14] |
+ |
mov QWORD[((16+8))+rsp],r13 |
mov QWORD[((56+8))+rsp],rdi |
@@ -429,26 +647,10 @@ DB 102,72,15,126,195 |
mov r10,rax |
mov rax,QWORD[rcx] |
- pand xmm2,xmm5 |
- pand xmm3,xmm6 |
- por xmm1,xmm2 |
- |
imul rbp,r10 |
- |
- |
- |
- |
- |
- |
- |
- lea r14,[((64+8))+r11*8+rsp] |
+ lea r14,[((64+8))+rsp] |
mov r11,rdx |
- pand xmm0,xmm7 |
- por xmm1,xmm3 |
- lea r12,[512+r12] |
- por xmm0,xmm1 |
- |
mul rbp |
add r10,rax |
mov rax,QWORD[8+r9*1+rsi] |
@@ -457,7 +659,7 @@ DB 102,72,15,126,195 |
mul rbx |
add r11,rax |
- mov rax,QWORD[16+rcx] |
+ mov rax,QWORD[8+rcx] |
adc rdx,0 |
mov r10,rdx |
@@ -467,7 +669,7 @@ DB 102,72,15,126,195 |
adc rdx,0 |
add rdi,r11 |
lea r15,[32+r9] |
- lea rcx,[64+rcx] |
+ lea rcx,[32+rcx] |
adc rdx,0 |
mov QWORD[r14],rdi |
mov r13,rdx |
@@ -477,7 +679,7 @@ ALIGN 32 |
$L$1st4x: |
mul rbx |
add r10,rax |
- mov rax,QWORD[((-32))+rcx] |
+ mov rax,QWORD[((-16))+rcx] |
lea r14,[32+r14] |
adc rdx,0 |
mov r11,rdx |
@@ -493,7 +695,7 @@ $L$1st4x: |
mul rbx |
add r11,rax |
- mov rax,QWORD[((-16))+rcx] |
+ mov rax,QWORD[((-8))+rcx] |
adc rdx,0 |
mov r10,rdx |
@@ -523,7 +725,7 @@ $L$1st4x: |
mul rbx |
add r11,rax |
- mov rax,QWORD[16+rcx] |
+ mov rax,QWORD[8+rcx] |
adc rdx,0 |
mov r10,rdx |
@@ -532,7 +734,7 @@ $L$1st4x: |
mov rax,QWORD[16+r15*1+rsi] |
adc rdx,0 |
add rdi,r11 |
- lea rcx,[64+rcx] |
+ lea rcx,[32+rcx] |
adc rdx,0 |
mov QWORD[r14],rdi |
mov r13,rdx |
@@ -542,7 +744,7 @@ $L$1st4x: |
mul rbx |
add r10,rax |
- mov rax,QWORD[((-32))+rcx] |
+ mov rax,QWORD[((-16))+rcx] |
lea r14,[32+r14] |
adc rdx,0 |
mov r11,rdx |
@@ -558,7 +760,7 @@ $L$1st4x: |
mul rbx |
add r11,rax |
- mov rax,QWORD[((-16))+rcx] |
+ mov rax,QWORD[((-8))+rcx] |
adc rdx,0 |
mov r10,rdx |
@@ -571,8 +773,7 @@ $L$1st4x: |
mov QWORD[((-16))+r14],rdi |
mov r13,rdx |
-DB 102,72,15,126,195 |
- lea rcx,[r9*2+rcx] |
+ lea rcx,[r9*1+rcx] |
xor rdi,rdi |
add r13,r10 |
@@ -583,6 +784,63 @@ DB 102,72,15,126,195 |
ALIGN 32 |
$L$outer4x: |
+ lea rdx,[((16+128))+r14] |
+ pxor xmm4,xmm4 |
+ pxor xmm5,xmm5 |
+ movdqa xmm0,XMMWORD[((-128))+r12] |
+ movdqa xmm1,XMMWORD[((-112))+r12] |
+ movdqa xmm2,XMMWORD[((-96))+r12] |
+ movdqa xmm3,XMMWORD[((-80))+r12] |
+ pand xmm0,XMMWORD[((-128))+rdx] |
+ pand xmm1,XMMWORD[((-112))+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[((-96))+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[((-80))+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[((-64))+r12] |
+ movdqa xmm1,XMMWORD[((-48))+r12] |
+ movdqa xmm2,XMMWORD[((-32))+r12] |
+ movdqa xmm3,XMMWORD[((-16))+r12] |
+ pand xmm0,XMMWORD[((-64))+rdx] |
+ pand xmm1,XMMWORD[((-48))+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[((-32))+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[((-16))+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[r12] |
+ movdqa xmm1,XMMWORD[16+r12] |
+ movdqa xmm2,XMMWORD[32+r12] |
+ movdqa xmm3,XMMWORD[48+r12] |
+ pand xmm0,XMMWORD[rdx] |
+ pand xmm1,XMMWORD[16+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[32+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[48+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[64+r12] |
+ movdqa xmm1,XMMWORD[80+r12] |
+ movdqa xmm2,XMMWORD[96+r12] |
+ movdqa xmm3,XMMWORD[112+r12] |
+ pand xmm0,XMMWORD[64+rdx] |
+ pand xmm1,XMMWORD[80+rdx] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[96+rdx] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[112+rdx] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ por xmm4,xmm5 |
+ pshufd xmm0,xmm4,0x4e |
+ por xmm0,xmm4 |
+ lea r12,[256+r12] |
+DB 102,72,15,126,195 |
+ |
mov r10,QWORD[r9*1+r14] |
mov rbp,r8 |
mul rbx |
@@ -590,25 +848,11 @@ $L$outer4x: |
mov rax,QWORD[rcx] |
adc rdx,0 |
- movq xmm0,QWORD[(((-96)))+r12] |
- movq xmm1,QWORD[((-32))+r12] |
- pand xmm0,xmm4 |
- movq xmm2,QWORD[32+r12] |
- pand xmm1,xmm5 |
- movq xmm3,QWORD[96+r12] |
- |
imul rbp,r10 |
-DB 0x67 |
mov r11,rdx |
mov QWORD[r14],rdi |
- pand xmm2,xmm6 |
- por xmm0,xmm1 |
- pand xmm3,xmm7 |
- por xmm0,xmm2 |
lea r14,[r9*1+r14] |
- lea r12,[256+r12] |
- por xmm0,xmm3 |
mul rbp |
add r10,rax |
@@ -618,7 +862,7 @@ DB 0x67 |
mul rbx |
add r11,rax |
- mov rax,QWORD[16+rcx] |
+ mov rax,QWORD[8+rcx] |
adc rdx,0 |
add r11,QWORD[8+r14] |
adc rdx,0 |
@@ -630,7 +874,7 @@ DB 0x67 |
adc rdx,0 |
add rdi,r11 |
lea r15,[32+r9] |
- lea rcx,[64+rcx] |
+ lea rcx,[32+rcx] |
adc rdx,0 |
mov r13,rdx |
jmp NEAR $L$inner4x |
@@ -639,7 +883,7 @@ ALIGN 32 |
$L$inner4x: |
mul rbx |
add r10,rax |
- mov rax,QWORD[((-32))+rcx] |
+ mov rax,QWORD[((-16))+rcx] |
adc rdx,0 |
add r10,QWORD[16+r14] |
lea r14,[32+r14] |
@@ -657,7 +901,7 @@ $L$inner4x: |
mul rbx |
add r11,rax |
- mov rax,QWORD[((-16))+rcx] |
+ mov rax,QWORD[((-8))+rcx] |
adc rdx,0 |
add r11,QWORD[((-8))+r14] |
adc rdx,0 |
@@ -691,7 +935,7 @@ $L$inner4x: |
mul rbx |
add r11,rax |
- mov rax,QWORD[16+rcx] |
+ mov rax,QWORD[8+rcx] |
adc rdx,0 |
add r11,QWORD[8+r14] |
adc rdx,0 |
@@ -702,7 +946,7 @@ $L$inner4x: |
mov rax,QWORD[16+r15*1+rsi] |
adc rdx,0 |
add rdi,r11 |
- lea rcx,[64+rcx] |
+ lea rcx,[32+rcx] |
adc rdx,0 |
mov QWORD[((-8))+r14],r13 |
mov r13,rdx |
@@ -712,7 +956,7 @@ $L$inner4x: |
mul rbx |
add r10,rax |
- mov rax,QWORD[((-32))+rcx] |
+ mov rax,QWORD[((-16))+rcx] |
adc rdx,0 |
add r10,QWORD[16+r14] |
lea r14,[32+r14] |
@@ -731,7 +975,7 @@ $L$inner4x: |
mul rbx |
add r11,rax |
mov rax,rbp |
- mov rbp,QWORD[((-16))+rcx] |
+ mov rbp,QWORD[((-8))+rcx] |
adc rdx,0 |
add r11,QWORD[((-8))+r14] |
adc rdx,0 |
@@ -746,9 +990,8 @@ $L$inner4x: |
mov QWORD[((-24))+r14],r13 |
mov r13,rdx |
-DB 102,72,15,126,195 |
mov QWORD[((-16))+r14],rdi |
- lea rcx,[r9*2+rcx] |
+ lea rcx,[r9*1+rcx] |
xor rdi,rdi |
add r13,r10 |
@@ -759,16 +1002,23 @@ DB 102,72,15,126,195 |
cmp r12,QWORD[((16+8))+rsp] |
jb NEAR $L$outer4x |
+ xor rax,rax |
sub rbp,r13 |
adc r15,r15 |
or rdi,r15 |
- xor rdi,1 |
+ sub rax,rdi |
lea rbx,[r9*1+r14] |
- lea rbp,[rdi*8+rcx] |
+ mov r12,QWORD[rcx] |
+ lea rbp,[rcx] |
mov rcx,r9 |
sar rcx,3+2 |
mov rdi,QWORD[((56+8))+rsp] |
- jmp NEAR $L$sqr4x_sub |
+ dec r12 |
+ xor r10,r10 |
+ mov r13,QWORD[8+rbp] |
+ mov r14,QWORD[16+rbp] |
+ mov r15,QWORD[24+rbp] |
+ jmp NEAR $L$sqr4x_sub_entry |
global bn_power5 |
@@ -793,12 +1043,9 @@ $L$SEH_begin_bn_power5: |
push r13 |
push r14 |
push r15 |
- lea rsp,[((-40))+rsp] |
- movaps XMMWORD[rsp],xmm6 |
- movaps XMMWORD[16+rsp],xmm7 |
- mov r10d,r9d |
+ |
shl r9d,3 |
- shl r10d,3+2 |
+ lea r10d,[r9*2+r9] |
neg r9 |
mov r8,QWORD[r8] |
@@ -808,19 +1055,20 @@ $L$SEH_begin_bn_power5: |
- lea r11,[((-64))+r9*2+rsp] |
- sub r11,rsi |
+ |
+ lea r11,[((-320))+r9*2+rsp] |
+ sub r11,rdi |
and r11,4095 |
cmp r10,r11 |
jb NEAR $L$pwr_sp_alt |
sub rsp,r11 |
- lea rsp,[((-64))+r9*2+rsp] |
+ lea rsp,[((-320))+r9*2+rsp] |
jmp NEAR $L$pwr_sp_done |
ALIGN 32 |
$L$pwr_sp_alt: |
- lea r10,[((4096-64))+r9*2] |
- lea rsp,[((-64))+r9*2+rsp] |
+ lea r10,[((4096-320))+r9*2] |
+ lea rsp,[((-320))+r9*2+rsp] |
sub r11,r10 |
mov r10,0 |
cmovc r11,r10 |
@@ -848,10 +1096,15 @@ DB 102,73,15,110,218 |
DB 102,72,15,110,226 |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
DB 102,72,15,126,209 |
DB 102,72,15,126,226 |
@@ -1397,9 +1650,9 @@ DB 0x67 |
mov QWORD[((-16))+rdi],rbx |
mov QWORD[((-8))+rdi],r8 |
DB 102,72,15,126,213 |
-sqr8x_reduction: |
+__bn_sqr8x_reduction: |
xor rax,rax |
- lea rcx,[r9*2+rbp] |
+ lea rcx,[rbp*1+r9] |
lea rdx,[((48+8))+r9*2+rsp] |
mov QWORD[((0+8))+rsp],rcx |
lea rdi,[((48+8))+r9*1+rsp] |
@@ -1432,14 +1685,14 @@ DB 0x67 |
ALIGN 32 |
$L$8x_reduce: |
mul rbx |
- mov rax,QWORD[16+rbp] |
+ mov rax,QWORD[8+rbp] |
neg r8 |
mov r8,rdx |
adc r8,0 |
mul rbx |
add r9,rax |
- mov rax,QWORD[32+rbp] |
+ mov rax,QWORD[16+rbp] |
adc rdx,0 |
add r8,r9 |
mov QWORD[((48-8+8))+rcx*8+rsp],rbx |
@@ -1448,7 +1701,7 @@ $L$8x_reduce: |
mul rbx |
add r10,rax |
- mov rax,QWORD[48+rbp] |
+ mov rax,QWORD[24+rbp] |
adc rdx,0 |
add r9,r10 |
mov rsi,QWORD[((32+8))+rsp] |
@@ -1457,7 +1710,7 @@ $L$8x_reduce: |
mul rbx |
add r11,rax |
- mov rax,QWORD[64+rbp] |
+ mov rax,QWORD[32+rbp] |
adc rdx,0 |
imul rsi,r8 |
add r10,r11 |
@@ -1466,7 +1719,7 @@ $L$8x_reduce: |
mul rbx |
add r12,rax |
- mov rax,QWORD[80+rbp] |
+ mov rax,QWORD[40+rbp] |
adc rdx,0 |
add r11,r12 |
mov r12,rdx |
@@ -1474,7 +1727,7 @@ $L$8x_reduce: |
mul rbx |
add r13,rax |
- mov rax,QWORD[96+rbp] |
+ mov rax,QWORD[48+rbp] |
adc rdx,0 |
add r12,r13 |
mov r13,rdx |
@@ -1482,7 +1735,7 @@ $L$8x_reduce: |
mul rbx |
add r14,rax |
- mov rax,QWORD[112+rbp] |
+ mov rax,QWORD[56+rbp] |
adc rdx,0 |
add r13,r14 |
mov r14,rdx |
@@ -1500,7 +1753,7 @@ $L$8x_reduce: |
dec ecx |
jnz NEAR $L$8x_reduce |
- lea rbp,[128+rbp] |
+ lea rbp,[64+rbp] |
xor rax,rax |
mov rdx,QWORD[((8+8))+rsp] |
cmp rbp,QWORD[((0+8))+rsp] |
@@ -1526,14 +1779,14 @@ ALIGN 32 |
$L$8x_tail: |
mul rbx |
add r8,rax |
- mov rax,QWORD[16+rbp] |
+ mov rax,QWORD[8+rbp] |
mov QWORD[rdi],r8 |
mov r8,rdx |
adc r8,0 |
mul rbx |
add r9,rax |
- mov rax,QWORD[32+rbp] |
+ mov rax,QWORD[16+rbp] |
adc rdx,0 |
add r8,r9 |
lea rdi,[8+rdi] |
@@ -1542,7 +1795,7 @@ $L$8x_tail: |
mul rbx |
add r10,rax |
- mov rax,QWORD[48+rbp] |
+ mov rax,QWORD[24+rbp] |
adc rdx,0 |
add r9,r10 |
mov r10,rdx |
@@ -1550,7 +1803,7 @@ $L$8x_tail: |
mul rbx |
add r11,rax |
- mov rax,QWORD[64+rbp] |
+ mov rax,QWORD[32+rbp] |
adc rdx,0 |
add r10,r11 |
mov r11,rdx |
@@ -1558,7 +1811,7 @@ $L$8x_tail: |
mul rbx |
add r12,rax |
- mov rax,QWORD[80+rbp] |
+ mov rax,QWORD[40+rbp] |
adc rdx,0 |
add r11,r12 |
mov r12,rdx |
@@ -1566,7 +1819,7 @@ $L$8x_tail: |
mul rbx |
add r13,rax |
- mov rax,QWORD[96+rbp] |
+ mov rax,QWORD[48+rbp] |
adc rdx,0 |
add r12,r13 |
mov r13,rdx |
@@ -1574,7 +1827,7 @@ $L$8x_tail: |
mul rbx |
add r14,rax |
- mov rax,QWORD[112+rbp] |
+ mov rax,QWORD[56+rbp] |
adc rdx,0 |
add r13,r14 |
mov r14,rdx |
@@ -1592,7 +1845,7 @@ $L$8x_tail: |
dec ecx |
jnz NEAR $L$8x_tail |
- lea rbp,[128+rbp] |
+ lea rbp,[64+rbp] |
mov rdx,QWORD[((8+8))+rsp] |
cmp rbp,QWORD[((0+8))+rsp] |
jae NEAR $L$8x_tail_done |
@@ -1616,6 +1869,15 @@ $L$8x_tail: |
ALIGN 32 |
$L$8x_tail_done: |
add r8,QWORD[rdx] |
+ adc r9,0 |
+ adc r10,0 |
+ adc r11,0 |
+ adc r12,0 |
+ adc r13,0 |
+ adc r14,0 |
+ adc r15,0 |
+ |
+ |
xor rax,rax |
neg rsi |
@@ -1629,7 +1891,7 @@ $L$8x_no_tail: |
adc r14,QWORD[48+rdi] |
adc r15,QWORD[56+rdi] |
adc rax,0 |
- mov rcx,QWORD[((-16))+rbp] |
+ mov rcx,QWORD[((-8))+rbp] |
xor rsi,rsi |
DB 102,72,15,126,213 |
@@ -1647,40 +1909,58 @@ DB 102,73,15,126,217 |
cmp rdi,rdx |
jb NEAR $L$8x_reduction_loop |
+ DB 0F3h,0C3h ;repret |
+ |
- sub rcx,r15 |
+ALIGN 32 |
+__bn_post4x_internal: |
+ mov r12,QWORD[rbp] |
lea rbx,[r9*1+rdi] |
- adc rsi,rsi |
mov rcx,r9 |
- or rax,rsi |
DB 102,72,15,126,207 |
- xor rax,1 |
+ neg rax |
DB 102,72,15,126,206 |
- lea rbp,[rax*8+rbp] |
sar rcx,3+2 |
- jmp NEAR $L$sqr4x_sub |
+ dec r12 |
+ xor r10,r10 |
+ mov r13,QWORD[8+rbp] |
+ mov r14,QWORD[16+rbp] |
+ mov r15,QWORD[24+rbp] |
+ jmp NEAR $L$sqr4x_sub_entry |
-ALIGN 32 |
+ALIGN 16 |
$L$sqr4x_sub: |
-DB 0x66 |
- mov r12,QWORD[rbx] |
- mov r13,QWORD[8+rbx] |
- sbb r12,QWORD[rbp] |
- mov r14,QWORD[16+rbx] |
- sbb r13,QWORD[16+rbp] |
- mov r15,QWORD[24+rbx] |
- lea rbx,[32+rbx] |
- sbb r14,QWORD[32+rbp] |
+ mov r12,QWORD[rbp] |
+ mov r13,QWORD[8+rbp] |
+ mov r14,QWORD[16+rbp] |
+ mov r15,QWORD[24+rbp] |
+$L$sqr4x_sub_entry: |
+ lea rbp,[32+rbp] |
+ not r12 |
+ not r13 |
+ not r14 |
+ not r15 |
+ and r12,rax |
+ and r13,rax |
+ and r14,rax |
+ and r15,rax |
+ |
+ neg r10 |
+ adc r12,QWORD[rbx] |
+ adc r13,QWORD[8+rbx] |
+ adc r14,QWORD[16+rbx] |
+ adc r15,QWORD[24+rbx] |
mov QWORD[rdi],r12 |
- sbb r15,QWORD[48+rbp] |
- lea rbp,[64+rbp] |
+ lea rbx,[32+rbx] |
mov QWORD[8+rdi],r13 |
+ sbb r10,r10 |
mov QWORD[16+rdi],r14 |
mov QWORD[24+rdi],r15 |
lea rdi,[32+rdi] |
inc rcx |
jnz NEAR $L$sqr4x_sub |
+ |
mov r10,r9 |
neg r9 |
DB 0F3h,0C3h ;repret |
@@ -1718,13 +1998,9 @@ DB 0x67 |
push r13 |
push r14 |
push r15 |
- lea rsp,[((-40))+rsp] |
- movaps XMMWORD[rsp],xmm6 |
- movaps XMMWORD[16+rsp],xmm7 |
-DB 0x67 |
- mov r10d,r9d |
+ |
shl r9d,3 |
- shl r10d,3+2 |
+ lea r10,[r9*2+r9] |
neg r9 |
mov r8,QWORD[r8] |
@@ -1734,19 +2010,20 @@ DB 0x67 |
- lea r11,[((-64))+r9*2+rsp] |
- sub r11,rsi |
+ |
+ lea r11,[((-320))+r9*2+rsp] |
+ sub r11,rdi |
and r11,4095 |
cmp r10,r11 |
jb NEAR $L$from_sp_alt |
sub rsp,r11 |
- lea rsp,[((-64))+r9*2+rsp] |
+ lea rsp,[((-320))+r9*2+rsp] |
jmp NEAR $L$from_sp_done |
ALIGN 32 |
$L$from_sp_alt: |
- lea r10,[((4096-64))+r9*2] |
- lea rsp,[((-64))+r9*2+rsp] |
+ lea r10,[((4096-320))+r9*2] |
+ lea rsp,[((-320))+r9*2+rsp] |
sub r11,r10 |
mov r10,0 |
cmovc r11,r10 |
@@ -1797,7 +2074,8 @@ DB 102,72,15,110,209 |
DB 0x67 |
mov rbp,rcx |
DB 102,73,15,110,218 |
- call sqr8x_reduction |
+ call __bn_sqr8x_reduction |
+ call __bn_post4x_internal |
pxor xmm0,xmm0 |
lea rax,[48+rsp] |
@@ -1847,55 +2125,171 @@ $L$scatter_epilogue: |
global bn_gather5 |
-ALIGN 16 |
+ALIGN 32 |
bn_gather5: |
$L$SEH_begin_bn_gather5: |
-DB 0x48,0x83,0xec,0x28 |
-DB 0x0f,0x29,0x34,0x24 |
-DB 0x0f,0x29,0x7c,0x24,0x10 |
- mov r11d,r9d |
- shr r9d,3 |
- and r11,7 |
- not r9d |
- lea rax,[$L$magic_masks] |
- and r9d,3 |
- lea r8,[128+r11*8+r8] |
- movq xmm4,QWORD[r9*8+rax] |
- movq xmm5,QWORD[8+r9*8+rax] |
- movq xmm6,QWORD[16+r9*8+rax] |
- movq xmm7,QWORD[24+r9*8+rax] |
+DB 0x4c,0x8d,0x14,0x24 |
+DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 |
+ lea rax,[$L$inc] |
+ and rsp,-16 |
+ |
+ movd xmm5,r9d |
+ movdqa xmm0,XMMWORD[rax] |
+ movdqa xmm1,XMMWORD[16+rax] |
+ lea r11,[128+r8] |
+ lea rax,[128+rsp] |
+ |
+ pshufd xmm5,xmm5,0 |
+ movdqa xmm4,xmm1 |
+ movdqa xmm2,xmm1 |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa xmm3,xmm4 |
+ |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[(-128)+rax],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[(-112)+rax],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[(-96)+rax],xmm2 |
+ movdqa xmm2,xmm4 |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[(-80)+rax],xmm3 |
+ movdqa xmm3,xmm4 |
+ |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[(-64)+rax],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[(-48)+rax],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[(-32)+rax],xmm2 |
+ movdqa xmm2,xmm4 |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[(-16)+rax],xmm3 |
+ movdqa xmm3,xmm4 |
+ |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[rax],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[16+rax],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[32+rax],xmm2 |
+ movdqa xmm2,xmm4 |
+ paddd xmm1,xmm0 |
+ pcmpeqd xmm0,xmm5 |
+ movdqa XMMWORD[48+rax],xmm3 |
+ movdqa xmm3,xmm4 |
+ |
+ paddd xmm2,xmm1 |
+ pcmpeqd xmm1,xmm5 |
+ movdqa XMMWORD[64+rax],xmm0 |
+ movdqa xmm0,xmm4 |
+ |
+ paddd xmm3,xmm2 |
+ pcmpeqd xmm2,xmm5 |
+ movdqa XMMWORD[80+rax],xmm1 |
+ movdqa xmm1,xmm4 |
+ |
+ paddd xmm0,xmm3 |
+ pcmpeqd xmm3,xmm5 |
+ movdqa XMMWORD[96+rax],xmm2 |
+ movdqa xmm2,xmm4 |
+ movdqa XMMWORD[112+rax],xmm3 |
jmp NEAR $L$gather |
-ALIGN 16 |
-$L$gather: |
- movq xmm0,QWORD[(((-128)))+r8] |
- movq xmm1,QWORD[((-64))+r8] |
- pand xmm0,xmm4 |
- movq xmm2,QWORD[r8] |
- pand xmm1,xmm5 |
- movq xmm3,QWORD[64+r8] |
- pand xmm2,xmm6 |
- por xmm0,xmm1 |
- pand xmm3,xmm7 |
-DB 0x67,0x67 |
- por xmm0,xmm2 |
- lea r8,[256+r8] |
- por xmm0,xmm3 |
+ALIGN 32 |
+$L$gather: |
+ pxor xmm4,xmm4 |
+ pxor xmm5,xmm5 |
+ movdqa xmm0,XMMWORD[((-128))+r11] |
+ movdqa xmm1,XMMWORD[((-112))+r11] |
+ movdqa xmm2,XMMWORD[((-96))+r11] |
+ pand xmm0,XMMWORD[((-128))+rax] |
+ movdqa xmm3,XMMWORD[((-80))+r11] |
+ pand xmm1,XMMWORD[((-112))+rax] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[((-96))+rax] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[((-80))+rax] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[((-64))+r11] |
+ movdqa xmm1,XMMWORD[((-48))+r11] |
+ movdqa xmm2,XMMWORD[((-32))+r11] |
+ pand xmm0,XMMWORD[((-64))+rax] |
+ movdqa xmm3,XMMWORD[((-16))+r11] |
+ pand xmm1,XMMWORD[((-48))+rax] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[((-32))+rax] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[((-16))+rax] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[r11] |
+ movdqa xmm1,XMMWORD[16+r11] |
+ movdqa xmm2,XMMWORD[32+r11] |
+ pand xmm0,XMMWORD[rax] |
+ movdqa xmm3,XMMWORD[48+r11] |
+ pand xmm1,XMMWORD[16+rax] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[32+rax] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[48+rax] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ movdqa xmm0,XMMWORD[64+r11] |
+ movdqa xmm1,XMMWORD[80+r11] |
+ movdqa xmm2,XMMWORD[96+r11] |
+ pand xmm0,XMMWORD[64+rax] |
+ movdqa xmm3,XMMWORD[112+r11] |
+ pand xmm1,XMMWORD[80+rax] |
+ por xmm4,xmm0 |
+ pand xmm2,XMMWORD[96+rax] |
+ por xmm5,xmm1 |
+ pand xmm3,XMMWORD[112+rax] |
+ por xmm4,xmm2 |
+ por xmm5,xmm3 |
+ por xmm4,xmm5 |
+ lea r11,[256+r11] |
+ pshufd xmm0,xmm4,0x4e |
+ por xmm0,xmm4 |
movq QWORD[rcx],xmm0 |
lea rcx,[8+rcx] |
sub edx,1 |
jnz NEAR $L$gather |
- movaps xmm6,XMMWORD[rsp] |
- movaps xmm7,XMMWORD[16+rsp] |
- lea rsp,[40+rsp] |
+ |
+ lea rsp,[r10] |
DB 0F3h,0C3h ;repret |
$L$SEH_end_bn_gather5: |
ALIGN 64 |
-$L$magic_masks: |
- DD 0,0,0,0,0,0,-1,-1 |
- DD 0,0,0,0,0,0,0,0 |
+$L$inc: |
+ DD 0,0,1,1 |
+ DD 2,2,2,2 |
DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 |
DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 |
DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 |
@@ -1937,19 +2331,16 @@ mul_handler: |
lea r10,[$L$mul_epilogue] |
cmp rbx,r10 |
- jb NEAR $L$body_40 |
+ ja NEAR $L$body_40 |
mov r10,QWORD[192+r8] |
mov rax,QWORD[8+r10*8+rax] |
+ |
jmp NEAR $L$body_proceed |
$L$body_40: |
mov rax,QWORD[40+rax] |
$L$body_proceed: |
- |
- movaps xmm0,XMMWORD[((-88))+rax] |
- movaps xmm1,XMMWORD[((-72))+rax] |
- |
mov rbx,QWORD[((-8))+rax] |
mov rbp,QWORD[((-16))+rax] |
mov r12,QWORD[((-24))+rax] |
@@ -1962,8 +2353,6 @@ $L$body_proceed: |
mov QWORD[224+r8],r13 |
mov QWORD[232+r8],r14 |
mov QWORD[240+r8],r15 |
- movups XMMWORD[512+r8],xmm0 |
- movups XMMWORD[528+r8],xmm1 |
$L$common_seh_tail: |
mov rdi,QWORD[8+rax] |
@@ -2049,8 +2438,7 @@ DB 9,0,0,0 |
DD $L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase |
ALIGN 8 |
$L$SEH_info_bn_gather5: |
-DB 0x01,0x0d,0x05,0x00 |
-DB 0x0d,0x78,0x01,0x00 |
-DB 0x08,0x68,0x00,0x00 |
-DB 0x04,0x42,0x00,0x00 |
+DB 0x01,0x0b,0x03,0x0a |
+DB 0x0b,0x01,0x21,0x00 |
+DB 0x04,0xa3,0x00,0x00 |
ALIGN 8 |