Index: third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S |
diff --git a/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S b/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S |
index 02edc69b369c9d71c88dce6241980b4f14d29e43..554df1ffac1cf0806561810285c16cb5229dd364 100644 |
--- a/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S |
+++ b/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S |
@@ -17,46 +17,151 @@ bn_mul_mont_gather5: |
.Lmul_enter: |
movl %r9d,%r9d |
movq %rsp,%rax |
- movl 8(%rsp),%r10d |
+ movd 8(%rsp),%xmm5 |
+ leaq .Linc(%rip),%r10 |
pushq %rbx |
pushq %rbp |
pushq %r12 |
pushq %r13 |
pushq %r14 |
pushq %r15 |
+ |
leaq 2(%r9),%r11 |
negq %r11 |
- leaq (%rsp,%r11,8),%rsp |
+ leaq -264(%rsp,%r11,8),%rsp |
andq $-1024,%rsp |
movq %rax,8(%rsp,%r9,8) |
.Lmul_body: |
- movq %rdx,%r12 |
- movq %r10,%r11 |
- shrq $3,%r10 |
- andq $7,%r11 |
- notq %r10 |
- leaq .Lmagic_masks(%rip),%rax |
- andq $3,%r10 |
- leaq 96(%r12,%r11,8),%r12 |
- movq 0(%rax,%r10,8),%xmm4 |
- movq 8(%rax,%r10,8),%xmm5 |
- movq 16(%rax,%r10,8),%xmm6 |
- movq 24(%rax,%r10,8),%xmm7 |
- |
- movq -96(%r12),%xmm0 |
- movq -32(%r12),%xmm1 |
- pand %xmm4,%xmm0 |
- movq 32(%r12),%xmm2 |
- pand %xmm5,%xmm1 |
- movq 96(%r12),%xmm3 |
- pand %xmm6,%xmm2 |
- por %xmm1,%xmm0 |
- pand %xmm7,%xmm3 |
+ leaq 128(%rdx),%r12 |
+ movdqa 0(%r10),%xmm0 |
+ movdqa 16(%r10),%xmm1 |
+ leaq 24-112(%rsp,%r9,8),%r10 |
+ andq $-16,%r10 |
+ |
+ pshufd $0,%xmm5,%xmm5 |
+ movdqa %xmm1,%xmm4 |
+ movdqa %xmm1,%xmm2 |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+.byte 0x67 |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,112(%r10) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,128(%r10) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,144(%r10) |
+ movdqa %xmm4,%xmm2 |
+ |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,160(%r10) |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,176(%r10) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,192(%r10) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,208(%r10) |
+ movdqa %xmm4,%xmm2 |
+ |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,224(%r10) |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,240(%r10) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,256(%r10) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,272(%r10) |
+ movdqa %xmm4,%xmm2 |
+ |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,288(%r10) |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,304(%r10) |
+ |
+ paddd %xmm2,%xmm3 |
+.byte 0x67 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,320(%r10) |
+ |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,336(%r10) |
+ pand 64(%r12),%xmm0 |
+ |
+ pand 80(%r12),%xmm1 |
+ pand 96(%r12),%xmm2 |
+ movdqa %xmm3,352(%r10) |
+ pand 112(%r12),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ movdqa -128(%r12),%xmm4 |
+ movdqa -112(%r12),%xmm5 |
+ movdqa -96(%r12),%xmm2 |
+ pand 112(%r10),%xmm4 |
+ movdqa -80(%r12),%xmm3 |
+ pand 128(%r10),%xmm5 |
+ por %xmm4,%xmm0 |
+ pand 144(%r10),%xmm2 |
+ por %xmm5,%xmm1 |
+ pand 160(%r10),%xmm3 |
por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ movdqa -64(%r12),%xmm4 |
+ movdqa -48(%r12),%xmm5 |
+ movdqa -32(%r12),%xmm2 |
+ pand 176(%r10),%xmm4 |
+ movdqa -16(%r12),%xmm3 |
+ pand 192(%r10),%xmm5 |
+ por %xmm4,%xmm0 |
+ pand 208(%r10),%xmm2 |
+ por %xmm5,%xmm1 |
+ pand 224(%r10),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ movdqa 0(%r12),%xmm4 |
+ movdqa 16(%r12),%xmm5 |
+ movdqa 32(%r12),%xmm2 |
+ pand 240(%r10),%xmm4 |
+ movdqa 48(%r12),%xmm3 |
+ pand 256(%r10),%xmm5 |
+ por %xmm4,%xmm0 |
+ pand 272(%r10),%xmm2 |
+ por %xmm5,%xmm1 |
+ pand 288(%r10),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ por %xmm1,%xmm0 |
+ pshufd $0x4e,%xmm0,%xmm1 |
+ por %xmm1,%xmm0 |
leaq 256(%r12),%r12 |
- por %xmm3,%xmm0 |
- |
.byte 102,72,15,126,195 |
movq (%r8),%r8 |
@@ -65,29 +170,14 @@ bn_mul_mont_gather5: |
xorq %r14,%r14 |
xorq %r15,%r15 |
- movq -96(%r12),%xmm0 |
- movq -32(%r12),%xmm1 |
- pand %xmm4,%xmm0 |
- movq 32(%r12),%xmm2 |
- pand %xmm5,%xmm1 |
- |
movq %r8,%rbp |
mulq %rbx |
movq %rax,%r10 |
movq (%rcx),%rax |
- movq 96(%r12),%xmm3 |
- pand %xmm6,%xmm2 |
- por %xmm1,%xmm0 |
- pand %xmm7,%xmm3 |
- |
imulq %r10,%rbp |
movq %rdx,%r11 |
- por %xmm2,%xmm0 |
- leaq 256(%r12),%r12 |
- por %xmm3,%xmm0 |
- |
mulq %rbp |
addq %rax,%r10 |
movq 8(%rsi),%rax |
@@ -120,14 +210,12 @@ bn_mul_mont_gather5: |
cmpq %r9,%r15 |
jne .L1st |
-.byte 102,72,15,126,195 |
addq %rax,%r13 |
- movq (%rsi),%rax |
adcq $0,%rdx |
addq %r11,%r13 |
adcq $0,%rdx |
- movq %r13,-16(%rsp,%r15,8) |
+ movq %r13,-16(%rsp,%r9,8) |
movq %rdx,%r13 |
movq %r10,%r11 |
@@ -141,33 +229,78 @@ bn_mul_mont_gather5: |
jmp .Louter |
.align 16 |
.Louter: |
+ leaq 24+128(%rsp,%r9,8),%rdx |
+ andq $-16,%rdx |
+ pxor %xmm4,%xmm4 |
+ pxor %xmm5,%xmm5 |
+ movdqa -128(%r12),%xmm0 |
+ movdqa -112(%r12),%xmm1 |
+ movdqa -96(%r12),%xmm2 |
+ movdqa -80(%r12),%xmm3 |
+ pand -128(%rdx),%xmm0 |
+ pand -112(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand -96(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand -80(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa -64(%r12),%xmm0 |
+ movdqa -48(%r12),%xmm1 |
+ movdqa -32(%r12),%xmm2 |
+ movdqa -16(%r12),%xmm3 |
+ pand -64(%rdx),%xmm0 |
+ pand -48(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand -32(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand -16(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa 0(%r12),%xmm0 |
+ movdqa 16(%r12),%xmm1 |
+ movdqa 32(%r12),%xmm2 |
+ movdqa 48(%r12),%xmm3 |
+ pand 0(%rdx),%xmm0 |
+ pand 16(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand 32(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand 48(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa 64(%r12),%xmm0 |
+ movdqa 80(%r12),%xmm1 |
+ movdqa 96(%r12),%xmm2 |
+ movdqa 112(%r12),%xmm3 |
+ pand 64(%rdx),%xmm0 |
+ pand 80(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand 96(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand 112(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ por %xmm5,%xmm4 |
+ pshufd $0x4e,%xmm4,%xmm0 |
+ por %xmm4,%xmm0 |
+ leaq 256(%r12),%r12 |
+ |
+ movq (%rsi),%rax |
+.byte 102,72,15,126,195 |
+ |
xorq %r15,%r15 |
movq %r8,%rbp |
movq (%rsp),%r10 |
- movq -96(%r12),%xmm0 |
- movq -32(%r12),%xmm1 |
- pand %xmm4,%xmm0 |
- movq 32(%r12),%xmm2 |
- pand %xmm5,%xmm1 |
- |
mulq %rbx |
addq %rax,%r10 |
movq (%rcx),%rax |
adcq $0,%rdx |
- movq 96(%r12),%xmm3 |
- pand %xmm6,%xmm2 |
- por %xmm1,%xmm0 |
- pand %xmm7,%xmm3 |
- |
imulq %r10,%rbp |
movq %rdx,%r11 |
- por %xmm2,%xmm0 |
- leaq 256(%r12),%r12 |
- por %xmm3,%xmm0 |
- |
mulq %rbp |
addq %rax,%r10 |
movq 8(%rsi),%rax |
@@ -203,15 +336,12 @@ bn_mul_mont_gather5: |
cmpq %r9,%r15 |
jne .Linner |
-.byte 102,72,15,126,195 |
- |
addq %rax,%r13 |
- movq (%rsi),%rax |
adcq $0,%rdx |
addq %r10,%r13 |
- movq (%rsp,%r15,8),%r10 |
+ movq (%rsp,%r9,8),%r10 |
adcq $0,%rdx |
- movq %r13,-16(%rsp,%r15,8) |
+ movq %r13,-16(%rsp,%r9,8) |
movq %rdx,%r13 |
xorq %rdx,%rdx |
@@ -257,6 +387,7 @@ bn_mul_mont_gather5: |
movq 8(%rsp,%r9,8),%rsi |
movq $1,%rax |
+ |
movq -48(%rsi),%r15 |
movq -40(%rsi),%r14 |
movq -32(%rsi),%r13 |
@@ -279,10 +410,10 @@ bn_mul4x_mont_gather5: |
pushq %r13 |
pushq %r14 |
pushq %r15 |
+ |
.byte 0x67 |
- movl %r9d,%r10d |
shll $3,%r9d |
- shll $3+2,%r10d |
+ leaq (%r9,%r9,2),%r10 |
negq %r9 |
@@ -292,19 +423,21 @@ bn_mul4x_mont_gather5: |
- leaq -64(%rsp,%r9,2),%r11 |
- subq %rsi,%r11 |
+ |
+ |
+ leaq -320(%rsp,%r9,2),%r11 |
+ subq %rdi,%r11 |
andq $4095,%r11 |
cmpq %r11,%r10 |
jb .Lmul4xsp_alt |
subq %r11,%rsp |
- leaq -64(%rsp,%r9,2),%rsp |
+ leaq -320(%rsp,%r9,2),%rsp |
jmp .Lmul4xsp_done |
.align 32 |
.Lmul4xsp_alt: |
- leaq 4096-64(,%r9,2),%r10 |
- leaq -64(%rsp,%r9,2),%rsp |
+ leaq 4096-320(,%r9,2),%r10 |
+ leaq -320(%rsp,%r9,2),%rsp |
subq %r10,%r11 |
movq $0,%r10 |
cmovcq %r10,%r11 |
@@ -320,6 +453,7 @@ bn_mul4x_mont_gather5: |
movq 40(%rsp),%rsi |
movq $1,%rax |
+ |
movq -48(%rsi),%r15 |
movq -40(%rsi),%r14 |
movq -32(%rsi),%r13 |
@@ -335,47 +469,141 @@ bn_mul4x_mont_gather5: |
.align 32 |
mul4x_internal: |
shlq $5,%r9 |
- movl 8(%rax),%r10d |
- leaq 256(%rdx,%r9,1),%r13 |
+ movd 8(%rax),%xmm5 |
+ leaq .Linc(%rip),%rax |
+ leaq 128(%rdx,%r9,1),%r13 |
shrq $5,%r9 |
- movq %r10,%r11 |
- shrq $3,%r10 |
- andq $7,%r11 |
- notq %r10 |
- leaq .Lmagic_masks(%rip),%rax |
- andq $3,%r10 |
- leaq 96(%rdx,%r11,8),%r12 |
- movq 0(%rax,%r10,8),%xmm4 |
- movq 8(%rax,%r10,8),%xmm5 |
- addq $7,%r11 |
- movq 16(%rax,%r10,8),%xmm6 |
- movq 24(%rax,%r10,8),%xmm7 |
- andq $7,%r11 |
- |
- movq -96(%r12),%xmm0 |
- leaq 256(%r12),%r14 |
- movq -32(%r12),%xmm1 |
- pand %xmm4,%xmm0 |
- movq 32(%r12),%xmm2 |
- pand %xmm5,%xmm1 |
- movq 96(%r12),%xmm3 |
- pand %xmm6,%xmm2 |
-.byte 0x67 |
- por %xmm1,%xmm0 |
- movq -96(%r14),%xmm1 |
-.byte 0x67 |
- pand %xmm7,%xmm3 |
-.byte 0x67 |
- por %xmm2,%xmm0 |
- movq -32(%r14),%xmm2 |
+ movdqa 0(%rax),%xmm0 |
+ movdqa 16(%rax),%xmm1 |
+ leaq 88-112(%rsp,%r9,1),%r10 |
+ leaq 128(%rdx),%r12 |
+ |
+ pshufd $0,%xmm5,%xmm5 |
+ movdqa %xmm1,%xmm4 |
+.byte 0x67,0x67 |
+ movdqa %xmm1,%xmm2 |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
.byte 0x67 |
- pand %xmm4,%xmm1 |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,112(%r10) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,128(%r10) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,144(%r10) |
+ movdqa %xmm4,%xmm2 |
+ |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,160(%r10) |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,176(%r10) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,192(%r10) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,208(%r10) |
+ movdqa %xmm4,%xmm2 |
+ |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,224(%r10) |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,240(%r10) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,256(%r10) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,272(%r10) |
+ movdqa %xmm4,%xmm2 |
+ |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,288(%r10) |
+ movdqa %xmm4,%xmm3 |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,304(%r10) |
+ |
+ paddd %xmm2,%xmm3 |
.byte 0x67 |
- por %xmm3,%xmm0 |
- movq 32(%r14),%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,320(%r10) |
+ |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,336(%r10) |
+ pand 64(%r12),%xmm0 |
+ pand 80(%r12),%xmm1 |
+ pand 96(%r12),%xmm2 |
+ movdqa %xmm3,352(%r10) |
+ pand 112(%r12),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ movdqa -128(%r12),%xmm4 |
+ movdqa -112(%r12),%xmm5 |
+ movdqa -96(%r12),%xmm2 |
+ pand 112(%r10),%xmm4 |
+ movdqa -80(%r12),%xmm3 |
+ pand 128(%r10),%xmm5 |
+ por %xmm4,%xmm0 |
+ pand 144(%r10),%xmm2 |
+ por %xmm5,%xmm1 |
+ pand 160(%r10),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ movdqa -64(%r12),%xmm4 |
+ movdqa -48(%r12),%xmm5 |
+ movdqa -32(%r12),%xmm2 |
+ pand 176(%r10),%xmm4 |
+ movdqa -16(%r12),%xmm3 |
+ pand 192(%r10),%xmm5 |
+ por %xmm4,%xmm0 |
+ pand 208(%r10),%xmm2 |
+ por %xmm5,%xmm1 |
+ pand 224(%r10),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ movdqa 0(%r12),%xmm4 |
+ movdqa 16(%r12),%xmm5 |
+ movdqa 32(%r12),%xmm2 |
+ pand 240(%r10),%xmm4 |
+ movdqa 48(%r12),%xmm3 |
+ pand 256(%r10),%xmm5 |
+ por %xmm4,%xmm0 |
+ pand 272(%r10),%xmm2 |
+ por %xmm5,%xmm1 |
+ pand 288(%r10),%xmm3 |
+ por %xmm2,%xmm0 |
+ por %xmm3,%xmm1 |
+ por %xmm1,%xmm0 |
+ pshufd $0x4e,%xmm0,%xmm1 |
+ por %xmm1,%xmm0 |
+ leaq 256(%r12),%r12 |
.byte 102,72,15,126,195 |
- movq 96(%r14),%xmm0 |
+ |
movq %r13,16+8(%rsp) |
movq %rdi,56+8(%rsp) |
@@ -389,26 +617,10 @@ mul4x_internal: |
movq %rax,%r10 |
movq (%rcx),%rax |
- pand %xmm5,%xmm2 |
- pand %xmm6,%xmm3 |
- por %xmm2,%xmm1 |
- |
imulq %r10,%rbp |
- |
- |
- |
- |
- |
- |
- |
- leaq 64+8(%rsp,%r11,8),%r14 |
+ leaq 64+8(%rsp),%r14 |
movq %rdx,%r11 |
- pand %xmm7,%xmm0 |
- por %xmm3,%xmm1 |
- leaq 512(%r12),%r12 |
- por %xmm1,%xmm0 |
- |
mulq %rbp |
addq %rax,%r10 |
movq 8(%rsi,%r9,1),%rax |
@@ -417,7 +629,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq 16(%rcx),%rax |
+ movq 8(%rcx),%rax |
adcq $0,%rdx |
movq %rdx,%r10 |
@@ -427,7 +639,7 @@ mul4x_internal: |
adcq $0,%rdx |
addq %r11,%rdi |
leaq 32(%r9),%r15 |
- leaq 64(%rcx),%rcx |
+ leaq 32(%rcx),%rcx |
adcq $0,%rdx |
movq %rdi,(%r14) |
movq %rdx,%r13 |
@@ -437,7 +649,7 @@ mul4x_internal: |
.L1st4x: |
mulq %rbx |
addq %rax,%r10 |
- movq -32(%rcx),%rax |
+ movq -16(%rcx),%rax |
leaq 32(%r14),%r14 |
adcq $0,%rdx |
movq %rdx,%r11 |
@@ -453,7 +665,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq -16(%rcx),%rax |
+ movq -8(%rcx),%rax |
adcq $0,%rdx |
movq %rdx,%r10 |
@@ -483,7 +695,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq 16(%rcx),%rax |
+ movq 8(%rcx),%rax |
adcq $0,%rdx |
movq %rdx,%r10 |
@@ -492,7 +704,7 @@ mul4x_internal: |
movq 16(%rsi,%r15,1),%rax |
adcq $0,%rdx |
addq %r11,%rdi |
- leaq 64(%rcx),%rcx |
+ leaq 32(%rcx),%rcx |
adcq $0,%rdx |
movq %rdi,(%r14) |
movq %rdx,%r13 |
@@ -502,7 +714,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r10 |
- movq -32(%rcx),%rax |
+ movq -16(%rcx),%rax |
leaq 32(%r14),%r14 |
adcq $0,%rdx |
movq %rdx,%r11 |
@@ -518,7 +730,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq -16(%rcx),%rax |
+ movq -8(%rcx),%rax |
adcq $0,%rdx |
movq %rdx,%r10 |
@@ -531,8 +743,7 @@ mul4x_internal: |
movq %rdi,-16(%r14) |
movq %rdx,%r13 |
-.byte 102,72,15,126,195 |
- leaq (%rcx,%r9,2),%rcx |
+ leaq (%rcx,%r9,1),%rcx |
xorq %rdi,%rdi |
addq %r10,%r13 |
@@ -543,6 +754,63 @@ mul4x_internal: |
.align 32 |
.Louter4x: |
+ leaq 16+128(%r14),%rdx |
+ pxor %xmm4,%xmm4 |
+ pxor %xmm5,%xmm5 |
+ movdqa -128(%r12),%xmm0 |
+ movdqa -112(%r12),%xmm1 |
+ movdqa -96(%r12),%xmm2 |
+ movdqa -80(%r12),%xmm3 |
+ pand -128(%rdx),%xmm0 |
+ pand -112(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand -96(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand -80(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa -64(%r12),%xmm0 |
+ movdqa -48(%r12),%xmm1 |
+ movdqa -32(%r12),%xmm2 |
+ movdqa -16(%r12),%xmm3 |
+ pand -64(%rdx),%xmm0 |
+ pand -48(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand -32(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand -16(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa 0(%r12),%xmm0 |
+ movdqa 16(%r12),%xmm1 |
+ movdqa 32(%r12),%xmm2 |
+ movdqa 48(%r12),%xmm3 |
+ pand 0(%rdx),%xmm0 |
+ pand 16(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand 32(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand 48(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa 64(%r12),%xmm0 |
+ movdqa 80(%r12),%xmm1 |
+ movdqa 96(%r12),%xmm2 |
+ movdqa 112(%r12),%xmm3 |
+ pand 64(%rdx),%xmm0 |
+ pand 80(%rdx),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand 96(%rdx),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand 112(%rdx),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ por %xmm5,%xmm4 |
+ pshufd $0x4e,%xmm4,%xmm0 |
+ por %xmm4,%xmm0 |
+ leaq 256(%r12),%r12 |
+.byte 102,72,15,126,195 |
+ |
movq (%r14,%r9,1),%r10 |
movq %r8,%rbp |
mulq %rbx |
@@ -550,25 +818,11 @@ mul4x_internal: |
movq (%rcx),%rax |
adcq $0,%rdx |
- movq -96(%r12),%xmm0 |
- movq -32(%r12),%xmm1 |
- pand %xmm4,%xmm0 |
- movq 32(%r12),%xmm2 |
- pand %xmm5,%xmm1 |
- movq 96(%r12),%xmm3 |
- |
imulq %r10,%rbp |
-.byte 0x67 |
movq %rdx,%r11 |
movq %rdi,(%r14) |
- pand %xmm6,%xmm2 |
- por %xmm1,%xmm0 |
- pand %xmm7,%xmm3 |
- por %xmm2,%xmm0 |
leaq (%r14,%r9,1),%r14 |
- leaq 256(%r12),%r12 |
- por %xmm3,%xmm0 |
mulq %rbp |
addq %rax,%r10 |
@@ -578,7 +832,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq 16(%rcx),%rax |
+ movq 8(%rcx),%rax |
adcq $0,%rdx |
addq 8(%r14),%r11 |
adcq $0,%rdx |
@@ -590,7 +844,7 @@ mul4x_internal: |
adcq $0,%rdx |
addq %r11,%rdi |
leaq 32(%r9),%r15 |
- leaq 64(%rcx),%rcx |
+ leaq 32(%rcx),%rcx |
adcq $0,%rdx |
movq %rdx,%r13 |
jmp .Linner4x |
@@ -599,7 +853,7 @@ mul4x_internal: |
.Linner4x: |
mulq %rbx |
addq %rax,%r10 |
- movq -32(%rcx),%rax |
+ movq -16(%rcx),%rax |
adcq $0,%rdx |
addq 16(%r14),%r10 |
leaq 32(%r14),%r14 |
@@ -617,7 +871,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq -16(%rcx),%rax |
+ movq -8(%rcx),%rax |
adcq $0,%rdx |
addq -8(%r14),%r11 |
adcq $0,%rdx |
@@ -651,7 +905,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
- movq 16(%rcx),%rax |
+ movq 8(%rcx),%rax |
adcq $0,%rdx |
addq 8(%r14),%r11 |
adcq $0,%rdx |
@@ -662,7 +916,7 @@ mul4x_internal: |
movq 16(%rsi,%r15,1),%rax |
adcq $0,%rdx |
addq %r11,%rdi |
- leaq 64(%rcx),%rcx |
+ leaq 32(%rcx),%rcx |
adcq $0,%rdx |
movq %r13,-8(%r14) |
movq %rdx,%r13 |
@@ -672,7 +926,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r10 |
- movq -32(%rcx),%rax |
+ movq -16(%rcx),%rax |
adcq $0,%rdx |
addq 16(%r14),%r10 |
leaq 32(%r14),%r14 |
@@ -691,7 +945,7 @@ mul4x_internal: |
mulq %rbx |
addq %rax,%r11 |
movq %rbp,%rax |
- movq -16(%rcx),%rbp |
+ movq -8(%rcx),%rbp |
adcq $0,%rdx |
addq -8(%r14),%r11 |
adcq $0,%rdx |
@@ -706,9 +960,8 @@ mul4x_internal: |
movq %r13,-24(%r14) |
movq %rdx,%r13 |
-.byte 102,72,15,126,195 |
movq %rdi,-16(%r14) |
- leaq (%rcx,%r9,2),%rcx |
+ leaq (%rcx,%r9,1),%rcx |
xorq %rdi,%rdi |
addq %r10,%r13 |
@@ -719,16 +972,23 @@ mul4x_internal: |
cmpq 16+8(%rsp),%r12 |
jb .Louter4x |
+ xorq %rax,%rax |
subq %r13,%rbp |
adcq %r15,%r15 |
orq %r15,%rdi |
- xorq $1,%rdi |
+ subq %rdi,%rax |
leaq (%r14,%r9,1),%rbx |
- leaq (%rcx,%rdi,8),%rbp |
+ movq (%rcx),%r12 |
+ leaq (%rcx),%rbp |
movq %r9,%rcx |
sarq $3+2,%rcx |
movq 56+8(%rsp),%rdi |
- jmp .Lsqr4x_sub |
+ decq %r12 |
+ xorq %r10,%r10 |
+ movq 8(%rbp),%r13 |
+ movq 16(%rbp),%r14 |
+ movq 24(%rbp),%r15 |
+ jmp .Lsqr4x_sub_entry |
.size mul4x_internal,.-mul4x_internal |
.globl bn_power5 |
.hidden bn_power5 |
@@ -742,9 +1002,9 @@ bn_power5: |
pushq %r13 |
pushq %r14 |
pushq %r15 |
- movl %r9d,%r10d |
+ |
shll $3,%r9d |
- shll $3+2,%r10d |
+ leal (%r9,%r9,2),%r10d |
negq %r9 |
movq (%r8),%r8 |
@@ -754,19 +1014,20 @@ bn_power5: |
- leaq -64(%rsp,%r9,2),%r11 |
- subq %rsi,%r11 |
+ |
+ leaq -320(%rsp,%r9,2),%r11 |
+ subq %rdi,%r11 |
andq $4095,%r11 |
cmpq %r11,%r10 |
jb .Lpwr_sp_alt |
subq %r11,%rsp |
- leaq -64(%rsp,%r9,2),%rsp |
+ leaq -320(%rsp,%r9,2),%rsp |
jmp .Lpwr_sp_done |
.align 32 |
.Lpwr_sp_alt: |
- leaq 4096-64(,%r9,2),%r10 |
- leaq -64(%rsp,%r9,2),%rsp |
+ leaq 4096-320(,%r9,2),%r10 |
+ leaq -320(%rsp,%r9,2),%rsp |
subq %r10,%r11 |
movq $0,%r10 |
cmovcq %r10,%r11 |
@@ -794,10 +1055,15 @@ bn_power5: |
.byte 102,72,15,110,226 |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
call __bn_sqr8x_internal |
+ call __bn_post4x_internal |
.byte 102,72,15,126,209 |
.byte 102,72,15,126,226 |
@@ -1342,9 +1608,9 @@ __bn_sqr8x_internal: |
movq %rbx,-16(%rdi) |
movq %r8,-8(%rdi) |
.byte 102,72,15,126,213 |
-sqr8x_reduction: |
+__bn_sqr8x_reduction: |
xorq %rax,%rax |
- leaq (%rbp,%r9,2),%rcx |
+ leaq (%r9,%rbp,1),%rcx |
leaq 48+8(%rsp,%r9,2),%rdx |
movq %rcx,0+8(%rsp) |
leaq 48+8(%rsp,%r9,1),%rdi |
@@ -1377,14 +1643,14 @@ sqr8x_reduction: |
.align 32 |
.L8x_reduce: |
mulq %rbx |
- movq 16(%rbp),%rax |
+ movq 8(%rbp),%rax |
negq %r8 |
movq %rdx,%r8 |
adcq $0,%r8 |
mulq %rbx |
addq %rax,%r9 |
- movq 32(%rbp),%rax |
+ movq 16(%rbp),%rax |
adcq $0,%rdx |
addq %r9,%r8 |
movq %rbx,48-8+8(%rsp,%rcx,8) |
@@ -1393,7 +1659,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r10 |
- movq 48(%rbp),%rax |
+ movq 24(%rbp),%rax |
adcq $0,%rdx |
addq %r10,%r9 |
movq 32+8(%rsp),%rsi |
@@ -1402,7 +1668,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r11 |
- movq 64(%rbp),%rax |
+ movq 32(%rbp),%rax |
adcq $0,%rdx |
imulq %r8,%rsi |
addq %r11,%r10 |
@@ -1411,7 +1677,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r12 |
- movq 80(%rbp),%rax |
+ movq 40(%rbp),%rax |
adcq $0,%rdx |
addq %r12,%r11 |
movq %rdx,%r12 |
@@ -1419,7 +1685,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r13 |
- movq 96(%rbp),%rax |
+ movq 48(%rbp),%rax |
adcq $0,%rdx |
addq %r13,%r12 |
movq %rdx,%r13 |
@@ -1427,7 +1693,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r14 |
- movq 112(%rbp),%rax |
+ movq 56(%rbp),%rax |
adcq $0,%rdx |
addq %r14,%r13 |
movq %rdx,%r14 |
@@ -1445,7 +1711,7 @@ sqr8x_reduction: |
decl %ecx |
jnz .L8x_reduce |
- leaq 128(%rbp),%rbp |
+ leaq 64(%rbp),%rbp |
xorq %rax,%rax |
movq 8+8(%rsp),%rdx |
cmpq 0+8(%rsp),%rbp |
@@ -1471,14 +1737,14 @@ sqr8x_reduction: |
.L8x_tail: |
mulq %rbx |
addq %rax,%r8 |
- movq 16(%rbp),%rax |
+ movq 8(%rbp),%rax |
movq %r8,(%rdi) |
movq %rdx,%r8 |
adcq $0,%r8 |
mulq %rbx |
addq %rax,%r9 |
- movq 32(%rbp),%rax |
+ movq 16(%rbp),%rax |
adcq $0,%rdx |
addq %r9,%r8 |
leaq 8(%rdi),%rdi |
@@ -1487,7 +1753,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r10 |
- movq 48(%rbp),%rax |
+ movq 24(%rbp),%rax |
adcq $0,%rdx |
addq %r10,%r9 |
movq %rdx,%r10 |
@@ -1495,7 +1761,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r11 |
- movq 64(%rbp),%rax |
+ movq 32(%rbp),%rax |
adcq $0,%rdx |
addq %r11,%r10 |
movq %rdx,%r11 |
@@ -1503,7 +1769,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r12 |
- movq 80(%rbp),%rax |
+ movq 40(%rbp),%rax |
adcq $0,%rdx |
addq %r12,%r11 |
movq %rdx,%r12 |
@@ -1511,7 +1777,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r13 |
- movq 96(%rbp),%rax |
+ movq 48(%rbp),%rax |
adcq $0,%rdx |
addq %r13,%r12 |
movq %rdx,%r13 |
@@ -1519,7 +1785,7 @@ sqr8x_reduction: |
mulq %rbx |
addq %rax,%r14 |
- movq 112(%rbp),%rax |
+ movq 56(%rbp),%rax |
adcq $0,%rdx |
addq %r14,%r13 |
movq %rdx,%r14 |
@@ -1537,7 +1803,7 @@ sqr8x_reduction: |
decl %ecx |
jnz .L8x_tail |
- leaq 128(%rbp),%rbp |
+ leaq 64(%rbp),%rbp |
movq 8+8(%rsp),%rdx |
cmpq 0+8(%rsp),%rbp |
jae .L8x_tail_done |
@@ -1561,6 +1827,15 @@ sqr8x_reduction: |
.align 32 |
.L8x_tail_done: |
addq (%rdx),%r8 |
+ adcq $0,%r9 |
+ adcq $0,%r10 |
+ adcq $0,%r11 |
+ adcq $0,%r12 |
+ adcq $0,%r13 |
+ adcq $0,%r14 |
+ adcq $0,%r15 |
+ |
+ |
xorq %rax,%rax |
negq %rsi |
@@ -1574,7 +1849,7 @@ sqr8x_reduction: |
adcq 48(%rdi),%r14 |
adcq 56(%rdi),%r15 |
adcq $0,%rax |
- movq -16(%rbp),%rcx |
+ movq -8(%rbp),%rcx |
xorq %rsi,%rsi |
.byte 102,72,15,126,213 |
@@ -1592,44 +1867,62 @@ sqr8x_reduction: |
cmpq %rdx,%rdi |
jb .L8x_reduction_loop |
- |
- subq %r15,%rcx |
+ .byte 0xf3,0xc3 |
+.size bn_sqr8x_internal,.-bn_sqr8x_internal |
+.type __bn_post4x_internal,@function |
+.align 32 |
+__bn_post4x_internal: |
+ movq 0(%rbp),%r12 |
leaq (%rdi,%r9,1),%rbx |
- adcq %rsi,%rsi |
movq %r9,%rcx |
- orq %rsi,%rax |
.byte 102,72,15,126,207 |
- xorq $1,%rax |
+ negq %rax |
.byte 102,72,15,126,206 |
- leaq (%rbp,%rax,8),%rbp |
sarq $3+2,%rcx |
- jmp .Lsqr4x_sub |
+ decq %r12 |
+ xorq %r10,%r10 |
+ movq 8(%rbp),%r13 |
+ movq 16(%rbp),%r14 |
+ movq 24(%rbp),%r15 |
+ jmp .Lsqr4x_sub_entry |
-.align 32 |
+.align 16 |
.Lsqr4x_sub: |
-.byte 0x66 |
- movq 0(%rbx),%r12 |
- movq 8(%rbx),%r13 |
- sbbq 0(%rbp),%r12 |
- movq 16(%rbx),%r14 |
- sbbq 16(%rbp),%r13 |
- movq 24(%rbx),%r15 |
- leaq 32(%rbx),%rbx |
- sbbq 32(%rbp),%r14 |
+ movq 0(%rbp),%r12 |
+ movq 8(%rbp),%r13 |
+ movq 16(%rbp),%r14 |
+ movq 24(%rbp),%r15 |
+.Lsqr4x_sub_entry: |
+ leaq 32(%rbp),%rbp |
+ notq %r12 |
+ notq %r13 |
+ notq %r14 |
+ notq %r15 |
+ andq %rax,%r12 |
+ andq %rax,%r13 |
+ andq %rax,%r14 |
+ andq %rax,%r15 |
+ |
+ negq %r10 |
+ adcq 0(%rbx),%r12 |
+ adcq 8(%rbx),%r13 |
+ adcq 16(%rbx),%r14 |
+ adcq 24(%rbx),%r15 |
movq %r12,0(%rdi) |
- sbbq 48(%rbp),%r15 |
- leaq 64(%rbp),%rbp |
+ leaq 32(%rbx),%rbx |
movq %r13,8(%rdi) |
+ sbbq %r10,%r10 |
movq %r14,16(%rdi) |
movq %r15,24(%rdi) |
leaq 32(%rdi),%rdi |
incq %rcx |
jnz .Lsqr4x_sub |
+ |
movq %r9,%r10 |
negq %r9 |
.byte 0xf3,0xc3 |
-.size bn_sqr8x_internal,.-bn_sqr8x_internal |
+.size __bn_post4x_internal,.-__bn_post4x_internal |
.globl bn_from_montgomery |
.hidden bn_from_montgomery |
.type bn_from_montgomery,@function |
@@ -1652,10 +1945,9 @@ bn_from_mont8x: |
pushq %r13 |
pushq %r14 |
pushq %r15 |
-.byte 0x67 |
- movl %r9d,%r10d |
+ |
shll $3,%r9d |
- shll $3+2,%r10d |
+ leaq (%r9,%r9,2),%r10 |
negq %r9 |
movq (%r8),%r8 |
@@ -1665,19 +1957,20 @@ bn_from_mont8x: |
- leaq -64(%rsp,%r9,2),%r11 |
- subq %rsi,%r11 |
+ |
+ leaq -320(%rsp,%r9,2),%r11 |
+ subq %rdi,%r11 |
andq $4095,%r11 |
cmpq %r11,%r10 |
jb .Lfrom_sp_alt |
subq %r11,%rsp |
- leaq -64(%rsp,%r9,2),%rsp |
+ leaq -320(%rsp,%r9,2),%rsp |
jmp .Lfrom_sp_done |
.align 32 |
.Lfrom_sp_alt: |
- leaq 4096-64(,%r9,2),%r10 |
- leaq -64(%rsp,%r9,2),%rsp |
+ leaq 4096-320(,%r9,2),%r10 |
+ leaq -320(%rsp,%r9,2),%rsp |
subq %r10,%r11 |
movq $0,%r10 |
cmovcq %r10,%r11 |
@@ -1728,7 +2021,8 @@ bn_from_mont8x: |
.byte 0x67 |
movq %rcx,%rbp |
.byte 102,73,15,110,218 |
- call sqr8x_reduction |
+ call __bn_sqr8x_reduction |
+ call __bn_post4x_internal |
pxor %xmm0,%xmm0 |
leaq 48(%rsp),%rax |
@@ -1778,46 +2072,170 @@ bn_scatter5: |
.globl bn_gather5 |
.hidden bn_gather5 |
.type bn_gather5,@function |
-.align 16 |
+.align 32 |
bn_gather5: |
- movl %ecx,%r11d |
- shrl $3,%ecx |
- andq $7,%r11 |
- notl %ecx |
- leaq .Lmagic_masks(%rip),%rax |
- andl $3,%ecx |
- leaq 128(%rdx,%r11,8),%rdx |
- movq 0(%rax,%rcx,8),%xmm4 |
- movq 8(%rax,%rcx,8),%xmm5 |
- movq 16(%rax,%rcx,8),%xmm6 |
- movq 24(%rax,%rcx,8),%xmm7 |
+.LSEH_begin_bn_gather5: |
+ |
+.byte 0x4c,0x8d,0x14,0x24 |
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 |
+ leaq .Linc(%rip),%rax |
+ andq $-16,%rsp |
+ |
+ movd %ecx,%xmm5 |
+ movdqa 0(%rax),%xmm0 |
+ movdqa 16(%rax),%xmm1 |
+ leaq 128(%rdx),%r11 |
+ leaq 128(%rsp),%rax |
+ |
+ pshufd $0,%xmm5,%xmm5 |
+ movdqa %xmm1,%xmm4 |
+ movdqa %xmm1,%xmm2 |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm4,%xmm3 |
+ |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,-128(%rax) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,-112(%rax) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,-96(%rax) |
+ movdqa %xmm4,%xmm2 |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,-80(%rax) |
+ movdqa %xmm4,%xmm3 |
+ |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,-64(%rax) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,-48(%rax) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,-32(%rax) |
+ movdqa %xmm4,%xmm2 |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,-16(%rax) |
+ movdqa %xmm4,%xmm3 |
+ |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,0(%rax) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,16(%rax) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,32(%rax) |
+ movdqa %xmm4,%xmm2 |
+ paddd %xmm0,%xmm1 |
+ pcmpeqd %xmm5,%xmm0 |
+ movdqa %xmm3,48(%rax) |
+ movdqa %xmm4,%xmm3 |
+ |
+ paddd %xmm1,%xmm2 |
+ pcmpeqd %xmm5,%xmm1 |
+ movdqa %xmm0,64(%rax) |
+ movdqa %xmm4,%xmm0 |
+ |
+ paddd %xmm2,%xmm3 |
+ pcmpeqd %xmm5,%xmm2 |
+ movdqa %xmm1,80(%rax) |
+ movdqa %xmm4,%xmm1 |
+ |
+ paddd %xmm3,%xmm0 |
+ pcmpeqd %xmm5,%xmm3 |
+ movdqa %xmm2,96(%rax) |
+ movdqa %xmm4,%xmm2 |
+ movdqa %xmm3,112(%rax) |
jmp .Lgather |
-.align 16 |
-.Lgather: |
- movq -128(%rdx),%xmm0 |
- movq -64(%rdx),%xmm1 |
- pand %xmm4,%xmm0 |
- movq 0(%rdx),%xmm2 |
- pand %xmm5,%xmm1 |
- movq 64(%rdx),%xmm3 |
- pand %xmm6,%xmm2 |
- por %xmm1,%xmm0 |
- pand %xmm7,%xmm3 |
-.byte 0x67,0x67 |
- por %xmm2,%xmm0 |
- leaq 256(%rdx),%rdx |
- por %xmm3,%xmm0 |
+.align 32 |
+.Lgather: |
+ pxor %xmm4,%xmm4 |
+ pxor %xmm5,%xmm5 |
+ movdqa -128(%r11),%xmm0 |
+ movdqa -112(%r11),%xmm1 |
+ movdqa -96(%r11),%xmm2 |
+ pand -128(%rax),%xmm0 |
+ movdqa -80(%r11),%xmm3 |
+ pand -112(%rax),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand -96(%rax),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand -80(%rax),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa -64(%r11),%xmm0 |
+ movdqa -48(%r11),%xmm1 |
+ movdqa -32(%r11),%xmm2 |
+ pand -64(%rax),%xmm0 |
+ movdqa -16(%r11),%xmm3 |
+ pand -48(%rax),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand -32(%rax),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand -16(%rax),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa 0(%r11),%xmm0 |
+ movdqa 16(%r11),%xmm1 |
+ movdqa 32(%r11),%xmm2 |
+ pand 0(%rax),%xmm0 |
+ movdqa 48(%r11),%xmm3 |
+ pand 16(%rax),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand 32(%rax),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand 48(%rax),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ movdqa 64(%r11),%xmm0 |
+ movdqa 80(%r11),%xmm1 |
+ movdqa 96(%r11),%xmm2 |
+ pand 64(%rax),%xmm0 |
+ movdqa 112(%r11),%xmm3 |
+ pand 80(%rax),%xmm1 |
+ por %xmm0,%xmm4 |
+ pand 96(%rax),%xmm2 |
+ por %xmm1,%xmm5 |
+ pand 112(%rax),%xmm3 |
+ por %xmm2,%xmm4 |
+ por %xmm3,%xmm5 |
+ por %xmm5,%xmm4 |
+ leaq 256(%r11),%r11 |
+ pshufd $0x4e,%xmm4,%xmm0 |
+ por %xmm4,%xmm0 |
movq %xmm0,(%rdi) |
leaq 8(%rdi),%rdi |
subl $1,%esi |
jnz .Lgather |
+ |
+ leaq (%r10),%rsp |
.byte 0xf3,0xc3 |
.LSEH_end_bn_gather5: |
.size bn_gather5,.-bn_gather5 |
.align 64 |
-.Lmagic_masks: |
-.long 0,0, 0,0, 0,0, -1,-1 |
-.long 0,0, 0,0, 0,0, 0,0 |
+.Linc: |
+.long 0,0, 1,1 |
+.long 2,2, 2,2 |
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
#endif |