| Index: third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
|
| diff --git a/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S b/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
|
| index 4d401c6743f84448e533db45a600b0fdc5e52a21..83926ad789b0a25102fb778b1842f32270aa47f3 100644
|
| --- a/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
|
| +++ b/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
|
| @@ -636,20 +636,20 @@ bn_sqr8x_mont:
|
|
|
|
|
|
|
| - leaq -64(%rsp,%r9,4),%r11
|
| + leaq -64(%rsp,%r9,2),%r11
|
| movq (%r8),%r8
|
| subq %rsi,%r11
|
| andq $4095,%r11
|
| cmpq %r11,%r10
|
| jb .Lsqr8x_sp_alt
|
| subq %r11,%rsp
|
| - leaq -64(%rsp,%r9,4),%rsp
|
| + leaq -64(%rsp,%r9,2),%rsp
|
| jmp .Lsqr8x_sp_done
|
|
|
| .align 32
|
| .Lsqr8x_sp_alt:
|
| - leaq 4096-64(,%r9,4),%r10
|
| - leaq -64(%rsp,%r9,4),%rsp
|
| + leaq 4096-64(,%r9,2),%r10
|
| + leaq -64(%rsp,%r9,2),%rsp
|
| subq %r10,%r11
|
| movq $0,%r10
|
| cmovcq %r10,%r11
|
| @@ -659,58 +659,80 @@ bn_sqr8x_mont:
|
| movq %r9,%r10
|
| negq %r9
|
|
|
| - leaq 64(%rsp,%r9,2),%r11
|
| movq %r8,32(%rsp)
|
| movq %rax,40(%rsp)
|
| .Lsqr8x_body:
|
|
|
| - movq %r9,%rbp
|
| -.byte 102,73,15,110,211
|
| - shrq $3+2,%rbp
|
| - movl OPENSSL_ia32cap_P+8(%rip),%eax
|
| - jmp .Lsqr8x_copy_n
|
| -
|
| -.align 32
|
| -.Lsqr8x_copy_n:
|
| - movq 0(%rcx),%xmm0
|
| - movq 8(%rcx),%xmm1
|
| - movq 16(%rcx),%xmm3
|
| - movq 24(%rcx),%xmm4
|
| - leaq 32(%rcx),%rcx
|
| - movdqa %xmm0,0(%r11)
|
| - movdqa %xmm1,16(%r11)
|
| - movdqa %xmm3,32(%r11)
|
| - movdqa %xmm4,48(%r11)
|
| - leaq 64(%r11),%r11
|
| - decq %rbp
|
| - jnz .Lsqr8x_copy_n
|
| -
|
| +.byte 102,72,15,110,209
|
| pxor %xmm0,%xmm0
|
| .byte 102,72,15,110,207
|
| .byte 102,73,15,110,218
|
| call bn_sqr8x_internal
|
|
|
| +
|
| +
|
| +
|
| + leaq (%rdi,%r9,1),%rbx
|
| + movq %r9,%rcx
|
| + movq %r9,%rdx
|
| +.byte 102,72,15,126,207
|
| + sarq $3+2,%rcx
|
| + jmp .Lsqr8x_sub
|
| +
|
| +.align 32
|
| +.Lsqr8x_sub:
|
| + movq 0(%rbx),%r12
|
| + movq 8(%rbx),%r13
|
| + movq 16(%rbx),%r14
|
| + movq 24(%rbx),%r15
|
| + leaq 32(%rbx),%rbx
|
| + sbbq 0(%rbp),%r12
|
| + sbbq 8(%rbp),%r13
|
| + sbbq 16(%rbp),%r14
|
| + sbbq 24(%rbp),%r15
|
| + leaq 32(%rbp),%rbp
|
| + movq %r12,0(%rdi)
|
| + movq %r13,8(%rdi)
|
| + movq %r14,16(%rdi)
|
| + movq %r15,24(%rdi)
|
| + leaq 32(%rdi),%rdi
|
| + incq %rcx
|
| + jnz .Lsqr8x_sub
|
| +
|
| + sbbq $0,%rax
|
| + leaq (%rbx,%r9,1),%rbx
|
| + leaq (%rdi,%r9,1),%rdi
|
| +
|
| +.byte 102,72,15,110,200
|
| pxor %xmm0,%xmm0
|
| - leaq 48(%rsp),%rax
|
| - leaq 64(%rsp,%r9,2),%rdx
|
| - shrq $3+2,%r9
|
| + pshufd $0,%xmm1,%xmm1
|
| movq 40(%rsp),%rsi
|
| - jmp .Lsqr8x_zero
|
| + jmp .Lsqr8x_cond_copy
|
|
|
| .align 32
|
| -.Lsqr8x_zero:
|
| - movdqa %xmm0,0(%rax)
|
| - movdqa %xmm0,16(%rax)
|
| - movdqa %xmm0,32(%rax)
|
| - movdqa %xmm0,48(%rax)
|
| - leaq 64(%rax),%rax
|
| - movdqa %xmm0,0(%rdx)
|
| - movdqa %xmm0,16(%rdx)
|
| - movdqa %xmm0,32(%rdx)
|
| - movdqa %xmm0,48(%rdx)
|
| - leaq 64(%rdx),%rdx
|
| - decq %r9
|
| - jnz .Lsqr8x_zero
|
| +.Lsqr8x_cond_copy:
|
| + movdqa 0(%rbx),%xmm2
|
| + movdqa 16(%rbx),%xmm3
|
| + leaq 32(%rbx),%rbx
|
| + movdqu 0(%rdi),%xmm4
|
| + movdqu 16(%rdi),%xmm5
|
| + leaq 32(%rdi),%rdi
|
| + movdqa %xmm0,-32(%rbx)
|
| + movdqa %xmm0,-16(%rbx)
|
| + movdqa %xmm0,-32(%rbx,%rdx,1)
|
| + movdqa %xmm0,-16(%rbx,%rdx,1)
|
| + pcmpeqd %xmm1,%xmm0
|
| + pand %xmm1,%xmm2
|
| + pand %xmm1,%xmm3
|
| + pand %xmm0,%xmm4
|
| + pand %xmm0,%xmm5
|
| + pxor %xmm0,%xmm0
|
| + por %xmm2,%xmm4
|
| + por %xmm3,%xmm5
|
| + movdqu %xmm4,-32(%rdi)
|
| + movdqu %xmm5,-16(%rdi)
|
| + addq $32,%r9
|
| + jnz .Lsqr8x_cond_copy
|
|
|
| movq $1,%rax
|
| movq -48(%rsi),%r15
|
|
|