Index: third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
diff --git a/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S b/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
index 4d401c6743f84448e533db45a600b0fdc5e52a21..83926ad789b0a25102fb778b1842f32270aa47f3 100644
--- a/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
+++ b/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S
@@ -636,20 +636,20 @@ bn_sqr8x_mont:
-	leaq	-64(%rsp,%r9,4),%r11
+	leaq	-64(%rsp,%r9,2),%r11
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rsp
-	leaq	-64(%rsp,%r9,4),%rsp
+	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lsqr8x_sp_done
.align	32
.Lsqr8x_sp_alt:
-	leaq	4096-64(,%r9,4),%r10
-	leaq	-64(%rsp,%r9,4),%rsp
+	leaq	4096-64(,%r9,2),%r10
+	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
@@ -659,58 +659,80 @@ bn_sqr8x_mont:
	movq	%r9,%r10
	negq	%r9
-	leaq	64(%rsp,%r9,2),%r11
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lsqr8x_body:
-	movq	%r9,%rbp
-.byte	102,73,15,110,211
-	shrq	$3+2,%rbp
-	movl	OPENSSL_ia32cap_P+8(%rip),%eax
-	jmp	.Lsqr8x_copy_n
-
-.align	32
-.Lsqr8x_copy_n:
-	movq	0(%rcx),%xmm0
-	movq	8(%rcx),%xmm1
-	movq	16(%rcx),%xmm3
-	movq	24(%rcx),%xmm4
-	leaq	32(%rcx),%rcx
-	movdqa	%xmm0,0(%r11)
-	movdqa	%xmm1,16(%r11)
-	movdqa	%xmm3,32(%r11)
-	movdqa	%xmm4,48(%r11)
-	leaq	64(%r11),%r11
-	decq	%rbp
-	jnz	.Lsqr8x_copy_n
-
+.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,73,15,110,218
	call	bn_sqr8x_internal
+
+
+
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+	movq	%r9,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
+
+.align	32
+.Lsqr8x_sub:
+	movq	0(%rbx),%r12
+	movq	8(%rbx),%r13
+	movq	16(%rbx),%r14
+	movq	24(%rbx),%r15
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rbp),%r12
+	sbbq	8(%rbp),%r13
+	sbbq	16(%rbp),%r14
+	sbbq	24(%rbp),%r15
+	leaq	32(%rbp),%rbp
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+	incq	%rcx
+	jnz	.Lsqr8x_sub
+
+	sbbq	$0,%rax
+	leaq	(%rbx,%r9,1),%rbx
+	leaq	(%rdi,%r9,1),%rdi
+
+.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
-	leaq	48(%rsp),%rax
-	leaq	64(%rsp,%r9,2),%rdx
-	shrq	$3+2,%r9
+	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
-	jmp	.Lsqr8x_zero
+	jmp	.Lsqr8x_cond_copy
.align	32
-.Lsqr8x_zero:
-	movdqa	%xmm0,0(%rax)
-	movdqa	%xmm0,16(%rax)
-	movdqa	%xmm0,32(%rax)
-	movdqa	%xmm0,48(%rax)
-	leaq	64(%rax),%rax
-	movdqa	%xmm0,0(%rdx)
-	movdqa	%xmm0,16(%rdx)
-	movdqa	%xmm0,32(%rdx)
-	movdqa	%xmm0,48(%rdx)
-	leaq	64(%rdx),%rdx
-	decq	%r9
-	jnz	.Lsqr8x_zero
+.Lsqr8x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	movdqa	%xmm0,-32(%rbx,%rdx,1)
+	movdqa	%xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	addq	$32,%r9
+	jnz	.Lsqr8x_cond_copy
	movq	$1,%rax
	movq	-48(%rsi),%r15