Index: third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S
diff --git a/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S b/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S
index 6b9bc05bf4e2f96c68a3e97315d4d6596428b4f9..51e5d19931f0545ef6d41e790a0403ba68058a20 100644
--- a/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S
+++ b/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S
@@ -634,20 +634,20 @@ L$sqr8x_enter:
-	leaq -64(%rsp,%r9,4),%r11
+	leaq -64(%rsp,%r9,2),%r11
 	movq (%r8),%r8
 	subq %rsi,%r11
 	andq $4095,%r11
 	cmpq %r11,%r10
 	jb L$sqr8x_sp_alt
 	subq %r11,%rsp
-	leaq -64(%rsp,%r9,4),%rsp
+	leaq -64(%rsp,%r9,2),%rsp
 	jmp L$sqr8x_sp_done
 .p2align 5
 L$sqr8x_sp_alt:
-	leaq 4096-64(,%r9,4),%r10
-	leaq -64(%rsp,%r9,4),%rsp
+	leaq 4096-64(,%r9,2),%r10
+	leaq -64(%rsp,%r9,2),%rsp
 	subq %r10,%r11
 	movq $0,%r10
 	cmovcq %r10,%r11
@@ -657,58 +657,80 @@ L$sqr8x_sp_done:
 	movq %r9,%r10
 	negq %r9
-	leaq 64(%rsp,%r9,2),%r11
 	movq %r8,32(%rsp)
 	movq %rax,40(%rsp)
 L$sqr8x_body:
-	movq %r9,%rbp
-.byte 102,73,15,110,211
-	shrq $3+2,%rbp
-	movl _OPENSSL_ia32cap_P+8(%rip),%eax
-	jmp L$sqr8x_copy_n
-
-.p2align 5
-L$sqr8x_copy_n:
-	movq 0(%rcx),%xmm0
-	movq 8(%rcx),%xmm1
-	movq 16(%rcx),%xmm3
-	movq 24(%rcx),%xmm4
-	leaq 32(%rcx),%rcx
-	movdqa %xmm0,0(%r11)
-	movdqa %xmm1,16(%r11)
-	movdqa %xmm3,32(%r11)
-	movdqa %xmm4,48(%r11)
-	leaq 64(%r11),%r11
-	decq %rbp
-	jnz L$sqr8x_copy_n
-
+.byte 102,72,15,110,209
 	pxor %xmm0,%xmm0
 .byte 102,72,15,110,207
 .byte 102,73,15,110,218
 	call _bn_sqr8x_internal
+
+
+
+	leaq (%rdi,%r9,1),%rbx
+	movq %r9,%rcx
+	movq %r9,%rdx
+.byte 102,72,15,126,207
+	sarq $3+2,%rcx
+	jmp L$sqr8x_sub
+
+.p2align 5
+L$sqr8x_sub:
+	movq 0(%rbx),%r12
+	movq 8(%rbx),%r13
+	movq 16(%rbx),%r14
+	movq 24(%rbx),%r15
+	leaq 32(%rbx),%rbx
+	sbbq 0(%rbp),%r12
+	sbbq 8(%rbp),%r13
+	sbbq 16(%rbp),%r14
+	sbbq 24(%rbp),%r15
+	leaq 32(%rbp),%rbp
+	movq %r12,0(%rdi)
+	movq %r13,8(%rdi)
+	movq %r14,16(%rdi)
+	movq %r15,24(%rdi)
+	leaq 32(%rdi),%rdi
+	incq %rcx
+	jnz L$sqr8x_sub
+
+	sbbq $0,%rax
+	leaq (%rbx,%r9,1),%rbx
+	leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
 	pxor %xmm0,%xmm0
-	leaq 48(%rsp),%rax
-	leaq 64(%rsp,%r9,2),%rdx
-	shrq $3+2,%r9
+	pshufd $0,%xmm1,%xmm1
 	movq 40(%rsp),%rsi
-	jmp L$sqr8x_zero
+	jmp L$sqr8x_cond_copy
 .p2align 5
-L$sqr8x_zero:
-	movdqa %xmm0,0(%rax)
-	movdqa %xmm0,16(%rax)
-	movdqa %xmm0,32(%rax)
-	movdqa %xmm0,48(%rax)
-	leaq 64(%rax),%rax
-	movdqa %xmm0,0(%rdx)
-	movdqa %xmm0,16(%rdx)
-	movdqa %xmm0,32(%rdx)
-	movdqa %xmm0,48(%rdx)
-	leaq 64(%rdx),%rdx
-	decq %r9
-	jnz L$sqr8x_zero
+L$sqr8x_cond_copy:
+	movdqa 0(%rbx),%xmm2
+	movdqa 16(%rbx),%xmm3
+	leaq 32(%rbx),%rbx
+	movdqu 0(%rdi),%xmm4
+	movdqu 16(%rdi),%xmm5
+	leaq 32(%rdi),%rdi
+	movdqa %xmm0,-32(%rbx)
+	movdqa %xmm0,-16(%rbx)
+	movdqa %xmm0,-32(%rbx,%rdx,1)
+	movdqa %xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd %xmm1,%xmm0
+	pand %xmm1,%xmm2
+	pand %xmm1,%xmm3
+	pand %xmm0,%xmm4
+	pand %xmm0,%xmm5
+	pxor %xmm0,%xmm0
+	por %xmm2,%xmm4
+	por %xmm3,%xmm5
+	movdqu %xmm4,-32(%rdi)
+	movdqu %xmm5,-16(%rdi)
+	addq $32,%r9
+	jnz L$sqr8x_cond_copy
 	movq $1,%rax
 	movq -48(%rsi),%r15