Index: third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
diff --git a/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
index dd3d3106d164449a9c16aef31a90d2ae23a39e10..21531d1c65099f55e3894c1875e7479a2d6d653c 100644
--- a/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
+++ b/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
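This is BoringSSL's generated linux-x86_64 copy of upstream OpenSSL's constant-time RSAZ scatter/gather rewrite (the CacheBleed fix, CVE-2016-0702). Before the change, both the scatter and the gather addressed the window table directly with the secret window index, so which cache lines and cache banks were touched depended on secret exponent bits. After the change, each table entry is a full 64-bit limb stored at a 128-byte row stride, and the gather reads every entry of a row unconditionally, selecting the wanted one with SSE2 compare masks. Short C models of the new access patterns are interleaved between the hunks below; they are illustrative sketches, not part of the patch.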
@@ -466,48 +466,94 @@ rsaz_512_mul_gather4:
 	pushq	%r14
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	subq	$128+24,%rsp
+	subq	$152,%rsp
 .Lmul_gather4_body:
-	movl	64(%rdx,%r9,4),%eax
-.byte	102,72,15,110,199
-	movl	(%rdx,%r9,4),%ebx
-.byte	102,72,15,110,201
+	movd	%r9d,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1
+	movdqa	.Linc(%rip),%xmm0
+
+	pshufd	$0,%xmm8,%xmm8
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm8,%xmm0
+	movdqa	%xmm7,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm8,%xmm1
+	movdqa	%xmm7,%xmm4
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm8,%xmm2
+	movdqa	%xmm7,%xmm5
+	paddd	%xmm3,%xmm4
+	pcmpeqd	%xmm8,%xmm3
+	movdqa	%xmm7,%xmm6
+	paddd	%xmm4,%xmm5
+	pcmpeqd	%xmm8,%xmm4
+	paddd	%xmm5,%xmm6
+	pcmpeqd	%xmm8,%xmm5
+	paddd	%xmm6,%xmm7
+	pcmpeqd	%xmm8,%xmm6
+	pcmpeqd	%xmm8,%xmm7
+
+	movdqa	0(%rdx),%xmm8
+	movdqa	16(%rdx),%xmm9
+	movdqa	32(%rdx),%xmm10
+	movdqa	48(%rdx),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rdx),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rdx),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rdx),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rdx),%xmm15
+	leaq	128(%rdx),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+.byte	102,76,15,126,195
+
 	movq	%r8,128(%rsp)
+	movq	%rdi,128+8(%rsp)
+	movq	%rcx,128+16(%rsp)
 
-	shlq	$32,%rax
-	orq	%rax,%rbx
 	movq	(%rsi),%rax
 	movq	8(%rsi),%rcx
-	leaq	128(%rdx,%r9,4),%rbp
 	mulq	%rbx
 	movq	%rax,(%rsp)
 	movq	%rcx,%rax
 	movq	%rdx,%r8
 
 	mulq	%rbx
-	movd	(%rbp),%xmm4
 	addq	%rax,%r8
 	movq	16(%rsi),%rax
 	movq	%rdx,%r9
 	adcq	$0,%r9
 
 	mulq	%rbx
-	movd	64(%rbp),%xmm5
 	addq	%rax,%r9
 	movq	24(%rsi),%rax
 	movq	%rdx,%r10
 	adcq	$0,%r10
 
 	mulq	%rbx
-	pslldq	$4,%xmm5
 	addq	%rax,%r10
 	movq	32(%rsi),%rax
 	movq	%rdx,%r11
 	adcq	$0,%r11
 
 	mulq	%rbx
-	por	%xmm5,%xmm4
 	addq	%rax,%r11
 	movq	40(%rsi),%rax
 	movq	%rdx,%r12
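The hunk above replaces the two indexed loads (movl 64(%rdx,%r9,4) / movl (%rdx,%r9,4)) with mask generation: the window index in %r9d is broadcast into %xmm8, compared against the constant index pairs derived from .Linc, and the resulting all-ones/all-zero lane masks in %xmm0-%xmm7 drive a full scan of the first 128-byte table row. The pand/por cascade ORs the single surviving entry into %xmm8, pshufd $0x4e/por folds its two 64-bit halves, and the byte sequence 102,76,15,126,195 (movq %xmm8,%rbx) lands the selected multiplier in %rbx. Because all sixteen xmm registers now serve as masks or accumulators, the saved %rdi and %rcx move from xmm registers to the stack; the $152 frame is the same 128+24 bytes as before, just written as a single literal. A minimal C model of one such row selection, with invented names and ordinary 64-bit arithmetic standing in for SSE2:

/* Illustrative C model, not part of the patch: select entry k (0..15) of a
 * 16-qword table row without a data-dependent load address. */
#include <stdint.h>

static uint64_t gather_row(const uint64_t row[16], uint32_t k)
{
    uint64_t acc = 0;
    for (uint32_t i = 0; i < 16; i++) {
        /* all-ones iff i == k; models the pcmpeqd lane masks */
        uint64_t mask = 0 - (uint64_t)(i == k);
        acc |= row[i] & mask;       /* the pand + por cascade */
    }
    return acc;                     /* movq %xmm8,%rbx */
}

Plain C gives no constant-time guarantee for i == k; the assembly uses pcmpeqd precisely because its timing and memory trace are independent of the data.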
@@ -520,14 +566,12 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r13
 
 	mulq	%rbx
-	leaq	128(%rbp),%rbp
 	addq	%rax,%r13
 	movq	56(%rsi),%rax
 	movq	%rdx,%r14
 	adcq	$0,%r14
 
 	mulq	%rbx
-.byte	102,72,15,126,227
 	addq	%rax,%r14
 	movq	(%rsi),%rax
 	movq	%rdx,%r15
@@ -539,6 +583,35 @@ rsaz_512_mul_gather4:
 
 .align	32
 .Loop_mul_gather:
+	movdqa	0(%rbp),%xmm8
+	movdqa	16(%rbp),%xmm9
+	movdqa	32(%rbp),%xmm10
+	movdqa	48(%rbp),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rbp),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rbp),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rbp),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rbp),%xmm15
+	leaq	128(%rbp),%rbp
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+.byte	102,76,15,126,195
+
 	mulq	%rbx
 	addq	%rax,%r8
 	movq	8(%rsi),%rax
@@ -547,7 +620,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r8
 
 	mulq	%rbx
-	movd	(%rbp),%xmm4
 	addq	%rax,%r9
 	movq	16(%rsi),%rax
 	adcq	$0,%rdx
@@ -556,7 +628,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r9
 
 	mulq	%rbx
-	movd	64(%rbp),%xmm5
 	addq	%rax,%r10
 	movq	24(%rsi),%rax
 	adcq	$0,%rdx
@@ -565,7 +636,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r10
 
 	mulq	%rbx
-	pslldq	$4,%xmm5
 	addq	%rax,%r11
 	movq	32(%rsi),%rax
 	adcq	$0,%rdx
@@ -574,7 +644,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r11
 
 	mulq	%rbx
-	por	%xmm5,%xmm4
 	addq	%rax,%r12
 	movq	40(%rsi),%rax
 	adcq	$0,%rdx
@@ -599,7 +668,6 @@ rsaz_512_mul_gather4:
 	adcq	$0,%r14
 
 	mulq	%rbx
-.byte	102,72,15,126,227
 	addq	%rax,%r15
 	movq	(%rsi),%rax
 	adcq	$0,%rdx
@@ -607,7 +675,6 @@ rsaz_512_mul_gather4:
 	movq	%rdx,%r15
 	adcq	$0,%r15
 
-	leaq	128(%rbp),%rbp
 	leaq	8(%rdi),%rdi
 
 	decl	%ecx
@@ -622,8 +689,8 @@ rsaz_512_mul_gather4:
 	movq	%r14,48(%rdi)
 	movq	%r15,56(%rdi)
 
-.byte	102,72,15,126,199
-.byte	102,72,15,126,205
+	movq	128+8(%rsp),%rdi
+	movq	128+16(%rsp),%rbp
 
 	movq	(%rsp),%r8
 	movq	8(%rsp),%r9
@@ -673,7 +740,7 @@ rsaz_512_mul_scatter4:
 	movl	%r9d,%r9d
 	subq	$128+24,%rsp
 .Lmul_scatter4_body:
-	leaq	(%r8,%r9,4),%r8
+	leaq	(%r8,%r9,8),%r8
 .byte	102,72,15,110,199
 .byte	102,72,15,110,202
 .byte	102,73,15,110,208
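In rsaz_512_mul_scatter4 only the base-address scaling changes: entry k of the window table now starts at byte offset 8*k instead of 4*k, because each of the 16 entries per row is a whole 64-bit word rather than a 32-bit half whose partner lived 64 bytes away. Hypothetical offset helpers, for illustration only:

/* Illustrative sketch: byte offset of limb i (0..7) of window entry k (0..15). */
#include <stddef.h>

static size_t off_old_lo(size_t k, size_t i) { return 4*k + 128*i; }      /* low dword   */
static size_t off_old_hi(size_t k, size_t i) { return 4*k + 128*i + 64; } /* high dword  */
static size_t off_new(size_t k, size_t i)    { return 8*k + 128*i; }      /* whole qword */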
@@ -709,30 +776,14 @@ rsaz_512_mul_scatter4:
 
 	call	__rsaz_512_subtract
 
-	movl	%r8d,0(%rsi)
-	shrq	$32,%r8
-	movl	%r9d,128(%rsi)
-	shrq	$32,%r9
-	movl	%r10d,256(%rsi)
-	shrq	$32,%r10
-	movl	%r11d,384(%rsi)
-	shrq	$32,%r11
-	movl	%r12d,512(%rsi)
-	shrq	$32,%r12
-	movl	%r13d,640(%rsi)
-	shrq	$32,%r13
-	movl	%r14d,768(%rsi)
-	shrq	$32,%r14
-	movl	%r15d,896(%rsi)
-	shrq	$32,%r15
-	movl	%r8d,64(%rsi)
-	movl	%r9d,192(%rsi)
-	movl	%r10d,320(%rsi)
-	movl	%r11d,448(%rsi)
-	movl	%r12d,576(%rsi)
-	movl	%r13d,704(%rsi)
-	movl	%r14d,832(%rsi)
-	movl	%r15d,960(%rsi)
+	movq	%r8,0(%rsi)
+	movq	%r9,128(%rsi)
+	movq	%r10,256(%rsi)
+	movq	%r11,384(%rsi)
+	movq	%r12,512(%rsi)
+	movq	%r13,640(%rsi)
+	movq	%r14,768(%rsi)
+	movq	%r15,896(%rsi)
 
 	leaq	128+24+48(%rsp),%rax
 	movq	-48(%rax),%r15
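The subtract result is now scattered with one movq per limb at a 128-byte stride, replacing the old movl/shrq pairs that split each limb into dwords 64 bytes apart. The written addresses still depend on k, but the indices scattered during table precomputation follow a fixed public sequence; the constant-time requirement falls on the gather side. A hedged C model of the two store layouts, names invented:

/* Illustrative sketch of the scatter formats, not part of the patch. */
#include <stdint.h>
#include <string.h>

static void scatter_new(uint8_t *tbl, uint32_t k, const uint64_t limb[8])
{
    for (int i = 0; i < 8; i++)          /* movq %rN,(off); off += 128 */
        memcpy(tbl + 8*k + 128*i, &limb[i], 8);
}

static void scatter_old(uint8_t *tbl, uint32_t k, const uint64_t limb[8])
{
    for (int i = 0; i < 8; i++) {        /* movl; shrq $32; movl +64 */
        uint32_t lo = (uint32_t)limb[i], hi = (uint32_t)(limb[i] >> 32);
        memcpy(tbl + 4*k + 128*i,      &lo, 4);
        memcpy(tbl + 4*k + 128*i + 64, &hi, 4);
    }
}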
@@ -1087,16 +1138,14 @@ __rsaz_512_mul:
 .type	rsaz_512_scatter4,@function
 .align	16
 rsaz_512_scatter4:
-	leaq	(%rdi,%rdx,4),%rdi
+	leaq	(%rdi,%rdx,8),%rdi
 	movl	$8,%r9d
 	jmp	.Loop_scatter
 .align	16
 .Loop_scatter:
 	movq	(%rsi),%rax
 	leaq	8(%rsi),%rsi
-	movl	%eax,(%rdi)
-	shrq	$32,%rax
-	movl	%eax,64(%rdi)
+	movq	%rax,(%rdi)
 	leaq	128(%rdi),%rdi
 	decl	%r9d
 	jnz	.Loop_scatter
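rsaz_512_scatter4 above gets the same treatment as the inline scatter: the index scaled by 8 and one whole-qword store per 128-byte row. rsaz_512_gather4 below is the read side: it builds the same sixteen comparand lanes from .Linc, then scans each row with masked movdqa loads exactly as in .Loop_mul_gather, so no load address ever depends on the index in %edx.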
@@ -1108,20 +1157,73 @@ rsaz_512_scatter4:
 .type	rsaz_512_gather4,@function
 .align	16
 rsaz_512_gather4:
-	leaq	(%rsi,%rdx,4),%rsi
+	movd	%edx,%xmm8
+	movdqa	.Linc+16(%rip),%xmm1
+	movdqa	.Linc(%rip),%xmm0
+
+	pshufd	$0,%xmm8,%xmm8
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm8,%xmm0
+	movdqa	%xmm7,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm8,%xmm1
+	movdqa	%xmm7,%xmm4
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm8,%xmm2
+	movdqa	%xmm7,%xmm5
+	paddd	%xmm3,%xmm4
+	pcmpeqd	%xmm8,%xmm3
+	movdqa	%xmm7,%xmm6
+	paddd	%xmm4,%xmm5
+	pcmpeqd	%xmm8,%xmm4
+	paddd	%xmm5,%xmm6
+	pcmpeqd	%xmm8,%xmm5
+	paddd	%xmm6,%xmm7
+	pcmpeqd	%xmm8,%xmm6
+	pcmpeqd	%xmm8,%xmm7
 	movl	$8,%r9d
 	jmp	.Loop_gather
 .align	16
 .Loop_gather:
-	movl	(%rsi),%eax
-	movl	64(%rsi),%r8d
+	movdqa	0(%rsi),%xmm8
+	movdqa	16(%rsi),%xmm9
+	movdqa	32(%rsi),%xmm10
+	movdqa	48(%rsi),%xmm11
+	pand	%xmm0,%xmm8
+	movdqa	64(%rsi),%xmm12
+	pand	%xmm1,%xmm9
+	movdqa	80(%rsi),%xmm13
+	pand	%xmm2,%xmm10
+	movdqa	96(%rsi),%xmm14
+	pand	%xmm3,%xmm11
+	movdqa	112(%rsi),%xmm15
 	leaq	128(%rsi),%rsi
-	shlq	$32,%r8
-	orq	%r8,%rax
-	movq	%rax,(%rdi)
+	pand	%xmm4,%xmm12
+	pand	%xmm5,%xmm13
+	pand	%xmm6,%xmm14
+	pand	%xmm7,%xmm15
+	por	%xmm10,%xmm8
+	por	%xmm11,%xmm9
+	por	%xmm12,%xmm8
+	por	%xmm13,%xmm9
+	por	%xmm14,%xmm8
+	por	%xmm15,%xmm9
+
+	por	%xmm9,%xmm8
+	pshufd	$0x4e,%xmm8,%xmm9
+	por	%xmm9,%xmm8
+	movq	%xmm8,(%rdi)
 	leaq	8(%rdi),%rdi
 	decl	%r9d
 	jnz	.Loop_gather
 .byte	0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
 .size	rsaz_512_gather4,.-rsaz_512_gather4
+
+.align	64
+.Linc:
+.long	0,0, 1,1
+.long	2,2, 2,2
 #endif
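The new .Linc table drives the comparand setup: %xmm0 starts as the dword quad {0,0,1,1} (indices 0 and 1, each duplicated across a 64-bit lane) and .Linc+16 supplies the increment {2,2,2,2}, so the paddd chain leaves the pairs {2,3}, {4,5}, ..., {14,15} in %xmm1-%xmm7. pcmpeqd against the broadcast index then turns exactly one 64-bit lane among the eight registers into all-ones. A small C simulation of the resulting values (the assembly interleaves the additions with copies of %xmm7 for scheduling; only the final contents matter here):

/* Illustrative simulation of the .Linc paddd chain, not part of the patch. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t d[4]; } xmmw;   /* one xmm register as 4 dwords */

static xmmw paddd(xmmw a, xmmw b)
{
    xmmw r;
    for (int i = 0; i < 4; i++) r.d[i] = a.d[i] + b.d[i];
    return r;
}

int main(void)
{
    const xmmw inc0 = {{0, 0, 1, 1}};     /* .Linc    */
    const xmmw inc1 = {{2, 2, 2, 2}};     /* .Linc+16 */
    xmmw v[8];
    v[0] = inc0;
    for (int j = 1; j < 8; j++)
        v[j] = paddd(v[j - 1], inc1);     /* the paddd chain */
    for (int j = 0; j < 8; j++) {         /* v[j] holds the pair {2j, 2j+1} */
        assert(v[j].d[0] == 2u*j && v[j].d[1] == 2u*j);
        assert(v[j].d[2] == 2u*j + 1 && v[j].d[3] == 2u*j + 1);
    }
    printf("comparand pairs 0..15 reproduced\n");
    return 0;
}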