Index: third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
diff --git a/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
index dd3d3106d164449a9c16aef31a90d2ae23a39e10..21531d1c65099f55e3894c1875e7479a2d6d653c 100644
--- a/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
+++ b/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S
@@ -466,48 +466,94 @@ rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
.Lmul_gather4_body:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -520,14 +566,12 @@ rsaz_512_mul_gather4:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -539,6 +583,35 @@ rsaz_512_mul_gather4:
.align 32
.Loop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -547,7 +620,6 @@ rsaz_512_mul_gather4:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -556,7 +628,6 @@ rsaz_512_mul_gather4:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -565,7 +636,6 @@ rsaz_512_mul_gather4:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -574,7 +644,6 @@ rsaz_512_mul_gather4:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -599,7 +668,6 @@ rsaz_512_mul_gather4:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -607,7 +675,6 @@ rsaz_512_mul_gather4:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -622,8 +689,8 @@ rsaz_512_mul_gather4:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -673,7 +740,7 @@ rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
.Lmul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
@@ -709,30 +776,14 @@ rsaz_512_mul_scatter4:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1087,16 +1138,14 @@ __rsaz_512_mul:
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
@@ -1108,20 +1157,73 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
#endif |
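
Note on the technique: the pcmpeqd/pand/por sequences added above replace secret-indexed loads with a masked gather, so the addresses touched never depend on the table index. The following plain-C sketch is illustrative commentary only, not part of the patch or of rsaz-x86_64.pl; the table layout (16 candidate 512-bit values whose limbs are interleaved at the 128-byte row stride used by the patched scatter4/gather4) and the helper names gather4_ct/ct_is_zero_mask are assumptions made for this example.

#include <stdint.h>
#include <stddef.h>

/* All-ones when x == 0, all-zeros otherwise, with no secret-dependent branch. */
static uint64_t ct_is_zero_mask(uint64_t x)
{
    return ((x | (0 - x)) >> 63) - 1;
}

/* Read every candidate and keep only the one whose index equals `power`,
 * so the sequence of memory addresses accessed does not depend on `power`.
 * Limb j of candidate i is assumed to live at table[j * 16 + i]. */
static void gather4_ct(uint64_t out[8], const uint64_t table[8 * 16],
                       uint64_t power)
{
    for (size_t limb = 0; limb < 8; limb++) {
        uint64_t acc = 0;
        for (uint64_t i = 0; i < 16; i++) {
            uint64_t mask = ct_is_zero_mask(i ^ power); /* ~0 iff i == power */
            acc |= table[limb * 16 + i] & mask;
        }
        out[limb] = acc;
    }
}

The assembly above is the vectorized form of the same idea: the index is broadcast with pshufd, compared against counters derived from .Linc with pcmpeqd to build the selection masks, and the pand/por chain folds all sixteen candidates down to the selected limb.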