Index: third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S
diff --git a/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S b/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S
index da02d4c2dc952a21979b4827ee4ab22447962111..0146ff5cfcbfe89d547b379d520ac368559fe7d6 100644
--- a/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S
+++ b/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S
@@ -11,6 +11,11 @@ _sha256_block_data_order:
 movl 0(%r11),%r9d
 movl 4(%r11),%r10d
 movl 8(%r11),%r11d
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je L$avx_shortcut
 testl $512,%r10d
 jnz L$ssse3_shortcut
 pushq %rbx
@@ -2840,4 +2845,1061 @@ L$ssse3_00_47:
 L$epilogue_ssse3:
 .byte 0xf3,0xc3
+
+.p2align 6
+sha256_block_data_order_avx:
+L$avx_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+L$prologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp L$loop_avx
+.p2align 4
+L$loop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp L$avx_00_47
+
+.p2align 4
+L$avx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne L$avx_00_47
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop_avx
+
+ movq 64+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue_avx:
+ .byte 0xf3,0xc3
+
 #endif