Index: third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S |
diff --git a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S |
new file mode 100644 |
index 0000000000000000000000000000000000000000..daf82d9e9bbdb0225b95a348368570f029d0494b |
--- /dev/null |
+++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S |
@@ -0,0 +1,3066 @@ |
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) |
+.data |
+ |
+.align 16 |
+one: |
+.quad 1,0 |
+two: |
+.quad 2,0 |
+three: |
+.quad 3,0 |
+four: |
+.quad 4,0 |
+five: |
+.quad 5,0 |
+six: |
+.quad 6,0 |
+seven: |
+.quad 7,0 |
+eight: |
+.quad 8,0 |
+ |
+OR_MASK: |
+.long 0x00000000,0x00000000,0x00000000,0x80000000 |
+poly: |
+.quad 0x1, 0xc200000000000000 |
+mask: |
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d |
+con1: |
+.long 1,1,1,1 |
+con2: |
+.long 0x1b,0x1b,0x1b,0x1b |
+con3: |
+.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 |
+and_mask: |
+.long 0,0xffffffff, 0xffffffff, 0xffffffff |
+.text |
+.type GFMUL,@function |
+.align 16 |
+GFMUL: |
+.cfi_startproc |
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 |
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 |
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 |
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $8,%xmm3,%xmm4 |
+ vpsrldq $8,%xmm3,%xmm3 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpxor %xmm3,%xmm5,%xmm5 |
+ |
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 |
+ vpshufd $78,%xmm2,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm2 |
+ |
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 |
+ vpshufd $78,%xmm2,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm2 |
+ |
+ vpxor %xmm5,%xmm2,%xmm0 |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size GFMUL, .-GFMUL |
+.globl aesgcmsiv_htable_init |
+.hidden aesgcmsiv_htable_init |
+.type aesgcmsiv_htable_init,@function |
+.align 16 |
+aesgcmsiv_htable_init: |
+.cfi_startproc |
+ vmovdqa (%rsi),%xmm0 |
+ vmovdqa %xmm0,%xmm1 |
+ vmovdqa %xmm0,(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,16(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,32(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,48(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,64(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,80(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,96(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,112(%rdi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init |
+.globl aesgcmsiv_htable6_init |
+.hidden aesgcmsiv_htable6_init |
+.type aesgcmsiv_htable6_init,@function |
+.align 16 |
+aesgcmsiv_htable6_init: |
+.cfi_startproc |
+ vmovdqa (%rsi),%xmm0 |
+ vmovdqa %xmm0,%xmm1 |
+ vmovdqa %xmm0,(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,16(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,32(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,48(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,64(%rdi) |
+ call GFMUL |
+ vmovdqa %xmm0,80(%rdi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init |
+.globl aesgcmsiv_htable_polyval |
+.hidden aesgcmsiv_htable_polyval |
+.type aesgcmsiv_htable_polyval,@function |
+.align 16 |
+aesgcmsiv_htable_polyval: |
+.cfi_startproc |
+ testq %rdx,%rdx |
+ jnz .Lhtable_polyval_start |
+ .byte 0xf3,0xc3 |
+ |
+.Lhtable_polyval_start: |
+ vzeroall |
+ |
+ |
+ |
+ movq %rdx,%r11 |
+ andq $127,%r11 |
+ |
+ jz .Lhtable_polyval_no_prefix |
+ |
+ vpxor %xmm9,%xmm9,%xmm9 |
+ vmovdqa (%rcx),%xmm1 |
+ subq %r11,%rdx |
+ |
+ subq $16,%r11 |
+ |
+ |
+ vmovdqu (%rsi),%xmm0 |
+ vpxor %xmm1,%xmm0,%xmm0 |
+ |
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 |
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 |
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 |
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ leaq 16(%rsi),%rsi |
+ testq %r11,%r11 |
+ jnz .Lhtable_polyval_prefix_loop |
+ jmp .Lhtable_polyval_prefix_complete |
+ |
+ |
+.align 64 |
+.Lhtable_polyval_prefix_loop: |
+ subq $16,%r11 |
+ |
+ vmovdqu (%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ testq %r11,%r11 |
+ |
+ leaq 16(%rsi),%rsi |
+ |
+ jnz .Lhtable_polyval_prefix_loop |
+ |
+.Lhtable_polyval_prefix_complete: |
+ vpsrldq $8,%xmm5,%xmm6 |
+ vpslldq $8,%xmm5,%xmm5 |
+ |
+ vpxor %xmm6,%xmm4,%xmm9 |
+ vpxor %xmm5,%xmm3,%xmm1 |
+ |
+ jmp .Lhtable_polyval_main_loop |
+ |
+.Lhtable_polyval_no_prefix: |
+ |
+ |
+ |
+ |
+ vpxor %xmm1,%xmm1,%xmm1 |
+ vmovdqa (%rcx),%xmm9 |
+ |
+.align 64 |
+.Lhtable_polyval_main_loop: |
+ subq $0x80,%rdx |
+ jb .Lhtable_polyval_out |
+ |
+ vmovdqu 112(%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 |
+ vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 |
+ vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 |
+ vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vmovdqu 96(%rsi),%xmm0 |
+ vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ |
+ vmovdqu 80(%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 |
+ vpalignr $8,%xmm1,%xmm1,%xmm1 |
+ |
+ vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vpxor %xmm7,%xmm1,%xmm1 |
+ |
+ vmovdqu 64(%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vmovdqu 48(%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 |
+ vpalignr $8,%xmm1,%xmm1,%xmm1 |
+ |
+ vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vpxor %xmm7,%xmm1,%xmm1 |
+ |
+ vmovdqu 32(%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vpxor %xmm9,%xmm1,%xmm1 |
+ |
+ vmovdqu 16(%rsi),%xmm0 |
+ |
+ vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vmovdqu 0(%rsi),%xmm0 |
+ vpxor %xmm1,%xmm0,%xmm0 |
+ |
+ vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm3,%xmm3 |
+ vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm4,%xmm4 |
+ vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 |
+ vpxor %xmm6,%xmm5,%xmm5 |
+ |
+ |
+ vpsrldq $8,%xmm5,%xmm6 |
+ vpslldq $8,%xmm5,%xmm5 |
+ |
+ vpxor %xmm6,%xmm4,%xmm9 |
+ vpxor %xmm5,%xmm3,%xmm1 |
+ |
+ leaq 128(%rsi),%rsi |
+ jmp .Lhtable_polyval_main_loop |
+ |
+ |
+ |
+.Lhtable_polyval_out: |
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 |
+ vpalignr $8,%xmm1,%xmm1,%xmm1 |
+ vpxor %xmm6,%xmm1,%xmm1 |
+ |
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 |
+ vpalignr $8,%xmm1,%xmm1,%xmm1 |
+ vpxor %xmm6,%xmm1,%xmm1 |
+ vpxor %xmm9,%xmm1,%xmm1 |
+ |
+ vmovdqu %xmm1,(%rcx) |
+ vzeroupper |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval |
+.globl aesgcmsiv_polyval_horner |
+.hidden aesgcmsiv_polyval_horner |
+.type aesgcmsiv_polyval_horner,@function |
+.align 16 |
+aesgcmsiv_polyval_horner: |
+.cfi_startproc |
+ testq %rcx,%rcx |
+ jnz .Lpolyval_horner_start |
+ .byte 0xf3,0xc3 |
+ |
+.Lpolyval_horner_start: |
+ |
+ |
+ |
+ xorq %r10,%r10 |
+ shlq $4,%rcx |
+ |
+ vmovdqa (%rsi),%xmm1 |
+ vmovdqa (%rdi),%xmm0 |
+ |
+.Lpolyval_horner_loop: |
+ vpxor (%rdx,%r10,1),%xmm0,%xmm0 |
+ call GFMUL |
+ |
+ addq $16,%r10 |
+ cmpq %r10,%rcx |
+ jne .Lpolyval_horner_loop |
+ |
+ |
+ vmovdqa %xmm0,(%rdi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner |
+.globl aes128gcmsiv_aes_ks |
+.hidden aes128gcmsiv_aes_ks |
+.type aes128gcmsiv_aes_ks,@function |
+.align 16 |
+aes128gcmsiv_aes_ks: |
+.cfi_startproc |
+ vmovdqa (%rdi),%xmm1 |
+ vmovdqa %xmm1,(%rsi) |
+ |
+ vmovdqa con1(%rip),%xmm0 |
+ vmovdqa mask(%rip),%xmm15 |
+ |
+ movq $8,%rax |
+ |
+.Lks128_loop: |
+ addq $16,%rsi |
+ subq $1,%rax |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpslldq $4,%xmm3,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpslldq $4,%xmm3,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vmovdqa %xmm1,(%rsi) |
+ jne .Lks128_loop |
+ |
+ vmovdqa con2(%rip),%xmm0 |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpslldq $4,%xmm3,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpslldq $4,%xmm3,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vmovdqa %xmm1,16(%rsi) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslldq $4,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpslldq $4,%xmm3,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpslldq $4,%xmm3,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vmovdqa %xmm1,32(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks |
+.globl aes256gcmsiv_aes_ks |
+.hidden aes256gcmsiv_aes_ks |
+.type aes256gcmsiv_aes_ks,@function |
+.align 16 |
+aes256gcmsiv_aes_ks: |
+.cfi_startproc |
+ vmovdqa (%rdi),%xmm1 |
+ vmovdqa 16(%rdi),%xmm3 |
+ vmovdqa %xmm1,(%rsi) |
+ vmovdqa %xmm3,16(%rsi) |
+ vmovdqa con1(%rip),%xmm0 |
+ vmovdqa mask(%rip),%xmm15 |
+ vpxor %xmm14,%xmm14,%xmm14 |
+ movq $6,%rax |
+ |
+.Lks256_loop: |
+ addq $32,%rsi |
+ subq $1,%rax |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vmovdqa %xmm1,(%rsi) |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpsllq $32,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpshufb con3(%rip),%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vmovdqa %xmm3,16(%rsi) |
+ jne .Lks256_loop |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpsllq $32,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vmovdqa %xmm1,32(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.globl aes128gcmsiv_aes_ks_enc_x1 |
+.hidden aes128gcmsiv_aes_ks_enc_x1 |
+.type aes128gcmsiv_aes_ks_enc_x1,@function |
+.align 16 |
+aes128gcmsiv_aes_ks_enc_x1: |
+.cfi_startproc |
+ vmovdqa (%rcx),%xmm1 |
+ vmovdqa 0(%rdi),%xmm4 |
+ |
+ vmovdqa %xmm1,(%rdx) |
+ vpxor %xmm1,%xmm4,%xmm4 |
+ |
+ vmovdqa con1(%rip),%xmm0 |
+ vmovdqa mask(%rip),%xmm15 |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,16(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,32(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,48(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,64(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,80(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,96(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,112(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,128(%rdx) |
+ |
+ |
+ vmovdqa con2(%rip),%xmm0 |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,144(%rdx) |
+ |
+ vpshufb %xmm15,%xmm1,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpsllq $32,%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpshufb con3(%rip),%xmm1,%xmm3 |
+ vpxor %xmm3,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ |
+ vaesenclast %xmm1,%xmm4,%xmm4 |
+ vmovdqa %xmm1,160(%rdx) |
+ |
+ |
+ vmovdqa %xmm4,0(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 |
+.globl aes128gcmsiv_kdf |
+.hidden aes128gcmsiv_kdf |
+.type aes128gcmsiv_kdf,@function |
+.align 16 |
+aes128gcmsiv_kdf: |
+.cfi_startproc |
+ |
+ |
+ |
+ |
+ vmovdqa (%rdx),%xmm1 |
+ vmovdqa 0(%rdi),%xmm9 |
+ vmovdqa and_mask(%rip),%xmm12 |
+ vmovdqa one(%rip),%xmm13 |
+ vpshufd $0x90,%xmm9,%xmm9 |
+ vpand %xmm12,%xmm9,%xmm9 |
+ vpaddd %xmm13,%xmm9,%xmm10 |
+ vpaddd %xmm13,%xmm10,%xmm11 |
+ vpaddd %xmm13,%xmm11,%xmm12 |
+ |
+ vpxor %xmm1,%xmm9,%xmm9 |
+ vpxor %xmm1,%xmm10,%xmm10 |
+ vpxor %xmm1,%xmm11,%xmm11 |
+ vpxor %xmm1,%xmm12,%xmm12 |
+ |
+ vmovdqa 16(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm9,%xmm9 |
+ vaesenc %xmm1,%xmm10,%xmm10 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ |
+ vmovdqa 32(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm9,%xmm9 |
+ vaesenc %xmm2,%xmm10,%xmm10 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ |
+ vmovdqa 48(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm9,%xmm9 |
+ vaesenc %xmm1,%xmm10,%xmm10 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ |
+ vmovdqa 64(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm9,%xmm9 |
+ vaesenc %xmm2,%xmm10,%xmm10 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ |
+ vmovdqa 80(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm9,%xmm9 |
+ vaesenc %xmm1,%xmm10,%xmm10 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ |
+ vmovdqa 96(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm9,%xmm9 |
+ vaesenc %xmm2,%xmm10,%xmm10 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ |
+ vmovdqa 112(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm9,%xmm9 |
+ vaesenc %xmm1,%xmm10,%xmm10 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ |
+ vmovdqa 128(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm9,%xmm9 |
+ vaesenc %xmm2,%xmm10,%xmm10 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ |
+ vmovdqa 144(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm9,%xmm9 |
+ vaesenc %xmm1,%xmm10,%xmm10 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ |
+ vmovdqa 160(%rdx),%xmm2 |
+ vaesenclast %xmm2,%xmm9,%xmm9 |
+ vaesenclast %xmm2,%xmm10,%xmm10 |
+ vaesenclast %xmm2,%xmm11,%xmm11 |
+ vaesenclast %xmm2,%xmm12,%xmm12 |
+ |
+ |
+ vmovdqa %xmm9,0(%rsi) |
+ vmovdqa %xmm10,16(%rsi) |
+ vmovdqa %xmm11,32(%rsi) |
+ vmovdqa %xmm12,48(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf |
+.globl aes128gcmsiv_enc_msg_x4 |
+.hidden aes128gcmsiv_enc_msg_x4 |
+.type aes128gcmsiv_enc_msg_x4,@function |
+.align 16 |
+aes128gcmsiv_enc_msg_x4: |
+.cfi_startproc |
+ testq %r8,%r8 |
+ jnz .L128_enc_msg_x4_start |
+ .byte 0xf3,0xc3 |
+ |
+.L128_enc_msg_x4_start: |
+ pushq %r12 |
+.cfi_adjust_cfa_offset 8 |
+.cfi_offset %r12,-16 |
+ pushq %r13 |
+.cfi_adjust_cfa_offset 8 |
+.cfi_offset %r13,-24 |
+ |
+ shrq $4,%r8 |
+ movq %r8,%r10 |
+ shlq $62,%r10 |
+ shrq $62,%r10 |
+ |
+ |
+ vmovdqa (%rdx),%xmm15 |
+ vpor OR_MASK(%rip),%xmm15,%xmm15 |
+ |
+ vmovdqu four(%rip),%xmm4 |
+ vmovdqa %xmm15,%xmm0 |
+ vpaddd one(%rip),%xmm15,%xmm1 |
+ vpaddd two(%rip),%xmm15,%xmm2 |
+ vpaddd three(%rip),%xmm15,%xmm3 |
+ |
+ shrq $2,%r8 |
+ je .L128_enc_msg_x4_check_remainder |
+ |
+ subq $64,%rsi |
+ subq $64,%rdi |
+ |
+.L128_enc_msg_x4_loop1: |
+ addq $64,%rsi |
+ addq $64,%rdi |
+ |
+ vmovdqa %xmm0,%xmm5 |
+ vmovdqa %xmm1,%xmm6 |
+ vmovdqa %xmm2,%xmm7 |
+ vmovdqa %xmm3,%xmm8 |
+ |
+ vpxor (%rcx),%xmm5,%xmm5 |
+ vpxor (%rcx),%xmm6,%xmm6 |
+ vpxor (%rcx),%xmm7,%xmm7 |
+ vpxor (%rcx),%xmm8,%xmm8 |
+ |
+ vmovdqu 16(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm0,%xmm0 |
+ vmovdqu 32(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm1,%xmm1 |
+ vmovdqu 48(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm2,%xmm2 |
+ vmovdqu 64(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm3,%xmm3 |
+ |
+ vmovdqu 80(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 96(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 112(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 128(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 144(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 160(%rcx),%xmm12 |
+ vaesenclast %xmm12,%xmm5,%xmm5 |
+ vaesenclast %xmm12,%xmm6,%xmm6 |
+ vaesenclast %xmm12,%xmm7,%xmm7 |
+ vaesenclast %xmm12,%xmm8,%xmm8 |
+ |
+ |
+ |
+ vpxor 0(%rdi),%xmm5,%xmm5 |
+ vpxor 16(%rdi),%xmm6,%xmm6 |
+ vpxor 32(%rdi),%xmm7,%xmm7 |
+ vpxor 48(%rdi),%xmm8,%xmm8 |
+ |
+ subq $1,%r8 |
+ |
+ vmovdqu %xmm5,0(%rsi) |
+ vmovdqu %xmm6,16(%rsi) |
+ vmovdqu %xmm7,32(%rsi) |
+ vmovdqu %xmm8,48(%rsi) |
+ |
+ jne .L128_enc_msg_x4_loop1 |
+ |
+ addq $64,%rsi |
+ addq $64,%rdi |
+ |
+.L128_enc_msg_x4_check_remainder: |
+ cmpq $0,%r10 |
+ je .L128_enc_msg_x4_out |
+ |
+.L128_enc_msg_x4_loop2: |
+ |
+ |
+ vmovdqa %xmm0,%xmm5 |
+ vpaddd one(%rip),%xmm0,%xmm0 |
+ |
+ vpxor (%rcx),%xmm5,%xmm5 |
+ vaesenc 16(%rcx),%xmm5,%xmm5 |
+ vaesenc 32(%rcx),%xmm5,%xmm5 |
+ vaesenc 48(%rcx),%xmm5,%xmm5 |
+ vaesenc 64(%rcx),%xmm5,%xmm5 |
+ vaesenc 80(%rcx),%xmm5,%xmm5 |
+ vaesenc 96(%rcx),%xmm5,%xmm5 |
+ vaesenc 112(%rcx),%xmm5,%xmm5 |
+ vaesenc 128(%rcx),%xmm5,%xmm5 |
+ vaesenc 144(%rcx),%xmm5,%xmm5 |
+ vaesenclast 160(%rcx),%xmm5,%xmm5 |
+ |
+ |
+ vpxor (%rdi),%xmm5,%xmm5 |
+ vmovdqu %xmm5,(%rsi) |
+ |
+ addq $16,%rdi |
+ addq $16,%rsi |
+ |
+ subq $1,%r10 |
+ jne .L128_enc_msg_x4_loop2 |
+ |
+.L128_enc_msg_x4_out: |
+ popq %r13 |
+.cfi_adjust_cfa_offset -8 |
+.cfi_restore %r13 |
+ popq %r12 |
+.cfi_adjust_cfa_offset -8 |
+.cfi_restore %r12 |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 |
+.globl aes128gcmsiv_enc_msg_x8 |
+.hidden aes128gcmsiv_enc_msg_x8 |
+.type aes128gcmsiv_enc_msg_x8,@function |
+.align 16 |
+aes128gcmsiv_enc_msg_x8: |
+.cfi_startproc |
+ testq %r8,%r8 |
+ jnz .L128_enc_msg_x8_start |
+ .byte 0xf3,0xc3 |
+ |
+.L128_enc_msg_x8_start: |
+ pushq %r12 |
+.cfi_adjust_cfa_offset 8 |
+.cfi_offset %r12,-16 |
+ pushq %r13 |
+.cfi_adjust_cfa_offset 8 |
+.cfi_offset %r13,-24 |
+ pushq %rbp |
+.cfi_adjust_cfa_offset 8 |
+.cfi_offset %rbp,-32 |
+ movq %rsp,%rbp |
+.cfi_def_cfa_register rbp |
+ |
+ |
+ subq $128,%rsp |
+ andq $-64,%rsp |
+ |
+ shrq $4,%r8 |
+ movq %r8,%r10 |
+ shlq $61,%r10 |
+ shrq $61,%r10 |
+ |
+ |
+ vmovdqu (%rdx),%xmm1 |
+ vpor OR_MASK(%rip),%xmm1,%xmm1 |
+ |
+ |
+ vpaddd seven(%rip),%xmm1,%xmm0 |
+ vmovdqu %xmm0,(%rsp) |
+ vpaddd one(%rip),%xmm1,%xmm9 |
+ vpaddd two(%rip),%xmm1,%xmm10 |
+ vpaddd three(%rip),%xmm1,%xmm11 |
+ vpaddd four(%rip),%xmm1,%xmm12 |
+ vpaddd five(%rip),%xmm1,%xmm13 |
+ vpaddd six(%rip),%xmm1,%xmm14 |
+ vmovdqa %xmm1,%xmm0 |
+ |
+ shrq $3,%r8 |
+ je .L128_enc_msg_x8_check_remainder |
+ |
+ subq $128,%rsi |
+ subq $128,%rdi |
+ |
+.L128_enc_msg_x8_loop1: |
+ addq $128,%rsi |
+ addq $128,%rdi |
+ |
+ vmovdqa %xmm0,%xmm1 |
+ vmovdqa %xmm9,%xmm2 |
+ vmovdqa %xmm10,%xmm3 |
+ vmovdqa %xmm11,%xmm4 |
+ vmovdqa %xmm12,%xmm5 |
+ vmovdqa %xmm13,%xmm6 |
+ vmovdqa %xmm14,%xmm7 |
+ |
+ vmovdqu (%rsp),%xmm8 |
+ |
+ vpxor (%rcx),%xmm1,%xmm1 |
+ vpxor (%rcx),%xmm2,%xmm2 |
+ vpxor (%rcx),%xmm3,%xmm3 |
+ vpxor (%rcx),%xmm4,%xmm4 |
+ vpxor (%rcx),%xmm5,%xmm5 |
+ vpxor (%rcx),%xmm6,%xmm6 |
+ vpxor (%rcx),%xmm7,%xmm7 |
+ vpxor (%rcx),%xmm8,%xmm8 |
+ |
+ vmovdqu 16(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu (%rsp),%xmm14 |
+ vpaddd eight(%rip),%xmm14,%xmm14 |
+ vmovdqu %xmm14,(%rsp) |
+ vmovdqu 32(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpsubd one(%rip),%xmm14,%xmm14 |
+ vmovdqu 48(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm0,%xmm0 |
+ vmovdqu 64(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm9,%xmm9 |
+ vmovdqu 80(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm10,%xmm10 |
+ vmovdqu 96(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm11,%xmm11 |
+ vmovdqu 112(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm12,%xmm12 |
+ vmovdqu 128(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm13,%xmm13 |
+ vmovdqu 144(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu 160(%rcx),%xmm15 |
+ vaesenclast %xmm15,%xmm1,%xmm1 |
+ vaesenclast %xmm15,%xmm2,%xmm2 |
+ vaesenclast %xmm15,%xmm3,%xmm3 |
+ vaesenclast %xmm15,%xmm4,%xmm4 |
+ vaesenclast %xmm15,%xmm5,%xmm5 |
+ vaesenclast %xmm15,%xmm6,%xmm6 |
+ vaesenclast %xmm15,%xmm7,%xmm7 |
+ vaesenclast %xmm15,%xmm8,%xmm8 |
+ |
+ |
+ |
+ vpxor 0(%rdi),%xmm1,%xmm1 |
+ vpxor 16(%rdi),%xmm2,%xmm2 |
+ vpxor 32(%rdi),%xmm3,%xmm3 |
+ vpxor 48(%rdi),%xmm4,%xmm4 |
+ vpxor 64(%rdi),%xmm5,%xmm5 |
+ vpxor 80(%rdi),%xmm6,%xmm6 |
+ vpxor 96(%rdi),%xmm7,%xmm7 |
+ vpxor 112(%rdi),%xmm8,%xmm8 |
+ |
+ decq %r8 |
+ |
+ vmovdqu %xmm1,0(%rsi) |
+ vmovdqu %xmm2,16(%rsi) |
+ vmovdqu %xmm3,32(%rsi) |
+ vmovdqu %xmm4,48(%rsi) |
+ vmovdqu %xmm5,64(%rsi) |
+ vmovdqu %xmm6,80(%rsi) |
+ vmovdqu %xmm7,96(%rsi) |
+ vmovdqu %xmm8,112(%rsi) |
+ |
+ jne .L128_enc_msg_x8_loop1 |
+ |
+ addq $128,%rsi |
+ addq $128,%rdi |
+ |
+.L128_enc_msg_x8_check_remainder: |
+ cmpq $0,%r10 |
+ je .L128_enc_msg_x8_out |
+ |
+.L128_enc_msg_x8_loop2: |
+ |
+ |
+ vmovdqa %xmm0,%xmm1 |
+ vpaddd one(%rip),%xmm0,%xmm0 |
+ |
+ vpxor (%rcx),%xmm1,%xmm1 |
+ vaesenc 16(%rcx),%xmm1,%xmm1 |
+ vaesenc 32(%rcx),%xmm1,%xmm1 |
+ vaesenc 48(%rcx),%xmm1,%xmm1 |
+ vaesenc 64(%rcx),%xmm1,%xmm1 |
+ vaesenc 80(%rcx),%xmm1,%xmm1 |
+ vaesenc 96(%rcx),%xmm1,%xmm1 |
+ vaesenc 112(%rcx),%xmm1,%xmm1 |
+ vaesenc 128(%rcx),%xmm1,%xmm1 |
+ vaesenc 144(%rcx),%xmm1,%xmm1 |
+ vaesenclast 160(%rcx),%xmm1,%xmm1 |
+ |
+ |
+ vpxor (%rdi),%xmm1,%xmm1 |
+ |
+ vmovdqu %xmm1,(%rsi) |
+ |
+ addq $16,%rdi |
+ addq $16,%rsi |
+ |
+ decq %r10 |
+ jne .L128_enc_msg_x8_loop2 |
+ |
+.L128_enc_msg_x8_out: |
+ movq %rbp,%rsp |
+.cfi_def_cfa_register %rsp |
+ popq %rbp |
+.cfi_adjust_cfa_offset -8 |
+.cfi_restore %rbp |
+ popq %r13 |
+.cfi_adjust_cfa_offset -8 |
+.cfi_restore %r13 |
+ popq %r12 |
+.cfi_adjust_cfa_offset -8 |
+.cfi_restore %r12 |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 |
+.globl aes128gcmsiv_dec |
+.hidden aes128gcmsiv_dec |
+.type aes128gcmsiv_dec,@function |
+.align 16 |
+aes128gcmsiv_dec: |
+.cfi_startproc |
+ testq $~15,%r9 |
+ jnz .L128_dec_start |
+ .byte 0xf3,0xc3 |
+ |
+.L128_dec_start: |
+ vzeroupper |
+ vmovdqa (%rdx),%xmm0 |
+ movq %rdx,%rax |
+ |
+ leaq 32(%rax),%rax |
+ leaq 32(%rcx),%rcx |
+ |
+ |
+ vmovdqu (%rdi,%r9,1),%xmm15 |
+ vpor OR_MASK(%rip),%xmm15,%xmm15 |
+ andq $~15,%r9 |
+ |
+ |
+ cmpq $96,%r9 |
+ jb .L128_dec_loop2 |
+ |
+ |
+ subq $96,%r9 |
+ vmovdqa %xmm15,%xmm7 |
+ vpaddd one(%rip),%xmm7,%xmm8 |
+ vpaddd two(%rip),%xmm7,%xmm9 |
+ vpaddd one(%rip),%xmm9,%xmm10 |
+ vpaddd two(%rip),%xmm9,%xmm11 |
+ vpaddd one(%rip),%xmm11,%xmm12 |
+ vpaddd two(%rip),%xmm11,%xmm15 |
+ |
+ vpxor (%r8),%xmm7,%xmm7 |
+ vpxor (%r8),%xmm8,%xmm8 |
+ vpxor (%r8),%xmm9,%xmm9 |
+ vpxor (%r8),%xmm10,%xmm10 |
+ vpxor (%r8),%xmm11,%xmm11 |
+ vpxor (%r8),%xmm12,%xmm12 |
+ |
+ vmovdqu 16(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 32(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 48(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 64(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 80(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 96(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 112(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 128(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 144(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 160(%r8),%xmm4 |
+ vaesenclast %xmm4,%xmm7,%xmm7 |
+ vaesenclast %xmm4,%xmm8,%xmm8 |
+ vaesenclast %xmm4,%xmm9,%xmm9 |
+ vaesenclast %xmm4,%xmm10,%xmm10 |
+ vaesenclast %xmm4,%xmm11,%xmm11 |
+ vaesenclast %xmm4,%xmm12,%xmm12 |
+ |
+ |
+ vpxor 0(%rdi),%xmm7,%xmm7 |
+ vpxor 16(%rdi),%xmm8,%xmm8 |
+ vpxor 32(%rdi),%xmm9,%xmm9 |
+ vpxor 48(%rdi),%xmm10,%xmm10 |
+ vpxor 64(%rdi),%xmm11,%xmm11 |
+ vpxor 80(%rdi),%xmm12,%xmm12 |
+ |
+ vmovdqu %xmm7,0(%rsi) |
+ vmovdqu %xmm8,16(%rsi) |
+ vmovdqu %xmm9,32(%rsi) |
+ vmovdqu %xmm10,48(%rsi) |
+ vmovdqu %xmm11,64(%rsi) |
+ vmovdqu %xmm12,80(%rsi) |
+ |
+ addq $96,%rdi |
+ addq $96,%rsi |
+ jmp .L128_dec_loop1 |
+ |
+ |
+.align 64 |
+.L128_dec_loop1: |
+ cmpq $96,%r9 |
+ jb .L128_dec_finish_96 |
+ subq $96,%r9 |
+ |
+ vmovdqa %xmm12,%xmm6 |
+ vmovdqa %xmm11,16-32(%rax) |
+ vmovdqa %xmm10,32-32(%rax) |
+ vmovdqa %xmm9,48-32(%rax) |
+ vmovdqa %xmm8,64-32(%rax) |
+ vmovdqa %xmm7,80-32(%rax) |
+ |
+ vmovdqa %xmm15,%xmm7 |
+ vpaddd one(%rip),%xmm7,%xmm8 |
+ vpaddd two(%rip),%xmm7,%xmm9 |
+ vpaddd one(%rip),%xmm9,%xmm10 |
+ vpaddd two(%rip),%xmm9,%xmm11 |
+ vpaddd one(%rip),%xmm11,%xmm12 |
+ vpaddd two(%rip),%xmm11,%xmm15 |
+ |
+ vmovdqa (%r8),%xmm4 |
+ vpxor %xmm4,%xmm7,%xmm7 |
+ vpxor %xmm4,%xmm8,%xmm8 |
+ vpxor %xmm4,%xmm9,%xmm9 |
+ vpxor %xmm4,%xmm10,%xmm10 |
+ vpxor %xmm4,%xmm11,%xmm11 |
+ vpxor %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 0-32(%rcx),%xmm4 |
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 |
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 |
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 |
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 16(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu -16(%rax),%xmm6 |
+ vmovdqu -16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 32(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 0(%rax),%xmm6 |
+ vmovdqu 0(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 48(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 16(%rax),%xmm6 |
+ vmovdqu 16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 64(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 32(%rax),%xmm6 |
+ vmovdqu 32(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 80(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 96(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 112(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ |
+ vmovdqa 80-32(%rax),%xmm6 |
+ vpxor %xmm0,%xmm6,%xmm6 |
+ vmovdqu 80-32(%rcx),%xmm5 |
+ |
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 128(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ |
+ vpsrldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm5 |
+ vpslldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm0 |
+ |
+ vmovdqa poly(%rip),%xmm3 |
+ |
+ vmovdqu 144(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 160(%r8),%xmm6 |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vpxor 0(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm7,%xmm7 |
+ vpxor 16(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm8,%xmm8 |
+ vpxor 32(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm9,%xmm9 |
+ vpxor 48(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm10,%xmm10 |
+ vpxor 64(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm11,%xmm11 |
+ vpxor 80(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm12,%xmm12 |
+ |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vmovdqu %xmm7,0(%rsi) |
+ vmovdqu %xmm8,16(%rsi) |
+ vmovdqu %xmm9,32(%rsi) |
+ vmovdqu %xmm10,48(%rsi) |
+ vmovdqu %xmm11,64(%rsi) |
+ vmovdqu %xmm12,80(%rsi) |
+ |
+ vpxor %xmm5,%xmm0,%xmm0 |
+ |
+ leaq 96(%rdi),%rdi |
+ leaq 96(%rsi),%rsi |
+ jmp .L128_dec_loop1 |
+ |
+.L128_dec_finish_96: |
+ vmovdqa %xmm12,%xmm6 |
+ vmovdqa %xmm11,16-32(%rax) |
+ vmovdqa %xmm10,32-32(%rax) |
+ vmovdqa %xmm9,48-32(%rax) |
+ vmovdqa %xmm8,64-32(%rax) |
+ vmovdqa %xmm7,80-32(%rax) |
+ |
+ vmovdqu 0-32(%rcx),%xmm4 |
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 |
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 |
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 |
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu -16(%rax),%xmm6 |
+ vmovdqu -16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 0(%rax),%xmm6 |
+ vmovdqu 0(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 16(%rax),%xmm6 |
+ vmovdqu 16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 32(%rax),%xmm6 |
+ vmovdqu 32(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 80-32(%rax),%xmm6 |
+ vpxor %xmm0,%xmm6,%xmm6 |
+ vmovdqu 80-32(%rcx),%xmm5 |
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vpsrldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm5 |
+ vpslldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm0 |
+ |
+ vmovdqa poly(%rip),%xmm3 |
+ |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vpxor %xmm5,%xmm0,%xmm0 |
+ |
+.L128_dec_loop2: |
+ |
+ |
+ |
+ cmpq $16,%r9 |
+ jb .L128_dec_out |
+ subq $16,%r9 |
+ |
+ vmovdqa %xmm15,%xmm2 |
+ vpaddd one(%rip),%xmm15,%xmm15 |
+ |
+ vpxor 0(%r8),%xmm2,%xmm2 |
+ vaesenc 16(%r8),%xmm2,%xmm2 |
+ vaesenc 32(%r8),%xmm2,%xmm2 |
+ vaesenc 48(%r8),%xmm2,%xmm2 |
+ vaesenc 64(%r8),%xmm2,%xmm2 |
+ vaesenc 80(%r8),%xmm2,%xmm2 |
+ vaesenc 96(%r8),%xmm2,%xmm2 |
+ vaesenc 112(%r8),%xmm2,%xmm2 |
+ vaesenc 128(%r8),%xmm2,%xmm2 |
+ vaesenc 144(%r8),%xmm2,%xmm2 |
+ vaesenclast 160(%r8),%xmm2,%xmm2 |
+ vpxor (%rdi),%xmm2,%xmm2 |
+ vmovdqu %xmm2,(%rsi) |
+ addq $16,%rdi |
+ addq $16,%rsi |
+ |
+ vpxor %xmm2,%xmm0,%xmm0 |
+ vmovdqa -32(%rcx),%xmm1 |
+ call GFMUL |
+ |
+ jmp .L128_dec_loop2 |
+ |
+.L128_dec_out: |
+ vmovdqu %xmm0,(%rdx) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_dec, .-aes128gcmsiv_dec |
+.globl aes128gcmsiv_ecb_enc_block |
+.hidden aes128gcmsiv_ecb_enc_block |
+.type aes128gcmsiv_ecb_enc_block,@function |
+.align 16 |
+aes128gcmsiv_ecb_enc_block: |
+.cfi_startproc |
+ vmovdqa (%rdi),%xmm1 |
+ |
+ vpxor (%rdx),%xmm1,%xmm1 |
+ vaesenc 16(%rdx),%xmm1,%xmm1 |
+ vaesenc 32(%rdx),%xmm1,%xmm1 |
+ vaesenc 48(%rdx),%xmm1,%xmm1 |
+ vaesenc 64(%rdx),%xmm1,%xmm1 |
+ vaesenc 80(%rdx),%xmm1,%xmm1 |
+ vaesenc 96(%rdx),%xmm1,%xmm1 |
+ vaesenc 112(%rdx),%xmm1,%xmm1 |
+ vaesenc 128(%rdx),%xmm1,%xmm1 |
+ vaesenc 144(%rdx),%xmm1,%xmm1 |
+ vaesenclast 160(%rdx),%xmm1,%xmm1 |
+ |
+ vmovdqa %xmm1,(%rsi) |
+ |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block |
+.globl aes256gcmsiv_aes_ks_enc_x1 |
+.hidden aes256gcmsiv_aes_ks_enc_x1 |
+.type aes256gcmsiv_aes_ks_enc_x1,@function |
+.align 16 |
+aes256gcmsiv_aes_ks_enc_x1: |
+.cfi_startproc |
+ vmovdqa con1(%rip),%xmm0 |
+ vmovdqa mask(%rip),%xmm15 |
+ vmovdqa (%rdi),%xmm8 |
+ vmovdqa (%rcx),%xmm1 |
+ vmovdqa 16(%rcx),%xmm3 |
+ vpxor %xmm1,%xmm8,%xmm8 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm1,(%rdx) |
+ vmovdqu %xmm3,16(%rdx) |
+ vpxor %xmm14,%xmm14,%xmm14 |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenc %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,32(%rdx) |
+ |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpslldq $4,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm3,48(%rdx) |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenc %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,64(%rdx) |
+ |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpslldq $4,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm3,80(%rdx) |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenc %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,96(%rdx) |
+ |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpslldq $4,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm3,112(%rdx) |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenc %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,128(%rdx) |
+ |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpslldq $4,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm3,144(%rdx) |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenc %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,160(%rdx) |
+ |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpslldq $4,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm3,176(%rdx) |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslld $1,%xmm0,%xmm0 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenc %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,192(%rdx) |
+ |
+ vpshufd $0xff,%xmm1,%xmm2 |
+ vaesenclast %xmm14,%xmm2,%xmm2 |
+ vpslldq $4,%xmm3,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpxor %xmm2,%xmm3,%xmm3 |
+ vaesenc %xmm3,%xmm8,%xmm8 |
+ vmovdqu %xmm3,208(%rdx) |
+ |
+ vpshufb %xmm15,%xmm3,%xmm2 |
+ vaesenclast %xmm0,%xmm2,%xmm2 |
+ vpslldq $4,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpslldq $4,%xmm4,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpxor %xmm2,%xmm1,%xmm1 |
+ vaesenclast %xmm1,%xmm8,%xmm8 |
+ vmovdqu %xmm1,224(%rdx) |
+ |
+ vmovdqa %xmm8,(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 |
+.globl aes256gcmsiv_ecb_enc_block |
+.hidden aes256gcmsiv_ecb_enc_block |
+.type aes256gcmsiv_ecb_enc_block,@function |
+.align 16 |
+aes256gcmsiv_ecb_enc_block: |
+.cfi_startproc |
+ vmovdqa (%rdi),%xmm1 |
+ vpxor (%rdx),%xmm1,%xmm1 |
+ vaesenc 16(%rdx),%xmm1,%xmm1 |
+ vaesenc 32(%rdx),%xmm1,%xmm1 |
+ vaesenc 48(%rdx),%xmm1,%xmm1 |
+ vaesenc 64(%rdx),%xmm1,%xmm1 |
+ vaesenc 80(%rdx),%xmm1,%xmm1 |
+ vaesenc 96(%rdx),%xmm1,%xmm1 |
+ vaesenc 112(%rdx),%xmm1,%xmm1 |
+ vaesenc 128(%rdx),%xmm1,%xmm1 |
+ vaesenc 144(%rdx),%xmm1,%xmm1 |
+ vaesenc 160(%rdx),%xmm1,%xmm1 |
+ vaesenc 176(%rdx),%xmm1,%xmm1 |
+ vaesenc 192(%rdx),%xmm1,%xmm1 |
+ vaesenc 208(%rdx),%xmm1,%xmm1 |
+ vaesenclast 224(%rdx),%xmm1,%xmm1 |
+ vmovdqa %xmm1,(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block |
+.globl aes256gcmsiv_enc_msg_x4 |
+.hidden aes256gcmsiv_enc_msg_x4 |
+.type aes256gcmsiv_enc_msg_x4,@function |
+.align 16 |
+aes256gcmsiv_enc_msg_x4: |
+.cfi_startproc |
+ testq %r8,%r8 |
+ jnz .L256_enc_msg_x4_start |
+ .byte 0xf3,0xc3 |
+ |
+.L256_enc_msg_x4_start: |
+ movq %r8,%r10 |
+ shrq $4,%r8 |
+ shlq $60,%r10 |
+ jz .L256_enc_msg_x4_start2 |
+ addq $1,%r8 |
+ |
+.L256_enc_msg_x4_start2: |
+ movq %r8,%r10 |
+ shlq $62,%r10 |
+ shrq $62,%r10 |
+ |
+ |
+ vmovdqa (%rdx),%xmm15 |
+ vpor OR_MASK(%rip),%xmm15,%xmm15 |
+ |
+ vmovdqa four(%rip),%xmm4 |
+ vmovdqa %xmm15,%xmm0 |
+ vpaddd one(%rip),%xmm15,%xmm1 |
+ vpaddd two(%rip),%xmm15,%xmm2 |
+ vpaddd three(%rip),%xmm15,%xmm3 |
+ |
+ shrq $2,%r8 |
+ je .L256_enc_msg_x4_check_remainder |
+ |
+ subq $64,%rsi |
+ subq $64,%rdi |
+ |
+.L256_enc_msg_x4_loop1: |
+ addq $64,%rsi |
+ addq $64,%rdi |
+ |
+ vmovdqa %xmm0,%xmm5 |
+ vmovdqa %xmm1,%xmm6 |
+ vmovdqa %xmm2,%xmm7 |
+ vmovdqa %xmm3,%xmm8 |
+ |
+ vpxor (%rcx),%xmm5,%xmm5 |
+ vpxor (%rcx),%xmm6,%xmm6 |
+ vpxor (%rcx),%xmm7,%xmm7 |
+ vpxor (%rcx),%xmm8,%xmm8 |
+ |
+ vmovdqu 16(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm0,%xmm0 |
+ vmovdqu 32(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm1,%xmm1 |
+ vmovdqu 48(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm2,%xmm2 |
+ vmovdqu 64(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vpaddd %xmm4,%xmm3,%xmm3 |
+ |
+ vmovdqu 80(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 96(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 112(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 128(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 144(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 160(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 176(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 192(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 208(%rcx),%xmm12 |
+ vaesenc %xmm12,%xmm5,%xmm5 |
+ vaesenc %xmm12,%xmm6,%xmm6 |
+ vaesenc %xmm12,%xmm7,%xmm7 |
+ vaesenc %xmm12,%xmm8,%xmm8 |
+ |
+ vmovdqu 224(%rcx),%xmm12 |
+ vaesenclast %xmm12,%xmm5,%xmm5 |
+ vaesenclast %xmm12,%xmm6,%xmm6 |
+ vaesenclast %xmm12,%xmm7,%xmm7 |
+ vaesenclast %xmm12,%xmm8,%xmm8 |
+ |
+ |
+ |
+ vpxor 0(%rdi),%xmm5,%xmm5 |
+ vpxor 16(%rdi),%xmm6,%xmm6 |
+ vpxor 32(%rdi),%xmm7,%xmm7 |
+ vpxor 48(%rdi),%xmm8,%xmm8 |
+ |
+ subq $1,%r8 |
+ |
+ vmovdqu %xmm5,0(%rsi) |
+ vmovdqu %xmm6,16(%rsi) |
+ vmovdqu %xmm7,32(%rsi) |
+ vmovdqu %xmm8,48(%rsi) |
+ |
+ jne .L256_enc_msg_x4_loop1 |
+ |
+ addq $64,%rsi |
+ addq $64,%rdi |
+ |
+.L256_enc_msg_x4_check_remainder: |
+ cmpq $0,%r10 |
+ je .L256_enc_msg_x4_out |
+ |
+.L256_enc_msg_x4_loop2: |
+ |
+ |
+ |
+ vmovdqa %xmm0,%xmm5 |
+ vpaddd one(%rip),%xmm0,%xmm0 |
+ vpxor (%rcx),%xmm5,%xmm5 |
+ vaesenc 16(%rcx),%xmm5,%xmm5 |
+ vaesenc 32(%rcx),%xmm5,%xmm5 |
+ vaesenc 48(%rcx),%xmm5,%xmm5 |
+ vaesenc 64(%rcx),%xmm5,%xmm5 |
+ vaesenc 80(%rcx),%xmm5,%xmm5 |
+ vaesenc 96(%rcx),%xmm5,%xmm5 |
+ vaesenc 112(%rcx),%xmm5,%xmm5 |
+ vaesenc 128(%rcx),%xmm5,%xmm5 |
+ vaesenc 144(%rcx),%xmm5,%xmm5 |
+ vaesenc 160(%rcx),%xmm5,%xmm5 |
+ vaesenc 176(%rcx),%xmm5,%xmm5 |
+ vaesenc 192(%rcx),%xmm5,%xmm5 |
+ vaesenc 208(%rcx),%xmm5,%xmm5 |
+ vaesenclast 224(%rcx),%xmm5,%xmm5 |
+ |
+ |
+ vpxor (%rdi),%xmm5,%xmm5 |
+ |
+ vmovdqu %xmm5,(%rsi) |
+ |
+ addq $16,%rdi |
+ addq $16,%rsi |
+ |
+ subq $1,%r10 |
+ jne .L256_enc_msg_x4_loop2 |
+ |
+.L256_enc_msg_x4_out: |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 |
+.globl aes256gcmsiv_enc_msg_x8 |
+.hidden aes256gcmsiv_enc_msg_x8 |
+.type aes256gcmsiv_enc_msg_x8,@function |
+.align 16 |
+aes256gcmsiv_enc_msg_x8: |
+.cfi_startproc |
+ testq %r8,%r8 |
+ jnz .L256_enc_msg_x8_start |
+ .byte 0xf3,0xc3 |
+ |
+.L256_enc_msg_x8_start: |
+ |
+ movq %rsp,%r11 |
+ subq $16,%r11 |
+ andq $-64,%r11 |
+ |
+ movq %r8,%r10 |
+ shrq $4,%r8 |
+ shlq $60,%r10 |
+ jz .L256_enc_msg_x8_start2 |
+ addq $1,%r8 |
+ |
+.L256_enc_msg_x8_start2: |
+ movq %r8,%r10 |
+ shlq $61,%r10 |
+ shrq $61,%r10 |
+ |
+ |
+ vmovdqa (%rdx),%xmm1 |
+ vpor OR_MASK(%rip),%xmm1,%xmm1 |
+ |
+ |
+ vpaddd seven(%rip),%xmm1,%xmm0 |
+ vmovdqa %xmm0,(%r11) |
+ vpaddd one(%rip),%xmm1,%xmm9 |
+ vpaddd two(%rip),%xmm1,%xmm10 |
+ vpaddd three(%rip),%xmm1,%xmm11 |
+ vpaddd four(%rip),%xmm1,%xmm12 |
+ vpaddd five(%rip),%xmm1,%xmm13 |
+ vpaddd six(%rip),%xmm1,%xmm14 |
+ vmovdqa %xmm1,%xmm0 |
+ |
+ shrq $3,%r8 |
+ jz .L256_enc_msg_x8_check_remainder |
+ |
+ subq $128,%rsi |
+ subq $128,%rdi |
+ |
+.L256_enc_msg_x8_loop1: |
+ addq $128,%rsi |
+ addq $128,%rdi |
+ |
+ vmovdqa %xmm0,%xmm1 |
+ vmovdqa %xmm9,%xmm2 |
+ vmovdqa %xmm10,%xmm3 |
+ vmovdqa %xmm11,%xmm4 |
+ vmovdqa %xmm12,%xmm5 |
+ vmovdqa %xmm13,%xmm6 |
+ vmovdqa %xmm14,%xmm7 |
+ |
+ vmovdqa (%r11),%xmm8 |
+ |
+ vpxor (%rcx),%xmm1,%xmm1 |
+ vpxor (%rcx),%xmm2,%xmm2 |
+ vpxor (%rcx),%xmm3,%xmm3 |
+ vpxor (%rcx),%xmm4,%xmm4 |
+ vpxor (%rcx),%xmm5,%xmm5 |
+ vpxor (%rcx),%xmm6,%xmm6 |
+ vpxor (%rcx),%xmm7,%xmm7 |
+ vpxor (%rcx),%xmm8,%xmm8 |
+ |
+ vmovdqu 16(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqa (%r11),%xmm14 |
+ vpaddd eight(%rip),%xmm14,%xmm14 |
+ vmovdqa %xmm14,(%r11) |
+ vmovdqu 32(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpsubd one(%rip),%xmm14,%xmm14 |
+ vmovdqu 48(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm0,%xmm0 |
+ vmovdqu 64(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm9,%xmm9 |
+ vmovdqu 80(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm10,%xmm10 |
+ vmovdqu 96(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm11,%xmm11 |
+ vmovdqu 112(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm12,%xmm12 |
+ vmovdqu 128(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vpaddd eight(%rip),%xmm13,%xmm13 |
+ vmovdqu 144(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu 160(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu 176(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu 192(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu 208(%rcx),%xmm15 |
+ vaesenc %xmm15,%xmm1,%xmm1 |
+ vaesenc %xmm15,%xmm2,%xmm2 |
+ vaesenc %xmm15,%xmm3,%xmm3 |
+ vaesenc %xmm15,%xmm4,%xmm4 |
+ vaesenc %xmm15,%xmm5,%xmm5 |
+ vaesenc %xmm15,%xmm6,%xmm6 |
+ vaesenc %xmm15,%xmm7,%xmm7 |
+ vaesenc %xmm15,%xmm8,%xmm8 |
+ |
+ vmovdqu 224(%rcx),%xmm15 |
+ vaesenclast %xmm15,%xmm1,%xmm1 |
+ vaesenclast %xmm15,%xmm2,%xmm2 |
+ vaesenclast %xmm15,%xmm3,%xmm3 |
+ vaesenclast %xmm15,%xmm4,%xmm4 |
+ vaesenclast %xmm15,%xmm5,%xmm5 |
+ vaesenclast %xmm15,%xmm6,%xmm6 |
+ vaesenclast %xmm15,%xmm7,%xmm7 |
+ vaesenclast %xmm15,%xmm8,%xmm8 |
+ |
+ |
+ |
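+/* Xor the eight keystream blocks with 128 bytes of input at %rdi and store the |
+ * result at %rsi. */ |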
+ vpxor 0(%rdi),%xmm1,%xmm1 |
+ vpxor 16(%rdi),%xmm2,%xmm2 |
+ vpxor 32(%rdi),%xmm3,%xmm3 |
+ vpxor 48(%rdi),%xmm4,%xmm4 |
+ vpxor 64(%rdi),%xmm5,%xmm5 |
+ vpxor 80(%rdi),%xmm6,%xmm6 |
+ vpxor 96(%rdi),%xmm7,%xmm7 |
+ vpxor 112(%rdi),%xmm8,%xmm8 |
+ |
+ subq $1,%r8 |
+ |
+ vmovdqu %xmm1,0(%rsi) |
+ vmovdqu %xmm2,16(%rsi) |
+ vmovdqu %xmm3,32(%rsi) |
+ vmovdqu %xmm4,48(%rsi) |
+ vmovdqu %xmm5,64(%rsi) |
+ vmovdqu %xmm6,80(%rsi) |
+ vmovdqu %xmm7,96(%rsi) |
+ vmovdqu %xmm8,112(%rsi) |
+ |
+ jne .L256_enc_msg_x8_loop1 |
+ |
+ addq $128,%rsi |
+ addq $128,%rdi |
+ |
+.L256_enc_msg_x8_check_remainder: |
+ cmpq $0,%r10 |
+ je .L256_enc_msg_x8_out |
+ |
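+/* Fewer than eight blocks remain (count in %r10): encrypt them one at a time with |
+ * the full 14-round AES-256 schedule. */ |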
+.L256_enc_msg_x8_loop2: |
+ |
+ |
+ vmovdqa %xmm0,%xmm1 |
+ vpaddd one(%rip),%xmm0,%xmm0 |
+ |
+ vpxor (%rcx),%xmm1,%xmm1 |
+ vaesenc 16(%rcx),%xmm1,%xmm1 |
+ vaesenc 32(%rcx),%xmm1,%xmm1 |
+ vaesenc 48(%rcx),%xmm1,%xmm1 |
+ vaesenc 64(%rcx),%xmm1,%xmm1 |
+ vaesenc 80(%rcx),%xmm1,%xmm1 |
+ vaesenc 96(%rcx),%xmm1,%xmm1 |
+ vaesenc 112(%rcx),%xmm1,%xmm1 |
+ vaesenc 128(%rcx),%xmm1,%xmm1 |
+ vaesenc 144(%rcx),%xmm1,%xmm1 |
+ vaesenc 160(%rcx),%xmm1,%xmm1 |
+ vaesenc 176(%rcx),%xmm1,%xmm1 |
+ vaesenc 192(%rcx),%xmm1,%xmm1 |
+ vaesenc 208(%rcx),%xmm1,%xmm1 |
+ vaesenclast 224(%rcx),%xmm1,%xmm1 |
+ |
+ |
+ vpxor (%rdi),%xmm1,%xmm1 |
+ |
+ vmovdqu %xmm1,(%rsi) |
+ |
+ addq $16,%rdi |
+ addq $16,%rsi |
+ subq $1,%r10 |
+ jnz .L256_enc_msg_x8_loop2 |
+ |
+.L256_enc_msg_x8_out: |
+ .byte 0xf3,0xc3 |
+ |
+.cfi_endproc |
+.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 |
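+/* aes256gcmsiv_dec: CTR-decrypt whole 16-byte blocks while folding the resulting |
+ * plaintext into POLYVAL. Register use as read from the code below: %rdi = ciphertext |
+ * in (the 16 bytes at %rdi+%r9 seed the initial counter, top bit forced on), %rsi = |
+ * plaintext out, %rdx = POLYVAL value T (updated in place, with scratch space |
+ * following it), %rcx = table of powers of H, %r8 = AES-256 round keys (15 x 16 |
+ * bytes), %r9 = length in bytes, rounded down to a multiple of 16; the routine |
+ * returns immediately if the length is below 16. */ |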
+.globl aes256gcmsiv_dec |
+.hidden aes256gcmsiv_dec |
+.type aes256gcmsiv_dec,@function |
+.align 16 |
+aes256gcmsiv_dec: |
+.cfi_startproc |
+ testq $~15,%r9 |
+ jnz .L256_dec_start |
+ .byte 0xf3,0xc3 |
+ |
+.L256_dec_start: |
+ vzeroupper |
+ vmovdqa (%rdx),%xmm0 |
+ movq %rdx,%rax |
+ |
+ leaq 32(%rax),%rax |
+ leaq 32(%rcx),%rcx |
+ |
+ |
+ vmovdqu (%rdi,%r9,1),%xmm15 |
+ vpor OR_MASK(%rip),%xmm15,%xmm15 |
+ andq $~15,%r9 |
+ |
+ |
+ cmpq $96,%r9 |
+ jb .L256_dec_loop2 |
+ |
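+/* At least 96 bytes: prime the pipeline by encrypting the first six counter blocks. |
+ * Their plaintext is folded into POLYVAL on the next pass through .L256_dec_loop1. */ |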
+ |
+ subq $96,%r9 |
+ vmovdqa %xmm15,%xmm7 |
+ vpaddd one(%rip),%xmm7,%xmm8 |
+ vpaddd two(%rip),%xmm7,%xmm9 |
+ vpaddd one(%rip),%xmm9,%xmm10 |
+ vpaddd two(%rip),%xmm9,%xmm11 |
+ vpaddd one(%rip),%xmm11,%xmm12 |
+ vpaddd two(%rip),%xmm11,%xmm15 |
+ |
+ vpxor (%r8),%xmm7,%xmm7 |
+ vpxor (%r8),%xmm8,%xmm8 |
+ vpxor (%r8),%xmm9,%xmm9 |
+ vpxor (%r8),%xmm10,%xmm10 |
+ vpxor (%r8),%xmm11,%xmm11 |
+ vpxor (%r8),%xmm12,%xmm12 |
+ |
+ vmovdqu 16(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 32(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 48(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 64(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 80(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 96(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 112(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 128(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 144(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 160(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 176(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 192(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 208(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 224(%r8),%xmm4 |
+ vaesenclast %xmm4,%xmm7,%xmm7 |
+ vaesenclast %xmm4,%xmm8,%xmm8 |
+ vaesenclast %xmm4,%xmm9,%xmm9 |
+ vaesenclast %xmm4,%xmm10,%xmm10 |
+ vaesenclast %xmm4,%xmm11,%xmm11 |
+ vaesenclast %xmm4,%xmm12,%xmm12 |
+ |
+ |
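+/* Xor the keystream with the first six ciphertext blocks and store the plaintext. */ |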
+ vpxor 0(%rdi),%xmm7,%xmm7 |
+ vpxor 16(%rdi),%xmm8,%xmm8 |
+ vpxor 32(%rdi),%xmm9,%xmm9 |
+ vpxor 48(%rdi),%xmm10,%xmm10 |
+ vpxor 64(%rdi),%xmm11,%xmm11 |
+ vpxor 80(%rdi),%xmm12,%xmm12 |
+ |
+ vmovdqu %xmm7,0(%rsi) |
+ vmovdqu %xmm8,16(%rsi) |
+ vmovdqu %xmm9,32(%rsi) |
+ vmovdqu %xmm10,48(%rsi) |
+ vmovdqu %xmm11,64(%rsi) |
+ vmovdqu %xmm12,80(%rsi) |
+ |
+ addq $96,%rdi |
+ addq $96,%rsi |
+ jmp .L256_dec_loop1 |
+ |
+ |
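+/* Main loop, 96 bytes per iteration: the 14 AES rounds for the next six counter |
+ * blocks are interleaved with vpclmulqdq accumulations of the previous six plaintext |
+ * blocks against the precomputed powers of H, followed by the reduction with poly. */ |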
+.align 64 |
+.L256_dec_loop1: |
+ cmpq $96,%r9 |
+ jb .L256_dec_finish_96 |
+ subq $96,%r9 |
+ |
+ vmovdqa %xmm12,%xmm6 |
+ vmovdqa %xmm11,16-32(%rax) |
+ vmovdqa %xmm10,32-32(%rax) |
+ vmovdqa %xmm9,48-32(%rax) |
+ vmovdqa %xmm8,64-32(%rax) |
+ vmovdqa %xmm7,80-32(%rax) |
+ |
+ vmovdqa %xmm15,%xmm7 |
+ vpaddd one(%rip),%xmm7,%xmm8 |
+ vpaddd two(%rip),%xmm7,%xmm9 |
+ vpaddd one(%rip),%xmm9,%xmm10 |
+ vpaddd two(%rip),%xmm9,%xmm11 |
+ vpaddd one(%rip),%xmm11,%xmm12 |
+ vpaddd two(%rip),%xmm11,%xmm15 |
+ |
+ vmovdqa (%r8),%xmm4 |
+ vpxor %xmm4,%xmm7,%xmm7 |
+ vpxor %xmm4,%xmm8,%xmm8 |
+ vpxor %xmm4,%xmm9,%xmm9 |
+ vpxor %xmm4,%xmm10,%xmm10 |
+ vpxor %xmm4,%xmm11,%xmm11 |
+ vpxor %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 0-32(%rcx),%xmm4 |
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 |
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 |
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 |
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 16(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu -16(%rax),%xmm6 |
+ vmovdqu -16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 32(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 0(%rax),%xmm6 |
+ vmovdqu 0(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 48(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 16(%rax),%xmm6 |
+ vmovdqu 16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 64(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 32(%rax),%xmm6 |
+ vmovdqu 32(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 80(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 96(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 112(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ |
+ vmovdqa 80-32(%rax),%xmm6 |
+ vpxor %xmm0,%xmm6,%xmm6 |
+ vmovdqu 80-32(%rcx),%xmm5 |
+ |
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 128(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ |
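+/* Recombine the partial products: the middle term in %xmm1 is split and xored into |
+ * the high (%xmm5) and low (%xmm0) halves of the 256-bit POLYVAL product. */ |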
+ vpsrldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm5 |
+ vpslldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm0 |
+ |
+ vmovdqa poly(%rip),%xmm3 |
+ |
+ vmovdqu 144(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 160(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 176(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 192(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
+ vmovdqu 208(%r8),%xmm4 |
+ vaesenc %xmm4,%xmm7,%xmm7 |
+ vaesenc %xmm4,%xmm8,%xmm8 |
+ vaesenc %xmm4,%xmm9,%xmm9 |
+ vaesenc %xmm4,%xmm10,%xmm10 |
+ vaesenc %xmm4,%xmm11,%xmm11 |
+ vaesenc %xmm4,%xmm12,%xmm12 |
+ |
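+/* The last round key is xored with each ciphertext block so that vaesenclast performs |
+ * the final AES round and the CTR xor in one step; the two vpclmulqdq folds against |
+ * poly in between complete the POLYVAL modular reduction. */ |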
+ vmovdqu 224(%r8),%xmm6 |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vpxor 0(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm7,%xmm7 |
+ vpxor 16(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm8,%xmm8 |
+ vpxor 32(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm9,%xmm9 |
+ vpxor 48(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm10,%xmm10 |
+ vpxor 64(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm11,%xmm11 |
+ vpxor 80(%rdi),%xmm6,%xmm4 |
+ vaesenclast %xmm4,%xmm12,%xmm12 |
+ |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vmovdqu %xmm7,0(%rsi) |
+ vmovdqu %xmm8,16(%rsi) |
+ vmovdqu %xmm9,32(%rsi) |
+ vmovdqu %xmm10,48(%rsi) |
+ vmovdqu %xmm11,64(%rsi) |
+ vmovdqu %xmm12,80(%rsi) |
+ |
+ vpxor %xmm5,%xmm0,%xmm0 |
+ |
+ leaq 96(%rdi),%rdi |
+ leaq 96(%rsi),%rsi |
+ jmp .L256_dec_loop1 |
+ |
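+/* No further blocks to decrypt: fold the last six plaintext blocks into POLYVAL (the |
+ * oldest block is first xored with the running value in %xmm0) and reduce. */ |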
+.L256_dec_finish_96: |
+ vmovdqa %xmm12,%xmm6 |
+ vmovdqa %xmm11,16-32(%rax) |
+ vmovdqa %xmm10,32-32(%rax) |
+ vmovdqa %xmm9,48-32(%rax) |
+ vmovdqa %xmm8,64-32(%rax) |
+ vmovdqa %xmm7,80-32(%rax) |
+ |
+ vmovdqu 0-32(%rcx),%xmm4 |
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 |
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 |
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 |
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu -16(%rax),%xmm6 |
+ vmovdqu -16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 0(%rax),%xmm6 |
+ vmovdqu 0(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 16(%rax),%xmm6 |
+ vmovdqu 16(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vmovdqu 32(%rax),%xmm6 |
+ vmovdqu 32(%rcx),%xmm13 |
+ |
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ |
+ vmovdqu 80-32(%rax),%xmm6 |
+ vpxor %xmm0,%xmm6,%xmm6 |
+ vmovdqu 80-32(%rcx),%xmm5 |
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm2 |
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm3 |
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 |
+ vpxor %xmm4,%xmm1,%xmm1 |
+ |
+ vpsrldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm2,%xmm5 |
+ vpslldq $8,%xmm1,%xmm4 |
+ vpxor %xmm4,%xmm3,%xmm0 |
+ |
+ vmovdqa poly(%rip),%xmm3 |
+ |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vpalignr $8,%xmm0,%xmm0,%xmm2 |
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 |
+ vpxor %xmm0,%xmm2,%xmm0 |
+ |
+ vpxor %xmm5,%xmm0,%xmm0 |
+ |
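+/* Remainder loop: decrypt one 16-byte block at a time and fold each plaintext block |
+ * into POLYVAL via GFMUL with H from the start of the table; the final value is |
+ * written back to (%rdx) at .L256_dec_out. */ |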
+.L256_dec_loop2: |
+ |
+ |
+ |
+ cmpq $16,%r9 |
+ jb .L256_dec_out |
+ subq $16,%r9 |
+ |
+ vmovdqa %xmm15,%xmm2 |
+ vpaddd one(%rip),%xmm15,%xmm15 |
+ |
+ vpxor 0(%r8),%xmm2,%xmm2 |
+ vaesenc 16(%r8),%xmm2,%xmm2 |
+ vaesenc 32(%r8),%xmm2,%xmm2 |
+ vaesenc 48(%r8),%xmm2,%xmm2 |
+ vaesenc 64(%r8),%xmm2,%xmm2 |
+ vaesenc 80(%r8),%xmm2,%xmm2 |
+ vaesenc 96(%r8),%xmm2,%xmm2 |
+ vaesenc 112(%r8),%xmm2,%xmm2 |
+ vaesenc 128(%r8),%xmm2,%xmm2 |
+ vaesenc 144(%r8),%xmm2,%xmm2 |
+ vaesenc 160(%r8),%xmm2,%xmm2 |
+ vaesenc 176(%r8),%xmm2,%xmm2 |
+ vaesenc 192(%r8),%xmm2,%xmm2 |
+ vaesenc 208(%r8),%xmm2,%xmm2 |
+ vaesenclast 224(%r8),%xmm2,%xmm2 |
+ vpxor (%rdi),%xmm2,%xmm2 |
+ vmovdqu %xmm2,(%rsi) |
+ addq $16,%rdi |
+ addq $16,%rsi |
+ |
+ vpxor %xmm2,%xmm0,%xmm0 |
+ vmovdqa -32(%rcx),%xmm1 |
+ call GFMUL |
+ |
+ jmp .L256_dec_loop2 |
+ |
+.L256_dec_out: |
+ vmovdqu %xmm0,(%rdx) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes256gcmsiv_dec, .-aes256gcmsiv_dec |
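+/* aes256gcmsiv_kdf: AES-GCM-SIV key derivation (RFC 8452). Builds six blocks of the |
+ * form le32(i) || nonce, i = 0..5, from the 12-byte nonce loaded at %rdi, encrypts |
+ * them with the AES-256 key schedule at %rdx and writes the six 16-byte results to |
+ * %rsi; per RFC 8452 the caller uses only the first 8 bytes of each output block. */ |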
+.globl aes256gcmsiv_kdf |
+.hidden aes256gcmsiv_kdf |
+.type aes256gcmsiv_kdf,@function |
+.align 16 |
+aes256gcmsiv_kdf: |
+.cfi_startproc |
+ |
+ |
+ |
+ |
+ vmovdqa (%rdx),%xmm1 |
+ vmovdqa 0(%rdi),%xmm4 |
+ vmovdqa and_mask(%rip),%xmm11 |
+ vmovdqa one(%rip),%xmm8 |
+ vpshufd $0x90,%xmm4,%xmm4 |
+ vpand %xmm11,%xmm4,%xmm4 |
+ vpaddd %xmm8,%xmm4,%xmm6 |
+ vpaddd %xmm8,%xmm6,%xmm7 |
+ vpaddd %xmm8,%xmm7,%xmm11 |
+ vpaddd %xmm8,%xmm11,%xmm12 |
+ vpaddd %xmm8,%xmm12,%xmm13 |
+ |
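+/* The six counter blocks (counter 0..5 in the low dword, nonce in the remaining 12 |
+ * bytes) are in %xmm4, %xmm6, %xmm7, %xmm11, %xmm12 and %xmm13; apply round key 0, |
+ * then 13 vaesenc rounds and the final vaesenclast. */ |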
+ vpxor %xmm1,%xmm4,%xmm4 |
+ vpxor %xmm1,%xmm6,%xmm6 |
+ vpxor %xmm1,%xmm7,%xmm7 |
+ vpxor %xmm1,%xmm11,%xmm11 |
+ vpxor %xmm1,%xmm12,%xmm12 |
+ vpxor %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 16(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 32(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm4,%xmm4 |
+ vaesenc %xmm2,%xmm6,%xmm6 |
+ vaesenc %xmm2,%xmm7,%xmm7 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ vaesenc %xmm2,%xmm13,%xmm13 |
+ |
+ vmovdqa 48(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 64(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm4,%xmm4 |
+ vaesenc %xmm2,%xmm6,%xmm6 |
+ vaesenc %xmm2,%xmm7,%xmm7 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ vaesenc %xmm2,%xmm13,%xmm13 |
+ |
+ vmovdqa 80(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 96(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm4,%xmm4 |
+ vaesenc %xmm2,%xmm6,%xmm6 |
+ vaesenc %xmm2,%xmm7,%xmm7 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ vaesenc %xmm2,%xmm13,%xmm13 |
+ |
+ vmovdqa 112(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 128(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm4,%xmm4 |
+ vaesenc %xmm2,%xmm6,%xmm6 |
+ vaesenc %xmm2,%xmm7,%xmm7 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ vaesenc %xmm2,%xmm13,%xmm13 |
+ |
+ vmovdqa 144(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 160(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm4,%xmm4 |
+ vaesenc %xmm2,%xmm6,%xmm6 |
+ vaesenc %xmm2,%xmm7,%xmm7 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ vaesenc %xmm2,%xmm13,%xmm13 |
+ |
+ vmovdqa 176(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 192(%rdx),%xmm2 |
+ vaesenc %xmm2,%xmm4,%xmm4 |
+ vaesenc %xmm2,%xmm6,%xmm6 |
+ vaesenc %xmm2,%xmm7,%xmm7 |
+ vaesenc %xmm2,%xmm11,%xmm11 |
+ vaesenc %xmm2,%xmm12,%xmm12 |
+ vaesenc %xmm2,%xmm13,%xmm13 |
+ |
+ vmovdqa 208(%rdx),%xmm1 |
+ vaesenc %xmm1,%xmm4,%xmm4 |
+ vaesenc %xmm1,%xmm6,%xmm6 |
+ vaesenc %xmm1,%xmm7,%xmm7 |
+ vaesenc %xmm1,%xmm11,%xmm11 |
+ vaesenc %xmm1,%xmm12,%xmm12 |
+ vaesenc %xmm1,%xmm13,%xmm13 |
+ |
+ vmovdqa 224(%rdx),%xmm2 |
+ vaesenclast %xmm2,%xmm4,%xmm4 |
+ vaesenclast %xmm2,%xmm6,%xmm6 |
+ vaesenclast %xmm2,%xmm7,%xmm7 |
+ vaesenclast %xmm2,%xmm11,%xmm11 |
+ vaesenclast %xmm2,%xmm12,%xmm12 |
+ vaesenclast %xmm2,%xmm13,%xmm13 |
+ |
+ |
+ vmovdqa %xmm4,0(%rsi) |
+ vmovdqa %xmm6,16(%rsi) |
+ vmovdqa %xmm7,32(%rsi) |
+ vmovdqa %xmm11,48(%rsi) |
+ vmovdqa %xmm12,64(%rsi) |
+ vmovdqa %xmm13,80(%rsi) |
+ .byte 0xf3,0xc3 |
+.cfi_endproc |
+.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf |
+#endif |