| Index: third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
|
| diff --git a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..daf82d9e9bbdb0225b95a348368570f029d0494b
|
| --- /dev/null
|
| +++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
|
| @@ -0,0 +1,3066 @@
|
| +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
|
| +.data
|
| +
|
| +.align 16
|
| +one:
|
| +.quad 1,0
|
| +two:
|
| +.quad 2,0
|
| +three:
|
| +.quad 3,0
|
| +four:
|
| +.quad 4,0
|
| +five:
|
| +.quad 5,0
|
| +six:
|
| +.quad 6,0
|
| +seven:
|
| +.quad 7,0
|
| +eight:
|
| +.quad 8,0
|
| +
|
| +OR_MASK:
|
| +.long 0x00000000,0x00000000,0x00000000,0x80000000
|
| +poly:
|
| +.quad 0x1, 0xc200000000000000
|
| +mask:
|
| +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
|
| +con1:
|
| +.long 1,1,1,1
|
| +con2:
|
| +.long 0x1b,0x1b,0x1b,0x1b
|
| +con3:
|
| +.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
|
| +and_mask:
|
| +.long 0,0xffffffff, 0xffffffff, 0xffffffff
|
| +.text
|
| +.type GFMUL,@function
|
| +.align 16
|
| +GFMUL:
|
| +.cfi_startproc
|
| + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
|
| + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5
|
| + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
|
| + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $8,%xmm3,%xmm4
|
| + vpsrldq $8,%xmm3,%xmm3
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpxor %xmm3,%xmm5,%xmm5
|
| +
|
| + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
|
| + vpshufd $78,%xmm2,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm2
|
| +
|
| + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
|
| + vpshufd $78,%xmm2,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm2
|
| +
|
| + vpxor %xmm5,%xmm2,%xmm0
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size GFMUL, .-GFMUL
|
| +.globl aesgcmsiv_htable_init
|
| +.hidden aesgcmsiv_htable_init
|
| +.type aesgcmsiv_htable_init,@function
|
| +.align 16
|
| +aesgcmsiv_htable_init:
|
| +.cfi_startproc
|
| + vmovdqa (%rsi),%xmm0
|
| + vmovdqa %xmm0,%xmm1
|
| + vmovdqa %xmm0,(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,16(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,32(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,48(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,64(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,80(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,96(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,112(%rdi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
|
| +.globl aesgcmsiv_htable6_init
|
| +.hidden aesgcmsiv_htable6_init
|
| +.type aesgcmsiv_htable6_init,@function
|
| +.align 16
|
| +aesgcmsiv_htable6_init:
|
| +.cfi_startproc
|
| + vmovdqa (%rsi),%xmm0
|
| + vmovdqa %xmm0,%xmm1
|
| + vmovdqa %xmm0,(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,16(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,32(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,48(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,64(%rdi)
|
| + call GFMUL
|
| + vmovdqa %xmm0,80(%rdi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
|
| +.globl aesgcmsiv_htable_polyval
|
| +.hidden aesgcmsiv_htable_polyval
|
| +.type aesgcmsiv_htable_polyval,@function
|
| +.align 16
|
| +aesgcmsiv_htable_polyval:
|
| +.cfi_startproc
|
| + testq %rdx,%rdx
|
| + jnz .Lhtable_polyval_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.Lhtable_polyval_start:
|
| + vzeroall
|
| +
|
| +
|
| +
|
| + movq %rdx,%r11
|
| + andq $127,%r11
|
| +
|
| + jz .Lhtable_polyval_no_prefix
|
| +
|
| + vpxor %xmm9,%xmm9,%xmm9
|
| + vmovdqa (%rcx),%xmm1
|
| + subq %r11,%rdx
|
| +
|
| + subq $16,%r11
|
| +
|
| +
|
| + vmovdqu (%rsi),%xmm0
|
| + vpxor %xmm1,%xmm0,%xmm0
|
| +
|
| + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5
|
| + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3
|
| + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4
|
| + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| + leaq 16(%rsi),%rsi
|
| + testq %r11,%r11
|
| + jnz .Lhtable_polyval_prefix_loop
|
| + jmp .Lhtable_polyval_prefix_complete
|
| +
|
| +
|
| +.align 64
|
| +.Lhtable_polyval_prefix_loop:
|
| + subq $16,%r11
|
| +
|
| + vmovdqu (%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| + testq %r11,%r11
|
| +
|
| + leaq 16(%rsi),%rsi
|
| +
|
| + jnz .Lhtable_polyval_prefix_loop
|
| +
|
| +.Lhtable_polyval_prefix_complete:
|
| + vpsrldq $8,%xmm5,%xmm6
|
| + vpslldq $8,%xmm5,%xmm5
|
| +
|
| + vpxor %xmm6,%xmm4,%xmm9
|
| + vpxor %xmm5,%xmm3,%xmm1
|
| +
|
| + jmp .Lhtable_polyval_main_loop
|
| +
|
| +.Lhtable_polyval_no_prefix:
|
| +
|
| +
|
| +
|
| +
|
| + vpxor %xmm1,%xmm1,%xmm1
|
| + vmovdqa (%rcx),%xmm9
|
| +
|
| +.align 64
|
| +.Lhtable_polyval_main_loop:
|
| + subq $0x80,%rdx
|
| + jb .Lhtable_polyval_out
|
| +
|
| + vmovdqu 112(%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5
|
| + vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3
|
| + vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4
|
| + vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vmovdqu 96(%rsi),%xmm0
|
| + vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| +
|
| + vmovdqu 80(%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
|
| + vpalignr $8,%xmm1,%xmm1,%xmm1
|
| +
|
| + vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vpxor %xmm7,%xmm1,%xmm1
|
| +
|
| + vmovdqu 64(%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vmovdqu 48(%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
|
| + vpalignr $8,%xmm1,%xmm1,%xmm1
|
| +
|
| + vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vpxor %xmm7,%xmm1,%xmm1
|
| +
|
| + vmovdqu 32(%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vpxor %xmm9,%xmm1,%xmm1
|
| +
|
| + vmovdqu 16(%rsi),%xmm0
|
| +
|
| + vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vmovdqu 0(%rsi),%xmm0
|
| + vpxor %xmm1,%xmm0,%xmm0
|
| +
|
| + vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| + vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm3,%xmm3
|
| + vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm4,%xmm4
|
| + vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6
|
| + vpxor %xmm6,%xmm5,%xmm5
|
| +
|
| +
|
| + vpsrldq $8,%xmm5,%xmm6
|
| + vpslldq $8,%xmm5,%xmm5
|
| +
|
| + vpxor %xmm6,%xmm4,%xmm9
|
| + vpxor %xmm5,%xmm3,%xmm1
|
| +
|
| + leaq 128(%rsi),%rsi
|
| + jmp .Lhtable_polyval_main_loop
|
| +
|
| +
|
| +
|
| +.Lhtable_polyval_out:
|
| + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
|
| + vpalignr $8,%xmm1,%xmm1,%xmm1
|
| + vpxor %xmm6,%xmm1,%xmm1
|
| +
|
| + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
|
| + vpalignr $8,%xmm1,%xmm1,%xmm1
|
| + vpxor %xmm6,%xmm1,%xmm1
|
| + vpxor %xmm9,%xmm1,%xmm1
|
| +
|
| + vmovdqu %xmm1,(%rcx)
|
| + vzeroupper
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
|
| +.globl aesgcmsiv_polyval_horner
|
| +.hidden aesgcmsiv_polyval_horner
|
| +.type aesgcmsiv_polyval_horner,@function
|
| +.align 16
|
| +aesgcmsiv_polyval_horner:
|
| +.cfi_startproc
|
| + testq %rcx,%rcx
|
| + jnz .Lpolyval_horner_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.Lpolyval_horner_start:
|
| +
|
| +
|
| +
|
| + xorq %r10,%r10
|
| + shlq $4,%rcx
|
| +
|
| + vmovdqa (%rsi),%xmm1
|
| + vmovdqa (%rdi),%xmm0
|
| +
|
| +.Lpolyval_horner_loop:
|
| + vpxor (%rdx,%r10,1),%xmm0,%xmm0
|
| + call GFMUL
|
| +
|
| + addq $16,%r10
|
| + cmpq %r10,%rcx
|
| + jne .Lpolyval_horner_loop
|
| +
|
| +
|
| + vmovdqa %xmm0,(%rdi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
|
| +.globl aes128gcmsiv_aes_ks
|
| +.hidden aes128gcmsiv_aes_ks
|
| +.type aes128gcmsiv_aes_ks,@function
|
| +.align 16
|
| +aes128gcmsiv_aes_ks:
|
| +.cfi_startproc
|
| + vmovdqa (%rdi),%xmm1
|
| + vmovdqa %xmm1,(%rsi)
|
| +
|
| + vmovdqa con1(%rip),%xmm0
|
| + vmovdqa mask(%rip),%xmm15
|
| +
|
| + movq $8,%rax
|
| +
|
| +.Lks128_loop:
|
| + addq $16,%rsi
|
| + subq $1,%rax
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpslldq $4,%xmm3,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpslldq $4,%xmm3,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vmovdqa %xmm1,(%rsi)
|
| + jne .Lks128_loop
|
| +
|
| + vmovdqa con2(%rip),%xmm0
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpslldq $4,%xmm3,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpslldq $4,%xmm3,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vmovdqa %xmm1,16(%rsi)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslldq $4,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpslldq $4,%xmm3,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpslldq $4,%xmm3,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vmovdqa %xmm1,32(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
|
| +.globl aes256gcmsiv_aes_ks
|
| +.hidden aes256gcmsiv_aes_ks
|
| +.type aes256gcmsiv_aes_ks,@function
|
| +.align 16
|
| +aes256gcmsiv_aes_ks:
|
| +.cfi_startproc
|
| + vmovdqa (%rdi),%xmm1
|
| + vmovdqa 16(%rdi),%xmm3
|
| + vmovdqa %xmm1,(%rsi)
|
| + vmovdqa %xmm3,16(%rsi)
|
| + vmovdqa con1(%rip),%xmm0
|
| + vmovdqa mask(%rip),%xmm15
|
| + vpxor %xmm14,%xmm14,%xmm14
|
| + movq $6,%rax
|
| +
|
| +.Lks256_loop:
|
| + addq $32,%rsi
|
| + subq $1,%rax
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vmovdqa %xmm1,(%rsi)
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpsllq $32,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpshufb con3(%rip),%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vmovdqa %xmm3,16(%rsi)
|
| + jne .Lks256_loop
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpsllq $32,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vmovdqa %xmm1,32(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.globl aes128gcmsiv_aes_ks_enc_x1
|
| +.hidden aes128gcmsiv_aes_ks_enc_x1
|
| +.type aes128gcmsiv_aes_ks_enc_x1,@function
|
| +.align 16
|
| +aes128gcmsiv_aes_ks_enc_x1:
|
| +.cfi_startproc
|
| + vmovdqa (%rcx),%xmm1
|
| + vmovdqa 0(%rdi),%xmm4
|
| +
|
| + vmovdqa %xmm1,(%rdx)
|
| + vpxor %xmm1,%xmm4,%xmm4
|
| +
|
| + vmovdqa con1(%rip),%xmm0
|
| + vmovdqa mask(%rip),%xmm15
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,16(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,32(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,48(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,64(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,80(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,96(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,112(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,128(%rdx)
|
| +
|
| +
|
| + vmovdqa con2(%rip),%xmm0
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,144(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm1,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpsllq $32,%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpshufb con3(%rip),%xmm1,%xmm3
|
| + vpxor %xmm3,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| +
|
| + vaesenclast %xmm1,%xmm4,%xmm4
|
| + vmovdqa %xmm1,160(%rdx)
|
| +
|
| +
|
| + vmovdqa %xmm4,0(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
|
| +.globl aes128gcmsiv_kdf
|
| +.hidden aes128gcmsiv_kdf
|
| +.type aes128gcmsiv_kdf,@function
|
| +.align 16
|
| +aes128gcmsiv_kdf:
|
| +.cfi_startproc
|
| +
|
| +
|
| +
|
| +
|
| + vmovdqa (%rdx),%xmm1
|
| + vmovdqa 0(%rdi),%xmm9
|
| + vmovdqa and_mask(%rip),%xmm12
|
| + vmovdqa one(%rip),%xmm13
|
| + vpshufd $0x90,%xmm9,%xmm9
|
| + vpand %xmm12,%xmm9,%xmm9
|
| + vpaddd %xmm13,%xmm9,%xmm10
|
| + vpaddd %xmm13,%xmm10,%xmm11
|
| + vpaddd %xmm13,%xmm11,%xmm12
|
| +
|
| + vpxor %xmm1,%xmm9,%xmm9
|
| + vpxor %xmm1,%xmm10,%xmm10
|
| + vpxor %xmm1,%xmm11,%xmm11
|
| + vpxor %xmm1,%xmm12,%xmm12
|
| +
|
| + vmovdqa 16(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm9,%xmm9
|
| + vaesenc %xmm1,%xmm10,%xmm10
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| +
|
| + vmovdqa 32(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm9,%xmm9
|
| + vaesenc %xmm2,%xmm10,%xmm10
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| +
|
| + vmovdqa 48(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm9,%xmm9
|
| + vaesenc %xmm1,%xmm10,%xmm10
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| +
|
| + vmovdqa 64(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm9,%xmm9
|
| + vaesenc %xmm2,%xmm10,%xmm10
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| +
|
| + vmovdqa 80(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm9,%xmm9
|
| + vaesenc %xmm1,%xmm10,%xmm10
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| +
|
| + vmovdqa 96(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm9,%xmm9
|
| + vaesenc %xmm2,%xmm10,%xmm10
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| +
|
| + vmovdqa 112(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm9,%xmm9
|
| + vaesenc %xmm1,%xmm10,%xmm10
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| +
|
| + vmovdqa 128(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm9,%xmm9
|
| + vaesenc %xmm2,%xmm10,%xmm10
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| +
|
| + vmovdqa 144(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm9,%xmm9
|
| + vaesenc %xmm1,%xmm10,%xmm10
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| +
|
| + vmovdqa 160(%rdx),%xmm2
|
| + vaesenclast %xmm2,%xmm9,%xmm9
|
| + vaesenclast %xmm2,%xmm10,%xmm10
|
| + vaesenclast %xmm2,%xmm11,%xmm11
|
| + vaesenclast %xmm2,%xmm12,%xmm12
|
| +
|
| +
|
| + vmovdqa %xmm9,0(%rsi)
|
| + vmovdqa %xmm10,16(%rsi)
|
| + vmovdqa %xmm11,32(%rsi)
|
| + vmovdqa %xmm12,48(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
|
| +.globl aes128gcmsiv_enc_msg_x4
|
| +.hidden aes128gcmsiv_enc_msg_x4
|
| +.type aes128gcmsiv_enc_msg_x4,@function
|
| +.align 16
|
| +aes128gcmsiv_enc_msg_x4:
|
| +.cfi_startproc
|
| + testq %r8,%r8
|
| + jnz .L128_enc_msg_x4_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.L128_enc_msg_x4_start:
|
| + pushq %r12
|
| +.cfi_adjust_cfa_offset 8
|
| +.cfi_offset %r12,-16
|
| + pushq %r13
|
| +.cfi_adjust_cfa_offset 8
|
| +.cfi_offset %r13,-24
|
| +
|
| + shrq $4,%r8
|
| + movq %r8,%r10
|
| + shlq $62,%r10
|
| + shrq $62,%r10
|
| +
|
| +
|
| + vmovdqa (%rdx),%xmm15
|
| + vpor OR_MASK(%rip),%xmm15,%xmm15
|
| +
|
| + vmovdqu four(%rip),%xmm4
|
| + vmovdqa %xmm15,%xmm0
|
| + vpaddd one(%rip),%xmm15,%xmm1
|
| + vpaddd two(%rip),%xmm15,%xmm2
|
| + vpaddd three(%rip),%xmm15,%xmm3
|
| +
|
| + shrq $2,%r8
|
| + je .L128_enc_msg_x4_check_remainder
|
| +
|
| + subq $64,%rsi
|
| + subq $64,%rdi
|
| +
|
| +.L128_enc_msg_x4_loop1:
|
| + addq $64,%rsi
|
| + addq $64,%rdi
|
| +
|
| + vmovdqa %xmm0,%xmm5
|
| + vmovdqa %xmm1,%xmm6
|
| + vmovdqa %xmm2,%xmm7
|
| + vmovdqa %xmm3,%xmm8
|
| +
|
| + vpxor (%rcx),%xmm5,%xmm5
|
| + vpxor (%rcx),%xmm6,%xmm6
|
| + vpxor (%rcx),%xmm7,%xmm7
|
| + vpxor (%rcx),%xmm8,%xmm8
|
| +
|
| + vmovdqu 16(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm0,%xmm0
|
| + vmovdqu 32(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm1,%xmm1
|
| + vmovdqu 48(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm2,%xmm2
|
| + vmovdqu 64(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm3,%xmm3
|
| +
|
| + vmovdqu 80(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 96(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 112(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 128(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 144(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 160(%rcx),%xmm12
|
| + vaesenclast %xmm12,%xmm5,%xmm5
|
| + vaesenclast %xmm12,%xmm6,%xmm6
|
| + vaesenclast %xmm12,%xmm7,%xmm7
|
| + vaesenclast %xmm12,%xmm8,%xmm8
|
| +
|
| +
|
| +
|
| + vpxor 0(%rdi),%xmm5,%xmm5
|
| + vpxor 16(%rdi),%xmm6,%xmm6
|
| + vpxor 32(%rdi),%xmm7,%xmm7
|
| + vpxor 48(%rdi),%xmm8,%xmm8
|
| +
|
| + subq $1,%r8
|
| +
|
| + vmovdqu %xmm5,0(%rsi)
|
| + vmovdqu %xmm6,16(%rsi)
|
| + vmovdqu %xmm7,32(%rsi)
|
| + vmovdqu %xmm8,48(%rsi)
|
| +
|
| + jne .L128_enc_msg_x4_loop1
|
| +
|
| + addq $64,%rsi
|
| + addq $64,%rdi
|
| +
|
| +.L128_enc_msg_x4_check_remainder:
|
| + cmpq $0,%r10
|
| + je .L128_enc_msg_x4_out
|
| +
|
| +.L128_enc_msg_x4_loop2:
|
| +
|
| +
|
| + vmovdqa %xmm0,%xmm5
|
| + vpaddd one(%rip),%xmm0,%xmm0
|
| +
|
| + vpxor (%rcx),%xmm5,%xmm5
|
| + vaesenc 16(%rcx),%xmm5,%xmm5
|
| + vaesenc 32(%rcx),%xmm5,%xmm5
|
| + vaesenc 48(%rcx),%xmm5,%xmm5
|
| + vaesenc 64(%rcx),%xmm5,%xmm5
|
| + vaesenc 80(%rcx),%xmm5,%xmm5
|
| + vaesenc 96(%rcx),%xmm5,%xmm5
|
| + vaesenc 112(%rcx),%xmm5,%xmm5
|
| + vaesenc 128(%rcx),%xmm5,%xmm5
|
| + vaesenc 144(%rcx),%xmm5,%xmm5
|
| + vaesenclast 160(%rcx),%xmm5,%xmm5
|
| +
|
| +
|
| + vpxor (%rdi),%xmm5,%xmm5
|
| + vmovdqu %xmm5,(%rsi)
|
| +
|
| + addq $16,%rdi
|
| + addq $16,%rsi
|
| +
|
| + subq $1,%r10
|
| + jne .L128_enc_msg_x4_loop2
|
| +
|
| +.L128_enc_msg_x4_out:
|
| + popq %r13
|
| +.cfi_adjust_cfa_offset -8
|
| +.cfi_restore %r13
|
| + popq %r12
|
| +.cfi_adjust_cfa_offset -8
|
| +.cfi_restore %r12
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
|
| +.globl aes128gcmsiv_enc_msg_x8
|
| +.hidden aes128gcmsiv_enc_msg_x8
|
| +.type aes128gcmsiv_enc_msg_x8,@function
|
| +.align 16
|
| +aes128gcmsiv_enc_msg_x8:
|
| +.cfi_startproc
|
| + testq %r8,%r8
|
| + jnz .L128_enc_msg_x8_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.L128_enc_msg_x8_start:
|
| + pushq %r12
|
| +.cfi_adjust_cfa_offset 8
|
| +.cfi_offset %r12,-16
|
| + pushq %r13
|
| +.cfi_adjust_cfa_offset 8
|
| +.cfi_offset %r13,-24
|
| + pushq %rbp
|
| +.cfi_adjust_cfa_offset 8
|
| +.cfi_offset %rbp,-32
|
| + movq %rsp,%rbp
|
| +.cfi_def_cfa_register rbp
|
| +
|
| +
|
| + subq $128,%rsp
|
| + andq $-64,%rsp
|
| +
|
| + shrq $4,%r8
|
| + movq %r8,%r10
|
| + shlq $61,%r10
|
| + shrq $61,%r10
|
| +
|
| +
|
| + vmovdqu (%rdx),%xmm1
|
| + vpor OR_MASK(%rip),%xmm1,%xmm1
|
| +
|
| +
|
| + vpaddd seven(%rip),%xmm1,%xmm0
|
| + vmovdqu %xmm0,(%rsp)
|
| + vpaddd one(%rip),%xmm1,%xmm9
|
| + vpaddd two(%rip),%xmm1,%xmm10
|
| + vpaddd three(%rip),%xmm1,%xmm11
|
| + vpaddd four(%rip),%xmm1,%xmm12
|
| + vpaddd five(%rip),%xmm1,%xmm13
|
| + vpaddd six(%rip),%xmm1,%xmm14
|
| + vmovdqa %xmm1,%xmm0
|
| +
|
| + shrq $3,%r8
|
| + je .L128_enc_msg_x8_check_remainder
|
| +
|
| + subq $128,%rsi
|
| + subq $128,%rdi
|
| +
|
| +.L128_enc_msg_x8_loop1:
|
| + addq $128,%rsi
|
| + addq $128,%rdi
|
| +
|
| + vmovdqa %xmm0,%xmm1
|
| + vmovdqa %xmm9,%xmm2
|
| + vmovdqa %xmm10,%xmm3
|
| + vmovdqa %xmm11,%xmm4
|
| + vmovdqa %xmm12,%xmm5
|
| + vmovdqa %xmm13,%xmm6
|
| + vmovdqa %xmm14,%xmm7
|
| +
|
| + vmovdqu (%rsp),%xmm8
|
| +
|
| + vpxor (%rcx),%xmm1,%xmm1
|
| + vpxor (%rcx),%xmm2,%xmm2
|
| + vpxor (%rcx),%xmm3,%xmm3
|
| + vpxor (%rcx),%xmm4,%xmm4
|
| + vpxor (%rcx),%xmm5,%xmm5
|
| + vpxor (%rcx),%xmm6,%xmm6
|
| + vpxor (%rcx),%xmm7,%xmm7
|
| + vpxor (%rcx),%xmm8,%xmm8
|
| +
|
| + vmovdqu 16(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu (%rsp),%xmm14
|
| + vpaddd eight(%rip),%xmm14,%xmm14
|
| + vmovdqu %xmm14,(%rsp)
|
| + vmovdqu 32(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpsubd one(%rip),%xmm14,%xmm14
|
| + vmovdqu 48(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm0,%xmm0
|
| + vmovdqu 64(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm9,%xmm9
|
| + vmovdqu 80(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm10,%xmm10
|
| + vmovdqu 96(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm11,%xmm11
|
| + vmovdqu 112(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm12,%xmm12
|
| + vmovdqu 128(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm13,%xmm13
|
| + vmovdqu 144(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu 160(%rcx),%xmm15
|
| + vaesenclast %xmm15,%xmm1,%xmm1
|
| + vaesenclast %xmm15,%xmm2,%xmm2
|
| + vaesenclast %xmm15,%xmm3,%xmm3
|
| + vaesenclast %xmm15,%xmm4,%xmm4
|
| + vaesenclast %xmm15,%xmm5,%xmm5
|
| + vaesenclast %xmm15,%xmm6,%xmm6
|
| + vaesenclast %xmm15,%xmm7,%xmm7
|
| + vaesenclast %xmm15,%xmm8,%xmm8
|
| +
|
| +
|
| +
|
| + vpxor 0(%rdi),%xmm1,%xmm1
|
| + vpxor 16(%rdi),%xmm2,%xmm2
|
| + vpxor 32(%rdi),%xmm3,%xmm3
|
| + vpxor 48(%rdi),%xmm4,%xmm4
|
| + vpxor 64(%rdi),%xmm5,%xmm5
|
| + vpxor 80(%rdi),%xmm6,%xmm6
|
| + vpxor 96(%rdi),%xmm7,%xmm7
|
| + vpxor 112(%rdi),%xmm8,%xmm8
|
| +
|
| + decq %r8
|
| +
|
| + vmovdqu %xmm1,0(%rsi)
|
| + vmovdqu %xmm2,16(%rsi)
|
| + vmovdqu %xmm3,32(%rsi)
|
| + vmovdqu %xmm4,48(%rsi)
|
| + vmovdqu %xmm5,64(%rsi)
|
| + vmovdqu %xmm6,80(%rsi)
|
| + vmovdqu %xmm7,96(%rsi)
|
| + vmovdqu %xmm8,112(%rsi)
|
| +
|
| + jne .L128_enc_msg_x8_loop1
|
| +
|
| + addq $128,%rsi
|
| + addq $128,%rdi
|
| +
|
| +.L128_enc_msg_x8_check_remainder:
|
| + cmpq $0,%r10
|
| + je .L128_enc_msg_x8_out
|
| +
|
| +.L128_enc_msg_x8_loop2:
|
| +
|
| +
|
| + vmovdqa %xmm0,%xmm1
|
| + vpaddd one(%rip),%xmm0,%xmm0
|
| +
|
| + vpxor (%rcx),%xmm1,%xmm1
|
| + vaesenc 16(%rcx),%xmm1,%xmm1
|
| + vaesenc 32(%rcx),%xmm1,%xmm1
|
| + vaesenc 48(%rcx),%xmm1,%xmm1
|
| + vaesenc 64(%rcx),%xmm1,%xmm1
|
| + vaesenc 80(%rcx),%xmm1,%xmm1
|
| + vaesenc 96(%rcx),%xmm1,%xmm1
|
| + vaesenc 112(%rcx),%xmm1,%xmm1
|
| + vaesenc 128(%rcx),%xmm1,%xmm1
|
| + vaesenc 144(%rcx),%xmm1,%xmm1
|
| + vaesenclast 160(%rcx),%xmm1,%xmm1
|
| +
|
| +
|
| + vpxor (%rdi),%xmm1,%xmm1
|
| +
|
| + vmovdqu %xmm1,(%rsi)
|
| +
|
| + addq $16,%rdi
|
| + addq $16,%rsi
|
| +
|
| + decq %r10
|
| + jne .L128_enc_msg_x8_loop2
|
| +
|
| +.L128_enc_msg_x8_out:
|
| + movq %rbp,%rsp
|
| +.cfi_def_cfa_register %rsp
|
| + popq %rbp
|
| +.cfi_adjust_cfa_offset -8
|
| +.cfi_restore %rbp
|
| + popq %r13
|
| +.cfi_adjust_cfa_offset -8
|
| +.cfi_restore %r13
|
| + popq %r12
|
| +.cfi_adjust_cfa_offset -8
|
| +.cfi_restore %r12
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
|
| +.globl aes128gcmsiv_dec
|
| +.hidden aes128gcmsiv_dec
|
| +.type aes128gcmsiv_dec,@function
|
| +.align 16
|
| +aes128gcmsiv_dec:
|
| +.cfi_startproc
|
| + testq $~15,%r9
|
| + jnz .L128_dec_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.L128_dec_start:
|
| + vzeroupper
|
| + vmovdqa (%rdx),%xmm0
|
| + movq %rdx,%rax
|
| +
|
| + leaq 32(%rax),%rax
|
| + leaq 32(%rcx),%rcx
|
| +
|
| +
|
| + vmovdqu (%rdi,%r9,1),%xmm15
|
| + vpor OR_MASK(%rip),%xmm15,%xmm15
|
| + andq $~15,%r9
|
| +
|
| +
|
| + cmpq $96,%r9
|
| + jb .L128_dec_loop2
|
| +
|
| +
|
| + subq $96,%r9
|
| + vmovdqa %xmm15,%xmm7
|
| + vpaddd one(%rip),%xmm7,%xmm8
|
| + vpaddd two(%rip),%xmm7,%xmm9
|
| + vpaddd one(%rip),%xmm9,%xmm10
|
| + vpaddd two(%rip),%xmm9,%xmm11
|
| + vpaddd one(%rip),%xmm11,%xmm12
|
| + vpaddd two(%rip),%xmm11,%xmm15
|
| +
|
| + vpxor (%r8),%xmm7,%xmm7
|
| + vpxor (%r8),%xmm8,%xmm8
|
| + vpxor (%r8),%xmm9,%xmm9
|
| + vpxor (%r8),%xmm10,%xmm10
|
| + vpxor (%r8),%xmm11,%xmm11
|
| + vpxor (%r8),%xmm12,%xmm12
|
| +
|
| + vmovdqu 16(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 32(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 48(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 64(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 80(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 96(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 112(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 128(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 144(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 160(%r8),%xmm4
|
| + vaesenclast %xmm4,%xmm7,%xmm7
|
| + vaesenclast %xmm4,%xmm8,%xmm8
|
| + vaesenclast %xmm4,%xmm9,%xmm9
|
| + vaesenclast %xmm4,%xmm10,%xmm10
|
| + vaesenclast %xmm4,%xmm11,%xmm11
|
| + vaesenclast %xmm4,%xmm12,%xmm12
|
| +
|
| +
|
| + vpxor 0(%rdi),%xmm7,%xmm7
|
| + vpxor 16(%rdi),%xmm8,%xmm8
|
| + vpxor 32(%rdi),%xmm9,%xmm9
|
| + vpxor 48(%rdi),%xmm10,%xmm10
|
| + vpxor 64(%rdi),%xmm11,%xmm11
|
| + vpxor 80(%rdi),%xmm12,%xmm12
|
| +
|
| + vmovdqu %xmm7,0(%rsi)
|
| + vmovdqu %xmm8,16(%rsi)
|
| + vmovdqu %xmm9,32(%rsi)
|
| + vmovdqu %xmm10,48(%rsi)
|
| + vmovdqu %xmm11,64(%rsi)
|
| + vmovdqu %xmm12,80(%rsi)
|
| +
|
| + addq $96,%rdi
|
| + addq $96,%rsi
|
| + jmp .L128_dec_loop1
|
| +
|
| +
|
| +.align 64
|
| +.L128_dec_loop1:
|
| + cmpq $96,%r9
|
| + jb .L128_dec_finish_96
|
| + subq $96,%r9
|
| +
|
| + vmovdqa %xmm12,%xmm6
|
| + vmovdqa %xmm11,16-32(%rax)
|
| + vmovdqa %xmm10,32-32(%rax)
|
| + vmovdqa %xmm9,48-32(%rax)
|
| + vmovdqa %xmm8,64-32(%rax)
|
| + vmovdqa %xmm7,80-32(%rax)
|
| +
|
| + vmovdqa %xmm15,%xmm7
|
| + vpaddd one(%rip),%xmm7,%xmm8
|
| + vpaddd two(%rip),%xmm7,%xmm9
|
| + vpaddd one(%rip),%xmm9,%xmm10
|
| + vpaddd two(%rip),%xmm9,%xmm11
|
| + vpaddd one(%rip),%xmm11,%xmm12
|
| + vpaddd two(%rip),%xmm11,%xmm15
|
| +
|
| + vmovdqa (%r8),%xmm4
|
| + vpxor %xmm4,%xmm7,%xmm7
|
| + vpxor %xmm4,%xmm8,%xmm8
|
| + vpxor %xmm4,%xmm9,%xmm9
|
| + vpxor %xmm4,%xmm10,%xmm10
|
| + vpxor %xmm4,%xmm11,%xmm11
|
| + vpxor %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 0-32(%rcx),%xmm4
|
| + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
|
| + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
|
| + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
|
| + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 16(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu -16(%rax),%xmm6
|
| + vmovdqu -16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 32(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 0(%rax),%xmm6
|
| + vmovdqu 0(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 48(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 16(%rax),%xmm6
|
| + vmovdqu 16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 64(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 32(%rax),%xmm6
|
| + vmovdqu 32(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 80(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 96(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 112(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| +
|
| + vmovdqa 80-32(%rax),%xmm6
|
| + vpxor %xmm0,%xmm6,%xmm6
|
| + vmovdqu 80-32(%rcx),%xmm5
|
| +
|
| + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 128(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| +
|
| + vpsrldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm5
|
| + vpslldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm0
|
| +
|
| + vmovdqa poly(%rip),%xmm3
|
| +
|
| + vmovdqu 144(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 160(%r8),%xmm6
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vpxor 0(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm7,%xmm7
|
| + vpxor 16(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm8,%xmm8
|
| + vpxor 32(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm9,%xmm9
|
| + vpxor 48(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm10,%xmm10
|
| + vpxor 64(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm11,%xmm11
|
| + vpxor 80(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm12,%xmm12
|
| +
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vmovdqu %xmm7,0(%rsi)
|
| + vmovdqu %xmm8,16(%rsi)
|
| + vmovdqu %xmm9,32(%rsi)
|
| + vmovdqu %xmm10,48(%rsi)
|
| + vmovdqu %xmm11,64(%rsi)
|
| + vmovdqu %xmm12,80(%rsi)
|
| +
|
| + vpxor %xmm5,%xmm0,%xmm0
|
| +
|
| + leaq 96(%rdi),%rdi
|
| + leaq 96(%rsi),%rsi
|
| + jmp .L128_dec_loop1
|
| +
|
| +.L128_dec_finish_96:
|
| + vmovdqa %xmm12,%xmm6
|
| + vmovdqa %xmm11,16-32(%rax)
|
| + vmovdqa %xmm10,32-32(%rax)
|
| + vmovdqa %xmm9,48-32(%rax)
|
| + vmovdqa %xmm8,64-32(%rax)
|
| + vmovdqa %xmm7,80-32(%rax)
|
| +
|
| + vmovdqu 0-32(%rcx),%xmm4
|
| + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
|
| + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
|
| + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
|
| + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu -16(%rax),%xmm6
|
| + vmovdqu -16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 0(%rax),%xmm6
|
| + vmovdqu 0(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 16(%rax),%xmm6
|
| + vmovdqu 16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 32(%rax),%xmm6
|
| + vmovdqu 32(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 80-32(%rax),%xmm6
|
| + vpxor %xmm0,%xmm6,%xmm6
|
| + vmovdqu 80-32(%rcx),%xmm5
|
| + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vpsrldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm5
|
| + vpslldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm0
|
| +
|
| + vmovdqa poly(%rip),%xmm3
|
| +
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vpxor %xmm5,%xmm0,%xmm0
|
| +
|
| +.L128_dec_loop2:
|
| +
|
| +
|
| +
|
| + cmpq $16,%r9
|
| + jb .L128_dec_out
|
| + subq $16,%r9
|
| +
|
| + vmovdqa %xmm15,%xmm2
|
| + vpaddd one(%rip),%xmm15,%xmm15
|
| +
|
| + vpxor 0(%r8),%xmm2,%xmm2
|
| + vaesenc 16(%r8),%xmm2,%xmm2
|
| + vaesenc 32(%r8),%xmm2,%xmm2
|
| + vaesenc 48(%r8),%xmm2,%xmm2
|
| + vaesenc 64(%r8),%xmm2,%xmm2
|
| + vaesenc 80(%r8),%xmm2,%xmm2
|
| + vaesenc 96(%r8),%xmm2,%xmm2
|
| + vaesenc 112(%r8),%xmm2,%xmm2
|
| + vaesenc 128(%r8),%xmm2,%xmm2
|
| + vaesenc 144(%r8),%xmm2,%xmm2
|
| + vaesenclast 160(%r8),%xmm2,%xmm2
|
| + vpxor (%rdi),%xmm2,%xmm2
|
| + vmovdqu %xmm2,(%rsi)
|
| + addq $16,%rdi
|
| + addq $16,%rsi
|
| +
|
| + vpxor %xmm2,%xmm0,%xmm0
|
| + vmovdqa -32(%rcx),%xmm1
|
| + call GFMUL
|
| +
|
| + jmp .L128_dec_loop2
|
| +
|
| +.L128_dec_out:
|
| + vmovdqu %xmm0,(%rdx)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
|
| +.globl aes128gcmsiv_ecb_enc_block
|
| +.hidden aes128gcmsiv_ecb_enc_block
|
| +.type aes128gcmsiv_ecb_enc_block,@function
|
| +.align 16
|
| +aes128gcmsiv_ecb_enc_block:
|
| +.cfi_startproc
|
| + vmovdqa (%rdi),%xmm1
|
| +
|
| + vpxor (%rdx),%xmm1,%xmm1
|
| + vaesenc 16(%rdx),%xmm1,%xmm1
|
| + vaesenc 32(%rdx),%xmm1,%xmm1
|
| + vaesenc 48(%rdx),%xmm1,%xmm1
|
| + vaesenc 64(%rdx),%xmm1,%xmm1
|
| + vaesenc 80(%rdx),%xmm1,%xmm1
|
| + vaesenc 96(%rdx),%xmm1,%xmm1
|
| + vaesenc 112(%rdx),%xmm1,%xmm1
|
| + vaesenc 128(%rdx),%xmm1,%xmm1
|
| + vaesenc 144(%rdx),%xmm1,%xmm1
|
| + vaesenclast 160(%rdx),%xmm1,%xmm1
|
| +
|
| + vmovdqa %xmm1,(%rsi)
|
| +
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
|
| +.globl aes256gcmsiv_aes_ks_enc_x1
|
| +.hidden aes256gcmsiv_aes_ks_enc_x1
|
| +.type aes256gcmsiv_aes_ks_enc_x1,@function
|
| +.align 16
|
| +aes256gcmsiv_aes_ks_enc_x1:
|
| +.cfi_startproc
|
| + vmovdqa con1(%rip),%xmm0
|
| + vmovdqa mask(%rip),%xmm15
|
| + vmovdqa (%rdi),%xmm8
|
| + vmovdqa (%rcx),%xmm1
|
| + vmovdqa 16(%rcx),%xmm3
|
| + vpxor %xmm1,%xmm8,%xmm8
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm1,(%rdx)
|
| + vmovdqu %xmm3,16(%rdx)
|
| + vpxor %xmm14,%xmm14,%xmm14
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenc %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,32(%rdx)
|
| +
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpslldq $4,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm3,48(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenc %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,64(%rdx)
|
| +
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpslldq $4,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm3,80(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenc %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,96(%rdx)
|
| +
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpslldq $4,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm3,112(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenc %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,128(%rdx)
|
| +
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpslldq $4,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm3,144(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenc %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,160(%rdx)
|
| +
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpslldq $4,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm3,176(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslld $1,%xmm0,%xmm0
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenc %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,192(%rdx)
|
| +
|
| + vpshufd $0xff,%xmm1,%xmm2
|
| + vaesenclast %xmm14,%xmm2,%xmm2
|
| + vpslldq $4,%xmm3,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpxor %xmm2,%xmm3,%xmm3
|
| + vaesenc %xmm3,%xmm8,%xmm8
|
| + vmovdqu %xmm3,208(%rdx)
|
| +
|
| + vpshufb %xmm15,%xmm3,%xmm2
|
| + vaesenclast %xmm0,%xmm2,%xmm2
|
| + vpslldq $4,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpslldq $4,%xmm4,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpxor %xmm2,%xmm1,%xmm1
|
| + vaesenclast %xmm1,%xmm8,%xmm8
|
| + vmovdqu %xmm1,224(%rdx)
|
| +
|
| + vmovdqa %xmm8,(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
|
| +.globl aes256gcmsiv_ecb_enc_block
|
| +.hidden aes256gcmsiv_ecb_enc_block
|
| +.type aes256gcmsiv_ecb_enc_block,@function
|
| +.align 16
|
| +aes256gcmsiv_ecb_enc_block:
|
| +.cfi_startproc
|
| + vmovdqa (%rdi),%xmm1
|
| + vpxor (%rdx),%xmm1,%xmm1
|
| + vaesenc 16(%rdx),%xmm1,%xmm1
|
| + vaesenc 32(%rdx),%xmm1,%xmm1
|
| + vaesenc 48(%rdx),%xmm1,%xmm1
|
| + vaesenc 64(%rdx),%xmm1,%xmm1
|
| + vaesenc 80(%rdx),%xmm1,%xmm1
|
| + vaesenc 96(%rdx),%xmm1,%xmm1
|
| + vaesenc 112(%rdx),%xmm1,%xmm1
|
| + vaesenc 128(%rdx),%xmm1,%xmm1
|
| + vaesenc 144(%rdx),%xmm1,%xmm1
|
| + vaesenc 160(%rdx),%xmm1,%xmm1
|
| + vaesenc 176(%rdx),%xmm1,%xmm1
|
| + vaesenc 192(%rdx),%xmm1,%xmm1
|
| + vaesenc 208(%rdx),%xmm1,%xmm1
|
| + vaesenclast 224(%rdx),%xmm1,%xmm1
|
| + vmovdqa %xmm1,(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
|
| +.globl aes256gcmsiv_enc_msg_x4
|
| +.hidden aes256gcmsiv_enc_msg_x4
|
| +.type aes256gcmsiv_enc_msg_x4,@function
|
| +.align 16
|
| +aes256gcmsiv_enc_msg_x4:
|
| +.cfi_startproc
|
| + testq %r8,%r8
|
| + jnz .L256_enc_msg_x4_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.L256_enc_msg_x4_start:
|
| + movq %r8,%r10
|
| + shrq $4,%r8
|
| + shlq $60,%r10
|
| + jz .L256_enc_msg_x4_start2
|
| + addq $1,%r8
|
| +
|
| +.L256_enc_msg_x4_start2:
|
| + movq %r8,%r10
|
| + shlq $62,%r10
|
| + shrq $62,%r10
|
| +
|
| +
|
| + vmovdqa (%rdx),%xmm15
|
| + vpor OR_MASK(%rip),%xmm15,%xmm15
|
| +
|
| + vmovdqa four(%rip),%xmm4
|
| + vmovdqa %xmm15,%xmm0
|
| + vpaddd one(%rip),%xmm15,%xmm1
|
| + vpaddd two(%rip),%xmm15,%xmm2
|
| + vpaddd three(%rip),%xmm15,%xmm3
|
| +
|
| + shrq $2,%r8
|
| + je .L256_enc_msg_x4_check_remainder
|
| +
|
| + subq $64,%rsi
|
| + subq $64,%rdi
|
| +
|
| +.L256_enc_msg_x4_loop1:
|
| + addq $64,%rsi
|
| + addq $64,%rdi
|
| +
|
| + vmovdqa %xmm0,%xmm5
|
| + vmovdqa %xmm1,%xmm6
|
| + vmovdqa %xmm2,%xmm7
|
| + vmovdqa %xmm3,%xmm8
|
| +
|
| + vpxor (%rcx),%xmm5,%xmm5
|
| + vpxor (%rcx),%xmm6,%xmm6
|
| + vpxor (%rcx),%xmm7,%xmm7
|
| + vpxor (%rcx),%xmm8,%xmm8
|
| +
|
| + vmovdqu 16(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm0,%xmm0
|
| + vmovdqu 32(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm1,%xmm1
|
| + vmovdqu 48(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm2,%xmm2
|
| + vmovdqu 64(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vpaddd %xmm4,%xmm3,%xmm3
|
| +
|
| + vmovdqu 80(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 96(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 112(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 128(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 144(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 160(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 176(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 192(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 208(%rcx),%xmm12
|
| + vaesenc %xmm12,%xmm5,%xmm5
|
| + vaesenc %xmm12,%xmm6,%xmm6
|
| + vaesenc %xmm12,%xmm7,%xmm7
|
| + vaesenc %xmm12,%xmm8,%xmm8
|
| +
|
| + vmovdqu 224(%rcx),%xmm12
|
| + vaesenclast %xmm12,%xmm5,%xmm5
|
| + vaesenclast %xmm12,%xmm6,%xmm6
|
| + vaesenclast %xmm12,%xmm7,%xmm7
|
| + vaesenclast %xmm12,%xmm8,%xmm8
|
| +
|
| +
|
| +
|
| + vpxor 0(%rdi),%xmm5,%xmm5
|
| + vpxor 16(%rdi),%xmm6,%xmm6
|
| + vpxor 32(%rdi),%xmm7,%xmm7
|
| + vpxor 48(%rdi),%xmm8,%xmm8
|
| +
|
| + subq $1,%r8
|
| +
|
| + vmovdqu %xmm5,0(%rsi)
|
| + vmovdqu %xmm6,16(%rsi)
|
| + vmovdqu %xmm7,32(%rsi)
|
| + vmovdqu %xmm8,48(%rsi)
|
| +
|
| + jne .L256_enc_msg_x4_loop1
|
| +
|
| + addq $64,%rsi
|
| + addq $64,%rdi
|
| +
|
| +.L256_enc_msg_x4_check_remainder:
|
| + cmpq $0,%r10
|
| + je .L256_enc_msg_x4_out
|
| +
|
| +.L256_enc_msg_x4_loop2:
|
| +
|
| +
|
| +
|
| + vmovdqa %xmm0,%xmm5
|
| + vpaddd one(%rip),%xmm0,%xmm0
|
| + vpxor (%rcx),%xmm5,%xmm5
|
| + vaesenc 16(%rcx),%xmm5,%xmm5
|
| + vaesenc 32(%rcx),%xmm5,%xmm5
|
| + vaesenc 48(%rcx),%xmm5,%xmm5
|
| + vaesenc 64(%rcx),%xmm5,%xmm5
|
| + vaesenc 80(%rcx),%xmm5,%xmm5
|
| + vaesenc 96(%rcx),%xmm5,%xmm5
|
| + vaesenc 112(%rcx),%xmm5,%xmm5
|
| + vaesenc 128(%rcx),%xmm5,%xmm5
|
| + vaesenc 144(%rcx),%xmm5,%xmm5
|
| + vaesenc 160(%rcx),%xmm5,%xmm5
|
| + vaesenc 176(%rcx),%xmm5,%xmm5
|
| + vaesenc 192(%rcx),%xmm5,%xmm5
|
| + vaesenc 208(%rcx),%xmm5,%xmm5
|
| + vaesenclast 224(%rcx),%xmm5,%xmm5
|
| +
|
| +
|
| + vpxor (%rdi),%xmm5,%xmm5
|
| +
|
| + vmovdqu %xmm5,(%rsi)
|
| +
|
| + addq $16,%rdi
|
| + addq $16,%rsi
|
| +
|
| + subq $1,%r10
|
| + jne .L256_enc_msg_x4_loop2
|
| +
|
| +.L256_enc_msg_x4_out:
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
|
| +.globl aes256gcmsiv_enc_msg_x8
|
| +.hidden aes256gcmsiv_enc_msg_x8
|
| +.type aes256gcmsiv_enc_msg_x8,@function
|
| +.align 16
|
| +aes256gcmsiv_enc_msg_x8:
|
| +.cfi_startproc
|
| + testq %r8,%r8
|
| + jnz .L256_enc_msg_x8_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.L256_enc_msg_x8_start:
|
| +
|
| + movq %rsp,%r11
|
| + subq $16,%r11
|
| + andq $-64,%r11
|
| +
|
| + movq %r8,%r10
|
| + shrq $4,%r8
|
| + shlq $60,%r10
|
| + jz .L256_enc_msg_x8_start2
|
| + addq $1,%r8
|
| +
|
| +.L256_enc_msg_x8_start2:
|
| + movq %r8,%r10
|
| + shlq $61,%r10
|
| + shrq $61,%r10
|
| +
|
| +
|
| + vmovdqa (%rdx),%xmm1
|
| + vpor OR_MASK(%rip),%xmm1,%xmm1
|
| +
|
| +
|
| + vpaddd seven(%rip),%xmm1,%xmm0
|
| + vmovdqa %xmm0,(%r11)
|
| + vpaddd one(%rip),%xmm1,%xmm9
|
| + vpaddd two(%rip),%xmm1,%xmm10
|
| + vpaddd three(%rip),%xmm1,%xmm11
|
| + vpaddd four(%rip),%xmm1,%xmm12
|
| + vpaddd five(%rip),%xmm1,%xmm13
|
| + vpaddd six(%rip),%xmm1,%xmm14
|
| + vmovdqa %xmm1,%xmm0
|
| +
|
| + shrq $3,%r8
|
| + jz .L256_enc_msg_x8_check_remainder
|
| +
|
| + subq $128,%rsi
|
| + subq $128,%rdi
|
| +
|
| +.L256_enc_msg_x8_loop1:
|
| + addq $128,%rsi
|
| + addq $128,%rdi
|
| +
|
| + vmovdqa %xmm0,%xmm1
|
| + vmovdqa %xmm9,%xmm2
|
| + vmovdqa %xmm10,%xmm3
|
| + vmovdqa %xmm11,%xmm4
|
| + vmovdqa %xmm12,%xmm5
|
| + vmovdqa %xmm13,%xmm6
|
| + vmovdqa %xmm14,%xmm7
|
| +
|
| + vmovdqa (%r11),%xmm8
|
| +
|
| + vpxor (%rcx),%xmm1,%xmm1
|
| + vpxor (%rcx),%xmm2,%xmm2
|
| + vpxor (%rcx),%xmm3,%xmm3
|
| + vpxor (%rcx),%xmm4,%xmm4
|
| + vpxor (%rcx),%xmm5,%xmm5
|
| + vpxor (%rcx),%xmm6,%xmm6
|
| + vpxor (%rcx),%xmm7,%xmm7
|
| + vpxor (%rcx),%xmm8,%xmm8
|
| +
|
| + vmovdqu 16(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqa (%r11),%xmm14
|
| + vpaddd eight(%rip),%xmm14,%xmm14
|
| + vmovdqa %xmm14,(%r11)
|
| + vmovdqu 32(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpsubd one(%rip),%xmm14,%xmm14
|
| + vmovdqu 48(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm0,%xmm0
|
| + vmovdqu 64(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm9,%xmm9
|
| + vmovdqu 80(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm10,%xmm10
|
| + vmovdqu 96(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm11,%xmm11
|
| + vmovdqu 112(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm12,%xmm12
|
| + vmovdqu 128(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vpaddd eight(%rip),%xmm13,%xmm13
|
| + vmovdqu 144(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu 160(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu 176(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu 192(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu 208(%rcx),%xmm15
|
| + vaesenc %xmm15,%xmm1,%xmm1
|
| + vaesenc %xmm15,%xmm2,%xmm2
|
| + vaesenc %xmm15,%xmm3,%xmm3
|
| + vaesenc %xmm15,%xmm4,%xmm4
|
| + vaesenc %xmm15,%xmm5,%xmm5
|
| + vaesenc %xmm15,%xmm6,%xmm6
|
| + vaesenc %xmm15,%xmm7,%xmm7
|
| + vaesenc %xmm15,%xmm8,%xmm8
|
| +
|
| + vmovdqu 224(%rcx),%xmm15
|
| + vaesenclast %xmm15,%xmm1,%xmm1
|
| + vaesenclast %xmm15,%xmm2,%xmm2
|
| + vaesenclast %xmm15,%xmm3,%xmm3
|
| + vaesenclast %xmm15,%xmm4,%xmm4
|
| + vaesenclast %xmm15,%xmm5,%xmm5
|
| + vaesenclast %xmm15,%xmm6,%xmm6
|
| + vaesenclast %xmm15,%xmm7,%xmm7
|
| + vaesenclast %xmm15,%xmm8,%xmm8
|
| +
|
| +
|
| +
|
| + vpxor 0(%rdi),%xmm1,%xmm1
|
| + vpxor 16(%rdi),%xmm2,%xmm2
|
| + vpxor 32(%rdi),%xmm3,%xmm3
|
| + vpxor 48(%rdi),%xmm4,%xmm4
|
| + vpxor 64(%rdi),%xmm5,%xmm5
|
| + vpxor 80(%rdi),%xmm6,%xmm6
|
| + vpxor 96(%rdi),%xmm7,%xmm7
|
| + vpxor 112(%rdi),%xmm8,%xmm8
|
| +
|
| + subq $1,%r8
|
| +
|
| + vmovdqu %xmm1,0(%rsi)
|
| + vmovdqu %xmm2,16(%rsi)
|
| + vmovdqu %xmm3,32(%rsi)
|
| + vmovdqu %xmm4,48(%rsi)
|
| + vmovdqu %xmm5,64(%rsi)
|
| + vmovdqu %xmm6,80(%rsi)
|
| + vmovdqu %xmm7,96(%rsi)
|
| + vmovdqu %xmm8,112(%rsi)
|
| +
|
| + jne .L256_enc_msg_x8_loop1
|
| +
|
| + addq $128,%rsi
|
| + addq $128,%rdi
|
| +
|
| +.L256_enc_msg_x8_check_remainder:
|
| + cmpq $0,%r10
|
| + je .L256_enc_msg_x8_out
|
| +
|
| +.L256_enc_msg_x8_loop2:
|
| +
|
| +
|
| + vmovdqa %xmm0,%xmm1
|
| + vpaddd one(%rip),%xmm0,%xmm0
|
| +
|
| + vpxor (%rcx),%xmm1,%xmm1
|
| + vaesenc 16(%rcx),%xmm1,%xmm1
|
| + vaesenc 32(%rcx),%xmm1,%xmm1
|
| + vaesenc 48(%rcx),%xmm1,%xmm1
|
| + vaesenc 64(%rcx),%xmm1,%xmm1
|
| + vaesenc 80(%rcx),%xmm1,%xmm1
|
| + vaesenc 96(%rcx),%xmm1,%xmm1
|
| + vaesenc 112(%rcx),%xmm1,%xmm1
|
| + vaesenc 128(%rcx),%xmm1,%xmm1
|
| + vaesenc 144(%rcx),%xmm1,%xmm1
|
| + vaesenc 160(%rcx),%xmm1,%xmm1
|
| + vaesenc 176(%rcx),%xmm1,%xmm1
|
| + vaesenc 192(%rcx),%xmm1,%xmm1
|
| + vaesenc 208(%rcx),%xmm1,%xmm1
|
| + vaesenclast 224(%rcx),%xmm1,%xmm1
|
| +
|
| +
|
| + vpxor (%rdi),%xmm1,%xmm1
|
| +
|
| + vmovdqu %xmm1,(%rsi)
|
| +
|
| + addq $16,%rdi
|
| + addq $16,%rsi
|
| + subq $1,%r10
|
| + jnz .L256_enc_msg_x8_loop2
|
| +
|
| +.L256_enc_msg_x8_out:
|
| + .byte 0xf3,0xc3
|
| +
|
| +.cfi_endproc
|
| +.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
|
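| +// aes256gcmsiv_dec: combined AES-256 CTR decryption and POLYVAL update, six
|
| +// blocks per pass; the accumulator is updated over the decrypted plaintext.
|
| +// Apparent register contract (names assumed): rdi = ciphertext, where the
|
| +// 16-byte block straight after it (loaded from (%rdi,%r9)) seeds the counter,
|
| +// rsi = plaintext output, rdx = POLYVAL accumulator plus scratch for the
|
| +// blocks carried between passes (loaded here, written back at .L256_dec_out),
|
| +// rcx = table of powers of H, r8 = AES-256 key schedule, r9 = length in
|
| +// bytes; only the multiple-of-16 prefix is processed, the partial tail block
|
| +// is left to the caller.
|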
| +.globl aes256gcmsiv_dec
|
| +.hidden aes256gcmsiv_dec
|
| +.type aes256gcmsiv_dec,@function
|
| +.align 16
|
| +aes256gcmsiv_dec:
|
| +.cfi_startproc
|
| + testq $~15,%r9
|
| + jnz .L256_dec_start
|
| + .byte 0xf3,0xc3
|
| +
|
| +.L256_dec_start:
|
| + vzeroupper
|
| + vmovdqa (%rdx),%xmm0
|
| + movq %rdx,%rax
|
| +
|
| + leaq 32(%rax),%rax
|
| + leaq 32(%rcx),%rcx
|
| +
|
| +
|
| + vmovdqu (%rdi,%r9,1),%xmm15
|
| + vpor OR_MASK(%rip),%xmm15,%xmm15
|
| + andq $~15,%r9
|
| +
|
| +
|
| + cmpq $96,%r9
|
| + jb .L256_dec_loop2
|
| +
|
| +
|
| + subq $96,%r9
|
| + vmovdqa %xmm15,%xmm7
|
| + vpaddd one(%rip),%xmm7,%xmm8
|
| + vpaddd two(%rip),%xmm7,%xmm9
|
| + vpaddd one(%rip),%xmm9,%xmm10
|
| + vpaddd two(%rip),%xmm9,%xmm11
|
| + vpaddd one(%rip),%xmm11,%xmm12
|
| + vpaddd two(%rip),%xmm11,%xmm15
|
| +
|
| + vpxor (%r8),%xmm7,%xmm7
|
| + vpxor (%r8),%xmm8,%xmm8
|
| + vpxor (%r8),%xmm9,%xmm9
|
| + vpxor (%r8),%xmm10,%xmm10
|
| + vpxor (%r8),%xmm11,%xmm11
|
| + vpxor (%r8),%xmm12,%xmm12
|
| +
|
| + vmovdqu 16(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 32(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 48(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 64(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 80(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 96(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 112(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 128(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 144(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 160(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 176(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 192(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 208(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 224(%r8),%xmm4
|
| + vaesenclast %xmm4,%xmm7,%xmm7
|
| + vaesenclast %xmm4,%xmm8,%xmm8
|
| + vaesenclast %xmm4,%xmm9,%xmm9
|
| + vaesenclast %xmm4,%xmm10,%xmm10
|
| + vaesenclast %xmm4,%xmm11,%xmm11
|
| + vaesenclast %xmm4,%xmm12,%xmm12
|
| +
|
| +
|
| + vpxor 0(%rdi),%xmm7,%xmm7
|
| + vpxor 16(%rdi),%xmm8,%xmm8
|
| + vpxor 32(%rdi),%xmm9,%xmm9
|
| + vpxor 48(%rdi),%xmm10,%xmm10
|
| + vpxor 64(%rdi),%xmm11,%xmm11
|
| + vpxor 80(%rdi),%xmm12,%xmm12
|
| +
|
| + vmovdqu %xmm7,0(%rsi)
|
| + vmovdqu %xmm8,16(%rsi)
|
| + vmovdqu %xmm9,32(%rsi)
|
| + vmovdqu %xmm10,48(%rsi)
|
| + vmovdqu %xmm11,64(%rsi)
|
| + vmovdqu %xmm12,80(%rsi)
|
| +
|
| + addq $96,%rdi
|
| + addq $96,%rsi
|
| + jmp .L256_dec_loop1
|
| +
|
| +
|
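| +// Main 96-byte pipeline: each pass generates keystream for six new counter
|
| +// blocks while the six plaintext blocks produced by the previous pass
|
| +// (spilled to the rdx scratch area at the top of the loop) are multiplied by
|
| +// decreasing powers of H and folded into the accumulator, with the reduction
|
| +// by the poly constant interleaved between AES rounds.
|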
| +.align 64
|
| +.L256_dec_loop1:
|
| + cmpq $96,%r9
|
| + jb .L256_dec_finish_96
|
| + subq $96,%r9
|
| +
|
| + vmovdqa %xmm12,%xmm6
|
| + vmovdqa %xmm11,16-32(%rax)
|
| + vmovdqa %xmm10,32-32(%rax)
|
| + vmovdqa %xmm9,48-32(%rax)
|
| + vmovdqa %xmm8,64-32(%rax)
|
| + vmovdqa %xmm7,80-32(%rax)
|
| +
|
| + vmovdqa %xmm15,%xmm7
|
| + vpaddd one(%rip),%xmm7,%xmm8
|
| + vpaddd two(%rip),%xmm7,%xmm9
|
| + vpaddd one(%rip),%xmm9,%xmm10
|
| + vpaddd two(%rip),%xmm9,%xmm11
|
| + vpaddd one(%rip),%xmm11,%xmm12
|
| + vpaddd two(%rip),%xmm11,%xmm15
|
| +
|
| + vmovdqa (%r8),%xmm4
|
| + vpxor %xmm4,%xmm7,%xmm7
|
| + vpxor %xmm4,%xmm8,%xmm8
|
| + vpxor %xmm4,%xmm9,%xmm9
|
| + vpxor %xmm4,%xmm10,%xmm10
|
| + vpxor %xmm4,%xmm11,%xmm11
|
| + vpxor %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 0-32(%rcx),%xmm4
|
| + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
|
| + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
|
| + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
|
| + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 16(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu -16(%rax),%xmm6
|
| + vmovdqu -16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 32(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 0(%rax),%xmm6
|
| + vmovdqu 0(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 48(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 16(%rax),%xmm6
|
| + vmovdqu 16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 64(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 32(%rax),%xmm6
|
| + vmovdqu 32(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 80(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 96(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 112(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| +
|
| + vmovdqa 80-32(%rax),%xmm6
|
| + vpxor %xmm0,%xmm6,%xmm6
|
| + vmovdqu 80-32(%rcx),%xmm5
|
| +
|
| + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 128(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| +
|
| + vpsrldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm5
|
| + vpslldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm0
|
| +
|
| + vmovdqa poly(%rip),%xmm3
|
| +
|
| + vmovdqu 144(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 160(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 176(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 192(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 208(%r8),%xmm4
|
| + vaesenc %xmm4,%xmm7,%xmm7
|
| + vaesenc %xmm4,%xmm8,%xmm8
|
| + vaesenc %xmm4,%xmm9,%xmm9
|
| + vaesenc %xmm4,%xmm10,%xmm10
|
| + vaesenc %xmm4,%xmm11,%xmm11
|
| + vaesenc %xmm4,%xmm12,%xmm12
|
| +
|
| + vmovdqu 224(%r8),%xmm6
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vpxor 0(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm7,%xmm7
|
| + vpxor 16(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm8,%xmm8
|
| + vpxor 32(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm9,%xmm9
|
| + vpxor 48(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm10,%xmm10
|
| + vpxor 64(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm11,%xmm11
|
| + vpxor 80(%rdi),%xmm6,%xmm4
|
| + vaesenclast %xmm4,%xmm12,%xmm12
|
| +
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vmovdqu %xmm7,0(%rsi)
|
| + vmovdqu %xmm8,16(%rsi)
|
| + vmovdqu %xmm9,32(%rsi)
|
| + vmovdqu %xmm10,48(%rsi)
|
| + vmovdqu %xmm11,64(%rsi)
|
| + vmovdqu %xmm12,80(%rsi)
|
| +
|
| + vpxor %xmm5,%xmm0,%xmm0
|
| +
|
| + leaq 96(%rdi),%rdi
|
| + leaq 96(%rsi),%rsi
|
| + jmp .L256_dec_loop1
|
| +
|
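| +// Pipeline drain: no more keystream is needed; the six plaintext blocks from
|
| +// the final pass are folded into POLYVAL and the 256-bit product is reduced
|
| +// twice by the poly constant.
|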
| +.L256_dec_finish_96:
|
| + vmovdqa %xmm12,%xmm6
|
| + vmovdqa %xmm11,16-32(%rax)
|
| + vmovdqa %xmm10,32-32(%rax)
|
| + vmovdqa %xmm9,48-32(%rax)
|
| + vmovdqa %xmm8,64-32(%rax)
|
| + vmovdqa %xmm7,80-32(%rax)
|
| +
|
| + vmovdqu 0-32(%rcx),%xmm4
|
| + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
|
| + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
|
| + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
|
| + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu -16(%rax),%xmm6
|
| + vmovdqu -16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 0(%rax),%xmm6
|
| + vmovdqu 0(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 16(%rax),%xmm6
|
| + vmovdqu 16(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vmovdqu 32(%rax),%xmm6
|
| + vmovdqu 32(%rcx),%xmm13
|
| +
|
| + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| +
|
| + vmovdqu 80-32(%rax),%xmm6
|
| + vpxor %xmm0,%xmm6,%xmm6
|
| + vmovdqu 80-32(%rcx),%xmm5
|
| + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm2
|
| + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm3
|
| + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
|
| + vpxor %xmm4,%xmm1,%xmm1
|
| +
|
| + vpsrldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm2,%xmm5
|
| + vpslldq $8,%xmm1,%xmm4
|
| + vpxor %xmm4,%xmm3,%xmm0
|
| +
|
| + vmovdqa poly(%rip),%xmm3
|
| +
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vpalignr $8,%xmm0,%xmm0,%xmm2
|
| + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
|
| + vpxor %xmm0,%xmm2,%xmm0
|
| +
|
| + vpxor %xmm5,%xmm0,%xmm0
|
| +
|
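| +// Remaining whole blocks (fewer than six) are handled one at a time:
|
| +// CTR-decrypt the block, XOR the plaintext into the accumulator, and
|
| +// multiply by H (taken from -32(%rcx)) via GFMUL.
|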
| +.L256_dec_loop2:
|
| +
|
| +
|
| +
|
| + cmpq $16,%r9
|
| + jb .L256_dec_out
|
| + subq $16,%r9
|
| +
|
| + vmovdqa %xmm15,%xmm2
|
| + vpaddd one(%rip),%xmm15,%xmm15
|
| +
|
| + vpxor 0(%r8),%xmm2,%xmm2
|
| + vaesenc 16(%r8),%xmm2,%xmm2
|
| + vaesenc 32(%r8),%xmm2,%xmm2
|
| + vaesenc 48(%r8),%xmm2,%xmm2
|
| + vaesenc 64(%r8),%xmm2,%xmm2
|
| + vaesenc 80(%r8),%xmm2,%xmm2
|
| + vaesenc 96(%r8),%xmm2,%xmm2
|
| + vaesenc 112(%r8),%xmm2,%xmm2
|
| + vaesenc 128(%r8),%xmm2,%xmm2
|
| + vaesenc 144(%r8),%xmm2,%xmm2
|
| + vaesenc 160(%r8),%xmm2,%xmm2
|
| + vaesenc 176(%r8),%xmm2,%xmm2
|
| + vaesenc 192(%r8),%xmm2,%xmm2
|
| + vaesenc 208(%r8),%xmm2,%xmm2
|
| + vaesenclast 224(%r8),%xmm2,%xmm2
|
| + vpxor (%rdi),%xmm2,%xmm2
|
| + vmovdqu %xmm2,(%rsi)
|
| + addq $16,%rdi
|
| + addq $16,%rsi
|
| +
|
| + vpxor %xmm2,%xmm0,%xmm0
|
| + vmovdqa -32(%rcx),%xmm1
|
| + call GFMUL
|
| +
|
| + jmp .L256_dec_loop2
|
| +
|
| +.L256_dec_out:
|
| + vmovdqu %xmm0,(%rdx)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
|
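| +// aes256gcmsiv_kdf: per-nonce key derivation. Apparent register contract
|
| +// (names assumed): rdi = 16-byte nonce, rsi = 96-byte output (written with
|
| +// aligned stores), rdx = master AES-256 key schedule. Six counter blocks are
|
| +// formed from the masked nonce and encrypted in parallel, from which the
|
| +// caller presumably takes the record-authentication and record-encryption
|
| +// keys.
|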
| +.globl aes256gcmsiv_kdf
|
| +.hidden aes256gcmsiv_kdf
|
| +.type aes256gcmsiv_kdf,@function
|
| +.align 16
|
| +aes256gcmsiv_kdf:
|
| +.cfi_startproc
|
| +
|
| +
|
| +
|
| +
|
| + vmovdqa (%rdx),%xmm1
|
| + vmovdqa 0(%rdi),%xmm4
|
| + vmovdqa and_mask(%rip),%xmm11
|
| + vmovdqa one(%rip),%xmm8
|
| + vpshufd $0x90,%xmm4,%xmm4
|
| + vpand %xmm11,%xmm4,%xmm4
|
| + vpaddd %xmm8,%xmm4,%xmm6
|
| + vpaddd %xmm8,%xmm6,%xmm7
|
| + vpaddd %xmm8,%xmm7,%xmm11
|
| + vpaddd %xmm8,%xmm11,%xmm12
|
| + vpaddd %xmm8,%xmm12,%xmm13
|
| +
|
| + vpxor %xmm1,%xmm4,%xmm4
|
| + vpxor %xmm1,%xmm6,%xmm6
|
| + vpxor %xmm1,%xmm7,%xmm7
|
| + vpxor %xmm1,%xmm11,%xmm11
|
| + vpxor %xmm1,%xmm12,%xmm12
|
| + vpxor %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 16(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 32(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm4,%xmm4
|
| + vaesenc %xmm2,%xmm6,%xmm6
|
| + vaesenc %xmm2,%xmm7,%xmm7
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| + vaesenc %xmm2,%xmm13,%xmm13
|
| +
|
| + vmovdqa 48(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 64(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm4,%xmm4
|
| + vaesenc %xmm2,%xmm6,%xmm6
|
| + vaesenc %xmm2,%xmm7,%xmm7
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| + vaesenc %xmm2,%xmm13,%xmm13
|
| +
|
| + vmovdqa 80(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 96(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm4,%xmm4
|
| + vaesenc %xmm2,%xmm6,%xmm6
|
| + vaesenc %xmm2,%xmm7,%xmm7
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| + vaesenc %xmm2,%xmm13,%xmm13
|
| +
|
| + vmovdqa 112(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 128(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm4,%xmm4
|
| + vaesenc %xmm2,%xmm6,%xmm6
|
| + vaesenc %xmm2,%xmm7,%xmm7
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| + vaesenc %xmm2,%xmm13,%xmm13
|
| +
|
| + vmovdqa 144(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 160(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm4,%xmm4
|
| + vaesenc %xmm2,%xmm6,%xmm6
|
| + vaesenc %xmm2,%xmm7,%xmm7
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| + vaesenc %xmm2,%xmm13,%xmm13
|
| +
|
| + vmovdqa 176(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 192(%rdx),%xmm2
|
| + vaesenc %xmm2,%xmm4,%xmm4
|
| + vaesenc %xmm2,%xmm6,%xmm6
|
| + vaesenc %xmm2,%xmm7,%xmm7
|
| + vaesenc %xmm2,%xmm11,%xmm11
|
| + vaesenc %xmm2,%xmm12,%xmm12
|
| + vaesenc %xmm2,%xmm13,%xmm13
|
| +
|
| + vmovdqa 208(%rdx),%xmm1
|
| + vaesenc %xmm1,%xmm4,%xmm4
|
| + vaesenc %xmm1,%xmm6,%xmm6
|
| + vaesenc %xmm1,%xmm7,%xmm7
|
| + vaesenc %xmm1,%xmm11,%xmm11
|
| + vaesenc %xmm1,%xmm12,%xmm12
|
| + vaesenc %xmm1,%xmm13,%xmm13
|
| +
|
| + vmovdqa 224(%rdx),%xmm2
|
| + vaesenclast %xmm2,%xmm4,%xmm4
|
| + vaesenclast %xmm2,%xmm6,%xmm6
|
| + vaesenclast %xmm2,%xmm7,%xmm7
|
| + vaesenclast %xmm2,%xmm11,%xmm11
|
| + vaesenclast %xmm2,%xmm12,%xmm12
|
| + vaesenclast %xmm2,%xmm13,%xmm13
|
| +
|
| +
|
| + vmovdqa %xmm4,0(%rsi)
|
| + vmovdqa %xmm6,16(%rsi)
|
| + vmovdqa %xmm7,32(%rsi)
|
| + vmovdqa %xmm11,48(%rsi)
|
| + vmovdqa %xmm12,64(%rsi)
|
| + vmovdqa %xmm13,80(%rsi)
|
| + .byte 0xf3,0xc3
|
| +.cfi_endproc
|
| +.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
|
| +#endif
|