Chromium Code Reviews

Unified Diff: third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm

Issue 2869243005: Roll src/third_party/boringssl/src ddfcc6a60..1e5cb820d (Closed)
Patch Set: Created 3 years, 7 months ago
Index: third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm
diff --git a/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm b/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm
new file mode 100644
index 0000000000000000000000000000000000000000..1c7360d085ba57a2d98d49efeafb1b4d03a2cc00
--- /dev/null
+++ b/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm
@@ -0,0 +1,3270 @@
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+section .data data align=8
+
+
+ALIGN 16
+one:
+ DQ 1,0
+two:
+ DQ 2,0
+three:
+ DQ 3,0
+four:
+ DQ 4,0
+five:
+ DQ 5,0
+six:
+ DQ 6,0
+seven:
+ DQ 7,0
+eight:
+ DQ 8,0
+
+OR_MASK:
+ DD 0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+ DQ 0x1,0xc200000000000000
+mask:
+ DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+ DD 1,1,1,1
+con2:
+ DD 0x1b,0x1b,0x1b,0x1b
+con3:
+ DB -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+ DD 0,0xffffffff,0xffffffff,0xffffffff
+section .text code align=64
+
+
+ALIGN 16
+GFMUL:
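+; GF(2^128) multiply: xmm0 = xmm0 * xmm1 reduced modulo the POLYVAL
+; polynomial ("poly" above); clobbers xmm2-xmm5.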
+
+ vpclmulqdq xmm2,xmm0,xmm1,0x00
+ vpclmulqdq xmm5,xmm0,xmm1,0x11
+ vpclmulqdq xmm3,xmm0,xmm1,0x10
+ vpclmulqdq xmm4,xmm0,xmm1,0x01
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm3,8
+ vpsrldq xmm3,xmm3,8
+ vpxor xmm2,xmm2,xmm4
+ vpxor xmm5,xmm5,xmm3
+
+ vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10
+ vpshufd xmm4,xmm2,78
+ vpxor xmm2,xmm3,xmm4
+
+ vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10
+ vpshufd xmm4,xmm2,78
+ vpxor xmm2,xmm3,xmm4
+
+ vpxor xmm0,xmm2,xmm5
+ DB 0F3h,0C3h ;repret
+
+
+global aesgcmsiv_htable_init
+
+ALIGN 16
+aesgcmsiv_htable_init:
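+; Fills an 8-entry table with H^1..H^8 (16 bytes each) by repeated
+; GFMUL; output table in rdi, H in rsi after the Win64 prologue below.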
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_htable_init:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+ vmovdqa xmm0,XMMWORD[rsi]
+ vmovdqa xmm1,xmm0
+ vmovdqa XMMWORD[rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[16+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[32+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[48+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[64+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[80+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[96+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[112+rdi],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aesgcmsiv_htable_init:
+global aesgcmsiv_htable6_init
+
+ALIGN 16
+aesgcmsiv_htable6_init:
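+; Six-entry variant of aesgcmsiv_htable_init (H^1..H^6), as used by the
+; six-blocks-at-a-time decryption routines below.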
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_htable6_init:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+ vmovdqa xmm0,XMMWORD[rsi]
+ vmovdqa xmm1,xmm0
+ vmovdqa XMMWORD[rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[16+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[32+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[48+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[64+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[80+rdi],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aesgcmsiv_htable6_init:
+global aesgcmsiv_htable_polyval
+
+ALIGN 16
+aesgcmsiv_htable_polyval:
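+; POLYVAL of rdx bytes at rsi using the 8-entry power table at rdi,
+; accumulated into the 16-byte state at rcx (registers as set up by the
+; Win64 prologue): a sub-128-byte prefix is folded first, then the main
+; loop absorbs 128 bytes (8 blocks) per pass.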
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_htable_polyval:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+ test rdx,rdx
+ jnz NEAR $L$htable_polyval_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$htable_polyval_start:
+ vzeroall
+
+
+
+ mov r11,rdx
+ and r11,127
+
+ jz NEAR $L$htable_polyval_no_prefix
+
+ vpxor xmm9,xmm9,xmm9
+ vmovdqa xmm1,XMMWORD[rcx]
+ sub rdx,r11
+
+ sub r11,16
+
+
+ vmovdqu xmm0,XMMWORD[rsi]
+ vpxor xmm0,xmm0,xmm1
+
+ vpclmulqdq xmm5,xmm0,XMMWORD[r11*1+rdi],0x01
+ vpclmulqdq xmm3,xmm0,XMMWORD[r11*1+rdi],0x00
+ vpclmulqdq xmm4,xmm0,XMMWORD[r11*1+rdi],0x11
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+ lea rsi,[16+rsi]
+ test r11,r11
+ jnz NEAR $L$htable_polyval_prefix_loop
+ jmp NEAR $L$htable_polyval_prefix_complete
+
+
+ALIGN 64
+$L$htable_polyval_prefix_loop:
+ sub r11,16
+
+ vmovdqu xmm0,XMMWORD[rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+ test r11,r11
+
+ lea rsi,[16+rsi]
+
+ jnz NEAR $L$htable_polyval_prefix_loop
+
+$L$htable_polyval_prefix_complete:
+ vpsrldq xmm6,xmm5,8
+ vpslldq xmm5,xmm5,8
+
+ vpxor xmm9,xmm4,xmm6
+ vpxor xmm1,xmm3,xmm5
+
+ jmp NEAR $L$htable_polyval_main_loop
+
+$L$htable_polyval_no_prefix:
+
+
+
+
+ vpxor xmm1,xmm1,xmm1
+ vmovdqa xmm9,XMMWORD[rcx]
+
+ALIGN 64
+$L$htable_polyval_main_loop:
+ sub rdx,0x80
+ jb NEAR $L$htable_polyval_out
+
+ vmovdqu xmm0,XMMWORD[112+rsi]
+
+ vpclmulqdq xmm5,xmm0,XMMWORD[rdi],0x01
+ vpclmulqdq xmm3,xmm0,XMMWORD[rdi],0x00
+ vpclmulqdq xmm4,xmm0,XMMWORD[rdi],0x11
+ vpclmulqdq xmm6,xmm0,XMMWORD[rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vmovdqu xmm0,XMMWORD[96+rsi]
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+
+ vmovdqu xmm0,XMMWORD[80+rsi]
+
+ vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpxor xmm1,xmm1,xmm7
+
+ vmovdqu xmm0,XMMWORD[64+rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vmovdqu xmm0,XMMWORD[48+rsi]
+
+ vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpxor xmm1,xmm1,xmm7
+
+ vmovdqu xmm0,XMMWORD[32+rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpxor xmm1,xmm1,xmm9
+
+ vmovdqu xmm0,XMMWORD[16+rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vmovdqu xmm0,XMMWORD[rsi]
+ vpxor xmm0,xmm0,xmm1
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpsrldq xmm6,xmm5,8
+ vpslldq xmm5,xmm5,8
+
+ vpxor xmm9,xmm4,xmm6
+ vpxor xmm1,xmm3,xmm5
+
+ lea rsi,[128+rsi]
+ jmp NEAR $L$htable_polyval_main_loop
+
+
+
+$L$htable_polyval_out:
+ vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+ vpxor xmm1,xmm1,xmm6
+
+ vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+ vpxor xmm1,xmm1,xmm6
+ vpxor xmm1,xmm1,xmm9
+
+ vmovdqu XMMWORD[rcx],xmm1
+ vzeroupper
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aesgcmsiv_htable_polyval:
+global aesgcmsiv_polyval_horner
+
+ALIGN 16
+aesgcmsiv_polyval_horner:
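+; Horner-form POLYVAL: for each of rcx 16-byte blocks at rdx, sets
+; T = (T ^ block) * H via GFMUL; T is at rdi and H at rsi.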
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_polyval_horner:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+ test rcx,rcx
+ jnz NEAR $L$polyval_horner_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$polyval_horner_start:
+
+
+
+ xor r10,r10
+ shl rcx,4
+
+ vmovdqa xmm1,XMMWORD[rsi]
+ vmovdqa xmm0,XMMWORD[rdi]
+
+$L$polyval_horner_loop:
+ vpxor xmm0,xmm0,XMMWORD[r10*1+rdx]
+ call GFMUL
+
+ add r10,16
+ cmp rcx,r10
+ jne NEAR $L$polyval_horner_loop
+
+
+ vmovdqa XMMWORD[rdi],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aesgcmsiv_polyval_horner:
+global aes128gcmsiv_aes_ks
+
+ALIGN 16
+aes128gcmsiv_aes_ks:
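+; AES-128 key schedule: expands the key at rdi into 11 round keys at
+; rsi, using vpshufb+vaesenclast for the RotWord/SubWord step.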
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_aes_ks:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+ vmovdqa xmm1,XMMWORD[rdi]
+ vmovdqa XMMWORD[rsi],xmm1
+
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+
+ mov rax,8
+
+$L$ks128_loop:
+ add rsi,16
+ sub rax,1
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm3,xmm1,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[rsi],xmm1
+ jne NEAR $L$ks128_loop
+
+ vmovdqa xmm0,XMMWORD[con2]
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm3,xmm1,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[16+rsi],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslldq xmm3,xmm1,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[32+rsi],xmm1
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_aes_ks:
+global aes256gcmsiv_aes_ks
+
+ALIGN 16
+aes256gcmsiv_aes_ks:
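+; AES-256 key schedule: expands the 32-byte key at rdi into 15 round
+; keys at rsi, two per loop iteration.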
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_aes_ks:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+ vmovdqa xmm1,XMMWORD[rdi]
+ vmovdqa xmm3,XMMWORD[16+rdi]
+ vmovdqa XMMWORD[rsi],xmm1
+ vmovdqa XMMWORD[16+rsi],xmm3
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+ vpxor xmm14,xmm14,xmm14
+ mov rax,6
+
+$L$ks256_loop:
+ add rsi,32
+ sub rax,1
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm4,xmm1,32
+ vpxor xmm1,xmm1,xmm4
+ vpshufb xmm4,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[rsi],xmm1
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpsllq xmm4,xmm3,32
+ vpxor xmm3,xmm3,xmm4
+ vpshufb xmm4,xmm3,XMMWORD[con3]
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vmovdqa XMMWORD[16+rsi],xmm3
+ jne NEAR $L$ks256_loop
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpsllq xmm4,xmm1,32
+ vpxor xmm1,xmm1,xmm4
+ vpshufb xmm4,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[32+rsi],xmm1
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+global aes128gcmsiv_aes_ks_enc_x1
+
+ALIGN 16
+aes128gcmsiv_aes_ks_enc_x1:
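+; Expands the AES-128 key at rcx into a full schedule at rdx while
+; encrypting the one block at rdi in the same pass; the ciphertext is
+; written to rsi.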
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+ vmovdqa xmm1,XMMWORD[rcx]
+ vmovdqa xmm4,XMMWORD[rdi]
+
+ vmovdqa XMMWORD[rdx],xmm1
+ vpxor xmm4,xmm4,xmm1
+
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[16+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[32+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[48+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[64+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[80+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[96+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[112+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[128+rdx],xmm1
+
+
+ vmovdqa xmm0,XMMWORD[con2]
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[144+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenclast xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[160+rdx],xmm1
+
+
+ vmovdqa XMMWORD[rsi],xmm4
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1:
+global aes128gcmsiv_kdf
+
+ALIGN 16
+aes128gcmsiv_kdf:
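+; AES-GCM-SIV key derivation: builds four consecutive counter blocks
+; from the nonce at rdi, encrypts them under the schedule at rdx, and
+; stores the four 16-byte results at rsi.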
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_kdf:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+
+
+
+
+ vmovdqa xmm1,XMMWORD[rdx]
+ vmovdqa xmm9,XMMWORD[rdi]
+ vmovdqa xmm12,XMMWORD[and_mask]
+ vmovdqa xmm13,XMMWORD[one]
+ vpshufd xmm9,xmm9,0x90
+ vpand xmm9,xmm9,xmm12
+ vpaddd xmm10,xmm9,xmm13
+ vpaddd xmm11,xmm10,xmm13
+ vpaddd xmm12,xmm11,xmm13
+
+ vpxor xmm9,xmm9,xmm1
+ vpxor xmm10,xmm10,xmm1
+ vpxor xmm11,xmm11,xmm1
+ vpxor xmm12,xmm12,xmm1
+
+ vmovdqa xmm1,XMMWORD[16+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[32+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[48+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[64+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[80+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[96+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[112+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[128+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[144+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[160+rdx]
+ vaesenclast xmm9,xmm9,xmm2
+ vaesenclast xmm10,xmm10,xmm2
+ vaesenclast xmm11,xmm11,xmm2
+ vaesenclast xmm12,xmm12,xmm2
+
+
+ vmovdqa XMMWORD[rsi],xmm9
+ vmovdqa XMMWORD[16+rsi],xmm10
+ vmovdqa XMMWORD[32+rsi],xmm11
+ vmovdqa XMMWORD[48+rsi],xmm12
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_kdf:
+global aes128gcmsiv_enc_msg_x4
+
+ALIGN 16
+aes128gcmsiv_enc_msg_x4:
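+; AES-128 CTR encryption, four blocks per main-loop iteration: rdi = in,
+; rsi = out, rdx = initial counter block, rcx = key schedule, r8 =
+; length in bytes; leftover blocks are handled one at a time.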
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_enc_msg_x4:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+ test r8,r8
+ jnz NEAR $L$128_enc_msg_x4_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$128_enc_msg_x4_start:
+ push r12
+
+ push r13
+
+
+ shr r8,4
+ mov r10,r8
+ shl r10,62
+ shr r10,62
+
+
+ vmovdqa xmm15,XMMWORD[rdx]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+
+ vmovdqu xmm4,XMMWORD[four]
+ vmovdqa xmm0,xmm15
+ vpaddd xmm1,xmm15,XMMWORD[one]
+ vpaddd xmm2,xmm15,XMMWORD[two]
+ vpaddd xmm3,xmm15,XMMWORD[three]
+
+ shr r8,2
+ je NEAR $L$128_enc_msg_x4_check_remainder
+
+ sub rsi,64
+ sub rdi,64
+
+$L$128_enc_msg_x4_loop1:
+ add rsi,64
+ add rdi,64
+
+ vmovdqa xmm5,xmm0
+ vmovdqa xmm6,xmm1
+ vmovdqa xmm7,xmm2
+ vmovdqa xmm8,xmm3
+
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm12,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm0,xmm0,xmm4
+ vmovdqu xmm12,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm1,xmm1,xmm4
+ vmovdqu xmm12,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm2,xmm2,xmm4
+ vmovdqu xmm12,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm3,xmm3,xmm4
+
+ vmovdqu xmm12,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[144+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[160+rcx]
+ vaesenclast xmm5,xmm5,xmm12
+ vaesenclast xmm6,xmm6,xmm12
+ vaesenclast xmm7,xmm7,xmm12
+ vaesenclast xmm8,xmm8,xmm12
+
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+ vpxor xmm6,xmm6,XMMWORD[16+rdi]
+ vpxor xmm7,xmm7,XMMWORD[32+rdi]
+ vpxor xmm8,xmm8,XMMWORD[48+rdi]
+
+ sub r8,1
+
+ vmovdqu XMMWORD[rsi],xmm5
+ vmovdqu XMMWORD[16+rsi],xmm6
+ vmovdqu XMMWORD[32+rsi],xmm7
+ vmovdqu XMMWORD[48+rsi],xmm8
+
+ jne NEAR $L$128_enc_msg_x4_loop1
+
+ add rsi,64
+ add rdi,64
+
+$L$128_enc_msg_x4_check_remainder:
+ cmp r10,0
+ je NEAR $L$128_enc_msg_x4_out
+
+$L$128_enc_msg_x4_loop2:
+
+
+ vmovdqa xmm5,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vaesenc xmm5,xmm5,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[144+rcx]
+ vaesenclast xmm5,xmm5,XMMWORD[160+rcx]
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+ vmovdqu XMMWORD[rsi],xmm5
+
+ add rdi,16
+ add rsi,16
+
+ sub r10,1
+ jne NEAR $L$128_enc_msg_x4_loop2
+
+$L$128_enc_msg_x4_out:
+ pop r13
+
+ pop r12
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_enc_msg_x4:
+global aes128gcmsiv_enc_msg_x8
+
+ALIGN 16
+aes128gcmsiv_enc_msg_x8:
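+; Eight-blocks-per-iteration variant of aes128gcmsiv_enc_msg_x4; the
+; eighth counter lives in an aligned stack slot since the other 15 xmm
+; registers are occupied by counters, working blocks, and the round key.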
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_enc_msg_x8:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+ test r8,r8
+ jnz NEAR $L$128_enc_msg_x8_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$128_enc_msg_x8_start:
+ push r12
+
+ push r13
+
+ push rbp
+
+ mov rbp,rsp
+
+
+
+ sub rsp,128
+ and rsp,-64
+
+ shr r8,4
+ mov r10,r8
+ shl r10,61
+ shr r10,61
+
+
+ vmovdqu xmm1,XMMWORD[rdx]
+ vpor xmm1,xmm1,XMMWORD[OR_MASK]
+
+
+ vpaddd xmm0,xmm1,XMMWORD[seven]
+ vmovdqu XMMWORD[rsp],xmm0
+ vpaddd xmm9,xmm1,XMMWORD[one]
+ vpaddd xmm10,xmm1,XMMWORD[two]
+ vpaddd xmm11,xmm1,XMMWORD[three]
+ vpaddd xmm12,xmm1,XMMWORD[four]
+ vpaddd xmm13,xmm1,XMMWORD[five]
+ vpaddd xmm14,xmm1,XMMWORD[six]
+ vmovdqa xmm0,xmm1
+
+ shr r8,3
+ je NEAR $L$128_enc_msg_x8_check_remainder
+
+ sub rsi,128
+ sub rdi,128
+
+$L$128_enc_msg_x8_loop1:
+ add rsi,128
+ add rdi,128
+
+ vmovdqa xmm1,xmm0
+ vmovdqa xmm2,xmm9
+ vmovdqa xmm3,xmm10
+ vmovdqa xmm4,xmm11
+ vmovdqa xmm5,xmm12
+ vmovdqa xmm6,xmm13
+ vmovdqa xmm7,xmm14
+
+ vmovdqu xmm8,XMMWORD[rsp]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vpxor xmm2,xmm2,XMMWORD[rcx]
+ vpxor xmm3,xmm3,XMMWORD[rcx]
+ vpxor xmm4,xmm4,XMMWORD[rcx]
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm15,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm14,XMMWORD[rsp]
+ vpaddd xmm14,xmm14,XMMWORD[eight]
+ vmovdqu XMMWORD[rsp],xmm14
+ vmovdqu xmm15,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpsubd xmm14,xmm14,XMMWORD[one]
+ vmovdqu xmm15,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm0,xmm0,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm9,xmm9,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm10,xmm10,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm11,xmm11,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm12,xmm12,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm13,xmm13,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[144+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[160+rcx]
+ vaesenclast xmm1,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm15
+ vaesenclast xmm3,xmm3,xmm15
+ vaesenclast xmm4,xmm4,xmm15
+ vaesenclast xmm5,xmm5,xmm15
+ vaesenclast xmm6,xmm6,xmm15
+ vaesenclast xmm7,xmm7,xmm15
+ vaesenclast xmm8,xmm8,xmm15
+
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+ vpxor xmm2,xmm2,XMMWORD[16+rdi]
+ vpxor xmm3,xmm3,XMMWORD[32+rdi]
+ vpxor xmm4,xmm4,XMMWORD[48+rdi]
+ vpxor xmm5,xmm5,XMMWORD[64+rdi]
+ vpxor xmm6,xmm6,XMMWORD[80+rdi]
+ vpxor xmm7,xmm7,XMMWORD[96+rdi]
+ vpxor xmm8,xmm8,XMMWORD[112+rdi]
+
+ dec r8
+
+ vmovdqu XMMWORD[rsi],xmm1
+ vmovdqu XMMWORD[16+rsi],xmm2
+ vmovdqu XMMWORD[32+rsi],xmm3
+ vmovdqu XMMWORD[48+rsi],xmm4
+ vmovdqu XMMWORD[64+rsi],xmm5
+ vmovdqu XMMWORD[80+rsi],xmm6
+ vmovdqu XMMWORD[96+rsi],xmm7
+ vmovdqu XMMWORD[112+rsi],xmm8
+
+ jne NEAR $L$128_enc_msg_x8_loop1
+
+ add rsi,128
+ add rdi,128
+
+$L$128_enc_msg_x8_check_remainder:
+ cmp r10,0
+ je NEAR $L$128_enc_msg_x8_out
+
+$L$128_enc_msg_x8_loop2:
+
+
+ vmovdqa xmm1,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rcx]
+ vaesenclast xmm1,xmm1,XMMWORD[160+rcx]
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+
+ vmovdqu XMMWORD[rsi],xmm1
+
+ add rdi,16
+ add rsi,16
+
+ dec r10
+ jne NEAR $L$128_enc_msg_x8_loop2
+
+$L$128_enc_msg_x8_out:
+ mov rsp,rbp
+
+ pop rbp
+
+ pop r13
+
+ pop r12
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_enc_msg_x8:
+global aes128gcmsiv_dec
+
+ALIGN 16
+aes128gcmsiv_dec:
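+; AES-128-GCM-SIV decryption: CTR-decrypts r9 bytes from rdi to rsi six
+; blocks at a time, interleaving AES rounds with the POLYVAL update over
+; the decrypted blocks (state at rdx, 6-entry H table at rcx, schedule
+; at r8); the counter is seeded from the tag block at in+len.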
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_dec:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+ test r9,~15
+ jnz NEAR $L$128_dec_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$128_dec_start:
+ vzeroupper
+ vmovdqa xmm0,XMMWORD[rdx]
+ mov rax,rdx
+
+ lea rax,[32+rax]
+ lea rcx,[32+rcx]
+
+
+ vmovdqu xmm15,XMMWORD[r9*1+rdi]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+ and r9,~15
+
+
+ cmp r9,96
+ jb NEAR $L$128_dec_loop2
+
+
+ sub r9,96
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vpxor xmm7,xmm7,XMMWORD[r8]
+ vpxor xmm8,xmm8,XMMWORD[r8]
+ vpxor xmm9,xmm9,XMMWORD[r8]
+ vpxor xmm10,xmm10,XMMWORD[r8]
+ vpxor xmm11,xmm11,XMMWORD[r8]
+ vpxor xmm12,xmm12,XMMWORD[r8]
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[160+r8]
+ vaesenclast xmm7,xmm7,xmm4
+ vaesenclast xmm8,xmm8,xmm4
+ vaesenclast xmm9,xmm9,xmm4
+ vaesenclast xmm10,xmm10,xmm4
+ vaesenclast xmm11,xmm11,xmm4
+ vaesenclast xmm12,xmm12,xmm4
+
+
+ vpxor xmm7,xmm7,XMMWORD[rdi]
+ vpxor xmm8,xmm8,XMMWORD[16+rdi]
+ vpxor xmm9,xmm9,XMMWORD[32+rdi]
+ vpxor xmm10,xmm10,XMMWORD[48+rdi]
+ vpxor xmm11,xmm11,XMMWORD[64+rdi]
+ vpxor xmm12,xmm12,XMMWORD[80+rdi]
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ add rdi,96
+ add rsi,96
+ jmp NEAR $L$128_dec_loop1
+
+
+ALIGN 64
+$L$128_dec_loop1:
+ cmp r9,96
+ jb NEAR $L$128_dec_finish_96
+ sub r9,96
+
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vmovdqa xmm4,XMMWORD[r8]
+ vpxor xmm7,xmm7,xmm4
+ vpxor xmm8,xmm8,xmm4
+ vpxor xmm9,xmm9,xmm4
+ vpxor xmm10,xmm10,xmm4
+ vpxor xmm11,xmm11,xmm4
+ vpxor xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm1,xmm6,xmm4,0x01
+ vpclmulqdq xmm4,xmm6,xmm4,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
+ vmovdqa xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[160+r8]
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm4,xmm6,XMMWORD[rdi]
+ vaesenclast xmm7,xmm7,xmm4
+ vpxor xmm4,xmm6,XMMWORD[16+rdi]
+ vaesenclast xmm8,xmm8,xmm4
+ vpxor xmm4,xmm6,XMMWORD[32+rdi]
+ vaesenclast xmm9,xmm9,xmm4
+ vpxor xmm4,xmm6,XMMWORD[48+rdi]
+ vaesenclast xmm10,xmm10,xmm4
+ vpxor xmm4,xmm6,XMMWORD[64+rdi]
+ vaesenclast xmm11,xmm11,xmm4
+ vpxor xmm4,xmm6,XMMWORD[80+rdi]
+ vaesenclast xmm12,xmm12,xmm4
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ vpxor xmm0,xmm0,xmm5
+
+ lea rdi,[96+rdi]
+ lea rsi,[96+rsi]
+ jmp NEAR $L$128_dec_loop1
+
+$L$128_dec_finish_96:
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm1,xmm6,xmm4,0x10
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm4,xmm6,xmm4,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm0,xmm0,xmm5
+
+$L$128_dec_loop2:
+
+
+
+ cmp r9,16
+ jb NEAR $L$128_dec_out
+ sub r9,16
+
+ vmovdqa xmm2,xmm15
+ vpaddd xmm15,xmm15,XMMWORD[one]
+
+ vpxor xmm2,xmm2,XMMWORD[r8]
+ vaesenc xmm2,xmm2,XMMWORD[16+r8]
+ vaesenc xmm2,xmm2,XMMWORD[32+r8]
+ vaesenc xmm2,xmm2,XMMWORD[48+r8]
+ vaesenc xmm2,xmm2,XMMWORD[64+r8]
+ vaesenc xmm2,xmm2,XMMWORD[80+r8]
+ vaesenc xmm2,xmm2,XMMWORD[96+r8]
+ vaesenc xmm2,xmm2,XMMWORD[112+r8]
+ vaesenc xmm2,xmm2,XMMWORD[128+r8]
+ vaesenc xmm2,xmm2,XMMWORD[144+r8]
+ vaesenclast xmm2,xmm2,XMMWORD[160+r8]
+ vpxor xmm2,xmm2,XMMWORD[rdi]
+ vmovdqu XMMWORD[rsi],xmm2
+ add rdi,16
+ add rsi,16
+
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm1,XMMWORD[((-32))+rcx]
+ call GFMUL
+
+ jmp NEAR $L$128_dec_loop2
+
+$L$128_dec_out:
+ vmovdqu XMMWORD[rdx],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_dec:
+global aes128gcmsiv_ecb_enc_block
+
+ALIGN 16
+aes128gcmsiv_ecb_enc_block:
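+; Encrypts the single block at rdi with the AES-128 schedule at rdx and
+; writes the result to rsi.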
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_ecb_enc_block:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+ vmovdqa xmm1,XMMWORD[rdi]
+
+ vpxor xmm1,xmm1,XMMWORD[rdx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rdx]
+ vaesenclast xmm1,xmm1,XMMWORD[160+rdx]
+
+ vmovdqa XMMWORD[rsi],xmm1
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes128gcmsiv_ecb_enc_block:
+global aes256gcmsiv_aes_ks_enc_x1
+
+ALIGN 16
+aes256gcmsiv_aes_ks_enc_x1:
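+; AES-256 analogue of aes128gcmsiv_aes_ks_enc_x1: expands the 32-byte
+; key at rcx into 15 round keys at rdx while encrypting the block at
+; rdi into rsi.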
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+ vmovdqa xmm8,XMMWORD[rdi]
+ vmovdqa xmm1,XMMWORD[rcx]
+ vmovdqa xmm3,XMMWORD[16+rcx]
+ vpxor xmm8,xmm8,xmm1
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[rdx],xmm1
+ vmovdqu XMMWORD[16+rdx],xmm3
+ vpxor xmm14,xmm14,xmm14
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[32+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[48+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[64+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[80+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[96+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[112+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[128+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[144+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[160+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[176+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[192+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[208+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenclast xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[224+rdx],xmm1
+
+ vmovdqa XMMWORD[rsi],xmm8
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1:
+global aes256gcmsiv_ecb_enc_block
+
+ALIGN 16
+aes256gcmsiv_ecb_enc_block:
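+; Single-block AES-256 encryption (14 rounds): input at rdi, schedule
+; at rdx, output to rsi.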
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_ecb_enc_block:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+ vmovdqa xmm1,XMMWORD[rdi]
+ vpxor xmm1,xmm1,XMMWORD[rdx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[160+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[176+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[192+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[208+rdx]
+ vaesenclast xmm1,xmm1,XMMWORD[224+rdx]
+ vmovdqa XMMWORD[rsi],xmm1
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes256gcmsiv_ecb_enc_block:
+global aes256gcmsiv_enc_msg_x4
+
+ALIGN 16
+aes256gcmsiv_enc_msg_x4:
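+; AES-256 CTR encryption, four blocks per iteration; note the block
+; count is rounded up, so a trailing partial block is encrypted as a
+; full 16-byte block.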
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_enc_msg_x4:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+ test r8,r8
+ jnz NEAR $L$256_enc_msg_x4_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$256_enc_msg_x4_start:
+ mov r10,r8
+ shr r8,4
+ shl r10,60
+ jz NEAR $L$256_enc_msg_x4_start2
+ add r8,1
+
+$L$256_enc_msg_x4_start2:
+ mov r10,r8
+ shl r10,62
+ shr r10,62
+
+
+ vmovdqa xmm15,XMMWORD[rdx]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+
+ vmovdqa xmm4,XMMWORD[four]
+ vmovdqa xmm0,xmm15
+ vpaddd xmm1,xmm15,XMMWORD[one]
+ vpaddd xmm2,xmm15,XMMWORD[two]
+ vpaddd xmm3,xmm15,XMMWORD[three]
+
+ shr r8,2
+ je NEAR $L$256_enc_msg_x4_check_remainder
+
+ sub rsi,64
+ sub rdi,64
+
+$L$256_enc_msg_x4_loop1:
+ add rsi,64
+ add rdi,64
+
+ vmovdqa xmm5,xmm0
+ vmovdqa xmm6,xmm1
+ vmovdqa xmm7,xmm2
+ vmovdqa xmm8,xmm3
+
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm12,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm0,xmm0,xmm4
+ vmovdqu xmm12,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm1,xmm1,xmm4
+ vmovdqu xmm12,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm2,xmm2,xmm4
+ vmovdqu xmm12,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm3,xmm3,xmm4
+
+ vmovdqu xmm12,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[144+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[160+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[176+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[192+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[208+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[224+rcx]
+ vaesenclast xmm5,xmm5,xmm12
+ vaesenclast xmm6,xmm6,xmm12
+ vaesenclast xmm7,xmm7,xmm12
+ vaesenclast xmm8,xmm8,xmm12
+
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+ vpxor xmm6,xmm6,XMMWORD[16+rdi]
+ vpxor xmm7,xmm7,XMMWORD[32+rdi]
+ vpxor xmm8,xmm8,XMMWORD[48+rdi]
+
+ sub r8,1
+
+ vmovdqu XMMWORD[rsi],xmm5
+ vmovdqu XMMWORD[16+rsi],xmm6
+ vmovdqu XMMWORD[32+rsi],xmm7
+ vmovdqu XMMWORD[48+rsi],xmm8
+
+ jne NEAR $L$256_enc_msg_x4_loop1
+
+ add rsi,64
+ add rdi,64
+
+$L$256_enc_msg_x4_check_remainder:
+ cmp r10,0
+ je NEAR $L$256_enc_msg_x4_out
+
+$L$256_enc_msg_x4_loop2:
+
+
+
+ vmovdqa xmm5,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vaesenc xmm5,xmm5,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[144+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[160+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[176+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[192+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[208+rcx]
+ vaesenclast xmm5,xmm5,XMMWORD[224+rcx]
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+
+ vmovdqu XMMWORD[rsi],xmm5
+
+ add rdi,16
+ add rsi,16
+
+ sub r10,1
+ jne NEAR $L$256_enc_msg_x4_loop2
+
+$L$256_enc_msg_x4_out:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes256gcmsiv_enc_msg_x4:
+global aes256gcmsiv_enc_msg_x8
+
+ALIGN 16
+aes256gcmsiv_enc_msg_x8:
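+; Eight-blocks-per-iteration AES-256 CTR encryption; as in the x4
+; version the block count is rounded up, and the eighth counter block
+; is kept in a 64-byte-aligned stack slot addressed through r11.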
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_enc_msg_x8:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+ test r8,r8
+ jnz NEAR $L$256_enc_msg_x8_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$256_enc_msg_x8_start:
+
+ mov r11,rsp
+ sub r11,16
+ and r11,-64
+
+ mov r10,r8
+ shr r8,4
+ shl r10,60
+ jz NEAR $L$256_enc_msg_x8_start2
+ add r8,1
+
+$L$256_enc_msg_x8_start2:
+ mov r10,r8
+ shl r10,61
+ shr r10,61
+
+
+ vmovdqa xmm1,XMMWORD[rdx]
+ vpor xmm1,xmm1,XMMWORD[OR_MASK]
+
+
+ vpaddd xmm0,xmm1,XMMWORD[seven]
+ vmovdqa XMMWORD[r11],xmm0
+ vpaddd xmm9,xmm1,XMMWORD[one]
+ vpaddd xmm10,xmm1,XMMWORD[two]
+ vpaddd xmm11,xmm1,XMMWORD[three]
+ vpaddd xmm12,xmm1,XMMWORD[four]
+ vpaddd xmm13,xmm1,XMMWORD[five]
+ vpaddd xmm14,xmm1,XMMWORD[six]
+ vmovdqa xmm0,xmm1
+
+ shr r8,3
+ jz NEAR $L$256_enc_msg_x8_check_remainder
+
+ sub rsi,128
+ sub rdi,128
+
+$L$256_enc_msg_x8_loop1:
+ add rsi,128
+ add rdi,128
+
+ vmovdqa xmm1,xmm0
+ vmovdqa xmm2,xmm9
+ vmovdqa xmm3,xmm10
+ vmovdqa xmm4,xmm11
+ vmovdqa xmm5,xmm12
+ vmovdqa xmm6,xmm13
+ vmovdqa xmm7,xmm14
+
+ vmovdqa xmm8,XMMWORD[r11]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vpxor xmm2,xmm2,XMMWORD[rcx]
+ vpxor xmm3,xmm3,XMMWORD[rcx]
+ vpxor xmm4,xmm4,XMMWORD[rcx]
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm15,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqa xmm14,XMMWORD[r11]
+ vpaddd xmm14,xmm14,XMMWORD[eight]
+ vmovdqa XMMWORD[r11],xmm14
+ vmovdqu xmm15,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpsubd xmm14,xmm14,XMMWORD[one]
+ vmovdqu xmm15,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm0,xmm0,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm9,xmm9,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm10,xmm10,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm11,xmm11,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm12,xmm12,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm13,xmm13,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[144+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[160+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[176+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[192+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[208+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[224+rcx]
+ vaesenclast xmm1,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm15
+ vaesenclast xmm3,xmm3,xmm15
+ vaesenclast xmm4,xmm4,xmm15
+ vaesenclast xmm5,xmm5,xmm15
+ vaesenclast xmm6,xmm6,xmm15
+ vaesenclast xmm7,xmm7,xmm15
+ vaesenclast xmm8,xmm8,xmm15
+
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+ vpxor xmm2,xmm2,XMMWORD[16+rdi]
+ vpxor xmm3,xmm3,XMMWORD[32+rdi]
+ vpxor xmm4,xmm4,XMMWORD[48+rdi]
+ vpxor xmm5,xmm5,XMMWORD[64+rdi]
+ vpxor xmm6,xmm6,XMMWORD[80+rdi]
+ vpxor xmm7,xmm7,XMMWORD[96+rdi]
+ vpxor xmm8,xmm8,XMMWORD[112+rdi]
+
+ sub r8,1
+
+ vmovdqu XMMWORD[rsi],xmm1
+ vmovdqu XMMWORD[16+rsi],xmm2
+ vmovdqu XMMWORD[32+rsi],xmm3
+ vmovdqu XMMWORD[48+rsi],xmm4
+ vmovdqu XMMWORD[64+rsi],xmm5
+ vmovdqu XMMWORD[80+rsi],xmm6
+ vmovdqu XMMWORD[96+rsi],xmm7
+ vmovdqu XMMWORD[112+rsi],xmm8
+
+ jne NEAR $L$256_enc_msg_x8_loop1
+
+ add rsi,128
+ add rdi,128
+
+$L$256_enc_msg_x8_check_remainder:
+ cmp r10,0
+ je NEAR $L$256_enc_msg_x8_out
+
+$L$256_enc_msg_x8_loop2:
+
+
+ vmovdqa xmm1,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[160+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[176+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[192+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[208+rcx]
+ vaesenclast xmm1,xmm1,XMMWORD[224+rcx]
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+
+ vmovdqu XMMWORD[rsi],xmm1
+
+ add rdi,16
+ add rsi,16
+ sub r10,1
+ jnz NEAR $L$256_enc_msg_x8_loop2
+
+$L$256_enc_msg_x8_out:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+
+$L$SEH_end_aes256gcmsiv_enc_msg_x8:
+global aes256gcmsiv_dec
+
+ALIGN 16
+aes256gcmsiv_dec:
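+; AES-256 analogue of aes128gcmsiv_dec: 14 AES rounds per block, six
+; blocks per main-loop pass, with the same interleaved POLYVAL update.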
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_dec:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+ test r9,~15
+ jnz NEAR $L$256_dec_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$256_dec_start:
+ vzeroupper
+ vmovdqa xmm0,XMMWORD[rdx]
+ mov rax,rdx
+
+ lea rax,[32+rax]
+ lea rcx,[32+rcx]
+
+
+ vmovdqu xmm15,XMMWORD[r9*1+rdi]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+ and r9,~15
+
+
+ cmp r9,96
+ jb NEAR $L$256_dec_loop2
+
+
+ sub r9,96
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vpxor xmm7,xmm7,XMMWORD[r8]
+ vpxor xmm8,xmm8,XMMWORD[r8]
+ vpxor xmm9,xmm9,XMMWORD[r8]
+ vpxor xmm10,xmm10,XMMWORD[r8]
+ vpxor xmm11,xmm11,XMMWORD[r8]
+ vpxor xmm12,xmm12,XMMWORD[r8]
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[160+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[176+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[192+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[208+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[224+r8]
+ vaesenclast xmm7,xmm7,xmm4
+ vaesenclast xmm8,xmm8,xmm4
+ vaesenclast xmm9,xmm9,xmm4
+ vaesenclast xmm10,xmm10,xmm4
+ vaesenclast xmm11,xmm11,xmm4
+ vaesenclast xmm12,xmm12,xmm4
+
+
+ vpxor xmm7,xmm7,XMMWORD[rdi]
+ vpxor xmm8,xmm8,XMMWORD[16+rdi]
+ vpxor xmm9,xmm9,XMMWORD[32+rdi]
+ vpxor xmm10,xmm10,XMMWORD[48+rdi]
+ vpxor xmm11,xmm11,XMMWORD[64+rdi]
+ vpxor xmm12,xmm12,XMMWORD[80+rdi]
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ add rdi,96
+ add rsi,96
+ jmp NEAR $L$256_dec_loop1
+
+
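+; main loop: per iteration, CTR-decrypt the next six blocks while
+; accumulating POLYVAL over the six previously decrypted plaintext
+; blocks (saved at [rax]), hiding pclmulqdq latency behind the AES rounds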
+ALIGN 64
+$L$256_dec_loop1:
+ cmp r9,96
+ jb NEAR $L$256_dec_finish_96
+ sub r9,96
+
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vmovdqa xmm4,XMMWORD[r8]
+ vpxor xmm7,xmm7,xmm4
+ vpxor xmm8,xmm8,xmm4
+ vpxor xmm9,xmm9,xmm4
+ vpxor xmm10,xmm10,xmm4
+ vpxor xmm11,xmm11,xmm4
+ vpxor xmm12,xmm12,xmm4
+
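+; schoolbook pclmul accumulation, one precomputed power of H per block,
+; collecting high (xmm2), low (xmm3) and middle (xmm1) partial products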
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm1,xmm6,xmm4,0x01
+ vpclmulqdq xmm4,xmm6,xmm4,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
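+; the block paired with the highest stored power of H also absorbs the
+; running accumulator xmm0 before its multiply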
+ vmovdqa xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
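+; fold the middle product into the high/low halves, then reduce modulo
+; the POLYVAL polynomial via two pclmul folds against [poly]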
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[160+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[176+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[192+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[208+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
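+; XOR the last round key with each ciphertext block so AESENCLAST
+; yields keystream XOR ciphertext, i.e. the plaintext, directly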
+ vmovdqu xmm6,XMMWORD[224+r8]
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm4,xmm6,XMMWORD[rdi]
+ vaesenclast xmm7,xmm7,xmm4
+ vpxor xmm4,xmm6,XMMWORD[16+rdi]
+ vaesenclast xmm8,xmm8,xmm4
+ vpxor xmm4,xmm6,XMMWORD[32+rdi]
+ vaesenclast xmm9,xmm9,xmm4
+ vpxor xmm4,xmm6,XMMWORD[48+rdi]
+ vaesenclast xmm10,xmm10,xmm4
+ vpxor xmm4,xmm6,XMMWORD[64+rdi]
+ vaesenclast xmm11,xmm11,xmm4
+ vpxor xmm4,xmm6,XMMWORD[80+rdi]
+ vaesenclast xmm12,xmm12,xmm4
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ vpxor xmm0,xmm0,xmm5
+
+ lea rdi,[96+rdi]
+ lea rsi,[96+rsi]
+ jmp NEAR $L$256_dec_loop1
+
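+; fewer than 96 bytes left: no more keystream needed, so finish the
+; deferred POLYVAL over the final six plaintext blocks and reduce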
+$L$256_dec_finish_96:
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm1,xmm6,xmm4,0x10
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm4,xmm6,xmm4,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm0,xmm0,xmm5
+
+$L$256_dec_loop2:
+
+
+
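+; tail loop: CTR-decrypt one block at a time, then fold the decrypted
+; block into POLYVAL via GFMUL with H (loaded from [rcx-32])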
+ cmp r9,16
+ jb NEAR $L$256_dec_out
+ sub r9,16
+
+ vmovdqa xmm2,xmm15
+ vpaddd xmm15,xmm15,XMMWORD[one]
+
+ vpxor xmm2,xmm2,XMMWORD[r8]
+ vaesenc xmm2,xmm2,XMMWORD[16+r8]
+ vaesenc xmm2,xmm2,XMMWORD[32+r8]
+ vaesenc xmm2,xmm2,XMMWORD[48+r8]
+ vaesenc xmm2,xmm2,XMMWORD[64+r8]
+ vaesenc xmm2,xmm2,XMMWORD[80+r8]
+ vaesenc xmm2,xmm2,XMMWORD[96+r8]
+ vaesenc xmm2,xmm2,XMMWORD[112+r8]
+ vaesenc xmm2,xmm2,XMMWORD[128+r8]
+ vaesenc xmm2,xmm2,XMMWORD[144+r8]
+ vaesenc xmm2,xmm2,XMMWORD[160+r8]
+ vaesenc xmm2,xmm2,XMMWORD[176+r8]
+ vaesenc xmm2,xmm2,XMMWORD[192+r8]
+ vaesenc xmm2,xmm2,XMMWORD[208+r8]
+ vaesenclast xmm2,xmm2,XMMWORD[224+r8]
+ vpxor xmm2,xmm2,XMMWORD[rdi]
+ vmovdqu XMMWORD[rsi],xmm2
+ add rdi,16
+ add rsi,16
+
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm1,XMMWORD[((-32))+rcx]
+ call GFMUL
+
+ jmp NEAR $L$256_dec_loop2
+
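+; hand the updated POLYVAL accumulator back for the caller's tag check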
+$L$256_dec_out:
+ vmovdqu XMMWORD[rdx],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes256gcmsiv_dec:
+global aes256gcmsiv_kdf
+
+ALIGN 16
+aes256gcmsiv_kdf:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_kdf:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+
+
+
+
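+; AES-GCM-SIV key derivation (RFC 8452): build six consecutive counter
+; blocks from the masked nonce and encrypt each under the master
+; AES-256 key; the caller extracts the record keys from these outputs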
+ vmovdqa xmm1,XMMWORD[rdx]
+ vmovdqa xmm4,XMMWORD[rdi]
+ vmovdqa xmm11,XMMWORD[and_mask]
+ vmovdqa xmm8,XMMWORD[one]
+ vpshufd xmm4,xmm4,0x90
+ vpand xmm4,xmm4,xmm11
+ vpaddd xmm6,xmm4,xmm8
+ vpaddd xmm7,xmm6,xmm8
+ vpaddd xmm11,xmm7,xmm8
+ vpaddd xmm12,xmm11,xmm8
+ vpaddd xmm13,xmm12,xmm8
+
+ vpxor xmm4,xmm4,xmm1
+ vpxor xmm6,xmm6,xmm1
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm11,xmm11,xmm1
+ vpxor xmm12,xmm12,xmm1
+ vpxor xmm13,xmm13,xmm1
+
+ vmovdqa xmm1,XMMWORD[16+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[32+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[48+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[64+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[80+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[96+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[112+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[128+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[144+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[160+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[176+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[192+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[208+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[224+rdx]
+ vaesenclast xmm4,xmm4,xmm2
+ vaesenclast xmm6,xmm6,xmm2
+ vaesenclast xmm7,xmm7,xmm2
+ vaesenclast xmm11,xmm11,xmm2
+ vaesenclast xmm12,xmm12,xmm2
+ vaesenclast xmm13,xmm13,xmm2
+
+
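+; write the six 16-byte KDF outputs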
+ vmovdqa XMMWORD[rsi],xmm4
+ vmovdqa XMMWORD[16+rsi],xmm6
+ vmovdqa XMMWORD[32+rsi],xmm7
+ vmovdqa XMMWORD[48+rsi],xmm11
+ vmovdqa XMMWORD[64+rsi],xmm12
+ vmovdqa XMMWORD[80+rsi],xmm13
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_aes256gcmsiv_kdf:
