| Index: nss/lib/freebl/intel-gcm-x64-masm.asm
 | 
| diff --git a/nss/lib/freebl/intel-gcm-x64-masm.asm b/nss/lib/freebl/intel-gcm-x64-masm.asm
 | 
| new file mode 100644
 | 
| index 0000000000000000000000000000000000000000..408879d38278661a2ca3555cbfa91e046562f12f
 | 
| --- /dev/null
 | 
| +++ b/nss/lib/freebl/intel-gcm-x64-masm.asm
 | 
| @@ -0,0 +1,1301 @@
 | 
| +; LICENSE:
 | 
| +; This submission to NSS is to be made available under the terms of the
 | 
| +; Mozilla Public License, v. 2.0. You can obtain one at http://mozilla.org/MPL/2.0/.
 | 
| +;###############################################################################
 | 
| +; Copyright(c) 2014, Intel Corp.
 | 
| +; Developers and authors:
 | 
| +; Shay Gueron and Vlad Krasnov
 | 
| +; Intel Corporation, Israel Development Centre, Haifa, Israel
 | 
| +; Please send feedback directly to crypto.feedback.alias@intel.com
 | 
| +
 | 
| +
 | 
| +.DATA
 | 
| +ALIGN 16
 | 
| +Lone            dq 1,0
 | 
| +Ltwo            dq 2,0
 | 
| +Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 | 
| +Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
 | 
| +Lpoly           dq 01h, 0c200000000000000h
 | 
| +
 | 
| +.CODE
 | 
| +
 | 
| +
 | 
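| +; GFMUL DST, SRC1, SRC2 multiplies SRC1 by SRC2 in GF(2^128) and reduces the
| +; result modulo the GCM polynomial (the 0c200...h constant in Lpoly).
| +; In outline, as read from the code below:
| +;   - three VPCLMULQDQs build the 256-bit carry-less product via Karatsuba:
| +;       lo  = SRC1.lo * SRC2.lo
| +;       hi  = SRC1.hi * SRC2.hi
| +;       mid = (SRC1.hi ^ SRC1.lo) * (SRC2.hi ^ SRC2.lo) ^ lo ^ hi
| +;   - the two folding steps against [Lpoly] (with the qword swap done by
| +;     vpshufd ..., 78) reduce the product back to 128 bits in DST.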
| +GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
 | 
| +    vpclmulqdq  TMP1, SRC2, SRC1, 0h
 | 
| +    vpclmulqdq  TMP4, SRC2, SRC1, 011h
 | 
| +
 | 
| +    vpshufd     TMP2, SRC2, 78
 | 
| +    vpshufd     TMP3, SRC1, 78
 | 
| +    vpxor       TMP2, TMP2, SRC2
 | 
| +    vpxor       TMP3, TMP3, SRC1
 | 
| +
 | 
| +    vpclmulqdq  TMP2, TMP2, TMP3, 0h
 | 
| +    vpxor       TMP2, TMP2, TMP1
 | 
| +    vpxor       TMP2, TMP2, TMP4
 | 
| +
 | 
| +    vpslldq     TMP3, TMP2, 8
 | 
| +    vpsrldq     TMP2, TMP2, 8
 | 
| +
 | 
| +    vpxor       TMP1, TMP1, TMP3
 | 
| +    vpxor       TMP4, TMP4, TMP2
 | 
| +
 | 
| +    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
 | 
| +    vpshufd     TMP3, TMP1, 78
 | 
| +    vpxor       TMP1, TMP2, TMP3
 | 
| +
 | 
| +    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
 | 
| +    vpshufd     TMP3, TMP1, 78
 | 
| +    vpxor       TMP1, TMP2, TMP3
 | 
| +
 | 
| +    vpxor       DST, TMP1, TMP4
 | 
| +
 | 
| +    ENDM
 | 
| +
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
| +;
 | 
| +; Generates the final GCM tag
 | 
| +; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
 | 
| +;                       unsigned char *Tp,
 | 
| +;                       unsigned int Mlen,
 | 
| +;                       unsigned int Alen,
 | 
| +;                       unsigned char *X0,
 | 
| +;                       unsigned char *TAG);
 | 
| +;
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
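| +; Roughly: X0 and TAG are the 5th and 6th arguments, so per the Win64 ABI
| +; they are fetched from the caller's stack ([rsp + 40] and [rsp + 48] on
| +; entry); the first four arguments arrive in rcx/rdx/r8/r9. The routine
| +; folds the bit lengths (Mlen*8, Alen*8) into the running GHASH value at
| +; *Tp, multiplies once more by the first Htbl entry, byte-swaps, XORs in
| +; the block at *X0, and stores the result to *TAG.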
| +
 | 
| +ALIGN 16
 | 
| +intel_aes_gcmTAG PROC
 | 
| +
 | 
| +Htbl    textequ <rcx>
 | 
| +Tp      textequ <rdx>
 | 
| +Mlen    textequ <r8>
 | 
| +Alen    textequ <r9>
 | 
| +X0      textequ <r10>
 | 
| +TAG     textequ <r11>
 | 
| +
 | 
| +T       textequ <xmm0>
 | 
| +TMP0    textequ <xmm1>
 | 
| +
 | 
| +    mov     X0, [rsp + 1*8 + 4*8]
 | 
| +    mov     TAG, [rsp + 1*8 + 5*8]
 | 
| +
 | 
| +    vzeroupper
 | 
| +    vmovdqu T, XMMWORD PTR[Tp]
 | 
| +    vpxor   TMP0, TMP0, TMP0
 | 
| +
 | 
| +    shl     Mlen, 3
 | 
| +    shl     Alen, 3
 | 
| +
 | 
| +    ;vpinsrq    TMP0, TMP0, Mlen, 0
 | 
| +    ;vpinsrq    TMP0, TMP0, Alen, 1
 | 
| +    ; workaround the ml64.exe vpinsrq issue
 | 
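| +    ; each 64-bit length is therefore assembled from two 32-bit inserts:
| +    ; the low dword first, then (after shifting right by 32) the high dword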
| +    vpinsrd TMP0, TMP0, r8d, 0
 | 
| +    vpinsrd TMP0, TMP0, r9d, 2
 | 
| +    shr Mlen, 32
 | 
| +    shr Alen, 32
 | 
| +    vpinsrd TMP0, TMP0, r8d, 1
 | 
| +    vpinsrd TMP0, TMP0, r9d, 3
 | 
| +
 | 
| +    vpxor   T, T, TMP0
 | 
| +    vmovdqu TMP0, XMMWORD PTR[Htbl]
 | 
| +    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
 | 
| +
 | 
| +    vpshufb T, T, [Lbswap_mask]
 | 
| +    vpxor   T, T, [X0]
 | 
| +    vmovdqu XMMWORD PTR[TAG], T
 | 
| +    vzeroupper
 | 
| +
 | 
| +    ret
 | 
| +
 | 
| +intel_aes_gcmTAG ENDP
 | 
| +
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
| +;
 | 
| +; Generates the H table
 | 
| +; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
 | 
| +;
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
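| +; In outline (see the stores below):
| +;   H = AES-ENC_key(0^128) is computed first - T starts as round key 0,
| +;   i.e. the all-zero block already whitened - then byte-swapped and doubled
| +;   once in GF(2^128) (the "H`" step), which pairs with the Lpoly-based
| +;   reduction used by GFMUL.
| +;   Htbl + i*16        = H`^(i+1)                      for i = 0..7
| +;   Htbl + 8*16 + i*16 = (high qword ^ low qword) of H`^(i+1), the operand
| +;                        needed by the Karatsuba middle multiply.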
| +
 | 
| +ALIGN 16
 | 
| +intel_aes_gcmINIT PROC
 | 
| +
 | 
| +Htbl    textequ <rcx>
 | 
| +KS      textequ <rdx>
 | 
| +NR      textequ <r8d>
 | 
| +
 | 
| +T       textequ <xmm0>
 | 
| +TMP0    textequ <xmm1>
 | 
| +
 | 
| +    vzeroupper
 | 
| +    ; AES-ENC(0)
 | 
| +    vmovdqu T, XMMWORD PTR[KS]
 | 
| +    lea KS, [16 + KS]
 | 
| +    dec NR
 | 
| +Lenc_loop:
 | 
| +        vaesenc T, T, [KS]
 | 
| +        lea KS, [16 + KS]
 | 
| +        dec NR
 | 
| +        jnz Lenc_loop
 | 
| +
 | 
| +    vaesenclast T, T, [KS]
 | 
| +    vpshufb T, T, [Lbswap_mask]
 | 
| +
 | 
| +    ;Calculate H` = GFMUL(H, 2)
 | 
| +    vpsrad  xmm3, T, 31
 | 
| +    vpshufd xmm3, xmm3, 0ffh
 | 
| +    vpand   xmm5, xmm3, [Lpoly]
 | 
| +    vpsrld  xmm3, T, 31
 | 
| +    vpslld  xmm4, T, 1
 | 
| +    vpslldq xmm3, xmm3, 4
 | 
| +    vpxor   T, xmm4, xmm3
 | 
| +    vpxor   T, T, xmm5
 | 
| +
 | 
| +    vmovdqu TMP0, T
 | 
| +    vmovdqu XMMWORD PTR[Htbl + 0*16], T
 | 
| +
 | 
| +    vpshufd xmm2, T, 78
 | 
| +    vpxor   xmm2, xmm2, T
 | 
| +    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
 | 
| +
 | 
| +    i = 1
 | 
| +    WHILE i LT 8
 | 
| +        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
 | 
| +        vmovdqu XMMWORD PTR[Htbl + i*16], T
 | 
| +        vpshufd xmm2, T, 78
 | 
| +        vpxor   xmm2, xmm2, T
 | 
| +        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
 | 
| +        i = i+1
 | 
| +        ENDM
 | 
| +    vzeroupper
 | 
| +    ret
 | 
| +intel_aes_gcmINIT ENDP
 | 
| +
 | 
| +
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
| +;
 | 
| +; Authenticate only
 | 
| +; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
 | 
| +;
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
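| +; Roughly: the main loop consumes 8 AAD blocks per iteration, reading them
| +; last-to-first so that the newest block is multiplied by Htbl[0] = H and
| +; the oldest (XORed with the running T) by Htbl[7] = H^8; the Karatsuba
| +; accumulators are folded and reduced only once per group, and the
| +; reduction of the previous T is interleaved with the multiplies (the
| +; "reduction stage" comments). If the block count is not a multiple of 8,
| +; the leading n%8 blocks are hashed first as "prefix" blocks.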
| +
 | 
| +ALIGN 16
 | 
| +intel_aes_gcmAAD PROC
 | 
| +
 | 
| +Htbl    textequ <rcx>
 | 
| +inp     textequ <rdx>
 | 
| +len     textequ <r8>
 | 
| +Tp      textequ <r9>
 | 
| +hlp0    textequ <r10>
 | 
| +
 | 
| +DATA    textequ <xmm0>
 | 
| +T       textequ <xmm1>
 | 
| +TMP0    textequ <xmm2>
 | 
| +TMP1    textequ <xmm3>
 | 
| +TMP2    textequ <xmm4>
 | 
| +TMP3    textequ <xmm5>
 | 
| +TMP4    textequ <xmm6>
 | 
| +Xhi     textequ <xmm7>
 | 
| +
 | 
| +KARATSUBA_AAD MACRO i
 | 
| +    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
 | 
| +    vpxor       TMP0, TMP0, TMP3
 | 
| +    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
 | 
| +    vpxor       TMP1, TMP1, TMP3
 | 
| +    vpshufd     TMP3, DATA, 78
 | 
| +    vpxor       TMP3, TMP3, DATA
 | 
| +    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
 | 
| +    vpxor       TMP2, TMP2, TMP3
 | 
| +ENDM
 | 
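| +; KARATSUBA_AAD folds one block against Htbl[i]: TMP0 accumulates the
| +; low*low products, TMP1 the high*high products and TMP2 the
| +; (hi^lo)*(hi^lo) middle products; the combine/split into T and Xhi
| +; happens at Lred1 and at the end of each 8-block group.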
| +
 | 
| +    test  len, len
 | 
| +    jnz   LbeginAAD
 | 
| +    ret
 | 
| +
 | 
| +LbeginAAD:
 | 
| +    vzeroupper
 | 
| +
 | 
| +    sub rsp, 2*16
 | 
| +    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
 | 
| +    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
 | 
| +
 | 
| +    vpxor   Xhi, Xhi, Xhi
 | 
| +
 | 
| +    vmovdqu T, XMMWORD PTR[Tp]
 | 
| +    ; we hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, we hash the leading n%8 blocks first
 | 
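| +    ; the first of the n%8 prefix blocks is multiplied by H^(n%8) and the
| +    ; last by H (hlp0 indexes Htbl and counts down by 16), so the powers
| +    ; line up with the 8-block aggregated loop that follows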
| +    mov hlp0, len
 | 
| +    and hlp0, 128-1
 | 
| +    jz  Lmod_loop
 | 
| +
 | 
| +    and len, -128
 | 
| +    sub hlp0, 16
 | 
| +
 | 
| +    ; Prefix block
 | 
| +    vmovdqu DATA, XMMWORD PTR[inp]
 | 
| +    vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +    vpxor   DATA, DATA, T
 | 
| +
 | 
| +    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
 | 
| +    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
 | 
| +    vpshufd     TMP3, DATA, 78
 | 
| +    vpxor       TMP3, TMP3, DATA
 | 
| +    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
 | 
| +
 | 
| +    lea     inp, [inp+16]
 | 
| +    test    hlp0, hlp0
 | 
| +    jnz     Lpre_loop
 | 
| +    jmp     Lred1
 | 
| +
 | 
| +    ; hash the remaining prefix blocks (up to 7 prefix blocks in total)
 | 
| +Lpre_loop:
 | 
| +
 | 
| +        sub hlp0, 16
 | 
| +
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +
 | 
| +        vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
 | 
| +        vpxor       TMP0, TMP0, TMP3
 | 
| +        vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
 | 
| +        vpxor       TMP1, TMP1, TMP3
 | 
| +        vpshufd     TMP3, DATA, 78
 | 
| +        vpxor       TMP3, TMP3, DATA
 | 
| +        vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
 | 
| +        vpxor       TMP2, TMP2, TMP3
 | 
| +
 | 
| +        test    hlp0, hlp0
 | 
| +        lea     inp, [inp+16]
 | 
| +        jnz     Lpre_loop
 | 
| +
 | 
| +Lred1:
 | 
| +
 | 
| +    vpxor       TMP2, TMP2, TMP0
 | 
| +    vpxor       TMP2, TMP2, TMP1
 | 
| +    vpsrldq     TMP3, TMP2, 8
 | 
| +    vpslldq     TMP2, TMP2, 8
 | 
| +
 | 
| +    vpxor       Xhi, TMP1, TMP3
 | 
| +    vpxor       T, TMP0, TMP2
 | 
| +
 | 
| +
 | 
| +Lmod_loop:
 | 
| +
 | 
| +        sub len, 16*8
 | 
| +        jb  Ldone
 | 
| +        ; Block #0
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +
 | 
| +        vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
 | 
| +        vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
 | 
| +        vpshufd     TMP3, DATA, 78
 | 
| +        vpxor       TMP3, TMP3, DATA
 | 
| +        vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
 | 
| +
 | 
| +        ; Block #1
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +        KARATSUBA_AAD 1
 | 
| +
 | 
| +        ; Block #2
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +
 | 
| +        vpclmulqdq  TMP4, T, [Lpoly], 010h         ;reduction stage 1a
 | 
| +        vpalignr    T, T, T, 8
 | 
| +
 | 
| +        KARATSUBA_AAD 2
 | 
| +
 | 
| +        vpxor       T, T, TMP4                          ;reduction stage 1b
 | 
| +
 | 
| +        ; Block #3
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +        KARATSUBA_AAD 3
 | 
| +        ; Block #4
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +
 | 
| +        vpclmulqdq  TMP4, T, [Lpoly], 010h        ;reduction stage 2a
 | 
| +        vpalignr    T, T, T, 8
 | 
| +
 | 
| +        KARATSUBA_AAD 4
 | 
| +
 | 
| +        vpxor       T, T, TMP4                          ;reduction stage 2b
 | 
| +        ; Block #5
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +        KARATSUBA_AAD 5
 | 
| +
 | 
| +        vpxor   T, T, Xhi                               ;reduction finalize
 | 
| +        ; Block #6
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +        KARATSUBA_AAD 6
 | 
| +        ; Block #7
 | 
| +        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
 | 
| +        vpshufb DATA, DATA, [Lbswap_mask]
 | 
| +        vpxor   DATA, DATA, T
 | 
| +        KARATSUBA_AAD 7
 | 
| +        ; Aggregated 8 blocks, now karatsuba fixup
 | 
| +        vpxor   TMP2, TMP2, TMP0
 | 
| +        vpxor   TMP2, TMP2, TMP1
 | 
| +        vpsrldq TMP3, TMP2, 8
 | 
| +        vpslldq TMP2, TMP2, 8
 | 
| +
 | 
| +        vpxor   Xhi, TMP1, TMP3
 | 
| +        vpxor   T, TMP0, TMP2
 | 
| +
 | 
| +        lea inp, [inp + 16*8]
 | 
| +        jmp Lmod_loop
 | 
| +
 | 
| +Ldone:
 | 
| +    vpclmulqdq  TMP4, T, [Lpoly], 010h
 | 
| +    vpalignr    T, T, T, 8
 | 
| +    vpxor       T, T, TMP4
 | 
| +
 | 
| +    vpclmulqdq  TMP4, T, [Lpoly], 010h
 | 
| +    vpalignr    T, T, T, 8
 | 
| +    vpxor       T, T, TMP4
 | 
| +
 | 
| +    vpxor       T, T, Xhi
 | 
| +    vmovdqu     XMMWORD PTR[Tp], T
 | 
| +    vzeroupper
 | 
| +
 | 
| +    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
 | 
| +    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
 | 
| +    add rsp, 16*2
 | 
| +
 | 
| +    ret
 | 
| +
 | 
| +intel_aes_gcmAAD ENDP
 | 
| +
 | 
| +
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
| +;
 | 
| +; Encrypt and Authenticate
 | 
| +; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
 | 
| +;
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
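| +; Layout assumed by the loads below: Gctx begins with the 16-entry Htbl;
| +; the GHASH state T is read at offset 16*16 + 1*16, the counter block at
| +; 16*16 + 2*16, and a pointer to the expanded AES key at 16*16 + 3*16.
| +; NR comes from [KS + 4] and the round keys start at [KS + 48].
| +;
| +; The fast path encrypts 8 counter blocks per iteration while GHASHing the
| +; 8 ciphertext blocks produced by the previous iteration (stashed, already
| +; byte-swapped, at [rsp + 0*16 .. rsp + 7*16]); the 8 pre-whitened counter
| +; blocks live at [rsp + 8*16 .. rsp + 15*16]. Remaining whole blocks go
| +; through LEncDataSingles one at a time, and a trailing partial block
| +; through LEncDataTail.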
| +
 | 
| +ALIGN 16
 | 
| +intel_aes_gcmENC PROC
 | 
| +
 | 
| +PT      textequ <rcx>
 | 
| +CT      textequ <rdx>
 | 
| +Htbl    textequ <r8>
 | 
| +Gctx    textequ <r8>
 | 
| +len     textequ <r9>
 | 
| +KS      textequ <r10>
 | 
| +NR      textequ <eax>
 | 
| +
 | 
| +aluCTR  textequ <r11d>
 | 
| +aluKSl  textequ <r12d>
 | 
| +aluTMP  textequ <r13d>
 | 
| +
 | 
| +T       textequ <xmm0>
 | 
| +TMP0    textequ <xmm1>
 | 
| +TMP1    textequ <xmm2>
 | 
| +TMP2    textequ <xmm3>
 | 
| +TMP3    textequ <xmm4>
 | 
| +TMP4    textequ <xmm5>
 | 
| +TMP5    textequ <xmm6>
 | 
| +CTR0    textequ <xmm7>
 | 
| +CTR1    textequ <xmm8>
 | 
| +CTR2    textequ <xmm9>
 | 
| +CTR3    textequ <xmm10>
 | 
| +CTR4    textequ <xmm11>
 | 
| +CTR5    textequ <xmm12>
 | 
| +CTR6    textequ <xmm13>
 | 
| +CTR7    textequ <xmm14>
 | 
| +BSWAPMASK   textequ <xmm15>
 | 
| +
 | 
| +ROUND MACRO i
 | 
| +    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
 | 
| +    vaesenc CTR0, CTR0, TMP3
 | 
| +    vaesenc CTR1, CTR1, TMP3
 | 
| +    vaesenc CTR2, CTR2, TMP3
 | 
| +    vaesenc CTR3, CTR3, TMP3
 | 
| +    vaesenc CTR4, CTR4, TMP3
 | 
| +    vaesenc CTR5, CTR5, TMP3
 | 
| +    vaesenc CTR6, CTR6, TMP3
 | 
| +    vaesenc CTR7, CTR7, TMP3
 | 
| +ENDM
 | 
| +ROUNDMUL MACRO i
 | 
| +    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
 | 
| +
 | 
| +    vaesenc CTR0, CTR0, TMP3
 | 
| +    vaesenc CTR1, CTR1, TMP3
 | 
| +    vaesenc CTR2, CTR2, TMP3
 | 
| +    vaesenc CTR3, CTR3, TMP3
 | 
| +
 | 
| +    vpshufd TMP4, TMP5, 78
 | 
| +    vpxor   TMP4, TMP4, TMP5
 | 
| +
 | 
| +    vaesenc CTR4, CTR4, TMP3
 | 
| +    vaesenc CTR5, CTR5, TMP3
 | 
| +    vaesenc CTR6, CTR6, TMP3
 | 
| +    vaesenc CTR7, CTR7, TMP3
 | 
| +
 | 
| +    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
 | 
| +    vpxor       TMP0, TMP0, TMP3
 | 
| +    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
 | 
| +    vpclmulqdq  TMP3, TMP5, TMP4, 011h
 | 
| +    vpxor       TMP1, TMP1, TMP3
 | 
| +    vpclmulqdq  TMP3, TMP5, TMP4, 000h
 | 
| +    vpxor       TMP2, TMP2, TMP3
 | 
| +ENDM
 | 
| +KARATSUBA MACRO i
 | 
| +    vpshufd TMP4, TMP5, 78
 | 
| +    vpxor   TMP4, TMP4, TMP5
 | 
| +    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
 | 
| +    vpxor       TMP0, TMP0, TMP3
 | 
| +    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
 | 
| +    vpclmulqdq  TMP3, TMP5, TMP4, 011h
 | 
| +    vpxor       TMP1, TMP1, TMP3
 | 
| +    vpclmulqdq  TMP3, TMP5, TMP4, 000h
 | 
| +    vpxor       TMP2, TMP2, TMP3
 | 
| +ENDM
 | 
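| +; ROUND i applies one AES round (round key i) to all eight counter blocks.
| +; ROUNDMUL i does the same while also folding one stashed ciphertext block
| +; (TMP5) into the GHASH accumulators against Htbl[i]: TMP1 collects the
| +; high products, TMP2 the low products, TMP0 the Karatsuba middle products.
| +; KARATSUBA i is the same GHASH step without the AES rounds.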
| +NEXTCTR MACRO i
 | 
| +    add aluCTR, 1
 | 
| +    mov aluTMP, aluCTR
 | 
| +    xor aluTMP, aluKSl
 | 
| +    bswap   aluTMP
 | 
| +    mov [3*4 + 8*16 + i*16 + rsp], aluTMP
 | 
| +ENDM
 | 
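| +; NEXTCTR keeps the low 32 bits of the counter in aluCTR in integer order.
| +; The counter blocks stashed on the stack are stored already XORed with
| +; round key 0 (see the TMP0 setup below), so advancing a counter is just:
| +; increment aluCTR, XOR in the matching dword of round key 0 (aluKSl),
| +; byte-swap, and patch the last dword (offset 3*4) of the stashed block.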
| +
 | 
| +
 | 
| +    test  len, len
 | 
| +    jnz   LbeginENC
 | 
| +    ret
 | 
| +
 | 
| +LbeginENC:
 | 
| +
 | 
| +    vzeroupper
 | 
| +    push    r11
 | 
| +    push    r12
 | 
| +    push    r13
 | 
| +    push    rbp
 | 
| +    sub rsp, 10*16
 | 
| +    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
 | 
| +    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
 | 
| +    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
 | 
| +    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
 | 
| +    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
 | 
| +    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
 | 
| +    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
 | 
| +    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
 | 
| +    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
 | 
| +    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
 | 
| +
 | 
| +    mov rbp, rsp
 | 
| +    sub rsp, 16*16
 | 
| +    and rsp, -16
 | 
| +
 | 
| +    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
 | 
| +    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
 | 
| +    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
 | 
| +    mov     KS, [16*16 + 3*16 + Gctx]
 | 
| +    mov     NR, [4 + KS]
 | 
| +    lea     KS, [48 + KS]
 | 
| +
 | 
| +    vpshufb CTR0, CTR0, BSWAPMASK
 | 
| +
 | 
| +    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
 | 
| +    mov aluKSl, [3*4 + KS]
 | 
| +    bswap   aluCTR
 | 
| +    bswap   aluKSl
 | 
| +
 | 
| +    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
 | 
| +    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
 | 
| +
 | 
| +    cmp len, 128
 | 
| +    jb  LEncDataSingles
 | 
| +; Prepare the "top" counters
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
 | 
| +
 | 
| +; Encrypt the initial 8 blocks
 | 
| +    sub len, 128
 | 
| +    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
 | 
| +    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
 | 
| +    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
 | 
| +    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
 | 
| +    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
 | 
| +    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
 | 
| +    vpaddd  CTR7, CTR6, XMMWORD PTR[Lone]
 | 
| +
 | 
| +    vpshufb CTR0, CTR0, BSWAPMASK
 | 
| +    vpshufb CTR1, CTR1, BSWAPMASK
 | 
| +    vpshufb CTR2, CTR2, BSWAPMASK
 | 
| +    vpshufb CTR3, CTR3, BSWAPMASK
 | 
| +    vpshufb CTR4, CTR4, BSWAPMASK
 | 
| +    vpshufb CTR5, CTR5, BSWAPMASK
 | 
| +    vpshufb CTR6, CTR6, BSWAPMASK
 | 
| +    vpshufb CTR7, CTR7, BSWAPMASK
 | 
| +
 | 
| +    vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
 | 
| +    vpxor   CTR0, CTR0, TMP3
 | 
| +    vpxor   CTR1, CTR1, TMP3
 | 
| +    vpxor   CTR2, CTR2, TMP3
 | 
| +    vpxor   CTR3, CTR3, TMP3
 | 
| +    vpxor   CTR4, CTR4, TMP3
 | 
| +    vpxor   CTR5, CTR5, TMP3
 | 
| +    vpxor   CTR6, CTR6, TMP3
 | 
| +    vpxor   CTR7, CTR7, TMP3
 | 
| +
 | 
| +    ROUND   1
 | 
| +
 | 
| +    add aluCTR, 8
 | 
| +    mov aluTMP, aluCTR
 | 
| +    xor aluTMP, aluKSl
 | 
| +    bswap   aluTMP
 | 
| +    mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
 | 
| +
 | 
| +    ROUND   2
 | 
| +    NEXTCTR 1
 | 
| +    ROUND   3
 | 
| +    NEXTCTR 2
 | 
| +    ROUND   4
 | 
| +    NEXTCTR 3
 | 
| +    ROUND   5
 | 
| +    NEXTCTR 4
 | 
| +    ROUND   6
 | 
| +    NEXTCTR 5
 | 
| +    ROUND   7
 | 
| +    NEXTCTR 6
 | 
| +    ROUND   8
 | 
| +    NEXTCTR 7
 | 
| +    ROUND   9
 | 
| +    vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
 | 
| +    cmp     NR, 10
 | 
| +    je      @f
 | 
| +
 | 
| +    ROUND   10
 | 
| +    ROUND   11
 | 
| +    vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
 | 
| +    cmp     NR, 12
 | 
| +    je      @f
 | 
| +
 | 
| +    ROUND   12
 | 
| +    ROUND   13
 | 
| +    vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
 | 
| +    vaesenclast CTR0, CTR0, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
 | 
| +    vaesenclast CTR1, CTR1, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
 | 
| +    vaesenclast CTR2, CTR2, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
 | 
| +    vaesenclast CTR3, CTR3, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
 | 
| +    vaesenclast CTR4, CTR4, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
 | 
| +    vaesenclast CTR5, CTR5, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
 | 
| +    vaesenclast CTR6, CTR6, TMP3
 | 
| +    vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
 | 
| +    vaesenclast CTR7, CTR7, TMP3
 | 
| +
 | 
| +    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
 | 
| +    vpshufb CTR0, CTR0, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
 | 
| +    vpshufb CTR1, CTR1, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
 | 
| +    vpshufb CTR2, CTR2, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
 | 
| +    vpshufb CTR3, CTR3, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
 | 
| +    vpshufb CTR4, CTR4, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
 | 
| +    vpshufb CTR5, CTR5, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
 | 
| +    vpshufb CTR6, CTR6, BSWAPMASK
 | 
| +    vmovdqu XMMWORD PTR[7*16 + CT], CTR7
 | 
| +    vpshufb TMP5, CTR7, BSWAPMASK
 | 
| +
 | 
| +    vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
 | 
| +    vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
 | 
| +    vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
 | 
| +    vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
 | 
| +    vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
 | 
| +    vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
 | 
| +    vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
 | 
| +
 | 
| +    lea CT, [8*16 + CT]
 | 
| +    lea PT, [8*16 + PT]
 | 
| +    jmp LEncDataOctets
 | 
| +
 | 
| +LEncDataOctets:
 | 
| +        cmp len, 128
 | 
| +        jb  LEndEncOctets
 | 
| +        sub len, 128
 | 
| +
 | 
| +        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
 | 
| +        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
 | 
| +        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
 | 
| +        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
 | 
| +        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
 | 
| +        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
 | 
| +        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
 | 
| +        vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
 | 
| +
 | 
| +        vpshufd TMP4, TMP5, 78
 | 
| +        vpxor   TMP4, TMP4, TMP5
 | 
| +        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
 | 
| +        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
 | 
| +        vpclmulqdq  TMP1, TMP5, TMP4, 011h
 | 
| +        vpclmulqdq  TMP2, TMP5, TMP4, 000h
 | 
| +
 | 
| +        vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
 | 
| +        ROUNDMUL 1
 | 
| +        NEXTCTR 0
 | 
| +        vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
 | 
| +        ROUNDMUL 2
 | 
| +        NEXTCTR 1
 | 
| +        vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
 | 
| +        ROUNDMUL 3
 | 
| +        NEXTCTR 2
 | 
| +        vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
 | 
| +        ROUNDMUL 4
 | 
| +        NEXTCTR 3
 | 
| +        vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
 | 
| +        ROUNDMUL 5
 | 
| +        NEXTCTR 4
 | 
| +        vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
 | 
| +        ROUNDMUL 6
 | 
| +        NEXTCTR 5
 | 
| +        vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
 | 
| +        ROUNDMUL 7
 | 
| +        NEXTCTR 6
 | 
| +
 | 
| +        ROUND 8
 | 
| +        NEXTCTR 7
 | 
| +
 | 
| +        vpxor   TMP0, TMP0, TMP1
 | 
| +        vpxor   TMP0, TMP0, TMP2
 | 
| +        vpsrldq TMP3, TMP0, 8
 | 
| +        vpxor   TMP4, TMP1, TMP3
 | 
| +        vpslldq TMP3, TMP0, 8
 | 
| +        vpxor   T, TMP2, TMP3
 | 
| +
 | 
| +        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
 | 
| +        vpalignr    T,T,T,8
 | 
| +        vpxor       T, T, TMP1
 | 
| +
 | 
| +        ROUND 9
 | 
| +
 | 
| +        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
 | 
| +        vpalignr    T,T,T,8
 | 
| +        vpxor       T, T, TMP1
 | 
| +
 | 
| +        vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
 | 
| +        cmp         NR, 10
 | 
| +        je          @f
 | 
| +
 | 
| +        ROUND 10
 | 
| +        ROUND 11
 | 
| +        vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
 | 
| +        cmp         NR, 12
 | 
| +        je          @f
 | 
| +
 | 
| +        ROUND 12
 | 
| +        ROUND 13
 | 
| +        vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
 | 
| +        vaesenclast CTR0, CTR0, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
 | 
| +        vaesenclast CTR1, CTR1, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
 | 
| +        vaesenclast CTR2, CTR2, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
 | 
| +        vaesenclast CTR3, CTR3, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
 | 
| +        vaesenclast CTR4, CTR4, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
 | 
| +        vaesenclast CTR5, CTR5, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
 | 
| +        vaesenclast CTR6, CTR6, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
 | 
| +        vaesenclast CTR7, CTR7, TMP3
 | 
| +
 | 
| +        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
 | 
| +        vpshufb CTR0, CTR0, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
 | 
| +        vpshufb CTR1, CTR1, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
 | 
| +        vpshufb CTR2, CTR2, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
 | 
| +        vpshufb CTR3, CTR3, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
 | 
| +        vpshufb CTR4, CTR4, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
 | 
| +        vpshufb CTR5, CTR5, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
 | 
| +        vpshufb CTR6, CTR6, BSWAPMASK
 | 
| +        vmovdqu XMMWORD PTR[7*16 + CT], CTR7
 | 
| +        vpshufb TMP5, CTR7, BSWAPMASK
 | 
| +
 | 
| +        vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
 | 
| +        vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
 | 
| +        vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
 | 
| +        vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
 | 
| +        vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
 | 
| +        vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
 | 
| +        vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
 | 
| +
 | 
| +        vpxor   T, T, TMP4
 | 
| +
 | 
| +        lea CT, [8*16 + CT]
 | 
| +        lea PT, [8*16 + PT]
 | 
| +        jmp LEncDataOctets
 | 
| +
 | 
| +LEndEncOctets:
 | 
| +
 | 
| +    vpshufd TMP4, TMP5, 78
 | 
| +    vpxor   TMP4, TMP4, TMP5
 | 
| +    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
 | 
| +    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
 | 
| +    vpclmulqdq  TMP1, TMP5, TMP4, 011h
 | 
| +    vpclmulqdq  TMP2, TMP5, TMP4, 000h
 | 
| +
 | 
| +    vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
 | 
| +    KARATSUBA 1
 | 
| +    vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
 | 
| +    KARATSUBA 2
 | 
| +    vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
 | 
| +    KARATSUBA 3
 | 
| +    vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
 | 
| +    KARATSUBA 4
 | 
| +    vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
 | 
| +    KARATSUBA 5
 | 
| +    vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
 | 
| +    KARATSUBA 6
 | 
| +    vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
 | 
| +    KARATSUBA 7
 | 
| +
 | 
| +    vpxor   TMP0, TMP0, TMP1
 | 
| +    vpxor   TMP0, TMP0, TMP2
 | 
| +    vpsrldq TMP3, TMP0, 8
 | 
| +    vpxor   TMP4, TMP1, TMP3
 | 
| +    vpslldq TMP3, TMP0, 8
 | 
| +    vpxor   T, TMP2, TMP3
 | 
| +
 | 
| +    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
 | 
| +    vpalignr    T,T,T,8
 | 
| +    vpxor       T, T, TMP1
 | 
| +
 | 
| +    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
 | 
| +    vpalignr    T,T,T,8
 | 
| +    vpxor       T, T, TMP1
 | 
| +
 | 
| +    vpxor       T, T, TMP4
 | 
| +
 | 
| +    sub aluCTR, 7
 | 
| +
 | 
| +LEncDataSingles:
 | 
| +
 | 
| +        cmp len, 16
 | 
| +        jb  LEncDataTail
 | 
| +        sub len, 16
 | 
| +
 | 
| +        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
 | 
| +        NEXTCTR 0
 | 
| +
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
 | 
| +        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
 | 
| +        cmp NR, 10
 | 
| +        je  @f
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
 | 
| +        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
 | 
| +        cmp NR, 12
 | 
| +        je  @f
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
 | 
| +        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +        vaesenclast TMP1, TMP1, TMP2
 | 
| +        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
 | 
| +        vmovdqu XMMWORD PTR[CT], TMP1
 | 
| +
 | 
| +        lea PT, [16+PT]
 | 
| +        lea CT, [16+CT]
 | 
| +
 | 
| +        vpshufb TMP1, TMP1, BSWAPMASK
 | 
| +        vpxor   T, T, TMP1
 | 
| +        vmovdqu TMP0, XMMWORD PTR[Htbl]
 | 
| +        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
 | 
| +
 | 
| +        jmp LEncDataSingles
 | 
| +
 | 
| +LEncDataTail:
 | 
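| +; Trailing partial block: the keystream block is computed as usual, the
| +; last len bytes of PT are copied into a zeroed 16-byte slot at [rsp] and
| +; XORed with the keystream, the len ciphertext bytes are copied out to CT,
| +; and the slot is re-zeroed past len so GHASH sees the ciphertext block
| +; padded with zeros, as GCM requires.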
| +
 | 
| +    test    len, len
 | 
| +    jz  LEncDataEnd
 | 
| +
 | 
| +    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
 | 
| +
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
 | 
| +    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
 | 
| +    cmp NR, 10
 | 
| +    je  @f
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
 | 
| +    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
 | 
| +    cmp NR, 12
 | 
| +    je  @f
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
 | 
| +    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +    vaesenclast TMP1, TMP1, TMP2
 | 
| +; zero a temp location
 | 
| +    vpxor   TMP2, TMP2, TMP2
 | 
| +    vmovdqa XMMWORD PTR[rsp], TMP2
 | 
| +; copy as many bytes as needed
 | 
| +    xor KS, KS
 | 
| +
 | 
| +@@:
 | 
| +        cmp len, KS
 | 
| +        je  @f
 | 
| +        mov al, [PT + KS]
 | 
| +        mov [rsp + KS], al
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
 | 
| +    vmovdqa XMMWORD PTR[rsp], TMP1
 | 
| +    xor KS, KS
 | 
| +@@:
 | 
| +        cmp len, KS
 | 
| +        je  @f
 | 
| +        mov al, [rsp + KS]
 | 
| +        mov [CT + KS], al
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +        cmp KS, 16
 | 
| +        je  @f
 | 
| +        mov BYTE PTR[rsp + KS], 0
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +BAIL:
 | 
| +    vmovdqa TMP1, XMMWORD PTR[rsp]
 | 
| +    vpshufb TMP1, TMP1, BSWAPMASK
 | 
| +    vpxor   T, T, TMP1
 | 
| +    vmovdqu TMP0, XMMWORD PTR[Htbl]
 | 
| +    GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
 | 
| +
 | 
| +LEncDataEnd:
 | 
| +
 | 
| +    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
 | 
| +    bswap   aluCTR
 | 
| +    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
 | 
| +
 | 
| +    mov rsp, rbp
 | 
| +
 | 
| +    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
 | 
| +    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
 | 
| +    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
 | 
| +    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
 | 
| +    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
 | 
| +    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
 | 
| +    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
 | 
| +    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
 | 
| +    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
 | 
| +    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
 | 
| +
 | 
| +    add rsp, 10*16
 | 
| +    pop rbp
 | 
| +    pop r13
 | 
| +    pop r12
 | 
| +    pop r11
 | 
| +
 | 
| +    vzeroupper
 | 
| +
 | 
| +    ret
 | 
| +intel_aes_gcmENC ENDP
 | 
| +
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
| +;
 | 
| +; Decrypt and Authenticate
 | 
| +; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
 | 
| +;
 | 
| +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | 
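| +; Same structure as intel_aes_gcmENC, with two differences worth noting:
| +; the PT/CT textequs swap rcx and rdx relative to the ENC path, and GHASH
| +; is fed the incoming ciphertext directly, so authentication and decryption
| +; happen in a single pass and no 8-block ciphertext stash is needed (the
| +; stack scratch is only 8*16 bytes of pre-whitened counters; NEXTCTR is
| +; redefined below to match the new layout).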
| +
 | 
| +ALIGN 16
 | 
| +intel_aes_gcmDEC PROC
 | 
| +
 | 
| +NEXTCTR MACRO i
 | 
| +    add aluCTR, 1
 | 
| +    mov aluTMP, aluCTR
 | 
| +    xor aluTMP, aluKSl
 | 
| +    bswap   aluTMP
 | 
| +    mov [3*4 + i*16 + rsp], aluTMP
 | 
| +ENDM
 | 
| +
 | 
| +PT      textequ <rdx>
 | 
| +CT      textequ <rcx>
 | 
| +
 | 
| +    test  len, len
 | 
| +    jnz   LbeginDEC
 | 
| +    ret
 | 
| +
 | 
| +LbeginDEC:
 | 
| +
 | 
| +    vzeroupper
 | 
| +    push    r11
 | 
| +    push    r12
 | 
| +    push    r13
 | 
| +    push    rbp
 | 
| +    sub rsp, 10*16
 | 
| +    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
 | 
| +    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
 | 
| +    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
 | 
| +    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
 | 
| +    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
 | 
| +    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
 | 
| +    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
 | 
| +    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
 | 
| +    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
 | 
| +    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
 | 
| +
 | 
| +    mov rbp, rsp
 | 
| +    sub rsp, 8*16
 | 
| +    and rsp, -16
 | 
| +
 | 
| +    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
 | 
| +    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
 | 
| +    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
 | 
| +    mov     KS, [16*16 + 3*16 + Gctx]
 | 
| +    mov     NR, [4 + KS]
 | 
| +    lea     KS, [48 + KS]
 | 
| +
 | 
| +    vpshufb CTR0, CTR0, BSWAPMASK
 | 
| +
 | 
| +    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
 | 
| +    mov aluKSl, [3*4 + KS]
 | 
| +    bswap   aluCTR
 | 
| +    bswap   aluKSl
 | 
| +
 | 
| +    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
 | 
| +    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
 | 
| +    vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
 | 
| +
 | 
| +    cmp len, 128
 | 
| +    jb  LDecDataSingles
 | 
| +; Prepare the "top" counters
 | 
| +    vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
 | 
| +    vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
 | 
| +
 | 
| +    NEXTCTR 1
 | 
| +    NEXTCTR 2
 | 
| +    NEXTCTR 3
 | 
| +    NEXTCTR 4
 | 
| +    NEXTCTR 5
 | 
| +    NEXTCTR 6
 | 
| +    NEXTCTR 7
 | 
| +
 | 
| +LDecDataOctets:
 | 
| +        cmp len, 128
 | 
| +        jb  LEndDecOctets
 | 
| +        sub len, 128
 | 
| +
 | 
| +        vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
 | 
| +        vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
 | 
| +        vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
 | 
| +        vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
 | 
| +        vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
 | 
| +        vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
 | 
| +        vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
 | 
| +        vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
 | 
| +
 | 
| +        vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        vpshufd TMP4, TMP5, 78
 | 
| +        vpxor   TMP4, TMP4, TMP5
 | 
| +        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
 | 
| +        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
 | 
| +        vpclmulqdq  TMP1, TMP5, TMP4, 011h
 | 
| +        vpclmulqdq  TMP2, TMP5, TMP4, 000h
 | 
| +
 | 
| +        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        ROUNDMUL 1
 | 
| +        NEXTCTR 0
 | 
| +        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        ROUNDMUL 2
 | 
| +        NEXTCTR 1
 | 
| +        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        ROUNDMUL 3
 | 
| +        NEXTCTR 2
 | 
| +        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        ROUNDMUL 4
 | 
| +        NEXTCTR 3
 | 
| +        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        ROUNDMUL 5
 | 
| +        NEXTCTR 4
 | 
| +        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        ROUNDMUL 6
 | 
| +        NEXTCTR 5
 | 
| +        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
 | 
| +        vpshufb TMP5, TMP5, BSWAPMASK
 | 
| +        vpxor   TMP5, TMP5, T
 | 
| +        ROUNDMUL 7
 | 
| +        NEXTCTR 6
 | 
| +
 | 
| +        ROUND 8
 | 
| +        NEXTCTR 7
 | 
| +
 | 
| +        vpxor   TMP0, TMP0, TMP1
 | 
| +        vpxor   TMP0, TMP0, TMP2
 | 
| +        vpsrldq TMP3, TMP0, 8
 | 
| +        vpxor   TMP4, TMP1, TMP3
 | 
| +        vpslldq TMP3, TMP0, 8
 | 
| +        vpxor   T, TMP2, TMP3
 | 
| +
 | 
| +        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
 | 
| +        vpalignr    T,T,T,8
 | 
| +        vpxor       T, T, TMP1
 | 
| +
 | 
| +        ROUND 9
 | 
| +
 | 
| +        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
 | 
| +        vpalignr    T,T,T,8
 | 
| +        vpxor       T, T, TMP1
 | 
| +
 | 
| +        vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
 | 
| +        cmp         NR, 10
 | 
| +        je          @f
 | 
| +
 | 
| +        ROUND 10
 | 
| +        ROUND 11
 | 
| +        vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
 | 
| +        cmp         NR, 12
 | 
| +        je          @f
 | 
| +
 | 
| +        ROUND 12
 | 
| +        ROUND 13
 | 
| +        vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + CT]
 | 
| +        vaesenclast CTR0, CTR0, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + CT]
 | 
| +        vaesenclast CTR1, CTR1, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + CT]
 | 
| +        vaesenclast CTR2, CTR2, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + CT]
 | 
| +        vaesenclast CTR3, CTR3, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + CT]
 | 
| +        vaesenclast CTR4, CTR4, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + CT]
 | 
| +        vaesenclast CTR5, CTR5, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + CT]
 | 
| +        vaesenclast CTR6, CTR6, TMP3
 | 
| +        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + CT]
 | 
| +        vaesenclast CTR7, CTR7, TMP3
 | 
| +
 | 
| +        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
 | 
| +        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
 | 
| +        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
 | 
| +        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
 | 
| +        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
 | 
| +        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
 | 
| +        vmovdqu XMMWORD PTR[6*16 + PT], CTR6
 | 
| +        vmovdqu XMMWORD PTR[7*16 + PT], CTR7
 | 
| +
 | 
| +        vpxor   T, T, TMP4
 | 
| +
 | 
| +        lea CT, [8*16 + CT]
 | 
| +        lea PT, [8*16 + PT]
 | 
| +        jmp LDecDataOctets
 | 
| +
 | 
| +LEndDecOctets:
 | 
| +
 | 
| +    sub aluCTR, 7
 | 
| +
 | 
| +LDecDataSingles:
 | 
| +
 | 
| +        cmp len, 16
 | 
| +        jb  LDecDataTail
 | 
| +        sub len, 16
 | 
| +
 | 
| +        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
 | 
| +        NEXTCTR 0
 | 
| +
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
 | 
| +        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
 | 
| +        cmp NR, 10
 | 
| +        je  @f
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
 | 
| +        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
 | 
| +        cmp NR, 12
 | 
| +        je  @f
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
 | 
| +        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
 | 
| +        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +        vaesenclast TMP1, TMP1, TMP2
 | 
| +
 | 
| +        vmovdqu TMP2, XMMWORD PTR[CT]
 | 
| +        vpxor   TMP1, TMP1, TMP2
 | 
| +        vmovdqu XMMWORD PTR[PT], TMP1
 | 
| +
 | 
| +        lea PT, [16+PT]
 | 
| +        lea CT, [16+CT]
 | 
| +
 | 
| +        vpshufb TMP2, TMP2, BSWAPMASK
 | 
| +        vpxor   T, T, TMP2
 | 
| +        vmovdqu TMP0, XMMWORD PTR[Htbl]
 | 
| +        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
 | 
| +
 | 
| +        jmp LDecDataSingles
 | 
| +
 | 
| +LDecDataTail:
 | 
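| +; as in the ENC tail: the last len ciphertext bytes are copied into a
| +; zero-padded block at [rsp], that block is GHASHed, and it is then XORed
| +; with the keystream before the len plaintext bytes are copied out to PT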
| +
 | 
| +    test    len, len
 | 
| +    jz      LDecDataEnd
 | 
| +
 | 
| +    vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
 | 
| +    inc aluCTR
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
 | 
| +    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
 | 
| +    cmp NR, 10
 | 
| +    je  @f
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
 | 
| +    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
 | 
| +    cmp NR, 12
 | 
| +    je  @f
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
 | 
| +    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
 | 
| +    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
 | 
| +@@:
 | 
| +    vaesenclast TMP1, TMP1, TMP2
 | 
| +; copy as many bytes as needed
 | 
| +    xor KS, KS
 | 
| +@@:
 | 
| +        cmp len, KS
 | 
| +        je  @f
 | 
| +        mov al, [CT + KS]
 | 
| +        mov [rsp + KS], al
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +        cmp KS, 16
 | 
| +        je  @f
 | 
| +        mov BYTE PTR[rsp + KS], 0
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +    vmovdqa TMP2, XMMWORD PTR[rsp]
 | 
| +    vpshufb TMP2, TMP2, BSWAPMASK
 | 
| +    vpxor   T, T, TMP2
 | 
| +    vmovdqu TMP0, XMMWORD PTR[Htbl]
 | 
| +    GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4
 | 
| +
 | 
| +
 | 
| +    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
 | 
| +    vmovdqa XMMWORD PTR[rsp], TMP1
 | 
| +    xor KS, KS
 | 
| +@@:
 | 
| +        cmp len, KS
 | 
| +        je  @f
 | 
| +        mov al, [rsp + KS]
 | 
| +        mov [PT + KS], al
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +        cmp KS, 16
 | 
| +        je  @f
 | 
| +        mov BYTE PTR[rsp + KS], 0
 | 
| +        inc KS
 | 
| +        jmp @b
 | 
| +@@:
 | 
| +
 | 
| +LDecDataEnd:
 | 
| +
 | 
| +    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
 | 
| +    bswap   aluCTR
 | 
| +    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
 | 
| +
 | 
| +    mov rsp, rbp
 | 
| +
 | 
| +    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
 | 
| +    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
 | 
| +    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
 | 
| +    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
 | 
| +    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
 | 
| +    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
 | 
| +    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
 | 
| +    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
 | 
| +    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
 | 
| +    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
 | 
| +
 | 
| +    add rsp, 10*16
 | 
| +    pop rbp
 | 
| +    pop r13
 | 
| +    pop r12
 | 
| +    pop r11
 | 
| +
 | 
| +    vzeroupper
 | 
| +
 | 
| +    ret
 | 
 | 
| +intel_aes_gcmDEC ENDP
 | 
| +
 | 
| +
 | 
| +END
 | 
| 
 |