Index: nss/lib/freebl/intel-gcm-x86-masm.asm |
diff --git a/nss/lib/freebl/intel-gcm-x86-masm.asm b/nss/lib/freebl/intel-gcm-x86-masm.asm |
new file mode 100644 |
index 0000000000000000000000000000000000000000..d8ba9b4961cf26a073905d760111755efe6f7058 |
--- /dev/null |
+++ b/nss/lib/freebl/intel-gcm-x86-masm.asm |
@@ -0,0 +1,1212 @@ |
+; LICENSE: |
+; This submission to NSS is to be made available under the terms of the |
+; Mozilla Public License, v. 2.0. You can obtain one at |
+; http://mozilla.org/MPL/2.0/. |
+;############################################################################### |
+; Copyright(c) 2014, Intel Corp. |
+; Developers and authors: |
+; Shay Gueron and Vlad Krasnov |
+; Intel Corporation, Israel Development Centre, Haifa, Israel |
+; Please send feedback directly to crypto.feedback.alias@intel.com |
+ |
+ |
+.MODEL FLAT, C |
+.XMM |
+; Constants referenced by the GCM routines below. |
+.DATA |
+ALIGN 16 |
+Lone dq 1,0                                          ; adds 1 to the low dword of a byte-swapped counter block (vpaddd) |
+Ltwo dq 2,0                                          ; adds 2 to the low dword of a byte-swapped counter block (vpaddd) |
+Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; vpshufb control that reverses all 16 bytes |
+Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh  ; not referenced by any code shown in this file |
+Lpoly dq 01h, 0c200000000000000h                     ; constant used by the vpclmulqdq folds and by the H*2 step in intel_aes_gcmINIT |
+ |
+.CODE |
+ |
+ |
+GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4 ; DST = carry-less product of SRC1 and SRC2, folded twice with Lpoly; clobbers TMP1-TMP4 |
+    vpclmulqdq TMP1, SRC2, SRC1, 0h         ;; low qword times low qword |
+    vpclmulqdq TMP4, SRC2, SRC1, 011h       ;; high qword times high qword |
+;; Karatsuba middle term: multiply (hi xor lo) of both operands, then xor out the two outer products. |
+    vpshufd TMP2, SRC2, 78                  ;; 78 swaps the two 64-bit halves |
+    vpshufd TMP3, SRC1, 78 |
+    vpxor TMP2, TMP2, SRC2 |
+    vpxor TMP3, TMP3, SRC1 |
+ |
+    vpclmulqdq TMP2, TMP2, TMP3, 0h |
+    vpxor TMP2, TMP2, TMP1 |
+    vpxor TMP2, TMP2, TMP4 |
+;; Spread the middle term over the 256-bit product: low 128 bits end up in TMP1, high 128 bits in TMP4. |
+    vpslldq TMP3, TMP2, 8 |
+    vpsrldq TMP2, TMP2, 8 |
+ |
+    vpxor TMP1, TMP1, TMP3 |
+    vpxor TMP4, TMP4, TMP2 |
+;; Two identical folds of the low half: low qword times the high qword of Lpoly, xor the half-swapped value. |
+    vpclmulqdq TMP2, TMP1, [Lpoly], 010h |
+    vpshufd TMP3, TMP1, 78 |
+    vpxor TMP1, TMP2, TMP3 |
+ |
+    vpclmulqdq TMP2, TMP1, [Lpoly], 010h |
+    vpshufd TMP3, TMP1, 78 |
+    vpxor TMP1, TMP2, TMP3 |
+;; DST is written only here, so it may name the same register as SRC1 (every call site in this file does that). |
+    vpxor DST, TMP1, TMP4 |
+ |
+    ENDM |
+ |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+; |
+; Generates the final GCM tag |
+; void intel_aes_gcmTAG(unsigned char Htbl[16*16], |
+; unsigned char *Tp, |
+; unsigned int Mlen, |
+; unsigned int Alen, |
+; unsigned char* X0, |
+; unsigned char* TAG); |
+; |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+ |
+ALIGN 16 |
+intel_aes_gcmTAG PROC |
+; cdecl entry. One register is pushed below, so argument k is read from [esp + 2*4 + k*4]. |
+Htbl    textequ <eax> |
+Tp      textequ <ecx> |
+X0      textequ <edx> |
+TAG     textequ <ebx> |
+; T accumulates the tag, TMP0 is scratch; the GFMUL call additionally clobbers xmm2-xmm5. |
+T       textequ <xmm0> |
+TMP0    textequ <xmm1> |
+ |
+    push    ebx |
+    vzeroupper |
+ |
+; Length block: dword 0 = Mlen, dword 2 = Alen, every other bit zero; each 64-bit lane is then multiplied by 8. |
+    vmovd   TMP0, DWORD PTR[esp + 2*4 + 2*4] |
+    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2 |
+    vpsllq  TMP0, TMP0, 3 |
+ |
+; Xor the length block into the 16 bytes stored at Tp. |
+    mov     Tp, [esp + 2*4 + 1*4] |
+    vpxor   T, TMP0, XMMWORD PTR[Tp] |
+ |
+; Multiply by the first Htbl entry. |
+    mov     Htbl, [esp + 2*4 + 0*4] |
+    vmovdqu TMP0, XMMWORD PTR[Htbl] |
+    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5 |
+ |
+; Reverse the bytes, xor with the 16 bytes at X0 and store the result at TAG. |
+    mov     X0, [esp + 2*4 + 4*4] |
+    mov     TAG, [esp + 2*4 + 5*4] |
+    vpshufb T, T, [Lbswap_mask] |
+    vpxor   T, T, [X0] |
+    vmovdqu XMMWORD PTR[TAG], T |
+    vzeroupper |
+ |
+    pop     ebx |
+    ret |
+ |
+intel_aes_gcmTAG ENDP |
+ |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+; |
+; Generates the H table |
+; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR); |
+; |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+ |
+ALIGN 16 |
+intel_aes_gcmINIT PROC |
+; cdecl entry, nothing is pushed: argument k is read from [esp + 4*1 + k*4]. |
+Htbl    textequ <eax> |
+KS      textequ <ecx> |
+NR      textequ <edx> |
+ |
+T       textequ <xmm0> |
+TMP0    textequ <xmm1> |
+ |
+    mov     NR, [esp + 4*1 + 2*4] |
+    mov     KS, [esp + 4*1 + 1*4] |
+    mov     Htbl, [esp + 4*1 + 0*4] |
+ |
+    vzeroupper |
+ |
+; AES-ENC(0): the state starts as round key 0 (a zero block xor round key 0). |
+    vmovdqu T, XMMWORD PTR[KS] |
+    sub     NR, 1 |
+; NR-1 middle rounds with round keys 1 .. NR-1. |
+LencRoundINIT: |
+    add     KS, 16 |
+    vaesenc T, T, [KS] |
+    sub     NR, 1 |
+    jnz     LencRoundINIT |
+ |
+; Final round with the round key that follows the last one used above. |
+    vaesenclast T, T, [KS + 16] |
+    vpshufb T, T, [Lbswap_mask] |
+ |
+; Calculate H` = GFMUL(H, 2): shift the 128-bit value left by one bit ... |
+    vpslld  xmm4, T, 1 |
+    vpsrld  xmm3, T, 31 |
+    vpslldq xmm3, xmm3, 4 |
+; ... and xor in Lpoly when the top bit of the value was set. |
+    vpsrad  xmm5, T, 31 |
+    vpshufd xmm5, xmm5, 0ffh |
+    vpand   xmm5, xmm5, [Lpoly] |
+    vpxor   T, xmm4, xmm3 |
+    vpxor   T, T, xmm5 |
+ |
+    vmovdqa TMP0, T |
+ |
+; Entry i (i = 0..7) holds the running product; entry 8+i holds its two halves xored together. |
+    i = 0 |
+    WHILE i LT 8 |
+    IF i GT 0 |
+    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5 |
+    ENDIF |
+    vmovdqu XMMWORD PTR[Htbl + i*16], T |
+    vpshufd xmm2, T, 78 |
+    vpxor   xmm2, xmm2, T |
+    vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2 |
+    i = i+1 |
+    ENDM |
+    vzeroupper |
+    ret |
+intel_aes_gcmINIT ENDP |
+ |
+ |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+; |
+; Authenticate only |
+; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp); |
+; |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+ |
+ALIGN 16 |
+intel_aes_gcmAAD PROC |
+; cdecl entry: on arrival the return address is at [esp] and argument k is at [esp + 4*1 + k*4]. |
+Htbl textequ <eax> |
+inp textequ <ecx> |
+len textequ <edx> |
+Tp textequ <ebx> |
+hlp0 textequ <esi> |
+; DATA = current input block, T / Xhi = low / high half of the pending product, TMP0-TMP2 = accumulators, TMP3-TMP4 = scratch. |
+DATA textequ <xmm0> |
+T textequ <xmm1> |
+TMP0 textequ <xmm2> |
+TMP1 textequ <xmm3> |
+TMP2 textequ <xmm4> |
+TMP3 textequ <xmm5> |
+TMP4 textequ <xmm6> |
+Xhi textequ <xmm7> |
+; KARATSUBA_AAD i: multiply DATA by Htbl entry i; accumulate low product in TMP0, high in TMP1, middle in TMP2. Clobbers TMP3. |
+KARATSUBA_AAD MACRO i |
+    vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h |
+    vpxor TMP0, TMP0, TMP3 |
+    vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h |
+    vpxor TMP1, TMP1, TMP3 |
+    vpshufd TMP3, DATA, 78 |
+    vpxor TMP3, TMP3, DATA |
+    vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h |
+    vpxor TMP2, TMP2, TMP3 |
+ENDM |
+; Nothing to hash when Alen (third argument, at [esp + 4*1 + 2*4] before any push) is zero. |
+    cmp DWORD PTR[esp + 4*1 + 2*4], 0 |
+    jnz LbeginAAD |
+    ret |
+ |
+LbeginAAD: |
+    push ebx |
+    push esi |
+; Two registers are now on the stack: argument k is at [esp + 4*3 + k*4]. |
+    mov Htbl, [esp + 4*3 + 0*4] |
+    mov inp, [esp + 4*3 + 1*4] |
+    mov len, [esp + 4*3 + 2*4] |
+    mov Tp, [esp + 4*3 + 3*4] |
+    vzeroupper |
+    vmovdqu T, XMMWORD PTR[Tp] |
+    ;we hash 8 blocks each iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first |
+    mov hlp0, len |
+    and hlp0, 128-1 |
+    jnz LprefixAAD |
+; No prefix blocks. Lmod_loop and Ldone fold the low half T with Lpoly and then xor in Xhi, so hand the |
+; incoming (already reduced) hash over as Xhi with a zero low half: the fold of zero is zero and the hash survives intact. |
+    vmovdqa Xhi, T |
+    vpxor T, T, T |
+    jmp Lmod_loop |
+ |
+LprefixAAD: |
+    and len, -128 |
+    sub hlp0, 16 |
+ |
+    ; Prefix block: the incoming hash is xored into the first block, which is multiplied by Htbl entry (n%8 - 1) |
+    vmovdqu DATA, XMMWORD PTR[inp] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+    vpxor DATA, DATA, T |
+ |
+    vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h |
+    vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h |
+    vpshufd TMP3, DATA, 78 |
+    vpxor TMP3, TMP3, DATA |
+    vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h |
+ |
+    lea inp, [inp+16] |
+    test hlp0, hlp0 |
+    jz Lred1 |
+ |
+    ;hash remaining prefix blocks (up to 7 total prefix blocks) |
+Lpre_loop: |
+    sub hlp0, 16 |
+ |
+    vmovdqu DATA, XMMWORD PTR[inp] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+ |
+    vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h |
+    vpxor TMP0, TMP0, TMP3 |
+    vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h |
+    vpxor TMP1, TMP1, TMP3 |
+    vpshufd TMP3, DATA, 78 |
+    vpxor TMP3, TMP3, DATA |
+    vpclmulqdq TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h |
+    vpxor TMP2, TMP2, TMP3 |
+ |
+    test hlp0, hlp0 |
+    lea inp, [inp+16] |
+    jnz Lpre_loop |
+ |
+Lred1: |
+; Karatsuba fixup: fold the middle term into the outer products; (Xhi:T) is the unreduced 256-bit result. |
+    vpxor TMP2, TMP2, TMP0 |
+    vpxor TMP2, TMP2, TMP1 |
+    vpsrldq TMP3, TMP2, 8 |
+    vpslldq TMP2, TMP2, 8 |
+ |
+    vpxor Xhi, TMP1, TMP3 |
+    vpxor T, TMP0, TMP2 |
+ |
+Lmod_loop: |
+; Main loop: 8 blocks (128 bytes) per pass. The Lpoly fold of the previous (Xhi:T) pair is interleaved with the multiplies. |
+    sub len, 16*8 |
+    jb Ldone                                              ; fewer than 128 bytes were left |
+    ; Block #0 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*7]                 ; last block of the group pairs with Htbl entry 0 |
+    vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask] |
+ |
+    vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h   ; these three start the accumulators (no xor) |
+    vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h |
+    vpshufd TMP3, DATA, 78 |
+    vpxor TMP3, TMP3, DATA |
+    vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h |
+ |
+    ; Block #1 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*6] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+    KARATSUBA_AAD 1 |
+ |
+    ; Block #2 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*5] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+ |
+    vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a |
+    vpalignr T, T, T, 8 |
+ |
+    KARATSUBA_AAD 2 |
+ |
+    vpxor T, T, TMP4 ;reduction stage 1b |
+ |
+    ; Block #3 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*4] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+    KARATSUBA_AAD 3 |
+    ; Block #4 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*3] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+ |
+    vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a |
+    vpalignr T, T, T, 8 |
+ |
+    KARATSUBA_AAD 4 |
+ |
+    vpxor T, T, TMP4 ;reduction stage 2b |
+    ; Block #5 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*2] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+    KARATSUBA_AAD 5 |
+ |
+    vpxor T, T, Xhi ;reduction finalize |
+    ; Block #6 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*1] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+    KARATSUBA_AAD 6 |
+    ; Block #7 |
+    vmovdqu DATA, XMMWORD PTR[inp + 16*0] |
+    vpshufb DATA, DATA, [Lbswap_mask] |
+    vpxor DATA, DATA, T                                   ; first block of the group absorbs the folded hash |
+    KARATSUBA_AAD 7 |
+    ; Aggregated 8 blocks, now karatsuba fixup |
+    vpxor TMP2, TMP2, TMP0 |
+    vpxor TMP2, TMP2, TMP1 |
+    vpsrldq TMP3, TMP2, 8 |
+    vpslldq TMP2, TMP2, 8 |
+ |
+    vpxor Xhi, TMP1, TMP3                                 ; new high half |
+    vpxor T, TMP0, TMP2                                   ; new low half, folded during the next pass or at Ldone |
+ |
+    lea inp, [inp + 16*8] |
+    jmp Lmod_loop |
+ |
+Ldone: |
+    vpclmulqdq TMP4, T, [Lpoly], 010h    ; first fold of the low half T |
+    vpalignr T, T, T, 8 |
+    vpxor T, T, TMP4 |
+ |
+    vpclmulqdq TMP4, T, [Lpoly], 010h    ; second fold |
+    vpalignr T, T, T, 8 |
+    vpxor T, T, TMP4 |
+ |
+    vpxor T, T, Xhi                      ; combine with the high half |
+    vmovdqu XMMWORD PTR[Tp], T           ; write the updated hash back to the caller's buffer |
+    vzeroupper |
+ |
+    pop esi |
+    pop ebx |
+    ret |
+ |
+intel_aes_gcmAAD ENDP |
+ |
+ |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+; |
+; Encrypt and Authenticate |
+; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len); |
+; |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+ |
+ALIGN 16 |
+intel_aes_gcmENC PROC |
+; Register and memory aliases. Htbl and Gctx name the same register: the table is addressed at offset 0 of Gctx. |
+PT textequ <eax> |
+CT textequ <ecx> |
+Htbl textequ <edx> |
+Gctx textequ <edx> |
+len textequ <DWORD PTR[ebp + 5*4 + 3*4]> |
+KS textequ <esi> |
+NR textequ <DWORD PTR[-40 + KS]> |
+; aluCTR = 32-bit counter in native byte order, aluTMP = scratch for building its byte-swapped form. |
+aluCTR textequ <ebx> |
+aluTMP textequ <edi> |
+; T is the running hash kept in memory at Gctx + 16*16 + 1*16. TMPn and CTRn (n = 1..6) alias the same xmm registers. |
+T textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]> |
+TMP0 textequ <xmm1> |
+TMP1 textequ <xmm2> |
+TMP2 textequ <xmm3> |
+TMP3 textequ <xmm4> |
+TMP4 textequ <xmm5> |
+TMP5 textequ <xmm6> |
+ |
+CTR0 textequ <xmm0> |
+CTR1 textequ <xmm1> |
+CTR2 textequ <xmm2> |
+CTR3 textequ <xmm3> |
+CTR4 textequ <xmm4> |
+CTR5 textequ <xmm5> |
+CTR6 textequ <xmm6> |
+; ROUND i: one vaesenc round with round key i on all seven counter blocks. Clobbers xmm7. |
+ROUND MACRO i |
+    vmovdqu xmm7, XMMWORD PTR[i*16 + KS] |
+    vaesenc CTR0, CTR0, xmm7 |
+    vaesenc CTR1, CTR1, xmm7 |
+    vaesenc CTR2, CTR2, xmm7 |
+    vaesenc CTR3, CTR3, xmm7 |
+    vaesenc CTR4, CTR4, xmm7 |
+    vaesenc CTR5, CTR5, xmm7 |
+    vaesenc CTR6, CTR6, xmm7 |
+ENDM |
+; KARATSUBA i: multiply TMP5 by Htbl entry i; accumulate middle product in TMP0, high in TMP1, low in TMP2. Clobbers TMP3, TMP4. |
+KARATSUBA MACRO i |
+    vpshufd TMP4, TMP5, 78 |
+    vpxor TMP4, TMP4, TMP5 |
+    vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h |
+    vpxor TMP0, TMP0, TMP3 |
+    vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl] |
+    vpclmulqdq TMP3, TMP5, TMP4, 011h |
+    vpxor TMP1, TMP1, TMP3 |
+    vpclmulqdq TMP3, TMP5, TMP4, 000h |
+    vpxor TMP2, TMP2, TMP3 |
+ENDM |
+; NEXTCTR i: bump aluCTR and patch the last dword of counter slot i ([esp + 8*16 + i*16]) with bswap(aluCTR) xor round key 0. |
+NEXTCTR MACRO i |
+    add aluCTR, 1 |
+    mov aluTMP, aluCTR |
+    bswap aluTMP |
+    xor aluTMP, [3*4 + KS] |
+    mov [3*4 + 8*16 + i*16 + esp], aluTMP |
+ENDM |
+; Return immediately when len (fourth argument, at [esp + 1*4 + 3*4] before any push) is zero. |
+    cmp DWORD PTR[1*4 + 3*4 + esp], 0 |
+    jne LbeginENC |
+    ret |
+ |
+LbeginENC: |
+; Prologue: save callee-saved registers, then carve a 16-byte aligned scratch frame below them. |
+    vzeroupper |
+    push ebp |
+    push ebx |
+    push esi |
+    push edi |
+; ebp keeps the pre-alignment esp: arguments are at [ebp + 5*4 + k*4] and the epilogue restores esp from it. |
+    mov ebp, esp |
+    sub esp, 16*16 |
+    and esp, -16 |
+ |
+    mov PT, [ebp + 5*4 + 0*4] |
+    mov CT, [ebp + 5*4 + 1*4] |
+    mov Gctx, [ebp + 5*4 + 2*4] |
+; KS = (pointer stored at Gctx + 16*16 + 3*16) + 44; round key i is read from [i*16 + KS], NR from [KS - 40]. |
+    mov KS, [16*16 + 3*16 + Gctx] |
+    lea KS, [44 + KS] |
+ |
+    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]     ; last dword of the counter block ... |
+    bswap aluCTR                                ; ... converted to native byte order |
+ |
+; Counter slot 0 = counter block xor round key 0. |
+    vmovdqu TMP0, XMMWORD PTR[0*16 + KS] |
+    vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
+    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0 |
+ |
+    cmp len, 16*7 |
+    jb LEncDataSingles                          ; fewer than 7 whole blocks: one block at a time |
+; Prepare the "top" counters |
+    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0 |
+ |
+    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] |
+; Encrypt the initial 7 blocks |
+    sub len, 16*7 |
+    vpaddd CTR1, CTR0, XMMWORD PTR[Lone] |
+    vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo] |
+    vpaddd CTR3, CTR2, XMMWORD PTR[Lone] |
+    vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo] |
+    vpaddd CTR5, CTR4, XMMWORD PTR[Lone] |
+    vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo] |
+ |
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask] |
+ |
+    vmovdqu xmm7, XMMWORD PTR[0*16 + KS] |
+    vpxor CTR0, CTR0, xmm7 |
+    vpxor CTR1, CTR1, xmm7 |
+    vpxor CTR2, CTR2, xmm7 |
+    vpxor CTR3, CTR3, xmm7 |
+    vpxor CTR4, CTR4, xmm7 |
+    vpxor CTR5, CTR5, xmm7 |
+    vpxor CTR6, CTR6, xmm7 |
+ |
+    ROUND 1 |
+; Counters for the next group of 7 are written to the stack slots between the AES rounds of this group. |
+    add aluCTR, 7 |
+    mov aluTMP, aluCTR |
+    bswap aluTMP |
+    xor aluTMP, [KS + 3*4] |
+    mov [8*16 + 0*16 + 3*4 + esp], aluTMP |
+ |
+    ROUND 2 |
+    NEXTCTR 1 |
+    ROUND 3 |
+    NEXTCTR 2 |
+    ROUND 4 |
+    NEXTCTR 3 |
+    ROUND 5 |
+    NEXTCTR 4 |
+    ROUND 6 |
+    NEXTCTR 5 |
+    ROUND 7 |
+    NEXTCTR 6 |
+    ROUND 8 |
+    ROUND 9 |
+    vmovdqu xmm7, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+ |
+    ROUND 10 |
+    ROUND 11 |
+    vmovdqu xmm7, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+ |
+    ROUND 12 |
+    ROUND 13 |
+    vmovdqu xmm7, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast CTR0, CTR0, xmm7 |
+    vaesenclast CTR1, CTR1, xmm7 |
+    vaesenclast CTR2, CTR2, xmm7 |
+    vaesenclast CTR3, CTR3, xmm7 |
+    vaesenclast CTR4, CTR4, xmm7 |
+    vaesenclast CTR5, CTR5, xmm7 |
+    vaesenclast CTR6, CTR6, xmm7 |
+ |
+    vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT] |
+    vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT] |
+    vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT] |
+    vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT] |
+    vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT] |
+    vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT] |
+    vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT] |
+ |
+    vmovdqu XMMWORD PTR[0*16 + CT], CTR0 |
+    vmovdqu XMMWORD PTR[1*16 + CT], CTR1 |
+    vmovdqu XMMWORD PTR[2*16 + CT], CTR2 |
+    vmovdqu XMMWORD PTR[3*16 + CT], CTR3 |
+    vmovdqu XMMWORD PTR[4*16 + CT], CTR4 |
+    vmovdqu XMMWORD PTR[5*16 + CT], CTR5 |
+    vmovdqu XMMWORD PTR[6*16 + CT], CTR6 |
+; Byte-reverse the 7 output blocks for hashing: the last one stays in TMP5, the others are parked at [esp + 1*16 .. 6*16]. |
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask] |
+    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask] |
+ |
+    vmovdqa XMMWORD PTR[1*16 + esp], CTR5 |
+    vmovdqa XMMWORD PTR[2*16 + esp], CTR4 |
+    vmovdqa XMMWORD PTR[3*16 + esp], CTR3 |
+    vmovdqa XMMWORD PTR[4*16 + esp], CTR2 |
+    vmovdqa XMMWORD PTR[5*16 + esp], CTR1 |
+    vmovdqa XMMWORD PTR[6*16 + esp], CTR0 |
+ |
+    lea CT, [7*16 + CT] |
+    lea PT, [7*16 + PT] |
+    jmp LEncData7 |
+ |
+LEncData7: |
+    cmp len, 16*7 |
+    jb LEndEnc7                                  ; fewer than 7 whole blocks left |
+    sub len, 16*7 |
+; Hash the 7 output blocks of the previous group: the newest (in TMP5) pairs with Htbl entry 0. |
+    vpshufd TMP4, TMP5, 78 |
+    vpxor TMP4, TMP4, TMP5 |
+    vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h |
+    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] |
+    vpclmulqdq TMP1, TMP5, TMP4, 011h |
+    vpclmulqdq TMP2, TMP5, TMP4, 000h |
+ |
+    vmovdqu TMP5, XMMWORD PTR[1*16 + esp] |
+    KARATSUBA 1 |
+    vmovdqu TMP5, XMMWORD PTR[2*16 + esp] |
+    KARATSUBA 2 |
+    vmovdqu TMP5, XMMWORD PTR[3*16 + esp] |
+    KARATSUBA 3 |
+    vmovdqu TMP5, XMMWORD PTR[4*16 + esp] |
+    KARATSUBA 4 |
+    vmovdqu TMP5, XMMWORD PTR[5*16 + esp] |
+    KARATSUBA 5 |
+    vmovdqu TMP5, XMMWORD PTR[6*16 + esp] |
+    vpxor TMP5, TMP5, T                          ; the oldest block absorbs the running hash |
+    KARATSUBA 6 |
+; Karatsuba fixup: TMP4 = high half, TMP5 = low half of the 256-bit sum. |
+    vpxor TMP0, TMP0, TMP1 |
+    vpxor TMP0, TMP0, TMP2 |
+    vpsrldq TMP3, TMP0, 8 |
+    vpxor TMP4, TMP1, TMP3 |
+    vpslldq TMP3, TMP0, 8 |
+    vpxor TMP5, TMP2, TMP3 |
+; Two folds of the low half with Lpoly, then combine with the high half. |
+    vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h |
+    vpalignr TMP5,TMP5,TMP5,8 |
+    vpxor TMP5, TMP5, TMP1 |
+ |
+    vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h |
+    vpalignr TMP5,TMP5,TMP5,8 |
+    vpxor TMP5, TMP5, TMP1 |
+ |
+    vpxor TMP5, TMP5, TMP4 |
+    vmovdqu T, TMP5                              ; store the running hash back into Gctx |
+; Load the 7 counter blocks prepared during the previous group (round key 0 already applied). |
+    vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp] |
+    vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp] |
+    vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp] |
+    vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp] |
+    vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp] |
+    vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp] |
+    vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp] |
+ |
+    ROUND 1 |
+    NEXTCTR 0 |
+    ROUND 2 |
+    NEXTCTR 1 |
+    ROUND 3 |
+    NEXTCTR 2 |
+    ROUND 4 |
+    NEXTCTR 3 |
+    ROUND 5 |
+    NEXTCTR 4 |
+    ROUND 6 |
+    NEXTCTR 5 |
+    ROUND 7 |
+    NEXTCTR 6 |
+ |
+    ROUND 8 |
+    ROUND 9 |
+ |
+    vmovdqu xmm7, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+ |
+    ROUND 10 |
+    ROUND 11 |
+    vmovdqu xmm7, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+ |
+    ROUND 12 |
+    ROUND 13 |
+    vmovdqu xmm7, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast CTR0, CTR0, xmm7 |
+    vaesenclast CTR1, CTR1, xmm7 |
+    vaesenclast CTR2, CTR2, xmm7 |
+    vaesenclast CTR3, CTR3, xmm7 |
+    vaesenclast CTR4, CTR4, xmm7 |
+    vaesenclast CTR5, CTR5, xmm7 |
+    vaesenclast CTR6, CTR6, xmm7 |
+ |
+    vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT] |
+    vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT] |
+    vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT] |
+    vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT] |
+    vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT] |
+    vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT] |
+    vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT] |
+ |
+    vmovdqu XMMWORD PTR[0*16 + CT], CTR0 |
+    vmovdqu XMMWORD PTR[1*16 + CT], CTR1 |
+    vmovdqu XMMWORD PTR[2*16 + CT], CTR2 |
+    vmovdqu XMMWORD PTR[3*16 + CT], CTR3 |
+    vmovdqu XMMWORD PTR[4*16 + CT], CTR4 |
+    vmovdqu XMMWORD PTR[5*16 + CT], CTR5 |
+    vmovdqu XMMWORD PTR[6*16 + CT], CTR6 |
+; Byte-reverse the new output blocks and park them for the next hashing pass (newest stays in TMP5). |
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask] |
+    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask] |
+    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask] |
+ |
+    vmovdqa XMMWORD PTR[1*16 + esp], CTR5 |
+    vmovdqa XMMWORD PTR[2*16 + esp], CTR4 |
+    vmovdqa XMMWORD PTR[3*16 + esp], CTR3 |
+    vmovdqa XMMWORD PTR[4*16 + esp], CTR2 |
+    vmovdqa XMMWORD PTR[5*16 + esp], CTR1 |
+    vmovdqa XMMWORD PTR[6*16 + esp], CTR0 |
+ |
+    lea CT, [7*16 + CT] |
+    lea PT, [7*16 + PT] |
+    jmp LEncData7 |
+ |
+LEndEnc7: |
+; Hash the last group of 7 output blocks (TMP5 plus the six parked at [esp + 1*16 .. 6*16]); no further encryption here. |
+    vpshufd TMP4, TMP5, 78 |
+    vpxor TMP4, TMP4, TMP5 |
+    vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h |
+    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] |
+    vpclmulqdq TMP1, TMP5, TMP4, 011h |
+    vpclmulqdq TMP2, TMP5, TMP4, 000h |
+ |
+    vmovdqu TMP5, XMMWORD PTR[1*16 + esp] |
+    KARATSUBA 1 |
+    vmovdqu TMP5, XMMWORD PTR[2*16 + esp] |
+    KARATSUBA 2 |
+    vmovdqu TMP5, XMMWORD PTR[3*16 + esp] |
+    KARATSUBA 3 |
+    vmovdqu TMP5, XMMWORD PTR[4*16 + esp] |
+    KARATSUBA 4 |
+    vmovdqu TMP5, XMMWORD PTR[5*16 + esp] |
+    KARATSUBA 5 |
+    vmovdqu TMP5, XMMWORD PTR[6*16 + esp] |
+    vpxor TMP5, TMP5, T |
+    KARATSUBA 6 |
+ |
+    vpxor TMP0, TMP0, TMP1 |
+    vpxor TMP0, TMP0, TMP2 |
+    vpsrldq TMP3, TMP0, 8 |
+    vpxor TMP4, TMP1, TMP3 |
+    vpslldq TMP3, TMP0, 8 |
+    vpxor TMP5, TMP2, TMP3 |
+ |
+    vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h |
+    vpalignr TMP5,TMP5,TMP5,8 |
+    vpxor TMP5, TMP5, TMP1 |
+ |
+    vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h |
+    vpalignr TMP5,TMP5,TMP5,8 |
+    vpxor TMP5, TMP5, TMP1 |
+ |
+    vpxor TMP5, TMP5, TMP4 |
+    vmovdqu T, TMP5 |
+; aluCTR ran ahead to the value written into slot 6; step it back so it matches the counter held in slot 0. |
+    sub aluCTR, 6 |
+ |
+LEncDataSingles: |
+; One whole block per pass: encrypt with the counter block in slot 0, then fold the output block into T. |
+    cmp len, 16 |
+    jb LEncDataTail |
+    sub len, 16 |
+ |
+    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp] |
+    NEXTCTR 0                                    ; slot 0 now holds the counter for the following block |
+ |
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast TMP1, TMP1, TMP2                 ; TMP2 holds the last round key for the key size in use |
+    vpxor TMP1, TMP1, XMMWORD PTR[PT] |
+    vmovdqu XMMWORD PTR[CT], TMP1 |
+ |
+    lea PT, [16+PT] |
+    lea CT, [16+CT] |
+; T = (T xor byte-reversed output block) times the first Htbl entry. |
+    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] |
+    vpxor TMP1, TMP1, T |
+ |
+    vmovdqu TMP0, XMMWORD PTR[Htbl] |
+    GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 |
+    vmovdqu T, TMP1 |
+ |
+    jmp LEncDataSingles |
+ |
+LEncDataTail: |
+; Final partial block (0 < len < 16): encrypt exactly len bytes, hash the zero-padded output block, then store the counter. |
+    cmp len, 0 |
+    je LEncDataEnd |
+ |
+    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp] |
+    inc aluCTR                                   ; this block consumes the counter held in slot 0 |
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast TMP1, TMP1, TMP2 |
+; zero a temp location |
+    vpxor TMP2, TMP2, TMP2 |
+    vmovdqa XMMWORD PTR[esp], TMP2 |
+; copy as many bytes as needed, one byte at a time; the counter is parked in aluTMP so that bl can carry the byte |
+    mov aluTMP, aluCTR |
+    xor KS, KS                                   ; the round keys are no longer needed: KS becomes the byte index |
+@@: |
+    cmp len, KS |
+    je @f |
+    mov bl, [PT + KS] |
+    mov [esp + KS], bl |
+    inc KS |
+    jmp @b |
+@@: |
+    vpxor TMP1, TMP1, XMMWORD PTR[esp] |
+    vmovdqa XMMWORD PTR[esp], TMP1 |
+    xor KS, KS |
+@@: |
+    cmp len, KS |
+    je @f |
+    mov bl, [esp + KS] |
+    mov [CT + KS], bl                            ; exactly len bytes are written to the output |
+    inc KS |
+    jmp @b |
+@@: |
+    cmp KS, 16                                   ; clear the unused keystream bytes so the hashed block is zero padded |
+    je @f |
+    mov BYTE PTR[esp + KS], 0 |
+    inc KS |
+    jmp @b |
+@@: |
+    mov aluCTR, aluTMP                           ; restore the counter |
+    vmovdqa TMP1, XMMWORD PTR[esp] |
+    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] |
+    vpxor TMP1, TMP1, T |
+ |
+    vmovdqu TMP0, XMMWORD PTR[Htbl] |
+    GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 |
+    vmovdqu T, TMP1 |
+ |
+LEncDataEnd: |
+; aluCTR is the next unused counter value on every path that reaches this point; store it back in the block byte order. |
+    bswap aluCTR |
+    mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR |
+ |
+    mov esp, ebp |
+    pop edi |
+    pop esi |
+    pop ebx |
+    pop ebp |
+ |
+    vzeroupper |
+ |
+    ret |
+intel_aes_gcmENC ENDP |
+ |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+; |
+; Decrypt and Authenticate |
+; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len); |
+; |
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
+ |
+ |
+NEXTCTR MACRO i ; decrypt-side redefinition: counter slot i is at [esp + i*16] (the earlier definition used [esp + 8*16 + i*16]) |
+    lea aluTMP, [aluCTR + 1] |
+    mov aluCTR, aluTMP |
+    bswap aluTMP |
+    xor aluTMP, DWORD PTR[KS + 3*4] |
+    mov DWORD PTR[esp + i*16 + 3*4], aluTMP |
+ENDM |
+ |
+intel_aes_gcmDEC PROC |
+; Return immediately when len (fourth argument, at [esp + 1*4 + 3*4] before any push) is zero. |
+    cmp DWORD PTR[1*4 + 3*4 + esp], 0 |
+    jne LbeginDEC |
+    ret |
+ |
+LbeginDEC: |
+; Prologue: save callee-saved registers, then carve a 16-byte aligned frame for the 7 counter slots at [esp + i*16]. |
+    vzeroupper |
+    push ebp |
+    push ebx |
+    push esi |
+    push edi |
+ |
+    mov ebp, esp |
+    sub esp, 8*16 |
+    and esp, -16 |
+; The first argument is the buffer that is read (CT), the second the buffer that is written (PT). |
+    mov CT, [ebp + 5*4 + 0*4] |
+    mov PT, [ebp + 5*4 + 1*4] |
+    mov Gctx, [ebp + 5*4 + 2*4] |
+; KS = (pointer stored at Gctx + 16*16 + 3*16) + 44; round key i is read from [i*16 + KS], NR from [KS - 40]. |
+    mov KS, [16*16 + 3*16 + Gctx] |
+    lea KS, [44 + KS] |
+ |
+    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]     ; last dword of the counter block ... |
+    bswap aluCTR                                ; ... converted to native byte order |
+ |
+; Counter slot 0 = counter block xor round key 0. |
+    vmovdqu TMP0, XMMWORD PTR[0*16 + KS] |
+    vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
+    vmovdqu XMMWORD PTR[0*16 + esp], TMP0 |
+ |
+    cmp len, 16*7 |
+    jb LDecDataSingles                          ; fewer than 7 whole blocks: one block at a time |
+    vmovdqu XMMWORD PTR[1*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[2*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[3*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[4*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[5*16 + esp], TMP0 |
+    vmovdqu XMMWORD PTR[6*16 + esp], TMP0 |
+    dec aluCTR                                  ; the first NEXTCTR 0 in the loop then reproduces the starting counter |
+ |
+LDecData7: |
+    cmp len, 16*7 |
+    jb LDecData7End |
+    sub len, 16*7 |
+; Hash the 7 input blocks first (block k pairs with Htbl entry 6-k) while NEXTCTR fills the 7 counter slots. |
+    vmovdqu TMP5, XMMWORD PTR[0*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    vpxor TMP5, TMP5, T                          ; the first block absorbs the running hash |
+    vpshufd TMP4, TMP5, 78 |
+    vpxor TMP4, TMP4, TMP5 |
+    vpclmulqdq TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h |
+    vmovdqu TMP4, XMMWORD PTR[6*16 + Htbl] |
+    vpclmulqdq TMP1, TMP5, TMP4, 011h |
+    vpclmulqdq TMP2, TMP5, TMP4, 000h |
+ |
+    NEXTCTR 0 |
+    vmovdqu TMP5, XMMWORD PTR[1*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    KARATSUBA 5 |
+    NEXTCTR 1 |
+    vmovdqu TMP5, XMMWORD PTR[2*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    KARATSUBA 4 |
+    NEXTCTR 2 |
+    vmovdqu TMP5, XMMWORD PTR[3*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    KARATSUBA 3 |
+    NEXTCTR 3 |
+    vmovdqu TMP5, XMMWORD PTR[4*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    KARATSUBA 2 |
+    NEXTCTR 4 |
+    vmovdqu TMP5, XMMWORD PTR[5*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    KARATSUBA 1 |
+    NEXTCTR 5 |
+    vmovdqu TMP5, XMMWORD PTR[6*16 + CT] |
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask] |
+    KARATSUBA 0 |
+    NEXTCTR 6 |
+; Karatsuba fixup: TMP4 = high half, TMP5 = low half of the 256-bit sum. |
+    vpxor TMP0, TMP0, TMP1 |
+    vpxor TMP0, TMP0, TMP2 |
+    vpsrldq TMP3, TMP0, 8 |
+    vpxor TMP4, TMP1, TMP3 |
+    vpslldq TMP3, TMP0, 8 |
+    vpxor TMP5, TMP2, TMP3 |
+; Two folds of the low half with Lpoly, then combine with the high half. |
+    vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h |
+    vpalignr TMP5,TMP5,TMP5,8 |
+    vpxor TMP5, TMP5, TMP1 |
+ |
+    vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h |
+    vpalignr TMP5,TMP5,TMP5,8 |
+    vpxor TMP5, TMP5, TMP1 |
+ |
+    vpxor TMP5, TMP5, TMP4 |
+    vmovdqu T, TMP5                              ; store the running hash back into Gctx |
+; Now generate the keystream for the same 7 blocks from the counter slots just written. |
+    vmovdqa CTR0, XMMWORD PTR[0*16 + esp] |
+    vmovdqa CTR1, XMMWORD PTR[1*16 + esp] |
+    vmovdqa CTR2, XMMWORD PTR[2*16 + esp] |
+    vmovdqa CTR3, XMMWORD PTR[3*16 + esp] |
+    vmovdqa CTR4, XMMWORD PTR[4*16 + esp] |
+    vmovdqa CTR5, XMMWORD PTR[5*16 + esp] |
+    vmovdqa CTR6, XMMWORD PTR[6*16 + esp] |
+ |
+    ROUND 1 |
+    ROUND 2 |
+    ROUND 3 |
+    ROUND 4 |
+    ROUND 5 |
+    ROUND 6 |
+    ROUND 7 |
+    ROUND 8 |
+    ROUND 9 |
+    vmovdqu xmm7, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+ |
+    ROUND 10 |
+    ROUND 11 |
+    vmovdqu xmm7, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+ |
+    ROUND 12 |
+    ROUND 13 |
+    vmovdqu xmm7, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast CTR0, CTR0, xmm7 |
+    vaesenclast CTR1, CTR1, xmm7 |
+    vaesenclast CTR2, CTR2, xmm7 |
+    vaesenclast CTR3, CTR3, xmm7 |
+    vaesenclast CTR4, CTR4, xmm7 |
+    vaesenclast CTR5, CTR5, xmm7 |
+    vaesenclast CTR6, CTR6, xmm7 |
+ |
+    vpxor CTR0, CTR0, XMMWORD PTR[0*16 + CT] |
+    vpxor CTR1, CTR1, XMMWORD PTR[1*16 + CT] |
+    vpxor CTR2, CTR2, XMMWORD PTR[2*16 + CT] |
+    vpxor CTR3, CTR3, XMMWORD PTR[3*16 + CT] |
+    vpxor CTR4, CTR4, XMMWORD PTR[4*16 + CT] |
+    vpxor CTR5, CTR5, XMMWORD PTR[5*16 + CT] |
+    vpxor CTR6, CTR6, XMMWORD PTR[6*16 + CT] |
+ |
+    vmovdqu XMMWORD PTR[0*16 + PT], CTR0 |
+    vmovdqu XMMWORD PTR[1*16 + PT], CTR1 |
+    vmovdqu XMMWORD PTR[2*16 + PT], CTR2 |
+    vmovdqu XMMWORD PTR[3*16 + PT], CTR3 |
+    vmovdqu XMMWORD PTR[4*16 + PT], CTR4 |
+    vmovdqu XMMWORD PTR[5*16 + PT], CTR5 |
+    vmovdqu XMMWORD PTR[6*16 + PT], CTR6 |
+ |
+    lea CT, [7*16 + CT] |
+    lea PT, [7*16 + PT] |
+    jmp LDecData7 |
+ |
+LDecData7End: |
+; Slot 0 still holds a counter already used by the last group of 7; advance it to the next unused value. |
+    NEXTCTR 0 |
+ |
+LDecDataSingles: |
+; One whole block per pass: fold the input block into T first, then decrypt it with the counter block in slot 0. |
+    cmp len, 16 |
+    jb LDecDataTail |
+    sub len, 16 |
+ |
+    vmovdqu TMP1, XMMWORD PTR[CT] |
+    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] |
+    vpxor TMP1, TMP1, T |
+ |
+    vmovdqu TMP0, XMMWORD PTR[Htbl] |
+    GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 |
+    vmovdqu T, TMP1 |
+ |
+    vmovdqa TMP1, XMMWORD PTR[0*16 + esp] |
+    NEXTCTR 0                                    ; slot 0 now holds the counter for the following block |
+ |
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast TMP1, TMP1, TMP2                 ; TMP2 holds the last round key for the key size in use |
+    vpxor TMP1, TMP1, XMMWORD PTR[CT] |
+    vmovdqu XMMWORD PTR[PT], TMP1 |
+ |
+    lea PT, [16+PT] |
+    lea CT, [16+CT] |
+    jmp LDecDataSingles |
+ |
+LDecDataTail: |
+; Final partial block (0 < len < 16): hash the zero-padded input block, then write exactly len output bytes. |
+; Input is read from CT and output is written to PT; no byte outside [0, len) of either buffer is touched. |
+    cmp len, 0 |
+    je LDecDataEnd |
+ |
+    vmovdqa TMP1, XMMWORD PTR[0*16 + esp] |
+    inc aluCTR                                   ; this block consumes the counter held in slot 0 |
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS] |
+    cmp NR, 10 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS] |
+    cmp NR, 12 |
+    je @f |
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] |
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] |
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS] |
+@@: |
+    vaesenclast xmm7, TMP1, TMP2                 ; xmm7 = keystream block, kept across the GFMUL below |
+ |
+; copy as many bytes as needed, one byte at a time; the counter is parked in aluTMP so that bl can carry the byte |
+    mov aluTMP, aluCTR |
+    xor KS, KS                                   ; the round keys are no longer needed: KS becomes the byte index |
+@@: |
+    cmp len, KS |
+    je @f |
+    mov bl, [CT + KS] |
+    mov [esp + KS], bl |
+    inc KS |
+    jmp @b |
+@@: |
+    cmp KS, 16                                   ; zero pad the scratch block up to 16 bytes |
+    je @f |
+    mov BYTE PTR[esp + KS], 0 |
+    inc KS |
+    jmp @b |
+@@: |
+ |
+    vmovdqa TMP1, XMMWORD PTR[esp] |
+    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask] |
+    vpxor TMP1, TMP1, T |
+ |
+    vmovdqu TMP0, XMMWORD PTR[Htbl] |
+    GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 |
+    vmovdqu T, TMP1 |
+ |
+; Decrypt in the scratch block, then copy only the first len bytes to the output. |
+    vpxor xmm7, xmm7, XMMWORD PTR[esp] |
+    vmovdqa XMMWORD PTR[esp], xmm7 |
+    xor KS, KS |
+@@: |
+    cmp len, KS |
+    je @f |
+    mov bl, [esp + KS] |
+    mov [PT + KS], bl |
+    inc KS |
+    jmp @b |
+@@: |
+; The output buffer is not padded: bytes at and beyond PT + len belong to the caller and are left untouched. |
+    mov aluCTR, aluTMP                           ; restore the counter |
+ |
+LDecDataEnd: |
+; Store the next unused counter value back into the counter block, in the block byte order. |
+    bswap aluCTR |
+    mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR |
+ |
+    mov esp, ebp |
+    pop edi |
+    pop esi |
+    pop ebx |
+    pop ebp |
+ |
+    vzeroupper |
+ |
+    ret |
+intel_aes_gcmDEC ENDP |
+ |
+ |
+END |