| Index: nss/lib/freebl/intel-gcm-x64-masm.asm
|
| diff --git a/nss/lib/freebl/intel-gcm-x64-masm.asm b/nss/lib/freebl/intel-gcm-x64-masm.asm
|
| deleted file mode 100644
|
| index 8b68b76e58083f1c4dee50217e6d030a6b090cd3..0000000000000000000000000000000000000000
|
| --- a/nss/lib/freebl/intel-gcm-x64-masm.asm
|
| +++ /dev/null
|
| @@ -1,1295 +0,0 @@
|
| -; LICENSE:
|
| -; This submission to NSS is to be made available under the terms of the
|
| -; Mozilla Public License, v. 2.0. You can obtain one at
|
| -; http://mozilla.org/MPL/2.0/.
|
| -;###############################################################################
|
| -; Copyright(c) 2014, Intel Corp.
|
| -; Developers and authors:
|
| -; Shay Gueron and Vlad Krasnov
|
| -; Intel Corporation, Israel Development Centre, Haifa, Israel
|
| -; Please send feedback directly to crypto.feedback.alias@intel.com
|
| -
|
| -
|
| -.DATA
|
| -ALIGN 16
|
| -Lone dq 1,0
|
| -Ltwo dq 2,0
|
| -Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
| -Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
|
| -Lpoly dq 01h, 0c200000000000000h
|
| -
|
| -.CODE
|
| -
|
| -
|
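| -; GFMUL: DST = SRC1 * SRC2 in GF(2^128) modulo g(x) = x^128 + x^7 + x^2 + x + 1,
|
| -; on bit-reflected operands. Karatsuba needs only three vpclmulqdq per
|
| -; multiply, since over GF(2)
|
| -;   a1*b0 xor a0*b1 = (a1 xor a0)*(b1 xor b0) xor a1*b1 xor a0*b0
|
| -; and the two trailing multiplies by the 0c200000000000000h qword of
|
| -; Lpoly fold the 256-bit product back down to 128 bits.
|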
| -GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
|
| - vpclmulqdq TMP1, SRC2, SRC1, 0h
|
| - vpclmulqdq TMP4, SRC2, SRC1, 011h
|
| -
|
| - vpshufd TMP2, SRC2, 78
|
| - vpshufd TMP3, SRC1, 78
|
| - vpxor TMP2, TMP2, SRC2
|
| - vpxor TMP3, TMP3, SRC1
|
| -
|
| - vpclmulqdq TMP2, TMP2, TMP3, 0h
|
| - vpxor TMP2, TMP2, TMP1
|
| - vpxor TMP2, TMP2, TMP4
|
| -
|
| - vpslldq TMP3, TMP2, 8
|
| - vpsrldq TMP2, TMP2, 8
|
| -
|
| - vpxor TMP1, TMP1, TMP3
|
| - vpxor TMP4, TMP4, TMP2
|
| -
|
| - vpclmulqdq TMP2, TMP1, [Lpoly], 010h
|
| - vpshufd TMP3, TMP1, 78
|
| - vpxor TMP1, TMP2, TMP3
|
| -
|
| - vpclmulqdq TMP2, TMP1, [Lpoly], 010h
|
| - vpshufd TMP3, TMP1, 78
|
| - vpxor TMP1, TMP2, TMP3
|
| -
|
| - vpxor DST, TMP1, TMP4
|
| -
|
| - ENDM
|
| -
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
| -;
|
| -; Generates the final GCM tag
|
| -; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
|
| -; unsigned char *Tp,
|
| -; unsigned int Mlen,
|
| -; unsigned int Alen,
|
| -; unsigned char *X0,
|
| -; unsigned char *TAG);
|
| -;
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
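| -; Tag math, with Mlen/Alen given in bytes: the length block packs
|
| -; Mlen*8 into its low qword and Alen*8 into its high qword, then
|
| -;   T   = (Tp xor lenblock) * H    ; one final GHASH multiply
|
| -;   TAG = byteswap(T) xor [X0]
|
| -; (in GCM, X0 is the encryption of the initial counter block).
|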
| -
|
| -ALIGN 16
|
| -intel_aes_gcmTAG PROC
|
| -
|
| -Htbl textequ <rcx>
|
| -Tp textequ <rdx>
|
| -Mlen textequ <r8>
|
| -Alen textequ <r9>
|
| -X0 textequ <r10>
|
| -TAG textequ <r11>
|
| -
|
| -T textequ <xmm0>
|
| -TMP0 textequ <xmm1>
|
| -
|
| - mov X0, [rsp + 1*8 + 4*8]
|
| - mov TAG, [rsp + 1*8 + 5*8]
|
| -
|
| - vzeroupper
|
| - vmovdqu T, XMMWORD PTR[Tp]
|
| - vpxor TMP0, TMP0, TMP0
|
| -
|
| - shl Mlen, 3
|
| - shl Alen, 3
|
| -
|
| - ;vpinsrq TMP0, TMP0, Mlen, 0
|
| - ;vpinsrq TMP0, TMP0, Alen, 1
|
| -    ; work around the ml64.exe vpinsrq issue: emulate each 64-bit insert with two vpinsrd
|
| - vpinsrd TMP0, TMP0, r8d, 0
|
| - vpinsrd TMP0, TMP0, r9d, 2
|
| - shr Mlen, 32
|
| - shr Alen, 32
|
| - vpinsrd TMP0, TMP0, r8d, 1
|
| - vpinsrd TMP0, TMP0, r9d, 3
|
| -
|
| - vpxor T, T, TMP0
|
| - vmovdqu TMP0, XMMWORD PTR[Htbl]
|
| - GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
|
| -
|
| - vpshufb T, T, [Lbswap_mask]
|
| - vpxor T, T, [X0]
|
| - vmovdqu XMMWORD PTR[TAG], T
|
| - vzeroupper
|
| -
|
| - ret
|
| -
|
| -intel_aes_gcmTAG ENDP
|
| -
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
| -;
|
| -; Generates the H table
|
| -; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
|
| -;
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
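| -; H = AES-ENC_K(0^128) is computed with the key schedule KS (NR rounds),
|
| -; doubled to H' = H*x, and the table is laid out as:
|
| -;   Htbl[i*16]        = H'^(i+1)                              i = 0..7
|
| -;   Htbl[8*16 + i*16] = hi64(H'^(i+1)) xor lo64(H'^(i+1))
|
| -; The upper half caches each power's Karatsuba middle-term operand, so
|
| -; the bulk loops spend only three vpclmulqdq per block.
|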
| -
|
| -ALIGN 16
|
| -intel_aes_gcmINIT PROC
|
| -
|
| -Htbl textequ <rcx>
|
| -KS textequ <rdx>
|
| -NR textequ <r8d>
|
| -
|
| -T textequ <xmm0>
|
| -TMP0 textequ <xmm1>
|
| -
|
| - vzeroupper
|
| - ; AES-ENC(0)
|
| - vmovdqu T, XMMWORD PTR[KS]
|
| - lea KS, [16 + KS]
|
| - dec NR
|
| -Lenc_loop:
|
| - vaesenc T, T, [KS]
|
| - lea KS, [16 + KS]
|
| - dec NR
|
| - jnz Lenc_loop
|
| -
|
| - vaesenclast T, T, [KS]
|
| - vpshufb T, T, [Lbswap_mask]
|
| -
|
| -    ;Calculate H' = GFMUL(H, 2), i.e. H*x in GF(2^128) with conditional reduction
|
| - vpsrad xmm3, T, 31
|
| - vpshufd xmm3, xmm3, 0ffh
|
| - vpand xmm5, xmm3, [Lpoly]
|
| - vpsrld xmm3, T, 31
|
| - vpslld xmm4, T, 1
|
| - vpslldq xmm3, xmm3, 4
|
| - vpxor T, xmm4, xmm3
|
| - vpxor T, T, xmm5
|
| -
|
| - vmovdqu TMP0, T
|
| - vmovdqu XMMWORD PTR[Htbl + 0*16], T
|
| -
|
| - vpshufd xmm2, T, 78
|
| - vpxor xmm2, xmm2, T
|
| - vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
|
| -
|
| - i = 1
|
| - WHILE i LT 8
|
| - GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
|
| - vmovdqu XMMWORD PTR[Htbl + i*16], T
|
| - vpshufd xmm2, T, 78
|
| - vpxor xmm2, xmm2, T
|
| - vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
|
| - i = i+1
|
| - ENDM
|
| - vzeroupper
|
| - ret
|
| -intel_aes_gcmINIT ENDP
|
| -
|
| -
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
| -;
|
| -; Authenticate only
|
| -; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
|
| -;
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
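| -; GHASH over the AAD only. Eight blocks are aggregated per iteration:
|
| -;   T <- (T xor B1)*H^8 xor B2*H^7 xor ... xor B8*H
|
| -; using the precomputed powers in Htbl, with one deferred reduction per
|
| -; batch; the two folding steps are interleaved with the next multiplies.
|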
| -
|
| -ALIGN 16
|
| -intel_aes_gcmAAD PROC
|
| -
|
| -Htbl textequ <rcx>
|
| -inp textequ <rdx>
|
| -len textequ <r8>
|
| -Tp textequ <r9>
|
| -hlp0 textequ <r10>
|
| -
|
| -DATA textequ <xmm0>
|
| -T textequ <xmm1>
|
| -TMP0 textequ <xmm2>
|
| -TMP1 textequ <xmm3>
|
| -TMP2 textequ <xmm4>
|
| -TMP3 textequ <xmm5>
|
| -TMP4 textequ <xmm6>
|
| -Xhi textequ <xmm7>
|
| -
|
| -KARATSUBA_AAD MACRO i
|
| - vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h
|
| - vpxor TMP0, TMP0, TMP3
|
| - vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h
|
| - vpxor TMP1, TMP1, TMP3
|
| - vpshufd TMP3, DATA, 78
|
| - vpxor TMP3, TMP3, DATA
|
| - vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
|
| - vpxor TMP2, TMP2, TMP3
|
| -ENDM
|
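| -; TMP0/TMP1/TMP2 accumulate the low/high/middle Karatsuba partials over
|
| -; a batch; the fix-up at Lred1 and in the mod-loop epilogue recombines
|
| -; them into T (low 128 bits) and Xhi (high 128 bits):
|
| -;   mid <- mid xor lo xor hi, then split mid across the two halves.
|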
| -
|
| - test len, len
|
| - jnz LbeginAAD
|
| - ret
|
| -
|
| -LbeginAAD:
|
| - vzeroupper
|
| -
|
| - sub rsp, 2*16
|
| - vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
|
| - vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
|
| -
|
| - vpxor Xhi, Xhi, Xhi
|
| -
|
| - vmovdqu T, XMMWORD PTR[Tp]
|
| -    ;we hash 8 blocks each iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first
|
| - mov hlp0, len
|
| - and hlp0, 128-1
|
| - jz Lmod_loop
|
| -
|
| - and len, -128
|
| - sub hlp0, 16
|
| -
|
| - ; Prefix block
|
| - vmovdqu DATA, XMMWORD PTR[inp]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| - vpxor DATA, DATA, T
|
| -
|
| - vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h
|
| - vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h
|
| - vpshufd TMP3, DATA, 78
|
| - vpxor TMP3, TMP3, DATA
|
| - vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
|
| -
|
| - lea inp, [inp+16]
|
| - test hlp0, hlp0
|
| - jnz Lpre_loop
|
| - jmp Lred1
|
| -
|
| -    ;hash the remaining prefix blocks (up to 7 total prefix blocks)
|
| -Lpre_loop:
|
| -
|
| - sub hlp0, 16
|
| -
|
| - vmovdqu DATA, XMMWORD PTR[inp]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| -
|
| - vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h
|
| - vpxor TMP0, TMP0, TMP3
|
| - vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h
|
| - vpxor TMP1, TMP1, TMP3
|
| - vpshufd TMP3, DATA, 78
|
| - vpxor TMP3, TMP3, DATA
|
| - vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
|
| - vpxor TMP2, TMP2, TMP3
|
| -
|
| - test hlp0, hlp0
|
| - lea inp, [inp+16]
|
| - jnz Lpre_loop
|
| -
|
| -Lred1:
|
| -
|
| - vpxor TMP2, TMP2, TMP0
|
| - vpxor TMP2, TMP2, TMP1
|
| - vpsrldq TMP3, TMP2, 8
|
| - vpslldq TMP2, TMP2, 8
|
| -
|
| - vpxor Xhi, TMP1, TMP3
|
| - vpxor T, TMP0, TMP2
|
| -
|
| -
|
| -Lmod_loop:
|
| -
|
| - sub len, 16*8
|
| - jb Ldone
|
| - ; Block #0
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*7]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| -
|
| - vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h
|
| - vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h
|
| - vpshufd TMP3, DATA, 78
|
| - vpxor TMP3, TMP3, DATA
|
| - vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
|
| -
|
| - ; Block #1
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*6]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| - KARATSUBA_AAD 1
|
| -
|
| - ; Block #2
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*5]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| -
|
| - vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
|
| - vpalignr T, T, T, 8
|
| -
|
| - KARATSUBA_AAD 2
|
| -
|
| - vpxor T, T, TMP4 ;reduction stage 1b
|
| -
|
| - ; Block #3
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*4]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| - KARATSUBA_AAD 3
|
| - ; Block #4
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*3]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| -
|
| - vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
|
| - vpalignr T, T, T, 8
|
| -
|
| - KARATSUBA_AAD 4
|
| -
|
| - vpxor T, T, TMP4 ;reduction stage 2b
|
| - ; Block #5
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*2]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| - KARATSUBA_AAD 5
|
| -
|
| - vpxor T, T, Xhi ;reduction finalize
|
| - ; Block #6
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*1]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| - KARATSUBA_AAD 6
|
| - ; Block #7
|
| - vmovdqu DATA, XMMWORD PTR[inp + 16*0]
|
| - vpshufb DATA, DATA, [Lbswap_mask]
|
| - vpxor DATA, DATA, T
|
| - KARATSUBA_AAD 7
|
| -    ; Aggregated 8 blocks, now the Karatsuba fix-up
|
| - vpxor TMP2, TMP2, TMP0
|
| - vpxor TMP2, TMP2, TMP1
|
| - vpsrldq TMP3, TMP2, 8
|
| - vpslldq TMP2, TMP2, 8
|
| -
|
| - vpxor Xhi, TMP1, TMP3
|
| - vpxor T, TMP0, TMP2
|
| -
|
| - lea inp, [inp + 16*8]
|
| - jmp Lmod_loop
|
| -
|
| -Ldone:
|
| - vpclmulqdq TMP4, T, [Lpoly], 010h
|
| - vpalignr T, T, T, 8
|
| - vpxor T, T, TMP4
|
| -
|
| - vpclmulqdq TMP4, T, [Lpoly], 010h
|
| - vpalignr T, T, T, 8
|
| - vpxor T, T, TMP4
|
| -
|
| - vpxor T, T, Xhi
|
| - vmovdqu XMMWORD PTR[Tp], T
|
| - vzeroupper
|
| -
|
| - vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
|
| - vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
|
| - add rsp, 16*2
|
| -
|
| - ret
|
| -
|
| -intel_aes_gcmAAD ENDP
|
| -
|
| -
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
| -;
|
| -; Encrypt and Authenticate
|
| -; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
|
| -;
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
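| -; CTR encryption fused with GHASH: each iteration AES-encrypts eight
|
| -; counters while hashing the eight ciphertext blocks produced by the
|
| -; previous iteration, hiding vpclmulqdq latency inside the AES rounds.
|
| -; Counter blocks are kept on the stack already XORed with round key 0,
|
| -; so NEXTCTR can advance them with cheap ALU ops on the top dword.
|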
| -
|
| -ALIGN 16
|
| -intel_aes_gcmENC PROC
|
| -
|
| -PT textequ <rcx>
|
| -CT textequ <rdx>
|
| -Htbl textequ <r8>
|
| -Gctx textequ <r8>
|
| -len textequ <r9>
|
| -KS textequ <r10>
|
| -NR textequ <eax>
|
| -
|
| -aluCTR textequ <r11d>
|
| -aluKSl textequ <r12d>
|
| -aluTMP textequ <r13d>
|
| -
|
| -T textequ <xmm0>
|
| -TMP0 textequ <xmm1>
|
| -TMP1 textequ <xmm2>
|
| -TMP2 textequ <xmm3>
|
| -TMP3 textequ <xmm4>
|
| -TMP4 textequ <xmm5>
|
| -TMP5 textequ <xmm6>
|
| -CTR0 textequ <xmm7>
|
| -CTR1 textequ <xmm8>
|
| -CTR2 textequ <xmm9>
|
| -CTR3 textequ <xmm10>
|
| -CTR4 textequ <xmm11>
|
| -CTR5 textequ <xmm12>
|
| -CTR6 textequ <xmm13>
|
| -CTR7 textequ <xmm14>
|
| -BSWAPMASK textequ <xmm15>
|
| -
|
| -ROUND MACRO i
|
| - vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
|
| - vaesenc CTR0, CTR0, TMP3
|
| - vaesenc CTR1, CTR1, TMP3
|
| - vaesenc CTR2, CTR2, TMP3
|
| - vaesenc CTR3, CTR3, TMP3
|
| - vaesenc CTR4, CTR4, TMP3
|
| - vaesenc CTR5, CTR5, TMP3
|
| - vaesenc CTR6, CTR6, TMP3
|
| - vaesenc CTR7, CTR7, TMP3
|
| -ENDM
|
| -ROUNDMUL MACRO i
|
| - vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
|
| -
|
| - vaesenc CTR0, CTR0, TMP3
|
| - vaesenc CTR1, CTR1, TMP3
|
| - vaesenc CTR2, CTR2, TMP3
|
| - vaesenc CTR3, CTR3, TMP3
|
| -
|
| - vpshufd TMP4, TMP5, 78
|
| - vpxor TMP4, TMP4, TMP5
|
| -
|
| - vaesenc CTR4, CTR4, TMP3
|
| - vaesenc CTR5, CTR5, TMP3
|
| - vaesenc CTR6, CTR6, TMP3
|
| - vaesenc CTR7, CTR7, TMP3
|
| -
|
| - vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
|
| - vpxor TMP0, TMP0, TMP3
|
| - vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
|
| - vpclmulqdq TMP3, TMP5, TMP4, 011h
|
| - vpxor TMP1, TMP1, TMP3
|
| - vpclmulqdq TMP3, TMP5, TMP4, 000h
|
| - vpxor TMP2, TMP2, TMP3
|
| -ENDM
|
| -KARATSUBA MACRO i
|
| - vpshufd TMP4, TMP5, 78
|
| - vpxor TMP4, TMP4, TMP5
|
| - vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
|
| - vpxor TMP0, TMP0, TMP3
|
| - vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
|
| - vpclmulqdq TMP3, TMP5, TMP4, 011h
|
| - vpxor TMP1, TMP1, TMP3
|
| - vpclmulqdq TMP3, TMP5, TMP4, 000h
|
| - vpxor TMP2, TMP2, TMP3
|
| -ENDM
|
| -NEXTCTR MACRO i
|
| - add aluCTR, 1
|
| - mov aluTMP, aluCTR
|
| - xor aluTMP, aluKSl
|
| - bswap aluTMP
|
| - mov [3*4 + 8*16 + i*16 + rsp], aluTMP
|
| -ENDM
|
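| -; ROUND runs AES round i on all eight counters; ROUNDMUL additionally
|
| -; folds in Karatsuba step i for the reflected ciphertext block in TMP5
|
| -; (here TMP0/TMP1/TMP2 hold the middle/high/low partials); KARATSUBA is
|
| -; the multiply-only variant used once the last batch has been encrypted.
|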
| -
|
| -
|
| - test len, len
|
| - jnz LbeginENC
|
| - ret
|
| -
|
| -LbeginENC:
|
| -
|
| - vzeroupper
|
| - push r11
|
| - push r12
|
| - push r13
|
| - push rbp
|
| - sub rsp, 10*16
|
| - vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
|
| - vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
|
| - vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
|
| - vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
|
| - vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
|
| - vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
|
| - vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
|
| - vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
|
| - vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
|
| - vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
|
| -
|
| - mov rbp, rsp
|
| - sub rsp, 16*16
|
| - and rsp, -16
|
| -
|
| - vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
|
| - vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
|
| - vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
|
| - mov KS, [16*16 + 3*16 + Gctx]
|
| - mov NR, [4 + KS]
|
| - lea KS, [48 + KS]
|
| -
|
| - vpshufb CTR0, CTR0, BSWAPMASK
|
| -
|
| - mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
|
| - mov aluKSl, [3*4 + KS]
|
| - bswap aluCTR
|
| - bswap aluKSl
|
| -
|
| - vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
|
| - vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
|
| - vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
|
| -
|
| - cmp len, 128
|
| - jb LEncDataSingles
|
| -; Prepare the "top" counters
|
| - vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
|
| -
|
| -; Encrypt the initial 8 blocks
|
| - sub len, 128
|
| - vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
|
| - vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
|
| - vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
|
| - vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
|
| - vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
|
| - vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
|
| - vpaddd CTR7, CTR6, XMMWORD PTR[Lone]
|
| -
|
| - vpshufb CTR0, CTR0, BSWAPMASK
|
| - vpshufb CTR1, CTR1, BSWAPMASK
|
| - vpshufb CTR2, CTR2, BSWAPMASK
|
| - vpshufb CTR3, CTR3, BSWAPMASK
|
| - vpshufb CTR4, CTR4, BSWAPMASK
|
| - vpshufb CTR5, CTR5, BSWAPMASK
|
| - vpshufb CTR6, CTR6, BSWAPMASK
|
| - vpshufb CTR7, CTR7, BSWAPMASK
|
| -
|
| - vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
|
| - vpxor CTR0, CTR0, TMP3
|
| - vpxor CTR1, CTR1, TMP3
|
| - vpxor CTR2, CTR2, TMP3
|
| - vpxor CTR3, CTR3, TMP3
|
| - vpxor CTR4, CTR4, TMP3
|
| - vpxor CTR5, CTR5, TMP3
|
| - vpxor CTR6, CTR6, TMP3
|
| - vpxor CTR7, CTR7, TMP3
|
| -
|
| - ROUND 1
|
| -
|
| - add aluCTR, 8
|
| - mov aluTMP, aluCTR
|
| - xor aluTMP, aluKSl
|
| - bswap aluTMP
|
| - mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
|
| -
|
| - ROUND 2
|
| - NEXTCTR 1
|
| - ROUND 3
|
| - NEXTCTR 2
|
| - ROUND 4
|
| - NEXTCTR 3
|
| - ROUND 5
|
| - NEXTCTR 4
|
| - ROUND 6
|
| - NEXTCTR 5
|
| - ROUND 7
|
| - NEXTCTR 6
|
| - ROUND 8
|
| - NEXTCTR 7
|
| - ROUND 9
|
| - vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| -
|
| - ROUND 10
|
| - ROUND 11
|
| - vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| -
|
| - ROUND 12
|
| - ROUND 13
|
| - vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
|
| - vaesenclast CTR0, CTR0, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
|
| - vaesenclast CTR1, CTR1, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
|
| - vaesenclast CTR2, CTR2, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
|
| - vaesenclast CTR3, CTR3, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
|
| - vaesenclast CTR4, CTR4, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
|
| - vaesenclast CTR5, CTR5, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
|
| - vaesenclast CTR6, CTR6, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
|
| - vaesenclast CTR7, CTR7, TMP3
|
| -
|
| - vmovdqu XMMWORD PTR[0*16 + CT], CTR0
|
| - vpshufb CTR0, CTR0, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[1*16 + CT], CTR1
|
| - vpshufb CTR1, CTR1, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[2*16 + CT], CTR2
|
| - vpshufb CTR2, CTR2, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[3*16 + CT], CTR3
|
| - vpshufb CTR3, CTR3, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[4*16 + CT], CTR4
|
| - vpshufb CTR4, CTR4, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[5*16 + CT], CTR5
|
| - vpshufb CTR5, CTR5, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[6*16 + CT], CTR6
|
| - vpshufb CTR6, CTR6, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[7*16 + CT], CTR7
|
| - vpshufb TMP5, CTR7, BSWAPMASK
|
| -
|
| - vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
|
| - vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
|
| - vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
|
| - vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
|
| - vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
|
| - vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
|
| - vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
|
| -
|
| - lea CT, [8*16 + CT]
|
| - lea PT, [8*16 + PT]
|
| - jmp LEncDataOctets
|
| -
|
| -LEncDataOctets:
|
| - cmp len, 128
|
| - jb LEndEncOctets
|
| - sub len, 128
|
| -
|
| - vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
|
| - vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
|
| - vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
|
| - vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
|
| - vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
|
| - vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
|
| - vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
|
| - vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
|
| -
|
| - vpshufd TMP4, TMP5, 78
|
| - vpxor TMP4, TMP4, TMP5
|
| - vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
|
| - vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
|
| - vpclmulqdq TMP1, TMP5, TMP4, 011h
|
| - vpclmulqdq TMP2, TMP5, TMP4, 000h
|
| -
|
| - vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
|
| - ROUNDMUL 1
|
| - NEXTCTR 0
|
| - vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
|
| - ROUNDMUL 2
|
| - NEXTCTR 1
|
| - vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
|
| - ROUNDMUL 3
|
| - NEXTCTR 2
|
| - vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
|
| - ROUNDMUL 4
|
| - NEXTCTR 3
|
| - vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
|
| - ROUNDMUL 5
|
| - NEXTCTR 4
|
| - vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
|
| - ROUNDMUL 6
|
| - NEXTCTR 5
|
| - vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
|
| - ROUNDMUL 7
|
| - NEXTCTR 6
|
| -
|
| - ROUND 8
|
| - NEXTCTR 7
|
| -
|
| - vpxor TMP0, TMP0, TMP1
|
| - vpxor TMP0, TMP0, TMP2
|
| - vpsrldq TMP3, TMP0, 8
|
| - vpxor TMP4, TMP1, TMP3
|
| - vpslldq TMP3, TMP0, 8
|
| - vpxor T, TMP2, TMP3
|
| -
|
| - vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
|
| - vpalignr T,T,T,8
|
| - vpxor T, T, TMP1
|
| -
|
| - ROUND 9
|
| -
|
| - vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
|
| - vpalignr T,T,T,8
|
| - vpxor T, T, TMP1
|
| -
|
| - vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| -
|
| - ROUND 10
|
| - ROUND 11
|
| - vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| -
|
| - ROUND 12
|
| - ROUND 13
|
| - vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
|
| - vaesenclast CTR0, CTR0, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
|
| - vaesenclast CTR1, CTR1, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
|
| - vaesenclast CTR2, CTR2, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
|
| - vaesenclast CTR3, CTR3, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
|
| - vaesenclast CTR4, CTR4, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
|
| - vaesenclast CTR5, CTR5, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
|
| - vaesenclast CTR6, CTR6, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
|
| - vaesenclast CTR7, CTR7, TMP3
|
| -
|
| - vmovdqu XMMWORD PTR[0*16 + CT], CTR0
|
| - vpshufb CTR0, CTR0, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[1*16 + CT], CTR1
|
| - vpshufb CTR1, CTR1, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[2*16 + CT], CTR2
|
| - vpshufb CTR2, CTR2, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[3*16 + CT], CTR3
|
| - vpshufb CTR3, CTR3, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[4*16 + CT], CTR4
|
| - vpshufb CTR4, CTR4, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[5*16 + CT], CTR5
|
| - vpshufb CTR5, CTR5, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[6*16 + CT], CTR6
|
| - vpshufb CTR6, CTR6, BSWAPMASK
|
| - vmovdqu XMMWORD PTR[7*16 + CT], CTR7
|
| - vpshufb TMP5, CTR7, BSWAPMASK
|
| -
|
| - vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
|
| - vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
|
| - vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
|
| - vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
|
| - vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
|
| - vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
|
| - vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
|
| -
|
| - vpxor T, T, TMP4
|
| -
|
| - lea CT, [8*16 + CT]
|
| - lea PT, [8*16 + PT]
|
| - jmp LEncDataOctets
|
| -
|
| -LEndEncOctets:
|
| -
|
| - vpshufd TMP4, TMP5, 78
|
| - vpxor TMP4, TMP4, TMP5
|
| - vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
|
| - vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
|
| - vpclmulqdq TMP1, TMP5, TMP4, 011h
|
| - vpclmulqdq TMP2, TMP5, TMP4, 000h
|
| -
|
| - vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
|
| - KARATSUBA 1
|
| - vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
|
| - KARATSUBA 2
|
| - vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
|
| - KARATSUBA 3
|
| - vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
|
| - KARATSUBA 4
|
| - vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
|
| - KARATSUBA 5
|
| - vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
|
| - KARATSUBA 6
|
| - vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
|
| - KARATSUBA 7
|
| -
|
| - vpxor TMP0, TMP0, TMP1
|
| - vpxor TMP0, TMP0, TMP2
|
| - vpsrldq TMP3, TMP0, 8
|
| - vpxor TMP4, TMP1, TMP3
|
| - vpslldq TMP3, TMP0, 8
|
| - vpxor T, TMP2, TMP3
|
| -
|
| - vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
|
| - vpalignr T,T,T,8
|
| - vpxor T, T, TMP1
|
| -
|
| - vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
|
| - vpalignr T,T,T,8
|
| - vpxor T, T, TMP1
|
| -
|
| - vpxor T, T, TMP4
|
| -
|
| - sub aluCTR, 7
|
| -
|
| -LEncDataSingles:
|
| -
|
| - cmp len, 16
|
| - jb LEncDataTail
|
| - sub len, 16
|
| -
|
| - vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
|
| - NEXTCTR 0
|
| -
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vaesenclast TMP1, TMP1, TMP2
|
| - vpxor TMP1, TMP1, XMMWORD PTR[PT]
|
| - vmovdqu XMMWORD PTR[CT], TMP1
|
| -
|
| - lea PT, [16+PT]
|
| - lea CT, [16+CT]
|
| -
|
| - vpshufb TMP1, TMP1, BSWAPMASK
|
| - vpxor T, T, TMP1
|
| - vmovdqu TMP0, XMMWORD PTR[Htbl]
|
| - GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
|
| -
|
| - jmp LEncDataSingles
|
| -
|
| -LEncDataTail:
|
| -
|
| - test len, len
|
| - jz LEncDataEnd
|
| -
|
| - vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
|
| -
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vaesenclast TMP1, TMP1, TMP2
|
| -; zero a temp location
|
| - vpxor TMP2, TMP2, TMP2
|
| - vmovdqa XMMWORD PTR[rsp], TMP2
|
| -; copy as many bytes as needed
|
| - xor KS, KS
|
| -
|
| -@@:
|
| - cmp len, KS
|
| - je @f
|
| - mov al, [PT + KS]
|
| - mov [rsp + KS], al
|
| - inc KS
|
| - jmp @b
|
| -@@:
|
| - vpxor TMP1, TMP1, XMMWORD PTR[rsp]
|
| - vmovdqa XMMWORD PTR[rsp], TMP1
|
| - xor KS, KS
|
| -@@:
|
| - cmp len, KS
|
| - je @f
|
| - mov al, [rsp + KS]
|
| - mov [CT + KS], al
|
| - inc KS
|
| - jmp @b
|
| -@@:
|
| - cmp KS, 16
|
| - je @f
|
| - mov BYTE PTR[rsp + KS], 0
|
| - inc KS
|
| - jmp @b
|
| -@@:
|
| -BAIL:
|
| - vmovdqa TMP1, XMMWORD PTR[rsp]
|
| - vpshufb TMP1, TMP1, BSWAPMASK
|
| - vpxor T, T, TMP1
|
| - vmovdqu TMP0, XMMWORD PTR[Htbl]
|
| - GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
|
| -
|
| -LEncDataEnd:
|
| -
|
| - vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
|
| - bswap aluCTR
|
| - mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
|
| -
|
| - mov rsp, rbp
|
| -
|
| - vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
|
| - vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
|
| - vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
|
| - vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
|
| - vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
|
| - vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
|
| - vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
|
| - vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
|
| - vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
|
| - vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
|
| -
|
| - add rsp, 10*16
|
| - pop rbp
|
| - pop r13
|
| - pop r12
|
| - pop r11
|
| -
|
| - vzeroupper
|
| -
|
| - ret
|
| -intel_aes_gcmENC ENDP
|
| -
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
| -;
|
| -; Decrypt and Authenticate
|
| -; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
|
| -;
|
| -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
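| -; Mirrors intel_aes_gcmENC, but GHASH input is the ciphertext itself,
|
| -; so the same eight blocks can be hashed and decrypted in one pass (no
|
| -; pipelining against the previous batch) and only 8*16 bytes of counter
|
| -; scratch are required.
|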
| -
|
| -ALIGN 16
|
| -intel_aes_gcmDEC PROC
|
| -
|
| -NEXTCTR MACRO i
|
| - add aluCTR, 1
|
| - mov aluTMP, aluCTR
|
| - xor aluTMP, aluKSl
|
| - bswap aluTMP
|
| - mov [3*4 + i*16 + rsp], aluTMP
|
| -ENDM
|
| -
|
| -PT textequ <rdx>
|
| -CT textequ <rcx>
|
| -
|
| - test len, len
|
| - jnz LbeginDEC
|
| - ret
|
| -
|
| -LbeginDEC:
|
| -
|
| - vzeroupper
|
| - push r11
|
| - push r12
|
| - push r13
|
| - push rbp
|
| - sub rsp, 10*16
|
| - vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
|
| - vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
|
| - vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
|
| - vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
|
| - vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
|
| - vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
|
| - vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
|
| - vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
|
| - vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
|
| - vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
|
| -
|
| - mov rbp, rsp
|
| - sub rsp, 8*16
|
| - and rsp, -16
|
| -
|
| - vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
|
| - vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
|
| - vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
|
| - mov KS, [16*16 + 3*16 + Gctx]
|
| - mov NR, [4 + KS]
|
| - lea KS, [48 + KS]
|
| -
|
| - vpshufb CTR0, CTR0, BSWAPMASK
|
| -
|
| - mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
|
| - mov aluKSl, [3*4 + KS]
|
| - bswap aluCTR
|
| - bswap aluKSl
|
| -
|
| - vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
|
| - vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
|
| - vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
|
| -
|
| - cmp len, 128
|
| - jb LDecDataSingles
|
| -; Prepare the "top" counters
|
| - vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
|
| - vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
|
| -
|
| - NEXTCTR 1
|
| - NEXTCTR 2
|
| - NEXTCTR 3
|
| - NEXTCTR 4
|
| - NEXTCTR 5
|
| - NEXTCTR 6
|
| - NEXTCTR 7
|
| -
|
| -LDecDataOctets:
|
| - cmp len, 128
|
| - jb LEndDecOctets
|
| - sub len, 128
|
| -
|
| - vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
|
| - vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
|
| - vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
|
| - vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
|
| - vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
|
| - vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
|
| - vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
|
| - vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
|
| -
|
| - vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - vpshufd TMP4, TMP5, 78
|
| - vpxor TMP4, TMP4, TMP5
|
| - vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
|
| - vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
|
| - vpclmulqdq TMP1, TMP5, TMP4, 011h
|
| - vpclmulqdq TMP2, TMP5, TMP4, 000h
|
| -
|
| - vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - ROUNDMUL 1
|
| - NEXTCTR 0
|
| - vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - ROUNDMUL 2
|
| - NEXTCTR 1
|
| - vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - ROUNDMUL 3
|
| - NEXTCTR 2
|
| - vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - ROUNDMUL 4
|
| - NEXTCTR 3
|
| - vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - ROUNDMUL 5
|
| - NEXTCTR 4
|
| - vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - ROUNDMUL 6
|
| - NEXTCTR 5
|
| - vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
|
| - vpshufb TMP5, TMP5, BSWAPMASK
|
| - vpxor TMP5, TMP5, T
|
| - ROUNDMUL 7
|
| - NEXTCTR 6
|
| -
|
| - ROUND 8
|
| - NEXTCTR 7
|
| -
|
| - vpxor TMP0, TMP0, TMP1
|
| - vpxor TMP0, TMP0, TMP2
|
| - vpsrldq TMP3, TMP0, 8
|
| - vpxor TMP4, TMP1, TMP3
|
| - vpslldq TMP3, TMP0, 8
|
| - vpxor T, TMP2, TMP3
|
| -
|
| - vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
|
| - vpalignr T,T,T,8
|
| - vpxor T, T, TMP1
|
| -
|
| - ROUND 9
|
| -
|
| - vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
|
| - vpalignr T,T,T,8
|
| - vpxor T, T, TMP1
|
| -
|
| - vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| -
|
| - ROUND 10
|
| - ROUND 11
|
| - vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| -
|
| - ROUND 12
|
| - ROUND 13
|
| - vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT]
|
| - vaesenclast CTR0, CTR0, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT]
|
| - vaesenclast CTR1, CTR1, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT]
|
| - vaesenclast CTR2, CTR2, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT]
|
| - vaesenclast CTR3, CTR3, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT]
|
| - vaesenclast CTR4, CTR4, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT]
|
| - vaesenclast CTR5, CTR5, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT]
|
| - vaesenclast CTR6, CTR6, TMP3
|
| - vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT]
|
| - vaesenclast CTR7, CTR7, TMP3
|
| -
|
| - vmovdqu XMMWORD PTR[0*16 + PT], CTR0
|
| - vmovdqu XMMWORD PTR[1*16 + PT], CTR1
|
| - vmovdqu XMMWORD PTR[2*16 + PT], CTR2
|
| - vmovdqu XMMWORD PTR[3*16 + PT], CTR3
|
| - vmovdqu XMMWORD PTR[4*16 + PT], CTR4
|
| - vmovdqu XMMWORD PTR[5*16 + PT], CTR5
|
| - vmovdqu XMMWORD PTR[6*16 + PT], CTR6
|
| - vmovdqu XMMWORD PTR[7*16 + PT], CTR7
|
| -
|
| - vpxor T, T, TMP4
|
| -
|
| - lea CT, [8*16 + CT]
|
| - lea PT, [8*16 + PT]
|
| - jmp LDecDataOctets
|
| -
|
| -LEndDecOctets:
|
| -
|
| - sub aluCTR, 7
|
| -
|
| -LDecDataSingles:
|
| -
|
| - cmp len, 16
|
| - jb LDecDataTail
|
| - sub len, 16
|
| -
|
| - vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
|
| - NEXTCTR 0
|
| -
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vaesenclast TMP1, TMP1, TMP2
|
| -
|
| - vmovdqu TMP2, XMMWORD PTR[CT]
|
| - vpxor TMP1, TMP1, TMP2
|
| - vmovdqu XMMWORD PTR[PT], TMP1
|
| -
|
| - lea PT, [16+PT]
|
| - lea CT, [16+CT]
|
| -
|
| - vpshufb TMP2, TMP2, BSWAPMASK
|
| - vpxor T, T, TMP2
|
| - vmovdqu TMP0, XMMWORD PTR[Htbl]
|
| - GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
|
| -
|
| - jmp LDecDataSingles
|
| -
|
| -LDecDataTail:
|
| -
|
| - test len, len
|
| - jz LDecDataEnd
|
| -
|
| - vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
|
| - inc aluCTR
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
|
| - cmp NR, 10
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
|
| - cmp NR, 12
|
| - je @f
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
|
| - vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
|
| - vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
|
| -@@:
|
| - vaesenclast TMP1, TMP1, TMP2
|
| -; copy as many bytes as needed
|
| - xor KS, KS
|
| -@@:
|
| - cmp len, KS
|
| - je @f
|
| - mov al, [CT + KS]
|
| - mov [rsp + KS], al
|
| - inc KS
|
| - jmp @b
|
| -@@:
|
| - cmp KS, 16
|
| - je @f
|
| - mov BYTE PTR[rsp + KS], 0
|
| - inc KS
|
| - jmp @b
|
| -@@:
|
| - vmovdqa TMP2, XMMWORD PTR[rsp]
|
| - vpshufb TMP2, TMP2, BSWAPMASK
|
| - vpxor T, T, TMP2
|
| - vmovdqu TMP0, XMMWORD PTR[Htbl]
|
| - GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4
|
| -
|
| -
|
| - vpxor TMP1, TMP1, XMMWORD PTR[rsp]
|
| - vmovdqa XMMWORD PTR[rsp], TMP1
|
| - xor KS, KS
|
| -@@:
|
| - cmp len, KS
|
| - je @f
|
| - mov al, [rsp + KS]
|
| - mov [PT + KS], al
|
| - inc KS
|
| - jmp @b
|
| -@@:
|
| -
|
| -LDecDataEnd:
|
| -
|
| - vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
|
| - bswap aluCTR
|
| - mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
|
| -
|
| - mov rsp, rbp
|
| -
|
| - vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
|
| - vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
|
| - vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
|
| - vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
|
| - vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
|
| - vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
|
| - vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
|
| - vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
|
| - vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
|
| - vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
|
| -
|
| - add rsp, 10*16
|
| - pop rbp
|
| - pop r13
|
| - pop r12
|
| - pop r11
|
| -
|
| - vzeroupper
|
| -
|
| - ret
|
| -intel_aes_gcmDEC ENDP
|
| -
|
| -
|
| -END
|