Chromium Code Reviews

Unified Diff: nss/lib/freebl/intel-gcm-x64-masm.asm

Issue 2078763002: Delete bundled copy of NSS and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/nss@master
Patch Set: Delete bundled copy of NSS and replace with README. Created 4 years, 6 months ago
Index: nss/lib/freebl/intel-gcm-x64-masm.asm
diff --git a/nss/lib/freebl/intel-gcm-x64-masm.asm b/nss/lib/freebl/intel-gcm-x64-masm.asm
deleted file mode 100644
index 8b68b76e58083f1c4dee50217e6d030a6b090cd3..0000000000000000000000000000000000000000
--- a/nss/lib/freebl/intel-gcm-x64-masm.asm
+++ /dev/null
@@ -1,1295 +0,0 @@
-; LICENSE:
-; This submission to NSS is to be made available under the terms of the
-; Mozilla Public License, v. 2.0. You can obtain one at
-; http://mozilla.org/MPL/2.0/.
-;###############################################################################
-; Copyright(c) 2014, Intel Corp.
-; Developers and authors:
-; Shay Gueron and Vlad Krasnov
-; Intel Corporation, Israel Development Centre, Haifa, Israel
-; Please send feedback directly to crypto.feedback.alias@intel.com
-
-
-.DATA
-ALIGN 16
-Lone dq 1,0
-Ltwo dq 2,0
-Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
-Lpoly dq 01h, 0c200000000000000h
-
-.CODE
-
-
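-; GFMUL: DST = SRC1 * SRC2 in GF(2^128), reduced modulo the GCM polynomial
-; (see Lpoly). A sketch of the steps as read from the code, with hi/lo the
-; 64-bit qword halves of each operand:
-;   lo  = clmul(a.lo, b.lo)                          ; TMP1
-;   hi  = clmul(a.hi, b.hi)                          ; TMP4
-;   mid = clmul(a.lo^a.hi, b.lo^b.hi) ^ lo ^ hi      ; Karatsuba middle term
-; mid is then folded into (lo, hi) to form the 256-bit product, and two
-; clmul-by-Lpoly folding steps reduce that product back to 128 bits.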
-GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
- vpclmulqdq TMP1, SRC2, SRC1, 0h
- vpclmulqdq TMP4, SRC2, SRC1, 011h
-
- vpshufd TMP2, SRC2, 78
- vpshufd TMP3, SRC1, 78
- vpxor TMP2, TMP2, SRC2
- vpxor TMP3, TMP3, SRC1
-
- vpclmulqdq TMP2, TMP2, TMP3, 0h
- vpxor TMP2, TMP2, TMP1
- vpxor TMP2, TMP2, TMP4
-
- vpslldq TMP3, TMP2, 8
- vpsrldq TMP2, TMP2, 8
-
- vpxor TMP1, TMP1, TMP3
- vpxor TMP4, TMP4, TMP2
-
- vpclmulqdq TMP2, TMP1, [Lpoly], 010h
- vpshufd TMP3, TMP1, 78
- vpxor TMP1, TMP2, TMP3
-
- vpclmulqdq TMP2, TMP1, [Lpoly], 010h
- vpshufd TMP3, TMP1, 78
- vpxor TMP1, TMP2, TMP3
-
- vpxor DST, TMP1, TMP4
-
- ENDM
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Generates the final GCM tag
-; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
-; unsigned char *Tp,
-; unsigned int Mlen,
-; unsigned int Alen,
-; unsigned char *X0,
-; unsigned char *TAG);
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
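-; A sketch of what the code below computes (names as in the prototype):
-;   T    = GHASH state loaded from Tp
-;   T   ^= (Mlen*8 || Alen*8)          ; bit lengths, packed via vpinsrd
-;   T    = GFMUL(T, Htbl[0])           ; final GHASH multiplication by H
-;   *TAG = byteswap(T) ^ *X0           ; X0 assumed to hold E_K(initial counter)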
-
-ALIGN 16
-intel_aes_gcmTAG PROC
-
-Htbl textequ <rcx>
-Tp textequ <rdx>
-Mlen textequ <r8>
-Alen textequ <r9>
-X0 textequ <r10>
-TAG textequ <r11>
-
-T textequ <xmm0>
-TMP0 textequ <xmm1>
-
- mov X0, [rsp + 1*8 + 4*8]
- mov TAG, [rsp + 1*8 + 5*8]
-
- vzeroupper
- vmovdqu T, XMMWORD PTR[Tp]
- vpxor TMP0, TMP0, TMP0
-
- shl Mlen, 3
- shl Alen, 3
-
- ;vpinsrq TMP0, TMP0, Mlen, 0
- ;vpinsrq TMP0, TMP0, Alen, 1
- ; work around the ml64.exe vpinsrq issue
- vpinsrd TMP0, TMP0, r8d, 0
- vpinsrd TMP0, TMP0, r9d, 2
- shr Mlen, 32
- shr Alen, 32
- vpinsrd TMP0, TMP0, r8d, 1
- vpinsrd TMP0, TMP0, r9d, 3
-
- vpxor T, T, TMP0
- vmovdqu TMP0, XMMWORD PTR[Htbl]
- GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
-
- vpshufb T, T, [Lbswap_mask]
- vpxor T, T, [X0]
- vmovdqu XMMWORD PTR[TAG], T
- vzeroupper
-
- ret
-
-intel_aes_gcmTAG ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Generates the H table
-; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
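-; Sketch of the table this builds (as read from the code below):
-;   H          = AES_K(0^128), byte-reflected
-;   H'         = GFMUL(H, 2), i.e. H doubled with the Lpoly fixup
-;   Htbl[i]    = H'^(i+1)                      for i = 0..7
-;   Htbl[8+i]  = hi(Htbl[i]) ^ lo(Htbl[i])     ; Karatsuba middle operands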
-
-ALIGN 16
-intel_aes_gcmINIT PROC
-
-Htbl textequ <rcx>
-KS textequ <rdx>
-NR textequ <r8d>
-
-T textequ <xmm0>
-TMP0 textequ <xmm1>
-
- vzeroupper
- ; AES-ENC(0)
- vmovdqu T, XMMWORD PTR[KS]
- lea KS, [16 + KS]
- dec NR
-Lenc_loop:
- vaesenc T, T, [KS]
- lea KS, [16 + KS]
- dec NR
- jnz Lenc_loop
-
- vaesenclast T, T, [KS]
- vpshufb T, T, [Lbswap_mask]
-
- ;Calculate H' = GFMUL(H, 2)
- vpsrad xmm3, T, 31
- vpshufd xmm3, xmm3, 0ffh
- vpand xmm5, xmm3, [Lpoly]
- vpsrld xmm3, T, 31
- vpslld xmm4, T, 1
- vpslldq xmm3, xmm3, 4
- vpxor T, xmm4, xmm3
- vpxor T, T, xmm5
-
- vmovdqu TMP0, T
- vmovdqu XMMWORD PTR[Htbl + 0*16], T
-
- vpshufd xmm2, T, 78
- vpxor xmm2, xmm2, T
- vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
-
- i = 1
- WHILE i LT 8
- GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
- vmovdqu XMMWORD PTR[Htbl + i*16], T
- vpshufd xmm2, T, 78
- vpxor xmm2, xmm2, T
- vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
- i = i+1
- ENDM
- vzeroupper
- ret
-intel_aes_gcmINIT ENDP
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Authenticate only
-; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
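-; Strategy sketch: if the AAD length is not a multiple of 8 blocks, the
-; first len mod 128 bytes are hashed one block at a time (Lpre_loop); the
-; rest is consumed 8 blocks per Lmod_loop iteration with a single deferred
-; reduction, using the Karatsuba halves precomputed in Htbl.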
-
-ALIGN 16
-intel_aes_gcmAAD PROC
-
-Htbl textequ <rcx>
-inp textequ <rdx>
-len textequ <r8>
-Tp textequ <r9>
-hlp0 textequ <r10>
-
-DATA textequ <xmm0>
-T textequ <xmm1>
-TMP0 textequ <xmm2>
-TMP1 textequ <xmm3>
-TMP2 textequ <xmm4>
-TMP3 textequ <xmm5>
-TMP4 textequ <xmm6>
-Xhi textequ <xmm7>
-
-KARATSUBA_AAD MACRO i
- vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h
- vpxor TMP0, TMP0, TMP3
- vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h
- vpxor TMP1, TMP1, TMP3
- vpshufd TMP3, DATA, 78
- vpxor TMP3, TMP3, DATA
- vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
- vpxor TMP2, TMP2, TMP3
-ENDM
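-; KARATSUBA_AAD folds DATA * Htbl[i] into three running partial products:
-; TMP0 (low), TMP1 (high) and TMP2 (middle). They are recombined and
-; reduced once per 8-block group rather than once per block.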
-
- test len, len
- jnz LbeginAAD
- ret
-
-LbeginAAD:
- vzeroupper
-
- sub rsp, 2*16
- vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
- vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
-
- vpxor Xhi, Xhi, Xhi
-
- vmovdqu T, XMMWORD PTR[Tp]
- ;we hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, we hash the first n mod 8 blocks first
- mov hlp0, len
- and hlp0, 128-1
- jz Lmod_loop
-
- and len, -128
- sub hlp0, 16
-
- ; Prefix block
- vmovdqu DATA, XMMWORD PTR[inp]
- vpshufb DATA, DATA, [Lbswap_mask]
- vpxor DATA, DATA, T
-
- vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h
- vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h
- vpshufd TMP3, DATA, 78
- vpxor TMP3, TMP3, DATA
- vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
-
- lea inp, [inp+16]
- test hlp0, hlp0
- jnz Lpre_loop
- jmp Lred1
-
- ;hash the remaining prefix blocks (up to 7 prefix blocks in total)
-Lpre_loop:
-
- sub hlp0, 16
-
- vmovdqu DATA, XMMWORD PTR[inp]
- vpshufb DATA, DATA, [Lbswap_mask]
-
- vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h
- vpxor TMP0, TMP0, TMP3
- vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h
- vpxor TMP1, TMP1, TMP3
- vpshufd TMP3, DATA, 78
- vpxor TMP3, TMP3, DATA
- vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
- vpxor TMP2, TMP2, TMP3
-
- test hlp0, hlp0
- lea inp, [inp+16]
- jnz Lpre_loop
-
-Lred1:
-
- vpxor TMP2, TMP2, TMP0
- vpxor TMP2, TMP2, TMP1
- vpsrldq TMP3, TMP2, 8
- vpslldq TMP2, TMP2, 8
-
- vpxor Xhi, TMP1, TMP3
- vpxor T, TMP0, TMP2
-
-
-Lmod_loop:
-
- sub len, 16*8
- jb Ldone
- ; Block #0
- vmovdqu DATA, XMMWORD PTR[inp + 16*7]
- vpshufb DATA, DATA, [Lbswap_mask]
-
- vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h
- vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h
- vpshufd TMP3, DATA, 78
- vpxor TMP3, TMP3, DATA
- vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
-
- ; Block #1
- vmovdqu DATA, XMMWORD PTR[inp + 16*6]
- vpshufb DATA, DATA, [Lbswap_mask]
- KARATSUBA_AAD 1
-
- ; Block #2
- vmovdqu DATA, XMMWORD PTR[inp + 16*5]
- vpshufb DATA, DATA, [Lbswap_mask]
-
- vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
- vpalignr T, T, T, 8
-
- KARATSUBA_AAD 2
-
- vpxor T, T, TMP4 ;reduction stage 1b
-
- ; Block #3
- vmovdqu DATA, XMMWORD PTR[inp + 16*4]
- vpshufb DATA, DATA, [Lbswap_mask]
- KARATSUBA_AAD 3
- ; Block #4
- vmovdqu DATA, XMMWORD PTR[inp + 16*3]
- vpshufb DATA, DATA, [Lbswap_mask]
-
- vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
- vpalignr T, T, T, 8
-
- KARATSUBA_AAD 4
-
- vpxor T, T, TMP4 ;reduction stage 2b
- ; Block #5
- vmovdqu DATA, XMMWORD PTR[inp + 16*2]
- vpshufb DATA, DATA, [Lbswap_mask]
- KARATSUBA_AAD 5
-
- vpxor T, T, Xhi ;reduction finalize
- ; Block #6
- vmovdqu DATA, XMMWORD PTR[inp + 16*1]
- vpshufb DATA, DATA, [Lbswap_mask]
- KARATSUBA_AAD 6
- ; Block #7
- vmovdqu DATA, XMMWORD PTR[inp + 16*0]
- vpshufb DATA, DATA, [Lbswap_mask]
- vpxor DATA, DATA, T
- KARATSUBA_AAD 7
- ; Aggregated 8 blocks, now the Karatsuba fixup
- vpxor TMP2, TMP2, TMP0
- vpxor TMP2, TMP2, TMP1
- vpsrldq TMP3, TMP2, 8
- vpslldq TMP2, TMP2, 8
-
- vpxor Xhi, TMP1, TMP3
- vpxor T, TMP0, TMP2
-
- lea inp, [inp + 16*8]
- jmp Lmod_loop
-
-Ldone:
- vpclmulqdq TMP4, T, [Lpoly], 010h
- vpalignr T, T, T, 8
- vpxor T, T, TMP4
-
- vpclmulqdq TMP4, T, [Lpoly], 010h
- vpalignr T, T, T, 8
- vpxor T, T, TMP4
-
- vpxor T, T, Xhi
- vmovdqu XMMWORD PTR[Tp], T
- vzeroupper
-
- vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
- vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
- add rsp, 16*2
-
- ret
-
-intel_aes_gcmAAD ENDP
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Encrypt and Authenticate
-; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
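-; Structure sketch: CTR encryption and GHASH are interleaved. The first 8
-; counter blocks are encrypted up front; each LEncDataOctets pass then
-; encrypts the next 8 counter blocks while folding the previous 8
-; ciphertext blocks into T. LEncDataSingles handles whole trailing blocks,
-; LEncDataTail the final partial block. Gctx layout as read here: T at byte
-; offset 16*16 + 1*16, the counter block at 16*16 + 2*16, and the
-; key-schedule pointer at 16*16 + 3*16.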
-
-ALIGN 16
-intel_aes_gcmENC PROC
-
-PT textequ <rcx>
-CT textequ <rdx>
-Htbl textequ <r8>
-Gctx textequ <r8>
-len textequ <r9>
-KS textequ <r10>
-NR textequ <eax>
-
-aluCTR textequ <r11d>
-aluKSl textequ <r12d>
-aluTMP textequ <r13d>
-
-T textequ <xmm0>
-TMP0 textequ <xmm1>
-TMP1 textequ <xmm2>
-TMP2 textequ <xmm3>
-TMP3 textequ <xmm4>
-TMP4 textequ <xmm5>
-TMP5 textequ <xmm6>
-CTR0 textequ <xmm7>
-CTR1 textequ <xmm8>
-CTR2 textequ <xmm9>
-CTR3 textequ <xmm10>
-CTR4 textequ <xmm11>
-CTR5 textequ <xmm12>
-CTR6 textequ <xmm13>
-CTR7 textequ <xmm14>
-BSWAPMASK textequ <xmm15>
-
-ROUND MACRO i
- vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
- vaesenc CTR0, CTR0, TMP3
- vaesenc CTR1, CTR1, TMP3
- vaesenc CTR2, CTR2, TMP3
- vaesenc CTR3, CTR3, TMP3
- vaesenc CTR4, CTR4, TMP3
- vaesenc CTR5, CTR5, TMP3
- vaesenc CTR6, CTR6, TMP3
- vaesenc CTR7, CTR7, TMP3
-ENDM
-ROUNDMUL MACRO i
- vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
-
- vaesenc CTR0, CTR0, TMP3
- vaesenc CTR1, CTR1, TMP3
- vaesenc CTR2, CTR2, TMP3
- vaesenc CTR3, CTR3, TMP3
-
- vpshufd TMP4, TMP5, 78
- vpxor TMP4, TMP4, TMP5
-
- vaesenc CTR4, CTR4, TMP3
- vaesenc CTR5, CTR5, TMP3
- vaesenc CTR6, CTR6, TMP3
- vaesenc CTR7, CTR7, TMP3
-
- vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
- vpxor TMP0, TMP0, TMP3
- vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
- vpclmulqdq TMP3, TMP5, TMP4, 011h
- vpxor TMP1, TMP1, TMP3
- vpclmulqdq TMP3, TMP5, TMP4, 000h
- vpxor TMP2, TMP2, TMP3
-ENDM
-KARATSUBA MACRO i
- vpshufd TMP4, TMP5, 78
- vpxor TMP4, TMP4, TMP5
- vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
- vpxor TMP0, TMP0, TMP3
- vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
- vpclmulqdq TMP3, TMP5, TMP4, 011h
- vpxor TMP1, TMP1, TMP3
- vpclmulqdq TMP3, TMP5, TMP4, 000h
- vpxor TMP2, TMP2, TMP3
-ENDM
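-; ROUND applies AES round key i to all eight counter blocks. ROUNDMUL
-; overlays that same round with one Karatsuba fold of TMP5 against Htbl[i];
-; KARATSUBA is the fold alone, used for the last 8-block group when no
-; counter blocks remain to encrypt (LEndEncOctets).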
-NEXTCTR MACRO i
- add aluCTR, 1
- mov aluTMP, aluCTR
- xor aluTMP, aluKSl
- bswap aluTMP
- mov [3*4 + 8*16 + i*16 + rsp], aluTMP
-ENDM
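-; NEXTCTR steps the pre-whitened counter blocks kept on the stack: each
-; slot was seeded with (counter block ^ round key 0), so only the last
-; dword changes. Sketch: slot[i].dword3 = bswap((ctr + i) ^ ksl), where ctr
-; and ksl are held byte-swapped in aluCTR/aluKSl.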
-
-
- test len, len
- jnz LbeginENC
- ret
-
-LbeginENC:
-
- vzeroupper
- push r11
- push r12
- push r13
- push rbp
- sub rsp, 10*16
- vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
- vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
- vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
- vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
- vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
- vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
- vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
- vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
- vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
- vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
-
- mov rbp, rsp
- sub rsp, 16*16
- and rsp, -16
-
- vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
- vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
- vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
- mov KS, [16*16 + 3*16 + Gctx]
- mov NR, [4 + KS]
- lea KS, [48 + KS]
-
- vpshufb CTR0, CTR0, BSWAPMASK
-
- mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
- mov aluKSl, [3*4 + KS]
- bswap aluCTR
- bswap aluKSl
-
- vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
- vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
- vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
-
- cmp len, 128
- jb LEncDataSingles
-; Prepare the "top" counters
- vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
-
-; Encrypt the initial 8 blocks
- sub len, 128
- vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
- vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
- vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
- vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
- vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
- vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
- vpaddd CTR7, CTR6, XMMWORD PTR[Lone]
-
- vpshufb CTR0, CTR0, BSWAPMASK
- vpshufb CTR1, CTR1, BSWAPMASK
- vpshufb CTR2, CTR2, BSWAPMASK
- vpshufb CTR3, CTR3, BSWAPMASK
- vpshufb CTR4, CTR4, BSWAPMASK
- vpshufb CTR5, CTR5, BSWAPMASK
- vpshufb CTR6, CTR6, BSWAPMASK
- vpshufb CTR7, CTR7, BSWAPMASK
-
- vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
- vpxor CTR0, CTR0, TMP3
- vpxor CTR1, CTR1, TMP3
- vpxor CTR2, CTR2, TMP3
- vpxor CTR3, CTR3, TMP3
- vpxor CTR4, CTR4, TMP3
- vpxor CTR5, CTR5, TMP3
- vpxor CTR6, CTR6, TMP3
- vpxor CTR7, CTR7, TMP3
-
- ROUND 1
-
- add aluCTR, 8
- mov aluTMP, aluCTR
- xor aluTMP, aluKSl
- bswap aluTMP
- mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
-
- ROUND 2
- NEXTCTR 1
- ROUND 3
- NEXTCTR 2
- ROUND 4
- NEXTCTR 3
- ROUND 5
- NEXTCTR 4
- ROUND 6
- NEXTCTR 5
- ROUND 7
- NEXTCTR 6
- ROUND 8
- NEXTCTR 7
- ROUND 9
- vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
-
- ROUND 10
- ROUND 11
- vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
-
- ROUND 12
- ROUND 13
- vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
-@@:
- vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
- vaesenclast CTR0, CTR0, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
- vaesenclast CTR1, CTR1, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
- vaesenclast CTR2, CTR2, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
- vaesenclast CTR3, CTR3, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
- vaesenclast CTR4, CTR4, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
- vaesenclast CTR5, CTR5, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
- vaesenclast CTR6, CTR6, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
- vaesenclast CTR7, CTR7, TMP3
-
- vmovdqu XMMWORD PTR[0*16 + CT], CTR0
- vpshufb CTR0, CTR0, BSWAPMASK
- vmovdqu XMMWORD PTR[1*16 + CT], CTR1
- vpshufb CTR1, CTR1, BSWAPMASK
- vmovdqu XMMWORD PTR[2*16 + CT], CTR2
- vpshufb CTR2, CTR2, BSWAPMASK
- vmovdqu XMMWORD PTR[3*16 + CT], CTR3
- vpshufb CTR3, CTR3, BSWAPMASK
- vmovdqu XMMWORD PTR[4*16 + CT], CTR4
- vpshufb CTR4, CTR4, BSWAPMASK
- vmovdqu XMMWORD PTR[5*16 + CT], CTR5
- vpshufb CTR5, CTR5, BSWAPMASK
- vmovdqu XMMWORD PTR[6*16 + CT], CTR6
- vpshufb CTR6, CTR6, BSWAPMASK
- vmovdqu XMMWORD PTR[7*16 + CT], CTR7
- vpshufb TMP5, CTR7, BSWAPMASK
-
- vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
- vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
- vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
- vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
- vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
- vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
- vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
-
- lea CT, [8*16 + CT]
- lea PT, [8*16 + PT]
- jmp LEncDataOctets
-
-LEncDataOctets:
- cmp len, 128
- jb LEndEncOctets
- sub len, 128
-
- vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
- vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
- vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
- vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
- vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
- vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
- vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
- vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
-
- vpshufd TMP4, TMP5, 78
- vpxor TMP4, TMP4, TMP5
- vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
- vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
- vpclmulqdq TMP1, TMP5, TMP4, 011h
- vpclmulqdq TMP2, TMP5, TMP4, 000h
-
- vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
- ROUNDMUL 1
- NEXTCTR 0
- vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
- ROUNDMUL 2
- NEXTCTR 1
- vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
- ROUNDMUL 3
- NEXTCTR 2
- vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
- ROUNDMUL 4
- NEXTCTR 3
- vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
- ROUNDMUL 5
- NEXTCTR 4
- vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
- ROUNDMUL 6
- NEXTCTR 5
- vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
- ROUNDMUL 7
- NEXTCTR 6
-
- ROUND 8
- NEXTCTR 7
-
- vpxor TMP0, TMP0, TMP1
- vpxor TMP0, TMP0, TMP2
- vpsrldq TMP3, TMP0, 8
- vpxor TMP4, TMP1, TMP3
- vpslldq TMP3, TMP0, 8
- vpxor T, TMP2, TMP3
-
- vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
- vpalignr T,T,T,8
- vpxor T, T, TMP1
-
- ROUND 9
-
- vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
- vpalignr T,T,T,8
- vpxor T, T, TMP1
-
- vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
-
- ROUND 10
- ROUND 11
- vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
-
- ROUND 12
- ROUND 13
- vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
-@@:
- vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
- vaesenclast CTR0, CTR0, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
- vaesenclast CTR1, CTR1, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
- vaesenclast CTR2, CTR2, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
- vaesenclast CTR3, CTR3, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
- vaesenclast CTR4, CTR4, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
- vaesenclast CTR5, CTR5, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
- vaesenclast CTR6, CTR6, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
- vaesenclast CTR7, CTR7, TMP3
-
- vmovdqu XMMWORD PTR[0*16 + CT], CTR0
- vpshufb CTR0, CTR0, BSWAPMASK
- vmovdqu XMMWORD PTR[1*16 + CT], CTR1
- vpshufb CTR1, CTR1, BSWAPMASK
- vmovdqu XMMWORD PTR[2*16 + CT], CTR2
- vpshufb CTR2, CTR2, BSWAPMASK
- vmovdqu XMMWORD PTR[3*16 + CT], CTR3
- vpshufb CTR3, CTR3, BSWAPMASK
- vmovdqu XMMWORD PTR[4*16 + CT], CTR4
- vpshufb CTR4, CTR4, BSWAPMASK
- vmovdqu XMMWORD PTR[5*16 + CT], CTR5
- vpshufb CTR5, CTR5, BSWAPMASK
- vmovdqu XMMWORD PTR[6*16 + CT], CTR6
- vpshufb CTR6, CTR6, BSWAPMASK
- vmovdqu XMMWORD PTR[7*16 + CT], CTR7
- vpshufb TMP5, CTR7, BSWAPMASK
-
- vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
- vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
- vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
- vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
- vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
- vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
- vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
-
- vpxor T, T, TMP4
-
- lea CT, [8*16 + CT]
- lea PT, [8*16 + PT]
- jmp LEncDataOctets
-
-LEndEncOctets:
-
- vpshufd TMP4, TMP5, 78
- vpxor TMP4, TMP4, TMP5
- vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
- vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
- vpclmulqdq TMP1, TMP5, TMP4, 011h
- vpclmulqdq TMP2, TMP5, TMP4, 000h
-
- vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
- KARATSUBA 1
- vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
- KARATSUBA 2
- vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
- KARATSUBA 3
- vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
- KARATSUBA 4
- vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
- KARATSUBA 5
- vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
- KARATSUBA 6
- vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
- KARATSUBA 7
-
- vpxor TMP0, TMP0, TMP1
- vpxor TMP0, TMP0, TMP2
- vpsrldq TMP3, TMP0, 8
- vpxor TMP4, TMP1, TMP3
- vpslldq TMP3, TMP0, 8
- vpxor T, TMP2, TMP3
-
- vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
- vpalignr T,T,T,8
- vpxor T, T, TMP1
-
- vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
- vpalignr T,T,T,8
- vpxor T, T, TMP1
-
- vpxor T, T, TMP4
-
- sub aluCTR, 7
-
-LEncDataSingles:
-
- cmp len, 16
- jb LEncDataTail
- sub len, 16
-
- vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
- NEXTCTR 0
-
- vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
-@@:
- vaesenclast TMP1, TMP1, TMP2
- vpxor TMP1, TMP1, XMMWORD PTR[PT]
- vmovdqu XMMWORD PTR[CT], TMP1
-
- lea PT, [16+PT]
- lea CT, [16+CT]
-
- vpshufb TMP1, TMP1, BSWAPMASK
- vpxor T, T, TMP1
- vmovdqu TMP0, XMMWORD PTR[Htbl]
- GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
-
- jmp LEncDataSingles
-
-LEncDataTail:
-
- test len, len
- jz LEncDataEnd
-
- vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
-
- vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
-@@:
- vaesenclast TMP1, TMP1, TMP2
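-; Tail sketch: TMP1 now holds the last keystream block. The partial
-; plaintext is staged in a zeroed 16-byte stack slot, XORed with the
-; keystream there, the len output bytes are copied to CT, and the slot is
-; zero-padded again so it can be hashed as a full block (BAIL).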
-; zero a temp location
- vpxor TMP2, TMP2, TMP2
- vmovdqa XMMWORD PTR[rsp], TMP2
-; copy as many bytes as needed
- xor KS, KS
-
-@@:
- cmp len, KS
- je @f
- mov al, [PT + KS]
- mov [rsp + KS], al
- inc KS
- jmp @b
-@@:
- vpxor TMP1, TMP1, XMMWORD PTR[rsp]
- vmovdqa XMMWORD PTR[rsp], TMP1
- xor KS, KS
-@@:
- cmp len, KS
- je @f
- mov al, [rsp + KS]
- mov [CT + KS], al
- inc KS
- jmp @b
-@@:
- cmp KS, 16
- je @f
- mov BYTE PTR[rsp + KS], 0
- inc KS
- jmp @b
-@@:
-BAIL:
- vmovdqa TMP1, XMMWORD PTR[rsp]
- vpshufb TMP1, TMP1, BSWAPMASK
- vpxor T, T, TMP1
- vmovdqu TMP0, XMMWORD PTR[Htbl]
- GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
-
-LEncDataEnd:
-
- vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
- bswap aluCTR
- mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
-
- mov rsp, rbp
-
- vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
- vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
- vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
- vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
- vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
- vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
- vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
- vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
- vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
- vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
-
- add rsp, 10*16
- pop rbp
- pop r13
- pop r12
- pop r11
-
- vzeroupper
-
- ret
-intel_aes_gcmENC ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;
-; Decrypt and Authenticate
-; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
-;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
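-; Mirrors intel_aes_gcmENC with the GHASH order reversed: ciphertext is
-; folded into T before it is decrypted, so the 8-block loop reads CT both
-; for the multiplications and for the final vpxor/vaesenclast. Note that
-; the PT/CT register aliases are remapped below for this entry point.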
-
-ALIGN 16
-intel_aes_gcmDEC PROC
-
-NEXTCTR MACRO i
- add aluCTR, 1
- mov aluTMP, aluCTR
- xor aluTMP, aluKSl
- bswap aluTMP
- mov [3*4 + i*16 + rsp], aluTMP
-ENDM
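-; NEXTCTR is redefined here without the 8*16 offset: the decrypt path
-; hashes ciphertext directly from memory and needs no staging area, so its
-; counter slots sit at the bottom of a smaller 8*16-byte frame.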
-
-PT textequ <rdx>
-CT textequ <rcx>
-
- test len, len
- jnz LbeginDEC
- ret
-
-LbeginDEC:
-
- vzeroupper
- push r11
- push r12
- push r13
- push rbp
- sub rsp, 10*16
- vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
- vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
- vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
- vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
- vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
- vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
- vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
- vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
- vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
- vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
-
- mov rbp, rsp
- sub rsp, 8*16
- and rsp, -16
-
- vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
- vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
- vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
- mov KS, [16*16 + 3*16 + Gctx]
- mov NR, [4 + KS]
- lea KS, [48 + KS]
-
- vpshufb CTR0, CTR0, BSWAPMASK
-
- mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
- mov aluKSl, [3*4 + KS]
- bswap aluCTR
- bswap aluKSl
-
- vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
- vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
- vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
-
- cmp len, 128
- jb LDecDataSingles
-; Prepare the "top" counters
- vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
- vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
-
- NEXTCTR 1
- NEXTCTR 2
- NEXTCTR 3
- NEXTCTR 4
- NEXTCTR 5
- NEXTCTR 6
- NEXTCTR 7
-
-LDecDataOctets:
- cmp len, 128
- jb LEndDecOctets
- sub len, 128
-
- vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
- vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
- vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
- vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
- vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
- vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
- vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
- vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
-
- vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- vpshufd TMP4, TMP5, 78
- vpxor TMP4, TMP4, TMP5
- vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
- vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
- vpclmulqdq TMP1, TMP5, TMP4, 011h
- vpclmulqdq TMP2, TMP5, TMP4, 000h
-
- vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- ROUNDMUL 1
- NEXTCTR 0
- vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- ROUNDMUL 2
- NEXTCTR 1
- vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- ROUNDMUL 3
- NEXTCTR 2
- vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- ROUNDMUL 4
- NEXTCTR 3
- vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- ROUNDMUL 5
- NEXTCTR 4
- vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- ROUNDMUL 6
- NEXTCTR 5
- vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
- vpshufb TMP5, TMP5, BSWAPMASK
- vpxor TMP5, TMP5, T
- ROUNDMUL 7
- NEXTCTR 6
-
- ROUND 8
- NEXTCTR 7
-
- vpxor TMP0, TMP0, TMP1
- vpxor TMP0, TMP0, TMP2
- vpsrldq TMP3, TMP0, 8
- vpxor TMP4, TMP1, TMP3
- vpslldq TMP3, TMP0, 8
- vpxor T, TMP2, TMP3
-
- vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
- vpalignr T,T,T,8
- vpxor T, T, TMP1
-
- ROUND 9
-
- vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
- vpalignr T,T,T,8
- vpxor T, T, TMP1
-
- vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
-
- ROUND 10
- ROUND 11
- vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
-
- ROUND 12
- ROUND 13
- vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
-@@:
- vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT]
- vaesenclast CTR0, CTR0, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT]
- vaesenclast CTR1, CTR1, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT]
- vaesenclast CTR2, CTR2, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT]
- vaesenclast CTR3, CTR3, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT]
- vaesenclast CTR4, CTR4, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT]
- vaesenclast CTR5, CTR5, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT]
- vaesenclast CTR6, CTR6, TMP3
- vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT]
- vaesenclast CTR7, CTR7, TMP3
-
- vmovdqu XMMWORD PTR[0*16 + PT], CTR0
- vmovdqu XMMWORD PTR[1*16 + PT], CTR1
- vmovdqu XMMWORD PTR[2*16 + PT], CTR2
- vmovdqu XMMWORD PTR[3*16 + PT], CTR3
- vmovdqu XMMWORD PTR[4*16 + PT], CTR4
- vmovdqu XMMWORD PTR[5*16 + PT], CTR5
- vmovdqu XMMWORD PTR[6*16 + PT], CTR6
- vmovdqu XMMWORD PTR[7*16 + PT], CTR7
-
- vpxor T, T, TMP4
-
- lea CT, [8*16 + CT]
- lea PT, [8*16 + PT]
- jmp LDecDataOctets
-
-LEndDecOctets:
-
- sub aluCTR, 7
-
-LDecDataSingles:
-
- cmp len, 16
- jb LDecDataTail
- sub len, 16
-
- vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
- NEXTCTR 0
-
- vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
-@@:
- vaesenclast TMP1, TMP1, TMP2
-
- vmovdqu TMP2, XMMWORD PTR[CT]
- vpxor TMP1, TMP1, TMP2
- vmovdqu XMMWORD PTR[PT], TMP1
-
- lea PT, [16+PT]
- lea CT, [16+CT]
-
- vpshufb TMP2, TMP2, BSWAPMASK
- vpxor T, T, TMP2
- vmovdqu TMP0, XMMWORD PTR[Htbl]
- GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
-
- jmp LDecDataSingles
-
-LDecDataTail:
-
- test len, len
- jz LDecDataEnd
-
- vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
- inc aluCTR
- vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
- cmp NR, 10
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
- cmp NR, 12
- je @f
- vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
- vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
- vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
-@@:
- vaesenclast TMP1, TMP1, TMP2
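-; Tail sketch (decrypt): stage the partial ciphertext zero-padded on the
-; stack, hash that block into T first, then XOR it with the keystream in
-; TMP1 and copy the len plaintext bytes out to PT.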
-; copy as many bytes as needed
- xor KS, KS
-@@:
- cmp len, KS
- je @f
- mov al, [CT + KS]
- mov [rsp + KS], al
- inc KS
- jmp @b
-@@:
- cmp KS, 16
- je @f
- mov BYTE PTR[rsp + KS], 0
- inc KS
- jmp @b
-@@:
- vmovdqa TMP2, XMMWORD PTR[rsp]
- vpshufb TMP2, TMP2, BSWAPMASK
- vpxor T, T, TMP2
- vmovdqu TMP0, XMMWORD PTR[Htbl]
- GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4
-
-
- vpxor TMP1, TMP1, XMMWORD PTR[rsp]
- vmovdqa XMMWORD PTR[rsp], TMP1
- xor KS, KS
-@@:
- cmp len, KS
- je @f
- mov al, [rsp + KS]
- mov [PT + KS], al
- inc KS
- jmp @b
-@@:
-
-LDecDataEnd:
-
- vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
- bswap aluCTR
- mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
-
- mov rsp, rbp
-
- vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
- vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
- vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
- vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
- vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
- vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
- vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
- vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
- vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
- vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
-
- add rsp, 10*16
- pop rbp
- pop r13
- pop r12
- pop r11
-
- vzeroupper
-
- ret
-intel_aes_gcmDEC ENDP
-
-
-END