Index: nss/lib/freebl/intel-aes-x64-masm.asm
diff --git a/nss/lib/freebl/intel-aes-x86-masm.asm b/nss/lib/freebl/intel-aes-x64-masm.asm
similarity index 62%
copy from nss/lib/freebl/intel-aes-x86-masm.asm
copy to nss/lib/freebl/intel-aes-x64-masm.asm
index 7d805e7660f15d20f89911424dc83dbb7d906dca..ef5c76ba28370882583003116b9aeeb3505e256d 100644
--- a/nss/lib/freebl/intel-aes-x86-masm.asm
+++ b/nss/lib/freebl/intel-aes-x64-masm.asm
@@ -10,9 +10,6 @@
 ; Please send feedback directly to crypto.feedback.alias@intel.com


-.MODEL FLAT, C
-.XMM
-
 .DATA
 ALIGN 16
 Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
@@ -23,74 +20,81 @@ Lcon2 dd 1bh,1bh,1bh,1bh

 .CODE

-ctx textequ <ecx>
-output textequ <edx>
-input textequ <eax>
-inputLen textequ <edi>
+ctx textequ <rcx>
+output textequ <rdx>
+input textequ <r8>
+inputLen textequ <r9d>


 aes_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesenc xmm0, xmm7
- aesenc xmm1, xmm7
- aesenc xmm2, xmm7
- aesenc xmm3, xmm7
- aesenc xmm4, xmm7
- aesenc xmm5, xmm7
- aesenc xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesenc xmm0, xmm8
+ aesenc xmm1, xmm8
+ aesenc xmm2, xmm8
+ aesenc xmm3, xmm8
+ aesenc xmm4, xmm8
+ aesenc xmm5, xmm8
+ aesenc xmm6, xmm8
+ aesenc xmm7, xmm8
 ENDM

 aes_last_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesenclast xmm0, xmm7
- aesenclast xmm1, xmm7
- aesenclast xmm2, xmm7
- aesenclast xmm3, xmm7
- aesenclast xmm4, xmm7
- aesenclast xmm5, xmm7
- aesenclast xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesenclast xmm0, xmm8
+ aesenclast xmm1, xmm8
+ aesenclast xmm2, xmm8
+ aesenclast xmm3, xmm8
+ aesenclast xmm4, xmm8
+ aesenclast xmm5, xmm8
+ aesenclast xmm6, xmm8
+ aesenclast xmm7, xmm8
 ENDM

 aes_dec_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesdec xmm0, xmm7
- aesdec xmm1, xmm7
- aesdec xmm2, xmm7
- aesdec xmm3, xmm7
- aesdec xmm4, xmm7
- aesdec xmm5, xmm7
- aesdec xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesdec xmm0, xmm8
+ aesdec xmm1, xmm8
+ aesdec xmm2, xmm8
+ aesdec xmm3, xmm8
+ aesdec xmm4, xmm8
+ aesdec xmm5, xmm8
+ aesdec xmm6, xmm8
+ aesdec xmm7, xmm8
 ENDM

 aes_dec_last_rnd MACRO i
- movdqu xmm7, [i*16 + ctx]
- aesdeclast xmm0, xmm7
- aesdeclast xmm1, xmm7
- aesdeclast xmm2, xmm7
- aesdeclast xmm3, xmm7
- aesdeclast xmm4, xmm7
- aesdeclast xmm5, xmm7
- aesdeclast xmm6, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesdeclast xmm0, xmm8
+ aesdeclast xmm1, xmm8
+ aesdeclast xmm2, xmm8
+ aesdeclast xmm3, xmm8
+ aesdeclast xmm4, xmm8
+ aesdeclast xmm5, xmm8
+ aesdeclast xmm6, xmm8
+ aesdeclast xmm7, xmm8
 ENDM


 gen_aes_ecb_func MACRO enc, rnds

-LOCAL loop7
+LOCAL loop8
 LOCAL loop1
 LOCAL bail

- push inputLen
+ xor inputLen, inputLen
+ mov input, [rsp + 1*8 + 8*4]
+ mov inputLen, [rsp + 1*8 + 8*5]
+
+ sub rsp, 3*16

- mov ctx, [esp + 2*4 + 0*4]
- mov output, [esp + 2*4 + 1*4]
- mov input, [esp + 2*4 + 4*4]
- mov inputLen, [esp + 2*4 + 5*4]
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8

- lea ctx, [44+ctx]
+ lea ctx, [48+ctx]

-loop7:
- cmp inputLen, 7*16
+loop8:
+ cmp inputLen, 8*16
 jb loop1

 movdqu xmm0, [0*16 + input]
@@ -100,15 +104,17 @@ loop7:
 movdqu xmm4, [4*16 + input]
 movdqu xmm5, [5*16 + input]
 movdqu xmm6, [6*16 + input]
-
- movdqu xmm7, [0*16 + ctx]
- pxor xmm0, xmm7
- pxor xmm1, xmm7
- pxor xmm2, xmm7
- pxor xmm3, xmm7
- pxor xmm4, xmm7
- pxor xmm5, xmm7
- pxor xmm6, xmm7
+ movdqu xmm7, [7*16 + input]
+
+ movdqu xmm8, [0*16 + ctx]
+ pxor xmm0, xmm8
+ pxor xmm1, xmm8
+ pxor xmm2, xmm8
+ pxor xmm3, xmm8
+ pxor xmm4, xmm8
+ pxor xmm5, xmm8
+ pxor xmm6, xmm8
+ pxor xmm7, xmm8

 IF enc eq 1
 rnd textequ <aes_rnd>
@@ -136,11 +142,12 @@ ENDIF
 movdqu [4*16 + output], xmm4
 movdqu [5*16 + output], xmm5
 movdqu [6*16 + output], xmm6
+ movdqu [7*16 + output], xmm7

- lea input, [7*16 + input]
- lea output, [7*16 + output]
- sub inputLen, 7*16
- jmp loop7
+ lea input, [8*16 + input]
+ lea output, [8*16 + output]
+ sub inputLen, 8*16
+ jmp loop8

 loop1:
 cmp inputLen, 1*16
@@ -167,54 +174,46 @@ loop1:
 jmp loop1

 bail:
- xor eax, eax
- pop inputLen
- ret
+ xor rax, rax

+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
+ ret
 ENDM

-ALIGN 16
 intel_aes_encrypt_ecb_128 PROC
 gen_aes_ecb_func 1, 10
 intel_aes_encrypt_ecb_128 ENDP

-ALIGN 16
 intel_aes_encrypt_ecb_192 PROC
 gen_aes_ecb_func 1, 12
 intel_aes_encrypt_ecb_192 ENDP

-ALIGN 16
 intel_aes_encrypt_ecb_256 PROC
 gen_aes_ecb_func 1, 14
 intel_aes_encrypt_ecb_256 ENDP

-ALIGN 16
 intel_aes_decrypt_ecb_128 PROC
 gen_aes_ecb_func 0, 10
 intel_aes_decrypt_ecb_128 ENDP

-ALIGN 16
 intel_aes_decrypt_ecb_192 PROC
 gen_aes_ecb_func 0, 12
 intel_aes_decrypt_ecb_192 ENDP

-ALIGN 16
 intel_aes_decrypt_ecb_256 PROC
 gen_aes_ecb_func 0, 14
 intel_aes_decrypt_ecb_256 ENDP


-KEY textequ <ecx>
-KS textequ <edx>
-ITR textequ <eax>
+KEY textequ <rcx>
+KS textequ <rdx>
+ITR textequ <r8>

-ALIGN 16
 intel_aes_encrypt_init_128 PROC

- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
-
-
 movdqu xmm1, [KEY]
 movdqu [KS], xmm1
 movdqa xmm2, xmm1
@@ -280,12 +279,8 @@ Lenc_128_ks_loop:
 intel_aes_encrypt_init_128 ENDP


-ALIGN 16
 intel_aes_decrypt_init_128 PROC

- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
-
 push KS
 push KEY

@@ -320,16 +315,15 @@ intel_aes_decrypt_init_128 PROC
 intel_aes_decrypt_init_128 ENDP


-ALIGN 16
 intel_aes_encrypt_init_192 PROC

- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
+ sub rsp, 16*2
+ movdqu [16*0 + rsp], xmm6
+ movdqu [16*1 + rsp], xmm7

- pxor xmm3, xmm3
 movdqu xmm1, [KEY]
- pinsrd xmm3, DWORD PTR [16 + KEY], 0
- pinsrd xmm3, DWORD PTR [20 + KEY], 1
+ mov ITR, [16 + KEY]
+ movd xmm3, ITR

 movdqu [KS], xmm1
 movdqa xmm5, xmm3
@@ -396,14 +390,14 @@ Lenc_192_ks_loop:
 jnz Lenc_192_ks_loop

 movdqu [16 + KS], xmm5
-ret
+
+ movdqu xmm7, [16*1 + rsp]
+ movdqu xmm6, [16*0 + rsp]
+ add rsp, 16*2
+ ret
 intel_aes_encrypt_init_192 ENDP

-ALIGN 16
 intel_aes_decrypt_init_192 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
-
 push KS
 push KEY

@@ -437,11 +431,12 @@ intel_aes_decrypt_init_192 PROC
 ret
 intel_aes_decrypt_init_192 ENDP

-ALIGN 16
+
 intel_aes_encrypt_init_256 PROC
+ sub rsp, 16*2
+ movdqu [16*0 + rsp], xmm6
+ movdqu [16*1 + rsp], xmm7

- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]
 movdqu xmm1, [16*0 + KEY]
 movdqu xmm3, [16*1 + KEY]

@@ -502,14 +497,15 @@ Lenc_256_ks_loop:
 pxor xmm1, xmm2
 movdqu [16*2 + KS], xmm1

+ movdqu xmm7, [16*1 + rsp]
+ movdqu xmm6, [16*0 + rsp]
+ add rsp, 16*2
 ret
+
 intel_aes_encrypt_init_256 ENDP

-ALIGN 16
-intel_aes_decrypt_init_256 PROC
- mov KEY, [esp + 1*4 + 0*4]
- mov KS, [esp + 1*4 + 1*4]

+intel_aes_decrypt_init_256 PROC
 push KS
 push KEY

@@ -550,14 +546,16 @@ gen_aes_cbc_enc_func MACRO rnds
 LOCAL loop1
 LOCAL bail

- push inputLen
+ mov input, [rsp + 1*8 + 8*4]
+ mov inputLen, [rsp + 1*8 + 8*5]

- mov ctx, [esp + 2*4 + 0*4]
- mov output, [esp + 2*4 + 1*4]
- mov input, [esp + 2*4 + 4*4]
- mov inputLen, [esp + 2*4 + 5*4]
+ sub rsp, 3*16

- lea ctx, [44+ctx]
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8
+
+ lea ctx, [48+ctx]

 movdqu xmm0, [-32+ctx]

@@ -566,6 +564,7 @@ LOCAL bail
 movdqu xmm4, [2*16 + ctx]
 movdqu xmm5, [3*16 + ctx]
 movdqu xmm6, [4*16 + ctx]
+ movdqu xmm7, [5*16 + ctx]

 loop1:
 cmp inputLen, 1*16
@@ -579,15 +578,16 @@ loop1:
 aesenc xmm0, xmm4
 aesenc xmm0, xmm5
 aesenc xmm0, xmm6
+ aesenc xmm0, xmm7

- i = 5
+ i = 6
 WHILE i LT rnds
- movdqu xmm7, [i*16 + ctx]
- aesenc xmm0, xmm7
+ movdqu xmm8, [i*16 + ctx]
+ aesenc xmm0, xmm8
 i = i+1
 ENDM
- movdqu xmm7, [rnds*16 + ctx]
- aesenclast xmm0, xmm7
+ movdqu xmm8, [rnds*16 + ctx]
+ aesenclast xmm0, xmm8

 movdqu [output], xmm0

@@ -599,30 +599,36 @@ loop1:
 bail:
 movdqu [-32+ctx], xmm0

- xor eax, eax
- pop inputLen
+ xor rax, rax
+
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
 ret

 ENDM

 gen_aes_cbc_dec_func MACRO rnds

-LOCAL loop7
+LOCAL loop8
 LOCAL loop1
 LOCAL dec1
 LOCAL bail

- push inputLen
+ mov input, [rsp + 1*8 + 8*4]
+ mov inputLen, [rsp + 1*8 + 8*5]

- mov ctx, [esp + 2*4 + 0*4]
- mov output, [esp + 2*4 + 1*4]
- mov input, [esp + 2*4 + 4*4]
- mov inputLen, [esp + 2*4 + 5*4]
+ sub rsp, 3*16

- lea ctx, [44+ctx]
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8

-loop7:
- cmp inputLen, 7*16
+ lea ctx, [48+ctx]
+
+loop8:
+ cmp inputLen, 8*16
 jb dec1

 movdqu xmm0, [0*16 + input]
@@ -632,15 +638,17 @@ loop7:
 movdqu xmm4, [4*16 + input]
 movdqu xmm5, [5*16 + input]
 movdqu xmm6, [6*16 + input]
-
- movdqu xmm7, [0*16 + ctx]
- pxor xmm0, xmm7
- pxor xmm1, xmm7
- pxor xmm2, xmm7
- pxor xmm3, xmm7
- pxor xmm4, xmm7
- pxor xmm5, xmm7
- pxor xmm6, xmm7
+ movdqu xmm7, [7*16 + input]
+
+ movdqu xmm8, [0*16 + ctx]
+ pxor xmm0, xmm8
+ pxor xmm1, xmm8
+ pxor xmm2, xmm8
+ pxor xmm3, xmm8
+ pxor xmm4, xmm8
+ pxor xmm5, xmm8
+ pxor xmm6, xmm8
+ pxor xmm7, xmm8

 i = 1
 WHILE i LT rnds
@@ -649,21 +657,23 @@ loop7:
 ENDM
 aes_dec_last_rnd rnds

- movdqu xmm7, [-32 + ctx]
- pxor xmm0, xmm7
- movdqu xmm7, [0*16 + input]
- pxor xmm1, xmm7
- movdqu xmm7, [1*16 + input]
- pxor xmm2, xmm7
- movdqu xmm7, [2*16 + input]
- pxor xmm3, xmm7
- movdqu xmm7, [3*16 + input]
- pxor xmm4, xmm7
- movdqu xmm7, [4*16 + input]
- pxor xmm5, xmm7
- movdqu xmm7, [5*16 + input]
- pxor xmm6, xmm7
- movdqu xmm7, [6*16 + input]
+ movdqu xmm8, [-32 + ctx]
+ pxor xmm0, xmm8
+ movdqu xmm8, [0*16 + input]
+ pxor xmm1, xmm8
+ movdqu xmm8, [1*16 + input]
+ pxor xmm2, xmm8
+ movdqu xmm8, [2*16 + input]
+ pxor xmm3, xmm8
+ movdqu xmm8, [3*16 + input]
+ pxor xmm4, xmm8
+ movdqu xmm8, [4*16 + input]
+ pxor xmm5, xmm8
+ movdqu xmm8, [5*16 + input]
+ pxor xmm6, xmm8
+ movdqu xmm8, [6*16 + input]
+ pxor xmm7, xmm8
+ movdqu xmm8, [7*16 + input]

 movdqu [0*16 + output], xmm0
 movdqu [1*16 + output], xmm1
@@ -672,12 +682,13 @@ loop7:
 movdqu [4*16 + output], xmm4
 movdqu [5*16 + output], xmm5
 movdqu [6*16 + output], xmm6
- movdqu [-32 + ctx], xmm7
+ movdqu [7*16 + output], xmm7
+ movdqu [-32 + ctx], xmm8

- lea input, [7*16 + input]
- lea output, [7*16 + output]
- sub inputLen, 7*16
- jmp loop7
+ lea input, [8*16 + input]
+ lea output, [8*16 + output]
+ sub inputLen, 8*16
+ jmp loop8
 dec1:

 movdqu xmm3, [-32 + ctx]
@@ -711,143 +722,152 @@ loop1:

 bail:
 movdqu [-32 + ctx], xmm3
- xor eax, eax
- pop inputLen
+ xor rax, rax
+
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16
 ret
 ENDM

-ALIGN 16
 intel_aes_encrypt_cbc_128 PROC
 gen_aes_cbc_enc_func 10
 intel_aes_encrypt_cbc_128 ENDP

-ALIGN 16
 intel_aes_encrypt_cbc_192 PROC
 gen_aes_cbc_enc_func 12
 intel_aes_encrypt_cbc_192 ENDP

-ALIGN 16
 intel_aes_encrypt_cbc_256 PROC
 gen_aes_cbc_enc_func 14
 intel_aes_encrypt_cbc_256 ENDP

-ALIGN 16
 intel_aes_decrypt_cbc_128 PROC
 gen_aes_cbc_dec_func 10
 intel_aes_decrypt_cbc_128 ENDP

-ALIGN 16
 intel_aes_decrypt_cbc_192 PROC
 gen_aes_cbc_dec_func 12
 intel_aes_decrypt_cbc_192 ENDP

-ALIGN 16
 intel_aes_decrypt_cbc_256 PROC
 gen_aes_cbc_dec_func 14
 intel_aes_decrypt_cbc_256 ENDP



-ctrCtx textequ <esi>
-CTR textequ <ebx>
+ctrCtx textequ <r10>
+CTR textequ <r11d>
+CTRSave textequ <eax>

 gen_aes_ctr_func MACRO rnds

-LOCAL loop7
+LOCAL loop8
 LOCAL loop1
 LOCAL enc1
 LOCAL bail

- push inputLen
- push ctrCtx
- push CTR
- push ebp
+ mov input, [rsp + 8*1 + 4*8]
+ mov inputLen, [rsp + 8*1 + 5*8]
+
+ mov ctrCtx, ctx
+ mov ctx, [8+ctrCtx]
+ lea ctx, [48+ctx]

- mov ctrCtx, [esp + 4*5 + 0*4]
- mov output, [esp + 4*5 + 1*4]
- mov input, [esp + 4*5 + 4*4]
- mov inputLen, [esp + 4*5 + 5*4]
+ sub rsp, 3*16
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+ movdqu [rsp + 2*16], xmm8

- mov ctx, [4+ctrCtx]
- lea ctx, [44+ctx]

- mov ebp, esp
- sub esp, 7*16
- and esp, -16
+ push rbp
+ mov rbp, rsp
+ sub rsp, 8*16
+ and rsp, -16

- movdqu xmm0, [8+ctrCtx]
- mov ctrCtx, [ctrCtx + 8 + 3*4]
- bswap ctrCtx
+
+ movdqu xmm0, [16+ctrCtx]
+ mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
+ bswap CTRSave
 movdqu xmm1, [ctx + 0*16]

 pxor xmm0, xmm1

- movdqa [esp + 0*16], xmm0
- movdqa [esp + 1*16], xmm0
- movdqa [esp + 2*16], xmm0
- movdqa [esp + 3*16], xmm0
- movdqa [esp + 4*16], xmm0
- movdqa [esp + 5*16], xmm0
- movdqa [esp + 6*16], xmm0
+ movdqa [rsp + 0*16], xmm0
+ movdqa [rsp + 1*16], xmm0
+ movdqa [rsp + 2*16], xmm0
+ movdqa [rsp + 3*16], xmm0
+ movdqa [rsp + 4*16], xmm0
+ movdqa [rsp + 5*16], xmm0
+ movdqa [rsp + 6*16], xmm0
+ movdqa [rsp + 7*16], xmm0
+
+ inc CTRSave
+ mov CTR, CTRSave
+ bswap CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 1*16 + 3*4], CTR

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 1*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 2*16 + 3*4], CTR

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 2*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 3*16 + 3*4], CTR

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 3*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 4*16 + 3*4], CTR

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 4*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 5*16 + 3*4], CTR

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 5*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 6*16 + 3*4], CTR

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + 6*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + 7*16 + 3*4], CTR


-loop7:
- cmp inputLen, 7*16
+loop8:
+ cmp inputLen, 8*16
 jb loop1

- movdqu xmm0, [0*16 + esp]
- movdqu xmm1, [1*16 + esp]
- movdqu xmm2, [2*16 + esp]
- movdqu xmm3, [3*16 + esp]
- movdqu xmm4, [4*16 + esp]
- movdqu xmm5, [5*16 + esp]
- movdqu xmm6, [6*16 + esp]
+ movdqu xmm0, [0*16 + rsp]
+ movdqu xmm1, [1*16 + rsp]
+ movdqu xmm2, [2*16 + rsp]
+ movdqu xmm3, [3*16 + rsp]
+ movdqu xmm4, [4*16 + rsp]
+ movdqu xmm5, [5*16 + rsp]
+ movdqu xmm6, [6*16 + rsp]
+ movdqu xmm7, [7*16 + rsp]

 i = 1
- WHILE i LE 7
+ WHILE i LE 8
 aes_rnd i

- inc ctrCtx
- mov CTR, ctrCtx
+ inc CTRSave
+ mov CTR, CTRSave
 bswap CTR
- xor CTR, [ctx + 3*4]
- mov [esp + (i-1)*16 + 3*4], CTR
+ xor CTR, DWORD PTR [ctx + 3*4]
+ mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR

 i = i+1
 ENDM
@@ -857,20 +877,22 @@ loop7:
 ENDM
 aes_last_rnd rnds

- movdqu xmm7, [0*16 + input]
- pxor xmm0, xmm7
- movdqu xmm7, [1*16 + input]
- pxor xmm1, xmm7
- movdqu xmm7, [2*16 + input]
- pxor xmm2, xmm7
- movdqu xmm7, [3*16 + input]
- pxor xmm3, xmm7
- movdqu xmm7, [4*16 + input]
- pxor xmm4, xmm7
- movdqu xmm7, [5*16 + input]
- pxor xmm5, xmm7
- movdqu xmm7, [6*16 + input]
- pxor xmm6, xmm7
+ movdqu xmm8, [0*16 + input]
+ pxor xmm0, xmm8
+ movdqu xmm8, [1*16 + input]
+ pxor xmm1, xmm8
+ movdqu xmm8, [2*16 + input]
+ pxor xmm2, xmm8
+ movdqu xmm8, [3*16 + input]
+ pxor xmm3, xmm8
+ movdqu xmm8, [4*16 + input]
+ pxor xmm4, xmm8
+ movdqu xmm8, [5*16 + input]
+ pxor xmm5, xmm8
+ movdqu xmm8, [6*16 + input]
+ pxor xmm6, xmm8
+ movdqu xmm8, [7*16 + input]
+ pxor xmm7, xmm8

 movdqu [0*16 + output], xmm0
 movdqu [1*16 + output], xmm1
@@ -879,19 +901,20 @@ loop7:
 movdqu [4*16 + output], xmm4
 movdqu [5*16 + output], xmm5
 movdqu [6*16 + output], xmm6
+ movdqu [7*16 + output], xmm7

- lea input, [7*16 + input]
- lea output, [7*16 + output]
- sub inputLen, 7*16
- jmp loop7
+ lea input, [8*16 + input]
+ lea output, [8*16 + output]
+ sub inputLen, 8*16
+ jmp loop8


 loop1:
 cmp inputLen, 1*16
 jb bail

- movdqu xmm0, [esp]
- add esp, 16
+ movdqu xmm0, [rsp]
+ add rsp, 16

 i = 1
 WHILE i LT rnds
@@ -913,34 +936,33 @@ loop1:

 bail:

- mov ctrCtx, [ebp + 4*5 + 0*4]
- movdqu xmm0, [esp]
+ movdqu xmm0, [rsp]
 movdqu xmm1, [ctx + 0*16]
 pxor xmm0, xmm1
- movdqu [8+ctrCtx], xmm0
+ movdqu [16+ctrCtx], xmm0
+
+
+ xor rax, rax
+ mov rsp, rbp
+ pop rbp

+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ movdqu xmm8, [rsp + 2*16]
+ add rsp, 3*16

- xor eax, eax
- mov esp, ebp
- pop ebp
- pop CTR
- pop ctrCtx
- pop inputLen
 ret
 ENDM


-ALIGN 16
 intel_aes_encrypt_ctr_128 PROC
 gen_aes_ctr_func 10
 intel_aes_encrypt_ctr_128 ENDP

-ALIGN 16
 intel_aes_encrypt_ctr_192 PROC
 gen_aes_ctr_func 12
 intel_aes_encrypt_ctr_192 ENDP

-ALIGN 16
 intel_aes_encrypt_ctr_256 PROC
 gen_aes_ctr_func 14
 intel_aes_encrypt_ctr_256 ENDP
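
Notes on the port (not part of the patch; the demo function below is a hypothetical sketch written for this summary, not code from NSS). Two Win64 conventions drive most of the mechanical changes. First, the x86 build pulled every argument off the stack per cdecl, while on Win64 the first four integer arguments arrive in rcx/rdx/r8/r9 and the fifth and sixth sit above the return address (1*8) plus the caller's 32-byte shadow space (8*4), which is where offsets like [rsp + 1*8 + 8*4] come from. Second, Win64 makes xmm6-xmm15 callee-saved, so each function now spills xmm6/xmm7/xmm8 to a 3*16-byte scratch area before using them; in exchange, the extra register xmm8 takes over the round-key role that xmm7 played on x86, freeing xmm7 to carry an eighth data block and letting the bulk loops widen from seven to eight blocks per pass (loop7 -> loop8). A minimal sketch of the two conventions:

    ; Hypothetical demo, assuming a C prototype with six
    ; pointer-sized arguments: demo(a, b, c, d, e, f)
    .CODE
    win64_args_demo PROC
        ; a..d arrive in rcx, rdx, r8, r9; e and f live on the stack,
        ; past the return address (1*8) and the 32-byte shadow space (8*4)
        mov rax, [rsp + 1*8 + 8*4]   ; 5th argument
        mov r10, [rsp + 1*8 + 8*5]   ; 6th argument

        sub rsp, 1*16                ; scratch slot, as the patch does with 3*16
        movdqu [rsp + 0*16], xmm6    ; xmm6..xmm15 are callee-saved on Win64
        pxor xmm6, xmm6              ; ... xmm6 may now be clobbered freely ...
        movdqu xmm6, [rsp + 0*16]    ; restore before returning
        add rsp, 1*16
        ret
    win64_args_demo ENDP
    END

The remaining deltas follow the same reasoning: the [44+ctx] -> [48+ctx], [4+ctrCtx] -> [8+ctrCtx], and [8+ctrCtx] -> [16+ctrCtx] offset changes are consistent with context structs whose leading pointer-sized fields grow from 4 to 8 bytes, and the deleted .MODEL FLAT, C and .XMM directives are 32-bit-only MASM constructs that ml64 does not accept (64-bit MASM is implicitly flat).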