Chromium Code Reviews
| Index: nss/lib/freebl/intel-aes-x64-masm.asm |
| diff --git a/nss/lib/freebl/intel-aes-x86-masm.asm b/nss/lib/freebl/intel-aes-x64-masm.asm |
| similarity index 62% |
| copy from nss/lib/freebl/intel-aes-x86-masm.asm |
| copy to nss/lib/freebl/intel-aes-x64-masm.asm |
| index 7d805e7660f15d20f89911424dc83dbb7d906dca..ef5c76ba28370882583003116b9aeeb3505e256d 100644 |
| --- a/nss/lib/freebl/intel-aes-x86-masm.asm |
| +++ b/nss/lib/freebl/intel-aes-x64-masm.asm |
| @@ -10,9 +10,6 @@ |
| ; Please send feedback directly to crypto.feedback.alias@intel.com |
|
davidben 2015/05/27 19:40:40: (rubber stamp)
|
| -.MODEL FLAT, C |
| -.XMM |
| - |
| .DATA |
| ALIGN 16 |
| Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh |
| @@ -23,74 +20,81 @@ Lcon2 dd 1bh,1bh,1bh,1bh |
| .CODE |
| -ctx textequ <ecx> |
| -output textequ <edx> |
| -input textequ <eax> |
| -inputLen textequ <edi> |
| +ctx textequ <rcx> |
| +output textequ <rdx> |
| +input textequ <r8> |
| +inputLen textequ <r9d> |
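
On x86 this file read every argument off the stack (cdecl); under the Win64 calling convention the first four integer or pointer arguments arrive in rcx, rdx, r8 and r9, which is all these textequ renames encode. Note that input is bound to r8 and then immediately reloaded from the stack below: the real input pointer is the fifth C argument, so r8 merely serves as a scratch name. A sketch of the seven-argument freebl prototype these assignments appear to assume (names are illustrative; see NSS's intel-aes.h for the real declaration):

    typedef struct AESContextStr AESContext;   /* opaque for this sketch */
    typedef int SECStatus;                     /* stand-in return type   */

    SECStatus intel_aes_encrypt_ecb_128(
        AESContext *cx,               /* rcx                        */
        unsigned char *output,        /* rdx                        */
        unsigned int *outputLen,      /* r8, unused by the asm      */
        unsigned int maxOutputLen,    /* r9, unused by the asm      */
        const unsigned char *input,   /* stack, [rsp + 40] at entry */
        unsigned int inputLen,        /* stack, [rsp + 48] at entry */
        unsigned int blocksize);      /* stack, unused by the asm   */
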
| aes_rnd MACRO i |
| - movdqu xmm7, [i*16 + ctx] |
| - aesenc xmm0, xmm7 |
| - aesenc xmm1, xmm7 |
| - aesenc xmm2, xmm7 |
| - aesenc xmm3, xmm7 |
| - aesenc xmm4, xmm7 |
| - aesenc xmm5, xmm7 |
| - aesenc xmm6, xmm7 |
| + movdqu xmm8, [i*16 + ctx] |
| + aesenc xmm0, xmm8 |
| + aesenc xmm1, xmm8 |
| + aesenc xmm2, xmm8 |
| + aesenc xmm3, xmm8 |
| + aesenc xmm4, xmm8 |
| + aesenc xmm5, xmm8 |
| + aesenc xmm6, xmm8 |
| + aesenc xmm7, xmm8 |
| ENDM |
| aes_last_rnd MACRO i |
| - movdqu xmm7, [i*16 + ctx] |
| - aesenclast xmm0, xmm7 |
| - aesenclast xmm1, xmm7 |
| - aesenclast xmm2, xmm7 |
| - aesenclast xmm3, xmm7 |
| - aesenclast xmm4, xmm7 |
| - aesenclast xmm5, xmm7 |
| - aesenclast xmm6, xmm7 |
| + movdqu xmm8, [i*16 + ctx] |
| + aesenclast xmm0, xmm8 |
| + aesenclast xmm1, xmm8 |
| + aesenclast xmm2, xmm8 |
| + aesenclast xmm3, xmm8 |
| + aesenclast xmm4, xmm8 |
| + aesenclast xmm5, xmm8 |
| + aesenclast xmm6, xmm8 |
| + aesenclast xmm7, xmm8 |
| ENDM |
| aes_dec_rnd MACRO i |
| - movdqu xmm7, [i*16 + ctx] |
| - aesdec xmm0, xmm7 |
| - aesdec xmm1, xmm7 |
| - aesdec xmm2, xmm7 |
| - aesdec xmm3, xmm7 |
| - aesdec xmm4, xmm7 |
| - aesdec xmm5, xmm7 |
| - aesdec xmm6, xmm7 |
| + movdqu xmm8, [i*16 + ctx] |
| + aesdec xmm0, xmm8 |
| + aesdec xmm1, xmm8 |
| + aesdec xmm2, xmm8 |
| + aesdec xmm3, xmm8 |
| + aesdec xmm4, xmm8 |
| + aesdec xmm5, xmm8 |
| + aesdec xmm6, xmm8 |
| + aesdec xmm7, xmm8 |
| ENDM |
| aes_dec_last_rnd MACRO i |
| - movdqu xmm7, [i*16 + ctx] |
| - aesdeclast xmm0, xmm7 |
| - aesdeclast xmm1, xmm7 |
| - aesdeclast xmm2, xmm7 |
| - aesdeclast xmm3, xmm7 |
| - aesdeclast xmm4, xmm7 |
| - aesdeclast xmm5, xmm7 |
| - aesdeclast xmm6, xmm7 |
| + movdqu xmm8, [i*16 + ctx] |
| + aesdeclast xmm0, xmm8 |
| + aesdeclast xmm1, xmm8 |
| + aesdeclast xmm2, xmm8 |
| + aesdeclast xmm3, xmm8 |
| + aesdeclast xmm4, xmm8 |
| + aesdeclast xmm5, xmm8 |
| + aesdeclast xmm6, xmm8 |
| + aesdeclast xmm7, xmm8 |
| ENDM |
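
The xmm7-to-xmm8 rename in all four round macros is what buys the wider pipeline: x64 exposes xmm8-xmm15, so the shared round key can live in xmm8 and xmm7 becomes an eighth data lane. A minimal C-intrinsics sketch of what aes_rnd does per round, assuming AES-NI is available (function and parameter names are illustrative):

    #include <immintrin.h>

    /* One unaligned round-key load shared across eight in-flight blocks. */
    static void aes_rnd_sketch(__m128i blk[8], const unsigned char *ks, int i)
    {
        __m128i rk = _mm_loadu_si128((const __m128i *)(ks + 16 * i));
        for (int j = 0; j < 8; ++j)
            blk[j] = _mm_aesenc_si128(blk[j], rk);
    }
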
| gen_aes_ecb_func MACRO enc, rnds |
| -LOCAL loop7 |
| +LOCAL loop8 |
| LOCAL loop1 |
| LOCAL bail |
| - push inputLen |
| + xor inputLen, inputLen |
| + mov input, [rsp + 1*8 + 8*4] |
| + mov inputLen, [rsp + 1*8 + 8*5] |
| + |
| + sub rsp, 3*16 |
| - mov ctx, [esp + 2*4 + 0*4] |
| - mov output, [esp + 2*4 + 1*4] |
| - mov input, [esp + 2*4 + 4*4] |
| - mov inputLen, [esp + 2*4 + 5*4] |
| + movdqu [rsp + 0*16], xmm6 |
| + movdqu [rsp + 1*16], xmm7 |
| + movdqu [rsp + 2*16], xmm8 |
| - lea ctx, [44+ctx] |
| + lea ctx, [48+ctx] |
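
Two prologue changes worth noting. First, the old loads of ctx and output disappear because those arguments already sit in rcx and rdx; only the fifth and sixth arguments remain on the stack, above the return address and the 32-byte shadow area Win64 reserves for the register arguments. Second, xmm6-xmm15 are callee-saved under Win64 (the x86 ABI only required preserving edi, hence the old push), so the widened pipeline must spill xmm6-xmm8. The 44-to-48 bump in the key-schedule offset presumably tracks a pointer field in AESContext growing from 4 to 8 bytes. A sketch of the entry-time stack offsets the two loads rely on (constants are illustrative):

    /* Offsets from rsp immediately after the CALL, per the Win64 ABI. */
    enum {
        RETURN_ADDR   = 8,                            /* pushed by CALL    */
        SHADOW_AREA   = 4 * 8,                        /* homes for rcx..r9 */
        ARG5_INPUT    = RETURN_ADDR + SHADOW_AREA,    /* [rsp + 1*8+8*4]   */
        ARG6_INPUTLEN = RETURN_ADDR + SHADOW_AREA + 8 /* [rsp + 1*8+8*5]   */
    };
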
| -loop7: |
| - cmp inputLen, 7*16 |
| +loop8: |
| + cmp inputLen, 8*16 |
| jb loop1 |
| movdqu xmm0, [0*16 + input] |
| @@ -100,15 +104,17 @@ loop7: |
| movdqu xmm4, [4*16 + input] |
| movdqu xmm5, [5*16 + input] |
| movdqu xmm6, [6*16 + input] |
| - |
| - movdqu xmm7, [0*16 + ctx] |
| - pxor xmm0, xmm7 |
| - pxor xmm1, xmm7 |
| - pxor xmm2, xmm7 |
| - pxor xmm3, xmm7 |
| - pxor xmm4, xmm7 |
| - pxor xmm5, xmm7 |
| - pxor xmm6, xmm7 |
| + movdqu xmm7, [7*16 + input] |
| + |
| + movdqu xmm8, [0*16 + ctx] |
| + pxor xmm0, xmm8 |
| + pxor xmm1, xmm8 |
| + pxor xmm2, xmm8 |
| + pxor xmm3, xmm8 |
| + pxor xmm4, xmm8 |
| + pxor xmm5, xmm8 |
| + pxor xmm6, xmm8 |
| + pxor xmm7, xmm8 |
| IF enc eq 1 |
| rnd textequ <aes_rnd> |
| @@ -136,11 +142,12 @@ ENDIF |
| movdqu [4*16 + output], xmm4 |
| movdqu [5*16 + output], xmm5 |
| movdqu [6*16 + output], xmm6 |
| + movdqu [7*16 + output], xmm7 |
| - lea input, [7*16 + input] |
| - lea output, [7*16 + output] |
| - sub inputLen, 7*16 |
| - jmp loop7 |
| + lea input, [8*16 + input] |
| + lea output, [8*16 + output] |
| + sub inputLen, 8*16 |
| + jmp loop8 |
| loop1: |
| cmp inputLen, 1*16 |
| @@ -167,54 +174,46 @@ loop1: |
| jmp loop1 |
| bail: |
| - xor eax, eax |
| - pop inputLen |
| - ret |
| + xor rax, rax |
| + movdqu xmm6, [rsp + 0*16] |
| + movdqu xmm7, [rsp + 1*16] |
| + movdqu xmm8, [rsp + 2*16] |
| + add rsp, 3*16 |
| + ret |
| ENDM |
| -ALIGN 16 |
| intel_aes_encrypt_ecb_128 PROC |
| gen_aes_ecb_func 1, 10 |
| intel_aes_encrypt_ecb_128 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_ecb_192 PROC |
| gen_aes_ecb_func 1, 12 |
| intel_aes_encrypt_ecb_192 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_ecb_256 PROC |
| gen_aes_ecb_func 1, 14 |
| intel_aes_encrypt_ecb_256 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_ecb_128 PROC |
| gen_aes_ecb_func 0, 10 |
| intel_aes_decrypt_ecb_128 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_ecb_192 PROC |
| gen_aes_ecb_func 0, 12 |
| intel_aes_decrypt_ecb_192 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_ecb_256 PROC |
| gen_aes_ecb_func 0, 14 |
| intel_aes_decrypt_ecb_256 ENDP |
| -KEY textequ <ecx> |
| -KS textequ <edx> |
| -ITR textequ <eax> |
| +KEY textequ <rcx> |
| +KS textequ <rdx> |
| +ITR textequ <r8> |
| -ALIGN 16 |
| intel_aes_encrypt_init_128 PROC |
| - mov KEY, [esp + 1*4 + 0*4] |
| - mov KS, [esp + 1*4 + 1*4] |
| - |
| - |
| movdqu xmm1, [KEY] |
| movdqu [KS], xmm1 |
| movdqa xmm2, xmm1 |
| @@ -280,12 +279,8 @@ Lenc_128_ks_loop: |
| intel_aes_encrypt_init_128 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_init_128 PROC |
| - mov KEY, [esp + 1*4 + 0*4] |
| - mov KS, [esp + 1*4 + 1*4] |
| - |
| push KS |
| push KEY |
| @@ -320,16 +315,15 @@ intel_aes_decrypt_init_128 PROC |
| intel_aes_decrypt_init_128 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_init_192 PROC |
| - mov KEY, [esp + 1*4 + 0*4] |
| - mov KS, [esp + 1*4 + 1*4] |
| + sub rsp, 16*2 |
| + movdqu [16*0 + rsp], xmm6 |
| + movdqu [16*1 + rsp], xmm7 |
| - pxor xmm3, xmm3 |
| movdqu xmm1, [KEY] |
| - pinsrd xmm3, DWORD PTR [16 + KEY], 0 |
| - pinsrd xmm3, DWORD PTR [20 + KEY], 1 |
| + mov ITR, [16 + KEY] |
| + movd xmm3, ITR |
| movdqu [KS], xmm1 |
| movdqa xmm5, xmm3 |
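
The 192-bit schedule needs key bytes 16..23 in the low qword of xmm3. On x86 that took a pxor plus two dword pinsrd inserts; on x64 all eight bytes can travel through one 64-bit GPR, and since MASM encodes movd with a 64-bit source as movq (which zeroes the upper lanes), the explicit pxor becomes unnecessary. An intrinsics sketch of the equivalent load, with illustrative names:

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Bytes 16..23 of the 24-byte key into the low qword, upper lanes zero. */
    static __m128i load_key_tail_sketch(const unsigned char *key)
    {
        int64_t hi;
        memcpy(&hi, key + 16, sizeof hi);
        return _mm_cvtsi64_si128(hi);   /* the movq xmm, r64 form */
    }
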
| @@ -396,14 +390,14 @@ Lenc_192_ks_loop: |
| jnz Lenc_192_ks_loop |
| movdqu [16 + KS], xmm5 |
| -ret |
| + |
| + movdqu xmm7, [16*1 + rsp] |
| + movdqu xmm6, [16*0 + rsp] |
| + add rsp, 16*2 |
| + ret |
| intel_aes_encrypt_init_192 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_init_192 PROC |
| - mov KEY, [esp + 1*4 + 0*4] |
| - mov KS, [esp + 1*4 + 1*4] |
| - |
| push KS |
| push KEY |
| @@ -437,11 +431,12 @@ intel_aes_decrypt_init_192 PROC |
| ret |
| intel_aes_decrypt_init_192 ENDP |
| -ALIGN 16 |
| + |
| intel_aes_encrypt_init_256 PROC |
| + sub rsp, 16*2 |
| + movdqu [16*0 + rsp], xmm6 |
| + movdqu [16*1 + rsp], xmm7 |
| - mov KEY, [esp + 1*4 + 0*4] |
| - mov KS, [esp + 1*4 + 1*4] |
| movdqu xmm1, [16*0 + KEY] |
| movdqu xmm3, [16*1 + KEY] |
| @@ -502,14 +497,15 @@ Lenc_256_ks_loop: |
| pxor xmm1, xmm2 |
| movdqu [16*2 + KS], xmm1 |
| + movdqu xmm7, [16*1 + rsp] |
| + movdqu xmm6, [16*0 + rsp] |
| + add rsp, 16*2 |
| ret |
| + |
| intel_aes_encrypt_init_256 ENDP |
| -ALIGN 16 |
| -intel_aes_decrypt_init_256 PROC |
| - mov KEY, [esp + 1*4 + 0*4] |
| - mov KS, [esp + 1*4 + 1*4] |
| +intel_aes_decrypt_init_256 PROC |
| push KS |
| push KEY |
| @@ -550,14 +546,16 @@ gen_aes_cbc_enc_func MACRO rnds |
| LOCAL loop1 |
| LOCAL bail |
| - push inputLen |
| + mov input, [rsp + 1*8 + 8*4] |
| + mov inputLen, [rsp + 1*8 + 8*5] |
| - mov ctx, [esp + 2*4 + 0*4] |
| - mov output, [esp + 2*4 + 1*4] |
| - mov input, [esp + 2*4 + 4*4] |
| - mov inputLen, [esp + 2*4 + 5*4] |
| + sub rsp, 3*16 |
| - lea ctx, [44+ctx] |
| + movdqu [rsp + 0*16], xmm6 |
| + movdqu [rsp + 1*16], xmm7 |
| + movdqu [rsp + 2*16], xmm8 |
| + |
| + lea ctx, [48+ctx] |
| movdqu xmm0, [-32+ctx] |
| @@ -566,6 +564,7 @@ LOCAL bail |
| movdqu xmm4, [2*16 + ctx] |
| movdqu xmm5, [3*16 + ctx] |
| movdqu xmm6, [4*16 + ctx] |
| + movdqu xmm7, [5*16 + ctx] |
| loop1: |
| cmp inputLen, 1*16 |
| @@ -579,15 +578,16 @@ loop1: |
| aesenc xmm0, xmm4 |
| aesenc xmm0, xmm5 |
| aesenc xmm0, xmm6 |
| + aesenc xmm0, xmm7 |
| - i = 5 |
| + i = 6 |
| WHILE i LT rnds |
| - movdqu xmm7, [i*16 + ctx] |
| - aesenc xmm0, xmm7 |
| + movdqu xmm8, [i*16 + ctx] |
| + aesenc xmm0, xmm8 |
| i = i+1 |
| ENDM |
| - movdqu xmm7, [rnds*16 + ctx] |
| - aesenclast xmm0, xmm7 |
| + movdqu xmm8, [rnds*16 + ctx] |
| + aesenclast xmm0, xmm8 |
| movdqu [output], xmm0 |
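
CBC encryption is inherently serial (each block depends on the previous ciphertext), so only one block is ever in flight and register-resident round keys pay off: with xmm8 now the memory-load scratch, the build appears to keep round keys 0 through 5 cached in xmm2-xmm7, one more than x86 could, and the WHILE loop starts at i = 6. A logical sketch of one CBC-encrypt step, assuming rk[0..rnds] is the expanded key (illustrative, not the patch's register allocation):

    #include <immintrin.h>

    static __m128i cbc_enc_step_sketch(__m128i prev_ct, __m128i pt,
                                       const __m128i *rk, int rnds)
    {
        __m128i s = _mm_xor_si128(_mm_xor_si128(pt, prev_ct), rk[0]);
        for (int i = 1; i < rnds; ++i)
            s = _mm_aesenc_si128(s, rk[i]);       /* middle rounds */
        return _mm_aesenclast_si128(s, rk[rnds]); /* final round   */
    }
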
| @@ -599,30 +599,36 @@ loop1: |
| bail: |
| movdqu [-32+ctx], xmm0 |
| - xor eax, eax |
| - pop inputLen |
| + xor rax, rax |
| + |
| + movdqu xmm6, [rsp + 0*16] |
| + movdqu xmm7, [rsp + 1*16] |
| + movdqu xmm8, [rsp + 2*16] |
| + add rsp, 3*16 |
| ret |
| ENDM |
| gen_aes_cbc_dec_func MACRO rnds |
| -LOCAL loop7 |
| +LOCAL loop8 |
| LOCAL loop1 |
| LOCAL dec1 |
| LOCAL bail |
| - push inputLen |
| + mov input, [rsp + 1*8 + 8*4] |
| + mov inputLen, [rsp + 1*8 + 8*5] |
| - mov ctx, [esp + 2*4 + 0*4] |
| - mov output, [esp + 2*4 + 1*4] |
| - mov input, [esp + 2*4 + 4*4] |
| - mov inputLen, [esp + 2*4 + 5*4] |
| + sub rsp, 3*16 |
| - lea ctx, [44+ctx] |
| + movdqu [rsp + 0*16], xmm6 |
| + movdqu [rsp + 1*16], xmm7 |
| + movdqu [rsp + 2*16], xmm8 |
| -loop7: |
| - cmp inputLen, 7*16 |
| + lea ctx, [48+ctx] |
| + |
| +loop8: |
| + cmp inputLen, 8*16 |
| jb dec1 |
| movdqu xmm0, [0*16 + input] |
| @@ -632,15 +638,17 @@ loop7: |
| movdqu xmm4, [4*16 + input] |
| movdqu xmm5, [5*16 + input] |
| movdqu xmm6, [6*16 + input] |
| - |
| - movdqu xmm7, [0*16 + ctx] |
| - pxor xmm0, xmm7 |
| - pxor xmm1, xmm7 |
| - pxor xmm2, xmm7 |
| - pxor xmm3, xmm7 |
| - pxor xmm4, xmm7 |
| - pxor xmm5, xmm7 |
| - pxor xmm6, xmm7 |
| + movdqu xmm7, [7*16 + input] |
| + |
| + movdqu xmm8, [0*16 + ctx] |
| + pxor xmm0, xmm8 |
| + pxor xmm1, xmm8 |
| + pxor xmm2, xmm8 |
| + pxor xmm3, xmm8 |
| + pxor xmm4, xmm8 |
| + pxor xmm5, xmm8 |
| + pxor xmm6, xmm8 |
| + pxor xmm7, xmm8 |
| i = 1 |
| WHILE i LT rnds |
| @@ -649,21 +657,23 @@ loop7: |
| ENDM |
| aes_dec_last_rnd rnds |
| - movdqu xmm7, [-32 + ctx] |
| - pxor xmm0, xmm7 |
| - movdqu xmm7, [0*16 + input] |
| - pxor xmm1, xmm7 |
| - movdqu xmm7, [1*16 + input] |
| - pxor xmm2, xmm7 |
| - movdqu xmm7, [2*16 + input] |
| - pxor xmm3, xmm7 |
| - movdqu xmm7, [3*16 + input] |
| - pxor xmm4, xmm7 |
| - movdqu xmm7, [4*16 + input] |
| - pxor xmm5, xmm7 |
| - movdqu xmm7, [5*16 + input] |
| - pxor xmm6, xmm7 |
| - movdqu xmm7, [6*16 + input] |
| + movdqu xmm8, [-32 + ctx] |
| + pxor xmm0, xmm8 |
| + movdqu xmm8, [0*16 + input] |
| + pxor xmm1, xmm8 |
| + movdqu xmm8, [1*16 + input] |
| + pxor xmm2, xmm8 |
| + movdqu xmm8, [2*16 + input] |
| + pxor xmm3, xmm8 |
| + movdqu xmm8, [3*16 + input] |
| + pxor xmm4, xmm8 |
| + movdqu xmm8, [4*16 + input] |
| + pxor xmm5, xmm8 |
| + movdqu xmm8, [5*16 + input] |
| + pxor xmm6, xmm8 |
| + movdqu xmm8, [6*16 + input] |
| + pxor xmm7, xmm8 |
| + movdqu xmm8, [7*16 + input] |
| movdqu [0*16 + output], xmm0 |
| movdqu [1*16 + output], xmm1 |
| @@ -672,12 +682,13 @@ loop7: |
| movdqu [4*16 + output], xmm4 |
| movdqu [5*16 + output], xmm5 |
| movdqu [6*16 + output], xmm6 |
| - movdqu [-32 + ctx], xmm7 |
| + movdqu [7*16 + output], xmm7 |
| + movdqu [-32 + ctx], xmm8 |
| - lea input, [7*16 + input] |
| - lea output, [7*16 + output] |
| - sub inputLen, 7*16 |
| - jmp loop7 |
| + lea input, [8*16 + input] |
| + lea output, [8*16 + output] |
| + sub inputLen, 8*16 |
| + jmp loop8 |
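
Unlike encryption, CBC decryption has no serial dependency, so eight blocks run in parallel. After the last AES round, each block is XORed with the preceding ciphertext block (the stored IV for block 0), and xmm8 is left holding ciphertext block 7, which the store to [-32 + ctx] turns into the chaining value for the next call. A sketch of that post-round fixup, assuming blk[] holds the raw-decrypted blocks and ct[] the original ciphertext (illustrative names):

    #include <immintrin.h>

    static void cbc_dec8_fixup_sketch(__m128i blk[8], const __m128i ct[8],
                                      __m128i *chain)
    {
        blk[0] = _mm_xor_si128(blk[0], *chain);    /* block 0 uses the IV */
        for (int i = 1; i < 8; ++i)
            blk[i] = _mm_xor_si128(blk[i], ct[i - 1]);
        *chain = ct[7];                            /* next call's IV      */
    }
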
| dec1: |
| movdqu xmm3, [-32 + ctx] |
| @@ -711,143 +722,152 @@ loop1: |
| bail: |
| movdqu [-32 + ctx], xmm3 |
| - xor eax, eax |
| - pop inputLen |
| + xor rax, rax |
| + |
| + movdqu xmm6, [rsp + 0*16] |
| + movdqu xmm7, [rsp + 1*16] |
| + movdqu xmm8, [rsp + 2*16] |
| + add rsp, 3*16 |
| ret |
| ENDM |
| -ALIGN 16 |
| intel_aes_encrypt_cbc_128 PROC |
| gen_aes_cbc_enc_func 10 |
| intel_aes_encrypt_cbc_128 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_cbc_192 PROC |
| gen_aes_cbc_enc_func 12 |
| intel_aes_encrypt_cbc_192 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_cbc_256 PROC |
| gen_aes_cbc_enc_func 14 |
| intel_aes_encrypt_cbc_256 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_cbc_128 PROC |
| gen_aes_cbc_dec_func 10 |
| intel_aes_decrypt_cbc_128 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_cbc_192 PROC |
| gen_aes_cbc_dec_func 12 |
| intel_aes_decrypt_cbc_192 ENDP |
| -ALIGN 16 |
| intel_aes_decrypt_cbc_256 PROC |
| gen_aes_cbc_dec_func 14 |
| intel_aes_decrypt_cbc_256 ENDP |
| -ctrCtx textequ <esi> |
| -CTR textequ <ebx> |
| +ctrCtx textequ <r10> |
| +CTR textequ <r11d> |
| +CTRSave textequ <eax> |
| gen_aes_ctr_func MACRO rnds |
| -LOCAL loop7 |
| +LOCAL loop8 |
| LOCAL loop1 |
| LOCAL enc1 |
| LOCAL bail |
| - push inputLen |
| - push ctrCtx |
| - push CTR |
| - push ebp |
| + mov input, [rsp + 8*1 + 4*8] |
| + mov inputLen, [rsp + 8*1 + 5*8] |
| + |
| + mov ctrCtx, ctx |
| + mov ctx, [8+ctrCtx] |
| + lea ctx, [48+ctx] |
| - mov ctrCtx, [esp + 4*5 + 0*4] |
| - mov output, [esp + 4*5 + 1*4] |
| - mov input, [esp + 4*5 + 4*4] |
| - mov inputLen, [esp + 4*5 + 5*4] |
| + sub rsp, 3*16 |
| + movdqu [rsp + 0*16], xmm6 |
| + movdqu [rsp + 1*16], xmm7 |
| + movdqu [rsp + 2*16], xmm8 |
| - mov ctx, [4+ctrCtx] |
| - lea ctx, [44+ctx] |
| - mov ebp, esp |
| - sub esp, 7*16 |
| - and esp, -16 |
| + push rbp |
| + mov rbp, rsp |
| + sub rsp, 8*16 |
| + and rsp, -16 |
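
The rbp frame exists so rsp can be aligned down: the staging area holds eight 16-byte counter blocks that the movdqa stores below require to be 16-byte aligned, while rbp remembers the possibly unaligned frame for the epilogue. A one-line sketch of the alignment step (illustrative):

    #include <stdint.h>

    /* The asm's "and rsp, -16": round the stack pointer down to 16 bytes. */
    static inline uintptr_t align_down16(uintptr_t sp)
    {
        return sp & ~(uintptr_t)15;
    }
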
| - movdqu xmm0, [8+ctrCtx] |
| - mov ctrCtx, [ctrCtx + 8 + 3*4] |
| - bswap ctrCtx |
| + |
| + movdqu xmm0, [16+ctrCtx] |
| + mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4] |
| + bswap CTRSave |
| movdqu xmm1, [ctx + 0*16] |
| pxor xmm0, xmm1 |
| - movdqa [esp + 0*16], xmm0 |
| - movdqa [esp + 1*16], xmm0 |
| - movdqa [esp + 2*16], xmm0 |
| - movdqa [esp + 3*16], xmm0 |
| - movdqa [esp + 4*16], xmm0 |
| - movdqa [esp + 5*16], xmm0 |
| - movdqa [esp + 6*16], xmm0 |
| + movdqa [rsp + 0*16], xmm0 |
| + movdqa [rsp + 1*16], xmm0 |
| + movdqa [rsp + 2*16], xmm0 |
| + movdqa [rsp + 3*16], xmm0 |
| + movdqa [rsp + 4*16], xmm0 |
| + movdqa [rsp + 5*16], xmm0 |
| + movdqa [rsp + 6*16], xmm0 |
| + movdqa [rsp + 7*16], xmm0 |
| + |
| + inc CTRSave |
| + mov CTR, CTRSave |
| + bswap CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 1*16 + 3*4], CTR |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + 1*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 2*16 + 3*4], CTR |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + 2*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 3*16 + 3*4], CTR |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + 3*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 4*16 + 3*4], CTR |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + 4*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 5*16 + 3*4], CTR |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + 5*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 6*16 + 3*4], CTR |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + 6*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + 7*16 + 3*4], CTR |
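
The staging slots do not hold raw counter blocks: each was initialized above as counter-block XOR round-key-0, so the cipher's whitening step is pre-applied. The per-slot refresh therefore increments the running host-order counter (kept in CTRSave, i.e. eax), swaps it back to big-endian, and re-applies the last dword of round key 0 before storing, with CTR (r11d) as scratch. A sketch of the computed dword, assuming a 32-bit big-endian block counter (illustrative names):

    #include <stdint.h>

    static uint32_t bswap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
               ((v << 8) & 0x00ff0000u) | (v << 24);
    }

    /* Staged dword = byte-swapped counter XOR last dword of round key 0. */
    static uint32_t staged_ctr_dword(uint32_t ctr_host, uint32_t rk0_last)
    {
        return bswap32(ctr_host) ^ rk0_last;
    }
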
| -loop7: |
| - cmp inputLen, 7*16 |
| +loop8: |
| + cmp inputLen, 8*16 |
| jb loop1 |
| - movdqu xmm0, [0*16 + esp] |
| - movdqu xmm1, [1*16 + esp] |
| - movdqu xmm2, [2*16 + esp] |
| - movdqu xmm3, [3*16 + esp] |
| - movdqu xmm4, [4*16 + esp] |
| - movdqu xmm5, [5*16 + esp] |
| - movdqu xmm6, [6*16 + esp] |
| + movdqu xmm0, [0*16 + rsp] |
| + movdqu xmm1, [1*16 + rsp] |
| + movdqu xmm2, [2*16 + rsp] |
| + movdqu xmm3, [3*16 + rsp] |
| + movdqu xmm4, [4*16 + rsp] |
| + movdqu xmm5, [5*16 + rsp] |
| + movdqu xmm6, [6*16 + rsp] |
| + movdqu xmm7, [7*16 + rsp] |
| i = 1 |
| - WHILE i LE 7 |
| + WHILE i LE 8 |
| aes_rnd i |
| - inc ctrCtx |
| - mov CTR, ctrCtx |
| + inc CTRSave |
| + mov CTR, CTRSave |
| bswap CTR |
| - xor CTR, [ctx + 3*4] |
| - mov [esp + (i-1)*16 + 3*4], CTR |
| + xor CTR, DWORD PTR [ctx + 3*4] |
| + mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR |
| i = i+1 |
| ENDM |
| @@ -857,20 +877,22 @@ loop7: |
| ENDM |
| aes_last_rnd rnds |
| - movdqu xmm7, [0*16 + input] |
| - pxor xmm0, xmm7 |
| - movdqu xmm7, [1*16 + input] |
| - pxor xmm1, xmm7 |
| - movdqu xmm7, [2*16 + input] |
| - pxor xmm2, xmm7 |
| - movdqu xmm7, [3*16 + input] |
| - pxor xmm3, xmm7 |
| - movdqu xmm7, [4*16 + input] |
| - pxor xmm4, xmm7 |
| - movdqu xmm7, [5*16 + input] |
| - pxor xmm5, xmm7 |
| - movdqu xmm7, [6*16 + input] |
| - pxor xmm6, xmm7 |
| + movdqu xmm8, [0*16 + input] |
| + pxor xmm0, xmm8 |
| + movdqu xmm8, [1*16 + input] |
| + pxor xmm1, xmm8 |
| + movdqu xmm8, [2*16 + input] |
| + pxor xmm2, xmm8 |
| + movdqu xmm8, [3*16 + input] |
| + pxor xmm3, xmm8 |
| + movdqu xmm8, [4*16 + input] |
| + pxor xmm4, xmm8 |
| + movdqu xmm8, [5*16 + input] |
| + pxor xmm5, xmm8 |
| + movdqu xmm8, [6*16 + input] |
| + pxor xmm6, xmm8 |
| + movdqu xmm8, [7*16 + input] |
| + pxor xmm7, xmm8 |
| movdqu [0*16 + output], xmm0 |
| movdqu [1*16 + output], xmm1 |
| @@ -879,19 +901,20 @@ loop7: |
| movdqu [4*16 + output], xmm4 |
| movdqu [5*16 + output], xmm5 |
| movdqu [6*16 + output], xmm6 |
| + movdqu [7*16 + output], xmm7 |
| - lea input, [7*16 + input] |
| - lea output, [7*16 + output] |
| - sub inputLen, 7*16 |
| - jmp loop7 |
| + lea input, [8*16 + input] |
| + lea output, [8*16 + output] |
| + sub inputLen, 8*16 |
| + jmp loop8 |
| loop1: |
| cmp inputLen, 1*16 |
| jb bail |
| - movdqu xmm0, [esp] |
| - add esp, 16 |
| + movdqu xmm0, [rsp] |
| + add rsp, 16 |
| i = 1 |
| WHILE i LT rnds |
| @@ -913,34 +936,33 @@ loop1: |
| bail: |
| - mov ctrCtx, [ebp + 4*5 + 0*4] |
| - movdqu xmm0, [esp] |
| + movdqu xmm0, [rsp] |
| movdqu xmm1, [ctx + 0*16] |
| pxor xmm0, xmm1 |
| - movdqu [8+ctrCtx], xmm0 |
| + movdqu [16+ctrCtx], xmm0 |
| + |
| + |
| + xor rax, rax |
| + mov rsp, rbp |
| + pop rbp |
| + movdqu xmm6, [rsp + 0*16] |
| + movdqu xmm7, [rsp + 1*16] |
| + movdqu xmm8, [rsp + 2*16] |
| + add rsp, 3*16 |
| - xor eax, eax |
| - mov esp, ebp |
| - pop ebp |
| - pop CTR |
| - pop ctrCtx |
| - pop inputLen |
| ret |
| ENDM |
| -ALIGN 16 |
| intel_aes_encrypt_ctr_128 PROC |
| gen_aes_ctr_func 10 |
| intel_aes_encrypt_ctr_128 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_ctr_192 PROC |
| gen_aes_ctr_func 12 |
| intel_aes_encrypt_ctr_192 ENDP |
| -ALIGN 16 |
| intel_aes_encrypt_ctr_256 PROC |
| gen_aes_ctr_func 14 |
| intel_aes_encrypt_ctr_256 ENDP |