OLD | NEW |
1 ; LICENSE: | 1 ; LICENSE: |
2 ; This submission to NSS is to be made available under the terms of the | 2 ; This submission to NSS is to be made available under the terms of the |
3 ; Mozilla Public License, v. 2.0. You can obtain one at http: | 3 ; Mozilla Public License, v. 2.0. You can obtain one at http: |
4 ; //mozilla.org/MPL/2.0/. | 4 ; //mozilla.org/MPL/2.0/. |
5 ;############################################################################### | 5 ;############################################################################### |
6 ; Copyright(c) 2014, Intel Corp. | 6 ; Copyright(c) 2014, Intel Corp. |
7 ; Developers and authors: | 7 ; Developers and authors: |
8 ; Shay Gueron and Vlad Krasnov | 8 ; Shay Gueron and Vlad Krasnov |
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel | 9 ; Intel Corporation, Israel Development Centre, Haifa, Israel |
10 ; Please send feedback directly to crypto.feedback.alias@intel.com | 10 ; Please send feedback directly to crypto.feedback.alias@intel.com |
11 | 11 |
12 | 12 |
13 .MODEL FLAT, C | |
14 .XMM | |
15 | |
16 .DATA | 13 .DATA |
17 ALIGN 16 | 14 ALIGN 16 |
18 Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh | 15 Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh |
19 Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h | 16 Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h |
20 Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh | 17 Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh |
21 Lcon1 dd 1,1,1,1 | 18 Lcon1 dd 1,1,1,1 |
22 Lcon2 dd 1bh,1bh,1bh,1bh | 19 Lcon2 dd 1bh,1bh,1bh,1bh |
23 | 20 |
24 .CODE | 21 .CODE |
25 | 22 |
26 ctx textequ <ecx> | 23 ctx textequ <rcx> |
27 output textequ <edx> | 24 output textequ <rdx> |
28 input textequ <eax> | 25 input textequ <r8> |
29 inputLen textequ <edi> | 26 inputLen textequ <r9d> |
30 | 27 |
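
These aliases reflect the switch from the 32-bit cdecl convention of the OLD column, where every argument is fetched from the stack through esp, to the Microsoft x64 convention: the first four integer arguments arrive in rcx, rdx, r8 and r9, the fifth and sixth sit on the stack just above the caller's 32-byte shadow space, and xmm6-xmm15 are callee-saved. A hedged C sketch of the argument layout the code below relies on; only the positions of the context, output, input and length are confirmed by the assembly, so the remaining names and the return type are illustrative:

    /* Sketch only: positions are what the assembly reads; the third and
     * fourth parameter names and the return type are assumptions. */
    typedef struct AESContextStr AESContext;                 /* opaque here   */

    int intel_aes_encrypt_ecb_128(AESContext *cx,            /* rcx           */
                                  unsigned char *output,     /* rdx           */
                                  unsigned int *outputLen,   /* r8, unused    */
                                  unsigned int maxOutputLen, /* r9, unused    */
                                  const unsigned char *input,/* [rsp + 0x28]  */
                                  unsigned int inputLen);    /* [rsp + 0x30]  */

In the function bodies below, [rsp + 1*8 + 8*4] and [rsp + 1*8 + 8*5] are exactly those two stack slots: 8 bytes of return address plus four 8-byte home slots.
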
31 | 28 |
32 aes_rnd MACRO i | 29 aes_rnd MACRO i |
33 movdqu xmm7, [i*16 + ctx] | 30 movdqu xmm8, [i*16 + ctx] |
34 aesenc xmm0, xmm7 | 31 aesenc xmm0, xmm8 |
35 aesenc xmm1, xmm7 | 32 aesenc xmm1, xmm8 |
36 aesenc xmm2, xmm7 | 33 aesenc xmm2, xmm8 |
37 aesenc xmm3, xmm7 | 34 aesenc xmm3, xmm8 |
38 aesenc xmm4, xmm7 | 35 aesenc xmm4, xmm8 |
39 aesenc xmm5, xmm7 | 36 aesenc xmm5, xmm8 |
40 aesenc xmm6, xmm7 | 37 aesenc xmm6, xmm8 |
| 38 aesenc xmm7, xmm8 |
41 ENDM | 39 ENDM |
42 | 40 |
43 aes_last_rnd MACRO i | 41 aes_last_rnd MACRO i |
44 movdqu xmm7, [i*16 + ctx] | 42 movdqu xmm8, [i*16 + ctx] |
45 aesenclast xmm0, xmm7 | 43 aesenclast xmm0, xmm8 |
46 aesenclast xmm1, xmm7 | 44 aesenclast xmm1, xmm8 |
47 aesenclast xmm2, xmm7 | 45 aesenclast xmm2, xmm8 |
48 aesenclast xmm3, xmm7 | 46 aesenclast xmm3, xmm8 |
49 aesenclast xmm4, xmm7 | 47 aesenclast xmm4, xmm8 |
50 aesenclast xmm5, xmm7 | 48 aesenclast xmm5, xmm8 |
51 aesenclast xmm6, xmm7 | 49 aesenclast xmm6, xmm8 |
| 50 aesenclast xmm7, xmm8 |
52 ENDM | 51 ENDM |
53 | 52 |
54 aes_dec_rnd MACRO i | 53 aes_dec_rnd MACRO i |
55 movdqu xmm7, [i*16 + ctx] | 54 movdqu xmm8, [i*16 + ctx] |
56 aesdec xmm0, xmm7 | 55 aesdec xmm0, xmm8 |
57 aesdec xmm1, xmm7 | 56 aesdec xmm1, xmm8 |
58 aesdec xmm2, xmm7 | 57 aesdec xmm2, xmm8 |
59 aesdec xmm3, xmm7 | 58 aesdec xmm3, xmm8 |
60 aesdec xmm4, xmm7 | 59 aesdec xmm4, xmm8 |
61 aesdec xmm5, xmm7 | 60 aesdec xmm5, xmm8 |
62 aesdec xmm6, xmm7 | 61 aesdec xmm6, xmm8 |
| 62 aesdec xmm7, xmm8 |
63 ENDM | 63 ENDM |
64 | 64 |
65 aes_dec_last_rnd MACRO i | 65 aes_dec_last_rnd MACRO i |
66 movdqu xmm7, [i*16 + ctx] | 66 movdqu xmm8, [i*16 + ctx] |
67 aesdeclast xmm0, xmm7 | 67 aesdeclast xmm0, xmm8 |
68 aesdeclast xmm1, xmm7 | 68 aesdeclast xmm1, xmm8 |
69 aesdeclast xmm2, xmm7 | 69 aesdeclast xmm2, xmm8 |
70 aesdeclast xmm3, xmm7 | 70 aesdeclast xmm3, xmm8 |
71 aesdeclast xmm4, xmm7 | 71 aesdeclast xmm4, xmm8 |
72 aesdeclast xmm5, xmm7 | 72 aesdeclast xmm5, xmm8 |
73 aesdeclast xmm6, xmm7 | 73 aesdeclast xmm6, xmm8 |
| 74 aesdeclast xmm7, xmm8 |
74 ENDM | 75 ENDM |
75 | 76 |
76 | 77 |
77 gen_aes_ecb_func MACRO enc, rnds | 78 gen_aes_ecb_func MACRO enc, rnds |
78 | 79 |
79 LOCAL loop7 | 80 LOCAL loop8 |
80 LOCAL loop1 | 81 LOCAL loop1 |
81 LOCAL bail | 82 LOCAL bail |
82 | 83 |
83 push inputLen | 84 xor inputLen, inputLen |
| 85 mov input, [rsp + 1*8 + 8*4] |
| 86 mov inputLen, [rsp + 1*8 + 8*5] |
84 | 87 |
85 mov ctx, [esp + 2*4 + 0*4] | 88 sub rsp, 3*16 |
86 mov output, [esp + 2*4 + 1*4] | |
87 mov input, [esp + 2*4 + 4*4] | |
88 mov inputLen, [esp + 2*4 + 5*4] | |
89 | 89 |
90 lea ctx, [44+ctx] | 90 movdqu [rsp + 0*16], xmm6 |
| 91 movdqu [rsp + 1*16], xmm7 |
| 92 movdqu [rsp + 2*16], xmm8 |
91 | 93 |
92 loop7: | 94 lea ctx, [48+ctx] |
93 cmp inputLen, 7*16 | 95 |
| 96 loop8: |
| 97 cmp inputLen, 8*16 |
94 jb loop1 | 98 jb loop1 |
95 | 99 |
96 movdqu xmm0, [0*16 + input] | 100 movdqu xmm0, [0*16 + input] |
97 movdqu xmm1, [1*16 + input] | 101 movdqu xmm1, [1*16 + input] |
98 movdqu xmm2, [2*16 + input] | 102 movdqu xmm2, [2*16 + input] |
99 movdqu xmm3, [3*16 + input] | 103 movdqu xmm3, [3*16 + input] |
100 movdqu xmm4, [4*16 + input] | 104 movdqu xmm4, [4*16 + input] |
101 movdqu xmm5, [5*16 + input] | 105 movdqu xmm5, [5*16 + input] |
102 movdqu xmm6, [6*16 + input] | 106 movdqu xmm6, [6*16 + input] |
| 107 movdqu xmm7, [7*16 + input] |
103 | 108 |
104 movdqu xmm7, [0*16 + ctx] | 109 movdqu xmm8, [0*16 + ctx] |
105 pxor xmm0, xmm7 | 110 pxor xmm0, xmm8 |
106 pxor xmm1, xmm7 | 111 pxor xmm1, xmm8 |
107 pxor xmm2, xmm7 | 112 pxor xmm2, xmm8 |
108 pxor xmm3, xmm7 | 113 pxor xmm3, xmm8 |
109 pxor xmm4, xmm7 | 114 pxor xmm4, xmm8 |
110 pxor xmm5, xmm7 | 115 pxor xmm5, xmm8 |
111 pxor xmm6, xmm7 | 116 pxor xmm6, xmm8 |
| 117 pxor xmm7, xmm8 |
112 | 118 |
113 IF enc eq 1 | 119 IF enc eq 1 |
114 rnd textequ <aes_rnd> | 120 rnd textequ <aes_rnd> |
115 lastrnd textequ <aes_last_rnd> | 121 lastrnd textequ <aes_last_rnd> |
116 aesinst textequ <aesenc> | 122 aesinst textequ <aesenc> |
117 aeslastinst textequ <aesenclast> | 123 aeslastinst textequ <aesenclast> |
118 ELSE | 124 ELSE |
119 rnd textequ <aes_dec_rnd> | 125 rnd textequ <aes_dec_rnd> |
120 lastrnd textequ <aes_dec_last_rnd> | 126 lastrnd textequ <aes_dec_last_rnd> |
121 aesinst textequ <aesdec> | 127 aesinst textequ <aesdec> |
122 aeslastinst textequ <aesdeclast> | 128 aeslastinst textequ <aesdeclast> |
123 ENDIF | 129 ENDIF |
124 | 130 |
125 i = 1 | 131 i = 1 |
126 WHILE i LT rnds | 132 WHILE i LT rnds |
127 rnd i | 133 rnd i |
128 i = i+1 | 134 i = i+1 |
129 ENDM | 135 ENDM |
130 lastrnd rnds | 136 lastrnd rnds |
131 | 137 |
132 movdqu [0*16 + output], xmm0 | 138 movdqu [0*16 + output], xmm0 |
133 movdqu [1*16 + output], xmm1 | 139 movdqu [1*16 + output], xmm1 |
134 movdqu [2*16 + output], xmm2 | 140 movdqu [2*16 + output], xmm2 |
135 movdqu [3*16 + output], xmm3 | 141 movdqu [3*16 + output], xmm3 |
136 movdqu [4*16 + output], xmm4 | 142 movdqu [4*16 + output], xmm4 |
137 movdqu [5*16 + output], xmm5 | 143 movdqu [5*16 + output], xmm5 |
138 movdqu [6*16 + output], xmm6 | 144 movdqu [6*16 + output], xmm6 |
| 145 movdqu [7*16 + output], xmm7 |
139 | 146 |
140 lea input, [7*16 + input] | 147 lea input, [8*16 + input] |
141 lea output, [7*16 + output] | 148 lea output, [8*16 + output] |
142 sub inputLen, 7*16 | 149 sub inputLen, 8*16 |
143 jmp loop7 | 150 jmp loop8 |
144 | 151 |
145 loop1: | 152 loop1: |
146 cmp inputLen, 1*16 | 153 cmp inputLen, 1*16 |
147 jb bail | 154 jb bail |
148 | 155 |
149 movdqu xmm0, [input] | 156 movdqu xmm0, [input] |
150 movdqu xmm7, [0*16 + ctx] | 157 movdqu xmm7, [0*16 + ctx] |
151 pxor xmm0, xmm7 | 158 pxor xmm0, xmm7 |
152 | 159 |
153 i = 1 | 160 i = 1 |
154 WHILE i LT rnds | 161 WHILE i LT rnds |
155 movdqu xmm7, [i*16 + ctx] | 162 movdqu xmm7, [i*16 + ctx] |
156 aesinst xmm0, xmm7 | 163 aesinst xmm0, xmm7 |
157 i = i+1 | 164 i = i+1 |
158 ENDM | 165 ENDM |
159 movdqu xmm7, [rnds*16 + ctx] | 166 movdqu xmm7, [rnds*16 + ctx] |
160 aeslastinst xmm0, xmm7 | 167 aeslastinst xmm0, xmm7 |
161 | 168 |
162 movdqu [output], xmm0 | 169 movdqu [output], xmm0 |
163 | 170 |
164 lea input, [1*16 + input] | 171 lea input, [1*16 + input] |
165 lea output, [1*16 + output] | 172 lea output, [1*16 + output] |
166 sub inputLen, 1*16 | 173 sub inputLen, 1*16 |
167 jmp loop1 | 174 jmp loop1 |
168 | 175 |
169 bail: | 176 bail: |
170 xor eax, eax | 177 xor rax, rax |
171 pop inputLen | 178 |
| 179 movdqu xmm6, [rsp + 0*16] |
| 180 movdqu xmm7, [rsp + 1*16] |
| 181 movdqu xmm8, [rsp + 2*16] |
| 182 add rsp, 3*16 |
172 ret | 183 ret |
173 | |
174 ENDM | 184 ENDM |
175 | 185 |
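
Two x64-specific changes run through this macro: the round key moves from xmm7 to xmm8, so a full eight data blocks (xmm0-xmm7) can be processed per iteration instead of seven, and xmm6-xmm8 are spilled to the stack on entry because they are non-volatile under the Win64 ABI. The expanded key is now taken 48 bytes into the context rather than 44, presumably reflecting the realigned 64-bit AESContext layout. A hedged C-intrinsics sketch of the same eight-blocks-per-iteration ECB pattern (ks is assumed to point at the rnds+1 round keys):

    #include <stddef.h>
    #include <wmmintrin.h>      /* AES-NI intrinsics */

    static void ecb_encrypt_8wide(const __m128i *ks, int rnds,
                                  const unsigned char *in, unsigned char *out,
                                  size_t len)
    {
        while (len >= 8 * 16) {
            __m128i b[8], rk = _mm_loadu_si128(ks);
            for (int j = 0; j < 8; j++)      /* whiten eight blocks with round key 0 */
                b[j] = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in + j), rk);
            for (int i = 1; i < rnds; i++) { /* rnds-1 full rounds, all blocks in parallel */
                rk = _mm_loadu_si128(ks + i);
                for (int j = 0; j < 8; j++)
                    b[j] = _mm_aesenc_si128(b[j], rk);
            }
            rk = _mm_loadu_si128(ks + rnds); /* final round */
            for (int j = 0; j < 8; j++)
                _mm_storeu_si128((__m128i *)out + j, _mm_aesenclast_si128(b[j], rk));
            in += 8 * 16;
            out += 8 * 16;
            len -= 8 * 16;
        }
        /* remaining blocks are handled one at a time, as in loop1 above */
    }
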
176 ALIGN 16 | |
177 intel_aes_encrypt_ecb_128 PROC | 186 intel_aes_encrypt_ecb_128 PROC |
178 gen_aes_ecb_func 1, 10 | 187 gen_aes_ecb_func 1, 10 |
179 intel_aes_encrypt_ecb_128 ENDP | 188 intel_aes_encrypt_ecb_128 ENDP |
180 | 189 |
181 ALIGN 16 | |
182 intel_aes_encrypt_ecb_192 PROC | 190 intel_aes_encrypt_ecb_192 PROC |
183 gen_aes_ecb_func 1, 12 | 191 gen_aes_ecb_func 1, 12 |
184 intel_aes_encrypt_ecb_192 ENDP | 192 intel_aes_encrypt_ecb_192 ENDP |
185 | 193 |
186 ALIGN 16 | |
187 intel_aes_encrypt_ecb_256 PROC | 194 intel_aes_encrypt_ecb_256 PROC |
188 gen_aes_ecb_func 1, 14 | 195 gen_aes_ecb_func 1, 14 |
189 intel_aes_encrypt_ecb_256 ENDP | 196 intel_aes_encrypt_ecb_256 ENDP |
190 | 197 |
191 ALIGN 16 | |
192 intel_aes_decrypt_ecb_128 PROC | 198 intel_aes_decrypt_ecb_128 PROC |
193 gen_aes_ecb_func 0, 10 | 199 gen_aes_ecb_func 0, 10 |
194 intel_aes_decrypt_ecb_128 ENDP | 200 intel_aes_decrypt_ecb_128 ENDP |
195 | 201 |
196 ALIGN 16 | |
197 intel_aes_decrypt_ecb_192 PROC | 202 intel_aes_decrypt_ecb_192 PROC |
198 gen_aes_ecb_func 0, 12 | 203 gen_aes_ecb_func 0, 12 |
199 intel_aes_decrypt_ecb_192 ENDP | 204 intel_aes_decrypt_ecb_192 ENDP |
200 | 205 |
201 ALIGN 16 | |
202 intel_aes_decrypt_ecb_256 PROC | 206 intel_aes_decrypt_ecb_256 PROC |
203 gen_aes_ecb_func 0, 14 | 207 gen_aes_ecb_func 0, 14 |
204 intel_aes_decrypt_ecb_256 ENDP | 208 intel_aes_decrypt_ecb_256 ENDP |
205 | 209 |
206 | 210 |
207 KEY textequ <ecx> | 211 KEY textequ <rcx> |
208 KS textequ <edx> | 212 KS textequ <rdx> |
209 ITR textequ <eax> | 213 ITR textequ <r8> |
210 | 214 |
211 ALIGN 16 | |
212 intel_aes_encrypt_init_128 PROC | 215 intel_aes_encrypt_init_128 PROC |
213 | 216 |
214 mov KEY, [esp + 1*4 + 0*4] | |
215 mov KS, [esp + 1*4 + 1*4] | |
216 | |
217 | |
218 movdqu xmm1, [KEY] | 217 movdqu xmm1, [KEY] |
219 movdqu [KS], xmm1 | 218 movdqu [KS], xmm1 |
220 movdqa xmm2, xmm1 | 219 movdqa xmm2, xmm1 |
221 | 220 |
222 lea ITR, Lcon1 | 221 lea ITR, Lcon1 |
223 movdqa xmm0, [ITR] | 222 movdqa xmm0, [ITR] |
224 lea ITR, Lmask | 223 lea ITR, Lmask |
225 movdqa xmm4, [ITR] | 224 movdqa xmm4, [ITR] |
226 | 225 |
227 mov ITR, 8 | 226 mov ITR, 8 |
(...skipping 45 matching lines...)
273 pslldq xmm3, 4 | 272 pslldq xmm3, 4 |
274 pxor xmm1, xmm3 | 273 pxor xmm1, xmm3 |
275 pxor xmm1, xmm2 | 274 pxor xmm1, xmm2 |
276 movdqu [32 + KS], xmm1 | 275 movdqu [32 + KS], xmm1 |
277 movdqa xmm2, xmm1 | 276 movdqa xmm2, xmm1 |
278 | 277 |
279 ret | 278 ret |
280 intel_aes_encrypt_init_128 ENDP | 279 intel_aes_encrypt_init_128 ENDP |
281 | 280 |
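
The body of this routine (largely elided by the review viewer) derives the ten additional round keys with the pshufb/aesenclast trick driven by the Lmask and Lcon constants in the .DATA segment. For reference, the more commonly shown equivalent using the dedicated AESKEYGENASSIST instruction looks like the hedged sketch below; it is not the method used in this file, only a functional counterpart:

    #include <wmmintrin.h>

    /* One expansion step: gen carries RotWord(SubWord(w3)) ^ rcon in its
     * high dword, produced by _mm_aeskeygenassist_si128. */
    static __m128i expand128_step(__m128i key, __m128i gen)
    {
        gen = _mm_shuffle_epi32(gen, 0xff);               /* broadcast that dword */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* fold words left to right */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        return _mm_xor_si128(key, gen);
    }

    static void aes128_expand_key(const unsigned char key[16], __m128i ks[11])
    {
        ks[0]  = _mm_loadu_si128((const __m128i *)key);
        ks[1]  = expand128_step(ks[0], _mm_aeskeygenassist_si128(ks[0], 0x01));
        ks[2]  = expand128_step(ks[1], _mm_aeskeygenassist_si128(ks[1], 0x02));
        ks[3]  = expand128_step(ks[2], _mm_aeskeygenassist_si128(ks[2], 0x04));
        ks[4]  = expand128_step(ks[3], _mm_aeskeygenassist_si128(ks[3], 0x08));
        ks[5]  = expand128_step(ks[4], _mm_aeskeygenassist_si128(ks[4], 0x10));
        ks[6]  = expand128_step(ks[5], _mm_aeskeygenassist_si128(ks[5], 0x20));
        ks[7]  = expand128_step(ks[6], _mm_aeskeygenassist_si128(ks[6], 0x40));
        ks[8]  = expand128_step(ks[7], _mm_aeskeygenassist_si128(ks[7], 0x80));
        ks[9]  = expand128_step(ks[8], _mm_aeskeygenassist_si128(ks[8], 0x1b));
        ks[10] = expand128_step(ks[9], _mm_aeskeygenassist_si128(ks[9], 0x36));
    }
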
282 | 281 |
283 ALIGN 16 | |
284 intel_aes_decrypt_init_128 PROC | 282 intel_aes_decrypt_init_128 PROC |
285 | 283 |
286 mov KEY, [esp + 1*4 + 0*4] | |
287 mov KS, [esp + 1*4 + 1*4] | |
288 | |
289 push KS | 284 push KS |
290 push KEY | 285 push KEY |
291 | 286 |
292 call intel_aes_encrypt_init_128 | 287 call intel_aes_encrypt_init_128 |
293 | 288 |
294 pop KEY | 289 pop KEY |
295 pop KS | 290 pop KS |
296 | 291 |
297 movdqu xmm0, [0*16 + KS] | 292 movdqu xmm0, [0*16 + KS] |
298 movdqu xmm1, [10*16 + KS] | 293 movdqu xmm1, [10*16 + KS] |
(...skipping 14 matching lines...)
313 i = i+1 | 308 i = i+1 |
314 ENDM | 309 ENDM |
315 | 310 |
316 movdqu xmm0, [5*16 + KS] | 311 movdqu xmm0, [5*16 + KS] |
317 aesimc xmm0, xmm0 | 312 aesimc xmm0, xmm0 |
318 movdqu [5*16 + KS], xmm0 | 313 movdqu [5*16 + KS], xmm0 |
319 ret | 314 ret |
320 intel_aes_decrypt_init_128 ENDP | 315 intel_aes_decrypt_init_128 ENDP |
321 | 316 |
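
Most of this routine's body is elided above, but the visible edges (loading round keys 0 and 10, and running aesimc over round key 5) match the standard Equivalent Inverse Cipher setup: the encryption schedule is reversed end-for-end and the nine inner keys are passed through AESIMC so the forward-indexed aesdec loops in this file can use them directly. A hedged sketch of that transformation:

    #include <wmmintrin.h>

    /* Sketch: turn an AES-128 encryption key schedule (11 round keys) into
     * the decryption schedule expected by the aesdec/aesdeclast loops above. */
    static void aes128_invert_key_schedule(__m128i ks[11])
    {
        for (int i = 0, j = 10; i < j; i++, j--) {   /* reverse the key order */
            __m128i t = ks[i];
            ks[i] = ks[j];
            ks[j] = t;
        }
        for (int i = 1; i <= 9; i++)                 /* InvMixColumns on the inner keys */
            ks[i] = _mm_aesimc_si128(ks[i]);
    }
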
322 | 317 |
323 ALIGN 16 | |
324 intel_aes_encrypt_init_192 PROC | 318 intel_aes_encrypt_init_192 PROC |
325 | 319 |
326 mov KEY, [esp + 1*4 + 0*4] | 320 sub rsp, 16*2 |
327 mov KS, [esp + 1*4 + 1*4] | 321 movdqu [16*0 + rsp], xmm6 |
| 322 movdqu [16*1 + rsp], xmm7 |
328 | 323 |
329 pxor xmm3, xmm3 | |
330 movdqu xmm1, [KEY] | 324 movdqu xmm1, [KEY] |
331 pinsrd xmm3, DWORD PTR [16 + KEY], 0 | 325 mov ITR, [16 + KEY] |
332 pinsrd xmm3, DWORD PTR [20 + KEY], 1 | 326 movd xmm3, ITR |
333 | 327 |
334 movdqu [KS], xmm1 | 328 movdqu [KS], xmm1 |
335 movdqa xmm5, xmm3 | 329 movdqa xmm5, xmm3 |
336 | 330 |
337 lea ITR, Lcon1 | 331 lea ITR, Lcon1 |
338 movdqu xmm0, [ITR] | 332 movdqu xmm0, [ITR] |
339 lea ITR, Lmask192 | 333 lea ITR, Lmask192 |
340 movdqu xmm4, [ITR] | 334 movdqu xmm4, [ITR] |
341 | 335 |
342 mov ITR, 4 | 336 mov ITR, 4 |
(...skipping 46 matching lines...)
389 | 383 |
390 movdqu [48 + KS], xmm1 | 384 movdqu [48 + KS], xmm1 |
391 movdqa xmm5, xmm3 | 385 movdqa xmm5, xmm3 |
392 | 386 |
393 lea KS, [48 + KS] | 387 lea KS, [48 + KS] |
394 | 388 |
395 dec ITR | 389 dec ITR |
396 jnz Lenc_192_ks_loop | 390 jnz Lenc_192_ks_loop |
397 | 391 |
398 movdqu [16 + KS], xmm5 | 392 movdqu [16 + KS], xmm5 |
399 ret | 393 |
| 394 movdqu xmm7, [16*1 + rsp] |
| 395 movdqu xmm6, [16*0 + rsp] |
| 396 add rsp, 16*2 |
| 397 ret |
400 intel_aes_encrypt_init_192 ENDP | 398 intel_aes_encrypt_init_192 ENDP |
401 | 399 |
402 ALIGN 16 | |
403 intel_aes_decrypt_init_192 PROC | 400 intel_aes_decrypt_init_192 PROC |
404 mov KEY, [esp + 1*4 + 0*4] | |
405 mov KS, [esp + 1*4 + 1*4] | |
406 | |
407 push KS | 401 push KS |
408 push KEY | 402 push KEY |
409 | 403 |
410 call intel_aes_encrypt_init_192 | 404 call intel_aes_encrypt_init_192 |
411 | 405 |
412 pop KEY | 406 pop KEY |
413 pop KS | 407 pop KS |
414 | 408 |
415 movdqu xmm0, [0*16 + KS] | 409 movdqu xmm0, [0*16 + KS] |
416 movdqu xmm1, [12*16 + KS] | 410 movdqu xmm1, [12*16 + KS] |
(...skipping 13 matching lines...)
430 | 424 |
431 i = i+1 | 425 i = i+1 |
432 ENDM | 426 ENDM |
433 | 427 |
434 movdqu xmm0, [6*16 + KS] | 428 movdqu xmm0, [6*16 + KS] |
435 aesimc xmm0, xmm0 | 429 aesimc xmm0, xmm0 |
436 movdqu [6*16 + KS], xmm0 | 430 movdqu [6*16 + KS], xmm0 |
437 ret | 431 ret |
438 intel_aes_decrypt_init_192 ENDP | 432 intel_aes_decrypt_init_192 ENDP |
439 | 433 |
440 ALIGN 16 | 434 |
441 intel_aes_encrypt_init_256 PROC | 435 intel_aes_encrypt_init_256 PROC |
| 436 sub rsp, 16*2 |
| 437 movdqu [16*0 + rsp], xmm6 |
| 438 movdqu [16*1 + rsp], xmm7 |
442 | 439 |
443 mov KEY, [esp + 1*4 + 0*4] | |
444 mov KS, [esp + 1*4 + 1*4] | |
445 movdqu xmm1, [16*0 + KEY] | 440 movdqu xmm1, [16*0 + KEY] |
446 movdqu xmm3, [16*1 + KEY] | 441 movdqu xmm3, [16*1 + KEY] |
447 | 442 |
448 movdqu [16*0 + KS], xmm1 | 443 movdqu [16*0 + KS], xmm1 |
449 movdqu [16*1 + KS], xmm3 | 444 movdqu [16*1 + KS], xmm3 |
450 | 445 |
451 lea ITR, Lcon1 | 446 lea ITR, Lcon1 |
452 movdqu xmm0, [ITR] | 447 movdqu xmm0, [ITR] |
453 lea ITR, Lmask256 | 448 lea ITR, Lmask256 |
454 movdqu xmm5, [ITR] | 449 movdqu xmm5, [ITR] |
(...skipping 40 matching lines...)
495 movdqa xmm4, xmm1 | 490 movdqa xmm4, xmm1 |
496 pslldq xmm4, 4 | 491 pslldq xmm4, 4 |
497 pxor xmm1, xmm4 | 492 pxor xmm1, xmm4 |
498 pslldq xmm4, 4 | 493 pslldq xmm4, 4 |
499 pxor xmm1, xmm4 | 494 pxor xmm1, xmm4 |
500 pslldq xmm4, 4 | 495 pslldq xmm4, 4 |
501 pxor xmm1, xmm4 | 496 pxor xmm1, xmm4 |
502 pxor xmm1, xmm2 | 497 pxor xmm1, xmm2 |
503 movdqu [16*2 + KS], xmm1 | 498 movdqu [16*2 + KS], xmm1 |
504 | 499 |
| 500 movdqu xmm7, [16*1 + rsp] |
| 501 movdqu xmm6, [16*0 + rsp] |
| 502 add rsp, 16*2 |
505 ret | 503 ret |
| 504 |
506 intel_aes_encrypt_init_256 ENDP | 505 intel_aes_encrypt_init_256 ENDP |
507 | 506 |
508 ALIGN 16 | 507 |
509 intel_aes_decrypt_init_256 PROC | 508 intel_aes_decrypt_init_256 PROC |
510 mov KEY, [esp + 1*4 + 0*4] | |
511 mov KS, [esp + 1*4 + 1*4] | |
512 | |
513 push KS | 509 push KS |
514 push KEY | 510 push KEY |
515 | 511 |
516 call intel_aes_encrypt_init_256 | 512 call intel_aes_encrypt_init_256 |
517 | 513 |
518 pop KEY | 514 pop KEY |
519 pop KS | 515 pop KS |
520 | 516 |
521 movdqu xmm0, [0*16 + KS] | 517 movdqu xmm0, [0*16 + KS] |
522 movdqu xmm1, [14*16 + KS] | 518 movdqu xmm1, [14*16 + KS] |
(...skipping 20 matching lines...)
543 ret | 539 ret |
544 intel_aes_decrypt_init_256 ENDP | 540 intel_aes_decrypt_init_256 ENDP |
545 | 541 |
546 | 542 |
547 | 543 |
548 gen_aes_cbc_enc_func MACRO rnds | 544 gen_aes_cbc_enc_func MACRO rnds |
549 | 545 |
550 LOCAL loop1 | 546 LOCAL loop1 |
551 LOCAL bail | 547 LOCAL bail |
552 | 548 |
553 push inputLen | 549 mov input, [rsp + 1*8 + 8*4] |
| 550 mov inputLen, [rsp + 1*8 + 8*5] |
554 | 551 |
555 mov ctx, [esp + 2*4 + 0*4] | 552 sub rsp, 3*16 |
556 mov output, [esp + 2*4 + 1*4] | |
557 mov input, [esp + 2*4 + 4*4] | |
558 mov inputLen, [esp + 2*4 + 5*4] | |
559 | 553 |
560 lea ctx, [44+ctx] | 554 movdqu [rsp + 0*16], xmm6 |
| 555 movdqu [rsp + 1*16], xmm7 |
| 556 movdqu [rsp + 2*16], xmm8 |
| 557 |
| 558 lea ctx, [48+ctx] |
561 | 559 |
562 movdqu xmm0, [-32+ctx] | 560 movdqu xmm0, [-32+ctx] |
563 | 561 |
564 movdqu xmm2, [0*16 + ctx] | 562 movdqu xmm2, [0*16 + ctx] |
565 movdqu xmm3, [1*16 + ctx] | 563 movdqu xmm3, [1*16 + ctx] |
566 movdqu xmm4, [2*16 + ctx] | 564 movdqu xmm4, [2*16 + ctx] |
567 movdqu xmm5, [3*16 + ctx] | 565 movdqu xmm5, [3*16 + ctx] |
568 movdqu xmm6, [4*16 + ctx] | 566 movdqu xmm6, [4*16 + ctx] |
| 567 movdqu xmm7, [5*16 + ctx] |
569 | 568 |
570 loop1: | 569 loop1: |
571 cmp inputLen, 1*16 | 570 cmp inputLen, 1*16 |
572 jb bail | 571 jb bail |
573 | 572 |
574 movdqu xmm1, [input] | 573 movdqu xmm1, [input] |
575 pxor xmm1, xmm2 | 574 pxor xmm1, xmm2 |
576 pxor xmm0, xmm1 | 575 pxor xmm0, xmm1 |
577 | 576 |
578 aesenc xmm0, xmm3 | 577 aesenc xmm0, xmm3 |
579 aesenc xmm0, xmm4 | 578 aesenc xmm0, xmm4 |
580 aesenc xmm0, xmm5 | 579 aesenc xmm0, xmm5 |
581 aesenc xmm0, xmm6 | 580 aesenc xmm0, xmm6 |
| 581 aesenc xmm0, xmm7 |
582 | 582 |
583 i = 5 | 583 i = 6 |
584 WHILE i LT rnds | 584 WHILE i LT rnds |
585 movdqu xmm7, [i*16 + ctx] | 585 movdqu xmm8, [i*16 + ctx] |
586 aesenc xmm0, xmm7 | 586 aesenc xmm0, xmm8 |
587 i = i+1 | 587 i = i+1 |
588 ENDM | 588 ENDM |
589 movdqu xmm7, [rnds*16 + ctx] | 589 movdqu xmm8, [rnds*16 + ctx] |
590 aesenclast xmm0, xmm7 | 590 aesenclast xmm0, xmm8 |
591 | 591 |
592 movdqu [output], xmm0 | 592 movdqu [output], xmm0 |
593 | 593 |
594 lea input, [1*16 + input] | 594 lea input, [1*16 + input] |
595 lea output, [1*16 + output] | 595 lea output, [1*16 + output] |
596 sub inputLen, 1*16 | 596 sub inputLen, 1*16 |
597 jmp loop1 | 597 jmp loop1 |
598 | 598 |
599 bail: | 599 bail: |
600 movdqu [-32+ctx], xmm0 | 600 movdqu [-32+ctx], xmm0 |
601 | 601 |
602 xor eax, eax | 602 xor rax, rax |
603 pop inputLen | 603 |
| 604 movdqu xmm6, [rsp + 0*16] |
| 605 movdqu xmm7, [rsp + 1*16] |
| 606 movdqu xmm8, [rsp + 2*16] |
| 607 add rsp, 3*16 |
604 ret | 608 ret |
605 | 609 |
606 ENDM | 610 ENDM |
607 | 611 |
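
CBC encryption cannot be widened the way ECB is: each plaintext block must be XORed with the previous ciphertext block before it can enter the cipher, so the macro keeps a one-block loop and instead caches round keys 0-5 in xmm2-xmm7, with the running chaining value living at offset -32 from the expanded key. A hedged C-intrinsics sketch of the same single-block chain (iv is the value the assembly keeps in xmm0):

    #include <stddef.h>
    #include <wmmintrin.h>

    static __m128i cbc_encrypt(const __m128i *ks, int rnds, __m128i iv,
                               const unsigned char *in, unsigned char *out,
                               size_t len)
    {
        while (len >= 16) {
            /* plaintext ^ round key 0, then chain with the previous ciphertext */
            __m128i b = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),
                                      _mm_loadu_si128(ks));
            b = _mm_xor_si128(b, iv);
            for (int i = 1; i < rnds; i++)
                b = _mm_aesenc_si128(b, _mm_loadu_si128(ks + i));
            iv = _mm_aesenclast_si128(b, _mm_loadu_si128(ks + rnds));
            _mm_storeu_si128((__m128i *)out, iv);
            in += 16;
            out += 16;
            len -= 16;
        }
        return iv;   /* written back to the context, as at the bail: label */
    }
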
608 gen_aes_cbc_dec_func MACRO rnds | 612 gen_aes_cbc_dec_func MACRO rnds |
609 | 613 |
610 LOCAL loop7 | 614 LOCAL loop8 |
611 LOCAL loop1 | 615 LOCAL loop1 |
612 LOCAL dec1 | 616 LOCAL dec1 |
613 LOCAL bail | 617 LOCAL bail |
614 | 618 |
615 push inputLen | 619 mov input, [rsp + 1*8 + 8*4] |
| 620 mov inputLen, [rsp + 1*8 + 8*5] |
616 | 621 |
617 mov ctx, [esp + 2*4 + 0*4] | 622 sub rsp, 3*16 |
618 mov output, [esp + 2*4 + 1*4] | |
619 mov input, [esp + 2*4 + 4*4] | |
620 mov inputLen, [esp + 2*4 + 5*4] | |
621 | 623 |
622 lea ctx, [44+ctx] | 624 movdqu [rsp + 0*16], xmm6 |
| 625 movdqu [rsp + 1*16], xmm7 |
| 626 movdqu [rsp + 2*16], xmm8 |
623 | 627 |
624 loop7: | 628 lea ctx, [48+ctx] |
625 cmp inputLen, 7*16 | 629 |
| 630 loop8: |
| 631 cmp inputLen, 8*16 |
626 jb dec1 | 632 jb dec1 |
627 | 633 |
628 movdqu xmm0, [0*16 + input] | 634 movdqu xmm0, [0*16 + input] |
629 movdqu xmm1, [1*16 + input] | 635 movdqu xmm1, [1*16 + input] |
630 movdqu xmm2, [2*16 + input] | 636 movdqu xmm2, [2*16 + input] |
631 movdqu xmm3, [3*16 + input] | 637 movdqu xmm3, [3*16 + input] |
632 movdqu xmm4, [4*16 + input] | 638 movdqu xmm4, [4*16 + input] |
633 movdqu xmm5, [5*16 + input] | 639 movdqu xmm5, [5*16 + input] |
634 movdqu xmm6, [6*16 + input] | 640 movdqu xmm6, [6*16 + input] |
| 641 movdqu xmm7, [7*16 + input] |
635 | 642 |
636 movdqu xmm7, [0*16 + ctx] | 643 movdqu xmm8, [0*16 + ctx] |
637 pxor xmm0, xmm7 | 644 pxor xmm0, xmm8 |
638 pxor xmm1, xmm7 | 645 pxor xmm1, xmm8 |
639 pxor xmm2, xmm7 | 646 pxor xmm2, xmm8 |
640 pxor xmm3, xmm7 | 647 pxor xmm3, xmm8 |
641 pxor xmm4, xmm7 | 648 pxor xmm4, xmm8 |
642 pxor xmm5, xmm7 | 649 pxor xmm5, xmm8 |
643 pxor xmm6, xmm7 | 650 pxor xmm6, xmm8 |
| 651 pxor xmm7, xmm8 |
644 | 652 |
645 i = 1 | 653 i = 1 |
646 WHILE i LT rnds | 654 WHILE i LT rnds |
647 aes_dec_rnd i | 655 aes_dec_rnd i |
648 i = i+1 | 656 i = i+1 |
649 ENDM | 657 ENDM |
650 aes_dec_last_rnd rnds | 658 aes_dec_last_rnd rnds |
651 | 659 |
652 movdqu xmm7, [-32 + ctx] | 660 movdqu xmm8, [-32 + ctx] |
653 pxor xmm0, xmm7 | 661 pxor xmm0, xmm8 |
654 movdqu xmm7, [0*16 + input] | 662 movdqu xmm8, [0*16 + input] |
655 pxor xmm1, xmm7 | 663 pxor xmm1, xmm8 |
656 movdqu xmm7, [1*16 + input] | 664 movdqu xmm8, [1*16 + input] |
657 pxor xmm2, xmm7 | 665 pxor xmm2, xmm8 |
658 movdqu xmm7, [2*16 + input] | 666 movdqu xmm8, [2*16 + input] |
659 pxor xmm3, xmm7 | 667 pxor xmm3, xmm8 |
660 movdqu xmm7, [3*16 + input] | 668 movdqu xmm8, [3*16 + input] |
661 pxor xmm4, xmm7 | 669 pxor xmm4, xmm8 |
662 movdqu xmm7, [4*16 + input] | 670 movdqu xmm8, [4*16 + input] |
663 pxor xmm5, xmm7 | 671 pxor xmm5, xmm8 |
664 movdqu xmm7, [5*16 + input] | 672 movdqu xmm8, [5*16 + input] |
665 pxor xmm6, xmm7 | 673 pxor xmm6, xmm8 |
666 movdqu xmm7, [6*16 + input] | 674 movdqu xmm8, [6*16 + input] |
| 675 pxor xmm7, xmm8 |
| 676 movdqu xmm8, [7*16 + input] |
667 | 677 |
668 movdqu [0*16 + output], xmm0 | 678 movdqu [0*16 + output], xmm0 |
669 movdqu [1*16 + output], xmm1 | 679 movdqu [1*16 + output], xmm1 |
670 movdqu [2*16 + output], xmm2 | 680 movdqu [2*16 + output], xmm2 |
671 movdqu [3*16 + output], xmm3 | 681 movdqu [3*16 + output], xmm3 |
672 movdqu [4*16 + output], xmm4 | 682 movdqu [4*16 + output], xmm4 |
673 movdqu [5*16 + output], xmm5 | 683 movdqu [5*16 + output], xmm5 |
674 movdqu [6*16 + output], xmm6 | 684 movdqu [6*16 + output], xmm6 |
675 movdqu [-32 + ctx], xmm7 | 685 movdqu [7*16 + output], xmm7 |
| 686 movdqu [-32 + ctx], xmm8 |
676 | 687 |
677 lea input, [7*16 + input] | 688 lea input, [8*16 + input] |
678 lea output, [7*16 + output] | 689 lea output, [8*16 + output] |
679 sub inputLen, 7*16 | 690 sub inputLen, 8*16 |
680 jmp loop7 | 691 jmp loop8 |
681 dec1: | 692 dec1: |
682 | 693 |
683 movdqu xmm3, [-32 + ctx] | 694 movdqu xmm3, [-32 + ctx] |
684 | 695 |
685 loop1: | 696 loop1: |
686 cmp inputLen, 1*16 | 697 cmp inputLen, 1*16 |
687 jb bail | 698 jb bail |
688 | 699 |
689 movdqu xmm0, [input] | 700 movdqu xmm0, [input] |
690 movdqa xmm4, xmm0 | 701 movdqa xmm4, xmm0 |
(...skipping 13 matching lines...)
704 movdqu [output], xmm3 | 715 movdqu [output], xmm3 |
705 movdqa xmm3, xmm4 | 716 movdqa xmm3, xmm4 |
706 | 717 |
707 lea input, [1*16 + input] | 718 lea input, [1*16 + input] |
708 lea output, [1*16 + output] | 719 lea output, [1*16 + output] |
709 sub inputLen, 1*16 | 720 sub inputLen, 1*16 |
710 jmp loop1 | 721 jmp loop1 |
711 | 722 |
712 bail: | 723 bail: |
713 movdqu [-32 + ctx], xmm3 | 724 movdqu [-32 + ctx], xmm3 |
714 xor eax, eax | 725 xor rax, rax |
715 pop inputLen | 726 |
| 727 movdqu xmm6, [rsp + 0*16] |
| 728 movdqu xmm7, [rsp + 1*16] |
| 729 movdqu xmm8, [rsp + 2*16] |
| 730 add rsp, 3*16 |
716 ret | 731 ret |
717 ENDM | 732 ENDM |
718 | 733 |
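
CBC decryption, unlike encryption, is parallel-friendly: every output block needs only the corresponding ciphertext block and the one before it, both already in hand, so the macro decrypts eight blocks at once and then XORs in the shifted ciphertext stream, reloading the ciphertext through xmm8 and saving the last block as the next chaining value. A hedged C-intrinsics sketch of one eight-block group (dks is the inverted key schedule, iv the incoming chaining value):

    #include <wmmintrin.h>

    static __m128i cbc_decrypt_8wide(const __m128i *dks, int rnds, __m128i iv,
                                     const unsigned char *in, unsigned char *out)
    {
        __m128i b[8], c[8];
        for (int j = 0; j < 8; j++) {        /* load and whiten 8 ciphertext blocks */
            c[j] = _mm_loadu_si128((const __m128i *)in + j);
            b[j] = _mm_xor_si128(c[j], _mm_loadu_si128(dks));
        }
        for (int i = 1; i < rnds; i++) {
            __m128i rk = _mm_loadu_si128(dks + i);
            for (int j = 0; j < 8; j++)
                b[j] = _mm_aesdec_si128(b[j], rk);
        }
        for (int j = 0; j < 8; j++) {
            b[j] = _mm_aesdeclast_si128(b[j], _mm_loadu_si128(dks + rnds));
            b[j] = _mm_xor_si128(b[j], j ? c[j - 1] : iv);   /* undo the CBC chaining */
            _mm_storeu_si128((__m128i *)out + j, b[j]);
        }
        return c[7];                         /* chaining value for the next group */
    }
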
719 ALIGN 16 | |
720 intel_aes_encrypt_cbc_128 PROC | 734 intel_aes_encrypt_cbc_128 PROC |
721 gen_aes_cbc_enc_func 10 | 735 gen_aes_cbc_enc_func 10 |
722 intel_aes_encrypt_cbc_128 ENDP | 736 intel_aes_encrypt_cbc_128 ENDP |
723 | 737 |
724 ALIGN 16 | |
725 intel_aes_encrypt_cbc_192 PROC | 738 intel_aes_encrypt_cbc_192 PROC |
726 gen_aes_cbc_enc_func 12 | 739 gen_aes_cbc_enc_func 12 |
727 intel_aes_encrypt_cbc_192 ENDP | 740 intel_aes_encrypt_cbc_192 ENDP |
728 | 741 |
729 ALIGN 16 | |
730 intel_aes_encrypt_cbc_256 PROC | 742 intel_aes_encrypt_cbc_256 PROC |
731 gen_aes_cbc_enc_func 14 | 743 gen_aes_cbc_enc_func 14 |
732 intel_aes_encrypt_cbc_256 ENDP | 744 intel_aes_encrypt_cbc_256 ENDP |
733 | 745 |
734 ALIGN 16 | |
735 intel_aes_decrypt_cbc_128 PROC | 746 intel_aes_decrypt_cbc_128 PROC |
736 gen_aes_cbc_dec_func 10 | 747 gen_aes_cbc_dec_func 10 |
737 intel_aes_decrypt_cbc_128 ENDP | 748 intel_aes_decrypt_cbc_128 ENDP |
738 | 749 |
739 ALIGN 16 | |
740 intel_aes_decrypt_cbc_192 PROC | 750 intel_aes_decrypt_cbc_192 PROC |
741 gen_aes_cbc_dec_func 12 | 751 gen_aes_cbc_dec_func 12 |
742 intel_aes_decrypt_cbc_192 ENDP | 752 intel_aes_decrypt_cbc_192 ENDP |
743 | 753 |
744 ALIGN 16 | |
745 intel_aes_decrypt_cbc_256 PROC | 754 intel_aes_decrypt_cbc_256 PROC |
746 gen_aes_cbc_dec_func 14 | 755 gen_aes_cbc_dec_func 14 |
747 intel_aes_decrypt_cbc_256 ENDP | 756 intel_aes_decrypt_cbc_256 ENDP |
748 | 757 |
749 | 758 |
750 | 759 |
751 ctrCtx textequ <esi> | 760 ctrCtx textequ <r10> |
752 CTR textequ <ebx> | 761 CTR textequ <r11d> |
| 762 CTRSave textequ <eax> |
753 | 763 |
754 gen_aes_ctr_func MACRO rnds | 764 gen_aes_ctr_func MACRO rnds |
755 | 765 |
756 LOCAL loop7 | 766 LOCAL loop8 |
757 LOCAL loop1 | 767 LOCAL loop1 |
758 LOCAL enc1 | 768 LOCAL enc1 |
759 LOCAL bail | 769 LOCAL bail |
760 | 770 |
761 push inputLen | 771 mov input, [rsp + 8*1 + 4*8] |
762 push ctrCtx | 772 mov inputLen, [rsp + 8*1 + 5*8] |
763 push CTR | |
764 push ebp | |
765 | 773 |
766 mov ctrCtx, [esp + 4*5 + 0*4] | 774 mov ctrCtx, ctx |
767 mov output, [esp + 4*5 + 1*4] | 775 mov ctx, [8+ctrCtx] |
768 mov input, [esp + 4*5 + 4*4] | 776 lea ctx, [48+ctx] |
769 mov inputLen, [esp + 4*5 + 5*4] | |
770 | 777 |
771 mov ctx, [4+ctrCtx] | 778 sub rsp, 3*16 |
772 lea ctx, [44+ctx] | 779 movdqu [rsp + 0*16], xmm6 |
| 780 movdqu [rsp + 1*16], xmm7 |
| 781 movdqu [rsp + 2*16], xmm8 |
773 | 782 |
774 mov ebp, esp | |
775 sub esp, 7*16 | |
776 and esp, -16 | |
777 | 783 |
778 movdqu xmm0, [8+ctrCtx] | 784 push rbp |
779 mov ctrCtx, [ctrCtx + 8 + 3*4] | 785 mov rbp, rsp |
780 bswap ctrCtx | 786 sub rsp, 8*16 |
| 787 and rsp, -16 |
| 788 |
| 789 |
| 790 movdqu xmm0, [16+ctrCtx] |
| 791 mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4] |
| 792 bswap CTRSave |
781 movdqu xmm1, [ctx + 0*16] | 793 movdqu xmm1, [ctx + 0*16] |
782 | 794 |
783 pxor xmm0, xmm1 | 795 pxor xmm0, xmm1 |
784 | 796 |
785 movdqa [esp + 0*16], xmm0 | 797 movdqa [rsp + 0*16], xmm0 |
786 movdqa [esp + 1*16], xmm0 | 798 movdqa [rsp + 1*16], xmm0 |
787 movdqa [esp + 2*16], xmm0 | 799 movdqa [rsp + 2*16], xmm0 |
788 movdqa [esp + 3*16], xmm0 | 800 movdqa [rsp + 3*16], xmm0 |
789 movdqa [esp + 4*16], xmm0 | 801 movdqa [rsp + 4*16], xmm0 |
790 movdqa [esp + 5*16], xmm0 | 802 movdqa [rsp + 5*16], xmm0 |
791 movdqa [esp + 6*16], xmm0 | 803 movdqa [rsp + 6*16], xmm0 |
| 804 movdqa [rsp + 7*16], xmm0 |
792 | 805 |
793 inc ctrCtx | 806 inc CTRSave |
794 mov CTR, ctrCtx | 807 mov CTR, CTRSave |
795 bswap CTR | 808 bswap CTR |
796 xor CTR, [ctx + 3*4] | 809 xor CTR, DWORD PTR [ctx + 3*4] |
797 mov [esp + 1*16 + 3*4], CTR | 810 mov DWORD PTR [rsp + 1*16 + 3*4], CTR |
798 | 811 |
799 inc ctrCtx | 812 inc CTRSave |
800 mov CTR, ctrCtx | 813 mov CTR, CTRSave |
801 bswap CTR | 814 bswap CTR |
802 xor CTR, [ctx + 3*4] | 815 xor CTR, DWORD PTR [ctx + 3*4] |
803 mov [esp + 2*16 + 3*4], CTR | 816 mov DWORD PTR [rsp + 2*16 + 3*4], CTR |
804 | 817 |
805 inc ctrCtx | 818 inc CTRSave |
806 mov CTR, ctrCtx | 819 mov CTR, CTRSave |
807 bswap CTR | 820 bswap CTR |
808 xor CTR, [ctx + 3*4] | 821 xor CTR, DWORD PTR [ctx + 3*4] |
809 mov [esp + 3*16 + 3*4], CTR | 822 mov DWORD PTR [rsp + 3*16 + 3*4], CTR |
810 | 823 |
811 inc ctrCtx | 824 inc CTRSave |
812 mov CTR, ctrCtx | 825 mov CTR, CTRSave |
813 bswap CTR | 826 bswap CTR |
814 xor CTR, [ctx + 3*4] | 827 xor CTR, DWORD PTR [ctx + 3*4] |
815 mov [esp + 4*16 + 3*4], CTR | 828 mov DWORD PTR [rsp + 4*16 + 3*4], CTR |
816 | 829 |
817 inc ctrCtx | 830 inc CTRSave |
818 mov CTR, ctrCtx | 831 mov CTR, CTRSave |
819 bswap CTR | 832 bswap CTR |
820 xor CTR, [ctx + 3*4] | 833 xor CTR, DWORD PTR [ctx + 3*4] |
821 mov [esp + 5*16 + 3*4], CTR | 834 mov DWORD PTR [rsp + 5*16 + 3*4], CTR |
822 | 835 |
823 inc ctrCtx | 836 inc CTRSave |
824 mov CTR, ctrCtx | 837 mov CTR, CTRSave |
825 bswap CTR | 838 bswap CTR |
826 xor CTR, [ctx + 3*4] | 839 xor CTR, DWORD PTR [ctx + 3*4] |
827 mov [esp + 6*16 + 3*4], CTR | 840 mov DWORD PTR [rsp + 6*16 + 3*4], CTR |
| 841 |
| 842 inc CTRSave |
| 843 mov CTR, CTRSave |
| 844 bswap CTR |
| 845 xor CTR, DWORD PTR [ctx + 3*4] |
| 846 mov DWORD PTR [rsp + 7*16 + 3*4], CTR |
828 | 847 |
829 | 848 |
830 loop7: | 849 loop8: |
831 cmp inputLen, 7*16 | 850 cmp inputLen, 8*16 |
832 jb loop1 | 851 jb loop1 |
833 | 852 |
834 movdqu xmm0, [0*16 + esp] | 853 movdqu xmm0, [0*16 + rsp] |
835 movdqu xmm1, [1*16 + esp] | 854 movdqu xmm1, [1*16 + rsp] |
836 movdqu xmm2, [2*16 + esp] | 855 movdqu xmm2, [2*16 + rsp] |
837 movdqu xmm3, [3*16 + esp] | 856 movdqu xmm3, [3*16 + rsp] |
838 movdqu xmm4, [4*16 + esp] | 857 movdqu xmm4, [4*16 + rsp] |
839 movdqu xmm5, [5*16 + esp] | 858 movdqu xmm5, [5*16 + rsp] |
840 movdqu xmm6, [6*16 + esp] | 859 movdqu xmm6, [6*16 + rsp] |
| 860 movdqu xmm7, [7*16 + rsp] |
841 | 861 |
842 i = 1 | 862 i = 1 |
843 WHILE i LE 7 | 863 WHILE i LE 8 |
844 aes_rnd i | 864 aes_rnd i |
845 | 865 |
846 inc ctrCtx | 866 inc CTRSave |
847 mov CTR, ctrCtx | 867 mov CTR, CTRSave |
848 bswap CTR | 868 bswap CTR |
849 xor CTR, [ctx + 3*4] | 869 xor CTR, DWORD PTR [ctx + 3*4] |
850 mov [esp + (i-1)*16 + 3*4], CTR | 870 mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR |
851 | 871 |
852 i = i+1 | 872 i = i+1 |
853 ENDM | 873 ENDM |
854 WHILE i LT rnds | 874 WHILE i LT rnds |
855 aes_rnd i | 875 aes_rnd i |
856 i = i+1 | 876 i = i+1 |
857 ENDM | 877 ENDM |
858 aes_last_rnd rnds | 878 aes_last_rnd rnds |
859 | 879 |
860 movdqu xmm7, [0*16 + input] | 880 movdqu xmm8, [0*16 + input] |
861 pxor xmm0, xmm7 | 881 pxor xmm0, xmm8 |
862 movdqu xmm7, [1*16 + input] | 882 movdqu xmm8, [1*16 + input] |
863 pxor xmm1, xmm7 | 883 pxor xmm1, xmm8 |
864 movdqu xmm7, [2*16 + input] | 884 movdqu xmm8, [2*16 + input] |
865 pxor xmm2, xmm7 | 885 pxor xmm2, xmm8 |
866 movdqu xmm7, [3*16 + input] | 886 movdqu xmm8, [3*16 + input] |
867 pxor xmm3, xmm7 | 887 pxor xmm3, xmm8 |
868 movdqu xmm7, [4*16 + input] | 888 movdqu xmm8, [4*16 + input] |
869 pxor xmm4, xmm7 | 889 pxor xmm4, xmm8 |
870 movdqu xmm7, [5*16 + input] | 890 movdqu xmm8, [5*16 + input] |
871 pxor xmm5, xmm7 | 891 pxor xmm5, xmm8 |
872 movdqu xmm7, [6*16 + input] | 892 movdqu xmm8, [6*16 + input] |
873 pxor xmm6, xmm7 | 893 pxor xmm6, xmm8 |
| 894 movdqu xmm8, [7*16 + input] |
| 895 pxor xmm7, xmm8 |
874 | 896 |
875 movdqu [0*16 + output], xmm0 | 897 movdqu [0*16 + output], xmm0 |
876 movdqu [1*16 + output], xmm1 | 898 movdqu [1*16 + output], xmm1 |
877 movdqu [2*16 + output], xmm2 | 899 movdqu [2*16 + output], xmm2 |
878 movdqu [3*16 + output], xmm3 | 900 movdqu [3*16 + output], xmm3 |
879 movdqu [4*16 + output], xmm4 | 901 movdqu [4*16 + output], xmm4 |
880 movdqu [5*16 + output], xmm5 | 902 movdqu [5*16 + output], xmm5 |
881 movdqu [6*16 + output], xmm6 | 903 movdqu [6*16 + output], xmm6 |
| 904 movdqu [7*16 + output], xmm7 |
882 | 905 |
883 lea input, [7*16 + input] | 906 lea input, [8*16 + input] |
884 lea output, [7*16 + output] | 907 lea output, [8*16 + output] |
885 sub inputLen, 7*16 | 908 sub inputLen, 8*16 |
886 jmp loop7 | 909 jmp loop8 |
887 | 910 |
888 | 911 |
889 loop1: | 912 loop1: |
890 cmp inputLen, 1*16 | 913 cmp inputLen, 1*16 |
891 jb bail | 914 jb bail |
892 | 915 |
893 movdqu xmm0, [esp] | 916 movdqu xmm0, [rsp] |
894 add esp, 16 | 917 add rsp, 16 |
895 | 918 |
896 i = 1 | 919 i = 1 |
897 WHILE i LT rnds | 920 WHILE i LT rnds |
898 movdqu xmm7, [i*16 + ctx] | 921 movdqu xmm7, [i*16 + ctx] |
899 aesenc xmm0, xmm7 | 922 aesenc xmm0, xmm7 |
900 i = i+1 | 923 i = i+1 |
901 ENDM | 924 ENDM |
902 movdqu xmm7, [rnds*16 + ctx] | 925 movdqu xmm7, [rnds*16 + ctx] |
903 aesenclast xmm0, xmm7 | 926 aesenclast xmm0, xmm7 |
904 | 927 |
905 movdqu xmm7, [input] | 928 movdqu xmm7, [input] |
906 pxor xmm0, xmm7 | 929 pxor xmm0, xmm7 |
907 movdqu [output], xmm0 | 930 movdqu [output], xmm0 |
908 | 931 |
909 lea input, [1*16 + input] | 932 lea input, [1*16 + input] |
910 lea output, [1*16 + output] | 933 lea output, [1*16 + output] |
911 sub inputLen, 1*16 | 934 sub inputLen, 1*16 |
912 jmp loop1 | 935 jmp loop1 |
913 | 936 |
914 bail: | 937 bail: |
915 | 938 |
916 mov ctrCtx, [ebp + 4*5 + 0*4] | 939 movdqu xmm0, [rsp] |
917 movdqu xmm0, [esp] | |
918 movdqu xmm1, [ctx + 0*16] | 940 movdqu xmm1, [ctx + 0*16] |
919 pxor xmm0, xmm1 | 941 pxor xmm0, xmm1 |
920 movdqu [8+ctrCtx], xmm0 | 942 movdqu [16+ctrCtx], xmm0 |
921 | 943 |
922 | 944 |
923 xor eax, eax | 945 xor rax, rax |
924 mov esp, ebp | 946 mov rsp, rbp |
925 pop ebp | 947 pop rbp |
926 pop CTR | 948 |
927 pop ctrCtx | 949 movdqu xmm6, [rsp + 0*16] |
928 pop inputLen | 950 movdqu xmm7, [rsp + 1*16] |
| 951 movdqu xmm8, [rsp + 2*16] |
| 952 add rsp, 3*16 |
| 953 |
929 ret | 954 ret |
930 ENDM | 955 ENDM |
931 | 956 |
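
The counter handling is the subtle part of this macro. The counter block (now at offset 16 of the CTR context, after pointer-sized leading members on x64) is XORed once with round key 0 and replicated into eight 16-byte stack slots; advancing the counter then only requires rewriting the last 32-bit word of a slot with the byte-swapped counter XORed against the last word of round key 0, which is what the repeated inc/bswap/xor/mov sequences and the patch inside the round loop do. A hedged C sketch of that preparation step (the in-loop refill follows the same pattern):

    #include <stdint.h>
    #include <string.h>
    #include <wmmintrin.h>

    static uint32_t bswap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
               ((v << 8) & 0x00ff0000u) | (v << 24);
    }

    /* Fill 8 pre-whitened counter blocks; *ctr32 holds the host-order counter
     * taken from the last (big-endian) word of the counter block. */
    static void ctr_prepare_blocks(__m128i blk[8], const unsigned char counter[16],
                                   const __m128i *ks, uint32_t *ctr32)
    {
        __m128i rk0 = _mm_loadu_si128(ks);
        __m128i base = _mm_xor_si128(_mm_loadu_si128((const __m128i *)counter), rk0);
        uint32_t rk0_last;
        memcpy(&rk0_last, (const unsigned char *)ks + 12, sizeof(rk0_last));

        blk[0] = base;
        for (int j = 1; j < 8; j++) {
            uint32_t patched = bswap32(++*ctr32) ^ rk0_last;
            blk[j] = base;
            memcpy((unsigned char *)&blk[j] + 12, &patched, sizeof(patched));
        }
    }
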
932 | 957 |
933 ALIGN 16 | |
934 intel_aes_encrypt_ctr_128 PROC | 958 intel_aes_encrypt_ctr_128 PROC |
935 gen_aes_ctr_func 10 | 959 gen_aes_ctr_func 10 |
936 intel_aes_encrypt_ctr_128 ENDP | 960 intel_aes_encrypt_ctr_128 ENDP |
937 | 961 |
938 ALIGN 16 | |
939 intel_aes_encrypt_ctr_192 PROC | 962 intel_aes_encrypt_ctr_192 PROC |
940 gen_aes_ctr_func 12 | 963 gen_aes_ctr_func 12 |
941 intel_aes_encrypt_ctr_192 ENDP | 964 intel_aes_encrypt_ctr_192 ENDP |
942 | 965 |
943 ALIGN 16 | |
944 intel_aes_encrypt_ctr_256 PROC | 966 intel_aes_encrypt_ctr_256 PROC |
945 gen_aes_ctr_func 14 | 967 gen_aes_ctr_func 14 |
946 intel_aes_encrypt_ctr_256 ENDP | 968 intel_aes_encrypt_ctr_256 ENDP |
947 | 969 |
948 | 970 |
949 END | 971 END |