Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(152)

Side by Side Diff: nss/lib/freebl/intel-gcm-x86-masm.asm

Issue 214183004: Implement AES in different modes of operation, using AES-NI and (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/nss.git@master
Patch Set: Remove an assertion. ctr->cipher doesn't set *outlen. Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « nss/lib/freebl/intel-gcm-x64-masm.asm ('k') | nss/lib/freebl/rijndael.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ; LICENSE:
2 ; This submission to NSS is to be made available under the terms of the
3 ; Mozilla Public License, v. 2.0. You can obtain one at http:
4 ; //mozilla.org/MPL/2.0/.
5 ;###############################################################################
6 ; Copyright(c) 2014, Intel Corp.
7 ; Developers and authors:
8 ; Shay Gueron and Vlad Krasnov
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
10 ; Please send feedback directly to crypto.feedback.alias@intel.com
11
; AES-GCM bulk routines for 32-bit x86 (MASM syntax), using AES-NI and
; PCLMULQDQ. All procedures below use the C (cdecl) calling convention.
12
13 .MODEL FLAT, C
14 .XMM
15
16 .DATA
17 ALIGN 16
18 Lone dq 1,0 ; 128-bit constant 1  (counter increment)
19 Ltwo dq 2,0 ; 128-bit constant 2  (counter increment)
20 Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; vpshufb control: reverse all 16 bytes
21 Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh ; NOTE(review): not referenced anywhere in this file
22 Lpoly dq 01h, 0c200000000000000h ; GCM reduction polynomial (bit-reflected form)
23
24 .CODE
27 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
; GF(2^128) multiply for GHASH: DST = SRC1 * SRC2, reduced modulo the GCM
; polynomial (Lpoly), all operands in bit-reflected representation.
; Uses one Karatsuba 128x128 carry-less multiply (3 vpclmulqdq) followed by
; a two-step folding reduction. TMP1-TMP4 are clobbered; DST may alias SRC1.
28 vpclmulqdq TMP1, SRC2, SRC1, 0h ; TMP1 = a0*b0 (low-half product)
29 vpclmulqdq TMP4, SRC2, SRC1, 011h ; TMP4 = a1*b1 (high-half product)
30
31 vpshufd TMP2, SRC2, 78 ; swap the two qwords of SRC2
32 vpshufd TMP3, SRC1, 78 ; swap the two qwords of SRC1
33 vpxor TMP2, TMP2, SRC2 ; TMP2 = b0^b1 in both halves
34 vpxor TMP3, TMP3, SRC1 ; TMP3 = a0^a1 in both halves
35
36 vpclmulqdq TMP2, TMP2, TMP3, 0h ; (a0^a1)*(b0^b1)
37 vpxor TMP2, TMP2, TMP1
38 vpxor TMP2, TMP2, TMP4 ; TMP2 = middle term a0*b1 ^ a1*b0
39
40 vpslldq TMP3, TMP2, 8 ; split middle term across the
41 vpsrldq TMP2, TMP2, 8 ; low/high 128-bit halves
42
43 vpxor TMP1, TMP1, TMP3 ; 256-bit product now in TMP4:TMP1
44 vpxor TMP4, TMP4, TMP2
45
46 vpclmulqdq TMP2, TMP1, [Lpoly], 010h ; reduction, first fold
47 vpshufd TMP3, TMP1, 78
48 vpxor TMP1, TMP2, TMP3
49
50 vpclmulqdq TMP2, TMP1, [Lpoly], 010h ; reduction, second fold
51 vpshufd TMP3, TMP1, 78
52 vpxor TMP1, TMP2, TMP3
53
54 vpxor DST, TMP1, TMP4 ; combine with high half
55
56 ENDM
57
58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
59 ;
60 ; Generates the final GCM tag
61 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
62 ; unsigned char *Tp,
63 ; unsigned int Mlen,
64 ; unsigned int Alen,
65 ; unsigned char* X0,
66 ; unsigned char* TAG);
67 ;
68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
69
70 ALIGN 16
71 intel_aes_gcmTAG PROC
; Produce the final GCM authentication tag:
;   TAG = byteswap( GHASH(T xor lengths-block) ) xor X0
; where T is the running GHASH state at *Tp, the lengths block encodes
; Mlen and Alen in BITS, and X0 is the encrypted initial counter block.
; cdecl args: Htbl, Tp, Mlen, Alen, X0, TAG (see banner above).
72
73 Htbl textequ <eax>
74 Tp textequ <ecx>
75 X0 textequ <edx>
76 TAG textequ <ebx>
77
78 T textequ <xmm0>
79 TMP0 textequ <xmm1>
80
81 push ebx ; ebx (TAG) is callee-saved under cdecl
82
83 mov Htbl, [esp + 2*4 + 0*4] ; 2*4 = return address + saved ebx
84 mov Tp, [esp + 2*4 + 1*4]
85 mov X0, [esp + 2*4 + 4*4]
86 mov TAG, [esp + 2*4 + 5*4]
87
88 vzeroupper
89 vmovdqu T, XMMWORD PTR[Tp] ; running GHASH state
90
91 vpxor TMP0, TMP0, TMP0
92 vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0 ; Mlen -> low qword
93 vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2 ; Alen -> high qword
94 vpsllq TMP0, TMP0, 3 ; byte counts -> bit counts
95
96 vpxor T, T, TMP0 ; fold lengths block into hash
97 vmovdqu TMP0, XMMWORD PTR[Htbl] ; H (first table entry)
98 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
99
100 vpshufb T, T, [Lbswap_mask] ; back to wire byte order
101 vpxor T, T, [X0] ; xor with E(K, counter0)
102 vmovdqu XMMWORD PTR[TAG], T
103 vzeroupper
104
105 pop ebx
106
107 ret
108
109 intel_aes_gcmTAG ENDP
110
111 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
112 ;
113 ; Generates the H table
114 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
115 ;
116 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
117
118 ALIGN 16
119 intel_aes_gcmINIT PROC
; Build the GHASH key table:
;   Htbl[0..7]        = H^1 .. H^8 (bit-reflected), where H = AES-ENC(K, 0^128)
;   Htbl[8*16 + i*16] = (lo(H^i) ^ hi(H^i)) duplicated, the precomputed
;                       Karatsuba middle-term operand for each power.
; cdecl args: Htbl, KS (expanded key schedule), NR (number of rounds).
120
121 Htbl textequ <eax>
122 KS textequ <ecx>
123 NR textequ <edx>
124
125 T textequ <xmm0>
126 TMP0 textequ <xmm1>
127
128 mov Htbl, [esp + 4*1 + 0*4] ; 4 = return address (no pushes here)
129 mov KS, [esp + 4*1 + 1*4]
130 mov NR, [esp + 4*1 + 2*4]
131
132 vzeroupper
133 ; AES-ENC(0): encrypt the all-zero block to obtain H
134 vmovdqu T, XMMWORD PTR[KS] ; whitening: T = 0 ^ round key 0
135 lea KS, [16 + KS]
136 dec NR
137 Lenc_loop:
138 vaesenc T, T, [KS]
139 lea KS, [16 + KS]
140 dec NR
141 jnz Lenc_loop
142
143 vaesenclast T, T, [KS]
144 vpshufb T, T, [Lbswap_mask] ; into bit-reflected representation
145
146 ;Calculate H` = GFMUL(H, 2)  (doubling in the reflected domain)
147 vpsrad xmm3, T, 31
148 vpshufd xmm3, xmm3, 0ffh ; broadcast sign of the top bit
149 vpand xmm5, xmm3, [Lpoly] ; conditional reduction term
150 vpsrld xmm3, T, 31
151 vpslld xmm4, T, 1
152 vpslldq xmm3, xmm3, 4 ; carry bits across dword lanes
153 vpxor T, xmm4, xmm3 ; T = H << 1
154 vpxor T, T, xmm5 ; reduce if the top bit was set
155
156 vmovdqu TMP0, T ; keep H for the repeated multiplies
157 vmovdqu XMMWORD PTR[Htbl + 0*16], T
158
159 vpshufd xmm2, T, 78
160 vpxor xmm2, xmm2, T ; lo^hi for Karatsuba middle term
161 vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
162
163 i = 1
164 WHILE i LT 8
; assembly-time unrolled loop: Htbl[i] = Htbl[i-1] * H  (H^2 .. H^8)
165 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
166 vmovdqu XMMWORD PTR[Htbl + i*16], T
167 vpshufd xmm2, T, 78
168 vpxor xmm2, xmm2, T
169 vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
170 i = i+1
171 ENDM
172 vzeroupper
173 ret
174 intel_aes_gcmINIT ENDP
175
176
177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
178 ;
179 ; Authenticate only
180 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
181 ;
182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
183
184 ALIGN 16
185 intel_aes_gcmAAD PROC
; Hash additional authenticated data (AAD) into the GHASH state at *Tp.
;   void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD,
;                         unsigned int Alen, unsigned char *Tp);
; Strategy: hash the first (Alen/16 mod 8) blocks individually against the
; appropriate powers of H, then process 8 blocks per iteration with an
; aggregated Karatsuba multiply, deferring reduction across the iteration.
186
187 Htbl textequ <eax>
188 inp textequ <ecx>
189 len textequ <edx>
190 Tp textequ <ebx>
191 hlp0 textequ <esi>
192
193 DATA textequ <xmm0>
194 T textequ <xmm1>
195 TMP0 textequ <xmm2>
196 TMP1 textequ <xmm3>
197 TMP2 textequ <xmm4>
198 TMP3 textequ <xmm5>
199 TMP4 textequ <xmm6>
200 Xhi textequ <xmm7>
201
; Accumulate block DATA against H^(i+1): lows into TMP0, highs into TMP1,
; Karatsuba middle terms into TMP2. Clobbers TMP3.
202 KARATSUBA_AAD MACRO i
203 vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h
204 vpxor TMP0, TMP0, TMP3
205 vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h
206 vpxor TMP1, TMP1, TMP3
207 vpshufd TMP3, DATA, 78
208 vpxor TMP3, TMP3, DATA
209 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
210 vpxor TMP2, TMP2, TMP3
211 ENDM
212
; Early-out when Alen == 0 (third argument; return address occupies esp+0).
; FIX(review): was [esp + 1*3 + 2*4] = esp+11, a misaligned offset that
; straddles the Alen dword at esp+12 — compare the correct
; "[1*4 + 3*4 + esp]" pattern used by intel_aes_gcmENC/DEC.
213 cmp DWORD PTR[esp + 1*4 + 2*4], 0
214 jnz LbeginAAD
215 ret
216
217 LbeginAAD:
218 push ebx
219 push esi
220
221 mov Htbl, [esp + 4*3 + 0*4] ; 4*3 = return address + 2 saved regs
222 mov inp, [esp + 4*3 + 1*4]
223 mov len, [esp + 4*3 + 2*4]
224 mov Tp, [esp + 4*3 + 3*4]
225
226 vzeroupper
227
228 vpxor Xhi, Xhi, Xhi
229
230 vmovdqu T, XMMWORD PTR[Tp]
231 ;we hash 8 blocks each iteration; if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
232 mov hlp0, len
233 and hlp0, 128-1 ; hlp0 = byte count of the prefix blocks
234 jz Lmod_loop
235
236 and len, -128 ; len = bytes handled by the 8-block loop
237 sub hlp0, 16 ; index of the highest power of H needed
238
239 ; Prefix block (folds the incoming hash state in)
240 vmovdqu DATA, XMMWORD PTR[inp]
241 vpshufb DATA, DATA, [Lbswap_mask]
242 vpxor DATA, DATA, T
243
244 vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
245 vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
246 vpshufd TMP3, DATA, 78
247 vpxor TMP3, TMP3, DATA
248 vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
249
250 lea inp, [inp+16]
251 test hlp0, hlp0
252 jnz Lpre_loop
253 jmp Lred1
254
255 ;hash remaining prefix blocks (up to 7 total prefix blocks)
256 Lpre_loop:
257
258 sub hlp0, 16 ; next (lower) power of H
259
260 vmovdqu DATA, XMMWORD PTR[inp]
261 vpshufb DATA, DATA, [Lbswap_mask]
262
263 vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
264 vpxor TMP0, TMP0, TMP3
265 vpclmulqdq TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
266 vpxor TMP1, TMP1, TMP3
267 vpshufd TMP3, DATA, 78
268 vpxor TMP3, TMP3, DATA
269 vpclmulqdq TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
270 vpxor TMP2, TMP2, TMP3
271
272 test hlp0, hlp0
273 lea inp, [inp+16]
274 jnz Lpre_loop
275
276 Lred1:
; Karatsuba fixup for the prefix: combine the middle term and split the
; 256-bit result into T (low) and Xhi (high); reduction is deferred.
277
278 vpxor TMP2, TMP2, TMP0
279 vpxor TMP2, TMP2, TMP1
280 vpsrldq TMP3, TMP2, 8
281 vpslldq TMP2, TMP2, 8
282
283 vpxor Xhi, TMP1, TMP3
284 vpxor T, TMP0, TMP2
285
286 Lmod_loop:
; Main loop: 8 blocks per iteration against H^8..H^1, with the pending
; reduction of T interleaved between the multiplies to hide latency.
287
288 sub len, 16*8
289 jb Ldone
290 ; Block #0 (last input block of the group, multiplied by H^8... table index 0 is H^1; blocks are taken in reverse order)
291 vmovdqu DATA, XMMWORD PTR[inp + 16*7]
292 vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]
293
294 vpclmulqdq TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
295 vpclmulqdq TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
296 vpshufd TMP3, DATA, 78
297 vpxor TMP3, TMP3, DATA
298 vpclmulqdq TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h
299
300 ; Block #1
301 vmovdqu DATA, XMMWORD PTR[inp + 16*6]
302 vpshufb DATA, DATA, [Lbswap_mask]
303 KARATSUBA_AAD 1
304
305 ; Block #2
306 vmovdqu DATA, XMMWORD PTR[inp + 16*5]
307 vpshufb DATA, DATA, [Lbswap_mask]
308
309 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
310 vpalignr T, T, T, 8
311
312 KARATSUBA_AAD 2
313
314 vpxor T, T, TMP4 ;reduction stage 1b
315
316 ; Block #3
317 vmovdqu DATA, XMMWORD PTR[inp + 16*4]
318 vpshufb DATA, DATA, [Lbswap_mask]
319 KARATSUBA_AAD 3
320 ; Block #4
321 vmovdqu DATA, XMMWORD PTR[inp + 16*3]
322 vpshufb DATA, DATA, [Lbswap_mask]
323
324 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
325 vpalignr T, T, T, 8
326
327 KARATSUBA_AAD 4
328
329 vpxor T, T, TMP4 ;reduction stage 2b
330 ; Block #5
331 vmovdqu DATA, XMMWORD PTR[inp + 16*2]
332 vpshufb DATA, DATA, [Lbswap_mask]
333 KARATSUBA_AAD 5
334
335 vpxor T, T, Xhi ;reduction finalize
336 ; Block #6
337 vmovdqu DATA, XMMWORD PTR[inp + 16*1]
338 vpshufb DATA, DATA, [Lbswap_mask]
339 KARATSUBA_AAD 6
340 ; Block #7 (first block of the group; folds the running hash state in)
341 vmovdqu DATA, XMMWORD PTR[inp + 16*0]
342 vpshufb DATA, DATA, [Lbswap_mask]
343 vpxor DATA, DATA, T
344 KARATSUBA_AAD 7
345 ; Aggregated 8 blocks, now karatsuba fixup
346 vpxor TMP2, TMP2, TMP0
347 vpxor TMP2, TMP2, TMP1
348 vpsrldq TMP3, TMP2, 8
349 vpslldq TMP2, TMP2, 8
350
351 vpxor Xhi, TMP1, TMP3
352 vpxor T, TMP0, TMP2
353
354 lea inp, [inp + 16*8]
355 jmp Lmod_loop
356
357 Ldone:
; Final two-fold reduction of the deferred T:Xhi pair, then store.
358 vpclmulqdq TMP4, T, [Lpoly], 010h
359 vpalignr T, T, T, 8
360 vpxor T, T, TMP4
361
362 vpclmulqdq TMP4, T, [Lpoly], 010h
363 vpalignr T, T, T, 8
364 vpxor T, T, TMP4
365
366 vpxor T, T, Xhi
367 vmovdqu XMMWORD PTR[Tp], T
368 vzeroupper
369
370 pop esi
371 pop ebx
372 ret
373
374 intel_aes_gcmAAD ENDP
375
376
377 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
378 ;
379 ; Encrypt and Authenticate
380 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsign ed int len);
381 ;
382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
383
384 ALIGN 16
385 intel_aes_gcmENC PROC
; CTR-mode encrypt + GHASH (AES-GCM encryption path), 7 blocks per iteration.
;   void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT,
;                         void *Gctx, unsigned int len);
; Gctx layout as used here (Htbl aliases Gctx, so the hash table is at
; offset 0): GHASH state T at +16*16+1*16, counter block at +16*16+2*16,
; pointer to the expanded key schedule at +16*16+3*16.
; KS is advanced by 44 and NR read from [KS-40]; this matches the enclosing
; context structure's layout (Nr stored just before the round keys) —
; NOTE(review): confirm against the freebl context definition.
; Stack frame after alignment:
;   [esp + 0*16 .. 7*16)   byte-reflected ciphertext blocks pending GHASH
;   [esp + 8*16 .. 15*16)  7 counter blocks, pre-XORed with round key 0
386
387 PT textequ <eax>
388 CT textequ <ecx>
389 Htbl textequ <edx> ; same register as Gctx: Htbl is Gctx offset 0
390 Gctx textequ <edx>
391 len textequ <DWORD PTR[ebp + 5*4 + 3*4]>
392 KS textequ <esi>
393 NR textequ <DWORD PTR[-40 + KS]> ; round count, stored before the keys
394
395 aluCTR textequ <ebx> ; host-endian copy of the low counter dword
396 aluTMP textequ <edi>
397
398 T textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]> ; GHASH state (in memory)
399 TMP0 textequ <xmm1>
400 TMP1 textequ <xmm2>
401 TMP2 textequ <xmm3>
402 TMP3 textequ <xmm4>
403 TMP4 textequ <xmm5>
404 TMP5 textequ <xmm6>
405
406 CTR0 textequ <xmm0>
407 CTR1 textequ <xmm1>
408 CTR2 textequ <xmm2>
409 CTR3 textequ <xmm3>
410 CTR4 textequ <xmm4>
411 CTR5 textequ <xmm5>
412 CTR6 textequ <xmm6>
413
; One AES round applied to all 7 counter blocks (round key i via xmm7).
414 ROUND MACRO i
415 vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
416 vaesenc CTR0, CTR0, xmm7
417 vaesenc CTR1, CTR1, xmm7
418 vaesenc CTR2, CTR2, xmm7
419 vaesenc CTR3, CTR3, xmm7
420 vaesenc CTR4, CTR4, xmm7
421 vaesenc CTR5, CTR5, xmm7
422 vaesenc CTR6, CTR6, xmm7
423 ENDM
424
; Accumulate block TMP5 against Htbl[i]: middle terms into TMP0,
; highs into TMP1, lows into TMP2. Clobbers TMP3/TMP4.
425 KARATSUBA MACRO i
426 vpshufd TMP4, TMP5, 78
427 vpxor TMP4, TMP4, TMP5
428 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
429 vpxor TMP0, TMP0, TMP3
430 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
431 vpclmulqdq TMP3, TMP5, TMP4, 011h
432 vpxor TMP1, TMP1, TMP3
433 vpclmulqdq TMP3, TMP5, TMP4, 000h
434 vpxor TMP2, TMP2, TMP3
435 ENDM
436
; Advance the counter with ALU ops and patch only the last dword of the
; pre-XORed counter block i: stored value = bswap(ctr) ^ round-key0 dword 3.
437 NEXTCTR MACRO i
438 add aluCTR, 1
439 mov aluTMP, aluCTR
440 bswap aluTMP
441 xor aluTMP, [3*4 + KS]
442 mov [3*4 + 8*16 + i*16 + esp], aluTMP
443 ENDM
444
; Early-out when len == 0 (4th argument; return address at esp+0).
445 cmp DWORD PTR[1*4 + 3*4 + esp], 0
446 jne LbeginENC
447 ret
448
449 LbeginENC:
450
451 vzeroupper
452 push ebp
453 push ebx
454 push esi
455 push edi
456
457 mov ebp, esp ; args addressed via ebp from here on
458 sub esp, 16*16
459 and esp, -16 ; 16-byte align the scratch frame
460
461 mov PT, [ebp + 5*4 + 0*4] ; 5*4 = return address + 4 saved regs
462 mov CT, [ebp + 5*4 + 1*4]
463 mov Gctx, [ebp + 5*4 + 2*4]
464
465 mov KS, [16*16 + 3*16 + Gctx]
466 lea KS, [44 + KS] ; skip header so [KS] is round key 0
467
468 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] ; low dword of the counter
469 bswap aluCTR ; keep it host-endian for ALU increments
470
471
472 vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
473 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] ; counter ^ rk0
474 vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0
475
476 cmp len, 16*7
477 jb LEncDataSingles
478 ; Prepare the "top" counters
479 vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
480 vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
481 vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
482 vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
483 vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
484 vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0
485
486 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
487 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] ; to increment with vpaddd
488 ; Encrypt the initial 7 blocks
489 sub len, 16*7
490 vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
491 vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
492 vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
493 vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
494 vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
495 vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
496
497 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask] ; back to wire order
498 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
499 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
500 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
501 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
502 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
503 vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]
504
505 vmovdqu xmm7, XMMWORD PTR[0*16 + KS] ; whitening round
506 vpxor CTR0, CTR0, xmm7
507 vpxor CTR1, CTR1, xmm7
508 vpxor CTR2, CTR2, xmm7
509 vpxor CTR3, CTR3, xmm7
510 vpxor CTR4, CTR4, xmm7
511 vpxor CTR5, CTR5, xmm7
512 vpxor CTR6, CTR6, xmm7
513
514 ROUND 1
515
; Open-coded NEXTCTR 0: jump the ALU counter past all 7 in-flight blocks.
516 add aluCTR, 7
517 mov aluTMP, aluCTR
518 bswap aluTMP
519 xor aluTMP, [KS + 3*4]
520 mov [8*16 + 0*16 + 3*4 + esp], aluTMP
521
; Interleave the remaining rounds with counter maintenance for next pass.
522 ROUND 2
523 NEXTCTR 1
524 ROUND 3
525 NEXTCTR 2
526 ROUND 4
527 NEXTCTR 3
528 ROUND 5
529 NEXTCTR 4
530 ROUND 6
531 NEXTCTR 5
532 ROUND 7
533 NEXTCTR 6
534 ROUND 8
535 ROUND 9
536 vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
537 cmp NR, 10 ; AES-128 stops here
538 je @f
539
540 ROUND 10
541 ROUND 11
542 vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
543 cmp NR, 12 ; AES-192 stops here
544 je @f
545
546 ROUND 12 ; AES-256
547 ROUND 13
548 vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
549 @@:
550 vaesenclast CTR0, CTR0, xmm7
551 vaesenclast CTR1, CTR1, xmm7
552 vaesenclast CTR2, CTR2, xmm7
553 vaesenclast CTR3, CTR3, xmm7
554 vaesenclast CTR4, CTR4, xmm7
555 vaesenclast CTR5, CTR5, xmm7
556 vaesenclast CTR6, CTR6, xmm7
557
558 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT] ; keystream ^ plaintext
559 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT]
560 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT]
561 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT]
562 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT]
563 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT]
564 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT]
565
566 vmovdqu XMMWORD PTR[0*16 + CT], CTR0
567 vmovdqu XMMWORD PTR[1*16 + CT], CTR1
568 vmovdqu XMMWORD PTR[2*16 + CT], CTR2
569 vmovdqu XMMWORD PTR[3*16 + CT], CTR3
570 vmovdqu XMMWORD PTR[4*16 + CT], CTR4
571 vmovdqu XMMWORD PTR[5*16 + CT], CTR5
572 vmovdqu XMMWORD PTR[6*16 + CT], CTR6
573
; Byte-reflect the ciphertext and stash it; it is hashed on the NEXT pass
; (or by LEndEnc7), overlapping GHASH with the next batch's AES rounds.
574 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
575 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
576 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
577 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
578 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
579 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
580 vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask] ; last block stays in TMP5
581
582 vmovdqa XMMWORD PTR[1*16 + esp], CTR5
583 vmovdqa XMMWORD PTR[2*16 + esp], CTR4
584 vmovdqa XMMWORD PTR[3*16 + esp], CTR3
585 vmovdqa XMMWORD PTR[4*16 + esp], CTR2
586 vmovdqa XMMWORD PTR[5*16 + esp], CTR1
587 vmovdqa XMMWORD PTR[6*16 + esp], CTR0
588
589 lea CT, [7*16 + CT]
590 lea PT, [7*16 + PT]
591 jmp LEncData7
592
593 LEncData7:
; Steady state: hash the previous 7 ciphertext blocks while encrypting
; the next 7 counter blocks.
594 cmp len, 16*7
595 jb LEndEnc7
596 sub len, 16*7
597
; Karatsuba accumulation for block 0 (in TMP5) against Htbl[0].
598 vpshufd TMP4, TMP5, 78
599 vpxor TMP4, TMP4, TMP5
600 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
601 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
602 vpclmulqdq TMP1, TMP5, TMP4, 011h
603 vpclmulqdq TMP2, TMP5, TMP4, 000h
604
605 vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
606 KARATSUBA 1
607 vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
608 KARATSUBA 2
609 vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
610 KARATSUBA 3
611 vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
612 KARATSUBA 4
613 vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
614 KARATSUBA 5
615 vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
616 vpxor TMP5, TMP5, T ; fold in running hash at highest power
617 KARATSUBA 6
618
; Karatsuba fixup: combine middle terms, split into high (TMP4) / low (TMP5).
619 vpxor TMP0, TMP0, TMP1
620 vpxor TMP0, TMP0, TMP2
621 vpsrldq TMP3, TMP0, 8
622 vpxor TMP4, TMP1, TMP3
623 vpslldq TMP3, TMP0, 8
624 vpxor TMP5, TMP2, TMP3
625
; Two-fold reduction modulo Lpoly.
626 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
627 vpalignr TMP5,TMP5,TMP5,8
628 vpxor TMP5, TMP5, TMP1
629
630 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
631 vpalignr TMP5,TMP5,TMP5,8
632 vpxor TMP5, TMP5, TMP1
633
634 vpxor TMP5, TMP5, TMP4
635 vmovdqu T, TMP5 ; store updated GHASH state
636
; Load the 7 pre-XORed counter blocks prepared by NEXTCTR on the last pass.
637 vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
638 vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
639 vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
640 vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
641 vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
642 vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
643 vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]
644
645 ROUND 1
646 NEXTCTR 0
647 ROUND 2
648 NEXTCTR 1
649 ROUND 3
650 NEXTCTR 2
651 ROUND 4
652 NEXTCTR 3
653 ROUND 5
654 NEXTCTR 4
655 ROUND 6
656 NEXTCTR 5
657 ROUND 7
658 NEXTCTR 6
659
660 ROUND 8
661 ROUND 9
662
663 vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
664 cmp NR, 10 ; AES-128
665 je @f
666
667 ROUND 10
668 ROUND 11
669 vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
670 cmp NR, 12 ; AES-192
671 je @f
672
673 ROUND 12 ; AES-256
674 ROUND 13
675 vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
676 @@:
677 vaesenclast CTR0, CTR0, xmm7
678 vaesenclast CTR1, CTR1, xmm7
679 vaesenclast CTR2, CTR2, xmm7
680 vaesenclast CTR3, CTR3, xmm7
681 vaesenclast CTR4, CTR4, xmm7
682 vaesenclast CTR5, CTR5, xmm7
683 vaesenclast CTR6, CTR6, xmm7
684
685 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + PT]
686 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + PT]
687 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + PT]
688 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + PT]
689 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + PT]
690 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + PT]
691 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + PT]
692
693 vmovdqu XMMWORD PTR[0*16 + CT], CTR0
694 vmovdqu XMMWORD PTR[1*16 + CT], CTR1
695 vmovdqu XMMWORD PTR[2*16 + CT], CTR2
696 vmovdqu XMMWORD PTR[3*16 + CT], CTR3
697 vmovdqu XMMWORD PTR[4*16 + CT], CTR4
698 vmovdqu XMMWORD PTR[5*16 + CT], CTR5
699 vmovdqu XMMWORD PTR[6*16 + CT], CTR6
700
701 vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
702 vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
703 vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
704 vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
705 vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
706 vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
707 vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]
708
709 vmovdqa XMMWORD PTR[1*16 + esp], CTR5
710 vmovdqa XMMWORD PTR[2*16 + esp], CTR4
711 vmovdqa XMMWORD PTR[3*16 + esp], CTR3
712 vmovdqa XMMWORD PTR[4*16 + esp], CTR2
713 vmovdqa XMMWORD PTR[5*16 + esp], CTR1
714 vmovdqa XMMWORD PTR[6*16 + esp], CTR0
715
716 lea CT, [7*16 + CT]
717 lea PT, [7*16 + PT]
718 jmp LEncData7
719
720 LEndEnc7:
; Hash the final stashed batch of 7 ciphertext blocks (same sequence as the
; top of LEncData7, without launching further encryption).
721
722 vpshufd TMP4, TMP5, 78
723 vpxor TMP4, TMP4, TMP5
724 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
725 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
726 vpclmulqdq TMP1, TMP5, TMP4, 011h
727 vpclmulqdq TMP2, TMP5, TMP4, 000h
728
729 vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
730 KARATSUBA 1
731 vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
732 KARATSUBA 2
733 vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
734 KARATSUBA 3
735 vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
736 KARATSUBA 4
737 vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
738 KARATSUBA 5
739 vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
740 vpxor TMP5, TMP5, T
741 KARATSUBA 6
742
743 vpxor TMP0, TMP0, TMP1
744 vpxor TMP0, TMP0, TMP2
745 vpsrldq TMP3, TMP0, 8
746 vpxor TMP4, TMP1, TMP3
747 vpslldq TMP3, TMP0, 8
748 vpxor TMP5, TMP2, TMP3
749
750 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
751 vpalignr TMP5,TMP5,TMP5,8
752 vpxor TMP5, TMP5, TMP1
753
754 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
755 vpalignr TMP5,TMP5,TMP5,8
756 vpxor TMP5, TMP5, TMP1
757
758 vpxor TMP5, TMP5, TMP4
759 vmovdqu T, TMP5
760
761 sub aluCTR, 6 ; counters 1..6 were pre-advanced but never consumed
762
763 LEncDataSingles:
; One 16-byte block at a time: encrypt counter block 0, xor, hash.
764
765 cmp len, 16
766 jb LEncDataTail
767 sub len, 16
768
769 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
770 NEXTCTR 0
771
772 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
773 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
774 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
775 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
776 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
777 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
778 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
779 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
780 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
781 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
782 cmp NR, 10
783 je @f
784 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
785 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
786 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
787 cmp NR, 12
788 je @f
789 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
790 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
791 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
792 @@:
793 vaesenclast TMP1, TMP1, TMP2
794 vpxor TMP1, TMP1, XMMWORD PTR[PT]
795 vmovdqu XMMWORD PTR[CT], TMP1
796
797 lea PT, [16+PT]
798 lea CT, [16+CT]
799
800 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
801 vpxor TMP1, TMP1, T
802
803 vmovdqu TMP0, XMMWORD PTR[Htbl]
804 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 ; T = (T ^ C) * H
805 vmovdqu T, TMP1
806
807 jmp LEncDataSingles
808
809 LEncDataTail:
; Final partial block (len < 16): build a zero-padded copy on the stack,
; encrypt/xor it there, emit only len bytes of ciphertext, hash the
; zero-padded block. KS is dead as a key pointer here and is reused as the
; byte index.
810
811 cmp len, 0
812 je LEncDataEnd
813
814 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
815
816 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
817 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
818 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
819 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
820 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
821 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
822 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
823 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
824 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
825 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
826 cmp NR, 10
827 je @f
828 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
829 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
830 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
831 cmp NR, 12
832 je @f
833 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
834 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
835 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
836 @@:
837 vaesenclast TMP1, TMP1, TMP2
838 ; zero a temp location
839 vpxor TMP2, TMP2, TMP2
840 vmovdqa XMMWORD PTR[esp], TMP2
841 ; copy as many bytes as needed
; NOTE(review): these word-sized moves copy 2 bytes per 1-byte step, so the
; last iteration may read 1 byte past PT+len — confirm callers tolerate a
; 1-byte over-read (the stack destination itself is 16 bytes, so writes are safe).
842 xor KS, KS
843 @@:
844 cmp len, KS
845 je @f
846 mov di, [PT + KS]
847 mov [esp + KS], di
848 inc KS
849 jmp @b
850 @@:
851 vpxor TMP1, TMP1, XMMWORD PTR[esp]
852 vmovdqa XMMWORD PTR[esp], TMP1
853 xor KS, KS
854 @@:
; Copy len ciphertext bytes out (same 2-bytes-per-step pattern as above).
855 cmp len, KS
856 je @f
857 mov di, [esp + KS]
858 mov [CT + KS], di
859 inc KS
860 jmp @b
861 @@:
; Zero the stack block's remaining bytes so only C||0* is hashed.
862 cmp KS, 16
863 je @f
864 mov BYTE PTR[esp + KS], 0
865 inc KS
866 jmp @b
867 @@:
868 vmovdqa TMP1, XMMWORD PTR[esp]
869
870 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
871 vpxor TMP1, TMP1, T
872
873 vmovdqu TMP0, XMMWORD PTR[Htbl]
874 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
875 vmovdqu T, TMP1
876
877 LEncDataEnd:
; Write back the next unused counter value (big-endian) into Gctx.
878 inc aluCTR
879 bswap aluCTR
880 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
881
882 mov esp, ebp
883 pop edi
884 pop esi
885 pop ebx
886 pop ebp
887
888
889 vzeroupper
890
891 ret
892 intel_aes_gcmENC ENDP
893
894 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
895 ;
896 ; Decrypt and Authenticate
897 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
898 ;
899 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
900
901
; Redefinition of NEXTCTR for intel_aes_gcmDEC: the pre-XORed counter
; blocks live at [esp + i*16] in the DEC frame (DEC allocates only 8*16
; bytes — it has no separate ciphertext scratch area below the counters,
; so there is no 8*16 offset here). Patches only the last dword:
; stored value = bswap(ctr) ^ round-key0 dword 3.
902 NEXTCTR MACRO i
903 add aluCTR, 1
904 mov aluTMP, aluCTR
905 bswap aluTMP
906 xor aluTMP, [3*4 + KS]
907 mov [3*4 + i*16 + esp], aluTMP
908 ENDM
909
910 intel_aes_gcmDEC PROC
; CTR-mode decrypt + GHASH (AES-GCM decryption path).
;   void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
; NOTE: argument slot 0 is loaded into CT and slot 1 into PT below, i.e. the
; implementation treats the first pointer as ciphertext (input) and the
; second as plaintext (output) — confirm against the callers in rijndael.c.
; Reuses the textequ aliases (PT/CT/Gctx/KS/NR/TMPx/CTRx/T) and the ROUND /
; KARATSUBA macros defined in intel_aes_gcmENC above; only NEXTCTR was
; redefined for this frame. Unlike ENC, the hash input here is the
; ciphertext itself, so blocks are hashed straight from CT with no stashing
; and GHASH runs ahead of (not behind) the decryption of each batch.
911
; Early-out when len == 0 (4th argument; return address at esp+0).
912 cmp DWORD PTR[1*4 + 3*4 + esp], 0
913 jne LbeginDEC
914 ret
915
916 LbeginDEC:
917
918 vzeroupper
919 push ebp
920 push ebx
921 push esi
922 push edi
923
924 mov ebp, esp
925 sub esp, 8*16 ; counters only — no ciphertext scratch needed
926 and esp, -16
927
928 mov CT, [ebp + 5*4 + 0*4] ; 5*4 = return address + 4 saved regs
929 mov PT, [ebp + 5*4 + 1*4]
930 mov Gctx, [ebp + 5*4 + 2*4]
931
932 mov KS, [16*16 + 3*16 + Gctx]
933 lea KS, [44 + KS] ; skip header so [KS] is round key 0
934
935 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
936 bswap aluCTR ; host-endian for ALU increments
937
938
939 vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
940 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] ; counter ^ rk0
941 vmovdqu XMMWORD PTR[0*16 + esp], TMP0
942
943 cmp len, 16*7
944 jb LDecDataSingles
945 vmovdqu XMMWORD PTR[1*16 + esp], TMP0
946 vmovdqu XMMWORD PTR[2*16 + esp], TMP0
947 vmovdqu XMMWORD PTR[3*16 + esp], TMP0
948 vmovdqu XMMWORD PTR[4*16 + esp], TMP0
949 vmovdqu XMMWORD PTR[5*16 + esp], TMP0
950 vmovdqu XMMWORD PTR[6*16 + esp], TMP0
951 dec aluCTR ; NEXTCTR pre-increments; bias so block 0 gets the current value
952
953 LDecData7:
; 7 blocks per iteration: hash ciphertext blocks 0..6 (against H^7..H^1)
; while NEXTCTR refreshes the counter blocks, then decrypt them.
954 cmp len, 16*7
955 jb LDecData7End
956 sub len, 16*7
957
; First (lowest-address) block folds the running hash state in and is
; multiplied by the highest power, Htbl[6] = H^7.
958 vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
959 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
960 vpxor TMP5, TMP5, T
961 vpshufd TMP4, TMP5, 78
962 vpxor TMP4, TMP4, TMP5
963 vpclmulqdq TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
964 vmovdqu TMP4, XMMWORD PTR[6*16 + Htbl]
965 vpclmulqdq TMP1, TMP5, TMP4, 011h
966 vpclmulqdq TMP2, TMP5, TMP4, 000h
967
968 NEXTCTR 0
969 vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
970 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
971 KARATSUBA 5
972 NEXTCTR 1
973 vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
974 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
975 KARATSUBA 4
976 NEXTCTR 2
977 vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
978 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
979 KARATSUBA 3
980 NEXTCTR 3
981 vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
982 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
983 KARATSUBA 2
984 NEXTCTR 4
985 vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
986 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
987 KARATSUBA 1
988 NEXTCTR 5
989 vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
990 vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
991 KARATSUBA 0
992 NEXTCTR 6
993
; Karatsuba fixup: combine middle terms, split into high (TMP4) / low (TMP5).
994 vpxor TMP0, TMP0, TMP1
995 vpxor TMP0, TMP0, TMP2
996 vpsrldq TMP3, TMP0, 8
997 vpxor TMP4, TMP1, TMP3
998 vpslldq TMP3, TMP0, 8
999 vpxor TMP5, TMP2, TMP3
1000
; Two-fold reduction modulo Lpoly.
1001 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
1002 vpalignr TMP5,TMP5,TMP5,8
1003 vpxor TMP5, TMP5, TMP1
1004
1005 vpclmulqdq TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
1006 vpalignr TMP5,TMP5,TMP5,8
1007 vpxor TMP5, TMP5, TMP1
1008
1009 vpxor TMP5, TMP5, TMP4
1010 vmovdqu T, TMP5 ; store updated GHASH state
1011
; Decrypt the 7 counter blocks prepared by NEXTCTR.
1012 vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
1013 vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
1014 vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
1015 vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
1016 vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
1017 vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
1018 vmovdqa CTR6, XMMWORD PTR[6*16 + esp]
1019
1020 ROUND 1
1021 ROUND 2
1022 ROUND 3
1023 ROUND 4
1024 ROUND 5
1025 ROUND 6
1026 ROUND 7
1027 ROUND 8
1028 ROUND 9
1029 vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
1030 cmp NR, 10 ; AES-128 stops here
1031 je @f
1032
1033 ROUND 10
1034 ROUND 11
1035 vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
1036 cmp NR, 12 ; AES-192 stops here
1037 je @f
1038
1039 ROUND 12 ; AES-256
1040 ROUND 13
1041 vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
1042 @@:
1043 vaesenclast CTR0, CTR0, xmm7
1044 vaesenclast CTR1, CTR1, xmm7
1045 vaesenclast CTR2, CTR2, xmm7
1046 vaesenclast CTR3, CTR3, xmm7
1047 vaesenclast CTR4, CTR4, xmm7
1048 vaesenclast CTR5, CTR5, xmm7
1049 vaesenclast CTR6, CTR6, xmm7
1050
1051 vpxor CTR0, CTR0, XMMWORD PTR[0*16 + CT] ; keystream ^ ciphertext
1052 vpxor CTR1, CTR1, XMMWORD PTR[1*16 + CT]
1053 vpxor CTR2, CTR2, XMMWORD PTR[2*16 + CT]
1054 vpxor CTR3, CTR3, XMMWORD PTR[3*16 + CT]
1055 vpxor CTR4, CTR4, XMMWORD PTR[4*16 + CT]
1056 vpxor CTR5, CTR5, XMMWORD PTR[5*16 + CT]
1057 vpxor CTR6, CTR6, XMMWORD PTR[6*16 + CT]
1058
1059 vmovdqu XMMWORD PTR[0*16 + PT], CTR0
1060 vmovdqu XMMWORD PTR[1*16 + PT], CTR1
1061 vmovdqu XMMWORD PTR[2*16 + PT], CTR2
1062 vmovdqu XMMWORD PTR[3*16 + PT], CTR3
1063 vmovdqu XMMWORD PTR[4*16 + PT], CTR4
1064 vmovdqu XMMWORD PTR[5*16 + PT], CTR5
1065 vmovdqu XMMWORD PTR[6*16 + PT], CTR6
1066
1067 lea CT, [7*16 + CT]
1068 lea PT, [7*16 + PT]
1069 jmp LDecData7
1070
1071 LDecData7End:
1072
1073 NEXTCTR 0 ; refresh counter block 0 for the singles path
1074
1075 LDecDataSingles:
; One block at a time: hash the ciphertext block, then decrypt it.
1076
1077 cmp len, 16
1078 jb LDecDataTail
1079 sub len, 16
1080
1081 vmovdqu TMP1, XMMWORD PTR[CT]
1082 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
1083 vpxor TMP1, TMP1, T
1084
1085 vmovdqu TMP0, XMMWORD PTR[Htbl]
1086 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4 ; T = (T ^ C) * H
1087 vmovdqu T, TMP1
1088
1089 vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
1090 NEXTCTR 0
1091
1092 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
1093 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
1094 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
1095 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
1096 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
1097 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
1098 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
1099 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
1100 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
1101 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
1102 cmp NR, 10
1103 je @f
1104 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
1105 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
1106 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
1107 cmp NR, 12
1108 je @f
1109 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
1110 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
1111 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
1112 @@:
1113 vaesenclast TMP1, TMP1, TMP2
1114 vpxor TMP1, TMP1, XMMWORD PTR[CT]
1115 vmovdqu XMMWORD PTR[PT], TMP1
1116
1117 lea PT, [16+PT]
1118 lea CT, [16+CT]
1119 jmp LDecDataSingles
1120
1121 LDecDataTail:
; Final partial block (len < 16): build a zero-padded copy of the tail
; ciphertext on the stack, hash it, then decrypt and emit. KS is dead as a
; key pointer from here and is reused as the byte index.
1122
1123 cmp len, 0
1124 je LDecDataEnd
1125
1126 vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
1127 inc aluCTR ; counter value consumed without a NEXTCTR
1128 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
1129 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
1130 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
1131 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
1132 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
1133 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
1134 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
1135 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
1136 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
1137 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
1138 cmp NR, 10
1139 je @f
1140 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
1141 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
1142 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
1143 cmp NR, 12
1144 je @f
1145 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
1146 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
1147 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
1148 @@:
1149 vaesenclast xmm7, TMP1, TMP2 ; keystream kept in xmm7 across the GFMUL
1150
1151 ; copy as many bytes as needed
; NOTE(review): these word-sized moves copy 2 bytes per 1-byte step, so the
; last iteration may read 1 byte past CT+len — confirm callers tolerate a
; 1-byte over-read (the 16-byte stack destination itself is safe).
1152 xor KS, KS
1153 @@:
1154 cmp len, KS
1155 je @f
1156 mov di, [CT + KS]
1157 mov [esp + KS], di
1158 inc KS
1159 jmp @b
1160 @@:
; Zero-pad the stack block so only C||0* is hashed.
1161 cmp KS, 16
1162 je @f
1163 mov BYTE PTR[esp + KS], 0
1164 inc KS
1165 jmp @b
1166 @@:
1167
1168 vmovdqa TMP1, XMMWORD PTR[esp]
1169 vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
1170 vpxor TMP1, TMP1, T
1171
1172 vmovdqu TMP0, XMMWORD PTR[Htbl]
1173 GFMUL TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
1174 vmovdqu T, TMP1
1175
1176
1177 vpxor xmm7, xmm7, XMMWORD PTR[esp] ; decrypt the padded tail in place
1178 vmovdqa XMMWORD PTR[esp], xmm7
1179 xor KS, KS
1180 @@:
1181 cmp len, KS
1182 je @f
1183 mov di, [esp + KS]
1184 mov [PT + KS], di
1185 inc KS
1186 jmp @b
1187 @@:
; NOTE(review): this loop zero-fills PT[len..15], i.e. it writes up to
; 16 - len bytes past the requested length into the output buffer —
; confirm all callers provide a 16-byte-padded plaintext buffer.
1188 cmp KS, 16
1189 je @f
1190 mov BYTE PTR[PT + KS], 0
1191 inc KS
1192 jmp @b
1193 @@:
1194
1195 LDecDataEnd:
; Write back the next unused counter value (big-endian) into Gctx.
1196
1197 bswap aluCTR
1198 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
1199
1200 mov esp, ebp
1201 pop edi
1202 pop esi
1203 pop ebx
1204 pop ebp
1205
1206 vzeroupper
1207
1208 ret
1209 intel_aes_gcmDEC ENDP
1210
1211
1212 END
OLDNEW
« no previous file with comments | « nss/lib/freebl/intel-gcm-x64-masm.asm ('k') | nss/lib/freebl/rijndael.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698