Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(16)

Side by Side Diff: nss/lib/freebl/intel-gcm-x64-masm.asm

Issue 2078763002: Delete bundled copy of NSS and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/nss@master
Patch Set: Delete bundled copy of NSS and replace with README. Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « nss/lib/freebl/intel-gcm-wrap.c ('k') | nss/lib/freebl/intel-gcm-x86-masm.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ; LICENSE:
2 ; This submission to NSS is to be made available under the terms of the
3 ; Mozilla Public License, v. 2.0. You can obtain one at
4 ; http://mozilla.org/MPL/2.0/.
5 ;###############################################################################
6 ; Copyright(c) 2014, Intel Corp.
7 ; Developers and authors:
8 ; Shay Gueron and Vlad Krasnov
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
10 ; Please send feedback directly to crypto.feedback.alias@intel.com
11
12
13 .DATA ; constants shared by the routines below
14 ALIGN 16
15 Lone dq 1,0 ; 128-bit constant 1, added to a counter block with vpaddd
16 Ltwo dq 2,0 ; 128-bit constant 2, used to derive counters that are two apart
17 Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; vpshufb mask that reverses the 16 bytes of a block
18 Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh ; low-nibble mask; not referenced by any code visible in this file
19 Lpoly dq 01h, 0c200000000000000h ; reduction constant; vpclmulqdq with imm 010h selects its high qword
20
21 .CODE
22 ; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
23 ; DST = carry-less product of SRC1 and SRC2 reduced to 128 bits with Lpoly. TMP1..TMP4 are scratch and are overwritten.
24 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
25 vpclmulqdq TMP1, SRC2, SRC1, 0h ; TMP1 = low qword * low qword
26 vpclmulqdq TMP4, SRC2, SRC1, 011h ; TMP4 = high qword * high qword
27 ; Karatsuba middle term: (hi xor lo of SRC2) * (hi xor lo of SRC1)
28 vpshufd TMP2, SRC2, 78 ; 78 = 4Eh swaps the two 64-bit halves
29 vpshufd TMP3, SRC1, 78
30 vpxor TMP2, TMP2, SRC2
31 vpxor TMP3, TMP3, SRC1
32
33 vpclmulqdq TMP2, TMP2, TMP3, 0h
34 vpxor TMP2, TMP2, TMP1 ; remove the low product ...
35 vpxor TMP2, TMP2, TMP4 ; ... and the high product, leaving the true middle term
36 ; spread the middle term over the two halves of the 256-bit product
37 vpslldq TMP3, TMP2, 8
38 vpsrldq TMP2, TMP2, 8
39
40 vpxor TMP1, TMP1, TMP3 ; TMP1 = low 128 bits of the product
41 vpxor TMP4, TMP4, TMP2 ; TMP4 = high 128 bits of the product
42 ; reduction step 1: multiply the low qword by the high qword of Lpoly, swap halves, xor
43 vpclmulqdq TMP2, TMP1, [Lpoly], 010h
44 vpshufd TMP3, TMP1, 78
45 vpxor TMP1, TMP2, TMP3
46 ; reduction step 2: the same operation once more
47 vpclmulqdq TMP2, TMP1, [Lpoly], 010h
48 vpshufd TMP3, TMP1, 78
49 vpxor TMP1, TMP2, TMP3
50 ; fold in the high half; SRC1 is no longer read here, so DST may be the same register as SRC1
51 vpxor DST, TMP1, TMP4
52
53 ENDM
54
55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
56 ;
57 ; Generates the final GCM tag
58 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
59 ; unsigned char *Tp,
60 ; unsigned int Mlen,
61 ; unsigned int Alen,
62 ; unsigned char *X0,
63 ; unsigned char *TAG);
64 ; Computes TAG = byteswap(((*Tp) xor [Mlen*8 , Alen*8]) * Htbl[0]) xor (*X0).
65 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
66 ; Arguments are read from rcx, rdx, r8, r9 and from two stack slots above the 32-byte register home area.
67 ALIGN 16
68 intel_aes_gcmTAG PROC
69 ; register aliases for the arguments
70 Htbl textequ <rcx>
71 Tp textequ <rdx>
72 Mlen textequ <r8>
73 Alen textequ <r9>
74 X0 textequ <r10>
75 TAG textequ <r11>
76
77 T textequ <xmm0>
78 TMP0 textequ <xmm1>
79 ; 5th and 6th arguments: skip the return address (1*8) and the four home slots (4*8)
80 mov X0, [rsp + 1*8 + 4*8]
81 mov TAG, [rsp + 1*8 + 5*8]
82
83 vzeroupper
84 vmovdqu T, XMMWORD PTR[Tp] ; T = current hash state
85 vpxor TMP0, TMP0, TMP0 ; TMP0 = 0, about to receive the length block
86 ; NOTE(review): the shifts use the full r8/r9 although the prototype says unsigned int; this presumably relies on zero-extended arguments - confirm
87 shl Mlen, 3 ; byte count to bit count
88 shl Alen, 3
89
90 ;vpinsrq TMP0, TMP0, Mlen, 0
91 ;vpinsrq TMP0, TMP0, Alen, 1
92 ; workaround the ml64.exe vpinsrq issue
93 vpinsrd TMP0, TMP0, r8d, 0 ; low dword of Mlen*8
94 vpinsrd TMP0, TMP0, r9d, 2 ; low dword of Alen*8
95 shr Mlen, 32
96 shr Alen, 32
97 vpinsrd TMP0, TMP0, r8d, 1 ; high dword of Mlen*8
98 vpinsrd TMP0, TMP0, r9d, 3 ; high dword of Alen*8
99
100 vpxor T, T, TMP0 ; mix the length block into the hash state
101 vmovdqu TMP0, XMMWORD PTR[Htbl] ; first table entry (Htbl + 0*16)
102 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
103
104 vpshufb T, T, [Lbswap_mask] ; reverse the byte order of the result
105 vpxor T, T, [X0] ; xor with the 16 bytes at X0
106 vmovdqu XMMWORD PTR[TAG], T ; write the 16-byte tag
107 vzeroupper
108
109 ret
110
111 intel_aes_gcmTAG ENDP
112
113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
114 ;
115 ; Generates the H table
116 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
117 ; Encrypts the all-zero block with the NR-round key schedule at KS, doubles the result in the field, then stores its first eight powers.
118 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
119 ; Layout written here (i = 0..7): Htbl + i*16 holds power i+1, Htbl + 8*16 + i*16 holds that power with hi xor lo in both qwords.
120 ALIGN 16
121 intel_aes_gcmINIT PROC
122
123 Htbl textequ <rcx>
124 KS textequ <rdx>
125 NR textequ <r8d>
126
127 T textequ <xmm0>
128 TMP0 textequ <xmm1>
129
130 vzeroupper
131 ; AES-ENC(0)
132 vmovdqu T, XMMWORD PTR[KS] ; zero block xor round key 0 is round key 0 itself
133 lea KS, [16 + KS]
134 dec NR ; the loop below then runs NR-1 times; the last round is vaesenclast
135 Lenc_loop:
136 vaesenc T, T, [KS]
137 lea KS, [16 + KS]
138 dec NR
139 jnz Lenc_loop
140
141 vaesenclast T, T, [KS]
142 vpshufb T, T, [Lbswap_mask] ; reverse the byte order of the encrypted block
143
144 ;Calculate H` = GFMUL(H, 2)
145 vpsrad xmm3, T, 31 ; fill each dword with its sign bit
146 vpshufd xmm3, xmm3, 0ffh ; broadcast the top dword: all ones when bit 127 of T is set
147 vpand xmm5, xmm3, [Lpoly] ; Lpoly when bit 127 was set, zero otherwise
148 vpsrld xmm3, T, 31 ; bit shifted out of each dword
149 vpslld xmm4, T, 1
150 vpslldq xmm3, xmm3, 4 ; move each shifted-out bit into the next higher dword
151 vpxor T, xmm4, xmm3 ; T shifted left by one bit across all 128 bits
152 vpxor T, T, xmm5 ; conditional reduction
153
154 vmovdqu TMP0, T ; TMP0 stays fixed as the multiplier for every power below
155 vmovdqu XMMWORD PTR[Htbl + 0*16], T
156
157 vpshufd xmm2, T, 78 ; swap the two 64-bit halves
158 vpxor xmm2, xmm2, T ; both qwords now hold hi xor lo
159 vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
160
161 i = 1 ; assembly-time counter: the WHILE body is emitted for i = 1..7
162 WHILE i LT 8
163 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5
164 vmovdqu XMMWORD PTR[Htbl + i*16], T
165 vpshufd xmm2, T, 78
166 vpxor xmm2, xmm2, T
167 vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
168 i = i+1
169 ENDM
170 vzeroupper
171 ret
172 intel_aes_gcmINIT ENDP
173
174
175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
176 ;
177 ; Authenticate only
178 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
179 ; Folds Alen bytes at AAD into the 16-byte hash state at Tp, eight blocks per main-loop iteration, using the table at Htbl.
180 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
181 ; NOTE(review): the prefix loop counts down by 16 until exactly zero, so Alen is presumably always a multiple of 16 - confirm with the caller
182 ALIGN 16
183 intel_aes_gcmAAD PROC
184
185 Htbl textequ <rcx>
186 inp textequ <rdx>
187 len textequ <r8>
188 Tp textequ <r9>
189 hlp0 textequ <r10>
190
191 DATA textequ <xmm0>
192 T textequ <xmm1>
193 TMP0 textequ <xmm2>
194 TMP1 textequ <xmm3>
195 TMP2 textequ <xmm4>
196 TMP3 textequ <xmm5>
197 TMP4 textequ <xmm6>
198 Xhi textequ <xmm7>
199
200 KARATSUBA_AAD MACRO i
201 vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h ; low product, accumulated into TMP0
202 vpxor TMP0, TMP0, TMP3
203 vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h ; high product, accumulated into TMP1
204 vpxor TMP1, TMP1, TMP3
205 vpshufd TMP3, DATA, 78
206 vpxor TMP3, TMP3, DATA
207 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h ; middle product, accumulated into TMP2
208 vpxor TMP2, TMP2, TMP3
209 ENDM
210
211 test len, len
212 jnz LbeginAAD
213 ret
214
215 LbeginAAD:
216 vzeroupper
217 ; xmm6 and xmm7 are used as TMP4 and Xhi; keep the incoming values on the stack
218 sub rsp, 2*16
219 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
220 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
221
222 vpxor Xhi, Xhi, Xhi ; no pending high half yet
223
224 vmovdqu T, XMMWORD PTR[Tp]
225 ;we hash 8 blocks each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
226 mov hlp0, len
227 and hlp0, 128-1 ; hlp0 = bytes in the prefix (len mod 128)
228 jz Lmod_loop
229
230 and len, -128 ; len = bytes left for the 8-block loop
231 sub hlp0, 16 ; hlp0 doubles as the table offset for the current prefix block
232
233 ; Prefix block
234 vmovdqu DATA, XMMWORD PTR[inp]
235 vpshufb DATA, DATA, [Lbswap_mask]
236 vpxor DATA, DATA, T ; only the first block absorbs the incoming hash state
237
238 vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h
239 vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h
240 vpshufd TMP3, DATA, 78
241 vpxor TMP3, TMP3, DATA
242 vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
243
244 lea inp, [inp+16]
245 test hlp0, hlp0
246 jnz Lpre_loop
247 jmp Lred1
248
249 ;hash remaining prefix blocks (up to 7 total prefix blocks)
250 Lpre_loop:
251
252 sub hlp0, 16
253
254 vmovdqu DATA, XMMWORD PTR[inp]
255 vpshufb DATA, DATA, [Lbswap_mask]
256
257 vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h
258 vpxor TMP0, TMP0, TMP3
259 vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h
260 vpxor TMP1, TMP1, TMP3
261 vpshufd TMP3, DATA, 78
262 vpxor TMP3, TMP3, DATA
263 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
264 vpxor TMP2, TMP2, TMP3
265
266 test hlp0, hlp0
267 lea inp, [inp+16] ; lea leaves the flags from test intact for the jnz below
268 jnz Lpre_loop
269
270 Lred1:
271 ; Karatsuba fixup: turn the three accumulators into the low (T) and high (Xhi) halves of the product
272 vpxor TMP2, TMP2, TMP0
273 vpxor TMP2, TMP2, TMP1
274 vpsrldq TMP3, TMP2, 8
275 vpslldq TMP2, TMP2, 8
276
277 vpxor Xhi, TMP1, TMP3
278 vpxor T, TMP0, TMP2
279
280
281 Lmod_loop:
282 ; on entry T:Xhi hold the previous product, still unreduced; it is reduced while the next blocks are multiplied
283 sub len, 16*8
284 jb Ldone
285 ; Block #0
286 vmovdqu DATA, XMMWORD PTR[inp + 16*7] ; the last block of the group pairs with table entry 0
287 vpshufb DATA, DATA, [Lbswap_mask]
288
289 vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h
290 vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h
291 vpshufd TMP3, DATA, 78
292 vpxor TMP3, TMP3, DATA
293 vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
294
295 ; Block #1
296 vmovdqu DATA, XMMWORD PTR[inp + 16*6]
297 vpshufb DATA, DATA, [Lbswap_mask]
298 KARATSUBA_AAD 1
299
300 ; Block #2
301 vmovdqu DATA, XMMWORD PTR[inp + 16*5]
302 vpshufb DATA, DATA, [Lbswap_mask]
303
304 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a
305 vpalignr T, T, T, 8
306
307 KARATSUBA_AAD 2
308
309 vpxor T, T, TMP4 ;reduction stage 1b
310
311 ; Block #3
312 vmovdqu DATA, XMMWORD PTR[inp + 16*4]
313 vpshufb DATA, DATA, [Lbswap_mask]
314 KARATSUBA_AAD 3
315 ; Block #4
316 vmovdqu DATA, XMMWORD PTR[inp + 16*3]
317 vpshufb DATA, DATA, [Lbswap_mask]
318
319 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a
320 vpalignr T, T, T, 8
321
322 KARATSUBA_AAD 4
323
324 vpxor T, T, TMP4 ;reduction stage 2b
325 ; Block #5
326 vmovdqu DATA, XMMWORD PTR[inp + 16*2]
327 vpshufb DATA, DATA, [Lbswap_mask]
328 KARATSUBA_AAD 5
329
330 vpxor T, T, Xhi ;reduction finalize
331 ; Block #6
332 vmovdqu DATA, XMMWORD PTR[inp + 16*1]
333 vpshufb DATA, DATA, [Lbswap_mask]
334 KARATSUBA_AAD 6
335 ; Block #7
336 vmovdqu DATA, XMMWORD PTR[inp + 16*0]
337 vpshufb DATA, DATA, [Lbswap_mask]
338 vpxor DATA, DATA, T ; the first block of the group absorbs the now fully reduced hash state
339 KARATSUBA_AAD 7
340 ; Aggregated 8 blocks, now karatsuba fixup
341 vpxor TMP2, TMP2, TMP0
342 vpxor TMP2, TMP2, TMP1
343 vpsrldq TMP3, TMP2, 8
344 vpslldq TMP2, TMP2, 8
345
346 vpxor Xhi, TMP1, TMP3
347 vpxor T, TMP0, TMP2
348
349 lea inp, [inp + 16*8]
350 jmp Lmod_loop
351
352 Ldone:
353 vpclmulqdq TMP4, T, [Lpoly], 010h ; final reduction of the pending product: two steps, then xor the high half
354 vpalignr T, T, T, 8
355 vpxor T, T, TMP4
356
357 vpclmulqdq TMP4, T, [Lpoly], 010h
358 vpalignr T, T, T, 8
359 vpxor T, T, TMP4
360
361 vpxor T, T, Xhi
362 vmovdqu XMMWORD PTR[Tp], T ; write the updated hash state back
363 vzeroupper
364 ; restore the saved xmm6/xmm7 and release the stack space
365 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
366 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
367 add rsp, 16*2
368
369 ret
370
371 intel_aes_gcmAAD ENDP
372
373
374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
375 ;
376 ; Encrypt and Authenticate
377 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
378 ; Offsets read from Gctx: table at +0, hash state at +16*16+1*16, counter block at +16*16+2*16, key structure pointer at +16*16+3*16.
379 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
380 ; Handles 128-byte groups (8 blocks) first, then whole 16-byte blocks, then one final partial block.
381 ALIGN 16
382 intel_aes_gcmENC PROC
383
384 PT textequ <rcx>
385 CT textequ <rdx>
386 Htbl textequ <r8>
387 Gctx textequ <r8>
388 len textequ <r9>
389 KS textequ <r10>
390 NR textequ <eax>
391
392 aluCTR textequ <r11d>
393 aluKSl textequ <r12d>
394 aluTMP textequ <r13d>
395
396 T textequ <xmm0>
397 TMP0 textequ <xmm1>
398 TMP1 textequ <xmm2>
399 TMP2 textequ <xmm3>
400 TMP3 textequ <xmm4>
401 TMP4 textequ <xmm5>
402 TMP5 textequ <xmm6>
403 CTR0 textequ <xmm7>
404 CTR1 textequ <xmm8>
405 CTR2 textequ <xmm9>
406 CTR3 textequ <xmm10>
407 CTR4 textequ <xmm11>
408 CTR5 textequ <xmm12>
409 CTR6 textequ <xmm13>
410 CTR7 textequ <xmm14>
411 BSWAPMASK textequ <xmm15>
412 ; ROUND i: one AES round with round key i on all eight counter blocks (clobbers TMP3)
413 ROUND MACRO i
414 vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
415 vaesenc CTR0, CTR0, TMP3
416 vaesenc CTR1, CTR1, TMP3
417 vaesenc CTR2, CTR2, TMP3
418 vaesenc CTR3, CTR3, TMP3
419 vaesenc CTR4, CTR4, TMP3
420 vaesenc CTR5, CTR5, TMP3
421 vaesenc CTR6, CTR6, TMP3
422 vaesenc CTR7, CTR7, TMP3
423 ENDM
424 ROUNDMUL MACRO i
425 vmovdqu TMP3, XMMWORD PTR[i*16 + KS] ; ROUNDMUL: AES round i on eight blocks, interleaved with multiplying TMP5 by table entry i
426
427 vaesenc CTR0, CTR0, TMP3
428 vaesenc CTR1, CTR1, TMP3
429 vaesenc CTR2, CTR2, TMP3
430 vaesenc CTR3, CTR3, TMP3
431 ; middle-term operand: hi xor lo of TMP5
432 vpshufd TMP4, TMP5, 78
433 vpxor TMP4, TMP4, TMP5
434
435 vaesenc CTR4, CTR4, TMP3
436 vaesenc CTR5, CTR5, TMP3
437 vaesenc CTR6, CTR6, TMP3
438 vaesenc CTR7, CTR7, TMP3
439 ; accumulate by xor: TMP0 gets the middle product, TMP1 the high product, TMP2 the low product
440 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
441 vpxor TMP0, TMP0, TMP3
442 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
443 vpclmulqdq TMP3, TMP5, TMP4, 011h
444 vpxor TMP1, TMP1, TMP3
445 vpclmulqdq TMP3, TMP5, TMP4, 000h
446 vpxor TMP2, TMP2, TMP3
447 ENDM
448 KARATSUBA MACRO i
449 vpshufd TMP4, TMP5, 78 ; KARATSUBA: the multiply-accumulate half of ROUNDMUL, without the AES round
450 vpxor TMP4, TMP4, TMP5
451 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
452 vpxor TMP0, TMP0, TMP3
453 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl]
454 vpclmulqdq TMP3, TMP5, TMP4, 011h
455 vpxor TMP1, TMP1, TMP3
456 vpclmulqdq TMP3, TMP5, TMP4, 000h
457 vpxor TMP2, TMP2, TMP3
458 ENDM
459 NEXTCTR MACRO i
460 add aluCTR, 1 ; NEXTCTR: advance the counter and patch it into the last dword of counter slot 8+i
461 mov aluTMP, aluCTR
462 xor aluTMP, aluKSl ; the slots hold counter xor round key 0, so the patched dword gets the same xor
463 bswap aluTMP
464 mov [3*4 + 8*16 + i*16 + rsp], aluTMP
465 ENDM
466
467 ; a zero length returns before any register or memory is touched
468 test len, len
469 jnz LbeginENC
470 ret
471
472 LbeginENC:
473
474 vzeroupper
475 push r11
476 push r12
477 push r13
478 push rbp
479 sub rsp, 10*16 ; save area for xmm6-xmm15, all of which are used below
480 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
481 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
482 vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
483 vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
484 vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
485 vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
486 vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
487 vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
488 vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
489 vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
490 ; rbp remembers the save area; below it sits a 16-byte aligned scratch frame of sixteen 16-byte slots
491 mov rbp, rsp
492 sub rsp, 16*16 ; slots 0-7: scratch and byte-swapped ciphertext awaiting hashing, slots 8-15: counter blocks
493 and rsp, -16 ; the vmovdqa accesses to this frame need 16-byte alignment
494
495 vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] ; running hash state
496 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] ; current counter block
497 vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
498 mov KS, [16*16 + 3*16 + Gctx]
499 mov NR, [4 + KS] ; round count, compared against 10 and 12 below
500 lea KS, [48 + KS] ; KS now points at round key 0
501
502 vpshufb CTR0, CTR0, BSWAPMASK ; byte-reversed so that vpaddd can increment the counter
503
504 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] ; last dword of the counter block
505 mov aluKSl, [3*4 + KS] ; last dword of round key 0
506 bswap aluCTR
507 bswap aluKSl
508 ; slot 8 = counter block xor round key 0, i.e. the state after the initial key addition
509 vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
510 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
511 vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
512
513 cmp len, 128
514 jb LEncDataSingles
515 ; Prepare the "top" counters
516 vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
517 vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
518 vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
519 vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
520 vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
521 vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
522 vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
523
524 ; Encrypt the initial 8 blocks
525 sub len, 128
526 vpaddd CTR1, CTR0, XMMWORD PTR[Lone]
527 vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo]
528 vpaddd CTR3, CTR2, XMMWORD PTR[Lone]
529 vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo]
530 vpaddd CTR5, CTR4, XMMWORD PTR[Lone]
531 vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo]
532 vpaddd CTR7, CTR6, XMMWORD PTR[Lone]
533
534 vpshufb CTR0, CTR0, BSWAPMASK
535 vpshufb CTR1, CTR1, BSWAPMASK
536 vpshufb CTR2, CTR2, BSWAPMASK
537 vpshufb CTR3, CTR3, BSWAPMASK
538 vpshufb CTR4, CTR4, BSWAPMASK
539 vpshufb CTR5, CTR5, BSWAPMASK
540 vpshufb CTR6, CTR6, BSWAPMASK
541 vpshufb CTR7, CTR7, BSWAPMASK
542
543 vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
544 vpxor CTR0, CTR0, TMP3
545 vpxor CTR1, CTR1, TMP3
546 vpxor CTR2, CTR2, TMP3
547 vpxor CTR3, CTR3, TMP3
548 vpxor CTR4, CTR4, TMP3
549 vpxor CTR5, CTR5, TMP3
550 vpxor CTR6, CTR6, TMP3
551 vpxor CTR7, CTR7, TMP3
552
553 ROUND 1
554 ; while this group is encrypted, the stack slots are set up with the counters of the next group
555 add aluCTR, 8 ; slot 8+0 gets counter + 8
556 mov aluTMP, aluCTR
557 xor aluTMP, aluKSl
558 bswap aluTMP
559 mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
560
561 ROUND 2
562 NEXTCTR 1
563 ROUND 3
564 NEXTCTR 2
565 ROUND 4
566 NEXTCTR 3
567 ROUND 5
568 NEXTCTR 4
569 ROUND 6
570 NEXTCTR 5
571 ROUND 7
572 NEXTCTR 6
573 ROUND 8
574 NEXTCTR 7
575 ROUND 9
576 vmovdqu TMP5, XMMWORD PTR[10*16 + KS] ; TMP5 = last round key when NR is 10
577 cmp NR, 10
578 je @f
579
580 ROUND 10
581 ROUND 11
582 vmovdqu TMP5, XMMWORD PTR[12*16 + KS] ; last round key when NR is 12
583 cmp NR, 12
584 je @f
585
586 ROUND 12
587 ROUND 13
588 vmovdqu TMP5, XMMWORD PTR[14*16 + KS] ; otherwise round key 14 is the last one
589 @@:
590 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT] ; last round key xor plaintext, so vaesenclast yields ciphertext directly
591 vaesenclast CTR0, CTR0, TMP3
592 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
593 vaesenclast CTR1, CTR1, TMP3
594 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
595 vaesenclast CTR2, CTR2, TMP3
596 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
597 vaesenclast CTR3, CTR3, TMP3
598 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
599 vaesenclast CTR4, CTR4, TMP3
600 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
601 vaesenclast CTR5, CTR5, TMP3
602 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
603 vaesenclast CTR6, CTR6, TMP3
604 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
605 vaesenclast CTR7, CTR7, TMP3
606 ; store the ciphertext and keep a byte-reversed copy of every block for hashing
607 vmovdqu XMMWORD PTR[0*16 + CT], CTR0
608 vpshufb CTR0, CTR0, BSWAPMASK
609 vmovdqu XMMWORD PTR[1*16 + CT], CTR1
610 vpshufb CTR1, CTR1, BSWAPMASK
611 vmovdqu XMMWORD PTR[2*16 + CT], CTR2
612 vpshufb CTR2, CTR2, BSWAPMASK
613 vmovdqu XMMWORD PTR[3*16 + CT], CTR3
614 vpshufb CTR3, CTR3, BSWAPMASK
615 vmovdqu XMMWORD PTR[4*16 + CT], CTR4
616 vpshufb CTR4, CTR4, BSWAPMASK
617 vmovdqu XMMWORD PTR[5*16 + CT], CTR5
618 vpshufb CTR5, CTR5, BSWAPMASK
619 vmovdqu XMMWORD PTR[6*16 + CT], CTR6
620 vpshufb CTR6, CTR6, BSWAPMASK
621 vmovdqu XMMWORD PTR[7*16 + CT], CTR7
622 vpshufb TMP5, CTR7, BSWAPMASK ; the last block stays in TMP5; it is multiplied by table entry 0 later
623 ; blocks 6..0 go to slots 1..7, so slot k is later multiplied by table entry k
624 vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
625 vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
626 vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
627 vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
628 vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
629 vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
630 vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
631
632 lea CT, [8*16 + CT]
633 lea PT, [8*16 + PT]
634 jmp LEncDataOctets ; redundant: the target label follows immediately
635
636 LEncDataOctets:
637 cmp len, 128
638 jb LEndEncOctets
639 sub len, 128
640 ; each iteration encrypts the next 8 blocks while hashing the 8 ciphertext blocks of the previous group
641 vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
642 vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
643 vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
644 vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
645 vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
646 vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
647 vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
648 vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
649 ; first multiply (TMP5 by table entry 0) initialises the three accumulators
650 vpshufd TMP4, TMP5, 78
651 vpxor TMP4, TMP4, TMP5
652 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
653 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
654 vpclmulqdq TMP1, TMP5, TMP4, 011h
655 vpclmulqdq TMP2, TMP5, TMP4, 000h
656
657 vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
658 ROUNDMUL 1
659 NEXTCTR 0
660 vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
661 ROUNDMUL 2
662 NEXTCTR 1
663 vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
664 ROUNDMUL 3
665 NEXTCTR 2
666 vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
667 ROUNDMUL 4
668 NEXTCTR 3
669 vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
670 ROUNDMUL 5
671 NEXTCTR 4
672 vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
673 ROUNDMUL 6
674 NEXTCTR 5
675 vpxor TMP5, T, XMMWORD PTR[7*16 + rsp] ; the oldest block of the group absorbs the running hash state
676 ROUNDMUL 7
677 NEXTCTR 6
678
679 ROUND 8
680 NEXTCTR 7
681 ; Karatsuba fixup: TMP4 = high 128 bits, T = low 128 bits of the accumulated product
682 vpxor TMP0, TMP0, TMP1
683 vpxor TMP0, TMP0, TMP2
684 vpsrldq TMP3, TMP0, 8
685 vpxor TMP4, TMP1, TMP3
686 vpslldq TMP3, TMP0, 8
687 vpxor T, TMP2, TMP3
688 ; two reduction steps on T, with AES round 9 scheduled between them
689 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
690 vpalignr T,T,T,8
691 vpxor T, T, TMP1
692
693 ROUND 9
694
695 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
696 vpalignr T,T,T,8
697 vpxor T, T, TMP1
698
699 vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
700 cmp NR, 10
701 je @f
702
703 ROUND 10
704 ROUND 11
705 vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
706 cmp NR, 12
707 je @f
708
709 ROUND 12
710 ROUND 13
711 vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
712 @@:
713 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT]
714 vaesenclast CTR0, CTR0, TMP3
715 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT]
716 vaesenclast CTR1, CTR1, TMP3
717 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT]
718 vaesenclast CTR2, CTR2, TMP3
719 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT]
720 vaesenclast CTR3, CTR3, TMP3
721 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT]
722 vaesenclast CTR4, CTR4, TMP3
723 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT]
724 vaesenclast CTR5, CTR5, TMP3
725 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT]
726 vaesenclast CTR6, CTR6, TMP3
727 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT]
728 vaesenclast CTR7, CTR7, TMP3
729
730 vmovdqu XMMWORD PTR[0*16 + CT], CTR0
731 vpshufb CTR0, CTR0, BSWAPMASK
732 vmovdqu XMMWORD PTR[1*16 + CT], CTR1
733 vpshufb CTR1, CTR1, BSWAPMASK
734 vmovdqu XMMWORD PTR[2*16 + CT], CTR2
735 vpshufb CTR2, CTR2, BSWAPMASK
736 vmovdqu XMMWORD PTR[3*16 + CT], CTR3
737 vpshufb CTR3, CTR3, BSWAPMASK
738 vmovdqu XMMWORD PTR[4*16 + CT], CTR4
739 vpshufb CTR4, CTR4, BSWAPMASK
740 vmovdqu XMMWORD PTR[5*16 + CT], CTR5
741 vpshufb CTR5, CTR5, BSWAPMASK
742 vmovdqu XMMWORD PTR[6*16 + CT], CTR6
743 vpshufb CTR6, CTR6, BSWAPMASK
744 vmovdqu XMMWORD PTR[7*16 + CT], CTR7
745 vpshufb TMP5, CTR7, BSWAPMASK
746
747 vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
748 vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
749 vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
750 vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
751 vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
752 vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
753 vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
754
755 vpxor T, T, TMP4 ; finish the reduction by folding in the high half kept in TMP4
756
757 lea CT, [8*16 + CT]
758 lea PT, [8*16 + PT]
759 jmp LEncDataOctets
760
761 LEndEncOctets:
762 ; hash the last encrypted group (TMP5 and slots 1-7), this time without any AES work
763 vpshufd TMP4, TMP5, 78
764 vpxor TMP4, TMP4, TMP5
765 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
766 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
767 vpclmulqdq TMP1, TMP5, TMP4, 011h
768 vpclmulqdq TMP2, TMP5, TMP4, 000h
769
770 vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
771 KARATSUBA 1
772 vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
773 KARATSUBA 2
774 vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
775 KARATSUBA 3
776 vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
777 KARATSUBA 4
778 vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
779 KARATSUBA 5
780 vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
781 KARATSUBA 6
782 vpxor TMP5, T, XMMWORD PTR[7*16 + rsp]
783 KARATSUBA 7
784
785 vpxor TMP0, TMP0, TMP1
786 vpxor TMP0, TMP0, TMP2
787 vpsrldq TMP3, TMP0, 8
788 vpxor TMP4, TMP1, TMP3
789 vpslldq TMP3, TMP0, 8
790 vpxor T, TMP2, TMP3
791
792 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
793 vpalignr T,T,T,8
794 vpxor T, T, TMP1
795
796 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
797 vpalignr T,T,T,8
798 vpxor T, T, TMP1
799
800 vpxor T, T, TMP4
801
802 sub aluCTR, 7 ; aluCTR ran ahead to slot 15; bring it back to the counter stored in slot 8
803
804 LEncDataSingles:
805 ; one whole block per iteration: encrypt with the counter in slot 8, then hash the ciphertext
806 cmp len, 16
807 jb LEncDataTail
808 sub len, 16
809
810 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
811 NEXTCTR 0
812
813 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
814 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
815 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
816 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
817 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
818 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
819 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
820 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
821 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
822 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
823 cmp NR, 10
824 je @f
825 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
826 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
827 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
828 cmp NR, 12
829 je @f
830 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
831 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
832 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
833 @@:
834 vaesenclast TMP1, TMP1, TMP2
835 vpxor TMP1, TMP1, XMMWORD PTR[PT]
836 vmovdqu XMMWORD PTR[CT], TMP1
837
838 lea PT, [16+PT]
839 lea CT, [16+CT]
840
841 vpshufb TMP1, TMP1, BSWAPMASK
842 vpxor T, T, TMP1
843 vmovdqu TMP0, XMMWORD PTR[Htbl]
844 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
845
846 jmp LEncDataSingles
847
848 LEncDataTail:
849 ; final partial block (fewer than 16 bytes); the counter in slot 8 is used but aluCTR is not advanced here
850 test len, len
851 jz LEncDataEnd
852
853 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
854
855 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
856 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
857 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
858 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
859 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
860 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
861 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
862 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
863 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
864 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
865 cmp NR, 10
866 je @f
867 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
868 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
869 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
870 cmp NR, 12
871 je @f
872 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
873 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
874 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
875 @@:
876 vaesenclast TMP1, TMP1, TMP2 ; TMP1 = keystream block
877 ; zero a temp location
878 vpxor TMP2, TMP2, TMP2
879 vmovdqa XMMWORD PTR[rsp], TMP2
880 ; copy as many bytes as needed
881 xor KS, KS ; the key pointer is no longer needed; KS becomes a byte index, al (low byte of NR) a byte buffer
882
883 @@:
884 cmp len, KS
885 je @f
886 mov al, [PT + KS]
887 mov [rsp + KS], al
888 inc KS
889 jmp @b
890 @@:
891 vpxor TMP1, TMP1, XMMWORD PTR[rsp] ; encrypt the zero-padded plaintext in one go
892 vmovdqa XMMWORD PTR[rsp], TMP1
893 xor KS, KS
894 @@:
895 cmp len, KS ; write only len ciphertext bytes to the output
896 je @f
897 mov al, [rsp + KS]
898 mov [CT + KS], al
899 inc KS
900 jmp @b
901 @@:
902 cmp KS, 16 ; KS continues from len: clear the remaining bytes so the hashed block is zero padded
903 je @f
904 mov BYTE PTR[rsp + KS], 0
905 inc KS
906 jmp @b
907 @@:
908 BAIL: ; not the target of any jump visible in this file
909 vmovdqa TMP1, XMMWORD PTR[rsp]
910 vpshufb TMP1, TMP1, BSWAPMASK
911 vpxor T, T, TMP1
912 vmovdqu TMP0, XMMWORD PTR[Htbl]
913 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
914
915 LEncDataEnd:
916 ; write the hash state and the counter dword back into the context
917 vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
918 bswap aluCTR
919 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
920
921 mov rsp, rbp ; drop the aligned scratch frame; rsp points at the xmm save area again
922
923 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
924 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
925 vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
926 vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
927 vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
928 vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
929 vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
930 vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
931 vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
932 vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
933
934 add rsp, 10*16
935 pop rbp
936 pop r13
937 pop r12
938 pop r11
939
940 vzeroupper
941
942 ret
943 intel_aes_gcmENC ENDP
944
945 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
946 ;
947 ; Decrypt and Authenticate
948 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
949 ; NOTE: the code reads ciphertext through rcx (1st argument) and writes plaintext through rdx (2nd argument), the reverse of the parameter names above.
950 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
951 ; Reuses the register aliases and the ROUND / ROUNDMUL macros defined in intel_aes_gcmENC; only NEXTCTR, PT and CT are redefined here.
952 ALIGN 16
953 intel_aes_gcmDEC PROC
954
955 NEXTCTR MACRO i
956 add aluCTR, 1 ; same as the ENC version, but the counter slots start at rsp + 0 in this frame
957 mov aluTMP, aluCTR
958 xor aluTMP, aluKSl
959 bswap aluTMP
960 mov [3*4 + i*16 + rsp], aluTMP
961 ENDM
962
963 PT textequ <rdx>
964 CT textequ <rcx>
965
966 test len, len
967 jnz LbeginDEC
968 ret
969
970 LbeginDEC:
971
972 vzeroupper
973 push r11
974 push r12
975 push r13
976 push rbp
977 sub rsp, 10*16 ; save area for xmm6-xmm15
978 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
979 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
980 vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
981 vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
982 vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
983 vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
984 vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
985 vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
986 vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
987 vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
988 ; rbp remembers the save area; the aligned frame below holds eight counter slots
989 mov rbp, rsp
990 sub rsp, 8*16
991 and rsp, -16 ; the vmovdqa accesses to this frame need 16-byte alignment
992
993 vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] ; running hash state
994 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] ; current counter block
995 vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
996 mov KS, [16*16 + 3*16 + Gctx]
997 mov NR, [4 + KS] ; round count, compared against 10 and 12 below
998 lea KS, [48 + KS] ; KS now points at round key 0
999
1000 vpshufb CTR0, CTR0, BSWAPMASK
1001
1002 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] ; last dword of the counter block
1003 mov aluKSl, [3*4 + KS] ; last dword of round key 0
1004 bswap aluCTR
1005 bswap aluKSl
1006 ; slot 0 = counter block xor round key 0
1007 vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
1008 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
1009 vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
1010
1011 cmp len, 128
1012 jb LDecDataSingles
1013 ; Prepare the "top" counters
1014 vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
1015 vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
1016 vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
1017 vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
1018 vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
1019 vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
1020 vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
1021 ; slots 1-7 receive counter+1 .. counter+7
1022 NEXTCTR 1
1023 NEXTCTR 2
1024 NEXTCTR 3
1025 NEXTCTR 4
1026 NEXTCTR 5
1027 NEXTCTR 6
1028 NEXTCTR 7
1029
1030 LDecDataOctets:
1031 cmp len, 128
1032 jb LEndDecOctets
1033 sub len, 128
1034 ; each iteration hashes and decrypts the same 8 ciphertext blocks, read straight from the input
1035 vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
1036 vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
1037 vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
1038 vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
1039 vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
1040 vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
1041 vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
1042 vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
1043 ; the last block of the group pairs with table entry 0 and initialises the accumulators
1044 vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
1045 vpshufb TMP5, TMP5, BSWAPMASK
1046 vpshufd TMP4, TMP5, 78
1047 vpxor TMP4, TMP4, TMP5
1048 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
1049 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
1050 vpclmulqdq TMP1, TMP5, TMP4, 011h
1051 vpclmulqdq TMP2, TMP5, TMP4, 000h
1052
1053 vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
1054 vpshufb TMP5, TMP5, BSWAPMASK
1055 ROUNDMUL 1
1056 NEXTCTR 0
1057 vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
1058 vpshufb TMP5, TMP5, BSWAPMASK
1059 ROUNDMUL 2
1060 NEXTCTR 1
1061 vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
1062 vpshufb TMP5, TMP5, BSWAPMASK
1063 ROUNDMUL 3
1064 NEXTCTR 2
1065 vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
1066 vpshufb TMP5, TMP5, BSWAPMASK
1067 ROUNDMUL 4
1068 NEXTCTR 3
1069 vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
1070 vpshufb TMP5, TMP5, BSWAPMASK
1071 ROUNDMUL 5
1072 NEXTCTR 4
1073 vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
1074 vpshufb TMP5, TMP5, BSWAPMASK
1075 ROUNDMUL 6
1076 NEXTCTR 5
1077 vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
1078 vpshufb TMP5, TMP5, BSWAPMASK
1079 vpxor TMP5, TMP5, T ; the first block of the group absorbs the running hash state
1080 ROUNDMUL 7
1081 NEXTCTR 6
1082
1083 ROUND 8
1084 NEXTCTR 7
1085 ; Karatsuba fixup: TMP4 = high 128 bits, T = low 128 bits of the accumulated product
1086 vpxor TMP0, TMP0, TMP1
1087 vpxor TMP0, TMP0, TMP2
1088 vpsrldq TMP3, TMP0, 8
1089 vpxor TMP4, TMP1, TMP3
1090 vpslldq TMP3, TMP0, 8
1091 vpxor T, TMP2, TMP3
1092 ; two reduction steps on T, with AES round 9 scheduled between them
1093 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
1094 vpalignr T,T,T,8
1095 vpxor T, T, TMP1
1096
1097 ROUND 9
1098
1099 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
1100 vpalignr T,T,T,8
1101 vpxor T, T, TMP1
1102
1103 vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
1104 cmp NR, 10
1105 je @f
1106
1107 ROUND 10
1108 ROUND 11
1109 vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
1110 cmp NR, 12
1111 je @f
1112
1113 ROUND 12
1114 ROUND 13
1115 vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
1116 @@:
1117 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT] ; last round key xor ciphertext, so vaesenclast yields plaintext directly
1118 vaesenclast CTR0, CTR0, TMP3
1119 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT]
1120 vaesenclast CTR1, CTR1, TMP3
1121 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT]
1122 vaesenclast CTR2, CTR2, TMP3
1123 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT]
1124 vaesenclast CTR3, CTR3, TMP3
1125 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT]
1126 vaesenclast CTR4, CTR4, TMP3
1127 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT]
1128 vaesenclast CTR5, CTR5, TMP3
1129 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT]
1130 vaesenclast CTR6, CTR6, TMP3
1131 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT]
1132 vaesenclast CTR7, CTR7, TMP3
1133
1134 vmovdqu XMMWORD PTR[0*16 + PT], CTR0
1135 vmovdqu XMMWORD PTR[1*16 + PT], CTR1
1136 vmovdqu XMMWORD PTR[2*16 + PT], CTR2
1137 vmovdqu XMMWORD PTR[3*16 + PT], CTR3
1138 vmovdqu XMMWORD PTR[4*16 + PT], CTR4
1139 vmovdqu XMMWORD PTR[5*16 + PT], CTR5
1140 vmovdqu XMMWORD PTR[6*16 + PT], CTR6
1141 vmovdqu XMMWORD PTR[7*16 + PT], CTR7
1142
1143 vpxor T, T, TMP4 ; finish the reduction by folding in the high half kept in TMP4
1144
1145 lea CT, [8*16 + CT]
1146 lea PT, [8*16 + PT]
1147 jmp LDecDataOctets
1148
1149 LEndDecOctets:
1150
1151 sub aluCTR, 7 ; aluCTR ran ahead to slot 7; bring it back to the counter stored in slot 0
1152
1153 LDecDataSingles:
1154 ; one whole block per iteration: build keystream from slot 0, decrypt, then hash the ciphertext
1155 cmp len, 16
1156 jb LDecDataTail
1157 sub len, 16
1158
1159 vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
1160 NEXTCTR 0
1161
1162 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
1163 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
1164 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
1165 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
1166 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
1167 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
1168 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
1169 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
1170 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
1171 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
1172 cmp NR, 10
1173 je @f
1174 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
1175 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
1176 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
1177 cmp NR, 12
1178 je @f
1179 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
1180 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
1181 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
1182 @@:
1183 vaesenclast TMP1, TMP1, TMP2
1184
1185 vmovdqu TMP2, XMMWORD PTR[CT] ; keep the ciphertext in TMP2: it is hashed after the plaintext is stored
1186 vpxor TMP1, TMP1, TMP2
1187 vmovdqu XMMWORD PTR[PT], TMP1
1188
1189 lea PT, [16+PT]
1190 lea CT, [16+CT]
1191
1192 vpshufb TMP2, TMP2, BSWAPMASK
1193 vpxor T, T, TMP2
1194 vmovdqu TMP0, XMMWORD PTR[Htbl]
1195 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4
1196
1197 jmp LDecDataSingles
1198
1199 LDecDataTail:
1200 ; final partial block (fewer than 16 bytes)
1201 test len, len
1202 jz LDecDataEnd
1203
1204 vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
1205 inc aluCTR ; NOTE(review): the tail path of intel_aes_gcmENC has no matching increment - confirm which is intended
1206 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
1207 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
1208 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
1209 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
1210 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
1211 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
1212 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
1213 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
1214 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
1215 vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
1216 cmp NR, 10
1217 je @f
1218 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
1219 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
1220 vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
1221 cmp NR, 12
1222 je @f
1223 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
1224 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
1225 vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
1226 @@:
1227 vaesenclast TMP1, TMP1, TMP2 ; TMP1 = keystream block
1228 ; copy as many bytes as needed
1229 xor KS, KS ; the key pointer is no longer needed; KS becomes a byte index, al (low byte of NR) a byte buffer
1230 @@:
1231 cmp len, KS
1232 je @f
1233 mov al, [CT + KS]
1234 mov [rsp + KS], al
1235 inc KS
1236 jmp @b
1237 @@:
1238 cmp KS, 16 ; KS continues from len: zero the rest of the slot so the hashed block is zero padded
1239 je @f
1240 mov BYTE PTR[rsp + KS], 0
1241 inc KS
1242 jmp @b
1243 @@:
1244 vmovdqa TMP2, XMMWORD PTR[rsp]
1245 vpshufb TMP2, TMP2, BSWAPMASK
1246 vpxor T, T, TMP2
1247 vmovdqu TMP0, XMMWORD PTR[Htbl]
1248 GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4
1249 ; TMP5 served as GFMUL scratch above so that the keystream in TMP1 survives
1250
1251 vpxor TMP1, TMP1, XMMWORD PTR[rsp] ; keystream xor zero-padded ciphertext
1252 vmovdqa XMMWORD PTR[rsp], TMP1
1253 xor KS, KS
1254 @@:
1255 cmp len, KS ; write only len plaintext bytes to the output
1256 je @f
1257 mov al, [rsp + KS]
1258 mov [PT + KS], al
1259 inc KS
1260 jmp @b
1261 @@:
1262
1263 LDecDataEnd:
1264 ; write the hash state and the counter dword back into the context
1265 vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
1266 bswap aluCTR
1267 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR
1268
1269 mov rsp, rbp ; drop the aligned scratch frame; rsp points at the xmm save area again
1270
1271 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
1272 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
1273 vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
1274 vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
1275 vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
1276 vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
1277 vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
1278 vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
1279 vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
1280 vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
1281
1282 add rsp, 10*16
1283 pop rbp
1284 pop r13
1285 pop r12
1286 pop r11
1287
1288 vzeroupper
1289
1290 ret
1291 ret ; unreachable duplicate of the ret above
1292 intel_aes_gcmDEC ENDP
1293
1294
1295 END
OLDNEW
« no previous file with comments | « nss/lib/freebl/intel-gcm-wrap.c ('k') | nss/lib/freebl/intel-gcm-x86-masm.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698