Chromium Code Reviews

Side by Side Diff: openssl/crypto/aes/asm/bsaes-x86_64.pl

Issue 2072073002: Delete bundled copy of OpenSSL and replace with README. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/openssl@master
Patch Set: Delete bundled copy of OpenSSL and replace with README. Created 4 years, 6 months ago
1 #!/usr/bin/env perl
2
3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
9 ### Public domain ###
10 ### ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
14 #
15 # September 2011.
16 #
17 # Started as a transliteration of the original code to "perlasm", it
18 # has since undergone the following changes:
19 #
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - the above was possible thanks to a mixcolumns() modification that
24 # allows feeding its output back to aesenc[last]; this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement a key setup subroutine; instead it
28 # relies on conversion of the "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which made it
31 # possible to skip one shiftrows(), reduce the bit-sliced key
32 # schedule and speed up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
34 #
35 # Resulting performance, in CPU cycles spent to encrypt one byte of a
36 # 4096-byte buffer with a 128-bit key, is:
37 #
38 # Emilia's this(*) difference
39 #
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
43 #
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter value calculation
46 # and xor-ing of the input, as in Emilia's CTR implementation,
47 # is performed. However, the CTR calculations account for no
48 # more than 1% of total time, so the comparison is *rather* fair.
49 #
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
52 #
53 # (***) The slowdown on Atom is rather strange per se, because the
54 # original implementation has a number of 9+-byte instructions,
55 # which are bad for the Atom front-end and which I eliminated
56 # completely. In an attempt to address the deterioration, sbox()
57 # was tested in the FP SIMD "domain" (movaps instead of movdqa,
58 # xorps instead of pxor, etc.). While this resulted in a nominal
59 # 4% improvement on Atom, it hurt Westmere by more than a 2x factor.
60 #
61 # As for the key schedule conversion subroutine: the interface to
62 # OpenSSL relies on per-invocation on-the-fly conversion. This
63 # naturally has an impact on performance, especially for short inputs.
64 # Conversion time in CPU cycles and its ratio to CPU cycles spent in
65 # the 8x block function is:
66 #
67 # conversion conversion/8x block
68 # Core 2 240 0.22
69 # Nehalem 180 0.20
70 # Atom 430 0.19
71 #
72 # The ratio values mean that 128-byte blocks will be processed 16-18%
73 # slower, 256-byte blocks 9-10%, 384-byte blocks 6-7%, etc. (see the
74 # worked model after this header). Keep in mind that input sizes not
75 # divisible by 128 are *effectively* slower, especially the shortest
76 # ones, e.g. consecutive 144-byte blocks are processed 44% slower than
77 # expected, 272-byte ones 29%, 400-byte ones 22%, etc. Yet, despite all
78 # these "shortcomings" it's still faster than the ["hyper-threading-
79 # safe" code path in] aes-x86_64.pl on all lengths above 64 bytes...
80 #
81 # October 2011.
82 #
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
85 #
86 # Core 2 11.0
87 # Nehalem 9.16
88 # Atom 20.9
89 #
90 # November 2011.
91 #
92 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93 # suboptimal, but XTS is meant to be used with larger blocks...
94 #
95 # <appro@openssl.org>
96
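A quick way to sanity-check the short-input figures quoted in the header: with conversion/8x-block ratio r, an n-by-128-byte input costs roughly (r + n) block-times instead of n, so the relative slowdown is r/(n + r). A standalone Perl sketch (not part of the generator; ratios copied from the table above):

    #!/usr/bin/env perl
    use strict; use warnings;
    my %ratio = ("Core 2" => 0.22, "Nehalem" => 0.20, "Atom" => 0.19);
    for my $cpu (sort keys %ratio) {
        my $r = $ratio{$cpu};
        printf "%-8s %4d bytes: %4.1f%% slower\n", $cpu, 128 * $_, 100 * $r / ($_ + $r)
            for 1 .. 3;   # 128-, 256-, 384-byte inputs
    }
    # prints roughly 16-18%, 9-10%, 6-7%, matching the header's figures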
97 $flavour = shift;
98 $output = shift;
99 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106 die "can't locate x86_64-xlate.pl";
107
108 open OUT,"| \"$^X\" $xlate $flavour $output";
109 *STDOUT=*OUT;
110
111 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114
115 {
116 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118 sub Sbox {
119 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121 my @b=@_[0..7];
122 my @t=@_[8..11];
123 my @s=@_[12..15];
124 &InBasisChange (@b);
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127 }
128
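Sbox() computes the AES S-box on eight bit-sliced inputs: InBasisChange/OutBasisChange are the linear basis changes into and out of the tower-field representation, wrapped around the GF(2^8) inversion in Inv_GF256. As a behavioral reference only (a standalone sketch, not used by the generator), the same S-box computed byte-at-a-time as inversion plus the affine map:

    use strict; use warnings;
    sub gmul {                        # multiply in GF(2^8) mod x^8+x^4+x^3+x+1
        my ($a, $b) = @_;
        my $p = 0;
        for (1 .. 8) {
            $p ^= $a if $b & 1;
            $a = (($a << 1) ^ (($a & 0x80) ? 0x11b : 0)) & 0xff;
            $b >>= 1;
        }
        return $p;
    }
    sub ginv {                        # a^254 = a^-1 (and 0 -> 0, as AES wants)
        my ($a, $r) = (shift, 1);
        $r = gmul($r, $a) for 1 .. 254;
        return $r;
    }
    sub sbox_ref {                    # inversion followed by the affine map
        my $x = ginv(shift);
        my $s = 0x63;
        $s ^= ((($x << $_) | ($x >> (8 - $_))) & 0xff) for 0 .. 4;
        return $s;
    }
    printf "S(0x00)=0x%02x S(0x53)=0x%02x\n", sbox_ref(0x00), sbox_ref(0x53);
    # expected: S(0x00)=0x63, S(0x53)=0xed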
129 sub InBasisChange {
130 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132 my @b=@_[0..7];
133 $code.=<<___;
134 pxor @b[6], @b[5]
135 pxor @b[1], @b[2]
136 pxor @b[0], @b[3]
137 pxor @b[2], @b[6]
138 pxor @b[0], @b[5]
139
140 pxor @b[3], @b[6]
141 pxor @b[7], @b[3]
142 pxor @b[5], @b[7]
143 pxor @b[4], @b[3]
144 pxor @b[5], @b[4]
145 pxor @b[1], @b[3]
146
147 pxor @b[7], @b[2]
148 pxor @b[5], @b[1]
149 ___
150 }
151
152 sub OutBasisChange {
153 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155 my @b=@_[0..7];
156 $code.=<<___;
157 pxor @b[6], @b[0]
158 pxor @b[4], @b[1]
159 pxor @b[0], @b[2]
160 pxor @b[6], @b[4]
161 pxor @b[1], @b[6]
162
163 pxor @b[5], @b[1]
164 pxor @b[3], @b[5]
165 pxor @b[7], @b[3]
166 pxor @b[5], @b[7]
167 pxor @b[5], @b[2]
168
169 pxor @b[7], @b[4]
170 ___
171 }
172
173 sub InvSbox {
174 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176 my @b=@_[0..7];
177 my @t=@_[8..11];
178 my @s=@_[12..15];
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182 }
183
184 sub InvInBasisChange { # OutBasisChange in reverse
185 my @b=@_[5,1,2,6,3,7,0,4];
186 $code.=<<___
187 pxor @b[7], @b[4]
188
189 pxor @b[5], @b[7]
190 pxor @b[5], @b[2]
191 pxor @b[7], @b[3]
192 pxor @b[3], @b[5]
193 pxor @b[5], @b[1]
194
195 pxor @b[1], @b[6]
196 pxor @b[0], @b[2]
197 pxor @b[6], @b[4]
198 pxor @b[6], @b[0]
199 pxor @b[4], @b[1]
200 ___
201 }
202
203 sub InvOutBasisChange { # InBasisChange in reverse
204 my @b=@_[2,5,7,3,6,1,0,4];
205 $code.=<<___;
206 pxor @b[5], @b[1]
207 pxor @b[7], @b[2]
208
209 pxor @b[1], @b[3]
210 pxor @b[5], @b[4]
211 pxor @b[5], @b[7]
212 pxor @b[4], @b[3]
213 pxor @b[0], @b[5]
214 pxor @b[7], @b[3]
215 pxor @b[2], @b[6]
216 pxor @b[1], @b[2]
217 pxor @b[3], @b[6]
218
219 pxor @b[0], @b[3]
220 pxor @b[6], @b[5]
221 ___
222 }
223
224 sub Mul_GF4 {
225 #;*************************************************************
226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227 #;*************************************************************
228 my ($x0,$x1,$y0,$y1,$t0)=@_;
229 $code.=<<___;
230 movdqa $y0, $t0
231 pxor $y1, $t0
232 pand $x0, $t0
233 pxor $x1, $x0
234 pand $y0, $x1
235 pand $y1, $x0
236 pxor $x1, $x0
237 pxor $t0, $x1
238 ___
239 }
240
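Mul_GF4 multiplies two GF(2^2) elements held as bit-slices, using only pxor/pand plus one temporary. A standalone bit-level replay checking the dataflow against a closed form for all sixteen input pairs (the normal-basis reading, with basis {W, W^2} and W^2 = W + 1, is an editorial derivation, not stated in the file):

    use strict; use warnings;
    sub mul_gf4_sim {                 # scalar replay of the register updates
        my ($x0, $x1, $y0, $y1) = @_;
        my $t0 = ($y0 ^ $y1) & $x0;   # movdqa/pxor/pand
        $x0 ^= $x1;                   # pxor $x1,$x0
        $x1 &= $y0;                   # pand $y0,$x1
        $x0 &= $y1;                   # pand $y1,$x0
        $x0 ^= $x1;                   # pxor $x1,$x0
        $x1 ^= $t0;                   # pxor $t0,$x1
        return ($x0, $x1);
    }
    sub mul_gf4_ref {                 # closed form, normal basis {W, W^2}
        my ($x0, $x1, $y0, $y1) = @_;
        return (($x1 & $y1) ^ ($x1 & $y0) ^ ($x0 & $y1),    # W^2 coefficient
                ($x0 & $y0) ^ ($x1 & $y0) ^ ($x0 & $y1));   # W   coefficient
    }
    for my $a (0 .. 3) {
        for my $b (0 .. 3) {
            my @s = mul_gf4_sim($a & 1, $a >> 1, $b & 1, $b >> 1);
            my @r = mul_gf4_ref($a & 1, $a >> 1, $b & 1, $b >> 1);
            die "mismatch at ($a,$b)\n" unless "@s" eq "@r";
        }
    }
    print "Mul_GF4 dataflow == GF(4) normal-basis multiplication\n";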
241 sub Mul_GF4_N { # not used, see next subroutine
242 # multiply and scale by N
243 my ($x0,$x1,$y0,$y1,$t0)=@_;
244 $code.=<<___;
245 movdqa $y0, $t0
246 pxor $y1, $t0
247 pand $x0, $t0
248 pxor $x1, $x0
249 pand $y0, $x1
250 pand $y1, $x0
251 pxor $x0, $x1
252 pxor $t0, $x0
253 ___
254 }
255
256 sub Mul_GF4_N_GF4 {
257 # interleaved Mul_GF4_N and Mul_GF4
258 my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
260 $code.=<<___;
261 movdqa $y0, $t0
262 movdqa $y2, $t1
263 pxor $y1, $t0
264 pxor $y3, $t1
265 pand $x0, $t0
266 pand $x2, $t1
267 pxor $x1, $x0
268 pxor $x3, $x2
269 pand $y0, $x1
270 pand $y2, $x3
271 pand $y1, $x0
272 pand $y3, $x2
273 pxor $x0, $x1
274 pxor $x3, $x2
275 pxor $t0, $x0
276 pxor $t1, $x3
277 ___
278 }
279 sub Mul_GF16_2 {
280 my @x=@_[0..7];
281 my @y=@_[8..11];
282 my @t=@_[12..15];
283 $code.=<<___;
284 movdqa @x[0], @t[0]
285 movdqa @x[1], @t[1]
286 ___
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288 $code.=<<___;
289 pxor @x[2], @t[0]
290 pxor @x[3], @t[1]
291 pxor @y[2], @y[0]
292 pxor @y[3], @y[1]
293 ___
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
296 $code.=<<___;
297 pxor @t[0], @x[0]
298 pxor @t[0], @x[2]
299 pxor @t[1], @x[1]
300 pxor @t[1], @x[3]
301
302 movdqa @x[4], @t[0]
303 movdqa @x[5], @t[1]
304 pxor @x[6], @t[0]
305 pxor @x[7], @t[1]
306 ___
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
309 $code.=<<___;
310 pxor @y[2], @y[0]
311 pxor @y[3], @y[1]
312 ___
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314 $code.=<<___;
315 pxor @t[0], @x[4]
316 pxor @t[0], @x[6]
317 pxor @t[1], @x[5]
318 pxor @t[1], @x[7]
319 ___
320 }
321 sub Inv_GF256 {
322 #;********************************************************************
323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324 #;********************************************************************
325 my @x=@_[0..7];
326 my @t=@_[8..11];
327 my @s=@_[12..15];
328 # direct optimizations from hardware
329 $code.=<<___;
330 movdqa @x[4], @t[3]
331 movdqa @x[5], @t[2]
332 movdqa @x[1], @t[1]
333 movdqa @x[7], @s[1]
334 movdqa @x[0], @s[0]
335
336 pxor @x[6], @t[3]
337 pxor @x[7], @t[2]
338 pxor @x[3], @t[1]
339 movdqa @t[3], @s[2]
340 pxor @x[6], @s[1]
341 movdqa @t[2], @t[0]
342 pxor @x[2], @s[0]
343 movdqa @t[3], @s[3]
344
345 por @t[1], @t[2]
346 por @s[0], @t[3]
347 pxor @t[0], @s[3]
348 pand @s[0], @s[2]
349 pxor @t[1], @s[0]
350 pand @t[1], @t[0]
351 pand @s[0], @s[3]
352 movdqa @x[3], @s[0]
353 pxor @x[2], @s[0]
354 pand @s[0], @s[1]
355 pxor @s[1], @t[3]
356 pxor @s[1], @t[2]
357 movdqa @x[4], @s[1]
358 movdqa @x[1], @s[0]
359 pxor @x[5], @s[1]
360 pxor @x[0], @s[0]
361 movdqa @s[1], @t[1]
362 pand @s[0], @s[1]
363 por @s[0], @t[1]
364 pxor @s[1], @t[0]
365 pxor @s[3], @t[3]
366 pxor @s[2], @t[2]
367 pxor @s[3], @t[1]
368 movdqa @x[7], @s[0]
369 pxor @s[2], @t[0]
370 movdqa @x[6], @s[1]
371 pxor @s[2], @t[1]
372 movdqa @x[5], @s[2]
373 pand @x[3], @s[0]
374 movdqa @x[4], @s[3]
375 pand @x[2], @s[1]
376 pand @x[1], @s[2]
377 por @x[0], @s[3]
378 pxor @s[0], @t[3]
379 pxor @s[1], @t[2]
380 pxor @s[2], @t[1]
381 pxor @s[3], @t[0]
382
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385 # new smaller inversion
386
387 movdqa @t[3], @s[0]
388 pand @t[1], @t[3]
389 pxor @t[2], @s[0]
390
391 movdqa @t[0], @s[2]
392 movdqa @s[0], @s[3]
393 pxor @t[3], @s[2]
394 pand @s[2], @s[3]
395
396 movdqa @t[1], @s[1]
397 pxor @t[2], @s[3]
398 pxor @t[0], @s[1]
399
400 pxor @t[2], @t[3]
401
402 pand @t[3], @s[1]
403
404 movdqa @s[2], @t[2]
405 pxor @t[0], @s[1]
406
407 pxor @s[1], @t[2]
408 pxor @s[1], @t[1]
409
410 pand @t[0], @t[2]
411
412 pxor @t[2], @s[2]
413 pxor @t[2], @t[1]
414
415 pand @s[3], @s[2]
416
417 pxor @s[0], @s[2]
418 ___
419 # output in s3, s2, s1, t1
420
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427 }
428
429 # AES linear components
430
431 sub ShiftRows {
432 my @x=@_[0..7];
433 my $mask=pop;
434 $code.=<<___;
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
437 pshufb $mask,@x[0]
438 pxor 0x20($key),@x[2]
439 pshufb $mask,@x[1]
440 pxor 0x30($key),@x[3]
441 pshufb $mask,@x[2]
442 pxor 0x40($key),@x[4]
443 pshufb $mask,@x[3]
444 pxor 0x50($key),@x[5]
445 pshufb $mask,@x[4]
446 pxor 0x60($key),@x[6]
447 pshufb $mask,@x[5]
448 pxor 0x70($key),@x[7]
449 pshufb $mask,@x[6]
450 lea 0x80($key),$key
451 pshufb $mask,@x[7]
452 ___
453 }
454
455 sub MixColumns {
456 # modified to emit output in order suitable for feeding back to aesenc[last]
457 my @x=@_[0..7];
458 my @t=@_[8..15];
459 $code.=<<___;
460 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
461 pshufd \$0x93, @x[1], @t[1]
462 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
463 pshufd \$0x93, @x[2], @t[2]
464 pxor @t[1], @x[1]
465 pshufd \$0x93, @x[3], @t[3]
466 pxor @t[2], @x[2]
467 pshufd \$0x93, @x[4], @t[4]
468 pxor @t[3], @x[3]
469 pshufd \$0x93, @x[5], @t[5]
470 pxor @t[4], @x[4]
471 pshufd \$0x93, @x[6], @t[6]
472 pxor @t[5], @x[5]
473 pshufd \$0x93, @x[7], @t[7]
474 pxor @t[6], @x[6]
475 pxor @t[7], @x[7]
476
477 pxor @x[0], @t[1]
478 pxor @x[7], @t[0]
479 pxor @x[7], @t[1]
480 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
481 pxor @x[1], @t[2]
482 pshufd \$0x4E, @x[1], @x[1]
483 pxor @x[4], @t[5]
484 pxor @t[0], @x[0]
485 pxor @x[5], @t[6]
486 pxor @t[1], @x[1]
487 pxor @x[3], @t[4]
488 pshufd \$0x4E, @x[4], @t[0]
489 pxor @x[6], @t[7]
490 pshufd \$0x4E, @x[5], @t[1]
491 pxor @x[2], @t[3]
492 pshufd \$0x4E, @x[3], @x[4]
493 pxor @x[7], @t[3]
494 pshufd \$0x4E, @x[7], @x[5]
495 pxor @x[7], @t[4]
496 pshufd \$0x4E, @x[6], @x[3]
497 pxor @t[4], @t[0]
498 pshufd \$0x4E, @x[2], @x[6]
499 pxor @t[5], @t[1]
500
501 pxor @t[3], @x[4]
502 pxor @t[7], @x[5]
503 pxor @t[6], @x[3]
504 movdqa @t[0], @x[2]
505 pxor @t[2], @x[6]
506 movdqa @t[1], @x[7]
507 ___
508 }
509
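MixColumns never multiplies in GF(2^8) directly; because the state is bit-sliced, the per-column matrix collapses into XORs of dword rotations (per the inline comments, pshufd 0x93 rotates each 128-bit row by 32 bits and 0x4E by 64). For reference, the byte-level transform the rotated rows must add up to (a standalone sketch, checked against the well-known MixColumns test column):

    use strict; use warnings;
    sub xtime { my $a = shift; (($a << 1) ^ (($a & 0x80) ? 0x1b : 0)) & 0xff }
    sub mixcolumn {                   # [02 03 01 01] circulant, one column
        my @a = @_;
        return map {
            xtime($a[$_])                                  # 02 * a[i]
            ^ xtime($a[($_ + 1) % 4]) ^ $a[($_ + 1) % 4]   # 03 * a[i+1]
            ^ $a[($_ + 2) % 4] ^ $a[($_ + 3) % 4]          # 01 * the rest
        } 0 .. 3;
    }
    printf "%02x %02x %02x %02x\n", mixcolumn(0xdb, 0x13, 0x53, 0x45);
    # expected: 8e 4d a1 bc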
510 sub InvMixColumns {
511 my @x=@_[0..7];
512 my @t=@_[8..15];
513
514 $code.=<<___;
515 # multiplication by 0x0e
516 pshufd \$0x93, @x[7], @t[7]
517 movdqa @x[2], @t[2]
518 pxor @x[5], @x[7] # 7 5
519 pxor @x[5], @x[2] # 2 5
520 pshufd \$0x93, @x[0], @t[0]
521 movdqa @x[5], @t[5]
522 pxor @x[0], @x[5] # 5 0 [1]
523 pxor @x[1], @x[0] # 0 1
524 pshufd \$0x93, @x[1], @t[1]
525 pxor @x[2], @x[1] # 1 25
526 pxor @x[6], @x[0] # 01 6 [2]
527 pxor @x[3], @x[1] # 125 3 [4]
528 pshufd \$0x93, @x[3], @t[3]
529 pxor @x[0], @x[2] # 25 016 [3]
530 pxor @x[7], @x[3] # 3 75
531 pxor @x[6], @x[7] # 75 6 [0]
532 pshufd \$0x93, @x[6], @t[6]
533 movdqa @x[4], @t[4]
534 pxor @x[4], @x[6] # 6 4
535 pxor @x[3], @x[4] # 4 375 [6]
536 pxor @x[7], @x[3] # 375 756=36
537 pxor @t[5], @x[6] # 64 5 [7]
538 pxor @t[2], @x[3] # 36 2
539 pxor @t[4], @x[3] # 362 4 [5]
540 pshufd \$0x93, @t[5], @t[5]
541 ___
542 my @y = @x[7,5,0,2,1,3,4,6];
543 $code.=<<___;
544 # multiplication by 0x0b
545 pxor @y[0], @y[1]
546 pxor @t[0], @y[0]
547 pxor @t[1], @y[1]
548 pshufd \$0x93, @t[2], @t[2]
549 pxor @t[5], @y[0]
550 pxor @t[6], @y[1]
551 pxor @t[7], @y[0]
552 pshufd \$0x93, @t[4], @t[4]
553 pxor @t[6], @t[7] # clobber t[7]
554 pxor @y[0], @y[1]
555
556 pxor @t[0], @y[3]
557 pshufd \$0x93, @t[0], @t[0]
558 pxor @t[1], @y[2]
559 pxor @t[1], @y[4]
560 pxor @t[2], @y[2]
561 pshufd \$0x93, @t[1], @t[1]
562 pxor @t[2], @y[3]
563 pxor @t[2], @y[5]
564 pxor @t[7], @y[2]
565 pshufd \$0x93, @t[2], @t[2]
566 pxor @t[3], @y[3]
567 pxor @t[3], @y[6]
568 pxor @t[3], @y[4]
569 pshufd \$0x93, @t[3], @t[3]
570 pxor @t[4], @y[7]
571 pxor @t[4], @y[5]
572 pxor @t[7], @y[7]
573 pxor @t[5], @y[3]
574 pxor @t[4], @y[4]
575 pxor @t[5], @t[7] # clobber t[7] even more
576
577 pxor @t[7], @y[5]
578 pshufd \$0x93, @t[4], @t[4]
579 pxor @t[7], @y[6]
580 pxor @t[7], @y[4]
581
582 pxor @t[5], @t[7]
583 pshufd \$0x93, @t[5], @t[5]
584 pxor @t[6], @t[7] # restore t[7]
585
586 # multiplication by 0x0d
587 pxor @y[7], @y[4]
588 pxor @t[4], @y[7]
589 pshufd \$0x93, @t[6], @t[6]
590 pxor @t[0], @y[2]
591 pxor @t[5], @y[7]
592 pxor @t[2], @y[2]
593 pshufd \$0x93, @t[7], @t[7]
594
595 pxor @y[1], @y[3]
596 pxor @t[1], @y[1]
597 pxor @t[0], @y[0]
598 pxor @t[0], @y[3]
599 pxor @t[5], @y[1]
600 pxor @t[5], @y[0]
601 pxor @t[7], @y[1]
602 pshufd \$0x93, @t[0], @t[0]
603 pxor @t[6], @y[0]
604 pxor @y[1], @y[3]
605 pxor @t[1], @y[4]
606 pshufd \$0x93, @t[1], @t[1]
607
608 pxor @t[7], @y[7]
609 pxor @t[2], @y[4]
610 pxor @t[2], @y[5]
611 pshufd \$0x93, @t[2], @t[2]
612 pxor @t[6], @y[2]
613 pxor @t[3], @t[6] # clobber t[6]
614 pxor @y[7], @y[4]
615 pxor @t[6], @y[3]
616
617 pxor @t[6], @y[6]
618 pxor @t[5], @y[5]
619 pxor @t[4], @y[6]
620 pshufd \$0x93, @t[4], @t[4]
621 pxor @t[6], @y[5]
622 pxor @t[7], @y[6]
623 pxor @t[3], @t[6] # restore t[6]
624
625 pshufd \$0x93, @t[5], @t[5]
626 pshufd \$0x93, @t[6], @t[6]
627 pshufd \$0x93, @t[7], @t[7]
628 pshufd \$0x93, @t[3], @t[3]
629
630 # multiplication by 0x09
631 pxor @y[1], @y[4]
632 pxor @y[1], @t[1] # t[1]=y[1]
633 pxor @t[5], @t[0] # clobber t[0]
634 pxor @t[5], @t[1]
635 pxor @t[0], @y[3]
636 pxor @y[0], @t[0] # t[0]=y[0]
637 pxor @t[6], @t[1]
638 pxor @t[7], @t[6] # clobber t[6]
639 pxor @t[1], @y[4]
640 pxor @t[4], @y[7]
641 pxor @y[4], @t[4] # t[4]=y[4]
642 pxor @t[3], @y[6]
643 pxor @y[3], @t[3] # t[3]=y[3]
644 pxor @t[2], @y[5]
645 pxor @y[2], @t[2] # t[2]=y[2]
646 pxor @t[7], @t[3]
647 pxor @y[5], @t[5] # t[5]=y[5]
648 pxor @t[6], @t[2]
649 pxor @t[6], @t[5]
650 pxor @y[6], @t[6] # t[6]=y[6]
651 pxor @y[7], @t[7] # t[7]=y[7]
652
653 movdqa @t[0],@XMM[0]
654 movdqa @t[1],@XMM[1]
655 movdqa @t[2],@XMM[2]
656 movdqa @t[3],@XMM[3]
657 movdqa @t[4],@XMM[4]
658 movdqa @t[5],@XMM[5]
659 movdqa @t[6],@XMM[6]
660 movdqa @t[7],@XMM[7]
661 ___
662 }
663
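The "multiplication by 0x0e/0x0b/0x0d/0x09" blocks above implement the InvMixColumns circulant. A standalone check (editor's sketch) that this matrix really is the GF(2^8) inverse of the [02 03 01 01] circulant used by MixColumns:

    use strict; use warnings;
    sub gmul {                        # GF(2^8) multiply mod x^8+x^4+x^3+x+1
        my ($a, $b) = @_;
        my $p = 0;
        for (1 .. 8) {
            $p ^= $a if $b & 1;
            $a = (($a << 1) ^ (($a & 0x80) ? 0x11b : 0)) & 0xff;
            $b >>= 1;
        }
        return $p;
    }
    my @M  = ([0x02,0x03,0x01,0x01], [0x01,0x02,0x03,0x01],
              [0x01,0x01,0x02,0x03], [0x03,0x01,0x01,0x02]);
    my @MI = ([0x0e,0x0b,0x0d,0x09], [0x09,0x0e,0x0b,0x0d],
              [0x0d,0x09,0x0e,0x0b], [0x0b,0x0d,0x09,0x0e]);
    for my $i (0 .. 3) {
        for my $j (0 .. 3) {
            my $s = 0;
            $s ^= gmul($MI[$i][$_], $M[$_][$j]) for 0 .. 3;
            die "not an inverse at ($i,$j)\n" unless $s == ($i == $j ? 1 : 0);
        }
    }
    print "[0e 0b 0d 09] circulant inverts MixColumns over GF(2^8)\n";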
664 sub aesenc { # not used
665 my @b=@_[0..7];
666 my @t=@_[8..15];
667 $code.=<<___;
668 movdqa 0x30($const),@t[0] # .LSR
669 ___
670 &ShiftRows (@b,@t[0]);
671 &Sbox (@b,@t);
672 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
673 }
674
675 sub aesenclast { # not used
676 my @b=@_[0..7];
677 my @t=@_[8..15];
678 $code.=<<___;
679 movdqa 0x40($const),@t[0] # .LSRM0
680 ___
681 &ShiftRows (@b,@t[0]);
682 &Sbox (@b,@t);
683 $code.=<<___
684 pxor 0x00($key),@b[0]
685 pxor 0x10($key),@b[1]
686 pxor 0x20($key),@b[4]
687 pxor 0x30($key),@b[6]
688 pxor 0x40($key),@b[3]
689 pxor 0x50($key),@b[7]
690 pxor 0x60($key),@b[2]
691 pxor 0x70($key),@b[5]
692 ___
693 }
694
695 sub swapmove {
696 my ($a,$b,$n,$mask,$t)=@_;
697 $code.=<<___;
698 movdqa $b,$t
699 psrlq \$$n,$b
700 pxor $a,$b
701 pand $mask,$b
702 pxor $b,$a
703 psllq \$$n,$b
704 pxor $t,$b
705 ___
706 }
707 sub swapmove2x {
708 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
709 $code.=<<___;
710 movdqa $b0,$t0
711 psrlq \$$n,$b0
712 movdqa $b1,$t1
713 psrlq \$$n,$b1
714 pxor $a0,$b0
715 pxor $a1,$b1
716 pand $mask,$b0
717 pand $mask,$b1
718 pxor $b0,$a0
719 psllq \$$n,$b0
720 pxor $b1,$a1
721 psllq \$$n,$b1
722 pxor $t0,$b0
723 pxor $t1,$b1
724 ___
725 }
726
727 sub bitslice {
728 my @x=reverse(@_[0..7]);
729 my ($t0,$t1,$t2,$t3)=@_[8..11];
730 $code.=<<___;
731 movdqa 0x00($const),$t0 # .LBS0
732 movdqa 0x10($const),$t1 # .LBS1
733 ___
734 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
735 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
736 $code.=<<___;
737 movdqa 0x20($const),$t0 # .LBS2
738 ___
739 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
740 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
741
742 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
743 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
744 }
745
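swapmove() is the classic "delta swap": it exchanges the $mask-selected bits of $a with the bits of $b that sit $n positions higher, and bitslice() runs it with n = 1, 2, 4 to transpose byte rows into bit planes. A standalone 64-bit scalar version with the same dataflow as the pxor/pand sequence (editor's sketch; assumes a 64-bit perl):

    use strict; use warnings;
    sub swapmove64 {
        my ($a, $b, $n, $mask) = @_;
        my $t = (($b >> $n) ^ $a) & $mask;   # bits that differ, at mask positions
        return ($a ^ $t, $b ^ ($t << $n));   # apply the swap to both words
    }
    # exchange the even-indexed bits of $a with the odd-indexed bits of $b:
    my ($a, $b) = (0x00000000FFFFFFFF, 0xAAAAAAAA55555555);
    ($a, $b) = swapmove64($a, $b, 1, 0x5555555555555555);
    printf "a=%016x b=%016x\n", $a, $b;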
746 $code.=<<___;
747 .text
748
749 .extern asm_AES_encrypt
750 .extern asm_AES_decrypt
751
752 .type _bsaes_encrypt8,\@abi-omnipotent
753 .align 64
754 _bsaes_encrypt8:
755 lea .LBS0(%rip), $const # constants table
756
757 movdqa ($key), @XMM[9] # round 0 key
758 lea 0x10($key), $key
759 movdqa 0x50($const), @XMM[8] # .LM0SR
760 pxor @XMM[9], @XMM[0] # xor with round0 key
761 pxor @XMM[9], @XMM[1]
762 pshufb @XMM[8], @XMM[0]
763 pxor @XMM[9], @XMM[2]
764 pshufb @XMM[8], @XMM[1]
765 pxor @XMM[9], @XMM[3]
766 pshufb @XMM[8], @XMM[2]
767 pxor @XMM[9], @XMM[4]
768 pshufb @XMM[8], @XMM[3]
769 pxor @XMM[9], @XMM[5]
770 pshufb @XMM[8], @XMM[4]
771 pxor @XMM[9], @XMM[6]
772 pshufb @XMM[8], @XMM[5]
773 pxor @XMM[9], @XMM[7]
774 pshufb @XMM[8], @XMM[6]
775 pshufb @XMM[8], @XMM[7]
776 _bsaes_encrypt8_bitslice:
777 ___
778 &bitslice (@XMM[0..7, 8..11]);
779 $code.=<<___;
780 dec $rounds
781 jmp .Lenc_sbox
782 .align 16
783 .Lenc_loop:
784 ___
785 &ShiftRows (@XMM[0..7, 8]);
786 $code.=".Lenc_sbox:\n";
787 &Sbox (@XMM[0..7, 8..15]);
788 $code.=<<___;
789 dec $rounds
790 jl .Lenc_done
791 ___
792 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
793 $code.=<<___;
794 movdqa 0x30($const), @XMM[8] # .LSR
795 jnz .Lenc_loop
796 movdqa 0x40($const), @XMM[8] # .LSRM0
797 jmp .Lenc_loop
798 .align 16
799 .Lenc_done:
800 ___
801 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
802 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
803 $code.=<<___;
804 movdqa ($key), @XMM[8] # last round key
805 pxor @XMM[8], @XMM[4]
806 pxor @XMM[8], @XMM[6]
807 pxor @XMM[8], @XMM[3]
808 pxor @XMM[8], @XMM[7]
809 pxor @XMM[8], @XMM[2]
810 pxor @XMM[8], @XMM[5]
811 pxor @XMM[8], @XMM[0]
812 pxor @XMM[8], @XMM[1]
813 ret
814 .size _bsaes_encrypt8,.-_bsaes_encrypt8
815
816 .type _bsaes_decrypt8,\@abi-omnipotent
817 .align 64
818 _bsaes_decrypt8:
819 lea .LBS0(%rip), $const # constants table
820
821 movdqa ($key), @XMM[9] # round 0 key
822 lea 0x10($key), $key
823 movdqa -0x30($const), @XMM[8] # .LM0ISR
824 pxor @XMM[9], @XMM[0] # xor with round0 key
825 pxor @XMM[9], @XMM[1]
826 pshufb @XMM[8], @XMM[0]
827 pxor @XMM[9], @XMM[2]
828 pshufb @XMM[8], @XMM[1]
829 pxor @XMM[9], @XMM[3]
830 pshufb @XMM[8], @XMM[2]
831 pxor @XMM[9], @XMM[4]
832 pshufb @XMM[8], @XMM[3]
833 pxor @XMM[9], @XMM[5]
834 pshufb @XMM[8], @XMM[4]
835 pxor @XMM[9], @XMM[6]
836 pshufb @XMM[8], @XMM[5]
837 pxor @XMM[9], @XMM[7]
838 pshufb @XMM[8], @XMM[6]
839 pshufb @XMM[8], @XMM[7]
840 ___
841 &bitslice (@XMM[0..7, 8..11]);
842 $code.=<<___;
843 dec $rounds
844 jmp .Ldec_sbox
845 .align 16
846 .Ldec_loop:
847 ___
848 &ShiftRows (@XMM[0..7, 8]);
849 $code.=".Ldec_sbox:\n";
850 &InvSbox (@XMM[0..7, 8..15]);
851 $code.=<<___;
852 dec $rounds
853 jl .Ldec_done
854 ___
855 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
856 $code.=<<___;
857 movdqa -0x10($const), @XMM[8] # .LISR
858 jnz .Ldec_loop
859 movdqa -0x20($const), @XMM[8] # .LISRM0
860 jmp .Ldec_loop
861 .align 16
862 .Ldec_done:
863 ___
864 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
865 $code.=<<___;
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[6]
868 pxor @XMM[8], @XMM[4]
869 pxor @XMM[8], @XMM[2]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[3]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
875 ret
876 .size _bsaes_decrypt8,.-_bsaes_decrypt8
877 ___
878 }
879 {
880 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
881
882 sub bitslice_key {
883 my @x=reverse(@_[0..7]);
884 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
885
886 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
887 $code.=<<___;
888 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
889 movdqa @x[0], @x[2]
890 movdqa @x[1], @x[3]
891 ___
892 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
893
894 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
895 $code.=<<___;
896 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
897 movdqa @x[0], @x[4]
898 movdqa @x[2], @x[6]
899 movdqa @x[1], @x[5]
900 movdqa @x[3], @x[7]
901 ___
902 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
903 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
904 }
905
906 $code.=<<___;
907 .type _bsaes_key_convert,\@abi-omnipotent
908 .align 16
909 _bsaes_key_convert:
910 lea .Lmasks(%rip), $const
911 movdqu ($inp), %xmm7 # load round 0 key
912 lea 0x10($inp), $inp
913 movdqa 0x00($const), %xmm0 # 0x01...
914 movdqa 0x10($const), %xmm1 # 0x02...
915 movdqa 0x20($const), %xmm2 # 0x04...
916 movdqa 0x30($const), %xmm3 # 0x08...
917 movdqa 0x40($const), %xmm4 # .LM0
918 pcmpeqd %xmm5, %xmm5 # .LNOT
919
920 movdqu ($inp), %xmm6 # load round 1 key
921 movdqa %xmm7, ($out) # save round 0 key
922 lea 0x10($out), $out
923 dec $rounds
924 jmp .Lkey_loop
925 .align 16
926 .Lkey_loop:
927 pshufb %xmm4, %xmm6 # .LM0
928
929 movdqa %xmm0, %xmm8
930 movdqa %xmm1, %xmm9
931
932 pand %xmm6, %xmm8
933 pand %xmm6, %xmm9
934 movdqa %xmm2, %xmm10
935 pcmpeqb %xmm0, %xmm8
936 psllq \$4, %xmm0 # 0x10...
937 movdqa %xmm3, %xmm11
938 pcmpeqb %xmm1, %xmm9
939 psllq \$4, %xmm1 # 0x20...
940
941 pand %xmm6, %xmm10
942 pand %xmm6, %xmm11
943 movdqa %xmm0, %xmm12
944 pcmpeqb %xmm2, %xmm10
945 psllq \$4, %xmm2 # 0x40...
946 movdqa %xmm1, %xmm13
947 pcmpeqb %xmm3, %xmm11
948 psllq \$4, %xmm3 # 0x80...
949
950 movdqa %xmm2, %xmm14
951 movdqa %xmm3, %xmm15
952 pxor %xmm5, %xmm8 # "pnot"
953 pxor %xmm5, %xmm9
954
955 pand %xmm6, %xmm12
956 pand %xmm6, %xmm13
957 movdqa %xmm8, 0x00($out) # write bit-sliced round key
958 pcmpeqb %xmm0, %xmm12
959 psrlq \$4, %xmm0 # 0x01...
960 movdqa %xmm9, 0x10($out)
961 pcmpeqb %xmm1, %xmm13
962 psrlq \$4, %xmm1 # 0x02...
963 lea 0x10($inp), $inp
964
965 pand %xmm6, %xmm14
966 pand %xmm6, %xmm15
967 movdqa %xmm10, 0x20($out)
968 pcmpeqb %xmm2, %xmm14
969 psrlq \$4, %xmm2 # 0x04...
970 movdqa %xmm11, 0x30($out)
971 pcmpeqb %xmm3, %xmm15
972 psrlq \$4, %xmm3 # 0x08...
973 movdqu ($inp), %xmm6 # load next round key
974
975 pxor %xmm5, %xmm13 # "pnot"
976 pxor %xmm5, %xmm14
977 movdqa %xmm12, 0x40($out)
978 movdqa %xmm13, 0x50($out)
979 movdqa %xmm14, 0x60($out)
980 movdqa %xmm15, 0x70($out)
981 lea 0x80($out),$out
982 dec $rounds
983 jnz .Lkey_loop
984
985 movdqa 0x50($const), %xmm7 # .L63
986 #movdqa %xmm6, ($out) # don't save last round key
987 ret
988 .size _bsaes_key_convert,.-_bsaes_key_convert
989 ___
990 }
991
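_bsaes_key_convert turns each conventional 128-bit round key into eight 128-bit masks, one per bit plane, so that adding the round key in the bit-sliced domain costs a single pxor per plane. The pand/pcmpeqb pairs against the .Lmasks constants produce the all-ones/all-zeros masks, and the pxor-with-.LNOT ("pnot") steps invert planes 0, 1, 5 and 6, apparently folding in the S-box's 0x63 affine constant (those are exactly the set bits of 0x63; cf. the .L63 fix-up of the last round key). A byte-level model of the expansion (standalone sketch; the .LM0 byte permutation is omitted):

    use strict; use warnings;
    # expand one key byte into its eight bit-plane masks: plane i is
    # all-ones iff bit i of the byte is set (the asm does this 16 bytes
    # at a time with pand + pcmpeqb)
    sub bitplane_masks {
        my ($byte) = @_;
        return map { ($byte >> $_) & 1 ? 0xff : 0x00 } 0 .. 7;
    }
    printf "0x2b -> %s\n",
           join ' ', map { sprintf '%02x', $_ } bitplane_masks(0x2b);
    # 0x2b = 0b00101011 -> ff ff 00 ff 00 ff 00 00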
992 if (0 && !$win64) { # the following four functions are an unsupported
993 # interface, used for benchmarking...
994 $code.=<<___;
995 .globl bsaes_enc_key_convert
996 .type bsaes_enc_key_convert,\@function,2
997 .align 16
998 bsaes_enc_key_convert:
999 mov 240($inp),%r10d # pass rounds
1000 mov $inp,%rcx # pass key
1001 mov $out,%rax # pass key schedule
1002 call _bsaes_key_convert
1003 pxor %xmm6,%xmm7 # fix up last round key
1004 movdqa %xmm7,(%rax) # save last round key
1005 ret
1006 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1007
1008 .globl bsaes_encrypt_128
1009 .type bsaes_encrypt_128,\@function,4
1010 .align 16
1011 bsaes_encrypt_128:
1012 .Lenc128_loop:
1013 movdqu 0x00($inp), @XMM[0] # load input
1014 movdqu 0x10($inp), @XMM[1]
1015 movdqu 0x20($inp), @XMM[2]
1016 movdqu 0x30($inp), @XMM[3]
1017 movdqu 0x40($inp), @XMM[4]
1018 movdqu 0x50($inp), @XMM[5]
1019 movdqu 0x60($inp), @XMM[6]
1020 movdqu 0x70($inp), @XMM[7]
1021 mov $key, %rax # pass the $key
1022 lea 0x80($inp), $inp
1023 mov \$10,%r10d
1024
1025 call _bsaes_encrypt8
1026
1027 movdqu @XMM[0], 0x00($out) # write output
1028 movdqu @XMM[1], 0x10($out)
1029 movdqu @XMM[4], 0x20($out)
1030 movdqu @XMM[6], 0x30($out)
1031 movdqu @XMM[3], 0x40($out)
1032 movdqu @XMM[7], 0x50($out)
1033 movdqu @XMM[2], 0x60($out)
1034 movdqu @XMM[5], 0x70($out)
1035 lea 0x80($out), $out
1036 sub \$0x80,$len
1037 ja .Lenc128_loop
1038 ret
1039 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1040
1041 .globl bsaes_dec_key_convert
1042 .type bsaes_dec_key_convert,\@function,2
1043 .align 16
1044 bsaes_dec_key_convert:
1045 mov 240($inp),%r10d # pass rounds
1046 mov $inp,%rcx # pass key
1047 mov $out,%rax # pass key schedule
1048 call _bsaes_key_convert
1049 pxor ($out),%xmm7 # fix up round 0 key
1050 movdqa %xmm6,(%rax) # save last round key
1051 movdqa %xmm7,($out)
1052 ret
1053 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1054
1055 .globl bsaes_decrypt_128
1056 .type bsaes_decrypt_128,\@function,4
1057 .align 16
1058 bsaes_decrypt_128:
1059 .Ldec128_loop:
1060 movdqu 0x00($inp), @XMM[0] # load input
1061 movdqu 0x10($inp), @XMM[1]
1062 movdqu 0x20($inp), @XMM[2]
1063 movdqu 0x30($inp), @XMM[3]
1064 movdqu 0x40($inp), @XMM[4]
1065 movdqu 0x50($inp), @XMM[5]
1066 movdqu 0x60($inp), @XMM[6]
1067 movdqu 0x70($inp), @XMM[7]
1068 mov $key, %rax # pass the $key
1069 lea 0x80($inp), $inp
1070 mov \$10,%r10d
1071
1072 call _bsaes_decrypt8
1073
1074 movdqu @XMM[0], 0x00($out) # write output
1075 movdqu @XMM[1], 0x10($out)
1076 movdqu @XMM[6], 0x20($out)
1077 movdqu @XMM[4], 0x30($out)
1078 movdqu @XMM[2], 0x40($out)
1079 movdqu @XMM[7], 0x50($out)
1080 movdqu @XMM[3], 0x60($out)
1081 movdqu @XMM[5], 0x70($out)
1082 lea 0x80($out), $out
1083 sub \$0x80,$len
1084 ja .Ldec128_loop
1085 ret
1086 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1087 ___
1088 }
1089 {
1090 ######################################################################
1091 #
1092 # OpenSSL interface
1093 #
1094 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1095                                                 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1096 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1097
1098 if ($ecb) {
1099 $code.=<<___;
1100 .globl bsaes_ecb_encrypt_blocks
1101 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1102 .align 16
1103 bsaes_ecb_encrypt_blocks:
1104 mov %rsp, %rax
1105 .Lecb_enc_prologue:
1106 push %rbp
1107 push %rbx
1108 push %r12
1109 push %r13
1110 push %r14
1111 push %r15
1112 lea -0x48(%rsp),%rsp
1113 ___
1114 $code.=<<___ if ($win64);
1115 lea -0xa0(%rsp), %rsp
1116 movaps %xmm6, 0x40(%rsp)
1117 movaps %xmm7, 0x50(%rsp)
1118 movaps %xmm8, 0x60(%rsp)
1119 movaps %xmm9, 0x70(%rsp)
1120 movaps %xmm10, 0x80(%rsp)
1121 movaps %xmm11, 0x90(%rsp)
1122 movaps %xmm12, 0xa0(%rsp)
1123 movaps %xmm13, 0xb0(%rsp)
1124 movaps %xmm14, 0xc0(%rsp)
1125 movaps %xmm15, 0xd0(%rsp)
1126 .Lecb_enc_body:
1127 ___
1128 $code.=<<___;
1129 mov %rsp,%rbp # backup %rsp
1130 mov 240($arg4),%eax # rounds
1131 mov $arg1,$inp # backup arguments
1132 mov $arg2,$out
1133 mov $arg3,$len
1134 mov $arg4,$key
1135 cmp \$8,$arg3
1136 jb .Lecb_enc_short
1137
1138 mov %eax,%ebx # backup rounds
1139 shl \$7,%rax # 128 bytes per inner round key
1140 sub \$`128-32`,%rax # size of bit-sliced key schedule
1141 sub %rax,%rsp
1142 mov %rsp,%rax # pass key schedule
1143 mov $key,%rcx # pass key
1144 mov %ebx,%r10d # pass rounds
1145 call _bsaes_key_convert
1146 pxor %xmm6,%xmm7 # fix up last round key
1147 movdqa %xmm7,(%rax) # save last round key
1148
1149 sub \$8,$len
1150 .Lecb_enc_loop:
1151 movdqu 0x00($inp), @XMM[0] # load input
1152 movdqu 0x10($inp), @XMM[1]
1153 movdqu 0x20($inp), @XMM[2]
1154 movdqu 0x30($inp), @XMM[3]
1155 movdqu 0x40($inp), @XMM[4]
1156 movdqu 0x50($inp), @XMM[5]
1157 mov %rsp, %rax # pass key schedule
1158 movdqu 0x60($inp), @XMM[6]
1159 mov %ebx,%r10d # pass rounds
1160 movdqu 0x70($inp), @XMM[7]
1161 lea 0x80($inp), $inp
1162
1163 call _bsaes_encrypt8
1164
1165 movdqu @XMM[0], 0x00($out) # write output
1166 movdqu @XMM[1], 0x10($out)
1167 movdqu @XMM[4], 0x20($out)
1168 movdqu @XMM[6], 0x30($out)
1169 movdqu @XMM[3], 0x40($out)
1170 movdqu @XMM[7], 0x50($out)
1171 movdqu @XMM[2], 0x60($out)
1172 movdqu @XMM[5], 0x70($out)
1173 lea 0x80($out), $out
1174 sub \$8,$len
1175 jnc .Lecb_enc_loop
1176
1177 add \$8,$len
1178 jz .Lecb_enc_done
1179
1180 movdqu 0x00($inp), @XMM[0] # load input
1181 mov %rsp, %rax # pass key schedule
1182 mov %ebx,%r10d # pass rounds
1183 cmp \$2,$len
1184 jb .Lecb_enc_one
1185 movdqu 0x10($inp), @XMM[1]
1186 je .Lecb_enc_two
1187 movdqu 0x20($inp), @XMM[2]
1188 cmp \$4,$len
1189 jb .Lecb_enc_three
1190 movdqu 0x30($inp), @XMM[3]
1191 je .Lecb_enc_four
1192 movdqu 0x40($inp), @XMM[4]
1193 cmp \$6,$len
1194 jb .Lecb_enc_five
1195 movdqu 0x50($inp), @XMM[5]
1196 je .Lecb_enc_six
1197 movdqu 0x60($inp), @XMM[6]
1198 call _bsaes_encrypt8
1199 movdqu @XMM[0], 0x00($out) # write output
1200 movdqu @XMM[1], 0x10($out)
1201 movdqu @XMM[4], 0x20($out)
1202 movdqu @XMM[6], 0x30($out)
1203 movdqu @XMM[3], 0x40($out)
1204 movdqu @XMM[7], 0x50($out)
1205 movdqu @XMM[2], 0x60($out)
1206 jmp .Lecb_enc_done
1207 .align 16
1208 .Lecb_enc_six:
1209 call _bsaes_encrypt8
1210 movdqu @XMM[0], 0x00($out) # write output
1211 movdqu @XMM[1], 0x10($out)
1212 movdqu @XMM[4], 0x20($out)
1213 movdqu @XMM[6], 0x30($out)
1214 movdqu @XMM[3], 0x40($out)
1215 movdqu @XMM[7], 0x50($out)
1216 jmp .Lecb_enc_done
1217 .align 16
1218 .Lecb_enc_five:
1219 call _bsaes_encrypt8
1220 movdqu @XMM[0], 0x00($out) # write output
1221 movdqu @XMM[1], 0x10($out)
1222 movdqu @XMM[4], 0x20($out)
1223 movdqu @XMM[6], 0x30($out)
1224 movdqu @XMM[3], 0x40($out)
1225 jmp .Lecb_enc_done
1226 .align 16
1227 .Lecb_enc_four:
1228 call _bsaes_encrypt8
1229 movdqu @XMM[0], 0x00($out) # write output
1230 movdqu @XMM[1], 0x10($out)
1231 movdqu @XMM[4], 0x20($out)
1232 movdqu @XMM[6], 0x30($out)
1233 jmp .Lecb_enc_done
1234 .align 16
1235 .Lecb_enc_three:
1236 call _bsaes_encrypt8
1237 movdqu @XMM[0], 0x00($out) # write output
1238 movdqu @XMM[1], 0x10($out)
1239 movdqu @XMM[4], 0x20($out)
1240 jmp .Lecb_enc_done
1241 .align 16
1242 .Lecb_enc_two:
1243 call _bsaes_encrypt8
1244 movdqu @XMM[0], 0x00($out) # write output
1245 movdqu @XMM[1], 0x10($out)
1246 jmp .Lecb_enc_done
1247 .align 16
1248 .Lecb_enc_one:
1249 call _bsaes_encrypt8
1250 movdqu @XMM[0], 0x00($out) # write output
1251 jmp .Lecb_enc_done
1252 .align 16
1253 .Lecb_enc_short:
1254 lea ($inp), $arg1
1255 lea ($out), $arg2
1256 lea ($key), $arg3
1257 call asm_AES_encrypt
1258 lea 16($inp), $inp
1259 lea 16($out), $out
1260 dec $len
1261 jnz .Lecb_enc_short
1262
1263 .Lecb_enc_done:
1264 lea (%rsp),%rax
1265 pxor %xmm0, %xmm0
1266 .Lecb_enc_bzero: # wipe key schedule [if any]
1267 movdqa %xmm0, 0x00(%rax)
1268 movdqa %xmm0, 0x10(%rax)
1269 lea 0x20(%rax), %rax
1270 cmp %rax, %rbp
1271 jb .Lecb_enc_bzero
1272
1273 lea (%rbp),%rsp # restore %rsp
1274 ___
1275 $code.=<<___ if ($win64);
1276 movaps 0x40(%rbp), %xmm6
1277 movaps 0x50(%rbp), %xmm7
1278 movaps 0x60(%rbp), %xmm8
1279 movaps 0x70(%rbp), %xmm9
1280 movaps 0x80(%rbp), %xmm10
1281 movaps 0x90(%rbp), %xmm11
1282 movaps 0xa0(%rbp), %xmm12
1283 movaps 0xb0(%rbp), %xmm13
1284 movaps 0xc0(%rbp), %xmm14
1285 movaps 0xd0(%rbp), %xmm15
1286 lea 0xa0(%rbp), %rsp
1287 ___
1288 $code.=<<___;
1289 mov 0x48(%rsp), %r15
1290 mov 0x50(%rsp), %r14
1291 mov 0x58(%rsp), %r13
1292 mov 0x60(%rsp), %r12
1293 mov 0x68(%rsp), %rbx
1294 mov 0x70(%rsp), %rax
1295 lea 0x78(%rsp), %rsp
1296 mov %rax, %rbp
1297 .Lecb_enc_epilogue:
1298 ret
1299 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1300
1301 .globl bsaes_ecb_decrypt_blocks
1302 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1303 .align 16
1304 bsaes_ecb_decrypt_blocks:
1305 mov %rsp, %rax
1306 .Lecb_dec_prologue:
1307 push %rbp
1308 push %rbx
1309 push %r12
1310 push %r13
1311 push %r14
1312 push %r15
1313 lea -0x48(%rsp),%rsp
1314 ___
1315 $code.=<<___ if ($win64);
1316 lea -0xa0(%rsp), %rsp
1317 movaps %xmm6, 0x40(%rsp)
1318 movaps %xmm7, 0x50(%rsp)
1319 movaps %xmm8, 0x60(%rsp)
1320 movaps %xmm9, 0x70(%rsp)
1321 movaps %xmm10, 0x80(%rsp)
1322 movaps %xmm11, 0x90(%rsp)
1323 movaps %xmm12, 0xa0(%rsp)
1324 movaps %xmm13, 0xb0(%rsp)
1325 movaps %xmm14, 0xc0(%rsp)
1326 movaps %xmm15, 0xd0(%rsp)
1327 .Lecb_dec_body:
1328 ___
1329 $code.=<<___;
1330 mov %rsp,%rbp # backup %rsp
1331 mov 240($arg4),%eax # rounds
1332 mov $arg1,$inp # backup arguments
1333 mov $arg2,$out
1334 mov $arg3,$len
1335 mov $arg4,$key
1336 cmp \$8,$arg3
1337 jb .Lecb_dec_short
1338
1339 mov %eax,%ebx # backup rounds
1340 shl \$7,%rax # 128 bytes per inner round key
1341 sub \$`128-32`,%rax # size of bit-sliced key schedule
1342 sub %rax,%rsp
1343 mov %rsp,%rax # pass key schedule
1344 mov $key,%rcx # pass key
1345 mov %ebx,%r10d # pass rounds
1346 call _bsaes_key_convert
1347 pxor (%rsp),%xmm7 # fix up 0 round key
1348 movdqa %xmm6,(%rax) # save last round key
1349 movdqa %xmm7,(%rsp)
1350
1351 sub \$8,$len
1352 .Lecb_dec_loop:
1353 movdqu 0x00($inp), @XMM[0] # load input
1354 movdqu 0x10($inp), @XMM[1]
1355 movdqu 0x20($inp), @XMM[2]
1356 movdqu 0x30($inp), @XMM[3]
1357 movdqu 0x40($inp), @XMM[4]
1358 movdqu 0x50($inp), @XMM[5]
1359 mov %rsp, %rax # pass key schedule
1360 movdqu 0x60($inp), @XMM[6]
1361 mov %ebx,%r10d # pass rounds
1362 movdqu 0x70($inp), @XMM[7]
1363 lea 0x80($inp), $inp
1364
1365 call _bsaes_decrypt8
1366
1367 movdqu @XMM[0], 0x00($out) # write output
1368 movdqu @XMM[1], 0x10($out)
1369 movdqu @XMM[6], 0x20($out)
1370 movdqu @XMM[4], 0x30($out)
1371 movdqu @XMM[2], 0x40($out)
1372 movdqu @XMM[7], 0x50($out)
1373 movdqu @XMM[3], 0x60($out)
1374 movdqu @XMM[5], 0x70($out)
1375 lea 0x80($out), $out
1376 sub \$8,$len
1377 jnc .Lecb_dec_loop
1378
1379 add \$8,$len
1380 jz .Lecb_dec_done
1381
1382 movdqu 0x00($inp), @XMM[0] # load input
1383 mov %rsp, %rax # pass key schedule
1384 mov %ebx,%r10d # pass rounds
1385 cmp \$2,$len
1386 jb .Lecb_dec_one
1387 movdqu 0x10($inp), @XMM[1]
1388 je .Lecb_dec_two
1389 movdqu 0x20($inp), @XMM[2]
1390 cmp \$4,$len
1391 jb .Lecb_dec_three
1392 movdqu 0x30($inp), @XMM[3]
1393 je .Lecb_dec_four
1394 movdqu 0x40($inp), @XMM[4]
1395 cmp \$6,$len
1396 jb .Lecb_dec_five
1397 movdqu 0x50($inp), @XMM[5]
1398 je .Lecb_dec_six
1399 movdqu 0x60($inp), @XMM[6]
1400 call _bsaes_decrypt8
1401 movdqu @XMM[0], 0x00($out) # write output
1402 movdqu @XMM[1], 0x10($out)
1403 movdqu @XMM[6], 0x20($out)
1404 movdqu @XMM[4], 0x30($out)
1405 movdqu @XMM[2], 0x40($out)
1406 movdqu @XMM[7], 0x50($out)
1407 movdqu @XMM[3], 0x60($out)
1408 jmp .Lecb_dec_done
1409 .align 16
1410 .Lecb_dec_six:
1411 call _bsaes_decrypt8
1412 movdqu @XMM[0], 0x00($out) # write output
1413 movdqu @XMM[1], 0x10($out)
1414 movdqu @XMM[6], 0x20($out)
1415 movdqu @XMM[4], 0x30($out)
1416 movdqu @XMM[2], 0x40($out)
1417 movdqu @XMM[7], 0x50($out)
1418 jmp .Lecb_dec_done
1419 .align 16
1420 .Lecb_dec_five:
1421 call _bsaes_decrypt8
1422 movdqu @XMM[0], 0x00($out) # write output
1423 movdqu @XMM[1], 0x10($out)
1424 movdqu @XMM[6], 0x20($out)
1425 movdqu @XMM[4], 0x30($out)
1426 movdqu @XMM[2], 0x40($out)
1427 jmp .Lecb_dec_done
1428 .align 16
1429 .Lecb_dec_four:
1430 call _bsaes_decrypt8
1431 movdqu @XMM[0], 0x00($out) # write output
1432 movdqu @XMM[1], 0x10($out)
1433 movdqu @XMM[6], 0x20($out)
1434 movdqu @XMM[4], 0x30($out)
1435 jmp .Lecb_dec_done
1436 .align 16
1437 .Lecb_dec_three:
1438 call _bsaes_decrypt8
1439 movdqu @XMM[0], 0x00($out) # write output
1440 movdqu @XMM[1], 0x10($out)
1441 movdqu @XMM[6], 0x20($out)
1442 jmp .Lecb_dec_done
1443 .align 16
1444 .Lecb_dec_two:
1445 call _bsaes_decrypt8
1446 movdqu @XMM[0], 0x00($out) # write output
1447 movdqu @XMM[1], 0x10($out)
1448 jmp .Lecb_dec_done
1449 .align 16
1450 .Lecb_dec_one:
1451 call _bsaes_decrypt8
1452 movdqu @XMM[0], 0x00($out) # write output
1453 jmp .Lecb_dec_done
1454 .align 16
1455 .Lecb_dec_short:
1456 lea ($inp), $arg1
1457 lea ($out), $arg2
1458 lea ($key), $arg3
1459 call asm_AES_decrypt
1460 lea 16($inp), $inp
1461 lea 16($out), $out
1462 dec $len
1463 jnz .Lecb_dec_short
1464
1465 .Lecb_dec_done:
1466 lea (%rsp),%rax
1467 pxor %xmm0, %xmm0
1468 .Lecb_dec_bzero: # wipe key schedule [if any]
1469 movdqa %xmm0, 0x00(%rax)
1470 movdqa %xmm0, 0x10(%rax)
1471 lea 0x20(%rax), %rax
1472 cmp %rax, %rbp
1473 jb .Lecb_dec_bzero
1474
1475 lea (%rbp),%rsp # restore %rsp
1476 ___
1477 $code.=<<___ if ($win64);
1478 movaps 0x40(%rbp), %xmm6
1479 movaps 0x50(%rbp), %xmm7
1480 movaps 0x60(%rbp), %xmm8
1481 movaps 0x70(%rbp), %xmm9
1482 movaps 0x80(%rbp), %xmm10
1483 movaps 0x90(%rbp), %xmm11
1484 movaps 0xa0(%rbp), %xmm12
1485 movaps 0xb0(%rbp), %xmm13
1486 movaps 0xc0(%rbp), %xmm14
1487 movaps 0xd0(%rbp), %xmm15
1488 lea 0xa0(%rbp), %rsp
1489 ___
1490 $code.=<<___;
1491 mov 0x48(%rsp), %r15
1492 mov 0x50(%rsp), %r14
1493 mov 0x58(%rsp), %r13
1494 mov 0x60(%rsp), %r12
1495 mov 0x68(%rsp), %rbx
1496 mov 0x70(%rsp), %rax
1497 lea 0x78(%rsp), %rsp
1498 mov %rax, %rbp
1499 .Lecb_dec_epilogue:
1500 ret
1501 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1502 ___
1503 }
1504 $code.=<<___;
1505 .extern asm_AES_cbc_encrypt
1506 .globl bsaes_cbc_encrypt
1507 .type bsaes_cbc_encrypt,\@abi-omnipotent
1508 .align 16
1509 bsaes_cbc_encrypt:
1510 ___
1511 $code.=<<___ if ($win64);
1512 mov 48(%rsp),$arg6 # pull direction flag
1513 ___
1514 $code.=<<___;
1515 cmp \$0,$arg6
1516 jne asm_AES_cbc_encrypt
1517 cmp \$128,$arg3
1518 jb asm_AES_cbc_encrypt
1519
1520 mov %rsp, %rax
1521 .Lcbc_dec_prologue:
1522 push %rbp
1523 push %rbx
1524 push %r12
1525 push %r13
1526 push %r14
1527 push %r15
1528 lea -0x48(%rsp), %rsp
1529 ___
1530 $code.=<<___ if ($win64);
1531 mov 0xa0(%rsp),$arg5 # pull ivp
1532 lea -0xa0(%rsp), %rsp
1533 movaps %xmm6, 0x40(%rsp)
1534 movaps %xmm7, 0x50(%rsp)
1535 movaps %xmm8, 0x60(%rsp)
1536 movaps %xmm9, 0x70(%rsp)
1537 movaps %xmm10, 0x80(%rsp)
1538 movaps %xmm11, 0x90(%rsp)
1539 movaps %xmm12, 0xa0(%rsp)
1540 movaps %xmm13, 0xb0(%rsp)
1541 movaps %xmm14, 0xc0(%rsp)
1542 movaps %xmm15, 0xd0(%rsp)
1543 .Lcbc_dec_body:
1544 ___
1545 $code.=<<___;
1546 mov %rsp, %rbp # backup %rsp
1547 mov 240($arg4), %eax # rounds
1548 mov $arg1, $inp # backup arguments
1549 mov $arg2, $out
1550 mov $arg3, $len
1551 mov $arg4, $key
1552 mov $arg5, %rbx
1553 shr \$4, $len # bytes to blocks
1554
1555 mov %eax, %edx # rounds
1556 shl \$7, %rax # 128 bytes per inner round key
1557 sub \$`128-32`, %rax # size of bit-sliced key schedule
1558 sub %rax, %rsp
1559
1560 mov %rsp, %rax # pass key schedule
1561 mov $key, %rcx # pass key
1562 mov %edx, %r10d # pass rounds
1563 call _bsaes_key_convert
1564 pxor (%rsp),%xmm7 # fix up 0 round key
1565 movdqa %xmm6,(%rax) # save last round key
1566 movdqa %xmm7,(%rsp)
1567
1568 movdqu (%rbx), @XMM[15] # load IV
1569 sub \$8,$len
1570 .Lcbc_dec_loop:
1571 movdqu 0x00($inp), @XMM[0] # load input
1572 movdqu 0x10($inp), @XMM[1]
1573 movdqu 0x20($inp), @XMM[2]
1574 movdqu 0x30($inp), @XMM[3]
1575 movdqu 0x40($inp), @XMM[4]
1576 movdqu 0x50($inp), @XMM[5]
1577 mov %rsp, %rax # pass key schedule
1578 movdqu 0x60($inp), @XMM[6]
1579 mov %edx,%r10d # pass rounds
1580 movdqu 0x70($inp), @XMM[7]
1581 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1582
1583 call _bsaes_decrypt8
1584
1585 pxor 0x20(%rbp), @XMM[0] # ^= IV
1586 movdqu 0x00($inp), @XMM[8] # re-load input
1587 movdqu 0x10($inp), @XMM[9]
1588 pxor @XMM[8], @XMM[1]
1589 movdqu 0x20($inp), @XMM[10]
1590 pxor @XMM[9], @XMM[6]
1591 movdqu 0x30($inp), @XMM[11]
1592 pxor @XMM[10], @XMM[4]
1593 movdqu 0x40($inp), @XMM[12]
1594 pxor @XMM[11], @XMM[2]
1595 movdqu 0x50($inp), @XMM[13]
1596 pxor @XMM[12], @XMM[7]
1597 movdqu 0x60($inp), @XMM[14]
1598 pxor @XMM[13], @XMM[3]
1599 movdqu 0x70($inp), @XMM[15] # IV
1600 pxor @XMM[14], @XMM[5]
1601 movdqu @XMM[0], 0x00($out) # write output
1602 lea 0x80($inp), $inp
1603 movdqu @XMM[1], 0x10($out)
1604 movdqu @XMM[6], 0x20($out)
1605 movdqu @XMM[4], 0x30($out)
1606 movdqu @XMM[2], 0x40($out)
1607 movdqu @XMM[7], 0x50($out)
1608 movdqu @XMM[3], 0x60($out)
1609 movdqu @XMM[5], 0x70($out)
1610 lea 0x80($out), $out
1611 sub \$8,$len
1612 jnc .Lcbc_dec_loop
1613
1614 add \$8,$len
1615 jz .Lcbc_dec_done
1616
1617 movdqu 0x00($inp), @XMM[0] # load input
1618 mov %rsp, %rax # pass key schedule
1619 mov %edx, %r10d # pass rounds
1620 cmp \$2,$len
1621 jb .Lcbc_dec_one
1622 movdqu 0x10($inp), @XMM[1]
1623 je .Lcbc_dec_two
1624 movdqu 0x20($inp), @XMM[2]
1625 cmp \$4,$len
1626 jb .Lcbc_dec_three
1627 movdqu 0x30($inp), @XMM[3]
1628 je .Lcbc_dec_four
1629 movdqu 0x40($inp), @XMM[4]
1630 cmp \$6,$len
1631 jb .Lcbc_dec_five
1632 movdqu 0x50($inp), @XMM[5]
1633 je .Lcbc_dec_six
1634 movdqu 0x60($inp), @XMM[6]
1635 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1636 call _bsaes_decrypt8
1637 pxor 0x20(%rbp), @XMM[0] # ^= IV
1638 movdqu 0x00($inp), @XMM[8] # re-load input
1639 movdqu 0x10($inp), @XMM[9]
1640 pxor @XMM[8], @XMM[1]
1641 movdqu 0x20($inp), @XMM[10]
1642 pxor @XMM[9], @XMM[6]
1643 movdqu 0x30($inp), @XMM[11]
1644 pxor @XMM[10], @XMM[4]
1645 movdqu 0x40($inp), @XMM[12]
1646 pxor @XMM[11], @XMM[2]
1647 movdqu 0x50($inp), @XMM[13]
1648 pxor @XMM[12], @XMM[7]
1649 movdqu 0x60($inp), @XMM[15] # IV
1650 pxor @XMM[13], @XMM[3]
1651 movdqu @XMM[0], 0x00($out) # write output
1652 movdqu @XMM[1], 0x10($out)
1653 movdqu @XMM[6], 0x20($out)
1654 movdqu @XMM[4], 0x30($out)
1655 movdqu @XMM[2], 0x40($out)
1656 movdqu @XMM[7], 0x50($out)
1657 movdqu @XMM[3], 0x60($out)
1658 jmp .Lcbc_dec_done
1659 .align 16
1660 .Lcbc_dec_six:
1661 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1662 call _bsaes_decrypt8
1663 pxor 0x20(%rbp), @XMM[0] # ^= IV
1664 movdqu 0x00($inp), @XMM[8] # re-load input
1665 movdqu 0x10($inp), @XMM[9]
1666 pxor @XMM[8], @XMM[1]
1667 movdqu 0x20($inp), @XMM[10]
1668 pxor @XMM[9], @XMM[6]
1669 movdqu 0x30($inp), @XMM[11]
1670 pxor @XMM[10], @XMM[4]
1671 movdqu 0x40($inp), @XMM[12]
1672 pxor @XMM[11], @XMM[2]
1673 movdqu 0x50($inp), @XMM[15] # IV
1674 pxor @XMM[12], @XMM[7]
1675 movdqu @XMM[0], 0x00($out) # write output
1676 movdqu @XMM[1], 0x10($out)
1677 movdqu @XMM[6], 0x20($out)
1678 movdqu @XMM[4], 0x30($out)
1679 movdqu @XMM[2], 0x40($out)
1680 movdqu @XMM[7], 0x50($out)
1681 jmp .Lcbc_dec_done
1682 .align 16
1683 .Lcbc_dec_five:
1684 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1685 call _bsaes_decrypt8
1686 pxor 0x20(%rbp), @XMM[0] # ^= IV
1687 movdqu 0x00($inp), @XMM[8] # re-load input
1688 movdqu 0x10($inp), @XMM[9]
1689 pxor @XMM[8], @XMM[1]
1690 movdqu 0x20($inp), @XMM[10]
1691 pxor @XMM[9], @XMM[6]
1692 movdqu 0x30($inp), @XMM[11]
1693 pxor @XMM[10], @XMM[4]
1694 movdqu 0x40($inp), @XMM[15] # IV
1695 pxor @XMM[11], @XMM[2]
1696 movdqu @XMM[0], 0x00($out) # write output
1697 movdqu @XMM[1], 0x10($out)
1698 movdqu @XMM[6], 0x20($out)
1699 movdqu @XMM[4], 0x30($out)
1700 movdqu @XMM[2], 0x40($out)
1701 jmp .Lcbc_dec_done
1702 .align 16
1703 .Lcbc_dec_four:
1704 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1705 call _bsaes_decrypt8
1706 pxor 0x20(%rbp), @XMM[0] # ^= IV
1707 movdqu 0x00($inp), @XMM[8] # re-load input
1708 movdqu 0x10($inp), @XMM[9]
1709 pxor @XMM[8], @XMM[1]
1710 movdqu 0x20($inp), @XMM[10]
1711 pxor @XMM[9], @XMM[6]
1712 movdqu 0x30($inp), @XMM[15] # IV
1713 pxor @XMM[10], @XMM[4]
1714 movdqu @XMM[0], 0x00($out) # write output
1715 movdqu @XMM[1], 0x10($out)
1716 movdqu @XMM[6], 0x20($out)
1717 movdqu @XMM[4], 0x30($out)
1718 jmp .Lcbc_dec_done
1719 .align 16
1720 .Lcbc_dec_three:
1721 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1722 call _bsaes_decrypt8
1723 pxor 0x20(%rbp), @XMM[0] # ^= IV
1724 movdqu 0x00($inp), @XMM[8] # re-load input
1725 movdqu 0x10($inp), @XMM[9]
1726 pxor @XMM[8], @XMM[1]
1727 movdqu 0x20($inp), @XMM[15] # IV
1728 pxor @XMM[9], @XMM[6]
1729 movdqu @XMM[0], 0x00($out) # write output
1730 movdqu @XMM[1], 0x10($out)
1731 movdqu @XMM[6], 0x20($out)
1732 jmp .Lcbc_dec_done
1733 .align 16
1734 .Lcbc_dec_two:
1735 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1736 call _bsaes_decrypt8
1737 pxor 0x20(%rbp), @XMM[0] # ^= IV
1738 movdqu 0x00($inp), @XMM[8] # re-load input
1739 movdqu 0x10($inp), @XMM[15] # IV
1740 pxor @XMM[8], @XMM[1]
1741 movdqu @XMM[0], 0x00($out) # write output
1742 movdqu @XMM[1], 0x10($out)
1743 jmp .Lcbc_dec_done
1744 .align 16
1745 .Lcbc_dec_one:
1746 lea ($inp), $arg1
1747 lea 0x20(%rbp), $arg2 # buffer output
1748 lea ($key), $arg3
1749 call asm_AES_decrypt # doesn't touch %xmm
1750 pxor 0x20(%rbp), @XMM[15] # ^= IV
1751 movdqu @XMM[15], ($out) # write output
1752 movdqa @XMM[0], @XMM[15] # IV
1753
1754 .Lcbc_dec_done:
1755 movdqu @XMM[15], (%rbx) # return IV
1756 lea (%rsp), %rax
1757 pxor %xmm0, %xmm0
1758 .Lcbc_dec_bzero: # wipe key schedule [if any]
1759 movdqa %xmm0, 0x00(%rax)
1760 movdqa %xmm0, 0x10(%rax)
1761 lea 0x20(%rax), %rax
1762 cmp %rax, %rbp
1763 ja .Lcbc_dec_bzero
1764
1765 lea (%rbp),%rsp # restore %rsp
1766 ___
1767 $code.=<<___ if ($win64);
1768 movaps 0x40(%rbp), %xmm6
1769 movaps 0x50(%rbp), %xmm7
1770 movaps 0x60(%rbp), %xmm8
1771 movaps 0x70(%rbp), %xmm9
1772 movaps 0x80(%rbp), %xmm10
1773 movaps 0x90(%rbp), %xmm11
1774 movaps 0xa0(%rbp), %xmm12
1775 movaps 0xb0(%rbp), %xmm13
1776 movaps 0xc0(%rbp), %xmm14
1777 movaps 0xd0(%rbp), %xmm15
1778 lea 0xa0(%rbp), %rsp
1779 ___
1780 $code.=<<___;
1781 mov 0x48(%rsp), %r15
1782 mov 0x50(%rsp), %r14
1783 mov 0x58(%rsp), %r13
1784 mov 0x60(%rsp), %r12
1785 mov 0x68(%rsp), %rbx
1786 mov 0x70(%rsp), %rax
1787 lea 0x78(%rsp), %rsp
1788 mov %rax, %rbp
1789 .Lcbc_dec_epilogue:
1790 ret
1791 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1792
1793 .globl bsaes_ctr32_encrypt_blocks
1794 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1795 .align 16
1796 bsaes_ctr32_encrypt_blocks:
1797 mov %rsp, %rax
1798 .Lctr_enc_prologue:
1799 push %rbp
1800 push %rbx
1801 push %r12
1802 push %r13
1803 push %r14
1804 push %r15
1805 lea -0x48(%rsp), %rsp
1806 ___
1807 $code.=<<___ if ($win64);
1808 mov 0xa0(%rsp),$arg5 # pull ivp
1809 lea -0xa0(%rsp), %rsp
1810 movaps %xmm6, 0x40(%rsp)
1811 movaps %xmm7, 0x50(%rsp)
1812 movaps %xmm8, 0x60(%rsp)
1813 movaps %xmm9, 0x70(%rsp)
1814 movaps %xmm10, 0x80(%rsp)
1815 movaps %xmm11, 0x90(%rsp)
1816 movaps %xmm12, 0xa0(%rsp)
1817 movaps %xmm13, 0xb0(%rsp)
1818 movaps %xmm14, 0xc0(%rsp)
1819 movaps %xmm15, 0xd0(%rsp)
1820 .Lctr_enc_body:
1821 ___
1822 $code.=<<___;
1823 mov %rsp, %rbp # backup %rsp
1824 movdqu ($arg5), %xmm0 # load counter
1825 mov 240($arg4), %eax # rounds
1826 mov $arg1, $inp # backup arguments
1827 mov $arg2, $out
1828 mov $arg3, $len
1829 mov $arg4, $key
1830 movdqa %xmm0, 0x20(%rbp) # copy counter
1831 cmp \$8, $arg3
1832 jb .Lctr_enc_short
1833
1834 mov %eax, %ebx # rounds
1835 shl \$7, %rax # 128 bytes per inner round key
1836 sub \$`128-32`, %rax # size of bit-sliced key schedule
1837 sub %rax, %rsp
1838
1839 mov %rsp, %rax # pass key schedule
1840 mov $key, %rcx # pass key
1841 mov %ebx, %r10d # pass rounds
1842 call _bsaes_key_convert
1843 pxor %xmm6,%xmm7 # fix up last round key
1844 movdqa %xmm7,(%rax) # save last round key
1845
1846 movdqa (%rsp), @XMM[9] # load round0 key
1847 lea .LADD1(%rip), %r11
1848 movdqa 0x20(%rbp), @XMM[0] # counter copy
1849 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1850 pshufb @XMM[8], @XMM[9] # byte swap upper part
1851 pshufb @XMM[8], @XMM[0]
1852 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1853 jmp .Lctr_enc_loop
1854 .align 16
1855 .Lctr_enc_loop:
1856 movdqa @XMM[0], 0x20(%rbp) # save counter
1857 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1858 movdqa @XMM[0], @XMM[2]
1859 paddd 0x00(%r11), @XMM[1] # .LADD1
1860 movdqa @XMM[0], @XMM[3]
1861 paddd 0x10(%r11), @XMM[2] # .LADD2
1862 movdqa @XMM[0], @XMM[4]
1863 paddd 0x20(%r11), @XMM[3] # .LADD3
1864 movdqa @XMM[0], @XMM[5]
1865 paddd 0x30(%r11), @XMM[4] # .LADD4
1866 movdqa @XMM[0], @XMM[6]
1867 paddd 0x40(%r11), @XMM[5] # .LADD5
1868 movdqa @XMM[0], @XMM[7]
1869 paddd 0x50(%r11), @XMM[6] # .LADD6
1870 paddd 0x60(%r11), @XMM[7] # .LADD7
1871
1872 # Borrow the prologue from _bsaes_encrypt8, using the opportunity
1873 # to flip the byte order of the 32-bit counter
1874 movdqa (%rsp), @XMM[9] # round 0 key
1875 lea 0x10(%rsp), %rax # pass key schedule
1876 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1877 pxor @XMM[9], @XMM[0] # xor with round0 key
1878 pxor @XMM[9], @XMM[1]
1879 pshufb @XMM[8], @XMM[0]
1880 pxor @XMM[9], @XMM[2]
1881 pshufb @XMM[8], @XMM[1]
1882 pxor @XMM[9], @XMM[3]
1883 pshufb @XMM[8], @XMM[2]
1884 pxor @XMM[9], @XMM[4]
1885 pshufb @XMM[8], @XMM[3]
1886 pxor @XMM[9], @XMM[5]
1887 pshufb @XMM[8], @XMM[4]
1888 pxor @XMM[9], @XMM[6]
1889 pshufb @XMM[8], @XMM[5]
1890 pxor @XMM[9], @XMM[7]
1891 pshufb @XMM[8], @XMM[6]
1892 lea .LBS0(%rip), %r11 # constants table
1893 pshufb @XMM[8], @XMM[7]
1894 mov %ebx,%r10d # pass rounds
1895
1896 call _bsaes_encrypt8_bitslice
1897
1898 sub \$8,$len
1899 jc .Lctr_enc_loop_done
1900
1901 movdqu 0x00($inp), @XMM[8] # load input
1902 movdqu 0x10($inp), @XMM[9]
1903 movdqu 0x20($inp), @XMM[10]
1904 movdqu 0x30($inp), @XMM[11]
1905 movdqu 0x40($inp), @XMM[12]
1906 movdqu 0x50($inp), @XMM[13]
1907 movdqu 0x60($inp), @XMM[14]
1908 movdqu 0x70($inp), @XMM[15]
1909 lea 0x80($inp),$inp
1910 pxor @XMM[0], @XMM[8]
1911 movdqa 0x20(%rbp), @XMM[0] # load counter
1912 pxor @XMM[9], @XMM[1]
1913 movdqu @XMM[8], 0x00($out) # write output
1914 pxor @XMM[10], @XMM[4]
1915 movdqu @XMM[1], 0x10($out)
1916 pxor @XMM[11], @XMM[6]
1917 movdqu @XMM[4], 0x20($out)
1918 pxor @XMM[12], @XMM[3]
1919 movdqu @XMM[6], 0x30($out)
1920 pxor @XMM[13], @XMM[7]
1921 movdqu @XMM[3], 0x40($out)
1922 pxor @XMM[14], @XMM[2]
1923 movdqu @XMM[7], 0x50($out)
1924 pxor @XMM[15], @XMM[5]
1925 movdqu @XMM[2], 0x60($out)
1926 lea .LADD1(%rip), %r11
1927 movdqu @XMM[5], 0x70($out)
1928 lea 0x80($out), $out
1929 paddd 0x70(%r11), @XMM[0] # .LADD8
1930 jnz .Lctr_enc_loop
1931
1932 jmp .Lctr_enc_done
1933 .align 16
1934 .Lctr_enc_loop_done:
1935 add \$8, $len
1936 movdqu 0x00($inp), @XMM[8] # load input
1937 pxor @XMM[8], @XMM[0]
1938 movdqu @XMM[0], 0x00($out) # write output
1939 cmp \$2,$len
1940 jb .Lctr_enc_done
1941 movdqu 0x10($inp), @XMM[9]
1942 pxor @XMM[9], @XMM[1]
1943 movdqu @XMM[1], 0x10($out)
1944 je .Lctr_enc_done
1945 movdqu 0x20($inp), @XMM[10]
1946 pxor @XMM[10], @XMM[4]
1947 movdqu @XMM[4], 0x20($out)
1948 cmp \$4,$len
1949 jb .Lctr_enc_done
1950 movdqu 0x30($inp), @XMM[11]
1951 pxor @XMM[11], @XMM[6]
1952 movdqu @XMM[6], 0x30($out)
1953 je .Lctr_enc_done
1954 movdqu 0x40($inp), @XMM[12]
1955 pxor @XMM[12], @XMM[3]
1956 movdqu @XMM[3], 0x40($out)
1957 cmp \$6,$len
1958 jb .Lctr_enc_done
1959 movdqu 0x50($inp), @XMM[13]
1960 pxor @XMM[13], @XMM[7]
1961 movdqu @XMM[7], 0x50($out)
1962 je .Lctr_enc_done
1963 movdqu 0x60($inp), @XMM[14]
1964 pxor @XMM[14], @XMM[2]
1965 movdqu @XMM[2], 0x60($out)
1966 jmp .Lctr_enc_done
1967
1968 .align 16
1969 .Lctr_enc_short:
1970 lea 0x20(%rbp), $arg1
1971 lea 0x30(%rbp), $arg2
1972 lea ($key), $arg3
1973 call asm_AES_encrypt
1974 movdqu ($inp), @XMM[1]
1975 lea 16($inp), $inp
1976 mov 0x2c(%rbp), %eax # load 32-bit counter
1977 bswap %eax
1978 pxor 0x30(%rbp), @XMM[1]
1979 inc %eax # increment
1980 movdqu @XMM[1], ($out)
1981 bswap %eax
1982 lea 16($out), $out
1983 mov %eax, 0x2c(%rsp) # save 32-bit counter
1984 dec $len
1985 jnz .Lctr_enc_short
1986
1987 .Lctr_enc_done:
1988 lea (%rsp), %rax
1989 pxor %xmm0, %xmm0
1990 .Lctr_enc_bzero: # wipe key schedule [if any]
1991 movdqa %xmm0, 0x00(%rax)
1992 movdqa %xmm0, 0x10(%rax)
1993 lea 0x20(%rax), %rax
1994 cmp %rax, %rbp
1995 ja .Lctr_enc_bzero
1996
1997 lea (%rbp),%rsp # restore %rsp
1998 ___
1999 $code.=<<___ if ($win64);
2000 movaps 0x40(%rbp), %xmm6
2001 movaps 0x50(%rbp), %xmm7
2002 movaps 0x60(%rbp), %xmm8
2003 movaps 0x70(%rbp), %xmm9
2004 movaps 0x80(%rbp), %xmm10
2005 movaps 0x90(%rbp), %xmm11
2006 movaps 0xa0(%rbp), %xmm12
2007 movaps 0xb0(%rbp), %xmm13
2008 movaps 0xc0(%rbp), %xmm14
2009 movaps 0xd0(%rbp), %xmm15
2010 lea 0xa0(%rbp), %rsp
2011 ___
2012 $code.=<<___;
2013 mov 0x48(%rsp), %r15
2014 mov 0x50(%rsp), %r14
2015 mov 0x58(%rsp), %r13
2016 mov 0x60(%rsp), %r12
2017 mov 0x68(%rsp), %rbx
2018 mov 0x70(%rsp), %rax
2019 lea 0x78(%rsp), %rsp
2020 mov %rax, %rbp
2021 .Lctr_enc_epilogue:
2022 ret
2023 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2024 ___
2025 ######################################################################
2026 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2027 # const AES_KEY *key1, const AES_KEY *key2,
2028 # const unsigned char iv[16]);
2029 #
2030 my ($twmask,$twres,$twtmp)=@XMM[13..15];
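Both XTS loops below advance the tweak by multiplying it by x in GF(2^128) mod x^128 + x^7 + x^2 + x + 1: paddq shifts each 64-bit half left by one, while the pcmpgtd/pshufd 0x13/pand-.Lxts_magic sequence routes the two carries (per the inline comments). A byte-level scalar model (standalone sketch; little-endian byte order, as the tweak is stored):

    use strict; use warnings;
    sub xts_double {                  # tweak *= x in GF(2^128)
        my @t = @_;                   # 16 bytes, $t[0] least significant
        my $carry = ($t[15] >> 7) & 1;
        for (my $i = 15; $i > 0; $i--) {
            $t[$i] = (($t[$i] << 1) | ($t[$i - 1] >> 7)) & 0xff;
        }
        $t[0] = (($t[0] << 1) & 0xff) ^ ($carry ? 0x87 : 0);  # 0x87 = x^7+x^2+x+1
        return @t;
    }
    my @tweak = (1, (0) x 15);        # example: start from tweak = 1
    @tweak = xts_double(@tweak) for 1 .. 8;
    printf "%s\n", join ' ', map { sprintf '%02x', $_ } @tweak;
    # expected: 00 01 00 00 ... (x^8, i.e. bit 8 set after eight doublings)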
2031 $code.=<<___;
2032 .globl bsaes_xts_encrypt
2033 .type bsaes_xts_encrypt,\@abi-omnipotent
2034 .align 16
2035 bsaes_xts_encrypt:
2036 mov %rsp, %rax
2037 .Lxts_enc_prologue:
2038 push %rbp
2039 push %rbx
2040 push %r12
2041 push %r13
2042 push %r14
2043 push %r15
2044 lea -0x48(%rsp), %rsp
2045 ___
2046 $code.=<<___ if ($win64);
2047 mov 0xa0(%rsp),$arg5 # pull key2
2048 mov 0xa8(%rsp),$arg6 # pull ivp
2049 lea -0xa0(%rsp), %rsp
2050 movaps %xmm6, 0x40(%rsp)
2051 movaps %xmm7, 0x50(%rsp)
2052 movaps %xmm8, 0x60(%rsp)
2053 movaps %xmm9, 0x70(%rsp)
2054 movaps %xmm10, 0x80(%rsp)
2055 movaps %xmm11, 0x90(%rsp)
2056 movaps %xmm12, 0xa0(%rsp)
2057 movaps %xmm13, 0xb0(%rsp)
2058 movaps %xmm14, 0xc0(%rsp)
2059 movaps %xmm15, 0xd0(%rsp)
2060 .Lxts_enc_body:
2061 ___
2062 $code.=<<___;
2063 mov %rsp, %rbp # backup %rsp
2064 mov $arg1, $inp # backup arguments
2065 mov $arg2, $out
2066 mov $arg3, $len
2067 mov $arg4, $key
2068
2069 lea ($arg6), $arg1
2070 lea 0x20(%rbp), $arg2
2071 lea ($arg5), $arg3
2072 call asm_AES_encrypt # generate initial tweak
2073
2074 mov 240($key), %eax # rounds
2075 mov $len, %rbx # backup $len
2076
2077 mov %eax, %edx # rounds
2078 shl \$7, %rax # 128 bytes per inner round key
2079 sub \$`128-32`, %rax # size of bit-sliced key schedule
2080 sub %rax, %rsp
2081
2082 mov %rsp, %rax # pass key schedule
2083 mov $key, %rcx # pass key
2084 mov %edx, %r10d # pass rounds
2085 call _bsaes_key_convert
2086 pxor %xmm6, %xmm7 # fix up last round key
2087 movdqa %xmm7, (%rax) # save last round key
2088
2089 and \$-16, $len
2090 sub \$0x80, %rsp # place for tweak[8]
2091 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2092
2093 pxor $twtmp, $twtmp
2094 movdqa .Lxts_magic(%rip), $twmask
2095 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2096
2097 sub \$0x80, $len
2098 jc .Lxts_enc_short
2099 jmp .Lxts_enc_loop
2100
2101 .align 16
2102 .Lxts_enc_loop:
2103 ___
2104 for ($i=0;$i<7;$i++) {
2105 $code.=<<___;
2106 pshufd \$0x13, $twtmp, $twres
2107 pxor $twtmp, $twtmp
2108 movdqa @XMM[7], @XMM[$i]
2109 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2110 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2111 pand $twmask, $twres # isolate carry and residue
2112 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2113 pxor $twres, @XMM[7]
2114 ___
2115 $code.=<<___ if ($i>=1);
2116 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2117 ___
2118 $code.=<<___ if ($i>=2);
2119 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2120 ___
2121 }
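# The eight instructions emitted per iteration double the tweak without
# a branch. An SSE2 intrinsics rendering of the same idea (variable
# names are illustrative; this is a sketch, not code from this file):
#
#	__m128i magic = _mm_set_epi32(0, 1, 0, 0x87);	/* .Lxts_magic */
#	__m128i hi    = _mm_cmpgt_epi32(_mm_setzero_si128(), tweak);
#	/* lane 0 <- sign of bit 127, lane 2 <- sign of bit 63 */
#	__m128i res   = _mm_shuffle_epi32(hi, 0x13);	/* pshufd */
#	res   = _mm_and_si128(res, magic);		/* pand   */
#	tweak = _mm_add_epi64(tweak, tweak);		/* <<1 per half */
#	tweak = _mm_xor_si128(tweak, res);	/* fold 0x87 and bit-64 carry */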
2122 $code.=<<___;
2123 movdqu 0x60($inp), @XMM[8+6]
2124 pxor @XMM[8+5], @XMM[5]
2125 movdqu 0x70($inp), @XMM[8+7]
2126 lea 0x80($inp), $inp
2127 movdqa @XMM[7], 0x70(%rsp)
2128 pxor @XMM[8+6], @XMM[6]
2129 lea 0x80(%rsp), %rax # pass key schedule
2130 pxor @XMM[8+7], @XMM[7]
2131 mov %edx, %r10d # pass rounds
2132
2133 call _bsaes_encrypt8
2134
2135 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2136 pxor 0x10(%rsp), @XMM[1]
2137 movdqu @XMM[0], 0x00($out) # write output
2138 pxor 0x20(%rsp), @XMM[4]
2139 movdqu @XMM[1], 0x10($out)
2140 pxor 0x30(%rsp), @XMM[6]
2141 movdqu @XMM[4], 0x20($out)
2142 pxor 0x40(%rsp), @XMM[3]
2143 movdqu @XMM[6], 0x30($out)
2144 pxor 0x50(%rsp), @XMM[7]
2145 movdqu @XMM[3], 0x40($out)
2146 pxor 0x60(%rsp), @XMM[2]
2147 movdqu @XMM[7], 0x50($out)
2148 pxor 0x70(%rsp), @XMM[5]
2149 movdqu @XMM[2], 0x60($out)
2150 movdqu @XMM[5], 0x70($out)
2151 lea 0x80($out), $out
2152
2153 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2154 pxor $twtmp, $twtmp
2155 movdqa .Lxts_magic(%rip), $twmask
2156 pcmpgtd @XMM[7], $twtmp
2157 pshufd \$0x13, $twtmp, $twres
2158 pxor $twtmp, $twtmp
2159 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2160 pand $twmask, $twres # isolate carry and residue
2161 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2162 pxor $twres, @XMM[7]
2163
2164 sub \$0x80,$len
2165 jnc .Lxts_enc_loop
2166
2167 .Lxts_enc_short:
2168 add \$0x80, $len
2169 jz .Lxts_enc_done
2170 ___
2171 for ($i=0;$i<7;$i++) {
2172 $code.=<<___;
2173 pshufd \$0x13, $twtmp, $twres
2174 pxor $twtmp, $twtmp
2175 movdqa @XMM[7], @XMM[$i]
2176 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2177 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2178 pand $twmask, $twres # isolate carry and residue
2179 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2180 pxor $twres, @XMM[7]
2181 ___
2182 $code.=<<___ if ($i>=1);
2183 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2184 cmp \$`0x10*$i`,$len
2185 je .Lxts_enc_$i
2186 ___
2187 $code.=<<___ if ($i>=2);
2188 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2189 ___
2190 }
2191 $code.=<<___;
2192 movdqu 0x60($inp), @XMM[8+6]
2193 pxor @XMM[8+5], @XMM[5]
2194 movdqa @XMM[7], 0x70(%rsp)
2195 lea 0x70($inp), $inp
2196 pxor @XMM[8+6], @XMM[6]
2197 lea 0x80(%rsp), %rax # pass key schedule
2198 mov %edx, %r10d # pass rounds
2199
2200 call _bsaes_encrypt8
2201
2202 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2203 pxor 0x10(%rsp), @XMM[1]
2204 movdqu @XMM[0], 0x00($out) # write output
2205 pxor 0x20(%rsp), @XMM[4]
2206 movdqu @XMM[1], 0x10($out)
2207 pxor 0x30(%rsp), @XMM[6]
2208 movdqu @XMM[4], 0x20($out)
2209 pxor 0x40(%rsp), @XMM[3]
2210 movdqu @XMM[6], 0x30($out)
2211 pxor 0x50(%rsp), @XMM[7]
2212 movdqu @XMM[3], 0x40($out)
2213 pxor 0x60(%rsp), @XMM[2]
2214 movdqu @XMM[7], 0x50($out)
2215 movdqu @XMM[2], 0x60($out)
2216 lea 0x70($out), $out
2217
2218 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2219 jmp .Lxts_enc_done
2220 .align 16
2221 .Lxts_enc_6:
2222 pxor @XMM[8+4], @XMM[4]
2223 lea 0x60($inp), $inp
2224 pxor @XMM[8+5], @XMM[5]
2225 lea 0x80(%rsp), %rax # pass key schedule
2226 mov %edx, %r10d # pass rounds
2227
2228 call _bsaes_encrypt8
2229
2230 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2231 pxor 0x10(%rsp), @XMM[1]
2232 movdqu @XMM[0], 0x00($out) # write output
2233 pxor 0x20(%rsp), @XMM[4]
2234 movdqu @XMM[1], 0x10($out)
2235 pxor 0x30(%rsp), @XMM[6]
2236 movdqu @XMM[4], 0x20($out)
2237 pxor 0x40(%rsp), @XMM[3]
2238 movdqu @XMM[6], 0x30($out)
2239 pxor 0x50(%rsp), @XMM[7]
2240 movdqu @XMM[3], 0x40($out)
2241 movdqu @XMM[7], 0x50($out)
2242 lea 0x60($out), $out
2243
2244 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2245 jmp .Lxts_enc_done
2246 .align 16
2247 .Lxts_enc_5:
2248 pxor @XMM[8+3], @XMM[3]
2249 lea 0x50($inp), $inp
2250 pxor @XMM[8+4], @XMM[4]
2251 lea 0x80(%rsp), %rax # pass key schedule
2252 mov %edx, %r10d # pass rounds
2253
2254 call _bsaes_encrypt8
2255
2256 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2257 pxor 0x10(%rsp), @XMM[1]
2258 movdqu @XMM[0], 0x00($out) # write output
2259 pxor 0x20(%rsp), @XMM[4]
2260 movdqu @XMM[1], 0x10($out)
2261 pxor 0x30(%rsp), @XMM[6]
2262 movdqu @XMM[4], 0x20($out)
2263 pxor 0x40(%rsp), @XMM[3]
2264 movdqu @XMM[6], 0x30($out)
2265 movdqu @XMM[3], 0x40($out)
2266 lea 0x50($out), $out
2267
2268 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2269 jmp .Lxts_enc_done
2270 .align 16
2271 .Lxts_enc_4:
2272 pxor @XMM[8+2], @XMM[2]
2273 lea 0x40($inp), $inp
2274 pxor @XMM[8+3], @XMM[3]
2275 lea 0x80(%rsp), %rax # pass key schedule
2276 mov %edx, %r10d # pass rounds
2277
2278 call _bsaes_encrypt8
2279
2280 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2281 pxor 0x10(%rsp), @XMM[1]
2282 movdqu @XMM[0], 0x00($out) # write output
2283 pxor 0x20(%rsp), @XMM[4]
2284 movdqu @XMM[1], 0x10($out)
2285 pxor 0x30(%rsp), @XMM[6]
2286 movdqu @XMM[4], 0x20($out)
2287 movdqu @XMM[6], 0x30($out)
2288 lea 0x40($out), $out
2289
2290 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2291 jmp .Lxts_enc_done
2292 .align 16
2293 .Lxts_enc_3:
2294 pxor @XMM[8+1], @XMM[1]
2295 lea 0x30($inp), $inp
2296 pxor @XMM[8+2], @XMM[2]
2297 lea 0x80(%rsp), %rax # pass key schedule
2298 mov %edx, %r10d # pass rounds
2299
2300 call _bsaes_encrypt8
2301
2302 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2303 pxor 0x10(%rsp), @XMM[1]
2304 movdqu @XMM[0], 0x00($out) # write output
2305 pxor 0x20(%rsp), @XMM[4]
2306 movdqu @XMM[1], 0x10($out)
2307 movdqu @XMM[4], 0x20($out)
2308 lea 0x30($out), $out
2309
2310 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2311 jmp .Lxts_enc_done
2312 .align 16
2313 .Lxts_enc_2:
2314 pxor @XMM[8+0], @XMM[0]
2315 lea 0x20($inp), $inp
2316 pxor @XMM[8+1], @XMM[1]
2317 lea 0x80(%rsp), %rax # pass key schedule
2318 mov %edx, %r10d # pass rounds
2319
2320 call _bsaes_encrypt8
2321
2322 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2323 pxor 0x10(%rsp), @XMM[1]
2324 movdqu @XMM[0], 0x00($out) # write output
2325 movdqu @XMM[1], 0x10($out)
2326 lea 0x20($out), $out
2327
2328 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2329 jmp .Lxts_enc_done
2330 .align 16
2331 .Lxts_enc_1:
2332 pxor @XMM[0], @XMM[8]
2333 lea 0x10($inp), $inp
2334 movdqa @XMM[8], 0x20(%rbp)
2335 lea 0x20(%rbp), $arg1
2336 lea 0x20(%rbp), $arg2
2337 lea ($key), $arg3
2338 call asm_AES_encrypt # doesn't touch %xmm
2339 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2340 #pxor @XMM[8], @XMM[0]
2341 #lea 0x80(%rsp), %rax # pass key schedule
2342 #mov %edx, %r10d # pass rounds
2343 #call _bsaes_encrypt8
2344 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2345 movdqu @XMM[0], 0x00($out) # write output
2346 lea 0x10($out), $out
2347
2348 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2349
2350 .Lxts_enc_done:
2351 and \$15, %ebx
2352 jz .Lxts_enc_ret
2353 mov $out, %rdx
2354
2355 .Lxts_enc_steal:
2356 movzb ($inp), %eax
2357 movzb -16(%rdx), %ecx
2358 lea 1($inp), $inp
2359 mov %al, -16(%rdx)
2360 mov %cl, 0(%rdx)
2361 lea 1(%rdx), %rdx
2362 sub \$1,%ebx
2363 jnz .Lxts_enc_steal
2364
2365 movdqu -16($out), @XMM[0]
2366 lea 0x20(%rbp), $arg1
2367 pxor @XMM[7], @XMM[0]
2368 lea 0x20(%rbp), $arg2
2369 movdqa @XMM[0], 0x20(%rbp)
2370 lea ($key), $arg3
2371 call asm_AES_encrypt # doesn't touch %xmm
2372 pxor 0x20(%rbp), @XMM[7]
2373 movdqu @XMM[7], -16($out)
2374
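# The steal loop implements standard XTS ciphertext stealing: the first
# tail bytes of the block just written at out-16 become the final
# partial ciphertext, the vacated bytes are replaced with the remaining
# plaintext, and the spliced block is encrypted once more under the
# next tweak. Roughly, with tail = len % 16 (sketch):
#
#	memcpy(out, out - 16, tail);	/* steal head of C(n-1) */
#	memcpy(out - 16, in, tail);	/* splice in plaintext  */
#	/* then: out[-16..-1] = E_{key1}(out[-16..-1] ^ T(n)) ^ T(n) */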
2375 .Lxts_enc_ret:
2376 lea (%rsp), %rax
2377 pxor %xmm0, %xmm0
2378 .Lxts_enc_bzero: # wipe key schedule [if any]
2379 movdqa %xmm0, 0x00(%rax)
2380 movdqa %xmm0, 0x10(%rax)
2381 lea 0x20(%rax), %rax
2382 cmp %rax, %rbp
2383 ja .Lxts_enc_bzero
2384
2385 lea (%rbp),%rsp # restore %rsp
2386 ___
2387 $code.=<<___ if ($win64);
2388 movaps 0x40(%rbp), %xmm6
2389 movaps 0x50(%rbp), %xmm7
2390 movaps 0x60(%rbp), %xmm8
2391 movaps 0x70(%rbp), %xmm9
2392 movaps 0x80(%rbp), %xmm10
2393 movaps 0x90(%rbp), %xmm11
2394 movaps 0xa0(%rbp), %xmm12
2395 movaps 0xb0(%rbp), %xmm13
2396 movaps 0xc0(%rbp), %xmm14
2397 movaps 0xd0(%rbp), %xmm15
2398 lea 0xa0(%rbp), %rsp
2399 ___
2400 $code.=<<___;
2401 mov 0x48(%rsp), %r15
2402 mov 0x50(%rsp), %r14
2403 mov 0x58(%rsp), %r13
2404 mov 0x60(%rsp), %r12
2405 mov 0x68(%rsp), %rbx
2406 mov 0x70(%rsp), %rax
2407 lea 0x78(%rsp), %rsp
2408 mov %rax, %rbp
2409 .Lxts_enc_epilogue:
2410 ret
2411 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2412
2413 .globl bsaes_xts_decrypt
2414 .type bsaes_xts_decrypt,\@abi-omnipotent
2415 .align 16
2416 bsaes_xts_decrypt:
2417 mov %rsp, %rax
2418 .Lxts_dec_prologue:
2419 push %rbp
2420 push %rbx
2421 push %r12
2422 push %r13
2423 push %r14
2424 push %r15
2425 lea -0x48(%rsp), %rsp
2426 ___
2427 $code.=<<___ if ($win64);
2428 mov 0xa0(%rsp),$arg5 # pull key2
2429 mov 0xa8(%rsp),$arg6 # pull ivp
2430 lea -0xa0(%rsp), %rsp
2431 movaps %xmm6, 0x40(%rsp)
2432 movaps %xmm7, 0x50(%rsp)
2433 movaps %xmm8, 0x60(%rsp)
2434 movaps %xmm9, 0x70(%rsp)
2435 movaps %xmm10, 0x80(%rsp)
2436 movaps %xmm11, 0x90(%rsp)
2437 movaps %xmm12, 0xa0(%rsp)
2438 movaps %xmm13, 0xb0(%rsp)
2439 movaps %xmm14, 0xc0(%rsp)
2440 movaps %xmm15, 0xd0(%rsp)
2441 .Lxts_dec_body:
2442 ___
2443 $code.=<<___;
2444 mov %rsp, %rbp # backup %rsp
2445 mov $arg1, $inp # backup arguments
2446 mov $arg2, $out
2447 mov $arg3, $len
2448 mov $arg4, $key
2449
2450 lea ($arg6), $arg1
2451 lea 0x20(%rbp), $arg2
2452 lea ($arg5), $arg3
2453 call asm_AES_encrypt # generate initial tweak
2454
2455 mov 240($key), %eax # rounds
2456 mov $len, %rbx # backup $len
2457
2458 mov %eax, %edx # rounds
2459 shl \$7, %rax # 128 bytes per inner round key
2460 sub \$`128-32`, %rax # size of bit-sliced key schedule
2461 sub %rax, %rsp
2462
2463 mov %rsp, %rax # pass key schedule
2464 mov $key, %rcx # pass key
2465 mov %edx, %r10d # pass rounds
2466 call _bsaes_key_convert
2467 pxor (%rsp), %xmm7 # fix up round 0 key
2468 movdqa %xmm6, (%rax) # save last round key
2469 movdqa %xmm7, (%rsp)
2470
2471 xor %eax, %eax # if ($len%16) len-=16;
2472 and \$-16, $len
2473 test \$15, %ebx
2474 setnz %al
2475 shl \$4, %rax
2476 sub %rax, $len
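# branchlessly: $len = ($len & ~15) - ($len % 16 ? 16 : 0); when a
# partial block follows, the last full block is held back so it can be
# decrypted by the stealing logic at .Lxts_dec_done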
2477
2478 sub \$0x80, %rsp # place for tweak[8]
2479 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2480
2481 pxor $twtmp, $twtmp
2482 movdqa .Lxts_magic(%rip), $twmask
2483 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2484
2485 sub \$0x80, $len
2486 jc .Lxts_dec_short
2487 jmp .Lxts_dec_loop
2488
2489 .align 16
2490 .Lxts_dec_loop:
2491 ___
2492 for ($i=0;$i<7;$i++) {
2493 $code.=<<___;
2494 pshufd \$0x13, $twtmp, $twres
2495 pxor $twtmp, $twtmp
2496 movdqa @XMM[7], @XMM[$i]
2497 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2498 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2499 pand $twmask, $twres # isolate carry and residue
2500 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2501 pxor $twres, @XMM[7]
2502 ___
2503 $code.=<<___ if ($i>=1);
2504 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2505 ___
2506 $code.=<<___ if ($i>=2);
2507 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2508 ___
2509 }
2510 $code.=<<___;
2511 movdqu 0x60($inp), @XMM[8+6]
2512 pxor @XMM[8+5], @XMM[5]
2513 movdqu 0x70($inp), @XMM[8+7]
2514 lea 0x80($inp), $inp
2515 movdqa @XMM[7], 0x70(%rsp)
2516 pxor @XMM[8+6], @XMM[6]
2517 lea 0x80(%rsp), %rax # pass key schedule
2518 pxor @XMM[8+7], @XMM[7]
2519 mov %edx, %r10d # pass rounds
2520
2521 call _bsaes_decrypt8
2522
2523 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2524 pxor 0x10(%rsp), @XMM[1]
2525 movdqu @XMM[0], 0x00($out) # write output
2526 pxor 0x20(%rsp), @XMM[6]
2527 movdqu @XMM[1], 0x10($out)
2528 pxor 0x30(%rsp), @XMM[4]
2529 movdqu @XMM[6], 0x20($out)
2530 pxor 0x40(%rsp), @XMM[2]
2531 movdqu @XMM[4], 0x30($out)
2532 pxor 0x50(%rsp), @XMM[7]
2533 movdqu @XMM[2], 0x40($out)
2534 pxor 0x60(%rsp), @XMM[3]
2535 movdqu @XMM[7], 0x50($out)
2536 pxor 0x70(%rsp), @XMM[5]
2537 movdqu @XMM[3], 0x60($out)
2538 movdqu @XMM[5], 0x70($out)
2539 lea 0x80($out), $out
2540
2541 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2542 pxor $twtmp, $twtmp
2543 movdqa .Lxts_magic(%rip), $twmask
2544 pcmpgtd @XMM[7], $twtmp
2545 pshufd \$0x13, $twtmp, $twres
2546 pxor $twtmp, $twtmp
2547 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2548 pand $twmask, $twres # isolate carry and residue
2549 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2550 pxor $twres, @XMM[7]
2551
2552 sub \$0x80,$len
2553 jnc .Lxts_dec_loop
2554
2555 .Lxts_dec_short:
2556 add \$0x80, $len
2557 jz .Lxts_dec_done
2558 ___
2559 for ($i=0;$i<7;$i++) {
2560 $code.=<<___;
2561 pshufd \$0x13, $twtmp, $twres
2562 pxor $twtmp, $twtmp
2563 movdqa @XMM[7], @XMM[$i]
2564 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2565 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2566 pand $twmask, $twres # isolate carry and residue
2567 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2568 pxor $twres, @XMM[7]
2569 ___
2570 $code.=<<___ if ($i>=1);
2571 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2572 cmp \$`0x10*$i`,$len
2573 je .Lxts_dec_$i
2574 ___
2575 $code.=<<___ if ($i>=2);
2576 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2577 ___
2578 }
2579 $code.=<<___;
2580 movdqu 0x60($inp), @XMM[8+6]
2581 pxor @XMM[8+5], @XMM[5]
2582 movdqa @XMM[7], 0x70(%rsp)
2583 lea 0x70($inp), $inp
2584 pxor @XMM[8+6], @XMM[6]
2585 lea 0x80(%rsp), %rax # pass key schedule
2586 mov %edx, %r10d # pass rounds
2587
2588 call _bsaes_decrypt8
2589
2590 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2591 pxor 0x10(%rsp), @XMM[1]
2592 movdqu @XMM[0], 0x00($out) # write output
2593 pxor 0x20(%rsp), @XMM[6]
2594 movdqu @XMM[1], 0x10($out)
2595 pxor 0x30(%rsp), @XMM[4]
2596 movdqu @XMM[6], 0x20($out)
2597 pxor 0x40(%rsp), @XMM[2]
2598 movdqu @XMM[4], 0x30($out)
2599 pxor 0x50(%rsp), @XMM[7]
2600 movdqu @XMM[2], 0x40($out)
2601 pxor 0x60(%rsp), @XMM[3]
2602 movdqu @XMM[7], 0x50($out)
2603 movdqu @XMM[3], 0x60($out)
2604 lea 0x70($out), $out
2605
2606 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2607 jmp .Lxts_dec_done
2608 .align 16
2609 .Lxts_dec_6:
2610 pxor @XMM[8+4], @XMM[4]
2611 lea 0x60($inp), $inp
2612 pxor @XMM[8+5], @XMM[5]
2613 lea 0x80(%rsp), %rax # pass key schedule
2614 mov %edx, %r10d # pass rounds
2615
2616 call _bsaes_decrypt8
2617
2618 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2619 pxor 0x10(%rsp), @XMM[1]
2620 movdqu @XMM[0], 0x00($out) # write output
2621 pxor 0x20(%rsp), @XMM[6]
2622 movdqu @XMM[1], 0x10($out)
2623 pxor 0x30(%rsp), @XMM[4]
2624 movdqu @XMM[6], 0x20($out)
2625 pxor 0x40(%rsp), @XMM[2]
2626 movdqu @XMM[4], 0x30($out)
2627 pxor 0x50(%rsp), @XMM[7]
2628 movdqu @XMM[2], 0x40($out)
2629 movdqu @XMM[7], 0x50($out)
2630 lea 0x60($out), $out
2631
2632 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2633 jmp .Lxts_dec_done
2634 .align 16
2635 .Lxts_dec_5:
2636 pxor @XMM[8+3], @XMM[3]
2637 lea 0x50($inp), $inp
2638 pxor @XMM[8+4], @XMM[4]
2639 lea 0x80(%rsp), %rax # pass key schedule
2640 mov %edx, %r10d # pass rounds
2641
2642 call _bsaes_decrypt8
2643
2644 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2645 pxor 0x10(%rsp), @XMM[1]
2646 movdqu @XMM[0], 0x00($out) # write output
2647 pxor 0x20(%rsp), @XMM[6]
2648 movdqu @XMM[1], 0x10($out)
2649 pxor 0x30(%rsp), @XMM[4]
2650 movdqu @XMM[6], 0x20($out)
2651 pxor 0x40(%rsp), @XMM[2]
2652 movdqu @XMM[4], 0x30($out)
2653 movdqu @XMM[2], 0x40($out)
2654 lea 0x50($out), $out
2655
2656 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2657 jmp .Lxts_dec_done
2658 .align 16
2659 .Lxts_dec_4:
2660 pxor @XMM[8+2], @XMM[2]
2661 lea 0x40($inp), $inp
2662 pxor @XMM[8+3], @XMM[3]
2663 lea 0x80(%rsp), %rax # pass key schedule
2664 mov %edx, %r10d # pass rounds
2665
2666 call _bsaes_decrypt8
2667
2668 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2669 pxor 0x10(%rsp), @XMM[1]
2670 movdqu @XMM[0], 0x00($out) # write output
2671 pxor 0x20(%rsp), @XMM[6]
2672 movdqu @XMM[1], 0x10($out)
2673 pxor 0x30(%rsp), @XMM[4]
2674 movdqu @XMM[6], 0x20($out)
2675 movdqu @XMM[4], 0x30($out)
2676 lea 0x40($out), $out
2677
2678 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2679 jmp .Lxts_dec_done
2680 .align 16
2681 .Lxts_dec_3:
2682 pxor @XMM[8+1], @XMM[1]
2683 lea 0x30($inp), $inp
2684 pxor @XMM[8+2], @XMM[2]
2685 lea 0x80(%rsp), %rax # pass key schedule
2686 mov %edx, %r10d # pass rounds
2687
2688 call _bsaes_decrypt8
2689
2690 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2691 pxor 0x10(%rsp), @XMM[1]
2692 movdqu @XMM[0], 0x00($out) # write output
2693 pxor 0x20(%rsp), @XMM[6]
2694 movdqu @XMM[1], 0x10($out)
2695 movdqu @XMM[6], 0x20($out)
2696 lea 0x30($out), $out
2697
2698 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2699 jmp .Lxts_dec_done
2700 .align 16
2701 .Lxts_dec_2:
2702 pxor @XMM[8+0], @XMM[0]
2703 lea 0x20($inp), $inp
2704 pxor @XMM[8+1], @XMM[1]
2705 lea 0x80(%rsp), %rax # pass key schedule
2706 mov %edx, %r10d # pass rounds
2707
2708 call _bsaes_decrypt8
2709
2710 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2711 pxor 0x10(%rsp), @XMM[1]
2712 movdqu @XMM[0], 0x00($out) # write output
2713 movdqu @XMM[1], 0x10($out)
2714 lea 0x20($out), $out
2715
2716 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2717 jmp .Lxts_dec_done
2718 .align 16
2719 .Lxts_dec_1:
2720 pxor @XMM[0], @XMM[8]
2721 lea 0x10($inp), $inp
2722 movdqa @XMM[8], 0x20(%rbp)
2723 lea 0x20(%rbp), $arg1
2724 lea 0x20(%rbp), $arg2
2725 lea ($key), $arg3
2726 call asm_AES_decrypt # doesn't touch %xmm
2727 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2728 #pxor @XMM[8], @XMM[0]
2729 #lea 0x80(%rsp), %rax # pass key schedule
2730 #mov %edx, %r10d # pass rounds
2731 #call _bsaes_decrypt8
2732 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2733 movdqu @XMM[0], 0x00($out) # write output
2734 lea 0x10($out), $out
2735
2736 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2737
2738 .Lxts_dec_done:
2739 and \$15, %ebx
2740 jz .Lxts_dec_ret
2741
2742 pxor $twtmp, $twtmp
2743 movdqa .Lxts_magic(%rip), $twmask
2744 pcmpgtd @XMM[7], $twtmp
2745 pshufd \$0x13, $twtmp, $twres
2746 movdqa @XMM[7], @XMM[6]
2747 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2748 pand $twmask, $twres # isolate carry and residue
2749 movdqu ($inp), @XMM[0]
2750 pxor $twres, @XMM[7]
2751
2752 lea 0x20(%rbp), $arg1
2753 pxor @XMM[7], @XMM[0]
2754 lea 0x20(%rbp), $arg2
2755 movdqa @XMM[0], 0x20(%rbp)
2756 lea ($key), $arg3
2757 call asm_AES_decrypt # doesn't touch %xmm
2758 pxor 0x20(%rbp), @XMM[7]
2759 mov $out, %rdx
2760 movdqu @XMM[7], ($out)
2761
2762 .Lxts_dec_steal:
2763 movzb 16($inp), %eax
2764 movzb (%rdx), %ecx
2765 lea 1($inp), $inp
2766 mov %al, (%rdx)
2767 mov %cl, 16(%rdx)
2768 lea 1(%rdx), %rdx
2769 sub \$1,%ebx
2770 jnz .Lxts_dec_steal
2771
2772 movdqu ($out), @XMM[0]
2773 lea 0x20(%rbp), $arg1
2774 pxor @XMM[6], @XMM[0]
2775 lea 0x20(%rbp), $arg2
2776 movdqa @XMM[0], 0x20(%rbp)
2777 lea ($key), $arg3
2778 call asm_AES_decrypt # doesn't touch %xmm
2779 pxor 0x20(%rbp), @XMM[6]
2780 movdqu @XMM[6], ($out)
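# decrypt-side stealing runs in the reverse order of the encrypt side
# (sketch; tail = len % 16, D() is AES decryption under key1):
#
#	B      = D(C(n-1) ^ T(n)) ^ T(n);	/* next tweak first    */
#	P(n)   = first tail bytes of B;
#	B'     = C(n) || rest of B;		/* splice stolen bytes */
#	P(n-1) = D(B' ^ T(n-1)) ^ T(n-1);	/* saved tweak @XMM[6] */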
2781
2782 .Lxts_dec_ret:
2783 lea (%rsp), %rax
2784 pxor %xmm0, %xmm0
2785 .Lxts_dec_bzero: # wipe key schedule [if any]
2786 movdqa %xmm0, 0x00(%rax)
2787 movdqa %xmm0, 0x10(%rax)
2788 lea 0x20(%rax), %rax
2789 cmp %rax, %rbp
2790 ja .Lxts_dec_bzero
2791
2792 lea (%rbp),%rsp # restore %rsp
2793 ___
2794 $code.=<<___ if ($win64);
2795 movaps 0x40(%rbp), %xmm6
2796 movaps 0x50(%rbp), %xmm7
2797 movaps 0x60(%rbp), %xmm8
2798 movaps 0x70(%rbp), %xmm9
2799 movaps 0x80(%rbp), %xmm10
2800 movaps 0x90(%rbp), %xmm11
2801 movaps 0xa0(%rbp), %xmm12
2802 movaps 0xb0(%rbp), %xmm13
2803 movaps 0xc0(%rbp), %xmm14
2804 movaps 0xd0(%rbp), %xmm15
2805 lea 0xa0(%rbp), %rsp
2806 ___
2807 $code.=<<___;
2808 mov 0x48(%rsp), %r15
2809 mov 0x50(%rsp), %r14
2810 mov 0x58(%rsp), %r13
2811 mov 0x60(%rsp), %r12
2812 mov 0x68(%rsp), %rbx
2813 mov 0x70(%rsp), %rax
2814 lea 0x78(%rsp), %rsp
2815 mov %rax, %rbp
2816 .Lxts_dec_epilogue:
2817 ret
2818 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2819 ___
2820 }
2821 $code.=<<___;
2822 .type _bsaes_const,\@object
2823 .align 64
2824 _bsaes_const:
2825 .LM0ISR: # InvShiftRows constants
2826 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2827 .LISRM0:
2828 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2829 .LISR:
2830 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2831 .LBS0: # bit-slice constants
2832 .quad 0x5555555555555555, 0x5555555555555555
2833 .LBS1:
2834 .quad 0x3333333333333333, 0x3333333333333333
2835 .LBS2:
2836 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2837 .LSR: # shiftrows constants
2838 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2839 .LSRM0:
2840 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2841 .LM0SR:
2842 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2843 .LSWPUP: # byte-swap upper dword
2844 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2845 .LSWPUPM0SR:
2846 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2847 .LADD1: # counter increment constants
2848 .quad 0x0000000000000000, 0x0000000100000000
2849 .LADD2:
2850 .quad 0x0000000000000000, 0x0000000200000000
2851 .LADD3:
2852 .quad 0x0000000000000000, 0x0000000300000000
2853 .LADD4:
2854 .quad 0x0000000000000000, 0x0000000400000000
2855 .LADD5:
2856 .quad 0x0000000000000000, 0x0000000500000000
2857 .LADD6:
2858 .quad 0x0000000000000000, 0x0000000600000000
2859 .LADD7:
2860 .quad 0x0000000000000000, 0x0000000700000000
2861 .LADD8:
2862 .quad 0x0000000000000000, 0x0000000800000000
2863 .Lxts_magic:
2864 .long 0x87,0,1,0
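# 0x87 = x^7 + x^2 + x + 1, the reduction term of the XTS polynomial;
# the dword 1 in lane 2 propagates the bit-63 carry into bit 64 when
# the pshufd/pand/paddq/pxor sequence above doubles the tweak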
2865 .Lmasks:
2866 .quad 0x0101010101010101, 0x0101010101010101
2867 .quad 0x0202020202020202, 0x0202020202020202
2868 .quad 0x0404040404040404, 0x0404040404040404
2869 .quad 0x0808080808080808, 0x0808080808080808
2870 .LM0:
2871 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2872 .L63:
2873 .quad 0x6363636363636363, 0x6363636363636363
2874 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2875 .align 64
2876 .size _bsaes_const,.-_bsaes_const
2877 ___
2878
2879 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2880 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
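# The handler below tells Windows SEH how to unwind through the
# functions above: it re-derives the frame from the saved %rbp, copies
# %xmm6-15 and the non-volatile GPRs back into the CONTEXT, and lets
# RtlVirtualUnwind continue. HandlerData[] (filled in by the .xdata
# section at the end of the file) is a pair of RVAs bracketing the code
# that runs with the full frame, conceptually:
#
#	struct handler_data { DWORD body_rva, epilogue_rva; };	/* sketch */
#
# a faulting RIP outside [body_rva, epilogue_rva) is handled without
# restoring registers from the frame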
2881 if ($win64) {
2882 $rec="%rcx";
2883 $frame="%rdx";
2884 $context="%r8";
2885 $disp="%r9";
2886
2887 $code.=<<___;
2888 .extern __imp_RtlVirtualUnwind
2889 .type se_handler,\@abi-omnipotent
2890 .align 16
2891 se_handler:
2892 push %rsi
2893 push %rdi
2894 push %rbx
2895 push %rbp
2896 push %r12
2897 push %r13
2898 push %r14
2899 push %r15
2900 pushfq
2901 sub \$64,%rsp
2902
2903 mov 120($context),%rax # pull context->Rax
2904 mov 248($context),%rbx # pull context->Rip
2905
2906 mov 8($disp),%rsi # disp->ImageBase
2907 mov 56($disp),%r11 # disp->HandlerData
2908
2909 mov 0(%r11),%r10d # HandlerData[0]
2910 lea (%rsi,%r10),%r10 # prologue label
2911 cmp %r10,%rbx # context->Rip<prologue label
2912 jb .Lin_prologue
2913
2914 mov 152($context),%rax # pull context->Rsp
2915
2916 mov 4(%r11),%r10d # HandlerData[1]
2917 lea (%rsi,%r10),%r10 # epilogue label
2918 cmp %r10,%rbx # context->Rip>=epilogue label
2919 jae .Lin_prologue
2920
2921 mov 160($context),%rax # pull context->Rbp
2922
2923 lea 0x40(%rax),%rsi # %xmm save area
2924 lea 512($context),%rdi # &context.Xmm6
2925 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2926 .long 0xa548f3fc # cld; rep movsq
2927 lea 0xa0(%rax),%rax # adjust stack pointer
2928
2929 mov 0x70(%rax),%rbp
2930 mov 0x68(%rax),%rbx
2931 mov 0x60(%rax),%r12
2932 mov 0x58(%rax),%r13
2933 mov 0x50(%rax),%r14
2934 mov 0x48(%rax),%r15
2935 lea 0x78(%rax),%rax # adjust stack pointer
2936 mov %rbx,144($context) # restore context->Rbx
2937 mov %rbp,160($context) # restore context->Rbp
2938 mov %r12,216($context) # restore context->R12
2939 mov %r13,224($context) # restore context->R13
2940 mov %r14,232($context) # restore context->R14
2941 mov %r15,240($context) # restore context->R15
2942
2943 .Lin_prologue:
2944 mov %rax,152($context) # restore context->Rsp
2945
2946 mov 40($disp),%rdi # disp->ContextRecord
2947 mov $context,%rsi # context
2948 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2949 .long 0xa548f3fc # cld; rep movsq
2950
2951 mov $disp,%rsi
2952 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2953 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2954 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2955 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2956 mov 40(%rsi),%r10 # disp->ContextRecord
2957 lea 56(%rsi),%r11 # &disp->HandlerData
2958 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2959 mov %r10,32(%rsp) # arg5
2960 mov %r11,40(%rsp) # arg6
2961 mov %r12,48(%rsp) # arg7
2962 mov %rcx,56(%rsp) # arg8, (NULL)
2963 call *__imp_RtlVirtualUnwind(%rip)
2964
2965 mov \$1,%eax # ExceptionContinueSearch
2966 add \$64,%rsp
2967 popfq
2968 pop %r15
2969 pop %r14
2970 pop %r13
2971 pop %r12
2972 pop %rbp
2973 pop %rbx
2974 pop %rdi
2975 pop %rsi
2976 ret
2977 .size se_handler,.-se_handler
2978
2979 .section .pdata
2980 .align 4
2981 ___
2982 $code.=<<___ if ($ecb);
2983 .rva .Lecb_enc_prologue
2984 .rva .Lecb_enc_epilogue
2985 .rva .Lecb_enc_info
2986
2987 .rva .Lecb_dec_prologue
2988 .rva .Lecb_dec_epilogue
2989 .rva .Lecb_dec_info
2990 ___
2991 $code.=<<___;
2992 .rva .Lcbc_dec_prologue
2993 .rva .Lcbc_dec_epilogue
2994 .rva .Lcbc_dec_info
2995
2996 .rva .Lctr_enc_prologue
2997 .rva .Lctr_enc_epilogue
2998 .rva .Lctr_enc_info
2999
3000 .rva .Lxts_enc_prologue
3001 .rva .Lxts_enc_epilogue
3002 .rva .Lxts_enc_info
3003
3004 .rva .Lxts_dec_prologue
3005 .rva .Lxts_dec_epilogue
3006 .rva .Lxts_dec_info
3007
3008 .section .xdata
3009 .align 8
3010 ___
3011 $code.=<<___ if ($ecb);
3012 .Lecb_enc_info:
3013 .byte 9,0,0,0
3014 .rva se_handler
3015 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3016 .Lecb_dec_info:
3017 .byte 9,0,0,0
3018 .rva se_handler
3019 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3020 ___
3021 $code.=<<___;
3022 .Lcbc_dec_info:
3023 .byte 9,0,0,0
3024 .rva se_handler
3025 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3026 .Lctr_enc_info:
3027 .byte 9,0,0,0
3028 .rva se_handler
3029 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3030 .Lxts_enc_info:
3031 .byte 9,0,0,0
3032 .rva se_handler
3033 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3034 .Lxts_dec_info:
3035 .byte 9,0,0,0
3036 .rva se_handler
3037 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3038 ___
3039 }
3040
3041 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
3042
3043 print $code;
3044
3045 close STDOUT;