OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 | |
3 ################################################################### | |
4 ### AES-128 [originally in CTR mode] ### | |
5 ### bitsliced implementation for Intel Core 2 processors ### | |
6 ### requires support of SSE extensions up to SSSE3 ### | |
7 ### Author: Emilia Käsper and Peter Schwabe ### | |
8 ### Date: 2009-03-19 ### | |
9 ### Public domain ### | |
10 ### ### | |
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ### | |
12 ### further information. ### | |
13 ################################################################### | |
14 # | |
15 # September 2011. | |
16 # | |
17 # Started as transliteration to "perlasm" the original code has | |
18 # undergone following changes: | |
19 # | |
20 # - code was made position-independent; | |
21 # - rounds were folded into a loop resulting in >5x size reduction | |
22 # from 12.5KB to 2.2KB; | |
23 # - above was possible thanks to mixcolumns() modification that | |
24 # allowed to feed its output back to aesenc[last], this was | |
25 # achieved at cost of two additional inter-register moves; | |
26 # - some instruction reordering and interleaving; | |
27 # - this module doesn't implement key setup subroutine, instead it | |
28 # relies on conversion of "conventional" key schedule as returned | |
29 # by AES_set_encrypt_key (see discussion below); | |
30 # - first and last round keys are treated differently, which allowed | |
31 # to skip one shiftrows(), reduce bit-sliced key schedule and | |
32 # speed-up conversion by 22%; | |
33 # - support for 192- and 256-bit keys was added; | |
34 # | |
35 # Resulting performance in CPU cycles spent to encrypt one byte out | |
36 # of 4096-byte buffer with 128-bit key is: | |
37 # | |
38 # Emilia's this(*) difference | |
39 # | |
40 # Core 2 9.30 8.69 +7% | |
41 # Nehalem(**) 7.63 6.98 +9% | |
42 # Atom 17.1 17.4 -2%(***) | |
43 # | |
44 # (*) Comparison is not completely fair, because "this" is ECB, | |
45 # i.e. no extra processing such as counter values calculation | |
46 # and xor-ing input as in Emilia's CTR implementation is | |
47 # performed. However, the CTR calculations stand for not more | |
48 # than 1% of total time, so comparison is *rather* fair. | |
49 # | |
50 # (**) Results were collected on Westmere, which is considered to | |
51 # be equivalent to Nehalem for this code. | |
52 # | |
53 # (***) Slowdown on Atom is rather strange per se, because original | |
54 # implementation has a number of 9+-bytes instructions, which | |
55 # are bad for Atom front-end, and which I eliminated completely. | |
56 # In attempt to address deterioration sbox() was tested in FP | |
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of | |
58 # pxor, etc.). While it resulted in nominal 4% improvement on | |
59 # Atom, it hurt Westmere by more than 2x factor. | |
60 # | |
61 # As for key schedule conversion subroutine. Interface to OpenSSL | |
62 # relies on per-invocation on-the-fly conversion. This naturally | |
63 # has impact on performance, especially for short inputs. Conversion | |
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block | |
65 # function is: | |
66 # | |
67 # conversion conversion/8x block | |
68 # Core 2 240 0.22 | |
69 # Nehalem 180 0.20 | |
70 # Atom 430 0.19 | |
71 # | |
72 # The ratio values mean that 128-byte blocks will be processed | |
73 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, | |
74 # etc. Then keep in mind that input sizes not divisible by 128 are | |
75 # *effectively* slower, especially shortest ones, e.g. consecutive | |
76 # 144-byte blocks are processed 44% slower than one would expect, | |
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" | |
78 # it's still faster than ["hyper-threading-safe" code path in] | |
79 # aes-x86_64.pl on all lengths above 64 bytes... | |
80 # | |
81 # October 2011. | |
82 # | |
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt | |
84 # one byte out of 4096-byte buffer with 128-bit key is: | |
85 # | |
86 # Core 2 11.0 | |
87 # Nehalem 9.16 | |
88 # Atom 20.9 | |
89 # | |
90 # November 2011. | |
91 # | |
92 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is | |
93 # suboptimal, but XTS is meant to be used with larger blocks... | |
94 # | |
95 # <appro@openssl.org> | |
96 | |
97 $flavour = shift; | |
98 $output = shift; | |
99 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
100 | |
101 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
102 | |
103 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
104 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
105 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
106 die "can't locate x86_64-xlate.pl"; | |
107 | |
108 open OUT,"| \"$^X\" $xlate $flavour $output"; | |
109 *STDOUT=*OUT; | |
110 | |
111 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); | |
112 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) | |
113 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... | |
114 | |
115 { | |
116 my ($key,$rounds,$const)=("%rax","%r10d","%r11"); | |
117 | |
118 sub Sbox { | |
119 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | |
120 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb | |
121 my @b=@_[0..7]; | |
122 my @t=@_[8..11]; | |
123 my @s=@_[12..15]; | |
124 &InBasisChange (@b); | |
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); | |
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); | |
127 } | |
128 | |
129 sub InBasisChange { | |
130 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | |
131 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb | |
132 my @b=@_[0..7]; | |
133 $code.=<<___; | |
134 pxor @b[6], @b[5] | |
135 pxor @b[1], @b[2] | |
136 pxor @b[0], @b[3] | |
137 pxor @b[2], @b[6] | |
138 pxor @b[0], @b[5] | |
139 | |
140 pxor @b[3], @b[6] | |
141 pxor @b[7], @b[3] | |
142 pxor @b[5], @b[7] | |
143 pxor @b[4], @b[3] | |
144 pxor @b[5], @b[4] | |
145 pxor @b[1], @b[3] | |
146 | |
147 pxor @b[7], @b[2] | |
148 pxor @b[5], @b[1] | |
149 ___ | |
150 } | |
151 | |
152 sub OutBasisChange { | |
153 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | |
154 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb | |
155 my @b=@_[0..7]; | |
156 $code.=<<___; | |
157 pxor @b[6], @b[0] | |
158 pxor @b[4], @b[1] | |
159 pxor @b[0], @b[2] | |
160 pxor @b[6], @b[4] | |
161 pxor @b[1], @b[6] | |
162 | |
163 pxor @b[5], @b[1] | |
164 pxor @b[3], @b[5] | |
165 pxor @b[7], @b[3] | |
166 pxor @b[5], @b[7] | |
167 pxor @b[5], @b[2] | |
168 | |
169 pxor @b[7], @b[4] | |
170 ___ | |
171 } | |
172 | |
173 sub InvSbox { | |
174 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb | |
175 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb | |
176 my @b=@_[0..7]; | |
177 my @t=@_[8..11]; | |
178 my @s=@_[12..15]; | |
179 &InvInBasisChange (@b); | |
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); | |
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); | |
182 } | |
183 | |
184 sub InvInBasisChange { # OutBasisChange in reverse | |
185 my @b=@_[5,1,2,6,3,7,0,4]; | |
186 $code.=<<___ | |
187 pxor @b[7], @b[4] | |
188 | |
189 pxor @b[5], @b[7] | |
190 pxor @b[5], @b[2] | |
191 pxor @b[7], @b[3] | |
192 pxor @b[3], @b[5] | |
193 pxor @b[5], @b[1] | |
194 | |
195 pxor @b[1], @b[6] | |
196 pxor @b[0], @b[2] | |
197 pxor @b[6], @b[4] | |
198 pxor @b[6], @b[0] | |
199 pxor @b[4], @b[1] | |
200 ___ | |
201 } | |
202 | |
203 sub InvOutBasisChange { # InBasisChange in reverse | |
204 my @b=@_[2,5,7,3,6,1,0,4]; | |
205 $code.=<<___; | |
206 pxor @b[5], @b[1] | |
207 pxor @b[7], @b[2] | |
208 | |
209 pxor @b[1], @b[3] | |
210 pxor @b[5], @b[4] | |
211 pxor @b[5], @b[7] | |
212 pxor @b[4], @b[3] | |
213 pxor @b[0], @b[5] | |
214 pxor @b[7], @b[3] | |
215 pxor @b[2], @b[6] | |
216 pxor @b[1], @b[2] | |
217 pxor @b[3], @b[6] | |
218 | |
219 pxor @b[0], @b[3] | |
220 pxor @b[6], @b[5] | |
221 ___ | |
222 } | |
223 | |
224 sub Mul_GF4 { | |
225 #;************************************************************* | |
226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * | |
227 #;************************************************************* | |
228 my ($x0,$x1,$y0,$y1,$t0)=@_; | |
229 $code.=<<___; | |
230 movdqa $y0, $t0 | |
231 pxor $y1, $t0 | |
232 pand $x0, $t0 | |
233 pxor $x1, $x0 | |
234 pand $y0, $x1 | |
235 pand $y1, $x0 | |
236 pxor $x1, $x0 | |
237 pxor $t0, $x1 | |
238 ___ | |
239 } | |
240 | |
241 sub Mul_GF4_N { # not used, see next subroutine | |
242 # multiply and scale by N | |
243 my ($x0,$x1,$y0,$y1,$t0)=@_; | |
244 $code.=<<___; | |
245 movdqa $y0, $t0 | |
246 pxor $y1, $t0 | |
247 pand $x0, $t0 | |
248 pxor $x1, $x0 | |
249 pand $y0, $x1 | |
250 pand $y1, $x0 | |
251 pxor $x0, $x1 | |
252 pxor $t0, $x0 | |
253 ___ | |
254 } | |
255 | |
256 sub Mul_GF4_N_GF4 { | |
257 # interleaved Mul_GF4_N and Mul_GF4 | |
258 my ($x0,$x1,$y0,$y1,$t0, | |
259 $x2,$x3,$y2,$y3,$t1)=@_; | |
260 $code.=<<___; | |
261 movdqa $y0, $t0 | |
262 movdqa $y2, $t1 | |
263 pxor $y1, $t0 | |
264 pxor $y3, $t1 | |
265 pand $x0, $t0 | |
266 pand $x2, $t1 | |
267 pxor $x1, $x0 | |
268 pxor $x3, $x2 | |
269 pand $y0, $x1 | |
270 pand $y2, $x3 | |
271 pand $y1, $x0 | |
272 pand $y3, $x2 | |
273 pxor $x0, $x1 | |
274 pxor $x3, $x2 | |
275 pxor $t0, $x0 | |
276 pxor $t1, $x3 | |
277 ___ | |
278 } | |
279 sub Mul_GF16_2 { | |
280 my @x=@_[0..7]; | |
281 my @y=@_[8..11]; | |
282 my @t=@_[12..15]; | |
283 $code.=<<___; | |
284 movdqa @x[0], @t[0] | |
285 movdqa @x[1], @t[1] | |
286 ___ | |
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); | |
288 $code.=<<___; | |
289 pxor @x[2], @t[0] | |
290 pxor @x[3], @t[1] | |
291 pxor @y[2], @y[0] | |
292 pxor @y[3], @y[1] | |
293 ___ | |
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | |
295 @x[2], @x[3], @y[2], @y[3], @t[2]); | |
296 $code.=<<___; | |
297 pxor @t[0], @x[0] | |
298 pxor @t[0], @x[2] | |
299 pxor @t[1], @x[1] | |
300 pxor @t[1], @x[3] | |
301 | |
302 movdqa @x[4], @t[0] | |
303 movdqa @x[5], @t[1] | |
304 pxor @x[6], @t[0] | |
305 pxor @x[7], @t[1] | |
306 ___ | |
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], | |
308 @x[6], @x[7], @y[2], @y[3], @t[2]); | |
309 $code.=<<___; | |
310 pxor @y[2], @y[0] | |
311 pxor @y[3], @y[1] | |
312 ___ | |
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); | |
314 $code.=<<___; | |
315 pxor @t[0], @x[4] | |
316 pxor @t[0], @x[6] | |
317 pxor @t[1], @x[5] | |
318 pxor @t[1], @x[7] | |
319 ___ | |
320 } | |
321 sub Inv_GF256 { | |
322 #;******************************************************************** | |
323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * | |
324 #;******************************************************************** | |
325 my @x=@_[0..7]; | |
326 my @t=@_[8..11]; | |
327 my @s=@_[12..15]; | |
328 # direct optimizations from hardware | |
329 $code.=<<___; | |
330 movdqa @x[4], @t[3] | |
331 movdqa @x[5], @t[2] | |
332 movdqa @x[1], @t[1] | |
333 movdqa @x[7], @s[1] | |
334 movdqa @x[0], @s[0] | |
335 | |
336 pxor @x[6], @t[3] | |
337 pxor @x[7], @t[2] | |
338 pxor @x[3], @t[1] | |
339 movdqa @t[3], @s[2] | |
340 pxor @x[6], @s[1] | |
341 movdqa @t[2], @t[0] | |
342 pxor @x[2], @s[0] | |
343 movdqa @t[3], @s[3] | |
344 | |
345 por @t[1], @t[2] | |
346 por @s[0], @t[3] | |
347 pxor @t[0], @s[3] | |
348 pand @s[0], @s[2] | |
349 pxor @t[1], @s[0] | |
350 pand @t[1], @t[0] | |
351 pand @s[0], @s[3] | |
352 movdqa @x[3], @s[0] | |
353 pxor @x[2], @s[0] | |
354 pand @s[0], @s[1] | |
355 pxor @s[1], @t[3] | |
356 pxor @s[1], @t[2] | |
357 movdqa @x[4], @s[1] | |
358 movdqa @x[1], @s[0] | |
359 pxor @x[5], @s[1] | |
360 pxor @x[0], @s[0] | |
361 movdqa @s[1], @t[1] | |
362 pand @s[0], @s[1] | |
363 por @s[0], @t[1] | |
364 pxor @s[1], @t[0] | |
365 pxor @s[3], @t[3] | |
366 pxor @s[2], @t[2] | |
367 pxor @s[3], @t[1] | |
368 movdqa @x[7], @s[0] | |
369 pxor @s[2], @t[0] | |
370 movdqa @x[6], @s[1] | |
371 pxor @s[2], @t[1] | |
372 movdqa @x[5], @s[2] | |
373 pand @x[3], @s[0] | |
374 movdqa @x[4], @s[3] | |
375 pand @x[2], @s[1] | |
376 pand @x[1], @s[2] | |
377 por @x[0], @s[3] | |
378 pxor @s[0], @t[3] | |
379 pxor @s[1], @t[2] | |
380 pxor @s[2], @t[1] | |
381 pxor @s[3], @t[0] | |
382 | |
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 | |
384 | |
385 # new smaller inversion | |
386 | |
387 movdqa @t[3], @s[0] | |
388 pand @t[1], @t[3] | |
389 pxor @t[2], @s[0] | |
390 | |
391 movdqa @t[0], @s[2] | |
392 movdqa @s[0], @s[3] | |
393 pxor @t[3], @s[2] | |
394 pand @s[2], @s[3] | |
395 | |
396 movdqa @t[1], @s[1] | |
397 pxor @t[2], @s[3] | |
398 pxor @t[0], @s[1] | |
399 | |
400 pxor @t[2], @t[3] | |
401 | |
402 pand @t[3], @s[1] | |
403 | |
404 movdqa @s[2], @t[2] | |
405 pxor @t[0], @s[1] | |
406 | |
407 pxor @s[1], @t[2] | |
408 pxor @s[1], @t[1] | |
409 | |
410 pand @t[0], @t[2] | |
411 | |
412 pxor @t[2], @s[2] | |
413 pxor @t[2], @t[1] | |
414 | |
415 pand @s[3], @s[2] | |
416 | |
417 pxor @s[0], @s[2] | |
418 ___ | |
419 # output in s3, s2, s1, t1 | |
420 | |
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s
1, \s2, \s3 | |
422 | |
423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t
0, \t2, \t3 | |
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); | |
425 | |
426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb | |
427 } | |
428 | |
429 # AES linear components | |
430 | |
431 sub ShiftRows { | |
432 my @x=@_[0..7]; | |
433 my $mask=pop; | |
434 $code.=<<___; | |
435 pxor 0x00($key),@x[0] | |
436 pxor 0x10($key),@x[1] | |
437 pshufb $mask,@x[0] | |
438 pxor 0x20($key),@x[2] | |
439 pshufb $mask,@x[1] | |
440 pxor 0x30($key),@x[3] | |
441 pshufb $mask,@x[2] | |
442 pxor 0x40($key),@x[4] | |
443 pshufb $mask,@x[3] | |
444 pxor 0x50($key),@x[5] | |
445 pshufb $mask,@x[4] | |
446 pxor 0x60($key),@x[6] | |
447 pshufb $mask,@x[5] | |
448 pxor 0x70($key),@x[7] | |
449 pshufb $mask,@x[6] | |
450 lea 0x80($key),$key | |
451 pshufb $mask,@x[7] | |
452 ___ | |
453 } | |
454 | |
455 sub MixColumns { | |
456 # modified to emit output in order suitable for feeding back to aesenc[last] | |
457 my @x=@_[0..7]; | |
458 my @t=@_[8..15]; | |
459 $code.=<<___; | |
460 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 | |
461 pshufd \$0x93, @x[1], @t[1] | |
462 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) | |
463 pshufd \$0x93, @x[2], @t[2] | |
464 pxor @t[1], @x[1] | |
465 pshufd \$0x93, @x[3], @t[3] | |
466 pxor @t[2], @x[2] | |
467 pshufd \$0x93, @x[4], @t[4] | |
468 pxor @t[3], @x[3] | |
469 pshufd \$0x93, @x[5], @t[5] | |
470 pxor @t[4], @x[4] | |
471 pshufd \$0x93, @x[6], @t[6] | |
472 pxor @t[5], @x[5] | |
473 pshufd \$0x93, @x[7], @t[7] | |
474 pxor @t[6], @x[6] | |
475 pxor @t[7], @x[7] | |
476 | |
477 pxor @x[0], @t[1] | |
478 pxor @x[7], @t[0] | |
479 pxor @x[7], @t[1] | |
480 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) | |
481 pxor @x[1], @t[2] | |
482 pshufd \$0x4E, @x[1], @x[1] | |
483 pxor @x[4], @t[5] | |
484 pxor @t[0], @x[0] | |
485 pxor @x[5], @t[6] | |
486 pxor @t[1], @x[1] | |
487 pxor @x[3], @t[4] | |
488 pshufd \$0x4E, @x[4], @t[0] | |
489 pxor @x[6], @t[7] | |
490 pshufd \$0x4E, @x[5], @t[1] | |
491 pxor @x[2], @t[3] | |
492 pshufd \$0x4E, @x[3], @x[4] | |
493 pxor @x[7], @t[3] | |
494 pshufd \$0x4E, @x[7], @x[5] | |
495 pxor @x[7], @t[4] | |
496 pshufd \$0x4E, @x[6], @x[3] | |
497 pxor @t[4], @t[0] | |
498 pshufd \$0x4E, @x[2], @x[6] | |
499 pxor @t[5], @t[1] | |
500 | |
501 pxor @t[3], @x[4] | |
502 pxor @t[7], @x[5] | |
503 pxor @t[6], @x[3] | |
504 movdqa @t[0], @x[2] | |
505 pxor @t[2], @x[6] | |
506 movdqa @t[1], @x[7] | |
507 ___ | |
508 } | |
509 | |
510 sub InvMixColumns { | |
511 my @x=@_[0..7]; | |
512 my @t=@_[8..15]; | |
513 | |
514 $code.=<<___; | |
515 # multiplication by 0x0e | |
516 pshufd \$0x93, @x[7], @t[7] | |
517 movdqa @x[2], @t[2] | |
518 pxor @x[5], @x[7] # 7 5 | |
519 pxor @x[5], @x[2] # 2 5 | |
520 pshufd \$0x93, @x[0], @t[0] | |
521 movdqa @x[5], @t[5] | |
522 pxor @x[0], @x[5] # 5 0 [1] | |
523 pxor @x[1], @x[0] # 0 1 | |
524 pshufd \$0x93, @x[1], @t[1] | |
525 pxor @x[2], @x[1] # 1 25 | |
526 pxor @x[6], @x[0] # 01 6 [2] | |
527 pxor @x[3], @x[1] # 125 3 [4] | |
528 pshufd \$0x93, @x[3], @t[3] | |
529 pxor @x[0], @x[2] # 25 016 [3] | |
530 pxor @x[7], @x[3] # 3 75 | |
531 pxor @x[6], @x[7] # 75 6 [0] | |
532 pshufd \$0x93, @x[6], @t[6] | |
533 movdqa @x[4], @t[4] | |
534 pxor @x[4], @x[6] # 6 4 | |
535 pxor @x[3], @x[4] # 4 375 [6] | |
536 pxor @x[7], @x[3] # 375 756=36 | |
537 pxor @t[5], @x[6] # 64 5 [7] | |
538 pxor @t[2], @x[3] # 36 2 | |
539 pxor @t[4], @x[3] # 362 4 [5] | |
540 pshufd \$0x93, @t[5], @t[5] | |
541 ___ | |
542 my @y = @x[7,5,0,2,1,3,4,6]; | |
543 $code.=<<___; | |
544 # multiplication by 0x0b | |
545 pxor @y[0], @y[1] | |
546 pxor @t[0], @y[0] | |
547 pxor @t[1], @y[1] | |
548 pshufd \$0x93, @t[2], @t[2] | |
549 pxor @t[5], @y[0] | |
550 pxor @t[6], @y[1] | |
551 pxor @t[7], @y[0] | |
552 pshufd \$0x93, @t[4], @t[4] | |
553 pxor @t[6], @t[7] # clobber t[7] | |
554 pxor @y[0], @y[1] | |
555 | |
556 pxor @t[0], @y[3] | |
557 pshufd \$0x93, @t[0], @t[0] | |
558 pxor @t[1], @y[2] | |
559 pxor @t[1], @y[4] | |
560 pxor @t[2], @y[2] | |
561 pshufd \$0x93, @t[1], @t[1] | |
562 pxor @t[2], @y[3] | |
563 pxor @t[2], @y[5] | |
564 pxor @t[7], @y[2] | |
565 pshufd \$0x93, @t[2], @t[2] | |
566 pxor @t[3], @y[3] | |
567 pxor @t[3], @y[6] | |
568 pxor @t[3], @y[4] | |
569 pshufd \$0x93, @t[3], @t[3] | |
570 pxor @t[4], @y[7] | |
571 pxor @t[4], @y[5] | |
572 pxor @t[7], @y[7] | |
573 pxor @t[5], @y[3] | |
574 pxor @t[4], @y[4] | |
575 pxor @t[5], @t[7] # clobber t[7] even more | |
576 | |
577 pxor @t[7], @y[5] | |
578 pshufd \$0x93, @t[4], @t[4] | |
579 pxor @t[7], @y[6] | |
580 pxor @t[7], @y[4] | |
581 | |
582 pxor @t[5], @t[7] | |
583 pshufd \$0x93, @t[5], @t[5] | |
584 pxor @t[6], @t[7] # restore t[7] | |
585 | |
586 # multiplication by 0x0d | |
587 pxor @y[7], @y[4] | |
588 pxor @t[4], @y[7] | |
589 pshufd \$0x93, @t[6], @t[6] | |
590 pxor @t[0], @y[2] | |
591 pxor @t[5], @y[7] | |
592 pxor @t[2], @y[2] | |
593 pshufd \$0x93, @t[7], @t[7] | |
594 | |
595 pxor @y[1], @y[3] | |
596 pxor @t[1], @y[1] | |
597 pxor @t[0], @y[0] | |
598 pxor @t[0], @y[3] | |
599 pxor @t[5], @y[1] | |
600 pxor @t[5], @y[0] | |
601 pxor @t[7], @y[1] | |
602 pshufd \$0x93, @t[0], @t[0] | |
603 pxor @t[6], @y[0] | |
604 pxor @y[1], @y[3] | |
605 pxor @t[1], @y[4] | |
606 pshufd \$0x93, @t[1], @t[1] | |
607 | |
608 pxor @t[7], @y[7] | |
609 pxor @t[2], @y[4] | |
610 pxor @t[2], @y[5] | |
611 pshufd \$0x93, @t[2], @t[2] | |
612 pxor @t[6], @y[2] | |
613 pxor @t[3], @t[6] # clobber t[6] | |
614 pxor @y[7], @y[4] | |
615 pxor @t[6], @y[3] | |
616 | |
617 pxor @t[6], @y[6] | |
618 pxor @t[5], @y[5] | |
619 pxor @t[4], @y[6] | |
620 pshufd \$0x93, @t[4], @t[4] | |
621 pxor @t[6], @y[5] | |
622 pxor @t[7], @y[6] | |
623 pxor @t[3], @t[6] # restore t[6] | |
624 | |
625 pshufd \$0x93, @t[5], @t[5] | |
626 pshufd \$0x93, @t[6], @t[6] | |
627 pshufd \$0x93, @t[7], @t[7] | |
628 pshufd \$0x93, @t[3], @t[3] | |
629 | |
630 # multiplication by 0x09 | |
631 pxor @y[1], @y[4] | |
632 pxor @y[1], @t[1] # t[1]=y[1] | |
633 pxor @t[5], @t[0] # clobber t[0] | |
634 pxor @t[5], @t[1] | |
635 pxor @t[0], @y[3] | |
636 pxor @y[0], @t[0] # t[0]=y[0] | |
637 pxor @t[6], @t[1] | |
638 pxor @t[7], @t[6] # clobber t[6] | |
639 pxor @t[1], @y[4] | |
640 pxor @t[4], @y[7] | |
641 pxor @y[4], @t[4] # t[4]=y[4] | |
642 pxor @t[3], @y[6] | |
643 pxor @y[3], @t[3] # t[3]=y[3] | |
644 pxor @t[2], @y[5] | |
645 pxor @y[2], @t[2] # t[2]=y[2] | |
646 pxor @t[7], @t[3] | |
647 pxor @y[5], @t[5] # t[5]=y[5] | |
648 pxor @t[6], @t[2] | |
649 pxor @t[6], @t[5] | |
650 pxor @y[6], @t[6] # t[6]=y[6] | |
651 pxor @y[7], @t[7] # t[7]=y[7] | |
652 | |
653 movdqa @t[0],@XMM[0] | |
654 movdqa @t[1],@XMM[1] | |
655 movdqa @t[2],@XMM[2] | |
656 movdqa @t[3],@XMM[3] | |
657 movdqa @t[4],@XMM[4] | |
658 movdqa @t[5],@XMM[5] | |
659 movdqa @t[6],@XMM[6] | |
660 movdqa @t[7],@XMM[7] | |
661 ___ | |
662 } | |
663 | |
664 sub aesenc { # not used | |
665 my @b=@_[0..7]; | |
666 my @t=@_[8..15]; | |
667 $code.=<<___; | |
668 movdqa 0x30($const),@t[0] # .LSR | |
669 ___ | |
670 &ShiftRows (@b,@t[0]); | |
671 &Sbox (@b,@t); | |
672 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); | |
673 } | |
674 | |
675 sub aesenclast { # not used | |
676 my @b=@_[0..7]; | |
677 my @t=@_[8..15]; | |
678 $code.=<<___; | |
679 movdqa 0x40($const),@t[0] # .LSRM0 | |
680 ___ | |
681 &ShiftRows (@b,@t[0]); | |
682 &Sbox (@b,@t); | |
683 $code.=<<___ | |
684 pxor 0x00($key),@b[0] | |
685 pxor 0x10($key),@b[1] | |
686 pxor 0x20($key),@b[4] | |
687 pxor 0x30($key),@b[6] | |
688 pxor 0x40($key),@b[3] | |
689 pxor 0x50($key),@b[7] | |
690 pxor 0x60($key),@b[2] | |
691 pxor 0x70($key),@b[5] | |
692 ___ | |
693 } | |
694 | |
695 sub swapmove { | |
696 my ($a,$b,$n,$mask,$t)=@_; | |
697 $code.=<<___; | |
698 movdqa $b,$t | |
699 psrlq \$$n,$b | |
700 pxor $a,$b | |
701 pand $mask,$b | |
702 pxor $b,$a | |
703 psllq \$$n,$b | |
704 pxor $t,$b | |
705 ___ | |
706 } | |
707 sub swapmove2x { | |
708 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; | |
709 $code.=<<___; | |
710 movdqa $b0,$t0 | |
711 psrlq \$$n,$b0 | |
712 movdqa $b1,$t1 | |
713 psrlq \$$n,$b1 | |
714 pxor $a0,$b0 | |
715 pxor $a1,$b1 | |
716 pand $mask,$b0 | |
717 pand $mask,$b1 | |
718 pxor $b0,$a0 | |
719 psllq \$$n,$b0 | |
720 pxor $b1,$a1 | |
721 psllq \$$n,$b1 | |
722 pxor $t0,$b0 | |
723 pxor $t1,$b1 | |
724 ___ | |
725 } | |
726 | |
727 sub bitslice { | |
728 my @x=reverse(@_[0..7]); | |
729 my ($t0,$t1,$t2,$t3)=@_[8..11]; | |
730 $code.=<<___; | |
731 movdqa 0x00($const),$t0 # .LBS0 | |
732 movdqa 0x10($const),$t1 # .LBS1 | |
733 ___ | |
734 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); | |
735 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | |
736 $code.=<<___; | |
737 movdqa 0x20($const),$t0 # .LBS2 | |
738 ___ | |
739 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); | |
740 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | |
741 | |
742 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); | |
743 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); | |
744 } | |
745 | |
746 $code.=<<___; | |
747 .text | |
748 | |
749 .extern asm_AES_encrypt | |
750 .extern asm_AES_decrypt | |
751 | |
752 .type _bsaes_encrypt8,\@abi-omnipotent | |
753 .align 64 | |
754 _bsaes_encrypt8: | |
755 lea .LBS0(%rip), $const # constants table | |
756 | |
757 movdqa ($key), @XMM[9] # round 0 key | |
758 lea 0x10($key), $key | |
759 movdqa 0x50($const), @XMM[8] # .LM0SR | |
760 pxor @XMM[9], @XMM[0] # xor with round0 key | |
761 pxor @XMM[9], @XMM[1] | |
762 pshufb @XMM[8], @XMM[0] | |
763 pxor @XMM[9], @XMM[2] | |
764 pshufb @XMM[8], @XMM[1] | |
765 pxor @XMM[9], @XMM[3] | |
766 pshufb @XMM[8], @XMM[2] | |
767 pxor @XMM[9], @XMM[4] | |
768 pshufb @XMM[8], @XMM[3] | |
769 pxor @XMM[9], @XMM[5] | |
770 pshufb @XMM[8], @XMM[4] | |
771 pxor @XMM[9], @XMM[6] | |
772 pshufb @XMM[8], @XMM[5] | |
773 pxor @XMM[9], @XMM[7] | |
774 pshufb @XMM[8], @XMM[6] | |
775 pshufb @XMM[8], @XMM[7] | |
776 _bsaes_encrypt8_bitslice: | |
777 ___ | |
778 &bitslice (@XMM[0..7, 8..11]); | |
779 $code.=<<___; | |
780 dec $rounds | |
781 jmp .Lenc_sbox | |
782 .align 16 | |
783 .Lenc_loop: | |
784 ___ | |
785 &ShiftRows (@XMM[0..7, 8]); | |
786 $code.=".Lenc_sbox:\n"; | |
787 &Sbox (@XMM[0..7, 8..15]); | |
788 $code.=<<___; | |
789 dec $rounds | |
790 jl .Lenc_done | |
791 ___ | |
792 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); | |
793 $code.=<<___; | |
794 movdqa 0x30($const), @XMM[8] # .LSR | |
795 jnz .Lenc_loop | |
796 movdqa 0x40($const), @XMM[8] # .LSRM0 | |
797 jmp .Lenc_loop | |
798 .align 16 | |
799 .Lenc_done: | |
800 ___ | |
801 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb | |
802 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); | |
803 $code.=<<___; | |
804 movdqa ($key), @XMM[8] # last round key | |
805 pxor @XMM[8], @XMM[4] | |
806 pxor @XMM[8], @XMM[6] | |
807 pxor @XMM[8], @XMM[3] | |
808 pxor @XMM[8], @XMM[7] | |
809 pxor @XMM[8], @XMM[2] | |
810 pxor @XMM[8], @XMM[5] | |
811 pxor @XMM[8], @XMM[0] | |
812 pxor @XMM[8], @XMM[1] | |
813 ret | |
814 .size _bsaes_encrypt8,.-_bsaes_encrypt8 | |
815 | |
816 .type _bsaes_decrypt8,\@abi-omnipotent | |
817 .align 64 | |
818 _bsaes_decrypt8: | |
819 lea .LBS0(%rip), $const # constants table | |
820 | |
821 movdqa ($key), @XMM[9] # round 0 key | |
822 lea 0x10($key), $key | |
823 movdqa -0x30($const), @XMM[8] # .LM0ISR | |
824 pxor @XMM[9], @XMM[0] # xor with round0 key | |
825 pxor @XMM[9], @XMM[1] | |
826 pshufb @XMM[8], @XMM[0] | |
827 pxor @XMM[9], @XMM[2] | |
828 pshufb @XMM[8], @XMM[1] | |
829 pxor @XMM[9], @XMM[3] | |
830 pshufb @XMM[8], @XMM[2] | |
831 pxor @XMM[9], @XMM[4] | |
832 pshufb @XMM[8], @XMM[3] | |
833 pxor @XMM[9], @XMM[5] | |
834 pshufb @XMM[8], @XMM[4] | |
835 pxor @XMM[9], @XMM[6] | |
836 pshufb @XMM[8], @XMM[5] | |
837 pxor @XMM[9], @XMM[7] | |
838 pshufb @XMM[8], @XMM[6] | |
839 pshufb @XMM[8], @XMM[7] | |
840 ___ | |
841 &bitslice (@XMM[0..7, 8..11]); | |
842 $code.=<<___; | |
843 dec $rounds | |
844 jmp .Ldec_sbox | |
845 .align 16 | |
846 .Ldec_loop: | |
847 ___ | |
848 &ShiftRows (@XMM[0..7, 8]); | |
849 $code.=".Ldec_sbox:\n"; | |
850 &InvSbox (@XMM[0..7, 8..15]); | |
851 $code.=<<___; | |
852 dec $rounds | |
853 jl .Ldec_done | |
854 ___ | |
855 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); | |
856 $code.=<<___; | |
857 movdqa -0x10($const), @XMM[8] # .LISR | |
858 jnz .Ldec_loop | |
859 movdqa -0x20($const), @XMM[8] # .LISRM0 | |
860 jmp .Ldec_loop | |
861 .align 16 | |
862 .Ldec_done: | |
863 ___ | |
864 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); | |
865 $code.=<<___; | |
866 movdqa ($key), @XMM[8] # last round key | |
867 pxor @XMM[8], @XMM[6] | |
868 pxor @XMM[8], @XMM[4] | |
869 pxor @XMM[8], @XMM[2] | |
870 pxor @XMM[8], @XMM[7] | |
871 pxor @XMM[8], @XMM[3] | |
872 pxor @XMM[8], @XMM[5] | |
873 pxor @XMM[8], @XMM[0] | |
874 pxor @XMM[8], @XMM[1] | |
875 ret | |
876 .size _bsaes_decrypt8,.-_bsaes_decrypt8 | |
877 ___ | |
878 } | |
879 { | |
880 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); | |
881 | |
882 sub bitslice_key { | |
883 my @x=reverse(@_[0..7]); | |
884 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; | |
885 | |
886 &swapmove (@x[0,1],1,$bs0,$t2,$t3); | |
887 $code.=<<___; | |
888 #&swapmove(@x[2,3],1,$t0,$t2,$t3); | |
889 movdqa @x[0], @x[2] | |
890 movdqa @x[1], @x[3] | |
891 ___ | |
892 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); | |
893 | |
894 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); | |
895 $code.=<<___; | |
896 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); | |
897 movdqa @x[0], @x[4] | |
898 movdqa @x[2], @x[6] | |
899 movdqa @x[1], @x[5] | |
900 movdqa @x[3], @x[7] | |
901 ___ | |
902 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); | |
903 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); | |
904 } | |
905 | |
906 $code.=<<___; | |
907 .type _bsaes_key_convert,\@abi-omnipotent | |
908 .align 16 | |
909 _bsaes_key_convert: | |
910 lea .Lmasks(%rip), $const | |
911 movdqu ($inp), %xmm7 # load round 0 key | |
912 lea 0x10($inp), $inp | |
913 movdqa 0x00($const), %xmm0 # 0x01... | |
914 movdqa 0x10($const), %xmm1 # 0x02... | |
915 movdqa 0x20($const), %xmm2 # 0x04... | |
916 movdqa 0x30($const), %xmm3 # 0x08... | |
917 movdqa 0x40($const), %xmm4 # .LM0 | |
918 pcmpeqd %xmm5, %xmm5 # .LNOT | |
919 | |
920 movdqu ($inp), %xmm6 # load round 1 key | |
921 movdqa %xmm7, ($out) # save round 0 key | |
922 lea 0x10($out), $out | |
923 dec $rounds | |
924 jmp .Lkey_loop | |
925 .align 16 | |
926 .Lkey_loop: | |
927 pshufb %xmm4, %xmm6 # .LM0 | |
928 | |
929 movdqa %xmm0, %xmm8 | |
930 movdqa %xmm1, %xmm9 | |
931 | |
932 pand %xmm6, %xmm8 | |
933 pand %xmm6, %xmm9 | |
934 movdqa %xmm2, %xmm10 | |
935 pcmpeqb %xmm0, %xmm8 | |
936 psllq \$4, %xmm0 # 0x10... | |
937 movdqa %xmm3, %xmm11 | |
938 pcmpeqb %xmm1, %xmm9 | |
939 psllq \$4, %xmm1 # 0x20... | |
940 | |
941 pand %xmm6, %xmm10 | |
942 pand %xmm6, %xmm11 | |
943 movdqa %xmm0, %xmm12 | |
944 pcmpeqb %xmm2, %xmm10 | |
945 psllq \$4, %xmm2 # 0x40... | |
946 movdqa %xmm1, %xmm13 | |
947 pcmpeqb %xmm3, %xmm11 | |
948 psllq \$4, %xmm3 # 0x80... | |
949 | |
950 movdqa %xmm2, %xmm14 | |
951 movdqa %xmm3, %xmm15 | |
952 pxor %xmm5, %xmm8 # "pnot" | |
953 pxor %xmm5, %xmm9 | |
954 | |
955 pand %xmm6, %xmm12 | |
956 pand %xmm6, %xmm13 | |
957 movdqa %xmm8, 0x00($out) # write bit-sliced round key | |
958 pcmpeqb %xmm0, %xmm12 | |
959 psrlq \$4, %xmm0 # 0x01... | |
960 movdqa %xmm9, 0x10($out) | |
961 pcmpeqb %xmm1, %xmm13 | |
962 psrlq \$4, %xmm1 # 0x02... | |
963 lea 0x10($inp), $inp | |
964 | |
965 pand %xmm6, %xmm14 | |
966 pand %xmm6, %xmm15 | |
967 movdqa %xmm10, 0x20($out) | |
968 pcmpeqb %xmm2, %xmm14 | |
969 psrlq \$4, %xmm2 # 0x04... | |
970 movdqa %xmm11, 0x30($out) | |
971 pcmpeqb %xmm3, %xmm15 | |
972 psrlq \$4, %xmm3 # 0x08... | |
973 movdqu ($inp), %xmm6 # load next round key | |
974 | |
975 pxor %xmm5, %xmm13 # "pnot" | |
976 pxor %xmm5, %xmm14 | |
977 movdqa %xmm12, 0x40($out) | |
978 movdqa %xmm13, 0x50($out) | |
979 movdqa %xmm14, 0x60($out) | |
980 movdqa %xmm15, 0x70($out) | |
981 lea 0x80($out),$out | |
982 dec $rounds | |
983 jnz .Lkey_loop | |
984 | |
985 movdqa 0x50($const), %xmm7 # .L63 | |
986 #movdqa %xmm6, ($out) # don't save last round key | |
987 ret | |
988 .size _bsaes_key_convert,.-_bsaes_key_convert | |
989 ___ | |
990 } | |
991 | |
992 if (0 && !$win64) { # following four functions are unsupported interface | |
993 # used for benchmarking... | |
994 $code.=<<___; | |
995 .globl bsaes_enc_key_convert | |
996 .type bsaes_enc_key_convert,\@function,2 | |
997 .align 16 | |
998 bsaes_enc_key_convert: | |
999 mov 240($inp),%r10d # pass rounds | |
1000 mov $inp,%rcx # pass key | |
1001 mov $out,%rax # pass key schedule | |
1002 call _bsaes_key_convert | |
1003 pxor %xmm6,%xmm7 # fix up last round key | |
1004 movdqa %xmm7,(%rax) # save last round key | |
1005 ret | |
1006 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert | |
1007 | |
1008 .globl bsaes_encrypt_128 | |
1009 .type bsaes_encrypt_128,\@function,4 | |
1010 .align 16 | |
1011 bsaes_encrypt_128: | |
1012 .Lenc128_loop: | |
1013 movdqu 0x00($inp), @XMM[0] # load input | |
1014 movdqu 0x10($inp), @XMM[1] | |
1015 movdqu 0x20($inp), @XMM[2] | |
1016 movdqu 0x30($inp), @XMM[3] | |
1017 movdqu 0x40($inp), @XMM[4] | |
1018 movdqu 0x50($inp), @XMM[5] | |
1019 movdqu 0x60($inp), @XMM[6] | |
1020 movdqu 0x70($inp), @XMM[7] | |
1021 mov $key, %rax # pass the $key | |
1022 lea 0x80($inp), $inp | |
1023 mov \$10,%r10d | |
1024 | |
1025 call _bsaes_encrypt8 | |
1026 | |
1027 movdqu @XMM[0], 0x00($out) # write output | |
1028 movdqu @XMM[1], 0x10($out) | |
1029 movdqu @XMM[4], 0x20($out) | |
1030 movdqu @XMM[6], 0x30($out) | |
1031 movdqu @XMM[3], 0x40($out) | |
1032 movdqu @XMM[7], 0x50($out) | |
1033 movdqu @XMM[2], 0x60($out) | |
1034 movdqu @XMM[5], 0x70($out) | |
1035 lea 0x80($out), $out | |
1036 sub \$0x80,$len | |
1037 ja .Lenc128_loop | |
1038 ret | |
1039 .size bsaes_encrypt_128,.-bsaes_encrypt_128 | |
1040 | |
1041 .globl bsaes_dec_key_convert | |
1042 .type bsaes_dec_key_convert,\@function,2 | |
1043 .align 16 | |
1044 bsaes_dec_key_convert: | |
1045 mov 240($inp),%r10d # pass rounds | |
1046 mov $inp,%rcx # pass key | |
1047 mov $out,%rax # pass key schedule | |
1048 call _bsaes_key_convert | |
1049 pxor ($out),%xmm7 # fix up round 0 key | |
1050 movdqa %xmm6,(%rax) # save last round key | |
1051 movdqa %xmm7,($out) | |
1052 ret | |
1053 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert | |
1054 | |
.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
# Bulk AES-128 decryption counterpart of bsaes_encrypt_128: 8 blocks per
# iteration, 10 rounds fixed, bit-sliced key schedule at $key.
# NOTE(review): assumes $len is a positive multiple of 0x80 — TODO confirm.
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d
	
	call	_bsaes_decrypt8
	
	# _bsaes_decrypt8's output permutation differs from the encrypt
	# one: block order is restored by storing 0,1,6,4,2,7,3,5.
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1087 ___ | |
1088 } | |
1089 { | |
1090 ###################################################################### | |
1091 # | |
1092 # OpenSSL interface | |
1093 # | |
1094 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r
10","%r11d") | |
1095 : ("%rdi","%rsi","%rdx","%rcx","
%r8","%r9d"); | |
1096 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); | |
1097 | |
1098 if ($ecb) { | |
1099 $code.=<<___; | |
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
# OpenSSL entry point: ECB-encrypt $arg3 16-byte blocks from $arg1 to
# $arg2 with AES_KEY *$arg4. Converts the key schedule to bit-sliced
# form on the stack when at least 8 blocks are requested; otherwise
# falls back to asm_AES_encrypt one block at a time (.Lecb_enc_short).
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	# xmm6-xmm15 are non-volatile on Win64 and must be preserved.
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short
	
	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
1148 | |
	sub	\$8,$len
	# Main loop: encrypt 8 blocks per iteration; jnc keeps looping
	# while at least 8 blocks remain ($len didn't go negative).
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp
	
	call	_bsaes_encrypt8
	
	# store order 0,1,4,6,3,7,2,5 undoes _bsaes_encrypt8's register
	# permutation (same convention as bsaes_encrypt_128)
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop
	
	add	\$8,$len
	jz	.Lecb_enc_done
	
	# Tail: 1..7 leftover blocks. Load what's present, then dispatch
	# to a label that stores exactly that many outputs.
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	# Fewer than 8 blocks total: no bit-sliced schedule was built;
	# process one block at a time with the integer-only AES.
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short
1262 | |
.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	# Keep wiping while %rax is still below the frame base %rbp.
	# Was `jb`: with the schedule below %rbp that exits after a single
	# iteration (key material left on the stack), and on the short path
	# (%rsp==%rbp) it loops forever. `ja` matches the identical bzero
	# loops in bsaes_cbc_encrypt and bsaes_ctr32_encrypt_blocks.
	ja	.Lecb_enc_bzero
	
	lea	(%rbp),%rsp		# restore %rsp
1274 ___ | |
$code.=<<___ if ($win64);
	# restore the non-volatile xmm registers saved in the prologue
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	# pop the six GPRs pushed in the prologue (reverse order)
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1300 | |
.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
# ECB-decrypt $arg3 16-byte blocks from $arg1 to $arg2 with AES_KEY
# *$arg4. Mirrors bsaes_ecb_encrypt_blocks, but the bit-sliced schedule
# gets the decrypt-side fixups (round-0 key XORed in place, last round
# key saved separately).
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	# xmm6-xmm15 are non-volatile on Win64 and must be preserved.
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short
	
	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)
1350 | |
	sub	\$8,$len
	# Main loop: decrypt 8 blocks per iteration while >=8 remain.
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp
	
	call	_bsaes_decrypt8
	
	# store order 0,1,6,4,2,7,3,5 undoes _bsaes_decrypt8's register
	# permutation (same convention as bsaes_decrypt_128)
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop
	
	add	\$8,$len
	jz	.Lecb_dec_done
	
	# Tail: 1..7 leftover blocks, dispatched by count.
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	# Fewer than 8 blocks total: one block at a time via asm_AES_decrypt.
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short
1464 | |
.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	# Keep wiping while %rax is still below the frame base %rbp.
	# Was `jb`: inverted polarity left the bit-sliced key schedule
	# unwiped in the bulk path and spun forever in the short path
	# (%rsp==%rbp). `ja` matches the CBC and CTR bzero loops.
	ja	.Lecb_dec_bzero
	
	lea	(%rbp),%rsp		# restore %rsp
1476 ___ | |
$code.=<<___ if ($win64);
	# restore the non-volatile xmm registers saved in the prologue
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	# pop the six GPRs pushed in the prologue (reverse order)
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1502 ___ | |
1503 } | |
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
# CBC entry point. Only the decrypt direction with >=128 bytes is
# handled here (bit-sliced decryption parallelizes; CBC encryption is
# inherently serial), everything else tail-calls asm_AES_cbc_encrypt.
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt
	
	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks
	
	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
	
	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)
1567 | |
	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
	# Main loop: decrypt 8 blocks, then XOR each plaintext with the
	# previous ciphertext block (CBC chaining). The last ciphertext
	# block of the batch becomes the next IV in @XMM[15].
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	
	call	_bsaes_decrypt8
	
	# decrypted blocks come back permuted (0,1,6,4,2,7,3,5 = blocks
	# 0..7); each is XORed with the preceding ciphertext, re-loaded
	# from memory since the input registers were clobbered
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop
1613 | |
	add	\$8,$len
	jz	.Lcbc_dec_done
	
	# Tail: 1..7 leftover blocks; same chaining scheme per count,
	# with the final ciphertext block of the tail kept as next IV.
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	# Single block: use the integer-only AES into a stack buffer,
	# then apply the CBC XOR; asm_AES_decrypt preserves %xmm state.
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV
1753 | |
.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	# zero 32 bytes per pass until %rax reaches the frame base %rbp
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero
	
	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	# restore the non-volatile xmm registers saved in the prologue
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	# pop the six GPRs pushed in the prologue (reverse order)
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1792 | |
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
# CTR mode with 32-bit big-endian counter in the low word of the IV:
# encrypt $arg3 blocks from $arg1 to $arg2 with AES_KEY *$arg4, counter
# block at $arg5. >=8 blocks uses the bit-sliced path; fewer falls back
# to asm_AES_encrypt per block (.Lctr_enc_short).
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short
	
	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
	
	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	
	# Pre-byte-swap the counter and the round-0 key so counter
	# increments can be done with plain paddd inside the loop.
	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	# materialize counter, counter+1 .. counter+7 via the .LADD*
	# constants (still in byte-swapped form)
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7
	
	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds
	
	call	_bsaes_encrypt8_bitslice
	
	sub	\$8,$len
	jc	.Lctr_enc_loop_done
	
	# XOR keystream with input; none of the instructions between the
	# `sub` above and the trailing `jnz` modify EFLAGS, so the jnz
	# still tests the result of that subtraction.
	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop
	
	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	# Tail: 1..7 blocks of keystream remain in the registers from the
	# last _bsaes_encrypt8_bitslice call; consume as many as needed.
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done
	
.align	16
.Lctr_enc_short:
	# Fewer than 8 blocks total: encrypt the counter block one at a
	# time into the 0x30(%rbp) buffer and XOR with the input. On this
	# path no key schedule was allocated, so %rsp==%rbp and the
	# 0x2c(%rsp) store below aliases the 0x2c(%rbp) load.
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short
1986 | |
.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	# zero 32 bytes per pass until %rax reaches the frame base %rbp
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero
	
	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	# restore the non-volatile xmm registers saved in the prologue
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	# pop the six GPRs pushed in the prologue (reverse order)
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2024 ___ | |
2025 ###################################################################### | |
2026 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, | |
2027 # const AES_KEY *key1, const AES_KEY *key2, | |
2028 # const unsigned char iv[16]); | |
2029 # | |
2030 my ($twmask,$twres,$twtmp)=@XMM[13..15]; | |
2031 $code.=<<___; | |
2032 .globl bsaes_xts_encrypt | |
2033 .type bsaes_xts_encrypt,\@abi-omnipotent | |
2034 .align 16 | |
2035 bsaes_xts_encrypt: | |
2036 mov %rsp, %rax | |
2037 .Lxts_enc_prologue: | |
2038 push %rbp | |
2039 push %rbx | |
2040 push %r12 | |
2041 push %r13 | |
2042 push %r14 | |
2043 push %r15 | |
2044 lea -0x48(%rsp), %rsp | |
2045 ___ | |
2046 $code.=<<___ if ($win64); | |
2047 mov 0xa0(%rsp),$arg5 # pull key2 | |
2048 mov 0xa8(%rsp),$arg6 # pull ivp | |
2049 lea -0xa0(%rsp), %rsp | |
2050 movaps %xmm6, 0x40(%rsp) | |
2051 movaps %xmm7, 0x50(%rsp) | |
2052 movaps %xmm8, 0x60(%rsp) | |
2053 movaps %xmm9, 0x70(%rsp) | |
2054 movaps %xmm10, 0x80(%rsp) | |
2055 movaps %xmm11, 0x90(%rsp) | |
2056 movaps %xmm12, 0xa0(%rsp) | |
2057 movaps %xmm13, 0xb0(%rsp) | |
2058 movaps %xmm14, 0xc0(%rsp) | |
2059 movaps %xmm15, 0xd0(%rsp) | |
2060 .Lxts_enc_body: | |
2061 ___ | |
2062 $code.=<<___; | |
2063 mov %rsp, %rbp # backup %rsp | |
2064 mov $arg1, $inp # backup arguments | |
2065 mov $arg2, $out | |
2066 mov $arg3, $len | |
2067 mov $arg4, $key | |
2068 | |
2069 lea ($arg6), $arg1 | |
2070 lea 0x20(%rbp), $arg2 | |
2071 lea ($arg5), $arg3 | |
2072 call asm_AES_encrypt # generate initial tweak | |
2073 | |
2074 mov 240($key), %eax # rounds | |
2075 mov $len, %rbx # backup $len | |
2076 | |
2077 mov %eax, %edx # rounds | |
2078 shl \$7, %rax # 128 bytes per inner round key | |
2079 sub \$`128-32`, %rax # size of bit-sliced key schedule | |
2080 sub %rax, %rsp | |
2081 | |
2082 mov %rsp, %rax # pass key schedule | |
2083 mov $key, %rcx # pass key | |
2084 mov %edx, %r10d # pass rounds | |
2085 call _bsaes_key_convert | |
2086 pxor %xmm6, %xmm7 # fix up last round key | |
2087 movdqa %xmm7, (%rax) # save last round key | |
2088 | |
2089 and \$-16, $len | |
2090 sub \$0x80, %rsp # place for tweak[8] | |
2091 movdqa 0x20(%rbp), @XMM[7] # initial tweak | |
2092 | |
2093 pxor $twtmp, $twtmp | |
2094 movdqa .Lxts_magic(%rip), $twmask | |
2095 pcmpgtd @XMM[7], $twtmp # broadcast upper bits | |
2096 | |
2097 sub \$0x80, $len | |
2098 jc .Lxts_enc_short | |
2099 jmp .Lxts_enc_loop | |
2100 | |
2101 .align 16 | |
2102 .Lxts_enc_loop: | |
2103 ___ | |
# Emit tweak generation for blocks 0..6 of a full 8-block iteration.
# Each step saves the current tweak to `0x10*$i`(%rsp) and doubles it in
# GF(2^128): paddq shifts both halves left by one, pand/.Lxts_magic
# isolates the carry/residue, pxor folds it back in.  Input loads
# (block $i-1) and input^tweak XORs are interleaved one and two
# iterations behind to hide latency.
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
# From iteration 1 on: load the previous input block.
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
# From iteration 2 on: XOR input block $i-2 with its saved tweak.
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# Main 8-block loop body: finish loading/XOR-ing the last two input
# blocks, encrypt all eight, XOR the saved tweaks back in (note the
# output register order 0,1,4,6,3,7,2,5 imposed by _bsaes_encrypt8),
# write ciphertext, then compute the next iteration's tweak from the
# eighth saved tweak at 0x70(%rsp).
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
# Short-path tweak generation (fewer than 8 blocks remain).  Same tweak
# doubling as the main loop, but after each input load the residual
# $len is compared against `0x10*$i` and control branches to the
# matching .Lxts_enc_$i tail when exactly $i blocks are left.
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
# Load previous input block and dispatch to the $i-block tail if done.
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
# XOR input block $i-2 with its saved tweak (two iterations behind).
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# Tail handlers for 7 down to 1 remaining whole blocks (fall-through is
# the 7-block case; .Lxts_enc_6 .. .Lxts_enc_1 follow).  Each encrypts
# via _bsaes_encrypt8 (the 1-block case uses asm_AES_encrypt directly),
# XORs the saved tweaks, writes output, and reloads the next tweak.
# .Lxts_enc_done then performs ciphertext stealing for a ragged final
# block (%ebx holds len%16 from the prologue), and .Lxts_enc_ret wipes
# the on-stack key schedule before unwinding to %rbp.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# bsaes_xts_encrypt epilogue: Win64 reloads the spilled %xmm6-%xmm15
# from the %rbp-anchored frame before the common GPR restore/return.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Common GPR restore + return for xts_encrypt, immediately followed
# (same heredoc) by the entry point and GPR-save prologue of
# bsaes_xts_decrypt.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64: pull stack args (key2, ivp) and spill non-volatile XMM regs.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
# Decrypt setup: the initial tweak is still *encrypted* with key2.  The
# bit-sliced decryption schedule differs from encryption: round-0 key
# is fixed up in place and last/first keys swap roles.  If the total
# length is not a multiple of 16, one whole block is held back ($len is
# reduced by 16 via the setnz/shl trick) so decrypt-side ciphertext
# stealing can process it at .Lxts_dec_done.
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
# Tweak generation for blocks 0..6 of a full 8-block decrypt iteration;
# identical GF(2^128) doubling scheme to the encrypt-side loop, with
# input loads and input^tweak XORs pipelined one/two iterations behind.
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
# From iteration 1 on: load the previous input block.
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
# From iteration 2 on: XOR input block $i-2 with its saved tweak.
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# Main 8-block decrypt loop body: mirror of the encrypt loop but calls
# _bsaes_decrypt8, whose output register order is 0,1,6,4,2,7,3,5 —
# hence the different XMM indices in the tweak XOR / store sequence.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
# Short-path tweak generation for decryption: as on the encrypt side,
# each input load is followed by a cmp/je dispatch to the matching
# .Lxts_dec_$i tail when exactly $i whole blocks remain.
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
# Load previous input block and dispatch to the $i-block tail if done.
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
# XOR input block $i-2 with its saved tweak (two iterations behind).
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
# Decrypt tail handlers for 7 down to 1 remaining whole blocks
# (fall-through is the 7-block case; .Lxts_dec_6 .. .Lxts_dec_1
# follow), using _bsaes_decrypt8's 0,1,6,4,2,7,3,5 output order.
# .Lxts_dec_done implements decrypt-side ciphertext stealing: the
# second-to-last tweak is kept in @XMM[6] while @XMM[7] is doubled once
# more for the held-back block, which is decrypted first; the stolen
# bytes are then swapped and the final partial block decrypted with
# @XMM[6].  .Lxts_dec_ret wipes the on-stack key schedule.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# bsaes_xts_decrypt epilogue: Win64 reloads %xmm6-%xmm15 from the
# %rbp-anchored frame, then the common path restores the callee-saved
# GPRs and returns.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
2820 } | |
# Read-only constant pool shared by the bit-sliced routines: pshufb
# masks for (Inv)ShiftRows and bit-slice interleaving, 0x55/0x33/0x0f
# bit-extraction masks, byte-swap and counter-increment vectors for CTR
# mode, the XTS GF(2^128) reduction constant (0x87), key-conversion
# masks, and the 0x63 S-box affine constant.
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___
2878 | |
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support: se_handler restores the
# CONTEXT saved-register slots when an exception hits between a
# function's prologue and epilogue labels, so the unwinder can walk
# past these hand-written frames.  The .pdata/.xdata sections below
# register each [en|de]crypt routine with its prologue/epilogue range.
if ($win64) {
# Win64 ABI argument registers of se_handler itself.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# .pdata entries (function begin/end RVAs + unwind info) — ECB entries
# only when the ECB code paths were emitted.
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata unwind info: flag byte 9 selects the custom handler, followed
# by the se_handler RVA and its HandlerData (body/epilogue labels).
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}
3040 | |
# Expand every `...` span in the accumulated assembly by eval()-ing it
# at generation time (e.g. `128-32`, `0x10*$i`), then emit the result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# close() flushes STDOUT; checking it catches buffered write failures
# (e.g. full disk) that would otherwise silently truncate the .s file.
close STDOUT or die "error closing STDOUT: $!";
OLD | NEW |