OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 # | |
3 # ==================================================================== | |
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 # | |
10 # sha1_block procedure for x86_64. | |
11 # | |
12 # It was brought to my attention that on EM64T compiler-generated code | |
13 # was far behind 32-bit assembler implementation. This is unlike on | |
14 # Opteron where compiler-generated code was only 15% behind 32-bit | |
15 # assembler, which originally made it hard to motivate the effort. | |
16 # There was a suggestion to mechanically translate 32-bit code, but I | |
17 # dismissed it, reasoning that x86_64 offers enough register bank | |
18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh | |
19 # implementation:-) However! While 64-bit code does perform better | |
20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, | |
21 # x86_64 does offer larger *addressable* bank, but out-of-order core | |
22 # reaches for even more registers through dynamic aliasing, and EM64T | |
23 # core must have managed to run-time optimize even 32-bit code just as | |
24 # good as 64-bit one. Performance improvement is summarized in the | |
25 # following table: | |
26 # | |
27 # gcc 3.4 32-bit asm cycles/byte | |
28 # Opteron +45% +20% 6.8 | |
29 # Xeon P4 +65% +0% 9.9 | |
30 # Core2 +60% +10% 7.0 | |
31 | |
32 # August 2009. | |
33 # | |
34 # The code was revised to minimize code size and to maximize | |
35 # "distance" between instructions producing input to 'lea' | |
36 # instruction and the 'lea' instruction itself, which is essential | |
37 # for Intel Atom core. | |
38 | |
39 # October 2010. | |
40 # | |
41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | |
42 # is to offload message schedule denoted by Wt in NIST specification, | |
43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module | |
44 # for background and implementation details. The only difference from | |
45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements | |
46 # to free temporary registers. | |
47 | |
48 # April 2011. | |
49 # | |
50 # Add AVX code path. See sha1-586.pl for further information. | |
51 | |
52 ###################################################################### | |
53 # Current performance is summarized in the following table. Numbers are | |
54 # CPU clock cycles spent to process single byte (less is better). | |
55 # | |
56 # x86_64 SSSE3 AVX | |
57 # P4 9.8 - | |
58 # Opteron 6.6 - | |
59 # Core2 6.7 6.1/+10% - | |
60 # Atom 11.0 9.7/+13% - | |
61 # Westmere 7.1 5.6/+27% - | |
62 # Sandy Bridge 7.9 6.3/+25% 5.2/+51% | |
63 | |
# Command line is "[flavour] output".  A first argument that contains a
# dot is taken to be the output file name, i.e. no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour or by an .asm output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# The AVX code path is generated only when the assembler reports a
# version that is new enough: GNU as >= 2.19, nasm >= 2.09, ml64 >= 10.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be started rather than
# silently producing no output; the translator path is quoted so that a
# directory name containing spaces does not split the command line.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
87 | |
# Registers in which the three arguments arrive: context, input, count.
($ctx,$inp,$num)=("%rdi","%rsi","%rdx");

# The generated prologue copies the arguments into these registers
# instead, in order to produce more compact code.
($ctx,$inp,$num)=("%r8","%r9","%r10");

# Scratch registers, the two-entry message word queue and the five
# working variables used by the integer-only code path.
($t0,$t1,$t2)=("%eax","%ebx","%ecx");
@xi=("%edx","%ebp");
@V=($A,$B,$C,$D,$E)=("%esi","%edi","%r11d","%r12d","%r13d");
108 | |
# BODY_00_19($i,$a,$b,$c,$d,$e): append the assembly for round $i of the
# integer-only code path to $code.  $a..$e are the names of the registers
# holding the five working variables in this round.  The emitted round adds
# rol($a,5), (($c^$d)&$b)^$d, the current message word $xi[0] and the
# constant 0x5a827999 to $e, and rotates $b left by 30.
109 sub BODY_00_19 { | |
110 my ($i,$a,$b,$c,$d,$e)=@_; | |
111 my $j=$i+1; | |
# Round 0 only: fetch the first input word, byte-swap it and store it in
# the 16-word buffer kept at (%rsp).
112 $code.=<<___ if ($i==0); | |
113 mov `4*$i`($inp),$xi[0] | |
114 bswap $xi[0] | |
115 mov $xi[0],`4*$i`(%rsp) | |
116 ___ | |
# Rounds 0..14: the round itself, interleaved with loading and byte-swapping
# input word $j for the next round.
117 $code.=<<___ if ($i<15); | |
118 mov $c,$t0 | |
119 mov `4*$j`($inp),$xi[1] | |
120 mov $a,$t2 | |
121 xor $d,$t0 | |
122 bswap $xi[1] | |
123 rol \$5,$t2 | |
124 lea 0x5a827999($xi[0],$e),$e | |
125 and $b,$t0 | |
126 mov $xi[1],`4*$j`(%rsp) | |
127 add $t2,$e | |
128 xor $d,$t0 | |
129 rol \$30,$b | |
130 add $t0,$e | |
131 ___ | |
# Rounds 15 and up: the next word is no longer read from the input but
# computed as the XOR of four buffer slots rotated left by one, and written
# back to slot $j%16.
132 $code.=<<___ if ($i>=15); | |
133 mov `4*($j%16)`(%rsp),$xi[1] | |
134 mov $c,$t0 | |
135 mov $a,$t2 | |
136 xor `4*(($j+2)%16)`(%rsp),$xi[1] | |
137 xor $d,$t0 | |
138 rol \$5,$t2 | |
139 xor `4*(($j+8)%16)`(%rsp),$xi[1] | |
140 and $b,$t0 | |
141 lea 0x5a827999($xi[0],$e),$e | |
142 xor `4*(($j+13)%16)`(%rsp),$xi[1] | |
143 xor $d,$t0 | |
144 rol \$1,$xi[1] | |
145 add $t2,$e | |
146 rol \$30,$b | |
147 mov $xi[1],`4*($j%16)`(%rsp) | |
148 add $t0,$e | |
149 ___ | |
# Swap the two queue entries: the word prepared in $xi[1] becomes $xi[0]
# for the next round.
150 unshift(@xi,pop(@xi)); | |
151 } | |
152 | |
# BODY_20_39($i,$a,$b,$c,$d,$e): append the assembly for one round that uses
# the $b^$c^$d function.  The constant is 0x6ed9eba1 when $i<40 and
# 0xca62c1d6 otherwise.  Arguments have the same meaning as in BODY_00_19.
153 sub BODY_20_39 { | |
154 my ($i,$a,$b,$c,$d,$e)=@_; | |
155 my $j=$i+1; | |
156 my $K=($i<40)?0x6ed9eba1:0xca62c1d6; | |
# Every round except 79: the round itself plus computation of message word
# $j for the next round.
157 $code.=<<___ if ($i<79); | |
158 mov `4*($j%16)`(%rsp),$xi[1] | |
159 mov $c,$t0 | |
160 mov $a,$t2 | |
161 xor `4*(($j+2)%16)`(%rsp),$xi[1] | |
162 xor $b,$t0 | |
163 rol \$5,$t2 | |
164 lea $K($xi[0],$e),$e | |
165 xor `4*(($j+8)%16)`(%rsp),$xi[1] | |
166 xor $d,$t0 | |
167 add $t2,$e | |
168 xor `4*(($j+13)%16)`(%rsp),$xi[1] | |
169 rol \$30,$b | |
170 add $t0,$e | |
171 rol \$1,$xi[1] | |
172 ___ | |
# The new word is written back to the buffer only while $i<76; words 77..79
# stay in $xi[1] and are not read from the buffer by any later round emitted
# here.
173 $code.=<<___ if ($i<76); | |
174 mov $xi[1],`4*($j%16)`(%rsp) | |
175 ___ | |
# Round 79: nothing left to schedule, emit the bare round.
176 $code.=<<___ if ($i==79); | |
177 mov $c,$t0 | |
178 mov $a,$t2 | |
179 xor $b,$t0 | |
180 lea $K($xi[0],$e),$e | |
181 rol \$5,$t2 | |
182 xor $d,$t0 | |
183 add $t2,$e | |
184 rol \$30,$b | |
185 add $t0,$e | |
186 ___ | |
# Swap the two queue entries for the next round.
187 unshift(@xi,pop(@xi)); | |
188 } | |
189 | |
# BODY_40_59($i,$a,$b,$c,$d,$e): append the assembly for one round with
# constant 0x8f1bbcdc.  The round function is accumulated into $e as two
# separate addends, ($c&$d) and (($c^$d)&$b).  The next message word $j is
# computed and stored in the same pass.  Arguments as in BODY_00_19.
190 sub BODY_40_59 { | |
191 my ($i,$a,$b,$c,$d,$e)=@_; | |
192 my $j=$i+1; | |
193 $code.=<<___; | |
194 mov `4*($j%16)`(%rsp),$xi[1] | |
195 mov $c,$t0 | |
196 mov $c,$t1 | |
197 xor `4*(($j+2)%16)`(%rsp),$xi[1] | |
198 and $d,$t0 | |
199 mov $a,$t2 | |
200 xor `4*(($j+8)%16)`(%rsp),$xi[1] | |
201 xor $d,$t1 | |
202 lea 0x8f1bbcdc($xi[0],$e),$e | |
203 rol \$5,$t2 | |
204 xor `4*(($j+13)%16)`(%rsp),$xi[1] | |
205 add $t0,$e | |
206 and $b,$t1 | |
207 rol \$1,$xi[1] | |
208 add $t1,$e | |
209 rol \$30,$b | |
210 mov $xi[1],`4*($j%16)`(%rsp) | |
211 add $t2,$e | |
212 ___ | |
# Swap the two queue entries for the next round.
213 unshift(@xi,pop(@xi)); | |
214 } | |
215 | |
# Public entry point.  It first inspects OPENSSL_ia32cap_P: if bit 9 of the
# second dword is clear, control goes to the integer-only code at .Lialu.
216 $code.=<<___; | |
217 .text | |
218 .extern OPENSSL_ia32cap_P | |
219 | |
220 .globl sha1_block_data_order | |
221 .type sha1_block_data_order,\@function,3 | |
222 .align 16 | |
223 sha1_block_data_order: | |
224 mov OPENSSL_ia32cap_P+0(%rip),%r9d | |
225 mov OPENSSL_ia32cap_P+4(%rip),%r8d | |
226 test \$`1<<9`,%r8d # check SSSE3 bit | |
227 jz .Lialu | |
228 ___ | |
# Emitted only when the assembler supports AVX: take the AVX path when both
# bit 28 of the second dword and bit 30 of the first dword are set.
229 $code.=<<___ if ($avx); | |
230 and \$`1<<28`,%r8d # mask AVX bit | |
231 and \$`1<<30`,%r9d # mask "Intel CPU" bit | |
232 or %r9d,%r8d | |
233 cmp \$`1<<28|1<<30`,%r8d | |
234 je _avx_shortcut | |
235 ___ | |
# Otherwise fall back to SSSE3.  .Lialu follows: it saves rbx/rbp/r12/r13,
# reserves a 64-byte aligned frame for the 16-word buffer and keeps the
# original stack pointer at 64(%rsp).
236 $code.=<<___; | |
237 jmp _ssse3_shortcut | |
238 | |
239 .align 16 | |
240 .Lialu: | |
241 push %rbx | |
242 push %rbp | |
243 push %r12 | |
244 push %r13 | |
245 mov %rsp,%r11 | |
246 mov %rdi,$ctx # reassigned argument | |
247 sub \$`8+16*4`,%rsp | |
248 mov %rsi,$inp # reassigned argument | |
249 and \$-64,%rsp | |
250 mov %rdx,$num # reassigned argument | |
251 mov %r11,`16*4`(%rsp) | |
252 .Lprologue: | |
253 | |
254 mov 0($ctx),$A | |
255 mov 4($ctx),$B | |
256 mov 8($ctx),$C | |
257 mov 12($ctx),$D | |
258 mov 16($ctx),$E | |
259 jmp .Lloop | |
260 | |
261 .align 16 | |
262 .Lloop: | |
263 ___ | |
# Unroll all 80 rounds.  @V is rotated after every round so that register
# roles shift instead of register contents being moved.
264 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | |
265 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | |
266 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | |
267 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | |
# Add the previous context to the working variables, store it back, advance
# the input pointer by 64 bytes and loop while the block counter is
# non-zero; then restore the saved registers and stack pointer.
268 $code.=<<___; | |
269 add 0($ctx),$A | |
270 add 4($ctx),$B | |
271 add 8($ctx),$C | |
272 add 12($ctx),$D | |
273 add 16($ctx),$E | |
274 mov $A,0($ctx) | |
275 mov $B,4($ctx) | |
276 mov $C,8($ctx) | |
277 mov $D,12($ctx) | |
278 mov $E,16($ctx) | |
279 | |
280 sub \$1,$num | |
281 lea `16*4`($inp),$inp | |
282 jnz .Lloop | |
283 | |
284 mov `16*4`(%rsp),%rsi | |
285 mov (%rsi),%r13 | |
286 mov 8(%rsi),%r12 | |
287 mov 16(%rsi),%rbp | |
288 mov 24(%rsi),%rbx | |
289 lea 32(%rsi),%rsp | |
290 .Lepilogue: | |
291 ret | |
292 .size sha1_block_data_order,.-sha1_block_data_order | |
293 ___ | |
# Start of the SIMD section.  The lexicals below are private to it:
#   $Xi      - index of the next 4-word schedule vector to produce
#   @X       - the eight xmm registers holding schedule vectors
#   @Tx      - three xmm temporaries, also used to carry the round constant
#   @V, @T   - scalar working variables and two scalar temporaries
#   $j       - scalar round counter used by the body_* generators
#   $K_XX_XX - register pointing at the constant table
294 {{{ | |
295 my $Xi=4; | |
296 my @X=map("%xmm$_",(4..7,0..3)); | |
297 my @Tx=map("%xmm$_",(8..10)); | |
298 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimizat
ion | |
299 my @T=("%esi","%edi"); | |
300 my $j=0; | |
301 my $K_XX_XX="%r11"; | |
302 | |
# Rotate thunks referenced as '&$_rol' / '&$_ror' by the body_* strings;
# here they emit plain rol/ror.
303 my $_rol=sub { &rol(@_) }; | |
304 my $_ror=sub { &ror(@_) }; | |
305 | |
# SSSE3 entry: save callee-saved registers and reserve 64 bytes of stack
# for the X[]+K transfer area, plus room for xmm6-xmm10 on Win64.
306 $code.=<<___; | |
307 .type sha1_block_data_order_ssse3,\@function,3 | |
308 .align 16 | |
309 sha1_block_data_order_ssse3: | |
310 _ssse3_shortcut: | |
311 push %rbx | |
312 push %rbp | |
313 push %r12 | |
314 lea `-64-($win64?5*16:0)`(%rsp),%rsp | |
315 ___ | |
316 $code.=<<___ if ($win64); | |
317 movaps %xmm6,64+0(%rsp) | |
318 movaps %xmm7,64+16(%rsp) | |
319 movaps %xmm8,64+32(%rsp) | |
320 movaps %xmm9,64+48(%rsp) | |
321 movaps %xmm10,64+64(%rsp) | |
322 .Lprologue_ssse3: | |
323 ___ | |
# Turn the block count into an end-of-input pointer, load the context, then
# load and byte-swap the first 64 input bytes.  The first three vectors plus
# K_00_19 are stored to the stack for the scalar rounds.
324 $code.=<<___; | |
325 mov %rdi,$ctx # reassigned argument | |
326 mov %rsi,$inp # reassigned argument | |
327 mov %rdx,$num # reassigned argument | |
328 | |
329 shl \$6,$num | |
330 add $inp,$num | |
331 lea K_XX_XX(%rip),$K_XX_XX | |
332 | |
333 mov 0($ctx),$A # load context | |
334 mov 4($ctx),$B | |
335 mov 8($ctx),$C | |
336 mov 12($ctx),$D | |
337 mov $B,@T[0] # magic seed | |
338 mov 16($ctx),$E | |
339 | |
340 movdqa 64($K_XX_XX),@X[2] # pbswap mask | |
341 movdqa 0($K_XX_XX),@Tx[1] # K_00_19 | |
342 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | |
343 movdqu 16($inp),@X[-3&7] | |
344 movdqu 32($inp),@X[-2&7] | |
345 movdqu 48($inp),@X[-1&7] | |
346 pshufb @X[2],@X[-4&7] # byte swap | |
347 add \$64,$inp | |
348 pshufb @X[2],@X[-3&7] | |
349 pshufb @X[2],@X[-2&7] | |
350 pshufb @X[2],@X[-1&7] | |
351 paddd @Tx[1],@X[-4&7] # add K_00_19 | |
352 paddd @Tx[1],@X[-3&7] | |
353 paddd @Tx[1],@X[-2&7] | |
354 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU | |
355 psubd @Tx[1],@X[-4&7] # restore X[] | |
356 movdqa @X[-3&7],16(%rsp) | |
357 psubd @Tx[1],@X[-3&7] | |
358 movdqa @X[-2&7],32(%rsp) | |
359 psubd @Tx[1],@X[-2&7] | |
360 jmp .Loop_ssse3 | |
361 ___ | |
362 | |
# Thunk for [simplified] 32-bit style perlasm: any call to an undefined
# sub, e.g. &paddd(dst,src), appends one instruction line to $code.  The
# sub name becomes the mnemonic and the operands are written in reverse
# order; a purely numeric last argument gets a '$' prefix.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip package qualifier

    my $last = pop;
    $last = "\$$last" if ($last*1 eq $last);	# numeric => immediate

    my @operands = ($last, reverse @_);
    $code .= "\t" . $mnemonic . "\t" . join(',', @operands) . "\n";
}
369 | |
# Xupdate_ssse3_16_31($body): emit the SSSE3 computation of one 4-word
# schedule vector, interleaved with four scalar rounds.  $body is a code ref
# returning the Perl statement strings of one round.  It is called four
# times, and the strings are eval'ed one or a few at a time between the SIMD
# instructions, so the order of statements below is the emitted instruction
# schedule.  On exit $Xi is incremented and @X/@Tx are rotated.
370 sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 | |
371 { use integer; | |
372 my $body = shift; | |
373 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
374 my ($a,$b,$c,$d,$e); | |
375 | |
376 &movdqa (@X[0],@X[-3&7]); | |
377 eval(shift(@insns)); | |
378 eval(shift(@insns)); | |
379 &movdqa (@Tx[0],@X[-1&7]); | |
380 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | |
381 eval(shift(@insns)); | |
382 eval(shift(@insns)); | |
383 | |
384 &paddd (@Tx[1],@X[-1&7]); | |
385 eval(shift(@insns)); | |
386 eval(shift(@insns)); | |
387 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords | |
388 eval(shift(@insns)); | |
389 eval(shift(@insns)); | |
390 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | |
391 eval(shift(@insns)); | |
392 eval(shift(@insns)); | |
393 | |
394 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | |
395 eval(shift(@insns)); | |
396 eval(shift(@insns)); | |
397 eval(shift(@insns)); | |
398 eval(shift(@insns)); | |
399 | |
400 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | |
401 eval(shift(@insns)); | |
402 eval(shift(@insns)); | |
403 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to
IALU | |
404 eval(shift(@insns)); | |
405 eval(shift(@insns)); | |
406 | |
407 &movdqa (@Tx[2],@X[0]); | |
408 &movdqa (@Tx[0],@X[0]); | |
409 eval(shift(@insns)); | |
410 eval(shift(@insns)); | |
411 eval(shift(@insns)); | |
412 eval(shift(@insns)); | |
413 | |
414 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword | |
415 &paddd (@X[0],@X[0]); | |
416 eval(shift(@insns)); | |
417 eval(shift(@insns)); | |
418 eval(shift(@insns)); | |
419 eval(shift(@insns)); | |
420 | |
421 &psrld (@Tx[0],31); | |
422 eval(shift(@insns)); | |
423 eval(shift(@insns)); | |
424 &movdqa (@Tx[1],@Tx[2]); | |
425 eval(shift(@insns)); | |
426 eval(shift(@insns)); | |
427 | |
428 &psrld (@Tx[2],30); | |
429 &por (@X[0],@Tx[0]); # "X[0]"<<<=1 | |
430 eval(shift(@insns)); | |
431 eval(shift(@insns)); | |
432 eval(shift(@insns)); | |
433 eval(shift(@insns)); | |
434 | |
435 &pslld (@Tx[1],2); | |
436 &pxor (@X[0],@Tx[2]); | |
437 eval(shift(@insns)); | |
438 eval(shift(@insns)); | |
439 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_X
X | |
440 eval(shift(@insns)); | |
441 eval(shift(@insns)); | |
442 | |
443 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 | |
444 | |
445 foreach (@insns) { eval; } # remaining instructions [if any] | |
446 | |
447 $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
448 push(@Tx,shift(@Tx)); | |
449 } | |
450 | |
# Xupdate_ssse3_32_79($body): emit the SSSE3 computation of one 4-word
# schedule vector for the later part of the schedule, interleaved with four
# scalar rounds produced by $body (see Xupdate_ssse3_16_31 for how $body is
# used).  The round constant register is either carried over or reloaded
# from the table when $Xi is a multiple of 5.  On exit $Xi is incremented
# and @X/@Tx are rotated.
451 sub Xupdate_ssse3_32_79() | |
452 { use integer; | |
453 my $body = shift; | |
454 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
455 my ($a,$b,$c,$d,$e); | |
456 | |
457 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); | |
458 eval(shift(@insns)); # body_20_39 | |
459 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | |
460 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" | |
461 eval(shift(@insns)); | |
462 eval(shift(@insns)); | |
463 eval(shift(@insns)); # rol | |
464 | |
465 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | |
466 eval(shift(@insns)); | |
# NOTE(review): the round strings visible in this file spell their rotates
# '&$_rol' / '&$_ror', which /&ro[rl]/ does not match, so this guard looks
# always true.  That would only affect instruction interleaving, since all
# leftover strings are flushed at the end - confirm the intent.
467 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | |
468 if ($Xi%5) { | |
469 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | |
470 } else { # ... or load next one | |
471 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | |
472 } | |
473 &paddd (@Tx[1],@X[-1&7]); | |
474 eval(shift(@insns)); # ror | |
475 eval(shift(@insns)); | |
476 | |
477 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" | |
478 eval(shift(@insns)); # body_20_39 | |
479 eval(shift(@insns)); | |
480 eval(shift(@insns)); | |
481 eval(shift(@insns)); # rol | |
482 | |
483 &movdqa (@Tx[0],@X[0]); | |
484 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to
IALU | |
485 eval(shift(@insns)); | |
486 eval(shift(@insns)); | |
487 eval(shift(@insns)); # ror | |
488 eval(shift(@insns)); | |
489 | |
490 &pslld (@X[0],2); | |
491 eval(shift(@insns)); # body_20_39 | |
492 eval(shift(@insns)); | |
493 &psrld (@Tx[0],30); | |
494 eval(shift(@insns)); | |
495 eval(shift(@insns)); # rol | |
496 eval(shift(@insns)); | |
497 eval(shift(@insns)); | |
498 eval(shift(@insns)); # ror | |
499 eval(shift(@insns)); | |
500 | |
501 &por (@X[0],@Tx[0]); # "X[0]"<<<=2 | |
502 eval(shift(@insns)); # body_20_39 | |
503 eval(shift(@insns)); | |
504 &movdqa (@Tx[1],@X[0]) if ($Xi<19); | |
505 eval(shift(@insns)); | |
506 eval(shift(@insns)); # rol | |
507 eval(shift(@insns)); | |
508 eval(shift(@insns)); | |
509 eval(shift(@insns)); # rol | |
510 eval(shift(@insns)); | |
511 | |
512 foreach (@insns) { eval; } # remaining instructions | |
513 | |
514 $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
515 push(@Tx,shift(@Tx)); | |
516 } | |
517 | |
# Xuplast_ssse3_80($body): emit four scalar rounds together with the final
# X[]+K store, then the end-of-input test.  If $inp equals $num the
# generated code jumps to .Ldone_ssse3.  Otherwise it reloads the byte-swap
# mask and K_00_19, loads the next 64 input bytes, byte-swaps the first
# vector and advances $inp.  @Tx is rotated back by one and $Xi is reset to
# 0 for the Xloop_ssse3 calls that follow.
518 sub Xuplast_ssse3_80() | |
519 { use integer; | |
520 my $body = shift; | |
521 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
522 my ($a,$b,$c,$d,$e); | |
523 | |
524 eval(shift(@insns)); | |
525 &paddd (@Tx[1],@X[-1&7]); | |
526 eval(shift(@insns)); | |
527 eval(shift(@insns)); | |
528 eval(shift(@insns)); | |
529 eval(shift(@insns)); | |
530 | |
531 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IAL
U | |
532 | |
533 foreach (@insns) { eval; } # remaining instructions | |
534 | |
535 &cmp ($inp,$num); | |
536 &je (".Ldone_ssse3"); | |
537 | |
538 unshift(@Tx,pop(@Tx)); | |
539 | |
540 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask | |
541 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 | |
542 &movdqu (@X[-4&7],"0($inp)"); # load input | |
543 &movdqu (@X[-3&7],"16($inp)"); | |
544 &movdqu (@X[-2&7],"32($inp)"); | |
545 &movdqu (@X[-1&7],"48($inp)"); | |
546 &pshufb (@X[-4&7],@X[2]); # byte swap | |
547 &add ($inp,64); | |
548 | |
549 $Xi=0; | |
550 } | |
551 | |
# Xloop_ssse3($body): emit four scalar rounds of the current block
# interleaved with preparation of the next block's input.  It byte-swaps
# vector $Xi-3, adds the round constant to vector $Xi-4, stores the sum at
# 16*$Xi(%rsp) and then subtracts the constant again to restore the vector.
# $Xi is incremented on exit.
552 sub Xloop_ssse3() | |
553 { use integer; | |
554 my $body = shift; | |
555 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
556 my ($a,$b,$c,$d,$e); | |
557 | |
558 eval(shift(@insns)); | |
559 eval(shift(@insns)); | |
560 &pshufb (@X[($Xi-3)&7],@X[2]); | |
561 eval(shift(@insns)); | |
562 eval(shift(@insns)); | |
563 &paddd (@X[($Xi-4)&7],@Tx[1]); | |
564 eval(shift(@insns)); | |
565 eval(shift(@insns)); | |
566 eval(shift(@insns)); | |
567 eval(shift(@insns)); | |
568 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU | |
569 eval(shift(@insns)); | |
570 eval(shift(@insns)); | |
571 &psubd (@X[($Xi-4)&7],@Tx[1]); | |
572 | |
573 foreach (@insns) { eval; } | |
574 $Xi++; | |
575 } | |
576 | |
# Xtail_ssse3($body): emit four scalar rounds with no SIMD work mixed in.
# $body is a code ref returning the Perl statement strings of one round;
# all four rounds are generated first and then eval'ed in order.
sub Xtail_ssse3()
{ use integer;
  my $round = shift;
  my @pending = map { $round->() } (1..4);	# 32 instructions

  # These names must stay as they are: the eval'ed round strings assign
  # to them with '($a,$b,$c,$d,$e)=@V;'.
  my ($a,$b,$c,$d,$e);

  eval $_ foreach (@pending);
}
585 | |
# body_00_19(): return the list of Perl statement strings that emit one
# scalar round of the (($c^$d)&$b)^$d kind when eval'ed in order by an
# Xupdate/Xloop/Xtail sub.  The first string loads the current register
# assignment from @V, and the last one advances $j and rotates @V and @T.
# The X[]+K addend is read from the stack slot selected by $j.
586 sub body_00_19 () { | |
587 ( | |
588 '($a,$b,$c,$d,$e)=@V;'. | |
589 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer | |
590 '&xor ($c,$d);', | |
591 '&mov (@T[1],$a);', # $b in next round | |
592 '&$_rol ($a,5);', | |
593 '&and (@T[0],$c);', # ($b&($c^$d)) | |
594 '&xor ($c,$d);', # restore $c | |
595 '&xor (@T[0],$d);', | |
596 '&add ($e,$a);', | |
597 '&$_ror ($b,$j?7:2);', # $b>>>2 | |
598 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T))
;' | |
599 ); | |
600 } | |
601 | |
# body_20_39(): return the list of Perl statement strings that emit one
# scalar round of the $b^$c^$d kind when eval'ed in order.  $j is advanced
# by the first string, and @V and @T are rotated by the last one.
602 sub body_20_39 () { | |
603 ( | |
604 '($a,$b,$c,$d,$e)=@V;'. | |
605 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | |
606 '&xor (@T[0],$d);', # ($b^$d) | |
607 '&mov (@T[1],$a);', # $b in next round | |
608 '&$_rol ($a,5);', | |
609 '&xor (@T[0],$c);', # ($b^$d^$c) | |
610 '&add ($e,$a);', | |
611 '&$_ror ($b,7);', # $b>>>2 | |
612 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | |
613 ); | |
614 } | |
615 | |
# body_40_59(): return the list of Perl statement strings that emit one
# scalar round in which ($c&$d) and ($b&($c^$d)) are added to $e as two
# separate terms.  $j is advanced by the third string, and @V and @T are
# rotated by the last one.
616 sub body_40_59 () { | |
617 ( | |
618 '($a,$b,$c,$d,$e)=@V;'. | |
619 '&mov (@T[1],$c);', | |
620 '&xor ($c,$d);', | |
621 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer | |
622 '&and (@T[1],$d);', | |
623 '&and (@T[0],$c);', # ($b&($c^$d)) | |
624 '&$_ror ($b,7);', # $b>>>2 | |
625 '&add ($e,@T[1]);', | |
626 '&mov (@T[1],$a);', # $b in next round | |
627 '&$_rol ($a,5);', | |
628 '&add ($e,@T[0]);', | |
629 '&xor ($c,$d);', # restore $c | |
630 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | |
631 ); | |
632 } | |
# Generate the SSSE3 main loop: 16 schedule updates plus Xuplast, each paired
# with four scalar rounds, cover rounds 0..67.
633 $code.=<<___; | |
634 .align 16 | |
635 .Loop_ssse3: | |
636 ___ | |
637 &Xupdate_ssse3_16_31(\&body_00_19); | |
638 &Xupdate_ssse3_16_31(\&body_00_19); | |
639 &Xupdate_ssse3_16_31(\&body_00_19); | |
640 &Xupdate_ssse3_16_31(\&body_00_19); | |
641 &Xupdate_ssse3_32_79(\&body_00_19); | |
642 &Xupdate_ssse3_32_79(\&body_20_39); | |
643 &Xupdate_ssse3_32_79(\&body_20_39); | |
644 &Xupdate_ssse3_32_79(\&body_20_39); | |
645 &Xupdate_ssse3_32_79(\&body_20_39); | |
646 &Xupdate_ssse3_32_79(\&body_20_39); | |
647 &Xupdate_ssse3_32_79(\&body_40_59); | |
648 &Xupdate_ssse3_32_79(\&body_40_59); | |
649 &Xupdate_ssse3_32_79(\&body_40_59); | |
650 &Xupdate_ssse3_32_79(\&body_40_59); | |
651 &Xupdate_ssse3_32_79(\&body_40_59); | |
652 &Xupdate_ssse3_32_79(\&body_20_39); | |
653 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | |
654 | |
# The last 12 rounds are generated twice: once for the "more input" path
# and once for the "done" path.  Remember the generator state so that the
# second copy starts from the same round counter and register assignment.
655 $saved_j=$j; @saved_V=@V; | |
656 | |
657 &Xloop_ssse3(\&body_20_39); | |
658 &Xloop_ssse3(\&body_20_39); | |
659 &Xloop_ssse3(\&body_20_39); | |
660 | |
# "More input" path: fold the working variables into the context and loop.
661 $code.=<<___; | |
662 add 0($ctx),$A # update context | |
663 add 4($ctx),@T[0] | |
664 add 8($ctx),$C | |
665 add 12($ctx),$D | |
666 mov $A,0($ctx) | |
667 add 16($ctx),$E | |
668 mov @T[0],4($ctx) | |
669 mov @T[0],$B # magic seed | |
670 mov $C,8($ctx) | |
671 mov $D,12($ctx) | |
672 mov $E,16($ctx) | |
673 jmp .Loop_ssse3 | |
674 | |
675 .align 16 | |
676 .Ldone_ssse3: | |
677 ___ | |
# "Done" path: restore the generator state saved above and emit the last
# 12 rounds without any input preparation.
678 $j=$saved_j; @V=@saved_V; | |
679 | |
680 &Xtail_ssse3(\&body_20_39); | |
681 &Xtail_ssse3(\&body_20_39); | |
682 &Xtail_ssse3(\&body_20_39); | |
683 | |
684 $code.=<<___; | |
685 add 0($ctx),$A # update context | |
686 add 4($ctx),@T[0] | |
687 add 8($ctx),$C | |
688 mov $A,0($ctx) | |
689 add 12($ctx),$D | |
690 mov @T[0],4($ctx) | |
691 add 16($ctx),$E | |
692 mov $C,8($ctx) | |
693 mov $D,12($ctx) | |
694 mov $E,16($ctx) | |
695 ___ | |
# Win64 only: restore xmm6-xmm10 from the save area above the 64-byte
# transfer buffer.
696 $code.=<<___ if ($win64); | |
697 movaps 64+0(%rsp),%xmm6 | |
698 movaps 64+16(%rsp),%xmm7 | |
699 movaps 64+32(%rsp),%xmm8 | |
700 movaps 64+48(%rsp),%xmm9 | |
701 movaps 64+64(%rsp),%xmm10 | |
702 ___ | |
# Epilogue: restore the registers pushed at entry and return.
703 $code.=<<___; | |
704 lea `64+($win64?5*16:0)`(%rsp),%rsi | |
705 mov 0(%rsi),%r12 | |
706 mov 8(%rsi),%rbp | |
707 mov 16(%rsi),%rbx | |
708 lea 24(%rsi),%rsp | |
709 .Lepilogue_ssse3: | |
710 ret | |
711 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 | |
712 ___ | |
713 | |
# AVX code path, generated only when the assembler was found to support it.
# The lexicals mirror those of the SSSE3 section and start from the same
# initial values.
714 if ($avx) { | |
715 my $Xi=4; | |
716 my @X=map("%xmm$_",(4..7,0..3)); | |
717 my @Tx=map("%xmm$_",(8..10)); | |
718 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimizat
ion | |
719 my @T=("%esi","%edi"); | |
720 my $j=0; | |
721 my $K_XX_XX="%r11"; | |
722 | |
# Rotate thunks for this section: rotates are emitted as shld/shrd with the
# register passed as both source and destination.
723 my $_rol=sub { &shld(@_[0],@_) }; | |
724 my $_ror=sub { &shrd(@_[0],@_) }; | |
725 | |
# AVX entry: same frame layout as the SSSE3 entry.
726 $code.=<<___; | |
727 .type sha1_block_data_order_avx,\@function,3 | |
728 .align 16 | |
729 sha1_block_data_order_avx: | |
730 _avx_shortcut: | |
731 push %rbx | |
732 push %rbp | |
733 push %r12 | |
734 lea `-64-($win64?5*16:0)`(%rsp),%rsp | |
735 ___ | |
736 $code.=<<___ if ($win64); | |
737 movaps %xmm6,64+0(%rsp) | |
738 movaps %xmm7,64+16(%rsp) | |
739 movaps %xmm8,64+32(%rsp) | |
740 movaps %xmm9,64+48(%rsp) | |
741 movaps %xmm10,64+64(%rsp) | |
742 .Lprologue_avx: | |
743 ___ | |
# Load the context and the first 64 input bytes.  Unlike the SSSE3 prologue,
# the X[]+K sums go to separate registers, so no psubd is needed to restore
# X[].
744 $code.=<<___; | |
745 mov %rdi,$ctx # reassigned argument | |
746 mov %rsi,$inp # reassigned argument | |
747 mov %rdx,$num # reassigned argument | |
748 vzeroall | |
749 | |
750 shl \$6,$num | |
751 add $inp,$num | |
752 lea K_XX_XX(%rip),$K_XX_XX | |
753 | |
754 mov 0($ctx),$A # load context | |
755 mov 4($ctx),$B | |
756 mov 8($ctx),$C | |
757 mov 12($ctx),$D | |
758 mov $B,@T[0] # magic seed | |
759 mov 16($ctx),$E | |
760 | |
761 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask | |
762 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 | |
763 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] | |
764 vmovdqu 16($inp),@X[-3&7] | |
765 vmovdqu 32($inp),@X[-2&7] | |
766 vmovdqu 48($inp),@X[-1&7] | |
767 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap | |
768 add \$64,$inp | |
769 vpshufb @X[2],@X[-3&7],@X[-3&7] | |
770 vpshufb @X[2],@X[-2&7],@X[-2&7] | |
771 vpshufb @X[2],@X[-1&7],@X[-1&7] | |
772 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 | |
773 vpaddd @Tx[1],@X[-3&7],@X[1] | |
774 vpaddd @Tx[1],@X[-2&7],@X[2] | |
775 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU | |
776 vmovdqa @X[1],16(%rsp) | |
777 vmovdqa @X[2],32(%rsp) | |
778 jmp .Loop_avx | |
779 ___ | |
780 | |
# Xupdate_avx_16_31($body): AVX counterpart of Xupdate_ssse3_16_31.  It
# emits the computation of one 4-word schedule vector using three-operand
# VEX instructions, interleaved with the four scalar rounds produced by
# $body.  The order of statements below is the emitted instruction schedule.
# On exit $Xi is incremented and @X/@Tx are rotated.
781 sub Xupdate_avx_16_31() # recall that $Xi starts with 4 | |
782 { use integer; | |
783 my $body = shift; | |
784 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
785 my ($a,$b,$c,$d,$e); | |
786 | |
787 eval(shift(@insns)); | |
788 eval(shift(@insns)); | |
789 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | |
790 eval(shift(@insns)); | |
791 eval(shift(@insns)); | |
792 | |
793 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | |
794 eval(shift(@insns)); | |
795 eval(shift(@insns)); | |
796 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords | |
797 eval(shift(@insns)); | |
798 eval(shift(@insns)); | |
799 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | |
800 eval(shift(@insns)); | |
801 eval(shift(@insns)); | |
802 | |
803 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" | |
804 eval(shift(@insns)); | |
805 eval(shift(@insns)); | |
806 eval(shift(@insns)); | |
807 eval(shift(@insns)); | |
808 | |
809 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" | |
810 eval(shift(@insns)); | |
811 eval(shift(@insns)); | |
812 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to
IALU | |
813 eval(shift(@insns)); | |
814 eval(shift(@insns)); | |
815 | |
816 &vpsrld (@Tx[0],@X[0],31); | |
817 eval(shift(@insns)); | |
818 eval(shift(@insns)); | |
819 eval(shift(@insns)); | |
820 eval(shift(@insns)); | |
821 | |
822 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword | |
823 &vpaddd (@X[0],@X[0],@X[0]); | |
824 eval(shift(@insns)); | |
825 eval(shift(@insns)); | |
826 eval(shift(@insns)); | |
827 eval(shift(@insns)); | |
828 | |
829 &vpsrld (@Tx[1],@Tx[2],30); | |
830 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 | |
831 eval(shift(@insns)); | |
832 eval(shift(@insns)); | |
833 eval(shift(@insns)); | |
834 eval(shift(@insns)); | |
835 | |
836 &vpslld (@Tx[2],@Tx[2],2); | |
837 &vpxor (@X[0],@X[0],@Tx[1]); | |
838 eval(shift(@insns)); | |
839 eval(shift(@insns)); | |
840 eval(shift(@insns)); | |
841 eval(shift(@insns)); | |
842 | |
843 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 | |
844 eval(shift(@insns)); | |
845 eval(shift(@insns)); | |
846 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_X
X | |
847 eval(shift(@insns)); | |
848 eval(shift(@insns)); | |
849 | |
850 | |
851 foreach (@insns) { eval; } # remaining instructions [if any] | |
852 | |
853 $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
854 push(@Tx,shift(@Tx)); | |
855 } | |
856 | |
# Xupdate_avx_32_79($body): AVX counterpart of Xupdate_ssse3_32_79.  It emits
# one 4-word schedule vector for the later part of the schedule, interleaved
# with the four scalar rounds produced by $body.  The round constant
# register is carried over, or reloaded from the table when $Xi is a
# multiple of 5.  On exit $Xi is incremented and @X/@Tx are rotated.
857 sub Xupdate_avx_32_79() | |
858 { use integer; | |
859 my $body = shift; | |
860 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
861 my ($a,$b,$c,$d,$e); | |
862 | |
863 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | |
864 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | |
865 eval(shift(@insns)); # body_20_39 | |
866 eval(shift(@insns)); | |
867 eval(shift(@insns)); | |
868 eval(shift(@insns)); # rol | |
869 | |
870 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | |
871 eval(shift(@insns)); | |
# NOTE(review): the round strings visible in this file spell their rotates
# '&$_rol' / '&$_ror', which /&ro[rl]/ does not match, so this guard looks
# always true.  That would only affect instruction interleaving - confirm
# the intent.
872 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); | |
873 if ($Xi%5) { | |
874 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... | |
875 } else { # ... or load next one | |
876 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); | |
877 } | |
878 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); | |
879 eval(shift(@insns)); # ror | |
880 eval(shift(@insns)); | |
881 | |
882 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" | |
883 eval(shift(@insns)); # body_20_39 | |
884 eval(shift(@insns)); | |
885 eval(shift(@insns)); | |
886 eval(shift(@insns)); # rol | |
887 | |
888 &vpsrld (@Tx[0],@X[0],30); | |
889 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to
IALU | |
890 eval(shift(@insns)); | |
891 eval(shift(@insns)); | |
892 eval(shift(@insns)); # ror | |
893 eval(shift(@insns)); | |
894 | |
895 &vpslld (@X[0],@X[0],2); | |
896 eval(shift(@insns)); # body_20_39 | |
897 eval(shift(@insns)); | |
898 eval(shift(@insns)); | |
899 eval(shift(@insns)); # rol | |
900 eval(shift(@insns)); | |
901 eval(shift(@insns)); | |
902 eval(shift(@insns)); # ror | |
903 eval(shift(@insns)); | |
904 | |
905 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 | |
906 eval(shift(@insns)); # body_20_39 | |
907 eval(shift(@insns)); | |
908 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); | |
909 eval(shift(@insns)); | |
910 eval(shift(@insns)); # rol | |
911 eval(shift(@insns)); | |
912 eval(shift(@insns)); | |
913 eval(shift(@insns)); # rol | |
914 eval(shift(@insns)); | |
915 | |
916 foreach (@insns) { eval; } # remaining instructions | |
917 | |
918 $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
919 push(@Tx,shift(@Tx)); | |
920 } | |
921 | |
# Xuplast_avx_80($body): emit four scalar rounds together with the final
# X[]+K store of the AVX path, then the end-of-input test.  If $inp equals
# $num the generated code jumps to .Ldone_avx.  Otherwise it reloads the
# byte-swap mask and K_00_19, loads the next 64 input bytes, byte-swaps
# the first vector and advances $inp.  @Tx is rotated back by one and $Xi
# is reset to 0 for the Xloop_avx calls that follow.
#
# $body is a code ref returning the Perl statement strings of one round.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  # Use the VEX-encoded store here, like every other store in the
	  # AVX code path, rather than the legacy SSE movdqa.
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
955 | |
# Xloop_avx($body): emit four scalar rounds of the current block interleaved
# with preparation of the next block's input.  It byte-swaps vector $Xi-3,
# computes vector $Xi-4 plus the round constant into a separate register and
# stores that sum at 16*$Xi(%rsp).  $Xi is incremented on exit.
956 sub Xloop_avx() | |
957 { use integer; | |
958 my $body = shift; | |
959 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
# Assigned by the eval'ed round strings, which start with '($a,...)=@V;'.
960 my ($a,$b,$c,$d,$e); | |
961 | |
962 eval(shift(@insns)); | |
963 eval(shift(@insns)); | |
964 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | |
965 eval(shift(@insns)); | |
966 eval(shift(@insns)); | |
967 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); | |
968 eval(shift(@insns)); | |
969 eval(shift(@insns)); | |
970 eval(shift(@insns)); | |
971 eval(shift(@insns)); | |
972 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU | |
973 eval(shift(@insns)); | |
974 eval(shift(@insns)); | |
975 | |
976 foreach (@insns) { eval; } | |
977 $Xi++; | |
978 } | |
979 | |
# Xtail_avx($body): emit four scalar rounds with no SIMD work mixed in.
# $body is a code ref returning the Perl statement strings of one round;
# all four rounds are generated first and then eval'ed in order.
sub Xtail_avx()
{ use integer;
  my $round = shift;
  my @pending = map { $round->() } (1..4);	# 32 instructions

  # These names must stay as they are: the eval'ed round strings assign
  # to them with '($a,$b,$c,$d,$e)=@V;'.
  my ($a,$b,$c,$d,$e);

  eval $_ foreach (@pending);
}
988 | |
# Generate the AVX main loop; the call sequence mirrors the SSSE3 one.
989 $code.=<<___; | |
990 .align 16 | |
991 .Loop_avx: | |
992 ___ | |
993 &Xupdate_avx_16_31(\&body_00_19); | |
994 &Xupdate_avx_16_31(\&body_00_19); | |
995 &Xupdate_avx_16_31(\&body_00_19); | |
996 &Xupdate_avx_16_31(\&body_00_19); | |
997 &Xupdate_avx_32_79(\&body_00_19); | |
998 &Xupdate_avx_32_79(\&body_20_39); | |
999 &Xupdate_avx_32_79(\&body_20_39); | |
1000 &Xupdate_avx_32_79(\&body_20_39); | |
1001 &Xupdate_avx_32_79(\&body_20_39); | |
1002 &Xupdate_avx_32_79(\&body_20_39); | |
1003 &Xupdate_avx_32_79(\&body_40_59); | |
1004 &Xupdate_avx_32_79(\&body_40_59); | |
1005 &Xupdate_avx_32_79(\&body_40_59); | |
1006 &Xupdate_avx_32_79(\&body_40_59); | |
1007 &Xupdate_avx_32_79(\&body_40_59); | |
1008 &Xupdate_avx_32_79(\&body_20_39); | |
1009 &Xuplast_avx_80(\&body_20_39); # can jump to "done" | |
1010 | |
# The last 12 rounds are generated twice ("more input" and "done" paths);
# remember the generator state so both copies start from the same point.
1011 $saved_j=$j; @saved_V=@V; | |
1012 | |
1013 &Xloop_avx(\&body_20_39); | |
1014 &Xloop_avx(\&body_20_39); | |
1015 &Xloop_avx(\&body_20_39); | |
1016 | |
# "More input" path: fold the working variables into the context and loop.
1017 $code.=<<___; | |
1018 add 0($ctx),$A # update context | |
1019 add 4($ctx),@T[0] | |
1020 add 8($ctx),$C | |
1021 add 12($ctx),$D | |
1022 mov $A,0($ctx) | |
1023 add 16($ctx),$E | |
1024 mov @T[0],4($ctx) | |
1025 mov @T[0],$B # magic seed | |
1026 mov $C,8($ctx) | |
1027 mov $D,12($ctx) | |
1028 mov $E,16($ctx) | |
1029 jmp .Loop_avx | |
1030 | |
1031 .align 16 | |
1032 .Ldone_avx: | |
1033 ___ | |
# "Done" path: restore the saved generator state and emit the last 12
# rounds without any input preparation.
1034 $j=$saved_j; @V=@saved_V; | |
1035 | |
1036 &Xtail_avx(\&body_20_39); | |
1037 &Xtail_avx(\&body_20_39); | |
1038 &Xtail_avx(\&body_20_39); | |
1039 | |
# Clear the vector registers with vzeroall, then update the context.
1040 $code.=<<___; | |
1041 vzeroall | |
1042 | |
1043 add 0($ctx),$A # update context | |
1044 add 4($ctx),@T[0] | |
1045 add 8($ctx),$C | |
1046 mov $A,0($ctx) | |
1047 add 12($ctx),$D | |
1048 mov @T[0],4($ctx) | |
1049 add 16($ctx),$E | |
1050 mov $C,8($ctx) | |
1051 mov $D,12($ctx) | |
1052 mov $E,16($ctx) | |
1053 ___ | |
# Win64 only: restore xmm6-xmm10 from the save area.
1054 $code.=<<___ if ($win64); | |
1055 movaps 64+0(%rsp),%xmm6 | |
1056 movaps 64+16(%rsp),%xmm7 | |
1057 movaps 64+32(%rsp),%xmm8 | |
1058 movaps 64+48(%rsp),%xmm9 | |
1059 movaps 64+64(%rsp),%xmm10 | |
1060 ___ | |
# Epilogue: restore the registers pushed at entry and return.
1061 $code.=<<___; | |
1062 lea `64+($win64?5*16:0)`(%rsp),%rsi | |
1063 mov 0(%rsi),%r12 | |
1064 mov 8(%rsi),%rbp | |
1065 mov 16(%rsi),%rbx | |
1066 lea 24(%rsi),%rsp | |
1067 .Lepilogue_avx: | |
1068 ret | |
1069 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx | |
1070 ___ | |
1071 } | |
# Constant pool, 64-byte aligned. Each of the four K_xx_yy round constants is
# replicated across four 32-bit lanes; the last row is a 16-byte byte-index
# mask (labelled "pbswap mask" -- presumably the shuffle control used to
# byte-swap input words; confirm at the use site).
1072 $code.=<<___; | |
1073 .align 64 | |
1074 K_XX_XX: | |
1075 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 | |
1076 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 | |
1077 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 | |
1078 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 | |
1079 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask | |
1080 ___ | |
1081 }}} | |
# Embed an identification string in the generated output, followed by
# 64-byte alignment. The backslash before '@' keeps Perl from interpolating
# @openssl inside this interpolating heredoc.
1082 $code.=<<___; | |
1083 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | |
1084 .align 64 | |
1085 ___ | |
1086 | |
# Win64 structured-exception-handling support: two language-specific
# handlers (se_handler, ssse3_handler) plus the .pdata/.xdata tables that
# attach them to the generated functions. Emitted only when $win64 is set.
1087 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
1088 # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
1089 if ($win64) { | |
# The four handler arguments, in the registers they arrive in.
1090 $rec="%rcx"; | |
1091 $frame="%rdx"; | |
1092 $context="%r8"; | |
1093 $disp="%r9"; | |
1094 | |
# One heredoc emits both handlers, their shared tail and the .pdata triples.
#
# se_handler: if context->Rip lies in [.Lprologue, .Lepilogue) it loads the
#   saved stack pointer from 64 bytes above context->Rsp, adds 32, and copies
#   the saved %rbx/%rbp/%r12/%r13 found below that address into the context;
#   then it jumps to .Lcommon_seh_tail.
# ssse3_handler: same idea, but the prologue/epilogue bounds are read from
#   HandlerData[0]/[1] as offsets added to disp->ImageBase. Inside the range
#   it copies 10 quadwords from context->Rsp+64 into the context at offset
#   512 (annotated &context.Xmm6), advances %rax by 24+64+5*16 and restores
#   %rbx/%rbp/%r12; it then falls through into .Lcommon_seh_tail.
# .Lcommon_seh_tail: writes Rsp/Rsi/Rdi back into the context, copies 154
#   quadwords of it to disp->ContextRecord, calls RtlVirtualUnwind and
#   returns 1 (ExceptionContinueSearch).
# ".long 0xa548f3fc" is annotated in the emitted code as "cld; rep movsq".
# NOTE(review): the emitted comment "restore cotnext->R12" is a typo for
# "context"; it is part of the generated text, so it is left untouched here.
1095 $code.=<<___; | |
1096 .extern __imp_RtlVirtualUnwind | |
1097 .type se_handler,\@abi-omnipotent | |
1098 .align 16 | |
1099 se_handler: | |
1100 push %rsi | |
1101 push %rdi | |
1102 push %rbx | |
1103 push %rbp | |
1104 push %r12 | |
1105 push %r13 | |
1106 push %r14 | |
1107 push %r15 | |
1108 pushfq | |
1109 sub \$64,%rsp | |
1110 | |
1111 mov 120($context),%rax # pull context->Rax | |
1112 mov 248($context),%rbx # pull context->Rip | |
1113 | |
1114 lea .Lprologue(%rip),%r10 | |
1115 cmp %r10,%rbx # context->Rip<.Lprologue | |
1116 jb .Lcommon_seh_tail | |
1117 | |
1118 mov 152($context),%rax # pull context->Rsp | |
1119 | |
1120 lea .Lepilogue(%rip),%r10 | |
1121 cmp %r10,%rbx # context->Rip>=.Lepilogue | |
1122 jae .Lcommon_seh_tail | |
1123 | |
1124 mov `16*4`(%rax),%rax # pull saved stack pointer | |
1125 lea 32(%rax),%rax | |
1126 | |
1127 mov -8(%rax),%rbx | |
1128 mov -16(%rax),%rbp | |
1129 mov -24(%rax),%r12 | |
1130 mov -32(%rax),%r13 | |
1131 mov %rbx,144($context) # restore context->Rbx | |
1132 mov %rbp,160($context) # restore context->Rbp | |
1133 mov %r12,216($context) # restore context->R12 | |
1134 mov %r13,224($context) # restore context->R13 | |
1135 | |
1136 jmp .Lcommon_seh_tail | |
1137 .size se_handler,.-se_handler | |
1138 | |
1139 .type ssse3_handler,\@abi-omnipotent | |
1140 .align 16 | |
1141 ssse3_handler: | |
1142 push %rsi | |
1143 push %rdi | |
1144 push %rbx | |
1145 push %rbp | |
1146 push %r12 | |
1147 push %r13 | |
1148 push %r14 | |
1149 push %r15 | |
1150 pushfq | |
1151 sub \$64,%rsp | |
1152 | |
1153 mov 120($context),%rax # pull context->Rax | |
1154 mov 248($context),%rbx # pull context->Rip | |
1155 | |
1156 mov 8($disp),%rsi # disp->ImageBase | |
1157 mov 56($disp),%r11 # disp->HandlerData | |
1158 | |
1159 mov 0(%r11),%r10d # HandlerData[0] | |
1160 lea (%rsi,%r10),%r10 # prologue label | |
1161 cmp %r10,%rbx # context->Rip<prologue label | |
1162 jb .Lcommon_seh_tail | |
1163 | |
1164 mov 152($context),%rax # pull context->Rsp | |
1165 | |
1166 mov 4(%r11),%r10d # HandlerData[1] | |
1167 lea (%rsi,%r10),%r10 # epilogue label | |
1168 cmp %r10,%rbx # context->Rip>=epilogue label | |
1169 jae .Lcommon_seh_tail | |
1170 | |
1171 lea 64(%rax),%rsi | |
1172 lea 512($context),%rdi # &context.Xmm6 | |
1173 mov \$10,%ecx | |
1174 .long 0xa548f3fc # cld; rep movsq | |
1175 lea `24+64+5*16`(%rax),%rax # adjust stack pointer | |
1176 | |
1177 mov -8(%rax),%rbx | |
1178 mov -16(%rax),%rbp | |
1179 mov -24(%rax),%r12 | |
1180 mov %rbx,144($context) # restore context->Rbx | |
1181 mov %rbp,160($context) # restore context->Rbp | |
1182 mov %r12,216($context) # restore cotnext->R12 | |
1183 | |
1184 .Lcommon_seh_tail: | |
1185 mov 8(%rax),%rdi | |
1186 mov 16(%rax),%rsi | |
1187 mov %rax,152($context) # restore context->Rsp | |
1188 mov %rsi,168($context) # restore context->Rsi | |
1189 mov %rdi,176($context) # restore context->Rdi | |
1190 | |
1191 mov 40($disp),%rdi # disp->ContextRecord | |
1192 mov $context,%rsi # context | |
1193 mov \$154,%ecx # sizeof(CONTEXT) | |
1194 .long 0xa548f3fc # cld; rep movsq | |
1195 | |
1196 mov $disp,%rsi | |
1197 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
1198 mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
1199 mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
1200 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
1201 mov 40(%rsi),%r10 # disp->ContextRecord | |
1202 lea 56(%rsi),%r11 # &disp->HandlerData | |
1203 lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
1204 mov %r10,32(%rsp) # arg5 | |
1205 mov %r11,40(%rsp) # arg6 | |
1206 mov %r12,48(%rsp) # arg7 | |
1207 mov %rcx,56(%rsp) # arg8, (NULL) | |
1208 call *__imp_RtlVirtualUnwind(%rip) | |
1209 | |
1210 mov \$1,%eax # ExceptionContinueSearch | |
1211 add \$64,%rsp | |
1212 popfq | |
1213 pop %r15 | |
1214 pop %r14 | |
1215 pop %r13 | |
1216 pop %r12 | |
1217 pop %rbp | |
1218 pop %rbx | |
1219 pop %rdi | |
1220 pop %rsi | |
1221 ret | |
1222 .size ssse3_handler,.-ssse3_handler | |
1223 | |
1224 .section .pdata | |
1225 .align 4 | |
1226 .rva .LSEH_begin_sha1_block_data_order | |
1227 .rva .LSEH_end_sha1_block_data_order | |
1228 .rva .LSEH_info_sha1_block_data_order | |
1229 .rva .LSEH_begin_sha1_block_data_order_ssse3 | |
1230 .rva .LSEH_end_sha1_block_data_order_ssse3 | |
1231 .rva .LSEH_info_sha1_block_data_order_ssse3 | |
1232 ___ | |
# The .pdata (begin, end, info) triple for the AVX function is added only
# when $avx is set, matching the conditional .xdata entry further down.
1233 $code.=<<___ if ($avx); | |
1234 .rva .LSEH_begin_sha1_block_data_order_avx | |
1235 .rva .LSEH_end_sha1_block_data_order_avx | |
1236 .rva .LSEH_info_sha1_block_data_order_avx | |
1237 ___ | |
# .xdata: each info record is ".byte 9,0,0,0" followed by the handler's RVA.
# The SSSE3 record additionally carries two RVAs (prologue and epilogue
# labels); ssse3_handler reads them as HandlerData[0] and HandlerData[1].
1238 $code.=<<___; | |
1239 .section .xdata | |
1240 .align 8 | |
1241 .LSEH_info_sha1_block_data_order: | |
1242 .byte 9,0,0,0 | |
1243 .rva se_handler | |
1244 .LSEH_info_sha1_block_data_order_ssse3: | |
1245 .byte 9,0,0,0 | |
1246 .rva ssse3_handler | |
1247 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] | |
1248 ___ | |
# The AVX function reuses ssse3_handler, supplying its own prologue and
# epilogue labels as HandlerData[].
1249 $code.=<<___ if ($avx); | |
1250 .LSEH_info_sha1_block_data_order_avx: | |
1251 .byte 9,0,0,0 | |
1252 .rva ssse3_handler | |
1253 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] | |
1254 ___ | |
1255 } | |
1256 | |
1257 #################################################################### | |
1258 | |
# Final pass: replace every `...` span in the generated text with the result
# of evaluating its contents as Perl (e.g. offsets written as `16*4`), then
# write the finished text to STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# Buffered write errors -- and, when STDOUT is a pipe, a failing reader --
# only become visible when the handle is closed, so the result of close
# must not be ignored: a truncated or untranslated output has to fail the
# build instead of passing silently.
close STDOUT or die "error closing STDOUT: $!";
OLD | NEW |