OLD | NEW |
| (Empty) |
1 #!/usr/bin/env perl | |
2 | |
3 # ==================================================================== | |
4 # [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 # project. The module is, however, dual licensed under OpenSSL and | |
6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 # details see http://www.openssl.org/~appro/cryptogams/. | |
8 # ==================================================================== | |
9 | |
10 # "[Re]written" was achieved in two major overhauls. In 2004 BODY_* | |
11 # functions were re-implemented to address P4 performance issue [see | |
12 # commentary below], and in 2006 the rest was rewritten in order to | |
13 # gain freedom to liberate licensing terms. | |
14 | |
15 # January, September 2004. | |
16 # | |
17 # It was noted that Intel IA-32 C compiler generates code which | |
18 # performs ~30% *faster* on P4 CPU than original *hand-coded* | |
19 # SHA1 assembler implementation. To address this problem (and | |
20 # prove that humans are still better than machines:-), the | |
21 # original code was overhauled, which resulted in following | |
22 # performance changes: | |
23 # | |
24 # compared with original compared with Intel cc | |
25 # assembler impl. generated code | |
26 # Pentium -16% +48% | |
27 # PIII/AMD +8% +16% | |
28 # P4 +85%(!) +45% | |
29 # | |
30 # As you can see Pentium came out as loser:-( Yet I reckoned that | |
31 # improvement on P4 outweighs the loss and incorporated this | |
32 # re-tuned code to 0.9.7 and later. | |
33 # ---------------------------------------------------------------- | |
34 # <appro@fy.chalmers.se> | |
35 | |
36 # August 2009. | |
37 # | |
38 # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as | |
39 # '(c&d) + (b&(c^d))', which allows to accumulate partial results | |
40 # and lighten "pressure" on scratch registers. This resulted in | |
41 # >12% performance improvement on contemporary AMD cores (with no | |
42 # degradation on other CPUs:-). Also, the code was revised to maximize | |
43 # "distance" between instructions producing input to 'lea' instruction | |
44 # and the 'lea' instruction itself, which is essential for Intel Atom | |
45 # core and resulted in ~15% improvement. | |
46 | |
47 # October 2010. | |
48 # | |
49 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | |
50 # is to offload message schedule denoted by Wt in NIST specification, | |
51 # or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, | |
52 # and in SSE2 context was first explored by Dean Gaudet in 2004, see | |
53 # http://arctic.org/~dean/crypto/sha1.html. Since then several things | |
54 # have changed that made it interesting again: | |
55 # | |
56 # a) XMM units became faster and wider; | |
57 # b) instruction set became more versatile; | |
58 # c) an important observation was made by Max Locktyukhin, which made | |
59 # it possible to reduce amount of instructions required to perform | |
60 # the operation in question, for further details see | |
61 # http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. | |
62 | |
63 # April 2011. | |
64 # | |
65 # Add AVX code path, probably most controversial... The thing is that | |
66 # switch to AVX alone improves performance by as little as 4% in | |
67 # comparison to SSSE3 code path. But below result doesn't look like | |
68 # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as | |
69 # pair of µ-ops, and it's the additional µ-ops, two per round, that | |
70 # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded | |
71 # as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with | |
72 # equivalent 'sh[rl]d' that is responsible for the impressive 5.1 | |
73 # cycles per processed byte. But 'sh[rl]d' is not something that used | |
74 # to be fast, nor does it appear to be fast in upcoming Bulldozer | |
75 # [according to its optimization manual]. Which is why AVX code path | |
76 # is guarded by *both* AVX and synthetic bit denoting Intel CPUs. | |
77 # One can argue that it's unfair to AMD, but without 'sh[rl]d' it | |
78 # makes no sense to keep the AVX code path. If somebody feels that | |
79 # strongly, it's probably more appropriate to discuss possibility of | |
80 # using vector rotate XOP on AMD... | |
81 | |
82 ###################################################################### | |
83 # Current performance is summarized in following table. Numbers are | |
84 # CPU clock cycles spent to process single byte (less is better). | |
85 # | |
86 # x86 SSSE3 AVX | |
87 # Pentium 15.7 - | |
88 # PIII 11.5 - | |
89 # P4 10.6 - | |
90 # AMD K8 7.1 - | |
91 # Core2 7.3 6.1/+20% - | |
92 # Atom 12.5 9.5(*)/+32% - | |
93 # Westmere 7.3 5.6/+30% - | |
94 # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70% | |
95 # | |
96 # (*) Loop is 1056 instructions long and expected result is ~8.25. | |
97 # It remains mystery [to me] why ILP is limited to 1.7. | |
98 # | |
99 # (**) As per above comment, the result is for AVX *plus* sh[rl]d. | |
100 | |
# Work out the directory part of the path this script was invoked with,
# add it and its ../../perlasm sibling to @INC, and load the x86 perlasm
# layer that provides the instruction-emitting subs used below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The third argument is true when the last command-line argument is
# literally "386".
# NOTE(review): the meaning of asm_init's arguments is defined in
# x86asm.pl, which is not visible here - confirm there.
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");

# $xmm gates generation of the SSSE3 code path, $ymm of the AVX one.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

# Probe the assembler behind $ENV{CC}; the captured "major.minor" string
# is compared numerically against 2.19.
$ymm=1 if ($xmm &&
        `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
            =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
        $1>=2.19);      # first version supporting AVX

# Same probe for nasm, only attempted for the "win32n" flavour and only
# if the GNU assembler probe above did not already enable AVX.
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
        $1>=2.03);      # first version supporting AVX

&external_label("OPENSSL_ia32cap_P") if ($xmm);


# Register allocation for the integer-only code path.
$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$E="edi";
$T="esi";
$tmp1="ebp";

# Working-variable ring; the round loops rotate it after every round so
# that no register-to-register moves are needed for a..e renaming.
@V=($A,$B,$C,$D,$E,$T);

$alt=0; # 1 denotes alternative IALU implementation, which performs
        # 8% *worse* on P4, same on Westmere and Atom, 2% better on
        # Sandy Bridge...
135 | |
# Emit the instructions for one of rounds 0..15 of the integer code path.
#
# Arguments: ($n,$a,$b,$c,$d,$e,$f) - round number followed by the six
# register names currently playing the roles a..e plus the spare "f"
# register that receives this round's result (it is "a" in the next
# round).  $tmp1 is used as scratch.  Nothing is returned; the sub only
# emits code through the perlasm layer.
sub BODY_00_15
    {
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("00_15 $n");

    &mov($f,$c);                    # f to hold F_00_19(b,c,d)
     if ($n==0)  { &mov($tmp1,$a); }
     else        { &mov($a,$tmp1); }
    &rotl($tmp1,5);                 # tmp1=ROTATE(a,5)
     &xor($f,$d);
    &add($tmp1,$e);                 # tmp1+=e;
     &mov($e,&swtmp($n%16));        # e becomes volatile and is loaded
                                    # with xi, also note that e becomes
                                    # f in next round...
    &and($f,$b);
    &rotr($b,2);                    # b=ROTATE(b,30)
     &xor($f,$d);                   # f holds F_00_19(b,c,d)
    &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));  # tmp1+=K_00_19+xi

    # The last round of this group hands over to BODY_16_19, which
    # expects the next message word pre-fetched and the sum in $f.
    if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
                  &add($f,$tmp1); } # f+=tmp1
    else        { &add($tmp1,$f); } # f becomes a in next round
    &mov($tmp1,$a)                  if ($alt && $n==15);
    }
161 | |
# Emit the instructions for one of rounds 16..19 of the integer code
# path: same round function and constant as BODY_00_15, but the message
# word is now produced by Xupdate (xor of four earlier words, rotated
# left by one) and written back to the X[] slots on the stack.
#
# Arguments: ($n,$a,$b,$c,$d,$e,$f) as for BODY_00_15; $f enters holding
# the pre-fetched X[$n%16] and leaves holding the new "a".  The variant
# emitted depends on the global $alt.
sub BODY_16_19
    {
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("16_19 $n");

    if ($alt) {
        &xor($c,$d);
         &xor($f,&swtmp(($n+2)%16));    # f to hold Xupdate(xi,xa,xb,xc,xd)
        &and($tmp1,$c);                 # tmp1 to hold F_00_19(b,c,d), b&=c^d
         &xor($f,&swtmp(($n+8)%16));
        &xor($tmp1,$d);                 # tmp1=F_00_19(b,c,d)
         &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &add($e,$tmp1);                # e+=F_00_19(b,c,d)
        &xor($c,$d);                    # restore $c
         &mov($tmp1,$a);                # b in next round
        &rotr($b,$n==16?2:7);           # b=ROTATE(b,30)
         &mov(&swtmp($n%16),$f);        # xi=f
        &rotl($a,5);                    # ROTATE(a,5)
         &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
        &mov($e,&swtmp(($n+1)%16));     # pre-fetch f for next round
         &add($f,$a);                   # f+=ROTATE(a,5)
    } else {
        &mov($tmp1,$c);                 # tmp1 to hold F_00_19(b,c,d)
         &xor($f,&swtmp(($n+2)%16));    # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);
         &xor($f,&swtmp(($n+8)%16));
        &and($tmp1,$b);
         &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &xor($tmp1,$d);                # tmp1=F_00_19(b,c,d)
        &add($e,$tmp1);                 # e+=F_00_19(b,c,d)
         &mov($tmp1,$a);
        &rotr($b,2);                    # b=ROTATE(b,30)
         &mov(&swtmp($n%16),$f);        # xi=f
        &rotl($tmp1,5);                 # ROTATE(a,5)
         &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
        &mov($e,&swtmp(($n+1)%16));     # pre-fetch f for next round
         &add($f,$tmp1);                # f+=ROTATE(a,5)
    }
    }
204 | |
# Emit the instructions for one round using the b^c^d round function.
# The round constant is chosen from $n: 0x6ed9eba1 when $n<40, otherwise
# 0xca62c1d6, so the same sub serves both groups of rounds that use
# this function.
#
# Arguments: ($n,$a,$b,$c,$d,$e,$f) as for BODY_00_15.  Near the end of
# the block the write-back of X[] (skipped when $n>=77) and the
# pre-fetch of the next word (skipped when $n==79) are omitted.  With
# $alt set, round 39 additionally primes $tmp1 for BODY_40_59 and round
# 79 undoes the in-place rotation of $a.
sub BODY_20_39
    {
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    local $K=($n<40)?0x6ed9eba1:0xca62c1d6;

    &comment("20_39 $n");

    if ($alt) {
        &xor($tmp1,$c);                 # tmp1 to hold F_20_39(b,c,d), b^=c
         &xor($f,&swtmp(($n+2)%16));    # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);                 # tmp1 holds F_20_39(b,c,d)
         &xor($f,&swtmp(($n+8)%16));
        &add($e,$tmp1);                 # e+=F_20_39(b,c,d)
         &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &mov($tmp1,$a);                # b in next round
        &rotr($b,7);                    # b=ROTATE(b,30)
         &mov(&swtmp($n%16),$f)         if($n<77);# xi=f
        &rotl($a,5);                    # ROTATE(a,5)
         &xor($b,$c)                    if($n==39);# warm up for BODY_40_59
        &and($tmp1,$b)                  if($n==39);
         &lea($f,&DWP($K,$f,$e));       # f+=e+K_XX_YY
        &mov($e,&swtmp(($n+1)%16))      if($n<79);# pre-fetch f for next round
         &add($f,$a);                   # f+=ROTATE(a,5)
        &rotr($a,5)                     if ($n==79);
    } else {
        &mov($tmp1,$b);                 # tmp1 to hold F_20_39(b,c,d)
         &xor($f,&swtmp(($n+2)%16));    # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$c);
         &xor($f,&swtmp(($n+8)%16));
        &xor($tmp1,$d);                 # tmp1 holds F_20_39(b,c,d)
         &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &add($e,$tmp1);                # e+=F_20_39(b,c,d)
        &rotr($b,2);                    # b=ROTATE(b,30)
         &mov($tmp1,$a);
        &rotl($tmp1,5);                 # ROTATE(a,5)
         &mov(&swtmp($n%16),$f)         if($n<77);# xi=f
        &lea($f,&DWP($K,$f,$e));        # f+=e+K_XX_YY
         &mov($e,&swtmp(($n+1)%16))     if($n<79);# pre-fetch f for next round
        &add($f,$tmp1);                 # f+=ROTATE(a,5)
    }
    }
248 | |
# Emit the instructions for one of rounds 40..59 of the integer code
# path.  The round function is accumulated as two separate terms,
# b&(c^d) and c&d, each added into the running sum on its own.
#
# Arguments: ($n,$a,$b,$c,$d,$e,$f) as for BODY_00_15.  With $alt set
# the sub expects $tmp1 to already hold b&(c^d) on entry and prepares
# it for the following round while $n<59.
sub BODY_40_59
    {
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("40_59 $n");

    if ($alt) {
        &add($e,$tmp1);                 # e+=b&(c^d)
         &xor($f,&swtmp(($n+2)%16));    # f to hold Xupdate(xi,xa,xb,xc,xd)
        &mov($tmp1,$d);
         &xor($f,&swtmp(($n+8)%16));
        &xor($c,$d);                    # restore $c
         &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &and($tmp1,$c);
        &rotr($b,7);                    # b=ROTATE(b,30)
         &add($e,$tmp1);                # e+=c&d
        &mov($tmp1,$a);                 # b in next round
         &mov(&swtmp($n%16),$f);        # xi=f
        &rotl($a,5);                    # ROTATE(a,5)
         &xor($b,$c)                    if ($n<59);
        &and($tmp1,$b)                  if ($n<59);# tmp1 to hold F_40_59(b,c,d)
         &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
        &mov($e,&swtmp(($n+1)%16));     # pre-fetch f for next round
         &add($f,$a);                   # f+=ROTATE(a,5)
    } else {
        &mov($tmp1,$c);                 # tmp1 to hold F_40_59(b,c,d)
         &xor($f,&swtmp(($n+2)%16));    # f to hold Xupdate(xi,xa,xb,xc,xd)
        &xor($tmp1,$d);
         &xor($f,&swtmp(($n+8)%16));
        &and($tmp1,$b);
         &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &add($tmp1,$e);                # b&(c^d)+=e
        &rotr($b,2);                    # b=ROTATE(b,30)
         &mov($e,$a);                   # e becomes volatile
        &rotl($e,5);                    # ROTATE(a,5)
         &mov(&swtmp($n%16),$f);        # xi=f
        &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
         &mov($tmp1,$c);
        &add($f,$e);                    # f+=ROTATE(a,5)
         &and($tmp1,$d);
        &mov($e,&swtmp(($n+1)%16));     # pre-fetch f for next round
         &add($f,$tmp1);                # f+=c&d
    }
    }
295 | |
# Generate the public entry point sha1_block_data_order.  Its three
# word parameters are the SHA_CTX pointer, the input pointer and the
# number of 64-byte blocks.
&function_begin("sha1_block_data_order");
if ($xmm) {
  # Run-time dispatch: read two words of OPENSSL_ia32cap_P and jump to
  # the SIMD implementations when the required capability bits are set,
  # otherwise fall through to the integer-only code at label "x86".
  &static_label("ssse3_shortcut");
  &static_label("avx_shortcut")     if ($ymm);
  &static_label("K_XX_XX");

    &call   (&label("pic_point"));  # make it PIC!
  &set_label("pic_point");
    &blindpop($tmp1);
    &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
    # $tmp1 must point at K_XX_XX when the shortcut labels are entered.
    &lea    ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));

    &mov    ($A,&DWP(0,$T));
    &mov    ($D,&DWP(4,$T));
    &test   ($D,1<<9);              # check SSSE3 bit
    &jz     (&label("x86"));
    &test   ($A,1<<24);             # check FXSR bit
    &jz     (&label("x86"));
    if ($ymm) {
        &and    ($D,1<<28);         # mask AVX bit
        &and    ($A,1<<30);         # mask "Intel CPU" bit
        &or     ($A,$D);
        &cmp    ($A,1<<28|1<<30);
        &je     (&label("avx_shortcut"));
    }
    &jmp    (&label("ssse3_shortcut"));
  &set_label("x86",16);
}
    &mov($tmp1,&wparam(0)); # SHA_CTX *c
    &mov($T,&wparam(1));    # const void *input
    &mov($A,&wparam(2));    # size_t num
    &stack_push(16+3);      # allocate X[16]
    &shl($A,6);
    &add($A,$T);
    &mov(&wparam(2),$A);    # pointer beyond the end of input
    &mov($E,&DWP(16,$tmp1));# pre-load E
    &jmp(&label("loop"));

&set_label("loop",16);

    # copy input chunk to X, but reversing byte order!
    for ($i=0; $i<16; $i+=4)
        {
        &mov($A,&DWP(4*($i+0),$T));
        &mov($B,&DWP(4*($i+1),$T));
        &mov($C,&DWP(4*($i+2),$T));
        &mov($D,&DWP(4*($i+3),$T));
        &bswap($A);
        &bswap($B);
        &bswap($C);
        &bswap($D);
        &mov(&swtmp($i+0),$A);
        &mov(&swtmp($i+1),$B);
        &mov(&swtmp($i+2),$C);
        &mov(&swtmp($i+3),$D);
        }
    &mov(&wparam(1),$T);    # redundant in 1st spin

    &mov($A,&DWP(0,$tmp1)); # load SHA_CTX
    &mov($B,&DWP(4,$tmp1));
    &mov($C,&DWP(8,$tmp1));
    &mov($D,&DWP(12,$tmp1));
    # E is pre-loaded

    # Unroll all 80 rounds; @V is rotated by one position after every
    # round so each round sees the registers under their new roles.
    for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
    for(;$i<20;$i++)        { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
    for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

    # Generation-time sanity check of the final register rotation that
    # the context update below relies on.
    (($V[5] eq $D) and ($V[0] eq $E)) or die;   # double-check

    &mov($tmp1,&wparam(0)); # re-load SHA_CTX*
    &mov($D,&wparam(1));    # D is last "T" and is discarded

    &add($E,&DWP(0,$tmp1)); # E is last "A"...
    &add($T,&DWP(4,$tmp1));
    &add($A,&DWP(8,$tmp1));
    &add($B,&DWP(12,$tmp1));
    &add($C,&DWP(16,$tmp1));

    &mov(&DWP(0,$tmp1),$E); # update SHA_CTX
     &add($D,64);           # advance input pointer
    &mov(&DWP(4,$tmp1),$T);
     &cmp($D,&wparam(2));   # have we reached the end yet?
    &mov(&DWP(8,$tmp1),$A);
     &mov($E,$C);           # C is last "E" which needs to be "pre-loaded"
    &mov(&DWP(12,$tmp1),$B);
     &mov($T,$D);           # input pointer
    &mov(&DWP(16,$tmp1),$C);
    &jb(&label("loop"));

    &stack_pop(16+3);
&function_end("sha1_block_data_order");
390 | |
391 if ($xmm) { | |
392 ###################################################################### | |
393 # The SSSE3 implementation. | |
394 # | |
395 # %xmm[0-7] are used as ring @X[] buffer containing quadruples of last | |
396 # 32 elements of the message schedule or Xupdate outputs. First 4 | |
397 # quadruples are simply byte-swapped input, next 4 are calculated | |
398 # according to method originally suggested by Dean Gaudet (modulo | |
399 # being implemented in SSSE3). Once 8 quadruples or 32 elements are | |
400 # collected, it switches to routine proposed by Max Locktyukhin. | |
401 # | |
402 # Calculations inevitably require temporary registers, and there are | |
403 # no %xmm registers left to spare. For this reason part of the ring | |
404 # buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring | |
405 # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - | |
406 # X[-5], and X[4] - X[-4]... | |
407 # | |
408 # Another notable optimization is aggressive stack frame compression | |
409 # aiming to minimize amount of 9-byte instructions... | |
410 # | |
411 # Yet another notable optimization is "jumping" $B variable. It means | |
412 # that there is no register permanently allocated for $B value. This | |
413 # allowed to eliminate one instruction from body_20_39... | |
414 # | |
# Generator state for the SSSE3 code path.  These lexicals are read and
# modified by the Xupdate_*/Xloop/Xtail subs and by the instruction
# strings returned from body_00_19/body_20_39/body_40_59.
my $Xi=4;               # 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;                # hash round
my @T=($T,$tmp1);
my $inp;

# Rotations are issued through these code refs so that the round
# bodies can reference them as &$_rol / &$_ror.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

&function_begin("_sha1_block_data_order_ssse3");
    &call   (&label("pic_point"));  # make it PIC!
    &set_label("pic_point");
    &blindpop($tmp1);
    &lea    ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("ssse3_shortcut");

    &movdqa (@X[3],&QWP(0,$tmp1));      # K_00_19
    &movdqa (@X[4],&QWP(16,$tmp1));     # K_20_39
    &movdqa (@X[5],&QWP(32,$tmp1));     # K_40_59
    &movdqa (@X[6],&QWP(48,$tmp1));     # K_60_79
    &movdqa (@X[2],&QWP(64,$tmp1));     # pbswap mask

    &mov    ($E,&wparam(0));            # load argument block
    &mov    ($inp=@T[1],&wparam(1));
    &mov    ($D,&wparam(2));
    &mov    (@T[0],"esp");

    # stack frame layout
    #
    # +0    X[0]+K  X[1]+K  X[2]+K  X[3]+K  # XMM->IALU xfer area
    #       X[4]+K  X[5]+K  X[6]+K  X[7]+K
    #       X[8]+K  X[9]+K  X[10]+K X[11]+K
    #       X[12]+K X[13]+K X[14]+K X[15]+K
    #
    # +64   X[0]    X[1]    X[2]    X[3]    # XMM->XMM backtrace area
    #       X[4]    X[5]    X[6]    X[7]
    #       X[8]    X[9]    X[10]   X[11]   # even borrowed for K_00_19
    #
    # +112  K_20_39 K_20_39 K_20_39 K_20_39 # constants
    #       K_40_59 K_40_59 K_40_59 K_40_59
    #       K_60_79 K_60_79 K_60_79 K_60_79
    #       K_00_19 K_00_19 K_00_19 K_00_19
    #       pbswap mask
    #
    # +192  ctx                             # argument block
    # +196  inp
    # +200  end
    # +204  esp
    &sub    ("esp",208);
    &and    ("esp",-64);

    &movdqa (&QWP(112+0,"esp"),@X[4]);  # copy constants
    &movdqa (&QWP(112+16,"esp"),@X[5]);
    &movdqa (&QWP(112+32,"esp"),@X[6]);
    &shl    ($D,6);                     # len*64
    &movdqa (&QWP(112+48,"esp"),@X[3]);
    &add    ($D,$inp);                  # end of input
    &movdqa (&QWP(112+64,"esp"),@X[2]);
    &add    ($inp,64);
    &mov    (&DWP(192+0,"esp"),$E);     # save argument block
    &mov    (&DWP(192+4,"esp"),$inp);
    &mov    (&DWP(192+8,"esp"),$D);
    &mov    (&DWP(192+12,"esp"),@T[0]); # save original %esp

    &mov    ($A,&DWP(0,$E));            # load context
    &mov    ($B,&DWP(4,$E));
    &mov    ($C,&DWP(8,$E));
    &mov    ($D,&DWP(12,$E));
    &mov    ($E,&DWP(16,$E));
    &mov    (@T[0],$B);                 # magic seed

    # $inp was already advanced by 64 above, hence the negative offsets.
    &movdqu (@X[-4&7],&QWP(-64,$inp));  # load input to %xmm[0-3]
    &movdqu (@X[-3&7],&QWP(-48,$inp));
    &movdqu (@X[-2&7],&QWP(-32,$inp));
    &movdqu (@X[-1&7],&QWP(-16,$inp));
    &pshufb (@X[-4&7],@X[2]);           # byte swap
    &pshufb (@X[-3&7],@X[2]);
    &pshufb (@X[-2&7],@X[2]);
    &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
    &pshufb (@X[-1&7],@X[2]);
    &paddd  (@X[-4&7],@X[3]);           # add K_00_19
    &paddd  (@X[-3&7],@X[3]);
    &paddd  (@X[-2&7],@X[3]);
    &movdqa (&QWP(0,"esp"),@X[-4&7]);   # X[]+K xfer to IALU
    &psubd  (@X[-4&7],@X[3]);           # restore X[]
    &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
    &psubd  (@X[-3&7],@X[3]);
    &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
    &psubd  (@X[-2&7],@X[3]);
    &movdqa (@X[0],@X[-3&7]);
    &jmp    (&label("loop"));
507 | |
508 ###################################################################### | |
509 # SSE instruction sequence is first broken to groups of independent | |
510 # instructions, independent in respect to their inputs and shifter | |
511 # (not all architectures have more than one). Then IALU instructions | |
512 # are "knitted in" between the SSE groups. Distance is maintained for | |
513 # SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer | |
514 # [which allegedly also implements SSSE3]... | |
515 # | |
516 # Temporary registers usage. X[2] is volatile at the entry and at the | |
517 # end is restored from backtrace ring buffer. X[3] is expected to | |
518 # contain current K_XX_XX constant and is used to calculate X[-1]+K | |
519 # from previous round, it becomes volatile the moment the value is | |
520 # saved to stack for transfer to IALU. X[4] becomes volatile whenever | |
521 # X[-4] is accumulated and offloaded to backtrace ring buffer, at the | |
522 # end it is loaded with next K_XX_XX [which becomes X[3] in next | |
523 # round]... | |
524 # | |
# Emit one 4-lane SIMD message-schedule step for schedule words 16..31,
# interleaved with four scalar rounds.
#
# Argument: a code ref ($body) that returns a list of Perl source
# strings, one per scalar instruction of a single round.  It is called
# four times and the resulting strings are eval'ed one by one between
# the SSE instructions emitted here, so that the integer and SIMD
# streams end up "knitted" together in the output.
#
# Side effects: advances $Xi and rotates @X by one position; the
# eval'ed strings update the round state themselves.
sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
{ use integer;                          # ($Xi)/5 below must be an integer division
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
  my ($a,$b,$c,$d,$e);                  # assigned by the eval'ed round strings

     eval(shift(@insns));
     eval(shift(@insns));
    &palignr(@X[0],@X[-4&7],8);         # compose "X[-14]" in "X[0]"
    &movdqa (@X[2],@X[-1&7]);
     eval(shift(@insns));
     eval(shift(@insns));

      &paddd    (@X[3],@X[-1&7]);
      &movdqa   (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
     eval(shift(@insns));
     eval(shift(@insns));
    &psrldq (@X[2],4);                  # "X[-3]", 3 dwords
     eval(shift(@insns));
     eval(shift(@insns));
    &pxor   (@X[0],@X[-4&7]);           # "X[0]"^="X[-16]"
     eval(shift(@insns));
     eval(shift(@insns));

    &pxor   (@X[2],@X[-2&7]);           # "X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &pxor   (@X[0],@X[2]);              # "X[0]"^="X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));
      &movdqa   (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);   # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));

    &movdqa (@X[4],@X[0]);
    &movdqa (@X[2],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &pslldq (@X[4],12);                 # "X[0]"<<96, extract one dword
    &paddd  (@X[0],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &psrld  (@X[2],31);
     eval(shift(@insns));
     eval(shift(@insns));
    &movdqa (@X[3],@X[4]);
     eval(shift(@insns));
     eval(shift(@insns));

    &psrld  (@X[4],30);
    &por    (@X[0],@X[2]);              # "X[0]"<<<=1
     eval(shift(@insns));
     eval(shift(@insns));
      &movdqa   (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);   # restore X[] from backtrace buffer
     eval(shift(@insns));
     eval(shift(@insns));

    &pslld  (@X[3],2);
    &pxor   (@X[0],@X[4]);
     eval(shift(@insns));
     eval(shift(@insns));
      &movdqa   (@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));    # K_XX_XX
     eval(shift(@insns));
     eval(shift(@insns));

    &pxor   (@X[0],@X[3]);              # "X[0]"^=("X[0]"<<96)<<<2
      &movdqa   (@X[1],@X[-2&7])    if ($Xi<7);
     eval(shift(@insns));
     eval(shift(@insns));

     foreach (@insns) { eval; }         # remaining instructions [if any]

  $Xi++;    push(@X,shift(@X));         # "rotate" X[]
}
608 | |
# Emit one 4-lane SIMD message-schedule step for schedule words 32..79,
# interleaved with four scalar rounds.
#
# Argument: a code ref ($body) returning the per-instruction source
# strings of one scalar round; it is called four times and the strings
# are eval'ed between the SSE instructions emitted here.  The trailing
# "# rol"/"# ror"/"# body_20_39" notes mark where those instructions
# fall when $body is body_20_39.
#
# Side effects: advances $Xi and rotates @X by one position; the
# eval'ed strings update the round state themselves.
sub Xupdate_ssse3_32_79()
{ use integer;                          # $Xi/5 below must be an integer division
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
  my ($a,$b,$c,$d,$e);                  # assigned by the eval'ed round strings

    &movdqa (@X[2],@X[-1&7])    if ($Xi==8);
     eval(shift(@insns));               # body_20_39
    &pxor   (@X[0],@X[-4&7]);           # "X[0]"="X[-32]"^"X[-16]"
    &palignr(@X[2],@X[-2&7],8);         # compose "X[-6]"
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));               # rol

    &pxor   (@X[0],@X[-7&7]);           # "X[0]"^="X[-28]"
      &movdqa   (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);   # save X[] to backtrace buffer
     eval(shift(@insns));
     eval(shift(@insns));
     # A new round constant is needed every fifth step.
     if ($Xi%5) {
      &movdqa   (@X[4],@X[3]);          # "perpetuate" K_XX_XX...
     } else {                           # ... or load next one
      &movdqa   (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
     }
      &paddd    (@X[3],@X[-1&7]);
     eval(shift(@insns));               # ror
     eval(shift(@insns));

    &pxor   (@X[0],@X[2]);              # "X[0]"^="X[-6]"
     eval(shift(@insns));               # body_20_39
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));               # rol

    &movdqa (@X[2],@X[0]);
      &movdqa   (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);   # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));               # ror
     eval(shift(@insns));

    &pslld  (@X[0],2);
     eval(shift(@insns));               # body_20_39
     eval(shift(@insns));
    &psrld  (@X[2],30);
     eval(shift(@insns));
     eval(shift(@insns));               # rol
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));               # ror
     eval(shift(@insns));

    &por    (@X[0],@X[2]);              # "X[0]"<<<=2
     eval(shift(@insns));               # body_20_39
     eval(shift(@insns));
      &movdqa   (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);   # restore X[] from backtrace buffer
     eval(shift(@insns));
     eval(shift(@insns));               # rol
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));               # ror
      &movdqa   (@X[3],@X[0])   if ($Xi<19);
     eval(shift(@insns));

     foreach (@insns) { eval; }         # remaining instructions

  $Xi++;    push(@X,shift(@X));         # "rotate" X[]
}
676 | |
# Emit the last SIMD step of a block: hand the final X[]+K quadruple
# over to the integer unit while four scalar rounds run, then test
# whether more input remains.  If the saved input pointer equals the
# saved end pointer the emitted code jumps to label "done"; otherwise
# it reloads K_00_19 and the byte-swap mask and starts loading the next
# 64-byte block.
#
# Argument: a code ref ($body) returning the per-instruction source
# strings of one scalar round (called four times).
#
# Side effects: resets $Xi to 0 for the following Xloop_ssse3 calls;
# the eval'ed strings update the round state themselves.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);                  # assigned by the eval'ed round strings

     eval(shift(@insns));
      &paddd    (@X[3],@X[-1&7]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

      &movdqa   (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);   # X[]+K xfer IALU

     foreach (@insns) { eval; }         # remaining instructions

    &mov    ($inp=@T[1],&DWP(192+4,"esp"));
    &cmp    ($inp,&DWP(192+8,"esp"));
    &je     (&label("done"));

    &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
    &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
    &movdqu (@X[-4&7],&QWP(0,$inp));    # load input
    &movdqu (@X[-3&7],&QWP(16,$inp));
    &movdqu (@X[-2&7],&QWP(32,$inp));
    &movdqu (@X[-1&7],&QWP(48,$inp));
    &add    ($inp,64);
    &pshufb (@X[-4&7],@X[2]);           # byte swap
    &mov    (&DWP(192+4,"esp"),$inp);
    &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot

  $Xi=0;
}
711 | |
# Emit four scalar rounds of the current block while preparing one
# quadruple of the next block's input: byte-swap the following
# quadruple, add the round constant held in @X[3] to the current one,
# store the sum to xfer slot $Xi on the stack for the integer unit and
# then subtract the constant again to recover the plain X[] value.
#
# Argument: a code ref ($body) returning the per-instruction source
# strings of one scalar round (called four times).
#
# Side effects: increments $Xi; @X is not rotated here.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);                  # assigned by the eval'ed round strings

     eval(shift(@insns));
     eval(shift(@insns));
    &pshufb (@X[($Xi-3)&7],@X[2]);
     eval(shift(@insns));
     eval(shift(@insns));
    &paddd  (@X[($Xi-4)&7],@X[3]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);   # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));
    &psubd  (@X[($Xi-4)&7],@X[3]);

    foreach (@insns) { eval; }
  $Xi++;
}
736 | |
# Emit four scalar rounds with no SIMD work interleaved; used for the
# final rounds of the last input block, when there is no further block
# to pre-process.
#
# Argument: a code ref ($body) returning the per-instruction source
# strings of one scalar round (called four times).
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);                  # assigned by the eval'ed round strings

    foreach (@insns) { eval; }
}
745 | |
# Return the scalar instruction sequence of one round in the 0..19
# range as a list of Perl source strings (10 entries).  The strings are
# not executed here: the Xupdate_*/Xloop/Xtail subs eval them one at a
# time, in whose scope $a..$e, @V, @T, $j, $_rol and $_ror resolve.
# The first string binds $a..$e to the current @V; the last one
# advances $j and rotates @V and @T for the next round.
sub body_00_19 () {
    (
    '($a,$b,$c,$d,$e)=@V;'.
    '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
    '&xor ($c,$d);',
    '&mov (@T[1],$a);', # $b in next round
    '&$_rol ($a,5);',
    '&and (@T[0],$c);', # ($b&($c^$d))
    '&xor ($c,$d);', # restore $c
    '&xor (@T[0],$d);',
    '&add ($e,$a);',
    '&$_ror ($b,$j?7:2);', # $b>>>2
    '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    );
}
761 | |
# Return the scalar instruction sequence of one b^c^d round as a list
# of Perl source strings (8 entries).  The strings are not executed
# here: the Xupdate_*/Xloop/Xtail subs eval them one at a time, in
# whose scope $a..$e, @V, @T, $j, $_rol and $_ror resolve.  $j is
# post-incremented inside the first string; the last string rotates @V
# and @T for the next round.
sub body_20_39 () {
    (
    '($a,$b,$c,$d,$e)=@V;'.
    '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
    '&xor (@T[0],$d);', # ($b^$d)
    '&mov (@T[1],$a);', # $b in next round
    '&$_rol ($a,5);',
    '&xor (@T[0],$c);', # ($b^$d^$c)
    '&add ($e,$a);',
    '&$_ror ($b,7);', # $b>>>2
    '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    );
}
775 | |
# Return the scalar instruction sequence of one round in the 40..59
# range as a list of Perl source strings (12 entries).  The round
# function is accumulated as two terms, ($c&$d) and ($b&($c^$d)), each
# added to $e separately.  The strings are not executed here: the
# Xupdate_*/Xloop/Xtail subs eval them one at a time, in whose scope
# $a..$e, @V, @T, $j, $_rol and $_ror resolve.  The last string
# rotates @V and @T for the next round.
sub body_40_59 () {
    (
    '($a,$b,$c,$d,$e)=@V;'.
    '&mov (@T[1],$c);',
    '&xor ($c,$d);',
    '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
    '&and (@T[1],$d);',
    '&and (@T[0],$c);', # ($b&($c^$d))
    '&$_ror ($b,7);', # $b>>>2
    '&add ($e,@T[1]);',
    '&mov (@T[1],$a);', # $b in next round
    '&$_rol ($a,5);',
    '&add ($e,@T[0]);',
    '&xor ($c,$d);', # restore $c
    '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    );
}
793 | |
# Main SSSE3 loop.  Each generator call below emits four scalar rounds,
# so the twenty calls up to and including the Xloop_ssse3 ones cover
# all 80 rounds of one block: 5 x body_00_19, 5 x body_20_39,
# 5 x body_40_59 and 5 x body_20_39 again.
&set_label("loop",16);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_32_79(\&body_00_19);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xuplast_ssse3_80(\&body_20_39);    # can jump to "done"

    # The "done" path below re-generates the same final rounds, so the
    # generator state at this point is remembered and restored there.
                $saved_j=$j; @saved_V=@V;

    &Xloop_ssse3(\&body_20_39);
    &Xloop_ssse3(\&body_20_39);
    &Xloop_ssse3(\&body_20_39);

    &mov    (@T[1],&DWP(192,"esp"));    # update context
    &add    ($A,&DWP(0,@T[1]));
    &add    (@T[0],&DWP(4,@T[1]));      # $b
    &add    ($C,&DWP(8,@T[1]));
    &mov    (&DWP(0,@T[1]),$A);
    &add    ($D,&DWP(12,@T[1]));
    &mov    (&DWP(4,@T[1]),@T[0]);
    &add    ($E,&DWP(16,@T[1]));
    &mov    (&DWP(8,@T[1]),$C);
    &mov    ($B,@T[0]);
    &mov    (&DWP(12,@T[1]),$D);
    &mov    (&DWP(16,@T[1]),$E);
    &movdqa (@X[0],@X[-3&7]);

    &jmp    (&label("loop"));

# Last block: finish the remaining rounds without loading more input,
# update the context and restore the caller's stack pointer.
&set_label("done",16);      $j=$saved_j; @V=@saved_V;

    &Xtail_ssse3(\&body_20_39);
    &Xtail_ssse3(\&body_20_39);
    &Xtail_ssse3(\&body_20_39);

    &mov    (@T[1],&DWP(192,"esp"));    # update context
    &add    ($A,&DWP(0,@T[1]));
    &mov    ("esp",&DWP(192+12,"esp")); # restore %esp
    &add    (@T[0],&DWP(4,@T[1]));      # $b
    &add    ($C,&DWP(8,@T[1]));
    &mov    (&DWP(0,@T[1]),$A);
    &add    ($D,&DWP(12,@T[1]));
    &mov    (&DWP(4,@T[1]),@T[0]);
    &add    ($E,&DWP(16,@T[1]));
    &mov    (&DWP(8,@T[1]),$C);
    &mov    (&DWP(12,@T[1]),$D);
    &mov    (&DWP(16,@T[1]),$E);

&function_end("_sha1_block_data_order_ssse3");
855 | |
# AVX flavour of the SHA-1 block routine.  Everything down to the matching
# closing brace is generated only when $ymm is true.  The lexicals declared
# here are shared with the Xupdate_avx_*, Xuplast_avx_80, Xloop_avx and
# Xtail_avx helpers that are defined later inside this same block.
856 if ($ymm) { | |
857 my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded | |
858 my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 | |
859 my @V=($A,$B,$C,$D,$E); | |
860 my $j=0; # hash round | |
861 my @T=($T,$tmp1); | |
862 my $inp; | |
863 | |
# Rotate helpers: each forwards to shld/shrd with the destination register
# repeated as the second operand, which makes the double shift act as a
# rotate.  NOTE(review): nothing in this chunk calls them; presumably they
# are picked up by the body_* generators defined elsewhere -- confirm.
864 my $_rol=sub { &shld(@_[0],@_) }; | |
865 my $_ror=sub { &shrd(@_[0],@_) }; | |
866 | |
867 &function_begin("_sha1_block_data_order_avx"); | |
# The call/pop pair leaves the address of "pic_point" in $tmp1; the lea then
# turns it into the run-time address of the K_XX_XX table.
868 &call (&label("pic_point")); # make it PIC! | |
869 &set_label("pic_point"); | |
870 &blindpop($tmp1); | |
871 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); | |
# NOTE(review): "avx_shortcut" is not referenced anywhere in this chunk;
# presumably it is an entry point for dispatch code elsewhere -- confirm.
872 &set_label("avx_shortcut"); | |
873 &vzeroall(); | |
874 | |
# Pull the five 16-byte rows of the K_XX_XX table into vector registers.
875 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 | |
876 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 | |
877 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 | |
878 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 | |
879 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask | |
880 | |
881 &mov ($E,&wparam(0)); # load argument block | |
882 &mov ($inp=@T[1],&wparam(1)); | |
883 &mov ($D,&wparam(2)); | |
# @T[0] carries the caller's %esp until it is stored at 192+12(%esp) below.
884 &mov (@T[0],"esp"); | |
885 | |
886 # stack frame layout | |
887 # | |
888 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area | |
889 # X[4]+K X[5]+K X[6]+K X[7]+K | |
890 # X[8]+K X[9]+K X[10]+K X[11]+K | |
891 # X[12]+K X[13]+K X[14]+K X[15]+K | |
892 # | |
893 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area | |
894 # X[4] X[5] X[6] X[7] | |
895 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 | |
896 # | |
897 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants | |
898 # K_40_59 K_40_59 K_40_59 K_40_59 | |
899 # K_60_79 K_60_79 K_60_79 K_60_79 | |
900 # K_00_19 K_00_19 K_00_19 K_00_19 | |
901 # pbswap mask | |
902 # | |
903 # +192 ctx # argument block | |
904 # +196 inp | |
905 # +200 end | |
906 # +204 esp | |
# Reserve 208 bytes for the frame described above, then round %esp down to a
# 64-byte boundary.
907 &sub ("esp",208); | |
908 &and ("esp",-64); | |
909 | |
910 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants | |
911 &vmovdqa(&QWP(112+16,"esp"),@X[5]); | |
912 &vmovdqa(&QWP(112+32,"esp"),@X[6]); | |
913 &shl ($D,6); # len*64 | |
914 &vmovdqa(&QWP(112+48,"esp"),@X[3]); | |
915 &add ($D,$inp); # end of input | |
916 &vmovdqa(&QWP(112+64,"esp"),@X[2]); | |
# $inp is kept one 64-byte block ahead of the data being hashed; the loads
# further down therefore use negative offsets.
917 &add ($inp,64); | |
918 &mov (&DWP(192+0,"esp"),$E); # save argument block | |
919 &mov (&DWP(192+4,"esp"),$inp); | |
920 &mov (&DWP(192+8,"esp"),$D); | |
921 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp | |
922 | |
923 &mov ($A,&DWP(0,$E)); # load context | |
924 &mov ($B,&DWP(4,$E)); | |
925 &mov ($C,&DWP(8,$E)); | |
926 &mov ($D,&DWP(12,$E)); | |
927 &mov ($E,&DWP(16,$E)); | |
928 &mov (@T[0],$B); # magic seed | |
929 | |
930 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] | |
931 &vmovdqu(@X[-3&7],&QWP(-48,$inp)); | |
932 &vmovdqu(@X[-2&7],&QWP(-32,$inp)); | |
933 &vmovdqu(@X[-1&7],&QWP(-16,$inp)); | |
934 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | |
935 &vpshufb(@X[-3&7],@X[-3&7],@X[2]); | |
936 &vpshufb(@X[-2&7],@X[-2&7],@X[2]); | |
937 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot | |
938 &vpshufb(@X[-1&7],@X[-1&7],@X[2]); | |
# Only three of the four X+K vectors are produced here.  The fourth is formed
# by the first Xupdate_avx_16_31 call, which adds @X[-1&7] into @X[3] and
# stores the sum into the transfer area.
939 &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 | |
940 &vpaddd (@X[1],@X[-3&7],@X[3]); | |
941 &vpaddd (@X[2],@X[-2&7],@X[3]); | |
942 &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU | |
943 &vmovdqa(&QWP(0+16,"esp"),@X[1]); | |
944 &vmovdqa(&QWP(0+32,"esp"),@X[2]); | |
945 &jmp (&label("loop")); | |
946 | |
# Xupdate_avx_16_31($body)
#
# $body is a code reference.  It is called four times and everything it
# returns is queued in @insns; the queued strings are then eval'ed a few at
# a time in between the AVX instructions emitted directly by this sub, so
# the two instruction streams come out interleaved.
#
# The vector side builds the next @X[0] (see the per-line comments), adds
# @X[-1&7] into @X[3] and stores that sum into the X[]+K transfer area, and
# saves @X[-4&7] into the backtrace buffer.  Near the end it loads a K_XX_XX
# row into @X[4]; after the rotation below that register is addressed as
# @X[3] by the next call.
#
# Side effects on the shared generator state: $Xi is incremented and @X is
# rotated by one position.
947 sub Xupdate_avx_16_31() # recall that $Xi starts with 4 | |
# "use integer" makes the ($Xi)/5 index computation below truncate.
948 { use integer; | |
949 my $body = shift; | |
950 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | |
# NOTE(review): these five lexicals are not used directly in this sub;
# presumably they are scratch names for the eval'ed strings -- confirm
# against the body_* generators, which are outside this chunk.
951 my ($a,$b,$c,$d,$e); | |
952 | |
953 eval(shift(@insns)); | |
954 eval(shift(@insns)); | |
955 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | |
956 eval(shift(@insns)); | |
957 eval(shift(@insns)); | |
958 | |
959 &vpaddd (@X[3],@X[3],@X[-1&7]); | |
960 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to b
acktrace buffer | |
961 eval(shift(@insns)); | |
962 eval(shift(@insns)); | |
963 &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords | |
964 eval(shift(@insns)); | |
965 eval(shift(@insns)); | |
966 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | |
967 eval(shift(@insns)); | |
968 eval(shift(@insns)); | |
969 | |
970 &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" | |
971 eval(shift(@insns)); | |
972 eval(shift(@insns)); | |
973 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to
IALU | |
974 eval(shift(@insns)); | |
975 eval(shift(@insns)); | |
976 | |
977 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" | |
978 eval(shift(@insns)); | |
979 eval(shift(@insns)); | |
980 eval(shift(@insns)); | |
981 eval(shift(@insns)); | |
982 | |
983 &vpsrld (@X[2],@X[0],31); | |
984 eval(shift(@insns)); | |
985 eval(shift(@insns)); | |
986 eval(shift(@insns)); | |
987 eval(shift(@insns)); | |
988 | |
989 &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword | |
990 &vpaddd (@X[0],@X[0],@X[0]); | |
991 eval(shift(@insns)); | |
992 eval(shift(@insns)); | |
993 eval(shift(@insns)); | |
994 eval(shift(@insns)); | |
995 | |
996 &vpsrld (@X[3],@X[4],30); | |
997 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 | |
998 eval(shift(@insns)); | |
999 eval(shift(@insns)); | |
1000 eval(shift(@insns)); | |
1001 eval(shift(@insns)); | |
1002 | |
1003 &vpslld (@X[4],@X[4],2); | |
1004 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);
# restore X[] from backtrace buffer | |
1005 eval(shift(@insns)); | |
1006 eval(shift(@insns)); | |
1007 &vpxor (@X[0],@X[0],@X[3]); | |
1008 eval(shift(@insns)); | |
1009 eval(shift(@insns)); | |
1010 eval(shift(@insns)); | |
1011 eval(shift(@insns)); | |
1012 | |
1013 &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 | |
1014 eval(shift(@insns)); | |
1015 eval(shift(@insns)); | |
1016 &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_X
X | |
1017 eval(shift(@insns)); | |
1018 eval(shift(@insns)); | |
1019 | |
1020 foreach (@insns) { eval; } # remaining instructions [if any] | |
1021 | |
1022 $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
1023 } | |
1024 | |
# Xupdate_avx_32_79($body)
#
# Same interleaving scheme as Xupdate_avx_16_31: $body (a code reference) is
# called four times, the returned strings are queued in @insns and eval'ed
# in small groups between the AVX instructions emitted here.
#
# The vector side XORs the terms named in the per-line comments into @X[0]
# and rotates the result left by 2.  It also saves @X[-4&7] to the backtrace
# buffer, prepares the round constant in @X[4] (copied from @X[3], or
# reloaded from the stack whenever $Xi is a multiple of 5), and stores
# @X[3]+@X[-1&7] into the X[]+K transfer area.  The reload from the
# backtrace buffer is skipped once $Xi has reached 19.
#
# Side effects on the shared generator state: $Xi is incremented and @X is
# rotated by one position.
1025 sub Xupdate_avx_32_79() | |
# "use integer" makes the $Xi/5 index computation below truncate.
1026 { use integer; | |
1027 my $body = shift; | |
1028 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions | |
# NOTE(review): not used directly in this sub; presumably scratch names for
# the eval'ed strings -- confirm against the body_* generators.
1029 my ($a,$b,$c,$d,$e); | |
1030 | |
1031 &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | |
1032 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | |
1033 eval(shift(@insns)); # body_20_39 | |
1034 eval(shift(@insns)); | |
1035 eval(shift(@insns)); | |
1036 eval(shift(@insns)); # rol | |
1037 | |
1038 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | |
1039 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X
[] to backtrace buffer | |
1040 eval(shift(@insns)); | |
1041 eval(shift(@insns)); | |
1042 if ($Xi%5) { | |
1043 &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... | |
1044 } else { # ... or load next one | |
1045 &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); | |
1046 } | |
1047 &vpaddd (@X[3],@X[3],@X[-1&7]); | |
1048 eval(shift(@insns)); # ror | |
1049 eval(shift(@insns)); | |
1050 | |
1051 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" | |
1052 eval(shift(@insns)); # body_20_39 | |
1053 eval(shift(@insns)); | |
1054 eval(shift(@insns)); | |
1055 eval(shift(@insns)); # rol | |
1056 | |
1057 &vpsrld (@X[2],@X[0],30); | |
1058 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to
IALU | |
1059 eval(shift(@insns)); | |
1060 eval(shift(@insns)); | |
1061 eval(shift(@insns)); # ror | |
1062 eval(shift(@insns)); | |
1063 | |
1064 &vpslld (@X[0],@X[0],2); | |
1065 eval(shift(@insns)); # body_20_39 | |
1066 eval(shift(@insns)); | |
1067 eval(shift(@insns)); | |
1068 eval(shift(@insns)); # rol | |
1069 eval(shift(@insns)); | |
1070 eval(shift(@insns)); | |
1071 eval(shift(@insns)); # ror | |
1072 eval(shift(@insns)); | |
1073 | |
1074 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 | |
1075 eval(shift(@insns)); # body_20_39 | |
1076 eval(shift(@insns)); | |
1077 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);
# restore X[] from backtrace buffer | |
1078 eval(shift(@insns)); | |
1079 eval(shift(@insns)); # rol | |
1080 eval(shift(@insns)); | |
1081 eval(shift(@insns)); | |
1082 eval(shift(@insns)); # ror | |
1083 eval(shift(@insns)); | |
1084 | |
1085 foreach (@insns) { eval; } # remaining instructions | |
1086 | |
1087 $Xi++; push(@X,shift(@X)); # "rotate" X[] | |
1088 } | |
1089 | |
# Xuplast_avx_80($body)
#
# Final vector step for the current block.  It stores the last X[]+K vector
# (@X[3]+@X[-1&7]) into the transfer area while emitting the four rounds
# obtained from $body.  It then compares the saved input pointer with the
# saved end pointer and emits a jump to "done" when they are equal.
#
# On the fall-through path it reloads K_00_19 and the byte-swap mask from
# the stack, loads the next 64 input bytes, advances and re-saves the input
# pointer, byte-swaps the first of the four vectors (the other three are
# swapped by Xloop_avx), and parks K_00_19 in the last backtrace slot.
#
# Side effect on the shared generator state: $Xi is reset to 0.
1090 sub Xuplast_avx_80() | |
1091 { use integer; | |
1092 my $body = shift; | |
1093 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
# NOTE(review): not used directly in this sub; presumably scratch names for
# the eval'ed strings -- confirm against the body_* generators.
1094 my ($a,$b,$c,$d,$e); | |
1095 | |
1096 eval(shift(@insns)); | |
1097 &vpaddd (@X[3],@X[3],@X[-1&7]); | |
1098 eval(shift(@insns)); | |
1099 eval(shift(@insns)); | |
1100 eval(shift(@insns)); | |
1101 eval(shift(@insns)); | |
1102 | |
1103 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IAL
U | |
1104 | |
1105 foreach (@insns) { eval; } # remaining instructions | |
1106 | |
# The saved input pointer runs one block ahead (see the prologue), so it
# equals the end pointer exactly when the block just scheduled is the last.
1107 &mov ($inp=@T[1],&DWP(192+4,"esp")); | |
1108 &cmp ($inp,&DWP(192+8,"esp")); | |
1109 &je (&label("done")); | |
1110 | |
1111 &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 | |
1112 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask | |
1113 &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input | |
1114 &vmovdqu(@X[-3&7],&QWP(16,$inp)); | |
1115 &vmovdqu(@X[-2&7],&QWP(32,$inp)); | |
1116 &vmovdqu(@X[-1&7],&QWP(48,$inp)); | |
1117 &add ($inp,64); | |
1118 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | |
1119 &mov (&DWP(192+4,"esp"),$inp); | |
1120 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot | |
1121 | |
1122 $Xi=0; | |
1123 } | |
1124 | |
# Xloop_avx($body)
#
# Emits the four rounds obtained from $body while preparing the next block's
# input, which Xuplast_avx_80 has already loaded.  Each call byte-swaps the
# vector @X[($Xi-3)&7] using the mask in @X[2], forms
# @X[$Xi&7] = @X[($Xi-4)&7] + @X[3], and stores that sum at offset 16*$Xi of
# the X[]+K transfer area.
#
# Side effect on the shared generator state: $Xi is incremented.  Unlike the
# Xupdate_* helpers, @X is not rotated.
1125 sub Xloop_avx() | |
1126 { use integer; | |
1127 my $body = shift; | |
1128 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
# NOTE(review): not used directly in this sub; presumably scratch names for
# the eval'ed strings -- confirm against the body_* generators.
1129 my ($a,$b,$c,$d,$e); | |
1130 | |
1131 eval(shift(@insns)); | |
1132 eval(shift(@insns)); | |
1133 &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | |
1134 eval(shift(@insns)); | |
1135 eval(shift(@insns)); | |
1136 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); | |
1137 eval(shift(@insns)); | |
1138 eval(shift(@insns)); | |
1139 eval(shift(@insns)); | |
1140 eval(shift(@insns)); | |
1141 &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to
IALU | |
1142 eval(shift(@insns)); | |
1143 eval(shift(@insns)); | |
1144 | |
1145 foreach (@insns) { eval; } | |
1146 $Xi++; | |
1147 } | |
1148 | |
# Xtail_avx($body)
#
# Emits the four rounds obtained from $body with no vector instructions
# mixed in.  The "done" path uses it for the last rounds, when there is no
# further input block to prepare.
1149 sub Xtail_avx() | |
1150 { use integer; | |
1151 my $body = shift; | |
1152 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
# NOTE(review): not used directly in this sub; presumably scratch names for
# the eval'ed strings -- confirm against the body_* generators.
1153 my ($a,$b,$c,$d,$e); | |
1154 | |
1155 foreach (@insns) { eval; } | |
1156 } | |
1157 | |
# Main loop of the AVX routine.  Each helper call below emits four rounds
# (four invocations of the body generator), so the 17 Xupdate/Xuplast calls
# plus the 3 Xloop calls (or the 3 Xtail calls on the "done" path) add up
# to 80 rounds per block.
1158 &set_label("loop",16); | |
1159 &Xupdate_avx_16_31(\&body_00_19); | |
1160 &Xupdate_avx_16_31(\&body_00_19); | |
1161 &Xupdate_avx_16_31(\&body_00_19); | |
1162 &Xupdate_avx_16_31(\&body_00_19); | |
1163 &Xupdate_avx_32_79(\&body_00_19); | |
1164 &Xupdate_avx_32_79(\&body_20_39); | |
1165 &Xupdate_avx_32_79(\&body_20_39); | |
1166 &Xupdate_avx_32_79(\&body_20_39); | |
1167 &Xupdate_avx_32_79(\&body_20_39); | |
1168 &Xupdate_avx_32_79(\&body_20_39); | |
1169 &Xupdate_avx_32_79(\&body_40_59); | |
1170 &Xupdate_avx_32_79(\&body_40_59); | |
1171 &Xupdate_avx_32_79(\&body_40_59); | |
1172 &Xupdate_avx_32_79(\&body_40_59); | |
1173 &Xupdate_avx_32_79(\&body_40_59); | |
1174 &Xupdate_avx_32_79(\&body_20_39); | |
1175 &Xuplast_avx_80(\&body_20_39); # can jump to "done" | |
1176 | |
# Snapshot the generator state here.  The "done" path is emitted later in
# the file but is entered from the jump inside Xuplast_avx_80, so it must
# resume from this same $j and @V; both are restored at the "done" label.
1177 $saved_j=$j; @saved_V=@V; | |
1178 | |
1179 &Xloop_avx(\&body_20_39); | |
1180 &Xloop_avx(\&body_20_39); | |
1181 &Xloop_avx(\&body_20_39); | |
1182 | |
# Add the five working values into the context at the pointer saved at
# 192(%esp), store the sums back and keep them in registers for the next
# block.  The "b" value is carried in @T[0] and is copied into $B as well.
1183 &mov (@T[1],&DWP(192,"esp")); # update context | |
1184 &add ($A,&DWP(0,@T[1])); | |
1185 &add (@T[0],&DWP(4,@T[1])); # $b | |
1186 &add ($C,&DWP(8,@T[1])); | |
1187 &mov (&DWP(0,@T[1]),$A); | |
1188 &add ($D,&DWP(12,@T[1])); | |
1189 &mov (&DWP(4,@T[1]),@T[0]); | |
1190 &add ($E,&DWP(16,@T[1])); | |
1191 &mov (&DWP(8,@T[1]),$C); | |
1192 &mov ($B,@T[0]); | |
1193 &mov (&DWP(12,@T[1]),$D); | |
1194 &mov (&DWP(16,@T[1]),$E); | |
1195 | |
1196 &jmp (&label("loop")); | |
1197 | |
1198 &set_label("done",16); $j=$saved_j; @V=@saved_V; | |
1199 | |
1200 &Xtail_avx(\&body_20_39); | |
1201 &Xtail_avx(\&body_20_39); | |
1202 &Xtail_avx(\&body_20_39); | |
1203 | |
1204 &vzeroall(); | |
1205 | |
# Final context update.  %esp is switched back to the value saved at
# 192+12(%esp) part-way through; that is safe because the remaining
# instructions address the context through @T[1], not through %esp.
1206 &mov (@T[1],&DWP(192,"esp")); # update context | |
1207 &add ($A,&DWP(0,@T[1])); | |
1208 &mov ("esp",&DWP(192+12,"esp")); # restore %esp | |
1209 &add (@T[0],&DWP(4,@T[1])); # $b | |
1210 &add ($C,&DWP(8,@T[1])); | |
1211 &mov (&DWP(0,@T[1]),$A); | |
1212 &add ($D,&DWP(12,@T[1])); | |
1213 &mov (&DWP(4,@T[1]),@T[0]); | |
1214 &add ($E,&DWP(16,@T[1])); | |
1215 &mov (&DWP(8,@T[1]),$C); | |
1216 &mov (&DWP(12,@T[1]),$D); | |
1217 &mov (&DWP(16,@T[1]),$E); | |
1218 &function_end("_sha1_block_data_order_avx"); | |
1219 } | |
# Constant table addressed through the "K_XX_XX" label.  The AVX prologue
# reads its five rows at byte offsets 0, 16, 32, 48 and 64.  Each of the
# first four rows repeats one round constant four times; the last row is
# the byte-swap mask that is passed to vpshufb.
1220 &set_label("K_XX_XX",64); | |
1221 &data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19 | |
1222 &data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39 | |
1223 &data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59 | |
1224 &data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79 | |
1225 &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask | |
1226 } | |
# Append an identification string to the generated module, then finish.
# NOTE(review): asm_finish is defined outside this file chunk; presumably
# it writes out the accumulated assembly -- confirm in the perlasm library.
1227 &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
1228 | |
1229 &asm_finish(); | |
OLD | NEW |