| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # ==================================================================== | |
| 4 # [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
| 5 # project. The module is, however, dual licensed under OpenSSL and | |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. | |
| 8 # ==================================================================== | |
| 9 | |
| 10 # "[Re]written" was achieved in two major overhauls. In 2004 BODY_* | |
| 11 # functions were re-implemented to address P4 performance issue [see | |
| 12 # commentary below], and in 2006 the rest was rewritten in order to | |
| 13 # gain freedom to liberate licensing terms. | |
| 14 | |
| 15 # January, September 2004. | |
| 16 # | |
| 17 # It was noted that Intel IA-32 C compiler generates code which | |
| 18 # performs ~30% *faster* on P4 CPU than original *hand-coded* | |
| 19 # SHA1 assembler implementation. To address this problem (and | |
| 20 # prove that humans are still better than machines:-), the | |
| 21 # original code was overhauled, which resulted in following | |
| 22 # performance changes: | |
| 23 # | |
| 24 # compared with original compared with Intel cc | |
| 25 # assembler impl. generated code | |
| 26 # Pentium -16% +48% | |
| 27 # PIII/AMD +8% +16% | |
| 28 # P4 +85%(!) +45% | |
| 29 # | |
| 30 # As you can see Pentium came out as loser:-( Yet I reckoned that | |
| 31 # improvement on P4 outweighs the loss and incorporated this | |
| 32 # re-tuned code to 0.9.7 and later. | |
| 33 # ---------------------------------------------------------------- | |
| 34 # <appro@fy.chalmers.se> | |
| 35 | |
| 36 # August 2009. | |
| 37 # | |
| 38 # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as | |
| 39 # '(c&d) + (b&(c^d))', which allows to accumulate partial results | |
| 40 # and lighten "pressure" on scratch registers. This resulted in | |
| 41 # >12% performance improvement on contemporary AMD cores (with no | |
| 42 # degradation on other CPUs:-). Also, the code was revised to maximize | |
| 43 # "distance" between instructions producing input to 'lea' instruction | |
| 44 # and the 'lea' instruction itself, which is essential for Intel Atom | |
| 45 # core and resulted in ~15% improvement. | |
| 46 | |
| 47 # October 2010. | |
| 48 # | |
| 49 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it | |
| 50 # is to offload message schedule denoted by Wt in NIST specification, | |
| 51 # or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, | |
| 52 # and in SSE2 context was first explored by Dean Gaudet in 2004, see | |
| 53 # http://arctic.org/~dean/crypto/sha1.html. Since then several things | |
| 54 # have changed that made it interesting again: | |
| 55 # | |
| 56 # a) XMM units became faster and wider; | |
| 57 # b) instruction set became more versatile; | |
| 58 # c) an important observation was made by Max Locktyukhin, which made | |
| 59 # it possible to reduce amount of instructions required to perform | |
| 60 # the operation in question, for further details see | |
| 61 # http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. | |
| 62 | |
| 63 # April 2011. | |
| 64 # | |
| 65 # Add AVX code path, probably most controversial... The thing is that | |
| 66 # switch to AVX alone improves performance by as little as 4% in | |
| 67 # comparison to SSSE3 code path. But below result doesn't look like | |
| 68 # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as | |
| 69 # pair of µ-ops, and it's the additional µ-ops, two per round, that | |
| 70 # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded | |
| 71 # as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with | |
| 72 # equivalent 'sh[rl]d' that is responsible for the impressive 5.1 | |
| 73 # cycles per processed byte. But 'sh[rl]d' is not something that used | |
| 74 # to be fast, nor does it appear to be fast in upcoming Bulldozer | |
| 75 # [according to its optimization manual]. Which is why AVX code path | |
| 76 # is guarded by *both* AVX and synthetic bit denoting Intel CPUs. | |
| 77 # One can argue that it's unfair to AMD, but without 'sh[rl]d' it | |
| 78 # makes no sense to keep the AVX code path. If somebody feels that | |
| 79 # strongly, it's probably more appropriate to discuss possibility of | |
| 80 # using vector rotate XOP on AMD... | |
| 81 | |
| 82 ###################################################################### | |
| 83 # Current performance is summarized in following table. Numbers are | |
| 84 # CPU clock cycles spent to process single byte (less is better). | |
| 85 # | |
| 86 # x86 SSSE3 AVX | |
| 87 # Pentium 15.7 - | |
| 88 # PIII 11.5 - | |
| 89 # P4 10.6 - | |
| 90 # AMD K8 7.1 - | |
| 91 # Core2 7.3 6.1/+20% - | |
| 92 # Atom 12.5 9.5(*)/+32% - | |
| 93 # Westmere 7.3 5.6/+30% - | |
| 94 # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70% | |
| 95 # | |
| 96 # (*) Loop is 1056 instructions long and expected result is ~8.25. | |
| 97 # It remains mystery [to me] why ILP is limited to 1.7. | |
| 98 # | |
| 99 # (**) As per above comment, the result is for AVX *plus* sh[rl]d. | |
| 100 | |
| 101 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
| 102 push(@INC,"${dir}","${dir}../../perlasm"); | |
| 103 require "x86asm.pl"; | |
| 104 | |
| 105 &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); | |
| 106 | |
| 107 $xmm=$ymm=0; | |
| 108 for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } | |
| 109 | |
| 110 $ymm=1 if ($xmm && | |
| 111 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | |
| 112 =~ /GNU assembler version ([2-9]\.[0-9]+)/ && | |
| 113 $1>=2.19); # first version supporting AVX; NOTE(review): $1 is compared as a decimal number, so a captured "2.9" would also pass | |
| 114 | |
| 115 $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && | |
| 116 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && | |
| 117 $1>=2.03); # first version supporting AVX; same decimal-number comparison as the GNU assembler check | |
| 118 | |
| 119 &external_label("OPENSSL_ia32cap_P") if ($xmm); | |
| 120 | |
| 121 | |
| 122 $A="eax"; | |
| 123 $B="ebx"; | |
| 124 $C="ecx"; | |
| 125 $D="edx"; | |
| 126 $E="edi"; | |
| 127 $T="esi"; | |
| 128 $tmp1="ebp"; | |
| 129 | |
| 130 @V=($A,$B,$C,$D,$E,$T); | |
| 131 | |
| 132 $alt=0; # 1 denotes alternative IALU implementation, which performs | |
| 133 # 8% *worse* on P4, same on Westmere and Atom, 2% better on | |
| 134 # Sandy Bridge... | |
| 135 | |
| 136 sub BODY_00_15 | |
| 137 { | |
| 138 local($n,$a,$b,$c,$d,$e,$f)=@_; | |
| 139 | |
| 140 &comment("00_15 $n"); | |
| 141 | |
| 142 &mov($f,$c); # f to hold F_00_19(b,c,d), built up below as ((c^d)&b)^d | |
| 143 if ($n==0) { &mov($tmp1,$a); } | |
| 144 else { &mov($a,$tmp1); } | |
| 145 &rotl($tmp1,5); # tmp1=ROTATE(a,5) | |
| 146 &xor($f,$d); | |
| 147 &add($tmp1,$e); # tmp1+=e; | |
| 148 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded | |
| 149 # with xi, also note that e becomes | |
| 150 # f in next round... | |
| 151 &and($f,$b); | |
| 152 &rotr($b,2); # b=ROTATE(b,30) | |
| 153 &xor($f,$d); # f holds F_00_19(b,c,d) | |
| 154 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi | |
| 155 | |
| 156 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round | |
| 157 &add($f,$tmp1); } # f+=tmp1 | |
| 158 else { &add($tmp1,$f); } # tmp1+=f; f becomes a in next round, which starts by copying tmp1 into it | |
| 159 &mov($tmp1,$a) if ($alt && $n==15); | |
| 160 } | |
| 161 | |
| 162 sub BODY_16_19 | |
| 163 { | |
| 164 local($n,$a,$b,$c,$d,$e,$f)=@_; | |
| 165 | |
| 166 &comment("16_19 $n"); | |
| 167 | |
| 168 if ($alt) { | |
| 169 &xor($c,$d); | |
| 170 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) | |
| 171 &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d | |
| 172 &xor($f,&swtmp(($n+8)%16)); | |
| 173 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) | |
| 174 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd | |
| 175 &rotl($f,1); # f=ROTATE(f,1) | |
| 176 &add($e,$tmp1); # e+=F_00_19(b,c,d) | |
| 177 &xor($c,$d); # restore $c | |
| 178 &mov($tmp1,$a); # b in next round | |
| 179 &rotr($b,$n==16?2:7); # b=ROTATE(b,30); after round 16 the count is 7 because the previous round left its a (this b) rotated left by 5 | |
| 180 &mov(&swtmp($n%16),$f); # xi=f | |
| 181 &rotl($a,5); # ROTATE(a,5), done in place | |
| 182 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e | |
| 183 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round | |
| 184 &add($f,$a); # f+=ROTATE(a,5) | |
| 185 } else { | |
| 186 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), built up below as ((c^d)&b)^d | |
| 187 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) | |
| 188 &xor($tmp1,$d); | |
| 189 &xor($f,&swtmp(($n+8)%16)); | |
| 190 &and($tmp1,$b); | |
| 191 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd | |
| 192 &rotl($f,1); # f=ROTATE(f,1) | |
| 193 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) | |
| 194 &add($e,$tmp1); # e+=F_00_19(b,c,d) | |
| 195 &mov($tmp1,$a); | |
| 196 &rotr($b,2); # b=ROTATE(b,30) | |
| 197 &mov(&swtmp($n%16),$f); # xi=f | |
| 198 &rotl($tmp1,5); # ROTATE(a,5), on the copy in tmp1 | |
| 199 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e | |
| 200 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round | |
| 201 &add($f,$tmp1); # f+=ROTATE(a,5) | |
| 202 } | |
| 203 } | |
| 204 | |
| 205 sub BODY_20_39 | |
| 206 { | |
| 207 local($n,$a,$b,$c,$d,$e,$f)=@_; | |
| 208 local $K=($n<40)?0x6ed9eba1:0xca62c1d6; | |
| 209 | |
| 210 &comment("20_39 $n"); | |
| 211 | |
| 212 if ($alt) { | |
| 213 &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c | |
| 214 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) | |
| 215 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) | |
| 216 &xor($f,&swtmp(($n+8)%16)); | |
| 217 &add($e,$tmp1); # e+=F_20_39(b,c,d) | |
| 218 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd | |
| 219 &rotl($f,1); # f=ROTATE(f,1) | |
| 220 &mov($tmp1,$a); # b in next round | |
| 221 &rotr($b,7); # b=ROTATE(b,30); count is 7 because the previous alt round left its a (this b) rotated left by 5 | |
| 222 &mov(&swtmp($n%16),$f) if($n<77);# xi=f, not stored for rounds 77..79 | |
| 223 &rotl($a,5); # ROTATE(a,5), done in place | |
| 224 &xor($b,$c) if($n==39);# warm up for BODY_40_59 | |
| 225 &and($tmp1,$b) if($n==39); | |
| 226 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY | |
| 227 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round | |
| 228 &add($f,$a); # f+=ROTATE(a,5) | |
| 229 &rotr($a,5) if ($n==79); | |
| 230 } else { | |
| 231 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d), built up below as b^c^d | |
| 232 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) | |
| 233 &xor($tmp1,$c); | |
| 234 &xor($f,&swtmp(($n+8)%16)); | |
| 235 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) | |
| 236 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd | |
| 237 &rotl($f,1); # f=ROTATE(f,1) | |
| 238 &add($e,$tmp1); # e+=F_20_39(b,c,d) | |
| 239 &rotr($b,2); # b=ROTATE(b,30) | |
| 240 &mov($tmp1,$a); | |
| 241 &rotl($tmp1,5); # ROTATE(a,5), on the copy in tmp1 | |
| 242 &mov(&swtmp($n%16),$f) if($n<77);# xi=f, not stored for rounds 77..79 | |
| 243 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY | |
| 244 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round | |
| 245 &add($f,$tmp1); # f+=ROTATE(a,5) | |
| 246 } | |
| 247 } | |
| 248 | |
| 249 sub BODY_40_59 | |
| 250 { | |
| 251 local($n,$a,$b,$c,$d,$e,$f)=@_; | |
| 252 | |
| 253 &comment("40_59 $n"); | |
| 254 | |
| 255 if ($alt) { | |
| 256 &add($e,$tmp1); # e+=b&(c^d), tmp1 was prepared by the previous round | |
| 257 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) | |
| 258 &mov($tmp1,$d); | |
| 259 &xor($f,&swtmp(($n+8)%16)); | |
| 260 &xor($c,$d); # restore $c | |
| 261 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd | |
| 262 &rotl($f,1); # f=ROTATE(f,1) | |
| 263 &and($tmp1,$c); | |
| 264 &rotr($b,7); # b=ROTATE(b,30); count is 7 because the previous alt round left its a (this b) rotated left by 5 | |
| 265 &add($e,$tmp1); # e+=c&d | |
| 266 &mov($tmp1,$a); # b in next round | |
| 267 &mov(&swtmp($n%16),$f); # xi=f | |
| 268 &rotl($a,5); # ROTATE(a,5), done in place | |
| 269 &xor($b,$c) if ($n<59); | |
| 270 &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) | |
| 271 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) | |
| 272 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round | |
| 273 &add($f,$a); # f+=ROTATE(a,5) | |
| 274 } else { | |
| 275 &mov($tmp1,$c); # tmp1 to hold b&(c^d), one of the two terms of F_40_59(b,c,d)=(c&d)+(b&(c^d)) | |
| 276 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) | |
| 277 &xor($tmp1,$d); | |
| 278 &xor($f,&swtmp(($n+8)%16)); | |
| 279 &and($tmp1,$b); | |
| 280 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd | |
| 281 &rotl($f,1); # f=ROTATE(f,1) | |
| 282 &add($tmp1,$e); # b&(c^d)+=e | |
| 283 &rotr($b,2); # b=ROTATE(b,30) | |
| 284 &mov($e,$a); # e becomes volatile | |
| 285 &rotl($e,5); # ROTATE(a,5) | |
| 286 &mov(&swtmp($n%16),$f); # xi=f | |
| 287 &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) | |
| 288 &mov($tmp1,$c); | |
| 289 &add($f,$e); # f+=ROTATE(a,5) | |
| 290 &and($tmp1,$d); | |
| 291 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round | |
| 292 &add($f,$tmp1); # f+=c&d | |
| 293 } | |
| 294 } | |
| 295 | |
| 296 &function_begin("sha1_block_data_order"); | |
| 297 if ($xmm) { | |
| 298 &static_label("ssse3_shortcut"); | |
| 299 &static_label("avx_shortcut") if ($ymm); | |
| 300 &static_label("K_XX_XX"); | |
| 301 | |
| 302 &call (&label("pic_point")); # make it PIC! | |
| 303 &set_label("pic_point"); | |
| 304 &blindpop($tmp1); | |
| 305 &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); | |
| 306 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); | |
| 307 | |
| 308 &mov ($A,&DWP(0,$T)); | |
| 309 &mov ($D,&DWP(4,$T)); | |
| 310 &test ($D,1<<9); # check SSSE3 bit, fall back to the x86 path if clear | |
| 311 &jz (&label("x86")); | |
| 312 &test ($A,1<<24); # check FXSR bit, fall back to the x86 path if clear | |
| 313 &jz (&label("x86")); | |
| 314 if ($ymm) { | |
| 315 &and ($D,1<<28); # mask AVX bit | |
| 316 &and ($A,1<<30); # mask "Intel CPU" bit | |
| 317 &or ($A,$D); | |
| 318 &cmp ($A,1<<28|1<<30); | |
| 319 &je (&label("avx_shortcut")); | |
| 320 } | |
| 321 &jmp (&label("ssse3_shortcut")); | |
| 322 &set_label("x86",16); | |
| 323 } | |
| 324 &mov($tmp1,&wparam(0)); # SHA_CTX *c | |
| 325 &mov($T,&wparam(1)); # const void *input | |
| 326 &mov($A,&wparam(2)); # size_t num, shifted left by 6 below, i.e. a count of 64-byte blocks | |
| 327 &stack_push(16+3); # allocate X[16]; NOTE(review): purpose of the 3 extra slots is not evident here | |
| 328 &shl($A,6); | |
| 329 &add($A,$T); | |
| 330 &mov(&wparam(2),$A); # pointer beyond the end of input, overwrites num | |
| 331 &mov($E,&DWP(16,$tmp1));# pre-load E | |
| 332 &jmp(&label("loop")); | |
| 333 | |
| 334 &set_label("loop",16); | |
| 335 | |
| 336 # copy input chunk to X, but reversing byte order! | |
| 337 for ($i=0; $i<16; $i+=4) | |
| 338 { | |
| 339 &mov($A,&DWP(4*($i+0),$T)); | |
| 340 &mov($B,&DWP(4*($i+1),$T)); | |
| 341 &mov($C,&DWP(4*($i+2),$T)); | |
| 342 &mov($D,&DWP(4*($i+3),$T)); | |
| 343 &bswap($A); | |
| 344 &bswap($B); | |
| 345 &bswap($C); | |
| 346 &bswap($D); | |
| 347 &mov(&swtmp($i+0),$A); | |
| 348 &mov(&swtmp($i+1),$B); | |
| 349 &mov(&swtmp($i+2),$C); | |
| 350 &mov(&swtmp($i+3),$D); | |
| 351 } | |
| 352 &mov(&wparam(1),$T); # save input pointer, redundant in 1st spin | |
| 353 | |
| 354 &mov($A,&DWP(0,$tmp1)); # load SHA_CTX | |
| 355 &mov($B,&DWP(4,$tmp1)); | |
| 356 &mov($C,&DWP(8,$tmp1)); | |
| 357 &mov($D,&DWP(12,$tmp1)); | |
| 358 # E is pre-loaded | |
| 359 | |
| 360 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } | |
| 361 for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } | |
| 362 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | |
| 363 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | |
| 364 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | |
| 365 | |
| 366 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check register rotation after 80 rounds | |
| 367 | |
| 368 &mov($tmp1,&wparam(0)); # re-load SHA_CTX* | |
| 369 &mov($D,&wparam(1)); # D is last "T" and is discarded | |
| 370 | |
| 371 &add($E,&DWP(0,$tmp1)); # E is last "A"... | |
| 372 &add($T,&DWP(4,$tmp1)); | |
| 373 &add($A,&DWP(8,$tmp1)); | |
| 374 &add($B,&DWP(12,$tmp1)); | |
| 375 &add($C,&DWP(16,$tmp1)); | |
| 376 | |
| 377 &mov(&DWP(0,$tmp1),$E); # update SHA_CTX | |
| 378 &add($D,64); # advance input pointer | |
| 379 &mov(&DWP(4,$tmp1),$T); | |
| 380 &cmp($D,&wparam(2)); # have we reached the end yet? | |
| 381 &mov(&DWP(8,$tmp1),$A); | |
| 382 &mov($E,$C); # C is last "E" which needs to be "pre-loaded" | |
| 383 &mov(&DWP(12,$tmp1),$B); | |
| 384 &mov($T,$D); # input pointer | |
| 385 &mov(&DWP(16,$tmp1),$C); | |
| 386 &jb(&label("loop")); | |
| 387 | |
| 388 &stack_pop(16+3); | |
| 389 &function_end("sha1_block_data_order"); | |
| 390 | |
| 391 if ($xmm) { | |
| 392 ###################################################################### | |
| 393 # The SSSE3 implementation. | |
| 394 # | |
| 395 # %xmm[0-7] are used as ring @X[] buffer containing quadruples of last | |
| 396 # 32 elements of the message schedule or Xupdate outputs. First 4 | |
| 397 # quadruples are simply byte-swapped input, next 4 are calculated | |
| 398 # according to method originally suggested by Dean Gaudet (modulo | |
| 399 # being implemented in SSSE3). Once 8 quadruples or 32 elements are | |
| 400 # collected, it switches to routine proposed by Max Locktyukhin. | |
| 401 # | |
| 402 # Calculations inevitably require temporary registers, and there are | |
| 403 # no %xmm registers left to spare. For this reason part of the ring | |
| 404 # buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring | |
| 405 # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - | |
| 406 # X[-5], and X[4] - X[-4]... | |
| 407 # | |
| 408 # Another notable optimization is aggressive stack frame compression | |
| 409 # aiming to minimize amount of 9-byte instructions... | |
| 410 # | |
| 411 # Yet another notable optimization is "jumping" $B variable. It means | |
| 412 # that there is no register permanently allocated for $B value. This | |
| 413 # allowed to eliminate one instruction from body_20_39... | |
| 414 # | |
| 415 my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded | |
| 416 my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 | |
| 417 my @V=($A,$B,$C,$D,$E); | |
| 418 my $j=0; # hash round | |
| 419 my @T=($T,$tmp1); | |
| 420 my $inp; | |
| 421 | |
| 422 my $_rol=sub { &rol(@_) }; | |
| 423 my $_ror=sub { &ror(@_) }; | |
| 424 | |
| 425 &function_begin("_sha1_block_data_order_ssse3"); | |
| 426 &call (&label("pic_point")); # make it PIC! | |
| 427 &set_label("pic_point"); | |
| 428 &blindpop($tmp1); | |
| 429 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); | |
| 430 &set_label("ssse3_shortcut"); | |
| 431 | |
| 432 &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 | |
| 433 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 | |
| 434 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 | |
| 435 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 | |
| 436 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask | |
| 437 | |
| 438 &mov ($E,&wparam(0)); # load argument block | |
| 439 &mov ($inp=@T[1],&wparam(1)); | |
| 440 &mov ($D,&wparam(2)); | |
| 441 &mov (@T[0],"esp"); | |
| 442 | |
| 443 # stack frame layout | |
| 444 # | |
| 445 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area | |
| 446 # X[4]+K X[5]+K X[6]+K X[7]+K | |
| 447 # X[8]+K X[9]+K X[10]+K X[11]+K | |
| 448 # X[12]+K X[13]+K X[14]+K X[15]+K | |
| 449 # | |
| 450 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area | |
| 451 # X[4] X[5] X[6] X[7] | |
| 452 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 | |
| 453 # | |
| 454 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants | |
| 455 # K_40_59 K_40_59 K_40_59 K_40_59 | |
| 456 # K_60_79 K_60_79 K_60_79 K_60_79 | |
| 457 # K_00_19 K_00_19 K_00_19 K_00_19 | |
| 458 # pbswap mask | |
| 459 # | |
| 460 # +192 ctx # argument block | |
| 461 # +196 inp | |
| 462 # +200 end | |
| 463 # +204 esp | |
| 464 &sub ("esp",208); | |
| 465 &and ("esp",-64); | |
| 466 | |
| 467 &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants | |
| 468 &movdqa (&QWP(112+16,"esp"),@X[5]); | |
| 469 &movdqa (&QWP(112+32,"esp"),@X[6]); | |
| 470 &shl ($D,6); # len*64 | |
| 471 &movdqa (&QWP(112+48,"esp"),@X[3]); | |
| 472 &add ($D,$inp); # end of input | |
| 473 &movdqa (&QWP(112+64,"esp"),@X[2]); | |
| 474 &add ($inp,64); | |
| 475 &mov (&DWP(192+0,"esp"),$E); # save argument block | |
| 476 &mov (&DWP(192+4,"esp"),$inp); | |
| 477 &mov (&DWP(192+8,"esp"),$D); | |
| 478 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp | |
| 479 | |
| 480 &mov ($A,&DWP(0,$E)); # load context | |
| 481 &mov ($B,&DWP(4,$E)); | |
| 482 &mov ($C,&DWP(8,$E)); | |
| 483 &mov ($D,&DWP(12,$E)); | |
| 484 &mov ($E,&DWP(16,$E)); | |
| 485 &mov (@T[0],$B); # magic seed | |
| 486 | |
| 487 &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] | |
| 488 &movdqu (@X[-3&7],&QWP(-48,$inp)); | |
| 489 &movdqu (@X[-2&7],&QWP(-32,$inp)); | |
| 490 &movdqu (@X[-1&7],&QWP(-16,$inp)); | |
| 491 &pshufb (@X[-4&7],@X[2]); # byte swap | |
| 492 &pshufb (@X[-3&7],@X[2]); | |
| 493 &pshufb (@X[-2&7],@X[2]); | |
| 494 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot | |
| 495 &pshufb (@X[-1&7],@X[2]); | |
| 496 &paddd (@X[-4&7],@X[3]); # add K_00_19 | |
| 497 &paddd (@X[-3&7],@X[3]); | |
| 498 &paddd (@X[-2&7],@X[3]); | |
| 499 &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU | |
| 500 &psubd (@X[-4&7],@X[3]); # restore X[] | |
| 501 &movdqa (&QWP(0+16,"esp"),@X[-3&7]); | |
| 502 &psubd (@X[-3&7],@X[3]); | |
| 503 &movdqa (&QWP(0+32,"esp"),@X[-2&7]); | |
| 504 &psubd (@X[-2&7],@X[3]); | |
| 505 &movdqa (@X[0],@X[-3&7]); | |
| 506 &jmp (&label("loop")); | |
| 507 | |
| 508 ###################################################################### | |
| 509 # SSE instruction sequence is first broken to groups of independent | |
| 510 # instructions, independent in respect to their inputs and shifter | |
| 511 # (not all architectures have more than one). Then IALU instructions | |
| 512 # are "knitted in" between the SSE groups. Distance is maintained for | |
| 513 # SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer | |
| 514 # [which allegedly also implements SSSE3]... | |
| 515 # | |
| 516 # Temporary registers usage. X[2] is volatile at the entry and at the | |
| 517 # end is restored from backtrace ring buffer. X[3] is expected to | |
| 518 # contain current K_XX_XX constant and is used to calculate X[-1]+K | |
| 519 # from previous round, it becomes volatile the moment the value is | |
| 520 # saved to stack for transfer to IALU. X[4] becomes volatile whenever | |
| 521 # X[-4] is accumulated and offloaded to backtrace ring buffer, at the | |
| 522 # end it is loaded with next K_XX_XX [which becomes X[3] in next | |
| 523 # round]... | |
| 524 # | |
| 525 sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 | |
| 526 { use integer; | |
| 527 my $body = shift; | |
| 528 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions when $body is body_00_19 (4 rounds of 10 strings) | |
| 529 my ($a,$b,$c,$d,$e); | |
| 530 | |
| 531 eval(shift(@insns)); | |
| 532 eval(shift(@insns)); | |
| 533 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" | |
| 534 &movdqa (@X[2],@X[-1&7]); | |
| 535 eval(shift(@insns)); | |
| 536 eval(shift(@insns)); | |
| 537 | |
| 538 &paddd (@X[3],@X[-1&7]); | |
| 539 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to b
acktrace buffer | |
| 540 eval(shift(@insns)); | |
| 541 eval(shift(@insns)); | |
| 542 &psrldq (@X[2],4); # "X[-3]", 3 dwords | |
| 543 eval(shift(@insns)); | |
| 544 eval(shift(@insns)); | |
| 545 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | |
| 546 eval(shift(@insns)); | |
| 547 eval(shift(@insns)); | |
| 548 | |
| 549 &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" | |
| 550 eval(shift(@insns)); | |
| 551 eval(shift(@insns)); | |
| 552 eval(shift(@insns)); | |
| 553 eval(shift(@insns)); | |
| 554 | |
| 555 &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" | |
| 556 eval(shift(@insns)); | |
| 557 eval(shift(@insns)); | |
| 558 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to
IALU | |
| 559 eval(shift(@insns)); | |
| 560 eval(shift(@insns)); | |
| 561 | |
| 562 &movdqa (@X[4],@X[0]); | |
| 563 &movdqa (@X[2],@X[0]); | |
| 564 eval(shift(@insns)); | |
| 565 eval(shift(@insns)); | |
| 566 eval(shift(@insns)); | |
| 567 eval(shift(@insns)); | |
| 568 | |
| 569 &pslldq (@X[4],12); # "X[0]"<<96, extract one dword | |
| 570 &paddd (@X[0],@X[0]); | |
| 571 eval(shift(@insns)); | |
| 572 eval(shift(@insns)); | |
| 573 eval(shift(@insns)); | |
| 574 eval(shift(@insns)); | |
| 575 | |
| 576 &psrld (@X[2],31); | |
| 577 eval(shift(@insns)); | |
| 578 eval(shift(@insns)); | |
| 579 &movdqa (@X[3],@X[4]); | |
| 580 eval(shift(@insns)); | |
| 581 eval(shift(@insns)); | |
| 582 | |
| 583 &psrld (@X[4],30); | |
| 584 &por (@X[0],@X[2]); # "X[0]"<<<=1, combining the paddd doubling with the psrld-by-31 carry bits | |
| 585 eval(shift(@insns)); | |
| 586 eval(shift(@insns)); | |
| 587 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);
# restore X[] from backtrace buffer | |
| 588 eval(shift(@insns)); | |
| 589 eval(shift(@insns)); | |
| 590 | |
| 591 &pslld (@X[3],2); | |
| 592 &pxor (@X[0],@X[4]); | |
| 593 eval(shift(@insns)); | |
| 594 eval(shift(@insns)); | |
| 595 &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_X
X | |
| 596 eval(shift(@insns)); | |
| 597 eval(shift(@insns)); | |
| 598 | |
| 599 &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 | |
| 600 &movdqa (@X[1],@X[-2&7]) if ($Xi<7); | |
| 601 eval(shift(@insns)); | |
| 602 eval(shift(@insns)); | |
| 603 | |
| 604 foreach (@insns) { eval; } # remaining instructions [if any] | |
| 605 | |
| 606 $Xi++; push(@X,shift(@X)); # advance the round counter and "rotate" X[] | |
| 607 } | |
| 608 | |
| 609 sub Xupdate_ssse3_32_79() | |
| 610 { use integer; | |
| 611 my $body = shift; | |
| 612 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions, i.e. 4 rounds of 8 to 12 strings depending on $body | |
| 613 my ($a,$b,$c,$d,$e); | |
| 614 | |
| 615 &movdqa (@X[2],@X[-1&7]) if ($Xi==8); | |
| 616 eval(shift(@insns)); # body_20_39 | |
| 617 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | |
| 618 &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" | |
| 619 eval(shift(@insns)); | |
| 620 eval(shift(@insns)); | |
| 621 eval(shift(@insns)); # rol | |
| 622 | |
| 623 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | |
| 624 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X
[] to backtrace buffer | |
| 625 eval(shift(@insns)); | |
| 626 eval(shift(@insns)); | |
| 627 if ($Xi%5) { | |
| 628 &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... | |
| 629 } else { # ... or load next one | |
| 630 &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); | |
| 631 } | |
| 632 &paddd (@X[3],@X[-1&7]); | |
| 633 eval(shift(@insns)); # ror | |
| 634 eval(shift(@insns)); | |
| 635 | |
| 636 &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" | |
| 637 eval(shift(@insns)); # body_20_39 | |
| 638 eval(shift(@insns)); | |
| 639 eval(shift(@insns)); | |
| 640 eval(shift(@insns)); # rol | |
| 641 | |
| 642 &movdqa (@X[2],@X[0]); | |
| 643 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to
IALU | |
| 644 eval(shift(@insns)); | |
| 645 eval(shift(@insns)); | |
| 646 eval(shift(@insns)); # ror | |
| 647 eval(shift(@insns)); | |
| 648 | |
| 649 &pslld (@X[0],2); | |
| 650 eval(shift(@insns)); # body_20_39 | |
| 651 eval(shift(@insns)); | |
| 652 &psrld (@X[2],30); | |
| 653 eval(shift(@insns)); | |
| 654 eval(shift(@insns)); # rol | |
| 655 eval(shift(@insns)); | |
| 656 eval(shift(@insns)); | |
| 657 eval(shift(@insns)); # ror | |
| 658 eval(shift(@insns)); | |
| 659 | |
| 660 &por (@X[0],@X[2]); # "X[0]"<<<=2, combining the pslld-by-2 and psrld-by-30 halves | |
| 661 eval(shift(@insns)); # body_20_39 | |
| 662 eval(shift(@insns)); | |
| 663 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);
# restore X[] from backtrace buffer | |
| 664 eval(shift(@insns)); | |
| 665 eval(shift(@insns)); # rol | |
| 666 eval(shift(@insns)); | |
| 667 eval(shift(@insns)); | |
| 668 eval(shift(@insns)); # ror | |
| 669 &movdqa (@X[3],@X[0]) if ($Xi<19); | |
| 670 eval(shift(@insns)); | |
| 671 | |
| 672 foreach (@insns) { eval; } # remaining instructions | |
| 673 | |
| 674 $Xi++; push(@X,shift(@X)); # advance the round counter and "rotate" X[] | |
| 675 } | |
| 676 | |
| 677 sub Xuplast_ssse3_80() | |
| 678 { use integer; | |
| 679 my $body = shift; | |
| 680 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions when $body is body_20_39 (4 rounds of 8 strings) | |
| 681 my ($a,$b,$c,$d,$e); | |
| 682 | |
| 683 eval(shift(@insns)); | |
| 684 &paddd (@X[3],@X[-1&7]); | |
| 685 eval(shift(@insns)); | |
| 686 eval(shift(@insns)); | |
| 687 eval(shift(@insns)); | |
| 688 eval(shift(@insns)); | |
| 689 | |
| 690 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IAL
U | |
| 691 | |
| 692 foreach (@insns) { eval; } # remaining instructions | |
| 693 | |
| 694 &mov ($inp=@T[1],&DWP(192+4,"esp")); | |
| 695 &cmp ($inp,&DWP(192+8,"esp")); | |
| 696 &je (&label("done")); | |
| 697 | |
| 698 &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 | |
| 699 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask | |
| 700 &movdqu (@X[-4&7],&QWP(0,$inp)); # load input, 64 bytes of the next block | |
| 701 &movdqu (@X[-3&7],&QWP(16,$inp)); | |
| 702 &movdqu (@X[-2&7],&QWP(32,$inp)); | |
| 703 &movdqu (@X[-1&7],&QWP(48,$inp)); | |
| 704 &add ($inp,64); | |
| 705 &pshufb (@X[-4&7],@X[2]); # byte swap | |
| 706 &mov (&DWP(192+4,"esp"),$inp); | |
| 707 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot for K_00_19 | |
| 708 | |
| 709 $Xi=0; | |
| 710 } | |
| 711 | |
| 712 sub Xloop_ssse3() | |
| 713 { use integer; | |
| 714 my $body = shift; | |
| 715 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions when $body is body_20_39 (4 rounds of 8 strings) | |
| 716 my ($a,$b,$c,$d,$e); | |
| 717 | |
| 718 eval(shift(@insns)); | |
| 719 eval(shift(@insns)); | |
| 720 &pshufb (@X[($Xi-3)&7],@X[2]); | |
| 721 eval(shift(@insns)); | |
| 722 eval(shift(@insns)); | |
| 723 &paddd (@X[($Xi-4)&7],@X[3]); | |
| 724 eval(shift(@insns)); | |
| 725 eval(shift(@insns)); | |
| 726 eval(shift(@insns)); | |
| 727 eval(shift(@insns)); | |
| 728 &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU; the psubd below takes K back out of the register | |
| 729 eval(shift(@insns)); | |
| 730 eval(shift(@insns)); | |
| 731 &psubd (@X[($Xi-4)&7],@X[3]); | |
| 732 | |
| 733 foreach (@insns) { eval; } | |
| 734 $Xi++; | |
| 735 } | |
| 736 | |
| 737 sub Xtail_ssse3() | |
| 738 { use integer; | |
| 739 my $body = shift; | |
| 740 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions when $body is body_20_39; all are emitted back to back with no SIMD code in between | |
| 741 my ($a,$b,$c,$d,$e); | |
| 742 | |
| 743 foreach (@insns) { eval; } | |
| 744 } | |
| 745 | |
| 746 sub body_00_19 () { | |
| 747 ( | |
| 748 '($a,$b,$c,$d,$e)=@V;'. | |
| 749 '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer | |
| 750 '&xor ($c,$d);', | |
| 751 '&mov (@T[1],$a);', # $b in next round | |
| 752 '&$_rol ($a,5);', | |
| 753 '&and (@T[0],$c);', # ($b&($c^$d)) | |
| 754 '&xor ($c,$d);', # restore $c | |
| 755 '&xor (@T[0],$d);', | |
| 756 '&add ($e,$a);', | |
| 757 '&$_ror ($b,$j?7:2);', # $b>>>2; count is 7 after the first round because the previous round rotated its $a (this $b) left by 5 in place | |
| 758 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T))
;' | |
| 759 ); | |
| 760 } | |
| 761 | |
| 762 sub body_20_39 () { | |
| 763 ( | |
| 764 '($a,$b,$c,$d,$e)=@V;'. | |
| 765 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer | |
| 766 '&xor (@T[0],$d);', # ($b^$d) | |
| 767 '&mov (@T[1],$a);', # $b in next round | |
| 768 '&$_rol ($a,5);', | |
| 769 '&xor (@T[0],$c);', # ($b^$d^$c) | |
| 770 '&add ($e,$a);', | |
| 771 '&$_ror ($b,7);', # $b>>>2; count is 7 because the previous round rotated its $a (this $b) left by 5 in place | |
| 772 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | |
| 773 ); | |
| 774 } | |
| 775 | |
| 776 sub body_40_59 () { | |
| 777 ( | |
| 778 '($a,$b,$c,$d,$e)=@V;'. | |
| 779 '&mov (@T[1],$c);', | |
| 780 '&xor ($c,$d);', | |
| 781 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer | |
| 782 '&and (@T[1],$d);', | |
| 783 '&and (@T[0],$c);', # ($b&($c^$d)) | |
| 784 '&$_ror ($b,7);', # $b>>>2; count is 7 because the previous round rotated its $a (this $b) left by 5 in place | |
| 785 '&add ($e,@T[1]);', | |
| 786 '&mov (@T[1],$a);', # $b in next round | |
| 787 '&$_rol ($a,5);', | |
| 788 '&add ($e,@T[0]);', | |
| 789 '&xor ($c,$d);', # restore $c | |
| 790 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' | |
| 791 ); | |
| 792 } | |
| 793 | |
# SSSE3 main loop. Each scheduler call below is handed the round body to
# use: five calls with body_00_19, five with body_20_39, five with
# body_40_59 and five more with body_20_39 (the last four of those being
# Xuplast_ssse3_80 plus three Xloop_ssse3, or three Xtail_ssse3 on the "done"
# path). The Xupdate_ssse3_*/Xuplast/Xloop/Xtail subs are defined outside
# this view; presumably each consumes four rounds like the AVX versions.
| 794 &set_label("loop",16); | |
| 795 &Xupdate_ssse3_16_31(\&body_00_19); | |
| 796 &Xupdate_ssse3_16_31(\&body_00_19); | |
| 797 &Xupdate_ssse3_16_31(\&body_00_19); | |
| 798 &Xupdate_ssse3_16_31(\&body_00_19); | |
| 799 &Xupdate_ssse3_32_79(\&body_00_19); | |
| 800 &Xupdate_ssse3_32_79(\&body_20_39); | |
| 801 &Xupdate_ssse3_32_79(\&body_20_39); | |
| 802 &Xupdate_ssse3_32_79(\&body_20_39); | |
| 803 &Xupdate_ssse3_32_79(\&body_20_39); | |
| 804 &Xupdate_ssse3_32_79(\&body_20_39); | |
| 805 &Xupdate_ssse3_32_79(\&body_40_59); | |
| 806 &Xupdate_ssse3_32_79(\&body_40_59); | |
| 807 &Xupdate_ssse3_32_79(\&body_40_59); | |
| 808 &Xupdate_ssse3_32_79(\&body_40_59); | |
| 809 &Xupdate_ssse3_32_79(\&body_40_59); | |
| 810 &Xupdate_ssse3_32_79(\&body_20_39); | |
| 811 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" | |
| 812 | |
# Snapshot the generator state (round counter and register assignment) so
# that the "done" path below is generated from the same state as the code
# that follows here.
| 813 $saved_j=$j; @saved_V=@V; | |
| 814 | |
| 815 &Xloop_ssse3(\&body_20_39); | |
| 816 &Xloop_ssse3(\&body_20_39); | |
| 817 &Xloop_ssse3(\&body_20_39); | |
| 818 | |
| 819 &mov (@T[1],&DWP(192,"esp")); # update context: @T[1] = ctx pointer saved at +192 | |
| 820 &add ($A,&DWP(0,@T[1])); | |
| 821 &add (@T[0],&DWP(4,@T[1])); # $b: @T[0] holds the copy of $b that was not rotated | |
| 822 &add ($C,&DWP(8,@T[1])); | |
| 823 &mov (&DWP(0,@T[1]),$A); | |
| 824 &add ($D,&DWP(12,@T[1])); | |
| 825 &mov (&DWP(4,@T[1]),@T[0]); | |
| 826 &add ($E,&DWP(16,@T[1])); | |
| 827 &mov (&DWP(8,@T[1]),$C); | |
| 828 &mov ($B,@T[0]); | |
| 829 &mov (&DWP(12,@T[1]),$D); | |
| 830 &mov (&DWP(16,@T[1]),$E); | |
| 831 &movdqa (@X[0],@X[-3&7]); | |
| 832 | |
| 833 &jmp (&label("loop")); | |
| 834 | |
| 835 &set_label("done",16); $j=$saved_j; @V=@saved_V; | |
| 836 | |
| 837 &Xtail_ssse3(\&body_20_39); | |
| 838 &Xtail_ssse3(\&body_20_39); | |
| 839 &Xtail_ssse3(\&body_20_39); | |
| 840 | |
| 841 &mov (@T[1],&DWP(192,"esp")); # update context: @T[1] = ctx pointer saved at +192 | |
| 842 &add ($A,&DWP(0,@T[1])); | |
| 843 &mov ("esp",&DWP(192+12,"esp")); # restore %esp (the frame is not addressed after this point) | |
| 844 &add (@T[0],&DWP(4,@T[1])); # $b: @T[0] holds the copy of $b that was not rotated | |
| 845 &add ($C,&DWP(8,@T[1])); | |
| 846 &mov (&DWP(0,@T[1]),$A); | |
| 847 &add ($D,&DWP(12,@T[1])); | |
| 848 &mov (&DWP(4,@T[1]),@T[0]); | |
| 849 &add ($E,&DWP(16,@T[1])); | |
| 850 &mov (&DWP(8,@T[1]),$C); | |
| 851 &mov (&DWP(12,@T[1]),$D); | |
| 852 &mov (&DWP(16,@T[1]),$E); | |
| 853 | |
| 854 &function_end("_sha1_block_data_order_ssse3"); | |
| 855 | |
# AVX code path, generated only when $ymm is set. This part emits the
# prologue of _sha1_block_data_order_avx: it computes the address of K_XX_XX
# relative to a call/pop "pic_point", builds a 64-byte aligned stack frame,
# copies the constants and the argument block into it, loads the five context
# words, and pre-processes the first 64 input bytes before jumping to "loop".
| 856 if ($ymm) { | |
| 857 my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded | |
| 858 my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 | |
| 859 my @V=($A,$B,$C,$D,$E); | |
| 860 my $j=0; # hash round | |
| 861 my @T=($T,$tmp1); | |
| 862 my $inp; | |
| 863 | |
# In this code path rotates are emitted as shld/shrd with the same register
# passed as both register operands.
| 864 my $_rol=sub { &shld(@_[0],@_) }; | |
| 865 my $_ror=sub { &shrd(@_[0],@_) }; | |
| 866 | |
| 867 &function_begin("_sha1_block_data_order_avx"); | |
| 868 &call (&label("pic_point")); # make it PIC! | |
| 869 &set_label("pic_point"); | |
| 870 &blindpop($tmp1); | |
| 871 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); | |
# NOTE(review): "avx_shortcut" is not referenced in this part of the file;
# presumably it is an entry point for a caller that already has the address
# of K_XX_XX in $tmp1 -- TODO confirm.
| 872 &set_label("avx_shortcut"); | |
| 873 &vzeroall(); | |
| 874 | |
| 875 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 | |
| 876 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 | |
| 877 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 | |
| 878 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 | |
| 879 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask | |
| 880 | |
| 881 &mov ($E,&wparam(0)); # load argument block | |
| 882 &mov ($inp=@T[1],&wparam(1)); | |
| 883 &mov ($D,&wparam(2)); | |
| 884 &mov (@T[0],"esp"); | |
| 885 | |
| 886 # stack frame layout (offsets are relative to the aligned %esp) | |
| 887 # | |
| 888 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area | |
| 889 # X[4]+K X[5]+K X[6]+K X[7]+K | |
| 890 # X[8]+K X[9]+K X[10]+K X[11]+K | |
| 891 # X[12]+K X[13]+K X[14]+K X[15]+K | |
| 892 # | |
| 893 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area | |
| 894 # X[4] X[5] X[6] X[7] | |
| 895 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 | |
| 896 # | |
| 897 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants | |
| 898 # K_40_59 K_40_59 K_40_59 K_40_59 | |
| 899 # K_60_79 K_60_79 K_60_79 K_60_79 | |
| 900 # K_00_19 K_00_19 K_00_19 K_00_19 | |
| 901 # pbswap mask | |
| 902 # | |
| 903 # +192 ctx # argument block | |
| 904 # +196 inp | |
| 905 # +200 end | |
| 906 # +204 esp | |
| 907 &sub ("esp",208); | |
| 908 &and ("esp",-64); | |
| 909 | |
| 910 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants | |
| 911 &vmovdqa(&QWP(112+16,"esp"),@X[5]); | |
| 912 &vmovdqa(&QWP(112+32,"esp"),@X[6]); | |
| 913 &shl ($D,6); # len*64 | |
| 914 &vmovdqa(&QWP(112+48,"esp"),@X[3]); | |
| 915 &add ($D,$inp); # end of input | |
| 916 &vmovdqa(&QWP(112+64,"esp"),@X[2]); | |
| 917 &add ($inp,64); | |
| 918 &mov (&DWP(192+0,"esp"),$E); # save argument block | |
| 919 &mov (&DWP(192+4,"esp"),$inp); | |
| 920 &mov (&DWP(192+8,"esp"),$D); | |
| 921 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp | |
| 922 | |
| 923 &mov ($A,&DWP(0,$E)); # load context | |
| 924 &mov ($B,&DWP(4,$E)); | |
| 925 &mov ($C,&DWP(8,$E)); | |
| 926 &mov ($D,&DWP(12,$E)); | |
| 927 &mov ($E,&DWP(16,$E)); | |
| 928 &mov (@T[0],$B); # magic seed: the round bodies expect a copy of $b in @T[0] | |
| 929 | |
| 930 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] ($inp was already advanced by 64) | |
| 931 &vmovdqu(@X[-3&7],&QWP(-48,$inp)); | |
| 932 &vmovdqu(@X[-2&7],&QWP(-32,$inp)); | |
| 933 &vmovdqu(@X[-1&7],&QWP(-16,$inp)); | |
| 934 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | |
| 935 &vpshufb(@X[-3&7],@X[-3&7],@X[2]); | |
| 936 &vpshufb(@X[-2&7],@X[-2&7],@X[2]); | |
| 937 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot (+96) for K_00_19 | |
| 938 &vpshufb(@X[-1&7],@X[-1&7],@X[2]); | |
# Only the first three input vectors get K added and are stored here; the
# fourth is added and stored by the first Xupdate_avx_16_31 call.
| 939 &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 | |
| 940 &vpaddd (@X[1],@X[-3&7],@X[3]); | |
| 941 &vpaddd (@X[2],@X[-2&7],@X[3]); | |
| 942 &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU | |
| 943 &vmovdqa(&QWP(0+16,"esp"),@X[1]); | |
| 944 &vmovdqa(&QWP(0+32,"esp"),@X[2]); | |
| 945 &jmp (&label("loop")); | |
| 946 | |
# Xupdate_avx_16_31($body): emits one AVX X[] update step (four dwords, the
# "<<<1" form built from X[-3], X[-8], X[-14] and X[-16]) interleaved with
# four scalar rounds. $body is a code reference returning a list of
# instruction strings; it is called four times and the strings are eval-ed
# between the vector instructions, assigning the lexicals $a..$e declared
# here. The step also adds K to the previous vector and stores it to the
# xfer area, saves a vector to the backtrace buffer, and preloads the K
# vector for a later step. It finishes by incrementing $Xi and rotating @X.
# "use integer" makes the division in the K_XX_XX offset an integer one.
| 947 sub Xupdate_avx_16_31() # recall that $Xi starts with 4 | |
| 948 { use integer; | |
| 949 my $body = shift; | |
| 950 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions | |
| 951 my ($a,$b,$c,$d,$e); | |
| 952 | |
| 953 eval(shift(@insns)); | |
| 954 eval(shift(@insns)); | |
| 955 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" | |
| 956 eval(shift(@insns)); | |
| 957 eval(shift(@insns)); | |
| 958 | |
| 959 &vpaddd (@X[3],@X[3],@X[-1&7]); | |
| 960 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to b
acktrace buffer | |
| 961 eval(shift(@insns)); | |
| 962 eval(shift(@insns)); | |
| 963 &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords | |
| 964 eval(shift(@insns)); | |
| 965 eval(shift(@insns)); | |
| 966 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" | |
| 967 eval(shift(@insns)); | |
| 968 eval(shift(@insns)); | |
| 969 | |
| 970 &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" | |
| 971 eval(shift(@insns)); | |
| 972 eval(shift(@insns)); | |
| 973 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to
IALU | |
| 974 eval(shift(@insns)); | |
| 975 eval(shift(@insns)); | |
| 976 | |
| 977 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" | |
| 978 eval(shift(@insns)); | |
| 979 eval(shift(@insns)); | |
| 980 eval(shift(@insns)); | |
| 981 eval(shift(@insns)); | |
| 982 | |
| 983 &vpsrld (@X[2],@X[0],31); | |
| 984 eval(shift(@insns)); | |
| 985 eval(shift(@insns)); | |
| 986 eval(shift(@insns)); | |
| 987 eval(shift(@insns)); | |
| 988 | |
| 989 &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword | |
| 990 &vpaddd (@X[0],@X[0],@X[0]); | |
| 991 eval(shift(@insns)); | |
| 992 eval(shift(@insns)); | |
| 993 eval(shift(@insns)); | |
| 994 eval(shift(@insns)); | |
| 995 | |
| 996 &vpsrld (@X[3],@X[4],30); | |
| 997 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 (the vpaddd doubling above supplies the left shift) | |
| 998 eval(shift(@insns)); | |
| 999 eval(shift(@insns)); | |
| 1000 eval(shift(@insns)); | |
| 1001 eval(shift(@insns)); | |
| 1002 | |
| 1003 &vpslld (@X[4],@X[4],2); | |
| 1004 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);
# restore X[] from backtrace buffer | |
| 1005 eval(shift(@insns)); | |
| 1006 eval(shift(@insns)); | |
| 1007 &vpxor (@X[0],@X[0],@X[3]); | |
| 1008 eval(shift(@insns)); | |
| 1009 eval(shift(@insns)); | |
| 1010 eval(shift(@insns)); | |
| 1011 eval(shift(@insns)); | |
| 1012 | |
| 1013 &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 | |
| 1014 eval(shift(@insns)); | |
| 1015 eval(shift(@insns)); | |
| 1016 &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_X
X | |
| 1017 eval(shift(@insns)); | |
| 1018 eval(shift(@insns)); | |
| 1019 | |
| 1020 foreach (@insns) { eval; } # remaining instructions [if any] | |
| 1021 | |
| 1022 $Xi++; push(@X,shift(@X)); # "rotate" X[]: the vector just computed becomes X[-1] | |
| 1023 } | |
| 1024 | |
# Xupdate_avx_32_79($body): emits one AVX X[] update step in the "<<<2" form
# (X[0] = (X[-32]^X[-16]^X[-28]^X[-6]) <<< 2) interleaved with four scalar
# rounds. $body is a code reference returning a list of instruction strings;
# it is called four times and the strings are eval-ed between the vector
# instructions, assigning the lexicals $a..$e declared here. The step also
# adds K to the previous vector and stores it to the xfer area, and either
# keeps the current K vector or, when $Xi is a multiple of 5, loads the next
# one from the stack. It finishes by incrementing $Xi and rotating @X.
# "use integer" makes $Xi/5 an integer division.
| 1025 sub Xupdate_avx_32_79() | |
| 1026 { use integer; | |
| 1027 my $body = shift; | |
| 1028 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions, depending on $body | |
| 1029 my ($a,$b,$c,$d,$e); | |
| 1030 | |
| 1031 &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" | |
| 1032 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" | |
| 1033 eval(shift(@insns)); # body_20_39 | |
| 1034 eval(shift(@insns)); | |
| 1035 eval(shift(@insns)); | |
| 1036 eval(shift(@insns)); # rol | |
| 1037 | |
| 1038 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" | |
| 1039 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X
[] to backtrace buffer | |
| 1040 eval(shift(@insns)); | |
| 1041 eval(shift(@insns)); | |
| 1042 if ($Xi%5) { | |
| 1043 &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... | |
| 1044 } else { # ... or load next one | |
| 1045 &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); | |
| 1046 } | |
| 1047 &vpaddd (@X[3],@X[3],@X[-1&7]); | |
| 1048 eval(shift(@insns)); # ror | |
| 1049 eval(shift(@insns)); | |
| 1050 | |
| 1051 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" | |
| 1052 eval(shift(@insns)); # body_20_39 | |
| 1053 eval(shift(@insns)); | |
| 1054 eval(shift(@insns)); | |
| 1055 eval(shift(@insns)); # rol | |
| 1056 | |
| 1057 &vpsrld (@X[2],@X[0],30); | |
| 1058 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to
IALU | |
| 1059 eval(shift(@insns)); | |
| 1060 eval(shift(@insns)); | |
| 1061 eval(shift(@insns)); # ror | |
| 1062 eval(shift(@insns)); | |
| 1063 | |
| 1064 &vpslld (@X[0],@X[0],2); | |
| 1065 eval(shift(@insns)); # body_20_39 | |
| 1066 eval(shift(@insns)); | |
| 1067 eval(shift(@insns)); | |
| 1068 eval(shift(@insns)); # rol | |
| 1069 eval(shift(@insns)); | |
| 1070 eval(shift(@insns)); | |
| 1071 eval(shift(@insns)); # ror | |
| 1072 eval(shift(@insns)); | |
| 1073 | |
| 1074 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 | |
| 1075 eval(shift(@insns)); # body_20_39 | |
| 1076 eval(shift(@insns)); | |
| 1077 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);
# restore X[] from backtrace buffer | |
| 1078 eval(shift(@insns)); | |
| 1079 eval(shift(@insns)); # rol | |
| 1080 eval(shift(@insns)); | |
| 1081 eval(shift(@insns)); | |
| 1082 eval(shift(@insns)); # ror | |
| 1083 eval(shift(@insns)); | |
| 1084 | |
| 1085 foreach (@insns) { eval; } # remaining instructions | |
| 1086 | |
| 1087 $Xi++; push(@X,shift(@X)); # "rotate" X[]: the vector just computed becomes X[-1] | |
| 1088 } | |
| 1089 | |
# Xuplast_avx_80($body): emits the final K addition and xfer store of a
# block together with four scalar rounds, then compares the saved input
# pointer (+196) with the end pointer (+200) and jumps to "done" when they
# are equal. Otherwise it reloads K_00_19 and the byte-swap mask from the
# stack, loads the next 64 input bytes, byte-swaps the first vector only
# (the remaining three are swapped by Xloop_avx), stores the advanced input
# pointer and resets $Xi to 0 at generation time.
| 1090 sub Xuplast_avx_80() | |
| 1091 { use integer; | |
| 1092 my $body = shift; | |
| 1093 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
| 1094 my ($a,$b,$c,$d,$e); | |
| 1095 | |
| 1096 eval(shift(@insns)); | |
| 1097 &vpaddd (@X[3],@X[3],@X[-1&7]); | |
| 1098 eval(shift(@insns)); | |
| 1099 eval(shift(@insns)); | |
| 1100 eval(shift(@insns)); | |
| 1101 eval(shift(@insns)); | |
| 1102 | |
| 1103 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IAL
U | |
| 1104 | |
| 1105 foreach (@insns) { eval; } # remaining instructions | |
| 1106 | |
| 1107 &mov ($inp=@T[1],&DWP(192+4,"esp")); | |
| 1108 &cmp ($inp,&DWP(192+8,"esp")); | |
| 1109 &je (&label("done")); | |
| 1110 | |
| 1111 &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 | |
| 1112 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask | |
| 1113 &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input | |
| 1114 &vmovdqu(@X[-3&7],&QWP(16,$inp)); | |
| 1115 &vmovdqu(@X[-2&7],&QWP(32,$inp)); | |
| 1116 &vmovdqu(@X[-1&7],&QWP(48,$inp)); | |
| 1117 &add ($inp,64); | |
| 1118 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap | |
| 1119 &mov (&DWP(192+4,"esp"),$inp); | |
| 1120 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot (+96) for K_00_19 | |
| 1121 | |
| 1122 $Xi=0; | |
| 1123 } | |
| 1124 | |
# Xloop_avx($body): emits four scalar rounds of the current block interleaved
# with pre-processing of the next block. It byte-swaps input vector
# X[($Xi-3)&7], adds the K vector in X[3] to input vector X[($Xi-4)&7], and
# stores the sum to xfer slot 16*$Xi. It is called with $Xi = 0, 1 and 2
# (Xuplast_avx_80 resets $Xi to 0) and increments $Xi; @X is not rotated.
| 1125 sub Xloop_avx() | |
| 1126 { use integer; | |
| 1127 my $body = shift; | |
| 1128 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
| 1129 my ($a,$b,$c,$d,$e); | |
| 1130 | |
| 1131 eval(shift(@insns)); | |
| 1132 eval(shift(@insns)); | |
| 1133 &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); | |
| 1134 eval(shift(@insns)); | |
| 1135 eval(shift(@insns)); | |
| 1136 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); | |
| 1137 eval(shift(@insns)); | |
| 1138 eval(shift(@insns)); | |
| 1139 eval(shift(@insns)); | |
| 1140 eval(shift(@insns)); | |
| 1141 &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to
IALU | |
| 1142 eval(shift(@insns)); | |
| 1143 eval(shift(@insns)); | |
| 1144 | |
| 1145 foreach (@insns) { eval; } | |
| 1146 $Xi++; | |
| 1147 } | |
| 1148 | |
# Xtail_avx($body): emits four scalar rounds with no vector instructions in
# between. $body is called four times and every returned instruction string
# is eval-ed in order, assigning the lexicals $a..$e declared here. It is
# used after the "done" label, where no further input is pre-processed.
| 1149 sub Xtail_avx() | |
| 1150 { use integer; | |
| 1151 my $body = shift; | |
| 1152 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions | |
| 1153 my ($a,$b,$c,$d,$e); | |
| 1154 | |
| 1155 foreach (@insns) { eval; } | |
| 1156 } | |
| 1157 | |
# AVX main loop. Every scheduler call consumes four rounds of the body it is
# given, so the twenty calls on each path cover 80 rounds: five calls with
# body_00_19, five with body_20_39, five with body_40_59 and five more with
# body_20_39. The last three calls are Xloop_avx when more input follows, or
# Xtail_avx after the "done" label.
| 1158 &set_label("loop",16); | |
| 1159 &Xupdate_avx_16_31(\&body_00_19); | |
| 1160 &Xupdate_avx_16_31(\&body_00_19); | |
| 1161 &Xupdate_avx_16_31(\&body_00_19); | |
| 1162 &Xupdate_avx_16_31(\&body_00_19); | |
| 1163 &Xupdate_avx_32_79(\&body_00_19); | |
| 1164 &Xupdate_avx_32_79(\&body_20_39); | |
| 1165 &Xupdate_avx_32_79(\&body_20_39); | |
| 1166 &Xupdate_avx_32_79(\&body_20_39); | |
| 1167 &Xupdate_avx_32_79(\&body_20_39); | |
| 1168 &Xupdate_avx_32_79(\&body_20_39); | |
| 1169 &Xupdate_avx_32_79(\&body_40_59); | |
| 1170 &Xupdate_avx_32_79(\&body_40_59); | |
| 1171 &Xupdate_avx_32_79(\&body_40_59); | |
| 1172 &Xupdate_avx_32_79(\&body_40_59); | |
| 1173 &Xupdate_avx_32_79(\&body_40_59); | |
| 1174 &Xupdate_avx_32_79(\&body_20_39); | |
| 1175 &Xuplast_avx_80(\&body_20_39); # can jump to "done" | |
| 1176 | |
# Snapshot the generator state (round counter and register assignment) so
# that the "done" path below is generated from the same state as the
# Xloop_avx calls that follow here.
| 1177 $saved_j=$j; @saved_V=@V; | |
| 1178 | |
| 1179 &Xloop_avx(\&body_20_39); | |
| 1180 &Xloop_avx(\&body_20_39); | |
| 1181 &Xloop_avx(\&body_20_39); | |
| 1182 | |
| 1183 &mov (@T[1],&DWP(192,"esp")); # update context: @T[1] = ctx pointer saved at +192 | |
| 1184 &add ($A,&DWP(0,@T[1])); | |
| 1185 &add (@T[0],&DWP(4,@T[1])); # $b: @T[0] holds the copy of $b that was not rotated | |
| 1186 &add ($C,&DWP(8,@T[1])); | |
| 1187 &mov (&DWP(0,@T[1]),$A); | |
| 1188 &add ($D,&DWP(12,@T[1])); | |
| 1189 &mov (&DWP(4,@T[1]),@T[0]); | |
| 1190 &add ($E,&DWP(16,@T[1])); | |
| 1191 &mov (&DWP(8,@T[1]),$C); | |
| 1192 &mov ($B,@T[0]); | |
| 1193 &mov (&DWP(12,@T[1]),$D); | |
| 1194 &mov (&DWP(16,@T[1]),$E); | |
| 1195 | |
| 1196 &jmp (&label("loop")); | |
| 1197 | |
| 1198 &set_label("done",16); $j=$saved_j; @V=@saved_V; | |
| 1199 | |
| 1200 &Xtail_avx(\&body_20_39); | |
| 1201 &Xtail_avx(\&body_20_39); | |
| 1202 &Xtail_avx(\&body_20_39); | |
| 1203 | |
| 1204 &vzeroall(); | |
| 1205 | |
| 1206 &mov (@T[1],&DWP(192,"esp")); # update context: @T[1] = ctx pointer saved at +192 | |
| 1207 &add ($A,&DWP(0,@T[1])); | |
| 1208 &mov ("esp",&DWP(192+12,"esp")); # restore %esp (the frame is not addressed after this point) | |
| 1209 &add (@T[0],&DWP(4,@T[1])); # $b: @T[0] holds the copy of $b that was not rotated | |
| 1210 &add ($C,&DWP(8,@T[1])); | |
| 1211 &mov (&DWP(0,@T[1]),$A); | |
| 1212 &add ($D,&DWP(12,@T[1])); | |
| 1213 &mov (&DWP(4,@T[1]),@T[0]); | |
| 1214 &add ($E,&DWP(16,@T[1])); | |
| 1215 &mov (&DWP(8,@T[1]),$C); | |
| 1216 &mov (&DWP(12,@T[1]),$D); | |
| 1217 &mov (&DWP(16,@T[1]),$E); | |
| 1218 &function_end("_sha1_block_data_order_avx"); | |
| 1219 } | |
# Constant table, aligned to 64 bytes. Each round constant is replicated
# into four dwords so it can be added to a whole vector; the AVX prologue
# reads the five rows at byte offsets 0, 16, 32, 48 and 64 from K_XX_XX.
| 1220 &set_label("K_XX_XX",64); | |
| 1221 &data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19, offset 0 | |
| 1222 &data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39, offset 16 | |
| 1223 &data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59, offset 32 | |
| 1224 &data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79, offset 48 | |
| 1225 &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask, offset 64 (vpshufb control used for the byte swap) | |
| 1226 } | |
| 1227 &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
| 1228 | |
| 1229 &asm_finish(); | |
| OLD | NEW |