| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env perl | |
| 2 # | |
| 3 # ==================================================================== | |
| 4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
| 5 # project. The module is, however, dual licensed under OpenSSL and | |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. | |
| 8 # ==================================================================== | |
| 9 | |
| 10 # June 2011 | |
| 11 # | |
| 12 # This is RC4+MD5 "stitch" implementation. The idea, as spelled in | |
| 13 # http://download.intel.com/design/intarch/papers/323686.pdf, is that | |
| 14 # since both algorithms exhibit instruction-level parallelism, ILP, | |
| 15 # below theoretical maximum, interleaving them would allow to utilize | |
| 16 # processor resources better and achieve better performance. RC4 | |
| 17 # instruction sequence is virtually identical to rc4-x86_64.pl, which | |
| 18 # is heavily based on submission by Maxim Perminov, Maxim Locktyukhin | |
| 19 # and Jim Guilford of Intel. MD5 is fresh implementation aiming to | |
| 20 # minimize register usage, which was used as "main thread" with RC4 | |
| 21 # weaved into it, one RC4 round per one MD5 round. In addition to the | |
| 22 # stitched subroutine the script can generate standalone replacement | |
| 23 # md5_block_asm_data_order and RC4. Below are performance numbers in | |
| 24 # cycles per processed byte, less is better, for the standalone | |
| 25 # subroutines, their sum, and the stitched one: | |
| 26 # | |
| 27 # RC4 MD5 RC4+MD5 stitch gain | |
| 28 # Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) | |
| 29 # Core2 6.5 5.8 12.3 7.7 +60% | |
| 30 # Westmere 4.3 5.2 9.5 7.0 +36% | |
| 31 # Sandy Bridge 4.2 5.5 9.7 6.8 +43% | |
| 32 # Atom 9.3 6.5 15.8 11.1 +42% | |
| 33 # | |
| 34 # (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement | |
| 35 # is +53%... | |
| 36 | |
# Generator switches and command-line handling.
# $D is now always defined ("" when MD5 is generated, "#" otherwise): the
# former `my $D="#" if (!$md5);` relied on a conditional `my`, whose
# behaviour perlsyn documents as undefined. "" and the old undef behave the
# same in every later use (string interpolation and boolean tests).
| 37 my ($rc4,$md5)=(1,1); # what to generate? | |
| 38 my $D=$md5 ? "" : "#"; # if set to "#", MD5 is stitched into RC4(), | |
| 39 # but its result is discarded. Idea here is | |
| 40 # to be able to use 'openssl speed rc4' for | |
| 41 # benchmarking the stitched subroutine... | |
| 42 | |
# First argument is the assembler flavour, second the output file; a lone
# argument containing a dot is taken to be the output file.
| 43 my $flavour = shift; | |
| 44 my $output = shift; | |
| 45 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
| 46 | |
| 47 my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
| 48 | |
# Look for x86_64-xlate.pl next to this script, then in ../../perlasm/.
| 49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; | |
| 50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
| 51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
| 52 die "can't locate x86_64-xlate.pl"; | |
| 53 | |
# Everything printed from here on is piped through the translator. Fail
# loudly if the pipe cannot be started instead of printing into a dead handle.
| 54 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; | |
| 55 *STDOUT=*OUT; | |
| 56 | |
# Pick the generated function's name ($func), its argument count ($nargs)
# and the registers its arguments arrive in, depending on which of
# $rc4 / $md5 are enabled:
#   RC4 only  -> RC4 with 4 arguments (key data, length, input, output)
#   MD5 only  -> md5_block_asm_data_order with 3 arguments (ctx, input, length)
#   both      -> rc4_md5_enc with 6 arguments, prototype shown below
| 57 my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); | |
| 58 | |
| 59 if ($rc4 && !$md5) { | |
| 60 ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); | |
| 61 $func="RC4"; $nargs=4; | |
| 62 } elsif ($md5 && !$rc4) { | |
| 63 ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); | |
| 64 $func="md5_block_asm_data_order"; $nargs=3; | |
| 65 } else { | |
| 66 ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | |
| 67 $func="rc4_md5_enc"; $nargs=6; | |
| 68 # void rc4_md5_enc( | |
| 69 # RC4_KEY *key, # | |
| 70 # const void *in0, # RC4 input | |
| 71 # void *out, # RC4 output | |
| 72 # MD5_CTX *ctx, # | |
| 73 # const void *inp, # MD5 input | |
| 74 # size_t len); # number of 64-byte blocks | |
| 75 } | |
| 76 | |
| 77 my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, | |
| 78 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, | |
| 79 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, | |
| 80 0x6b901122,0xfd987193,0xa679438e,0x49b40821, | |
| 81 | |
| 82 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, | |
| 83 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, | |
| 84 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, | |
| 85 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, | |
| 86 | |
| 87 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, | |
| 88 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, | |
| 89 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, | |
| 90 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, | |
| 91 | |
| 92 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, | |
| 93 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, | |
| 94 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, | |
| 95 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); | |
| 96 | |
| 97 my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers | |
| 98 my $tmp="%r12d"; | |
| 99 | |
| 100 my @XX=("%rbp","%rsi"); # RC4 registers | |
| 101 my @TX=("%rax","%rbx"); | |
| 102 my $YY="%rcx"; | |
| 103 my $TY="%rdx"; | |
| 104 | |
| 105 my $MOD=32; # 16, 32 or 64 | |
| 106 | |
| 107 $code.=<<___; | |
| 108 .text | |
| 109 .align 16 | |
| 110 | |
| 111 .globl $func | |
| 112 .type $func,\@function,$nargs | |
| 113 $func: | |
| 114 cmp \$0,$len | |
| 115 je .Labort | |
| 116 push %rbx | |
| 117 push %rbp | |
| 118 push %r12 | |
| 119 push %r13 | |
| 120 push %r14 | |
| 121 push %r15 | |
| 122 sub \$40,%rsp | |
| 123 .Lbody: | |
| 124 ___ | |
| 125 if ($rc4) { | |
| 126 $code.=<<___; | |
| 127 $D#md5# mov $ctx,%r11 # reassign arguments | |
| 128 mov $len,%r12 | |
| 129 mov $in0,%r13 | |
| 130 mov $out,%r14 | |
| 131 $D#md5# mov $inp,%r15 | |
| 132 ___ | |
| 133 $ctx="%r11" if ($md5); # reassign arguments | |
| 134 $len="%r12"; | |
| 135 $in0="%r13"; | |
| 136 $out="%r14"; | |
| 137 $inp="%r15" if ($md5); | |
| 138 $inp=$in0 if (!$md5); | |
| 139 $code.=<<___; | |
| 140 xor $XX[0],$XX[0] | |
| 141 xor $YY,$YY | |
| 142 | |
| 143 lea 8($dat),$dat | |
| 144 mov -8($dat),$XX[0]#b | |
| 145 mov -4($dat),$YY#b | |
| 146 | |
| 147 inc $XX[0]#b | |
| 148 sub $in0,$out | |
| 149 movl ($dat,$XX[0],4),$TX[0]#d | |
| 150 ___ | |
| 151 $code.=<<___ if (!$md5); | |
| 152 xor $TX[1],$TX[1] | |
| 153 test \$-128,$len | |
| 154 jz .Loop1 | |
| 155 sub $XX[0],$TX[1] | |
| 156 and \$`$MOD-1`,$TX[1] | |
| 157 jz .Loop${MOD}_is_hot | |
| 158 sub $TX[1],$len | |
| 159 .Loop${MOD}_warmup: | |
| 160 add $TX[0]#b,$YY#b | |
| 161 movl ($dat,$YY,4),$TY#d | |
| 162 movl $TX[0]#d,($dat,$YY,4) | |
| 163 movl $TY#d,($dat,$XX[0],4) | |
| 164 add $TY#b,$TX[0]#b | |
| 165 inc $XX[0]#b | |
| 166 movl ($dat,$TX[0],4),$TY#d | |
| 167 movl ($dat,$XX[0],4),$TX[0]#d | |
| 168 xorb ($in0),$TY#b | |
| 169 movb $TY#b,($out,$in0) | |
| 170 lea 1($in0),$in0 | |
| 171 dec $TX[1] | |
| 172 jnz .Loop${MOD}_warmup | |
| 173 | |
| 174 mov $YY,$TX[1] | |
| 175 xor $YY,$YY | |
| 176 mov $TX[1]#b,$YY#b | |
| 177 | |
| 178 .Loop${MOD}_is_hot: | |
| 179 mov $len,32(%rsp) # save original $len | |
| 180 shr \$6,$len # number of 64-byte blocks | |
| 181 ___ | |
| 182 if ($D && !$md5) { # stitch in dummy MD5 | |
| 183 $md5=1; | |
| 184 $ctx="%r11"; | |
| 185 $inp="%r15"; | |
| 186 $code.=<<___; | |
| 187 mov %rsp,$ctx | |
| 188 mov $in0,$inp | |
| 189 ___ | |
| 190 } | |
| 191 } | |
| 192 $code.=<<___; | |
| 193 #rc4# add $TX[0]#b,$YY#b | |
| 194 #rc4# lea ($dat,$XX[0],4),$XX[1] | |
| 195 shl \$6,$len | |
| 196 add $inp,$len # pointer to the end of input | |
| 197 mov $len,16(%rsp) | |
| 198 | |
| 199 #md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX | |
| 200 #md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX | |
| 201 #md5# mov 1*4($ctx),$V[1] | |
| 202 #md5# mov 2*4($ctx),$V[2] | |
| 203 #md5# mov 3*4($ctx),$V[3] | |
| 204 jmp .Loop | |
| 205 | |
| 206 .align 16 | |
| 207 .Loop: | |
| 208 #md5# mov $V[0],0*4(%rsp) # put aside current hash value | |
| 209 #md5# mov $V[1],1*4(%rsp) | |
| 210 #md5# mov $V[2],2*4(%rsp) | |
| 211 #md5# mov $V[3],$tmp # forward reference | |
| 212 #md5# mov $V[3],3*4(%rsp) | |
| 213 ___ | |
| 214 | |
# R0 -- append one stitched round to $code for the first 16-round group
# (the driver calls it with $i = 0..15).
#   $i           overall round number; selects the additive constant $K[$i]
#   $a,$b,$c,$d  the four MD5 working registers in their roles for this round
# Template lines tagged #md5# belong to the MD5 stream, #rc4# to the RC4 stream.
# MD5 side: with $tmp holding $d on entry ("forward reference" set up by the
# previous round), computes ((tmp ^ c) & b) ^ d, adds input word $j and
# $K[$i] to $a, rotates $a left by $rot0[$j%4] and adds $b.
# RC4 side: one table swap step per round; the looked-up entry is inserted
# with pinsrw into word ($j>>1)&7 of %xmm0 or %xmm1.
| 215 sub R0 { | |
| 216 my ($i,$a,$b,$c,$d)=@_; | |
# left-rotate counts, selected by $j%4
| 217 my @rot0=(7,12,17,22); | |
# $j: position inside this 16-round group; $k: position inside the
# $MOD-round window used for the 4*$k($XX[1]) table offsets
| 218 my $j=$i%16; | |
| 219 my $k=$i%$MOD; | |
# even rounds insert into %xmm0, odd rounds into %xmm1
| 220 my $xmm="%xmm".($j&1); | |
# last round of the group: fetch the 16 input bytes at ($in0) into %xmm2
| 221 $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); | |
# end of a $MOD-round window: add $MOD to the low byte of $XX[0]
| 222 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1
); | |
# rounds 0 and 1 zero %xmm0 / %xmm1 before the first pinsrw into them
| 223 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | |
| 224 $code.=<<___; | |
| 225 #rc4# movl ($dat,$YY,4),$TY#d | |
| 226 #md5# xor $c,$tmp | |
| 227 #rc4# movl $TX[0]#d,($dat,$YY,4) | |
| 228 #md5# and $b,$tmp | |
| 229 #md5# add 4*`$j`($inp),$a | |
| 230 #rc4# add $TY#b,$TX[0]#b | |
| 231 #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#
d | |
| 232 #md5# add \$$K[$i],$a | |
| 233 #md5# xor $d,$tmp | |
| 234 #rc4# movz $TX[0]#b,$TX[0]#d | |
| 235 #rc4# movl $TY#d,4*$k($XX[1]) | |
| 236 #md5# add $tmp,$a | |
| 237 #rc4# add $TX[1]#b,$YY#b | |
| 238 #md5# rol \$$rot0[$j%4],$a | |
| 239 #md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference | |
| 240 #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | |
| 241 #md5# add $b,$a | |
| 242 ___ | |
# window end: reduce $YY to its low byte (copy out, clear, copy the byte
# back) and point $XX[1] at the table entry indexed by $XX[0]
| 243 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | |
| 244 mov $YY,$XX[1] | |
| 245 xor $YY,$YY # keyword to partial register | |
| 246 mov $XX[1]#b,$YY#b | |
| 247 lea ($dat,$XX[0],4),$XX[1] | |
| 248 ___ | |
# group end: shift %xmm1 left by 8 bits and XOR both accumulators into the
# input bytes held in %xmm2
| 249 $code.=<<___ if ($rc4 && $j==15); | |
| 250 psllq \$8,%xmm1 | |
| 251 pxor %xmm0,%xmm2 | |
| 252 pxor %xmm1,%xmm2 | |
| 253 ___ | |
| 254 } | |
# R1 -- append one stitched round to $code for the second 16-round group
# (the driver calls it with $i = 16..31).
#   $i           overall round number; selects the additive constant $K[$i]
#   $a,$b,$c,$d  the four MD5 working registers in their roles for this round
# MD5 side: with $tmp holding $c on entry, computes ((tmp ^ b) & d) ^ c, adds
# input word (1+5*$j)%16 and $K[$i] to $a, rotates $a left by $rot1[$j%4]
# and adds $b.
# RC4 side: same per-round table step as in the other groups; the 16 bytes
# at 16($in0) are combined into %xmm3 on the last round.
| 255 sub R1 { | |
| 256 my ($i,$a,$b,$c,$d)=@_; | |
# left-rotate counts, selected by $j%4
| 257 my @rot1=(5,9,14,20); | |
# $j: position inside this 16-round group; $k: position inside the
# $MOD-round window used for the 4*$k($XX[1]) table offsets
| 258 my $j=$i%16; | |
| 259 my $k=$i%$MOD; | |
# even rounds insert into %xmm0, odd rounds into %xmm1
| 260 my $xmm="%xmm".($j&1); | |
# last round of the group: fetch the 16 input bytes at 16($in0) into %xmm3
| 261 $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); | |
# end of a $MOD-round window: add $MOD to the low byte of $XX[0]
| 262 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1
); | |
# rounds 0 and 1 of the group zero %xmm0 / %xmm1 before reuse
| 263 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | |
| 264 $code.=<<___; | |
| 265 #rc4# movl ($dat,$YY,4),$TY#d | |
| 266 #md5# xor $b,$tmp | |
| 267 #rc4# movl $TX[0]#d,($dat,$YY,4) | |
| 268 #md5# and $d,$tmp | |
| 269 #md5# add 4*`((1+5*$j)%16)`($inp),$a | |
| 270 #rc4# add $TY#b,$TX[0]#b | |
| 271 #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#
d | |
| 272 #md5# add \$$K[$i],$a | |
| 273 #md5# xor $c,$tmp | |
| 274 #rc4# movz $TX[0]#b,$TX[0]#d | |
| 275 #rc4# movl $TY#d,4*$k($XX[1]) | |
| 276 #md5# add $tmp,$a | |
| 277 #rc4# add $TX[1]#b,$YY#b | |
| 278 #md5# rol \$$rot1[$j%4],$a | |
| 279 #md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference | |
| 280 #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | |
| 281 #md5# add $b,$a | |
| 282 ___ | |
# window end: reduce $YY to its low byte (copy out, clear, copy the byte
# back) and point $XX[1] at the table entry indexed by $XX[0]
| 283 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | |
| 284 mov $YY,$XX[1] | |
| 285 xor $YY,$YY # keyword to partial register | |
| 286 mov $XX[1]#b,$YY#b | |
| 287 lea ($dat,$XX[0],4),$XX[1] | |
| 288 ___ | |
# group end: shift %xmm1 left by 8 bits and XOR both accumulators into the
# input bytes held in %xmm3
| 289 $code.=<<___ if ($rc4 && $j==15); | |
| 290 psllq \$8,%xmm1 | |
| 291 pxor %xmm0,%xmm3 | |
| 292 pxor %xmm1,%xmm3 | |
| 293 ___ | |
| 294 } | |
# R2 -- append one stitched round to $code for the third 16-round group
# (the driver calls it with $i = 32..47).
#   $i           overall round number; selects the additive constant $K[$i]
#   $a,$b,$c,$d  the four MD5 working registers in their roles for this round
# MD5 side: with $tmp holding $d on entry, computes tmp ^ c ^ b, adds input
# word (5+3*$j)%16 and $K[$i] to $a, rotates $a left by $rot2[$j%4] and adds
# $b. On the last round $tmp is preloaded with -1 for the next group.
# RC4 side: same per-round table step as in the other groups; the 16 bytes
# at 32($in0) are combined into %xmm4 on the last round.
| 295 sub R2 { | |
| 296 my ($i,$a,$b,$c,$d)=@_; | |
# left-rotate counts, selected by $j%4
| 297 my @rot2=(4,11,16,23); | |
# $j: position inside this 16-round group; $k: position inside the
# $MOD-round window used for the 4*$k($XX[1]) table offsets
| 298 my $j=$i%16; | |
| 299 my $k=$i%$MOD; | |
# even rounds insert into %xmm0, odd rounds into %xmm1
| 300 my $xmm="%xmm".($j&1); | |
# last round of the group: fetch the 16 input bytes at 32($in0) into %xmm4
| 301 $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); | |
# end of a $MOD-round window: add $MOD to the low byte of $XX[0]
| 302 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1
); | |
# rounds 0 and 1 of the group zero %xmm0 / %xmm1 before reuse
| 303 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | |
| 304 $code.=<<___; | |
| 305 #rc4# movl ($dat,$YY,4),$TY#d | |
| 306 #md5# xor $c,$tmp | |
| 307 #rc4# movl $TX[0]#d,($dat,$YY,4) | |
| 308 #md5# xor $b,$tmp | |
| 309 #md5# add 4*`((5+3*$j)%16)`($inp),$a | |
| 310 #rc4# add $TY#b,$TX[0]#b | |
| 311 #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#
d | |
| 312 #md5# add \$$K[$i],$a | |
| 313 #rc4# movz $TX[0]#b,$TX[0]#d | |
| 314 #md5# add $tmp,$a | |
| 315 #rc4# movl $TY#d,4*$k($XX[1]) | |
| 316 #rc4# add $TX[1]#b,$YY#b | |
| 317 #md5# rol \$$rot2[$j%4],$a | |
| 318 #md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference | |
| 319 #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | |
| 320 #md5# add $b,$a | |
| 321 ___ | |
# window end: reduce $YY to its low byte (copy out, clear, copy the byte
# back) and point $XX[1] at the table entry indexed by $XX[0]
| 322 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | |
| 323 mov $YY,$XX[1] | |
| 324 xor $YY,$YY # keyword to partial register | |
| 325 mov $XX[1]#b,$YY#b | |
| 326 lea ($dat,$XX[0],4),$XX[1] | |
| 327 ___ | |
# group end: shift %xmm1 left by 8 bits and XOR both accumulators into the
# input bytes held in %xmm4
| 328 $code.=<<___ if ($rc4 && $j==15); | |
| 329 psllq \$8,%xmm1 | |
| 330 pxor %xmm0,%xmm4 | |
| 331 pxor %xmm1,%xmm4 | |
| 332 ___ | |
| 333 } | |
# R3 -- append one stitched round to $code for the fourth 16-round group
# (the driver calls it with $i = 48..63).
#   $i           overall round number; selects the additive constant $K[$i]
#   $a,$b,$c,$d  the four MD5 working registers in their roles for this round
# MD5 side: with $tmp holding -1 on entry, computes ((tmp ^ d) | b) ^ c, adds
# input word (7*$j)%16 and $K[$i] to $a, rotates $a left by $rot3[$j%4] and
# adds $b; $tmp is reset to -1 at the end of every round.
# RC4 side: same per-round table step as in the other groups; the 16 bytes
# at 48($in0) are combined into %xmm5 on the last round.
| 334 sub R3 { | |
| 335 my ($i,$a,$b,$c,$d)=@_; | |
# left-rotate counts, selected by $j%4
| 336 my @rot3=(6,10,15,21); | |
# $j: position inside this 16-round group; $k: position inside the
# $MOD-round window used for the 4*$k($XX[1]) table offsets
| 337 my $j=$i%16; | |
| 338 my $k=$i%$MOD; | |
# even rounds insert into %xmm0, odd rounds into %xmm1
| 339 my $xmm="%xmm".($j&1); | |
# last round of the group: fetch the 16 input bytes at 48($in0) into %xmm5
| 340 $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); | |
# end of a $MOD-round window: add $MOD to the low byte of $XX[0]
| 341 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1
); | |
# rounds 0 and 1 of the group zero %xmm0 / %xmm1 before reuse
| 342 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | |
| 343 $code.=<<___; | |
| 344 #rc4# movl ($dat,$YY,4),$TY#d | |
| 345 #md5# xor $d,$tmp | |
| 346 #rc4# movl $TX[0]#d,($dat,$YY,4) | |
| 347 #md5# or $b,$tmp | |
| 348 #md5# add 4*`((7*$j)%16)`($inp),$a | |
| 349 #rc4# add $TY#b,$TX[0]#b | |
| 350 #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#
d | |
| 351 #md5# add \$$K[$i],$a | |
| 352 #rc4# movz $TX[0]#b,$TX[0]#d | |
| 353 #md5# xor $c,$tmp | |
| 354 #rc4# movl $TY#d,4*$k($XX[1]) | |
| 355 #md5# add $tmp,$a | |
| 356 #rc4# add $TX[1]#b,$YY#b | |
| 357 #md5# rol \$$rot3[$j%4],$a | |
| 358 #md5# mov \$-1,$tmp # forward reference | |
| 359 #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | |
| 360 #md5# add $b,$a | |
| 361 ___ | |
# last round of the block (unlike R0..R2 this is not gated on $k): reduce
# both $XX[0] and $YY to their low bytes, point $XX[1] at the table entry
# indexed by $XX[0], then shift %xmm1 left by 8 bits and XOR both
# accumulators into the input bytes held in %xmm5
| 362 $code.=<<___ if ($rc4 && $j==15); | |
| 363 mov $XX[0],$XX[1] | |
| 364 xor $XX[0],$XX[0] # keyword to partial register | |
| 365 mov $XX[1]#b,$XX[0]#b | |
| 366 mov $YY,$XX[1] | |
| 367 xor $YY,$YY # keyword to partial register | |
| 368 mov $XX[1]#b,$YY#b | |
| 369 lea ($dat,$XX[0],4),$XX[1] | |
| 370 psllq \$8,%xmm1 | |
| 371 pxor %xmm0,%xmm5 | |
| 372 pxor %xmm1,%xmm5 | |
| 373 ___ | |
| 374 } | |
| 375 | |
# Emit all 64 rounds: R0 for rounds 0..15, R1 for 16..31, R2 for 32..47 and
# R3 for 48..63. After every round @V is rotated right by one
# (unshift/pop), so the same four registers take the next round's a,b,c,d
# roles, and the two @TX entries are swapped (push/shift).
| 376 my $i=0; | |
| 377 for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | |
| 378 for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | |
| 379 for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | |
| 380 for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | |
| 381 | |
| 382 $code.=<<___; | |
| 383 #md5# add 0*4(%rsp),$V[0] # accumulate hash value | |
| 384 #md5# add 1*4(%rsp),$V[1] | |
| 385 #md5# add 2*4(%rsp),$V[2] | |
| 386 #md5# add 3*4(%rsp),$V[3] | |
| 387 | |
| 388 #rc4# movdqu %xmm2,($out,$in0) # write RC4 output | |
| 389 #rc4# movdqu %xmm3,16($out,$in0) | |
| 390 #rc4# movdqu %xmm4,32($out,$in0) | |
| 391 #rc4# movdqu %xmm5,48($out,$in0) | |
| 392 #md5# lea 64($inp),$inp | |
| 393 #rc4# lea 64($in0),$in0 | |
| 394 cmp 16(%rsp),$inp # are we done? | |
| 395 jb .Loop | |
| 396 | |
| 397 #md5# mov 24(%rsp),$len # restore pointer to MD5_CTX | |
| 398 #rc4# sub $TX[0]#b,$YY#b # correct $YY | |
| 399 #md5# mov $V[0],0*4($len) # write MD5_CTX | |
| 400 #md5# mov $V[1],1*4($len) | |
| 401 #md5# mov $V[2],2*4($len) | |
| 402 #md5# mov $V[3],3*4($len) | |
| 403 ___ | |
| 404 $code.=<<___ if ($rc4 && (!$md5 || $D)); | |
| 405 mov 32(%rsp),$len # restore original $len | |
| 406 and \$63,$len # remaining bytes | |
| 407 jnz .Loop1 | |
| 408 jmp .Ldone | |
| 409 | |
| 410 .align 16 | |
| 411 .Loop1: | |
| 412 add $TX[0]#b,$YY#b | |
| 413 movl ($dat,$YY,4),$TY#d | |
| 414 movl $TX[0]#d,($dat,$YY,4) | |
| 415 movl $TY#d,($dat,$XX[0],4) | |
| 416 add $TY#b,$TX[0]#b | |
| 417 inc $XX[0]#b | |
| 418 movl ($dat,$TX[0],4),$TY#d | |
| 419 movl ($dat,$XX[0],4),$TX[0]#d | |
| 420 xorb ($in0),$TY#b | |
| 421 movb $TY#b,($out,$in0) | |
| 422 lea 1($in0),$in0 | |
| 423 dec $len | |
| 424 jnz .Loop1 | |
| 425 | |
| 426 .Ldone: | |
| 427 ___ | |
| 428 $code.=<<___; | |
| 429 #rc4# sub \$1,$XX[0]#b | |
| 430 #rc4# movl $XX[0]#d,-8($dat) | |
| 431 #rc4# movl $YY#d,-4($dat) | |
| 432 | |
| 433 mov 40(%rsp),%r15 | |
| 434 mov 48(%rsp),%r14 | |
| 435 mov 56(%rsp),%r13 | |
| 436 mov 64(%rsp),%r12 | |
| 437 mov 72(%rsp),%rbp | |
| 438 mov 80(%rsp),%rbx | |
| 439 lea 88(%rsp),%rsp | |
| 440 .Lepilogue: | |
| 441 .Labort: | |
| 442 ret | |
| 443 .size $func,.-$func | |
| 444 ___ | |
| 445 | |
| 446 if ($rc4 && $D) { # sole purpose of this section is to provide | |
| 447 # option to use the generated module as drop-in | |
| 448 # replacement for rc4-x86_64.pl for debugging | |
| 449 # and testing purposes... | |
| 450 my ($idx,$ido)=("%r8","%r9"); | |
| 451 my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); | |
| 452 | |
| 453 $code.=<<___; | |
| 454 .globl RC4_set_key | |
| 455 .type RC4_set_key,\@function,3 | |
| 456 .align 16 | |
| 457 RC4_set_key: | |
| 458 lea 8($dat),$dat | |
| 459 lea ($inp,$len),$inp | |
| 460 neg $len | |
| 461 mov $len,%rcx | |
| 462 xor %eax,%eax | |
| 463 xor $ido,$ido | |
| 464 xor %r10,%r10 | |
| 465 xor %r11,%r11 | |
| 466 jmp .Lw1stloop | |
| 467 | |
| 468 .align 16 | |
| 469 .Lw1stloop: | |
| 470 mov %eax,($dat,%rax,4) | |
| 471 add \$1,%al | |
| 472 jnc .Lw1stloop | |
| 473 | |
| 474 xor $ido,$ido | |
| 475 xor $idx,$idx | |
| 476 .align 16 | |
| 477 .Lw2ndloop: | |
| 478 mov ($dat,$ido,4),%r10d | |
| 479 add ($inp,$len,1),$idx#b | |
| 480 add %r10b,$idx#b | |
| 481 add \$1,$len | |
| 482 mov ($dat,$idx,4),%r11d | |
| 483 cmovz %rcx,$len | |
| 484 mov %r10d,($dat,$idx,4) | |
| 485 mov %r11d,($dat,$ido,4) | |
| 486 add \$1,$ido#b | |
| 487 jnc .Lw2ndloop | |
| 488 | |
| 489 xor %eax,%eax | |
| 490 mov %eax,-8($dat) | |
| 491 mov %eax,-4($dat) | |
| 492 ret | |
| 493 .size RC4_set_key,.-RC4_set_key | |
| 494 | |
| 495 .globl RC4_options | |
| 496 .type RC4_options,\@abi-omnipotent | |
| 497 .align 16 | |
| 498 RC4_options: | |
| 499 lea .Lopts(%rip),%rax | |
| 500 ret | |
| 501 .align 64 | |
| 502 .Lopts: | |
| 503 .asciz "rc4(64x,int)" | |
| 504 .align 64 | |
| 505 .size RC4_options,.-RC4_options | |
| 506 ___ | |
| 507 } | |
| 508 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
| 509 # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
| 510 if ($win64) { | |
| 511 my $rec="%rcx"; | |
| 512 my $frame="%rdx"; | |
| 513 my $context="%r8"; | |
| 514 my $disp="%r9"; | |
| 515 | |
| 516 $code.=<<___; | |
| 517 .extern __imp_RtlVirtualUnwind | |
| 518 .type se_handler,\@abi-omnipotent | |
| 519 .align 16 | |
| 520 se_handler: | |
| 521 push %rsi | |
| 522 push %rdi | |
| 523 push %rbx | |
| 524 push %rbp | |
| 525 push %r12 | |
| 526 push %r13 | |
| 527 push %r14 | |
| 528 push %r15 | |
| 529 pushfq | |
| 530 sub \$64,%rsp | |
| 531 | |
| 532 mov 120($context),%rax # pull context->Rax | |
| 533 mov 248($context),%rbx # pull context->Rip | |
| 534 | |
| 535 lea .Lbody(%rip),%r10 | |
| 536 cmp %r10,%rbx # context->Rip<.Lbody | |
| 537 jb .Lin_prologue | |
| 538 | |
| 539 mov 152($context),%rax # pull context->Rsp | |
| 540 | |
| 541 lea .Lepilogue(%rip),%r10 | |
| 542 cmp %r10,%rbx # context->Rip>=.Lepilogue | |
| 543 jae .Lin_prologue | |
| 544 | |
| 545 mov 40(%rax),%r15 | |
| 546 mov 48(%rax),%r14 | |
| 547 mov 56(%rax),%r13 | |
| 548 mov 64(%rax),%r12 | |
| 549 mov 72(%rax),%rbp | |
| 550 mov 80(%rax),%rbx | |
| 551 lea 88(%rax),%rax | |
| 552 | |
| 553 mov %rbx,144($context) # restore context->Rbx | |
| 554 mov %rbp,160($context) # restore context->Rbp | |
| 555 mov %r12,216($context) # restore context->R12 | |
| 556 mov %r13,224($context) # restore context->R13 | |
| 557 mov %r14,232($context) # restore context->R14 | |
| 558 mov %r15,240($context) # restore context->R15 | |
| 559 | |
| 560 .Lin_prologue: | |
| 561 mov 8(%rax),%rdi | |
| 562 mov 16(%rax),%rsi | |
| 563 mov %rax,152($context) # restore context->Rsp | |
| 564 mov %rsi,168($context) # restore context->Rsi | |
| 565 mov %rdi,176($context) # restore context->Rdi | |
| 566 | |
| 567 mov 40($disp),%rdi # disp->ContextRecord | |
| 568 mov $context,%rsi # context | |
| 569 mov \$154,%ecx # sizeof(CONTEXT) | |
| 570 .long 0xa548f3fc # cld; rep movsq | |
| 571 | |
| 572 mov $disp,%rsi | |
| 573 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
| 574 mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
| 575 mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
| 576 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
| 577 mov 40(%rsi),%r10 # disp->ContextRecord | |
| 578 lea 56(%rsi),%r11 # &disp->HandlerData | |
| 579 lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
| 580 mov %r10,32(%rsp) # arg5 | |
| 581 mov %r11,40(%rsp) # arg6 | |
| 582 mov %r12,48(%rsp) # arg7 | |
| 583 mov %rcx,56(%rsp) # arg8, (NULL) | |
| 584 call *__imp_RtlVirtualUnwind(%rip) | |
| 585 | |
| 586 mov \$1,%eax # ExceptionContinueSearch | |
| 587 add \$64,%rsp | |
| 588 popfq | |
| 589 pop %r15 | |
| 590 pop %r14 | |
| 591 pop %r13 | |
| 592 pop %r12 | |
| 593 pop %rbp | |
| 594 pop %rbx | |
| 595 pop %rdi | |
| 596 pop %rsi | |
| 597 ret | |
| 598 .size se_handler,.-se_handler | |
| 599 | |
| 600 .section .pdata | |
| 601 .align 4 | |
| 602 .rva .LSEH_begin_$func | |
| 603 .rva .LSEH_end_$func | |
| 604 .rva .LSEH_info_$func | |
| 605 | |
| 606 .section .xdata | |
| 607 .align 8 | |
| 608 .LSEH_info_$func: | |
| 609 .byte 9,0,0,0 | |
| 610 .rva se_handler | |
| 611 ___ | |
| 612 } | |
| 613 | |
# reg_part -- map a register name to the sub-register of the requested width.
#   $reg   register name as written in the templates, e.g. "%rax" or "%r12"
#   $conv  "b" (byte), "w" (word) or "d" (dword)
# Returns the rewritten name; any other $conv returns $reg unchanged.
| 614 sub reg_part { | |
| 615 my ($reg,$conv)=@_; | |
# numbered registers simply take the suffix: %r12 + "d" -> %r12d
| 616 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } | |
# byte: drop the e/r prefix and a trailing x, append l (%rax -> %al, %rbp -> %bpl)
| 617 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } | |
# word: drop the e/r prefix (%rax -> %ax)
| 618 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } | |
# dword: replace the e/r prefix with e (%rax -> %eax)
| 619 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } | |
| 620 return $reg; | |
| 621 } | |
| 622 | |
# Post-process the accumulated template text and send it to the translator.
# 1. "%reg#b", "%reg#w", "%reg#d" markers become sized register names via
#    reg_part().
# 2. Text between backticks is evaluated as a Perl expression and replaced
#    by its value.
# 3. "pinsrw $0," is rewritten to "movd ".
| 623 $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; | |
| 624 $code =~ s/\`([^\`]*)\`/eval $1/gem; | |
| 625 $code =~ s/pinsrw\s+\$0,/movd /gm; | |
| 626 | |
# Strip the #md5# / #rc4# tags of the enabled algorithms so their lines
# become live; lines of a disabled algorithm keep the leading "#"
# (presumably discarded downstream as comments -- confirm against the
# translator).
| 627 $code =~ s/#md5#//gm if ($md5); | |
| 628 $code =~ s/#rc4#//gm if ($rc4); | |
| 629 | |
| 630 print $code; | |
| 631 | |
# STDOUT is the pipe to x86_64-xlate.pl: closing it waits for the child, so
# a false return means the write or the translator itself failed. Report it
# instead of exiting successfully with incomplete output.
| 632 close STDOUT or die "error closing pipe to x86_64-xlate.pl: $! (status $?)"; | |
| OLD | NEW |