#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter/gathering can be tuned without
# modifying bn_exp.c.

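# A rough C-style sketch of the idea (illustrative only, not part of the
# generated code; names are made up):
#
#	/* scatter: limb j of the power selected by idx lands at tbl[32*j+idx] */
#	for (j=0; j<num; j++) tbl[32*j+idx] = inp[j];
#
#	/* gather: the same four cache-line-sized groups are read for every idx,
#	   and masks keep only the group that actually holds the element */
#	for (g=0; g<4; g++)
#		limb |= tbl_row[8*g + (idx&7)] & mask[g];  /* mask[g]=-1 iff g==idx>>3 */
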
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont_gather5(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
$np="%rcx"; # const BN_ULONG *np,
$n0="%r8"; # const BN_ULONG *n0,
$num="%r9"; # int num,
# int idx); # 0 to 2^5-1, "index" in $bp holding
# pre-computed powers of a', interlaced
# in such a manner that b[0] is $bp[idx],
# b[1] is $bp[2^5+idx], etc.
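# e.g. with idx=3 the limbs of the selected power sit
# at $bp[3], $bp[35], $bp[67], etc.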
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl bn_mul_mont_gather5
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
jmp .Lmul4x_enter

.align 16
.Lmul_enter:
mov ${num}d,${num}d
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
___
$code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
.Lmul_alloca:
___
$code.=<<___;
mov %rsp,%rax
lea 2($num),%r11
neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage

mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
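# With these values each limb row of the table is $STRIDE=256 bytes,
# i.e. four 64-byte cache lines of eight qwords each; the shift below
# evaluates to log2($N/8)=3 and the in-line index mask to $N/8-1=7.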
$code.=<<___;
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
and \$`$N/8-1`,%r11
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
lea 96($bp,%r11,8),$bp # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7

movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
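# xmm0 now holds the requested table element: all four groups were read,
# but pand/por keeps only the one whose mask is all-ones, so the set of
# cache lines touched does not depend on the index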

movq %xmm0,$m0 # m0=bp[0]

mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax

xor $i,$i # i=0
xor $j,$j # j=0

movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1

mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax

movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3

imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0

por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0

mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1

lea 1($j),$j # j++
jmp .L1st_enter

.align 16
.L1st:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
mov $lo0,$hi0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1

.L1st_enter:
mulq $m0 # ap[j]*bp[0]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
lea 1($j),$j # j++
mov %rdx,$lo0

mulq $m1 # np[j]*m1
cmp $num,$j
jne .L1st

movq %xmm0,$m0 # bp[1]

add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
mov $lo0,$hi0

xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit

lea 1($i),$i # i++
jmp .Louter
.align 16
.Louter:
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0

movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1

mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx

movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3

imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0

por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0

mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1

lea 1($j),$j # j++
jmp .Linner_enter

.align 16
.Linner:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1

.Linner_enter:
mulq $m0 # ap[j]*bp[i]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
adc \$0,$hi0
lea 1($j),$j # j++

mulq $m1 # np[j]*m1
cmp $num,$j
jne .Linner

movq %xmm0,$m0 # bp[i+1]

add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1

xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
add $lo0,$hi1 # pull upmost overflow bit
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit

lea 1($i),$i # i++
cmp $num,$i
jl .Louter

xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
lea (%rsp),$ap # borrow ap for tp
mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub

sbb \$0,%rax # handle upmost overflow bit
xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
mov ($ap,$i,8),%rax
mov $i,(%rsp,$i,8) # zap temporary vector
mov %rax,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy

mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
___
$code.=<<___ if ($win64);
movaps (%rsi),%xmm6
movaps 0x10(%rsi),%xmm7
lea 0x28(%rsi),%rsi
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
mov ${num}d,${num}d
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
___
$code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
.Lmul4x_alloca:
___
$code.=<<___;
mov %rsp,%rax
lea 4($num),%r11
neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
and \$-1024,%rsp # minimize TLB usage

mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$code.=<<___;
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
and \$`$N/8-1`,%r11
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
lea 96($bp,%r11,8),$bp # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7

movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0

movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax

xor $i,$i # i=0
xor $j,$j # j=0

movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1

mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax

movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3

imulq $A[0],$m1 # "tp[0]"*n0
mov %rdx,$A[1]

por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0

mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]

mulq $m0
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]

mulq $m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1]
lea 4($j),$j # j++
adc \$0,%rdx
mov $N[1],(%rsp)
mov %rdx,$N[0]
jmp .L1st4x
.align 16
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]

mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]

mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]

mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
lea 4($j),$j # j++
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jl .L1st4x

mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]

mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]

movq %xmm0,$m0 # bp[1]

xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit

lea 1($i),$i # i++
.align 4
.Louter4x:
xor $j,$j # j=0
movq `0*$STRIDE/4-96`($bp),%xmm0
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1

mov (%rsp),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx

movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3

imulq $A[0],$m1 # tp[0]*n0
mov %rdx,$A[1]

por %xmm2,%xmm0
lea $STRIDE($bp),$bp
por %xmm3,%xmm0

mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
add 8(%rsp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4($j),$j # j+=2
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
.align 16
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]

mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]

mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]

mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
add 8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 4($j),$j # j++
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[0],-40(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jl .Linner4x

mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]

mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]

mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 1($i),$i # i++
mov %rdx,$A[0]

mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]

movq %xmm0,$m0 # bp[i+1]
mov $N[1],-16(%rsp,$j,8) # tp[j-1]

xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
add (%rsp,$num,8),$N[0] # pull upmost overflow bit
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit

cmp $num,$i
jl .Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
mov 0(%rsp),@ri[0] # tp[0]
pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$num # num/=4
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!

sub 0($np),@ri[0]
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
lea -1($num),$j # j=num/4-1
jmp .Lsub4x
.align 16
.Lsub4x:
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 16($np,$i,8),@ri[2]
mov 32($ap,$i,8),@ri[0] # tp[i+1]
mov 40($ap,$i,8),@ri[1]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 32($np,$i,8),@ri[0]
mov 48($ap,$i,8),@ri[2]
mov 56($ap,$i,8),@ri[3]
sbb 40($np,$i,8),@ri[1]
lea 4($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub4x

mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 32($ap,$i,8),@ri[0] # load overflow bit
sbb 16($np,$i,8),@ri[2]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]

sbb \$0,@ri[0] # handle upmost overflow bit
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
xor $i,$i # i=0
and @ri[0],$ap
not @ri[0]
mov $rp,$np
and @ri[0],$np
lea -1($num),$j
or $np,$ap # ap=borrow?tp:rp

movdqu ($ap),%xmm1
movdqa %xmm0,(%rsp)
movdqu %xmm1,($rp)
jmp .Lcopy4x
.align 16
.Lcopy4x: # copy or in-place refresh
movdqu 16($ap,$i),%xmm2
movdqu 32($ap,$i),%xmm1
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
movdqa %xmm0,32(%rsp,$i)
movdqu %xmm1,32($rp,$i)
lea 32($i),$i
dec $j
jnz .Lcopy4x

shl \$2,$num
movdqu 16($ap,$i),%xmm2
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
___
$code.=<<___ if ($win64);
movaps (%rsi),%xmm6
movaps 0x10(%rsi),%xmm7
lea 0x28(%rsi),%rsi
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lmul4x_epilogue:
ret
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}

{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

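# The C prototypes these are expected to match (an assumption about the
# caller, not spelled out here):
#
#	void bn_scatter5(const BN_ULONG *inp, size_t num, void *tbl, size_t idx);
#	void bn_gather5(BN_ULONG *out, size_t num, void *tbl, size_t idx);
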
$code.=<<___;
.globl bn_scatter5
.type bn_scatter5,\@abi-omnipotent
.align 16
bn_scatter5:
cmp \$0, $num
jz .Lscatter_epilogue
lea ($tbl,$idx,8),$tbl
.Lscatter:
mov ($inp),%rax
lea 8($inp),$inp
mov %rax,($tbl)
lea 32*8($tbl),$tbl
sub \$1,$num
jnz .Lscatter
.Lscatter_epilogue:
ret
.size bn_scatter5,.-bn_scatter5

.globl bn_gather5
.type bn_gather5,\@abi-omnipotent
.align 16
bn_gather5:
___
$code.=<<___ if ($win64);
.LSEH_begin_bn_gather5:
# I can't trust assembler to use specific encoding:-(
.byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
.byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
___
$code.=<<___;
mov $idx,%r11
shr \$`log($N/8)/log(2)`,$idx
and \$`$N/8-1`,%r11
not $idx
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
movq 8(%rax,$idx,8),%xmm5 # cache line contains element
movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
movq 24(%rax,$idx,8),%xmm7
jmp .Lgather
.align 16
.Lgather:
movq `0*$STRIDE/4-96`($tbl),%xmm0
movq `1*$STRIDE/4-96`($tbl),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($tbl),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($tbl),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
lea $STRIDE($tbl),$tbl
por %xmm3,%xmm0

movq %xmm0,($out) # m0=bp[0]
lea 8($out),$out
sub \$1,$num
jnz .Lgather
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
lea 0x28(%rsp),%rsp
___
$code.=<<___;
ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
___
}
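
# .Lmagic_masks below is a row of eight 64-bit masks of which only entry 3
# is all-ones; the not/and arithmetic in the gather paths picks a starting
# offset such that exactly one of xmm4..xmm7 becomes all-ones, namely the
# one covering the cache line that holds the requested element.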
$code.=<<___;
.align 64
.Lmagic_masks:
.long 0,0, 0,0, 0,0, -1,-1
.long 0,0, 0,0, 0,0, 0,0
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
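# mul_handler unwinds the custom stack frame of the routines above so that
# Win64 structured exception handling can step through them; the
# HandlerData[] triplets below mark the end-of-prologue, end-of-allocation
# and epilogue points it keys off.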
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type mul_handler,\@abi-omnipotent
.align 16
mul_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail

lea `40+48`(%rax),%rax

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # end of alloca label
cmp %r10,%rbx # context->Rip<end of alloca label
jb .Lcommon_seh_tail

mov 152($context),%rax # pull context->Rsp

mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail

mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer

movaps (%rax),%xmm0
movaps 16(%rax),%xmm1
lea `40+48`(%rax),%rax

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
movups %xmm0,512($context) # restore context->Xmm6
movups %xmm1,528($context) # restore context->Xmm7

.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size mul_handler,.-mul_handler

.section .pdata
.align 4
.rva .LSEH_begin_bn_mul_mont_gather5
.rva .LSEH_end_bn_mul_mont_gather5
.rva .LSEH_info_bn_mul_mont_gather5

.rva .LSEH_begin_bn_mul4x_mont_gather5
.rva .LSEH_end_bn_mul4x_mont_gather5
.rva .LSEH_info_bn_mul4x_mont_gather5

.rva .LSEH_begin_bn_gather5
.rva .LSEH_end_bn_gather5
.rva .LSEH_info_bn_gather5

.section .xdata
.align 8
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_gather5:
.byte 0x01,0x0d,0x05,0x00
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
.byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
.align 8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;