OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env perl |
| 2 |
| 3 # ==================================================================== |
| 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 # project. The module is, however, dual licensed under OpenSSL and |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 # ==================================================================== |
| 9 |
| 10 # December 2007 |
| 11 |
| 12 # The reason for the undertaken effort is basically the following. Even though |
| 13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI |
| 14 # performance was observed to be less than impressive, essentially as |
| 15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. |
| 16 # Well, it's not surprising that IBM had to make some sacrifices to |
| 17 # boost the clock frequency that much, but no overall improvement? |
| 18 # Having observed how much difference switching to FPU made on |
| 19 # UltraSPARC, playing same stunt on Power 6 appeared appropriate... |
| 20 # Unfortunately the resulting performance improvement is not as |
| 21 # impressive, ~30%, and in absolute terms is still very far from what |
| 22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing |
| 23 # something wrong, but in the lack of assembler level micro-profiling |
| 24 # data or at least decent platform guide I can't tell... Or better |
| 25 # results might be achieved with VMX... Anyway, this module provides |
| 26 # *worse* performance on other PowerPC implementations, ~40-15% slower |
| 27 # on PPC970 depending on key length and ~40% slower on Power 5 for all |
| 28 # key lengths. As it's obviously inappropriate as "best all-round" |
| 29 # alternative, it has to be complemented with run-time CPU family |
| 30 # detection. Oh! It should also be noted that unlike other PowerPC |
| 31 # implementations, IALU ppc-mont.pl module performs *suboptimally* on |
| 32 # >=1024-bit key lengths on Power 6. It should also be noted that |
| 33 # *everything* said so far applies to 64-bit builds! As far as 32-bit |
| 34 # application executed on 64-bit CPU goes, this module is likely to |
| 35 # become preferred choice, because it's easy to adapt it for such |
| 36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors. |
| 37 |
| 38 # February 2008 |
| 39 |
| 40 # Micro-profiling assisted optimization results in ~15% improvement |
| 41 # over original ppc64-mont.pl version, or overall ~50% improvement |
| 42 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same |
| 43 # Power 6 CPU, this module is 5-150% faster depending on key length, |
| 44 # [hereafter] more for longer keys. But if compared to ppc-mont.pl |
| 45 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive |
| 46 # in absolute terms, but it's apparently the way Power 6 is... |
| 47 |
| 48 $flavour = shift; # first command-line argument; selects the 32- or 64-bit settings below |
| 49 |
| 50 if ($flavour =~ /32/) { |
| 51 $SIZE_T=4; # byte size of one saved-GPR slot in the stack frame |
| 52 $RZONE= 224; # extra bytes added to the alloca size (presumably the ABI red zone -- confirm) |
| 53 $FRAME= $SIZE_T*12+8*12; # 12 size_t slots plus 12 8-byte FPR save slots |
| 54 $fname= "bn_mul_mont_ppc64"; # name of the emitted global symbol |
| 55 |
| 56 $STUX= "stwux"; # store indexed and update |
| 57 $PUSH= "stw"; # mnemonic used to save the non-volatile GPRs |
| 58 $POP= "lwz"; # mnemonic used to restore the GPRs and the stack pointer |
| 59 die "not implemented yet"; # the 32-bit flavour always aborts here |
| 60 } elsif ($flavour =~ /64/) { |
| 61 $SIZE_T=8; # byte size of one saved-GPR slot in the stack frame |
| 62 $RZONE= 288; # extra bytes added to the alloca size (presumably the ABI red zone -- confirm) |
| 63 $FRAME= $SIZE_T*12+8*12; # 12 size_t slots plus 12 8-byte FPR save slots |
| 64 $fname= "bn_mul_mont"; # name of the emitted global symbol |
| 65 |
| 66 # same as above, but 64-bit mnemonics... |
| 67 $STUX= "stdux"; # store indexed and update |
| 68 $PUSH= "std"; # mnemonic used to save the non-volatile GPRs |
| 69 $POP= "ld"; # mnemonic used to restore the GPRs and the stack pointer |
| 70 } else { die "nonsense $flavour"; } |
| 71 |
| 72 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path, if it has one |
| 73 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or # look next to this script first |
| 74 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or # then in ../../perlasm |
| 75 die "can't locate ppc-xlate.pl"; |
| 76 # Pipe everything printed to STDOUT through ppc-xlate.pl; "or" (not "||") so that die guards open itself. |
| 77 open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!"; |
| 78 |
| 79 $FRAME=($FRAME+63)&~63; # round the fixed frame size up to a multiple of 64 bytes |
| 80 $TRANSFER=16*8; # size of the gpr<->fpr transfer zone: 16 slots of 8 bytes |
| 81 |
| 82 $carry="r0"; # carry propagated between the 16-bit-shifted partial sums |
| 83 $sp="r1"; # stack pointer |
| 84 $toc="r2"; # not referenced by the code below |
| 85 $rp="r3"; $ovf="r3"; # r3 holds rp on entry; it is reused as $ovf after rp is copied to r9 |
| 86 $ap="r4"; # pointer to a[] |
| 87 $bp="r5"; # pointer to b[] |
| 88 $np="r6"; # pointer to n[] |
| 89 $n0="r7"; # pointer to n0 on entry, overwritten with the loaded n0 value |
| 90 $num="r8"; # word count on entry, later scaled to a byte count |
| 91 $rp="r9"; # $rp is reassigned |
| 92 $tp="r10"; # pointer into the tmp[] vector on the stack |
| 93 $j="r11"; # inner-loop iteration count, loaded into CTR |
| 94 $i="r12"; # outer-loop byte index; also scratch for the alignment masks |
| 95 # non-volatile registers |
| 96 $nap_d="r14"; # interleaved ap and np in double format |
| 97 $a0="r15"; # ap[0] |
| 98 $t0="r16"; # temporary registers |
| 99 $t1="r17"; |
| 100 $t2="r18"; |
| 101 $t3="r19"; |
| 102 $t4="r20"; |
| 103 $t5="r21"; |
| 104 $t6="r22"; |
| 105 $t7="r23"; |
| 106 |
| 107 # PPC offers enough register bank capacity to unroll inner loops twice |
| 108 # |
| 109 # ..A3A2A1A0 |
| 110 # dcba |
| 111 # ----------- |
| 112 # A0a |
| 113 # A0b |
| 114 # A0c |
| 115 # A0d |
| 116 # A1a |
| 117 # A1b |
| 118 # A1c |
| 119 # A1d |
| 120 # A2a |
| 121 # A2b |
| 122 # A2c |
| 123 # A2d |
| 124 # A3a |
| 125 # A3b |
| 126 # A3c |
| 127 # A3d |
| 128 # ..a |
| 129 # ..b |
| 130 # |
| 131 $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; # bp[i] split into four 16-bit pieces, in double format |
| 132 $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; # (ap[0]*bp[i]+tp[0])*n0 split into four 16-bit pieces |
| 133 $dota="f8"; $dotb="f9"; # top partial products, carried into the next loop iteration |
| 134 $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; # a[j] and a[j+1] as 32-bit halves, in double format |
| 135 $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; # n[j] and n[j+1] as 32-bit halves, in double format |
| 136 $T0a="f18"; $T0b="f19"; # partial-sum accumulators; converted with fctid and |
| 137 $T1a="f20"; $T1b="f21"; # stored to the transfer zone for the integer unit |
| 138 $T2a="f22"; $T2b="f23"; |
| 139 $T3a="f24"; $T3b="f25"; |
| 140 |
| 141 # sp----------->+-------------------------------+ |
| 142 # | saved sp | |
| 143 # +-------------------------------+ |
| 144 # | | |
| 145 # +-------------------------------+ |
| 146 # | 10 saved gpr, r14-r23 | |
| 147 # . . |
| 148 # . . |
| 149 # +12*size_t +-------------------------------+ |
| 150 # | 12 saved fpr, f14-f25 | |
| 151 # . . |
| 152 # . . |
| 153 # +12*8 +-------------------------------+ |
| 154 # | padding to 64 byte boundary | |
| 155 # . . |
| 156 # +X +-------------------------------+ |
| 157 # | 16 gpr<->fpr transfer zone | |
| 158 # . . |
| 159 # . . |
| 160 # +16*8 +-------------------------------+ |
| 161 # | __int64 tmp[-1] | |
| 162 # +-------------------------------+ |
| 163 # | __int64 tmp[num] | |
| 164 # . . |
| 165 # . . |
| 166 # . . |
| 167 # +(num+1)*8 +-------------------------------+ |
| 168 # | padding to 64 byte boundary | |
| 169 # . . |
| 170 # +X +-------------------------------+ |
| 171 # | double nap_d[4*num] | |
| 172 # . . |
| 173 # . . |
| 174 # . . |
| 175 # +-------------------------------+ |
| 176 |
| 177 $code=<<___; |
| 178 .machine "any" |
| 179 .text |
| 180 |
| 181 .globl .$fname |
| 182 .align 5 |
| 183 .$fname: |
| 184 cmpwi $num,4 |
| 185 mr $rp,r3 ; $rp is reassigned |
| 186 li r3,0 ; possible "not handled" return code |
| 187 bltlr- |
| 188 andi. r0,$num,1 ; $num has to be even |
| 189 bnelr- |
| 190 |
| 191 slwi $num,$num,3 ; num*=8 |
| 192 li $i,-4096 |
| 193 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num |
| 194 add $tp,$tp,$num ; place for tp[num+1] |
| 195 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` |
| 196 subf $tp,$tp,$sp ; $sp-$tp |
| 197 and $tp,$tp,$i ; minimize TLB usage |
| 198 subf $tp,$sp,$tp ; $tp-$sp |
| 199 $STUX $sp,$sp,$tp ; alloca |
| 200 |
| 201 $PUSH r14,`2*$SIZE_T`($sp) |
| 202 $PUSH r15,`3*$SIZE_T`($sp) |
| 203 $PUSH r16,`4*$SIZE_T`($sp) |
| 204 $PUSH r17,`5*$SIZE_T`($sp) |
| 205 $PUSH r18,`6*$SIZE_T`($sp) |
| 206 $PUSH r19,`7*$SIZE_T`($sp) |
| 207 $PUSH r20,`8*$SIZE_T`($sp) |
| 208 $PUSH r21,`9*$SIZE_T`($sp) |
| 209 $PUSH r22,`10*$SIZE_T`($sp) |
| 210 $PUSH r23,`11*$SIZE_T`($sp) |
| 211 stfd f14,`12*$SIZE_T+0`($sp) |
| 212 stfd f15,`12*$SIZE_T+8`($sp) |
| 213 stfd f16,`12*$SIZE_T+16`($sp) |
| 214 stfd f17,`12*$SIZE_T+24`($sp) |
| 215 stfd f18,`12*$SIZE_T+32`($sp) |
| 216 stfd f19,`12*$SIZE_T+40`($sp) |
| 217 stfd f20,`12*$SIZE_T+48`($sp) |
| 218 stfd f21,`12*$SIZE_T+56`($sp) |
| 219 stfd f22,`12*$SIZE_T+64`($sp) |
| 220 stfd f23,`12*$SIZE_T+72`($sp) |
| 221 stfd f24,`12*$SIZE_T+80`($sp) |
| 222 stfd f25,`12*$SIZE_T+88`($sp) |
| 223 |
| 224 ld $a0,0($ap) ; pull ap[0] value |
| 225 ld $n0,0($n0) ; pull n0[0] value |
| 226 ld $t3,0($bp) ; bp[0] |
| 227 |
| 228 addi $tp,$sp,`$FRAME+$TRANSFER+8+64` |
| 229 li $i,-64 |
| 230 add $nap_d,$tp,$num |
| 231 and $nap_d,$nap_d,$i ; align to 64 bytes |
| 232 |
| 233 mulld $t7,$a0,$t3 ; ap[0]*bp[0] |
| 234 ; nap_d is off by 1, because it's used with stfdu/lfdu |
| 235 addi $nap_d,$nap_d,-8 |
| 236 srwi $j,$num,`3+1` ; counter register, num/2 |
| 237 mulld $t7,$t7,$n0 ; tp[0]*n0 |
| 238 addi $j,$j,-1 |
| 239 addi $tp,$sp,`$FRAME+$TRANSFER-8` |
| 240 li $carry,0 |
| 241 mtctr $j |
| 242 |
| 243 ; transfer bp[0] to FPU as 4x16-bit values |
| 244 extrdi $t0,$t3,16,48 |
| 245 extrdi $t1,$t3,16,32 |
| 246 extrdi $t2,$t3,16,16 |
| 247 extrdi $t3,$t3,16,0 |
| 248 std $t0,`$FRAME+0`($sp) |
| 249 std $t1,`$FRAME+8`($sp) |
| 250 std $t2,`$FRAME+16`($sp) |
| 251 std $t3,`$FRAME+24`($sp) |
| 252 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values |
| 253 extrdi $t4,$t7,16,48 |
| 254 extrdi $t5,$t7,16,32 |
| 255 extrdi $t6,$t7,16,16 |
| 256 extrdi $t7,$t7,16,0 |
| 257 std $t4,`$FRAME+32`($sp) |
| 258 std $t5,`$FRAME+40`($sp) |
| 259 std $t6,`$FRAME+48`($sp) |
| 260 std $t7,`$FRAME+56`($sp) |
| 261 lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
| 262 lwz $t1,0($ap) |
| 263 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
| 264 lwz $t3,8($ap) |
| 265 lwz $t4,4($np) ; load n[j] as 32-bit word pair |
| 266 lwz $t5,0($np) |
| 267 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
| 268 lwz $t7,8($np) |
| 269 lfd $ba,`$FRAME+0`($sp) |
| 270 lfd $bb,`$FRAME+8`($sp) |
| 271 lfd $bc,`$FRAME+16`($sp) |
| 272 lfd $bd,`$FRAME+24`($sp) |
| 273 lfd $na,`$FRAME+32`($sp) |
| 274 lfd $nb,`$FRAME+40`($sp) |
| 275 lfd $nc,`$FRAME+48`($sp) |
| 276 lfd $nd,`$FRAME+56`($sp) |
| 277 std $t0,`$FRAME+64`($sp) |
| 278 std $t1,`$FRAME+72`($sp) |
| 279 std $t2,`$FRAME+80`($sp) |
| 280 std $t3,`$FRAME+88`($sp) |
| 281 std $t4,`$FRAME+96`($sp) |
| 282 std $t5,`$FRAME+104`($sp) |
| 283 std $t6,`$FRAME+112`($sp) |
| 284 std $t7,`$FRAME+120`($sp) |
| 285 fcfid $ba,$ba |
| 286 fcfid $bb,$bb |
| 287 fcfid $bc,$bc |
| 288 fcfid $bd,$bd |
| 289 fcfid $na,$na |
| 290 fcfid $nb,$nb |
| 291 fcfid $nc,$nc |
| 292 fcfid $nd,$nd |
| 293 |
| 294 lfd $A0,`$FRAME+64`($sp) |
| 295 lfd $A1,`$FRAME+72`($sp) |
| 296 lfd $A2,`$FRAME+80`($sp) |
| 297 lfd $A3,`$FRAME+88`($sp) |
| 298 lfd $N0,`$FRAME+96`($sp) |
| 299 lfd $N1,`$FRAME+104`($sp) |
| 300 lfd $N2,`$FRAME+112`($sp) |
| 301 lfd $N3,`$FRAME+120`($sp) |
| 302 fcfid $A0,$A0 |
| 303 fcfid $A1,$A1 |
| 304 fcfid $A2,$A2 |
| 305 fcfid $A3,$A3 |
| 306 fcfid $N0,$N0 |
| 307 fcfid $N1,$N1 |
| 308 fcfid $N2,$N2 |
| 309 fcfid $N3,$N3 |
| 310 addi $ap,$ap,16 |
| 311 addi $np,$np,16 |
| 312 |
| 313 fmul $T1a,$A1,$ba |
| 314 fmul $T1b,$A1,$bb |
| 315 stfd $A0,8($nap_d) ; save a[j] in double format |
| 316 stfd $A1,16($nap_d) |
| 317 fmul $T2a,$A2,$ba |
| 318 fmul $T2b,$A2,$bb |
| 319 stfd $A2,24($nap_d) ; save a[j+1] in double format |
| 320 stfd $A3,32($nap_d) |
| 321 fmul $T3a,$A3,$ba |
| 322 fmul $T3b,$A3,$bb |
| 323 stfd $N0,40($nap_d) ; save n[j] in double format |
| 324 stfd $N1,48($nap_d) |
| 325 fmul $T0a,$A0,$ba |
| 326 fmul $T0b,$A0,$bb |
| 327 stfd $N2,56($nap_d) ; save n[j+1] in double format |
| 328 stfdu $N3,64($nap_d) |
| 329 |
| 330 fmadd $T1a,$A0,$bc,$T1a |
| 331 fmadd $T1b,$A0,$bd,$T1b |
| 332 fmadd $T2a,$A1,$bc,$T2a |
| 333 fmadd $T2b,$A1,$bd,$T2b |
| 334 fmadd $T3a,$A2,$bc,$T3a |
| 335 fmadd $T3b,$A2,$bd,$T3b |
| 336 fmul $dota,$A3,$bc |
| 337 fmul $dotb,$A3,$bd |
| 338 |
| 339 fmadd $T1a,$N1,$na,$T1a |
| 340 fmadd $T1b,$N1,$nb,$T1b |
| 341 fmadd $T2a,$N2,$na,$T2a |
| 342 fmadd $T2b,$N2,$nb,$T2b |
| 343 fmadd $T3a,$N3,$na,$T3a |
| 344 fmadd $T3b,$N3,$nb,$T3b |
| 345 fmadd $T0a,$N0,$na,$T0a |
| 346 fmadd $T0b,$N0,$nb,$T0b |
| 347 |
| 348 fmadd $T1a,$N0,$nc,$T1a |
| 349 fmadd $T1b,$N0,$nd,$T1b |
| 350 fmadd $T2a,$N1,$nc,$T2a |
| 351 fmadd $T2b,$N1,$nd,$T2b |
| 352 fmadd $T3a,$N2,$nc,$T3a |
| 353 fmadd $T3b,$N2,$nd,$T3b |
| 354 fmadd $dota,$N3,$nc,$dota |
| 355 fmadd $dotb,$N3,$nd,$dotb |
| 356 |
| 357 fctid $T0a,$T0a |
| 358 fctid $T0b,$T0b |
| 359 fctid $T1a,$T1a |
| 360 fctid $T1b,$T1b |
| 361 fctid $T2a,$T2a |
| 362 fctid $T2b,$T2b |
| 363 fctid $T3a,$T3a |
| 364 fctid $T3b,$T3b |
| 365 |
| 366 stfd $T0a,`$FRAME+0`($sp) |
| 367 stfd $T0b,`$FRAME+8`($sp) |
| 368 stfd $T1a,`$FRAME+16`($sp) |
| 369 stfd $T1b,`$FRAME+24`($sp) |
| 370 stfd $T2a,`$FRAME+32`($sp) |
| 371 stfd $T2b,`$FRAME+40`($sp) |
| 372 stfd $T3a,`$FRAME+48`($sp) |
| 373 stfd $T3b,`$FRAME+56`($sp) |
| 374 |
| 375 .align 5 |
| 376 L1st: |
| 377 lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
| 378 lwz $t1,0($ap) |
| 379 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
| 380 lwz $t3,8($ap) |
| 381 lwz $t4,4($np) ; load n[j] as 32-bit word pair |
| 382 lwz $t5,0($np) |
| 383 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
| 384 lwz $t7,8($np) |
| 385 std $t0,`$FRAME+64`($sp) |
| 386 std $t1,`$FRAME+72`($sp) |
| 387 std $t2,`$FRAME+80`($sp) |
| 388 std $t3,`$FRAME+88`($sp) |
| 389 std $t4,`$FRAME+96`($sp) |
| 390 std $t5,`$FRAME+104`($sp) |
| 391 std $t6,`$FRAME+112`($sp) |
| 392 std $t7,`$FRAME+120`($sp) |
| 393 ld $t0,`$FRAME+0`($sp) |
| 394 ld $t1,`$FRAME+8`($sp) |
| 395 ld $t2,`$FRAME+16`($sp) |
| 396 ld $t3,`$FRAME+24`($sp) |
| 397 ld $t4,`$FRAME+32`($sp) |
| 398 ld $t5,`$FRAME+40`($sp) |
| 399 ld $t6,`$FRAME+48`($sp) |
| 400 ld $t7,`$FRAME+56`($sp) |
| 401 lfd $A0,`$FRAME+64`($sp) |
| 402 lfd $A1,`$FRAME+72`($sp) |
| 403 lfd $A2,`$FRAME+80`($sp) |
| 404 lfd $A3,`$FRAME+88`($sp) |
| 405 lfd $N0,`$FRAME+96`($sp) |
| 406 lfd $N1,`$FRAME+104`($sp) |
| 407 lfd $N2,`$FRAME+112`($sp) |
| 408 lfd $N3,`$FRAME+120`($sp) |
| 409 fcfid $A0,$A0 |
| 410 fcfid $A1,$A1 |
| 411 fcfid $A2,$A2 |
| 412 fcfid $A3,$A3 |
| 413 fcfid $N0,$N0 |
| 414 fcfid $N1,$N1 |
| 415 fcfid $N2,$N2 |
| 416 fcfid $N3,$N3 |
| 417 addi $ap,$ap,16 |
| 418 addi $np,$np,16 |
| 419 |
| 420 fmul $T1a,$A1,$ba |
| 421 fmul $T1b,$A1,$bb |
| 422 fmul $T2a,$A2,$ba |
| 423 fmul $T2b,$A2,$bb |
| 424 stfd $A0,8($nap_d) ; save a[j] in double format |
| 425 stfd $A1,16($nap_d) |
| 426 fmul $T3a,$A3,$ba |
| 427 fmul $T3b,$A3,$bb |
| 428 fmadd $T0a,$A0,$ba,$dota |
| 429 fmadd $T0b,$A0,$bb,$dotb |
| 430 stfd $A2,24($nap_d) ; save a[j+1] in double format |
| 431 stfd $A3,32($nap_d) |
| 432 |
| 433 fmadd $T1a,$A0,$bc,$T1a |
| 434 fmadd $T1b,$A0,$bd,$T1b |
| 435 fmadd $T2a,$A1,$bc,$T2a |
| 436 fmadd $T2b,$A1,$bd,$T2b |
| 437 stfd $N0,40($nap_d) ; save n[j] in double format |
| 438 stfd $N1,48($nap_d) |
| 439 fmadd $T3a,$A2,$bc,$T3a |
| 440 fmadd $T3b,$A2,$bd,$T3b |
| 441 add $t0,$t0,$carry ; can not overflow |
| 442 fmul $dota,$A3,$bc |
| 443 fmul $dotb,$A3,$bd |
| 444 stfd $N2,56($nap_d) ; save n[j+1] in double format |
| 445 stfdu $N3,64($nap_d) |
| 446 srdi $carry,$t0,16 |
| 447 add $t1,$t1,$carry |
| 448 srdi $carry,$t1,16 |
| 449 |
| 450 fmadd $T1a,$N1,$na,$T1a |
| 451 fmadd $T1b,$N1,$nb,$T1b |
| 452 insrdi $t0,$t1,16,32 |
| 453 fmadd $T2a,$N2,$na,$T2a |
| 454 fmadd $T2b,$N2,$nb,$T2b |
| 455 add $t2,$t2,$carry |
| 456 fmadd $T3a,$N3,$na,$T3a |
| 457 fmadd $T3b,$N3,$nb,$T3b |
| 458 srdi $carry,$t2,16 |
| 459 fmadd $T0a,$N0,$na,$T0a |
| 460 fmadd $T0b,$N0,$nb,$T0b |
| 461 insrdi $t0,$t2,16,16 |
| 462 add $t3,$t3,$carry |
| 463 srdi $carry,$t3,16 |
| 464 |
| 465 fmadd $T1a,$N0,$nc,$T1a |
| 466 fmadd $T1b,$N0,$nd,$T1b |
| 467 insrdi $t0,$t3,16,0 ; 0..63 bits |
| 468 fmadd $T2a,$N1,$nc,$T2a |
| 469 fmadd $T2b,$N1,$nd,$T2b |
| 470 add $t4,$t4,$carry |
| 471 fmadd $T3a,$N2,$nc,$T3a |
| 472 fmadd $T3b,$N2,$nd,$T3b |
| 473 srdi $carry,$t4,16 |
| 474 fmadd $dota,$N3,$nc,$dota |
| 475 fmadd $dotb,$N3,$nd,$dotb |
| 476 add $t5,$t5,$carry |
| 477 srdi $carry,$t5,16 |
| 478 insrdi $t4,$t5,16,32 |
| 479 |
| 480 fctid $T0a,$T0a |
| 481 fctid $T0b,$T0b |
| 482 add $t6,$t6,$carry |
| 483 fctid $T1a,$T1a |
| 484 fctid $T1b,$T1b |
| 485 srdi $carry,$t6,16 |
| 486 fctid $T2a,$T2a |
| 487 fctid $T2b,$T2b |
| 488 insrdi $t4,$t6,16,16 |
| 489 fctid $T3a,$T3a |
| 490 fctid $T3b,$T3b |
| 491 add $t7,$t7,$carry |
| 492 insrdi $t4,$t7,16,0 ; 64..127 bits |
| 493 srdi $carry,$t7,16 ; upper 33 bits |
| 494 |
| 495 stfd $T0a,`$FRAME+0`($sp) |
| 496 stfd $T0b,`$FRAME+8`($sp) |
| 497 stfd $T1a,`$FRAME+16`($sp) |
| 498 stfd $T1b,`$FRAME+24`($sp) |
| 499 stfd $T2a,`$FRAME+32`($sp) |
| 500 stfd $T2b,`$FRAME+40`($sp) |
| 501 stfd $T3a,`$FRAME+48`($sp) |
| 502 stfd $T3b,`$FRAME+56`($sp) |
| 503 std $t0,8($tp) ; tp[j-1] |
| 504 stdu $t4,16($tp) ; tp[j] |
| 505 bdnz- L1st |
| 506 |
| 507 fctid $dota,$dota |
| 508 fctid $dotb,$dotb |
| 509 |
| 510 ld $t0,`$FRAME+0`($sp) |
| 511 ld $t1,`$FRAME+8`($sp) |
| 512 ld $t2,`$FRAME+16`($sp) |
| 513 ld $t3,`$FRAME+24`($sp) |
| 514 ld $t4,`$FRAME+32`($sp) |
| 515 ld $t5,`$FRAME+40`($sp) |
| 516 ld $t6,`$FRAME+48`($sp) |
| 517 ld $t7,`$FRAME+56`($sp) |
| 518 stfd $dota,`$FRAME+64`($sp) |
| 519 stfd $dotb,`$FRAME+72`($sp) |
| 520 |
| 521 add $t0,$t0,$carry ; can not overflow |
| 522 srdi $carry,$t0,16 |
| 523 add $t1,$t1,$carry |
| 524 srdi $carry,$t1,16 |
| 525 insrdi $t0,$t1,16,32 |
| 526 add $t2,$t2,$carry |
| 527 srdi $carry,$t2,16 |
| 528 insrdi $t0,$t2,16,16 |
| 529 add $t3,$t3,$carry |
| 530 srdi $carry,$t3,16 |
| 531 insrdi $t0,$t3,16,0 ; 0..63 bits |
| 532 add $t4,$t4,$carry |
| 533 srdi $carry,$t4,16 |
| 534 add $t5,$t5,$carry |
| 535 srdi $carry,$t5,16 |
| 536 insrdi $t4,$t5,16,32 |
| 537 add $t6,$t6,$carry |
| 538 srdi $carry,$t6,16 |
| 539 insrdi $t4,$t6,16,16 |
| 540 add $t7,$t7,$carry |
| 541 insrdi $t4,$t7,16,0 ; 64..127 bits |
| 542 srdi $carry,$t7,16 ; upper 33 bits |
| 543 ld $t6,`$FRAME+64`($sp) |
| 544 ld $t7,`$FRAME+72`($sp) |
| 545 |
| 546 std $t0,8($tp) ; tp[j-1] |
| 547 stdu $t4,16($tp) ; tp[j] |
| 548 |
| 549 add $t6,$t6,$carry ; can not overflow |
| 550 srdi $carry,$t6,16 |
| 551 add $t7,$t7,$carry |
| 552 insrdi $t6,$t7,48,0 |
| 553 srdi $ovf,$t7,48 |
| 554 std $t6,8($tp) ; tp[num-1] |
| 555 |
| 556 slwi $t7,$num,2 |
| 557 subf $nap_d,$t7,$nap_d ; rewind pointer |
| 558 |
| 559 li $i,8 ; i=1 |
| 560 .align 5 |
| 561 Louter: |
| 562 ldx $t3,$bp,$i ; bp[i] |
| 563 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] |
| 564 mulld $t7,$a0,$t3 ; ap[0]*bp[i] |
| 565 |
| 566 addi $tp,$sp,`$FRAME+$TRANSFER` |
| 567 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] |
| 568 li $carry,0 |
| 569 mulld $t7,$t7,$n0 ; tp[0]*n0 |
| 570 mtctr $j |
| 571 |
| 572 ; transfer bp[i] to FPU as 4x16-bit values |
| 573 extrdi $t0,$t3,16,48 |
| 574 extrdi $t1,$t3,16,32 |
| 575 extrdi $t2,$t3,16,16 |
| 576 extrdi $t3,$t3,16,0 |
| 577 std $t0,`$FRAME+0`($sp) |
| 578 std $t1,`$FRAME+8`($sp) |
| 579 std $t2,`$FRAME+16`($sp) |
| 580 std $t3,`$FRAME+24`($sp) |
| 581 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values |
| 582 extrdi $t4,$t7,16,48 |
| 583 extrdi $t5,$t7,16,32 |
| 584 extrdi $t6,$t7,16,16 |
| 585 extrdi $t7,$t7,16,0 |
| 586 std $t4,`$FRAME+32`($sp) |
| 587 std $t5,`$FRAME+40`($sp) |
| 588 std $t6,`$FRAME+48`($sp) |
| 589 std $t7,`$FRAME+56`($sp) |
| 590 |
| 591 lfd $A0,8($nap_d) ; load a[j] in double format |
| 592 lfd $A1,16($nap_d) |
| 593 lfd $A2,24($nap_d) ; load a[j+1] in double format |
| 594 lfd $A3,32($nap_d) |
| 595 lfd $N0,40($nap_d) ; load n[j] in double format |
| 596 lfd $N1,48($nap_d) |
| 597 lfd $N2,56($nap_d) ; load n[j+1] in double format |
| 598 lfdu $N3,64($nap_d) |
| 599 |
| 600 lfd $ba,`$FRAME+0`($sp) |
| 601 lfd $bb,`$FRAME+8`($sp) |
| 602 lfd $bc,`$FRAME+16`($sp) |
| 603 lfd $bd,`$FRAME+24`($sp) |
| 604 lfd $na,`$FRAME+32`($sp) |
| 605 lfd $nb,`$FRAME+40`($sp) |
| 606 lfd $nc,`$FRAME+48`($sp) |
| 607 lfd $nd,`$FRAME+56`($sp) |
| 608 |
| 609 fcfid $ba,$ba |
| 610 fcfid $bb,$bb |
| 611 fcfid $bc,$bc |
| 612 fcfid $bd,$bd |
| 613 fcfid $na,$na |
| 614 fcfid $nb,$nb |
| 615 fcfid $nc,$nc |
| 616 fcfid $nd,$nd |
| 617 |
| 618 fmul $T1a,$A1,$ba |
| 619 fmul $T1b,$A1,$bb |
| 620 fmul $T2a,$A2,$ba |
| 621 fmul $T2b,$A2,$bb |
| 622 fmul $T3a,$A3,$ba |
| 623 fmul $T3b,$A3,$bb |
| 624 fmul $T0a,$A0,$ba |
| 625 fmul $T0b,$A0,$bb |
| 626 |
| 627 fmadd $T1a,$A0,$bc,$T1a |
| 628 fmadd $T1b,$A0,$bd,$T1b |
| 629 fmadd $T2a,$A1,$bc,$T2a |
| 630 fmadd $T2b,$A1,$bd,$T2b |
| 631 fmadd $T3a,$A2,$bc,$T3a |
| 632 fmadd $T3b,$A2,$bd,$T3b |
| 633 fmul $dota,$A3,$bc |
| 634 fmul $dotb,$A3,$bd |
| 635 |
| 636 fmadd $T1a,$N1,$na,$T1a |
| 637 fmadd $T1b,$N1,$nb,$T1b |
| 638 lfd $A0,8($nap_d) ; load a[j] in double format |
| 639 lfd $A1,16($nap_d) |
| 640 fmadd $T2a,$N2,$na,$T2a |
| 641 fmadd $T2b,$N2,$nb,$T2b |
| 642 lfd $A2,24($nap_d) ; load a[j+1] in double format |
| 643 lfd $A3,32($nap_d) |
| 644 fmadd $T3a,$N3,$na,$T3a |
| 645 fmadd $T3b,$N3,$nb,$T3b |
| 646 fmadd $T0a,$N0,$na,$T0a |
| 647 fmadd $T0b,$N0,$nb,$T0b |
| 648 |
| 649 fmadd $T1a,$N0,$nc,$T1a |
| 650 fmadd $T1b,$N0,$nd,$T1b |
| 651 fmadd $T2a,$N1,$nc,$T2a |
| 652 fmadd $T2b,$N1,$nd,$T2b |
| 653 fmadd $T3a,$N2,$nc,$T3a |
| 654 fmadd $T3b,$N2,$nd,$T3b |
| 655 fmadd $dota,$N3,$nc,$dota |
| 656 fmadd $dotb,$N3,$nd,$dotb |
| 657 |
| 658 fctid $T0a,$T0a |
| 659 fctid $T0b,$T0b |
| 660 fctid $T1a,$T1a |
| 661 fctid $T1b,$T1b |
| 662 fctid $T2a,$T2a |
| 663 fctid $T2b,$T2b |
| 664 fctid $T3a,$T3a |
| 665 fctid $T3b,$T3b |
| 666 |
| 667 stfd $T0a,`$FRAME+0`($sp) |
| 668 stfd $T0b,`$FRAME+8`($sp) |
| 669 stfd $T1a,`$FRAME+16`($sp) |
| 670 stfd $T1b,`$FRAME+24`($sp) |
| 671 stfd $T2a,`$FRAME+32`($sp) |
| 672 stfd $T2b,`$FRAME+40`($sp) |
| 673 stfd $T3a,`$FRAME+48`($sp) |
| 674 stfd $T3b,`$FRAME+56`($sp) |
| 675 |
| 676 .align 5 |
| 677 Linner: |
| 678 fmul $T1a,$A1,$ba |
| 679 fmul $T1b,$A1,$bb |
| 680 fmul $T2a,$A2,$ba |
| 681 fmul $T2b,$A2,$bb |
| 682 lfd $N0,40($nap_d) ; load n[j] in double format |
| 683 lfd $N1,48($nap_d) |
| 684 fmul $T3a,$A3,$ba |
| 685 fmul $T3b,$A3,$bb |
| 686 fmadd $T0a,$A0,$ba,$dota |
| 687 fmadd $T0b,$A0,$bb,$dotb |
| 688 lfd $N2,56($nap_d) ; load n[j+1] in double format |
| 689 lfdu $N3,64($nap_d) |
| 690 |
| 691 fmadd $T1a,$A0,$bc,$T1a |
| 692 fmadd $T1b,$A0,$bd,$T1b |
| 693 fmadd $T2a,$A1,$bc,$T2a |
| 694 fmadd $T2b,$A1,$bd,$T2b |
| 695 lfd $A0,8($nap_d) ; load a[j] in double format |
| 696 lfd $A1,16($nap_d) |
| 697 fmadd $T3a,$A2,$bc,$T3a |
| 698 fmadd $T3b,$A2,$bd,$T3b |
| 699 fmul $dota,$A3,$bc |
| 700 fmul $dotb,$A3,$bd |
| 701 lfd $A2,24($nap_d) ; load a[j+1] in double format |
| 702 lfd $A3,32($nap_d) |
| 703 |
| 704 fmadd $T1a,$N1,$na,$T1a |
| 705 fmadd $T1b,$N1,$nb,$T1b |
| 706 ld $t0,`$FRAME+0`($sp) |
| 707 ld $t1,`$FRAME+8`($sp) |
| 708 fmadd $T2a,$N2,$na,$T2a |
| 709 fmadd $T2b,$N2,$nb,$T2b |
| 710 ld $t2,`$FRAME+16`($sp) |
| 711 ld $t3,`$FRAME+24`($sp) |
| 712 fmadd $T3a,$N3,$na,$T3a |
| 713 fmadd $T3b,$N3,$nb,$T3b |
| 714 add $t0,$t0,$carry ; can not overflow |
| 715 ld $t4,`$FRAME+32`($sp) |
| 716 ld $t5,`$FRAME+40`($sp) |
| 717 fmadd $T0a,$N0,$na,$T0a |
| 718 fmadd $T0b,$N0,$nb,$T0b |
| 719 srdi $carry,$t0,16 |
| 720 add $t1,$t1,$carry |
| 721 srdi $carry,$t1,16 |
| 722 ld $t6,`$FRAME+48`($sp) |
| 723 ld $t7,`$FRAME+56`($sp) |
| 724 |
| 725 fmadd $T1a,$N0,$nc,$T1a |
| 726 fmadd $T1b,$N0,$nd,$T1b |
| 727 insrdi $t0,$t1,16,32 |
| 728 ld $t1,8($tp) ; tp[j] |
| 729 fmadd $T2a,$N1,$nc,$T2a |
| 730 fmadd $T2b,$N1,$nd,$T2b |
| 731 add $t2,$t2,$carry |
| 732 fmadd $T3a,$N2,$nc,$T3a |
| 733 fmadd $T3b,$N2,$nd,$T3b |
| 734 srdi $carry,$t2,16 |
| 735 insrdi $t0,$t2,16,16 |
| 736 fmadd $dota,$N3,$nc,$dota |
| 737 fmadd $dotb,$N3,$nd,$dotb |
| 738 add $t3,$t3,$carry |
| 739 ldu $t2,16($tp) ; tp[j+1] |
| 740 srdi $carry,$t3,16 |
| 741 insrdi $t0,$t3,16,0 ; 0..63 bits |
| 742 add $t4,$t4,$carry |
| 743 |
| 744 fctid $T0a,$T0a |
| 745 fctid $T0b,$T0b |
| 746 srdi $carry,$t4,16 |
| 747 fctid $T1a,$T1a |
| 748 fctid $T1b,$T1b |
| 749 add $t5,$t5,$carry |
| 750 fctid $T2a,$T2a |
| 751 fctid $T2b,$T2b |
| 752 srdi $carry,$t5,16 |
| 753 insrdi $t4,$t5,16,32 |
| 754 fctid $T3a,$T3a |
| 755 fctid $T3b,$T3b |
| 756 add $t6,$t6,$carry |
| 757 srdi $carry,$t6,16 |
| 758 insrdi $t4,$t6,16,16 |
| 759 |
| 760 stfd $T0a,`$FRAME+0`($sp) |
| 761 stfd $T0b,`$FRAME+8`($sp) |
| 762 add $t7,$t7,$carry |
| 763 addc $t3,$t0,$t1 |
| 764 stfd $T1a,`$FRAME+16`($sp) |
| 765 stfd $T1b,`$FRAME+24`($sp) |
| 766 insrdi $t4,$t7,16,0 ; 64..127 bits |
| 767 srdi $carry,$t7,16 ; upper 33 bits |
| 768 stfd $T2a,`$FRAME+32`($sp) |
| 769 stfd $T2b,`$FRAME+40`($sp) |
| 770 adde $t5,$t4,$t2 |
| 771 stfd $T3a,`$FRAME+48`($sp) |
| 772 stfd $T3b,`$FRAME+56`($sp) |
| 773 addze $carry,$carry |
| 774 std $t3,-16($tp) ; tp[j-1] |
| 775 std $t5,-8($tp) ; tp[j] |
| 776 bdnz- Linner |
| 777 |
| 778 fctid $dota,$dota |
| 779 fctid $dotb,$dotb |
| 780 ld $t0,`$FRAME+0`($sp) |
| 781 ld $t1,`$FRAME+8`($sp) |
| 782 ld $t2,`$FRAME+16`($sp) |
| 783 ld $t3,`$FRAME+24`($sp) |
| 784 ld $t4,`$FRAME+32`($sp) |
| 785 ld $t5,`$FRAME+40`($sp) |
| 786 ld $t6,`$FRAME+48`($sp) |
| 787 ld $t7,`$FRAME+56`($sp) |
| 788 stfd $dota,`$FRAME+64`($sp) |
| 789 stfd $dotb,`$FRAME+72`($sp) |
| 790 |
| 791 add $t0,$t0,$carry ; can not overflow |
| 792 srdi $carry,$t0,16 |
| 793 add $t1,$t1,$carry |
| 794 srdi $carry,$t1,16 |
| 795 insrdi $t0,$t1,16,32 |
| 796 add $t2,$t2,$carry |
| 797 ld $t1,8($tp) ; tp[j] |
| 798 srdi $carry,$t2,16 |
| 799 insrdi $t0,$t2,16,16 |
| 800 add $t3,$t3,$carry |
| 801 ldu $t2,16($tp) ; tp[j+1] |
| 802 srdi $carry,$t3,16 |
| 803 insrdi $t0,$t3,16,0 ; 0..63 bits |
| 804 add $t4,$t4,$carry |
| 805 srdi $carry,$t4,16 |
| 806 add $t5,$t5,$carry |
| 807 srdi $carry,$t5,16 |
| 808 insrdi $t4,$t5,16,32 |
| 809 add $t6,$t6,$carry |
| 810 srdi $carry,$t6,16 |
| 811 insrdi $t4,$t6,16,16 |
| 812 add $t7,$t7,$carry |
| 813 insrdi $t4,$t7,16,0 ; 64..127 bits |
| 814 srdi $carry,$t7,16 ; upper 33 bits |
| 815 ld $t6,`$FRAME+64`($sp) |
| 816 ld $t7,`$FRAME+72`($sp) |
| 817 |
| 818 addc $t3,$t0,$t1 |
| 819 adde $t5,$t4,$t2 |
| 820 addze $carry,$carry |
| 821 |
| 822 std $t3,-16($tp) ; tp[j-1] |
| 823 std $t5,-8($tp) ; tp[j] |
| 824 |
| 825 add $carry,$carry,$ovf ; comsume upmost overflow |
| 826 add $t6,$t6,$carry ; can not overflow |
| 827 srdi $carry,$t6,16 |
| 828 add $t7,$t7,$carry |
| 829 insrdi $t6,$t7,48,0 |
| 830 srdi $ovf,$t7,48 |
| 831 std $t6,0($tp) ; tp[num-1] |
| 832 |
| 833 slwi $t7,$num,2 |
| 834 addi $i,$i,8 |
| 835 subf $nap_d,$t7,$nap_d ; rewind pointer |
| 836 cmpw $i,$num |
| 837 blt- Louter |
| 838 |
| 839 subf $np,$num,$np ; rewind np |
| 840 addi $j,$j,1 ; restore counter |
| 841 subfc $i,$i,$i ; j=0 and "clear" XER[CA] |
| 842 addi $tp,$sp,`$FRAME+$TRANSFER+8` |
| 843 addi $t4,$sp,`$FRAME+$TRANSFER+16` |
| 844 addi $t5,$np,8 |
| 845 addi $t6,$rp,8 |
| 846 mtctr $j |
| 847 |
| 848 .align 4 |
| 849 Lsub: ldx $t0,$tp,$i |
| 850 ldx $t1,$np,$i |
| 851 ldx $t2,$t4,$i |
| 852 ldx $t3,$t5,$i |
| 853 subfe $t0,$t1,$t0 ; tp[j]-np[j] |
| 854 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] |
| 855 stdx $t0,$rp,$i |
| 856 stdx $t2,$t6,$i |
| 857 addi $i,$i,16 |
| 858 bdnz- Lsub |
| 859 |
| 860 li $i,0 |
| 861 subfe $ovf,$i,$ovf ; handle upmost overflow bit |
| 862 and $ap,$tp,$ovf |
| 863 andc $np,$rp,$ovf |
| 864 or $ap,$ap,$np ; ap=borrow?tp:rp |
| 865 addi $t7,$ap,8 |
| 866 mtctr $j |
| 867 |
| 868 .align 4 |
| 869 Lcopy: ; copy or in-place refresh |
| 870 ldx $t0,$ap,$i |
| 871 ldx $t1,$t7,$i |
| 872 std $i,8($nap_d) ; zap nap_d |
| 873 std $i,16($nap_d) |
| 874 std $i,24($nap_d) |
| 875 std $i,32($nap_d) |
| 876 std $i,40($nap_d) |
| 877 std $i,48($nap_d) |
| 878 std $i,56($nap_d) |
| 879 stdu $i,64($nap_d) |
| 880 stdx $t0,$rp,$i |
| 881 stdx $t1,$t6,$i |
| 882 stdx $i,$tp,$i ; zap tp at once |
| 883 stdx $i,$t4,$i |
| 884 addi $i,$i,16 |
| 885 bdnz- Lcopy |
| 886 |
| 887 $POP r14,`2*$SIZE_T`($sp) |
| 888 $POP r15,`3*$SIZE_T`($sp) |
| 889 $POP r16,`4*$SIZE_T`($sp) |
| 890 $POP r17,`5*$SIZE_T`($sp) |
| 891 $POP r18,`6*$SIZE_T`($sp) |
| 892 $POP r19,`7*$SIZE_T`($sp) |
| 893 $POP r20,`8*$SIZE_T`($sp) |
| 894 $POP r21,`9*$SIZE_T`($sp) |
| 895 $POP r22,`10*$SIZE_T`($sp) |
| 896 $POP r23,`11*$SIZE_T`($sp) |
| 897 lfd f14,`12*$SIZE_T+0`($sp) |
| 898 lfd f15,`12*$SIZE_T+8`($sp) |
| 899 lfd f16,`12*$SIZE_T+16`($sp) |
| 900 lfd f17,`12*$SIZE_T+24`($sp) |
| 901 lfd f18,`12*$SIZE_T+32`($sp) |
| 902 lfd f19,`12*$SIZE_T+40`($sp) |
| 903 lfd f20,`12*$SIZE_T+48`($sp) |
| 904 lfd f21,`12*$SIZE_T+56`($sp) |
| 905 lfd f22,`12*$SIZE_T+64`($sp) |
| 906 lfd f23,`12*$SIZE_T+72`($sp) |
| 907 lfd f24,`12*$SIZE_T+80`($sp) |
| 908 lfd f25,`12*$SIZE_T+88`($sp) |
| 909 $POP $sp,0($sp) |
| 910 li r3,1 ; signal "handled" |
| 911 blr |
| 912 .long 0 |
| 913 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" |
| 914 ___ |
| 915 |
| 916 $code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every `...` span with the result of evaluating it as Perl |
| 917 print $code; # send the expanded template down the pipe to ppc-xlate.pl |
| 918 close STDOUT or die "error closing STDOUT: $! (exit status $?)"; # a piped close fails if the translator exits non-zero |
OLD | NEW |