Index: openssl/crypto/bn/asm/sparcv9-mont.pl |
=================================================================== |
--- openssl/crypto/bn/asm/sparcv9-mont.pl (revision 0) |
+++ openssl/crypto/bn/asm/sparcv9-mont.pl (revision 0) |
@@ -0,0 +1,606 @@ |
+#!/usr/bin/env perl |
+ |
+# ==================================================================== |
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
+# project. The module is, however, dual licensed under OpenSSL and |
+# CRYPTOGAMS licenses depending on where you obtain it. For further |
+# details see http://www.openssl.org/~appro/cryptogams/. |
+# ==================================================================== |
+ |
+# December 2005 |
+# |
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons |
+# for undertaking this effort are multiple. First of all, UltraSPARC is |
+# not the whole SPARCv9 universe and other VIS-free implementations |
+# deserve optimized code just as much. Secondly, the newly introduced |
+# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent |
+# FPU-intensive paths, such as sparcv9a-mont, will simply sink it. Yes, |
+# T1 is equipped with several integrated RSA/DSA accelerator circuits |
+# accessible through a kernel driver [only(*)], but having a decent |
+# user-land software implementation is important too. Finally, there |
+# was the desire to experiment with a dedicated squaring procedure. |
+# Yes, this module implements one, because it was easiest to draft in |
+# SPARCv9 instructions... |
+ |
+# (*) An engine accessing the driver in question is on my TODO list. |
+# For reference, the accelerator is estimated to give a 6 to 10 times |
+# improvement on single-threaded RSA sign. It should be noted that the |
+# 6-10x improvement coefficient does not actually mean anything |
+# extraordinary in terms of absolute [single-threaded] performance, as |
+# the SPARCv9 instruction set is by all means the least suitable for |
+# high-performance crypto among the other 64-bit platforms. The 6-10x |
+# factor simply places T1 in the same performance domain as, say, |
+# AMD64 and IA-64. Improvement of RSA verify doesn't appear impressive |
+# at all, but it's the sign operation which is far more |
+# critical/interesting. |
+ |
+# You might notice that inner loops are modulo-scheduled:-) This has |
+# essentially negligible impact on UltraSPARC performance; it's the |
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate |
+# the advantage... Currently this module surpasses sparcv9a-mont.pl |
+# by ~20% on UltraSPARC-III and later cores, but recall that the |
+# sparcv9a module still has hidden potential [see the TODO list there], |
+# which is estimated to be larger than 20%... |
+ |
+# int bn_mul_mont( |
+$rp="%i0"; # BN_ULONG *rp, |
+$ap="%i1"; # const BN_ULONG *ap, |
+$bp="%i2"; # const BN_ULONG *bp, |
+$np="%i3"; # const BN_ULONG *np, |
+$n0="%i4"; # const BN_ULONG *n0, |
+$num="%i5"; # int num); |
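+ |
+# For orientation only, a C-like sketch of what the loops below |
+# compute (illustrative names, not OpenSSL code): with 32-bit words, |
+# r = a*b*2^(-32*num) mod n, where the value loaded through the n0 |
+# pointer is -n[0]^(-1) mod 2^32 and t[] is an all-zero scratch array |
+# of num words. Two 32x32->64-bit carry chains run in parallel, one |
+# for the ap[j]*bp[i] products and one for the np[j]*m reduction: |
+# |
+#	top = 0; |
+#	for (i = 0; i < num; i++) { |
+#		m = ((t[0] + (uint64_t)a[0]*b[i]) * n0) & 0xffffffff; |
+#		c0 = c1 = 0; |
+#		for (j = 0; j < num; j++) { |
+#			u = (uint64_t)a[j]*b[i] + t[j] + c0; c0 = u>>32; |
+#			v = (uint64_t)n[j]*m + (uint32_t)u + c1; c1 = v>>32; |
+#			if (j) t[j-1] = (uint32_t)v; /* low(v)==0 at j==0 */ |
+#		} |
+#		v = (uint64_t)top + c0 + c1; |
+#		t[num-1] = (uint32_t)v; top = (uint32_t)(v>>32); |
+#	} |
+#	/* subtract n once if (top:t) >= n, then copy t to r */ |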
+ |
+$bits=32; |
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } |
+if ($bits==64) { $bias=2047; $frame=192; } |
+else { $bias=0; $frame=128; } |
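+# ($bias=2047 reflects the SPARC V9 64-bit ABI, where %sp is biased by |
+# 2047 bytes and the real top of stack sits at %sp+2047; the 32-bit |
+# ABI uses no bias and a smaller minimum frame.) |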
+ |
+$car0="%o0"; |
+$car1="%o1"; |
+$car2="%o2"; # 1 bit |
+$acc0="%o3"; |
+$acc1="%o4"; |
+$mask="%g1"; # 32 bits, what a waste... |
+$tmp0="%g4"; |
+$tmp1="%g5"; |
+ |
+$i="%l0"; |
+$j="%l1"; |
+$mul0="%l2"; |
+$mul1="%l3"; |
+$tp="%l4"; |
+$apj="%l5"; |
+$npj="%l6"; |
+$tpj="%l7"; |
+ |
+$fname="bn_mul_mont_int"; |
+ |
+$code=<<___; |
+.section ".text",#alloc,#execinstr |
+ |
+.global $fname |
+.align 32 |
+$fname: |
+ cmp %o5,4 ! 128 bits minimum |
+ bge,pt %icc,.Lenter |
+ sethi %hi(0xffffffff),$mask |
+ retl |
+ clr %o0 |
+.align 32 |
+.Lenter: |
+ save %sp,-$frame,%sp |
+ sll $num,2,$num ! num*=4 |
+ or $mask,%lo(0xffffffff),$mask |
+ ld [$n0],$n0 |
+ cmp $ap,$bp |
+ and $num,$mask,$num |
+ ld [$bp],$mul0 ! bp[0] |
+ nop |
+ |
+ add %sp,$bias,%o7 ! real top of stack |
+ ld [$ap],$car0 ! ap[0] ! redundant in squaring context |
+ sub %o7,$num,%o7 |
+ ld [$ap+4],$apj ! ap[1] |
+ and %o7,-1024,%o7 |
+ ld [$np],$car1 ! np[0] |
+ sub %o7,$bias,%sp ! alloca |
+ ld [$np+4],$npj ! np[1] |
+ be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont |
+ mov 12,$j |
+ |
+ mulx $car0,$mul0,$car0 ! ap[0]*bp[0] |
+ mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] |
+ and $car0,$mask,$acc0 |
+ add %sp,$bias+$frame,$tp |
+ ld [$ap+8],$apj !prologue! |
+ |
+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0 |
+ and $mul1,$mask,$mul1 |
+ |
+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 |
+ mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ ld [$np+8],$npj !prologue! |
+ srlx $car1,32,$car1 |
+ mov $tmp0,$acc0 !prologue! |
+ |
+.L1st: |
+ mulx $apj,$mul0,$tmp0 |
+ mulx $npj,$mul1,$tmp1 |
+ add $acc0,$car0,$car0 |
+ ld [$ap+$j],$apj ! ap[j] |
+ and $car0,$mask,$acc0 |
+ add $acc1,$car1,$car1 |
+ ld [$np+$j],$npj ! np[j] |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ add $j,4,$j ! j++ |
+ mov $tmp0,$acc0 |
+ st $car1,[$tp] |
+ cmp $j,$num |
+ mov $tmp1,$acc1 |
+ srlx $car1,32,$car1 |
+ bl %icc,.L1st |
+ add $tp,4,$tp ! tp++ |
+!.L1st |
+ |
+ mulx $apj,$mul0,$tmp0 !epilogue! |
+ mulx $npj,$mul1,$tmp1 |
+ add $acc0,$car0,$car0 |
+ and $car0,$mask,$acc0 |
+ add $acc1,$car1,$car1 |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp] |
+ srlx $car1,32,$car1 |
+ |
+ add $tmp0,$car0,$car0 |
+ and $car0,$mask,$acc0 |
+ add $tmp1,$car1,$car1 |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp+4] |
+ srlx $car1,32,$car1 |
+ |
+ add $car0,$car1,$car1 |
+ st $car1,[$tp+8] |
+ srlx $car1,32,$car2 |
+ |
+ mov 4,$i ! i++ |
+ ld [$bp+4],$mul0 ! bp[1] |
+.Louter: |
+ add %sp,$bias+$frame,$tp |
+ ld [$ap],$car0 ! ap[0] |
+ ld [$ap+4],$apj ! ap[1] |
+ ld [$np],$car1 ! np[0] |
+ ld [$np+4],$npj ! np[1] |
+ ld [$tp],$tmp1 ! tp[0] |
+ ld [$tp+4],$tpj ! tp[1] |
+ mov 12,$j |
+ |
+ mulx $car0,$mul0,$car0 |
+ mulx $apj,$mul0,$tmp0 !prologue! |
+ add $tmp1,$car0,$car0 |
+ ld [$ap+8],$apj !prologue! |
+ and $car0,$mask,$acc0 |
+ |
+ mulx $n0,$acc0,$mul1 |
+ and $mul1,$mask,$mul1 |
+ |
+ mulx $car1,$mul1,$car1 |
+ mulx $npj,$mul1,$acc1 !prologue! |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ ld [$np+8],$npj !prologue! |
+ srlx $car1,32,$car1 |
+ mov $tmp0,$acc0 !prologue! |
+ |
+.Linner: |
+ mulx $apj,$mul0,$tmp0 |
+ mulx $npj,$mul1,$tmp1 |
+ add $tpj,$car0,$car0 |
+ ld [$ap+$j],$apj ! ap[j] |
+ add $acc0,$car0,$car0 |
+ add $acc1,$car1,$car1 |
+ ld [$np+$j],$npj ! np[j] |
+ and $car0,$mask,$acc0 |
+ ld [$tp+8],$tpj ! tp[j] |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ add $j,4,$j ! j++ |
+ mov $tmp0,$acc0 |
+ st $car1,[$tp] ! tp[j-1] |
+ srlx $car1,32,$car1 |
+ mov $tmp1,$acc1 |
+ cmp $j,$num |
+ bl %icc,.Linner |
+ add $tp,4,$tp ! tp++ |
+!.Linner |
+ |
+ mulx $apj,$mul0,$tmp0 !epilogue! |
+ mulx $npj,$mul1,$tmp1 |
+ add $tpj,$car0,$car0 |
+ add $acc0,$car0,$car0 |
+ ld [$tp+8],$tpj ! tp[j] |
+ and $car0,$mask,$acc0 |
+ add $acc1,$car1,$car1 |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp] ! tp[j-1] |
+ srlx $car1,32,$car1 |
+ |
+ add $tpj,$car0,$car0 |
+ add $tmp0,$car0,$car0 |
+ and $car0,$mask,$acc0 |
+ add $tmp1,$car1,$car1 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp+4] ! tp[j-1] |
+ srlx $car0,32,$car0 |
+ add $i,4,$i ! i++ |
+ srlx $car1,32,$car1 |
+ |
+ add $car0,$car1,$car1 |
+ cmp $i,$num |
+ add $car2,$car1,$car1 |
+ st $car1,[$tp+8] |
+ |
+ srlx $car1,32,$car2 |
+ bl,a %icc,.Louter |
+ ld [$bp+$i],$mul0 ! bp[i] |
+!.Louter |
+ |
+ add $tp,12,$tp |
+ |
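+! Tail, shared with the squaring path: .Lsub subtracts the modulus, |
+! writing tp[j]-np[j] to rp[] with a borrow chain. If the subtraction |
+! borrows out of the saved top bit in $car2, the intermediate result |
+! was already fully reduced and tp is kept; otherwise the difference |
+! at rp is kept. The selection is branch-free (and/andn masks), and |
+! .Lcopy stores the chosen value to rp while zapping the scratch tp. |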
+.Ltail: |
+ add $np,$num,$np |
+ add $rp,$num,$rp |
+ mov $tp,$ap |
+ sub %g0,$num,%o7 ! k=-num |
+ ba .Lsub |
+ subcc %g0,%g0,%g0 ! clear %icc.c |
+.align 16 |
+.Lsub: |
+ ld [$tp+%o7],%o0 |
+ ld [$np+%o7],%o1 |
+ subccc %o0,%o1,%o1 ! tp[j]-np[j] |
+ add $rp,%o7,$i |
+ add %o7,4,%o7 |
+ brnz %o7,.Lsub |
+ st %o1,[$i] |
+ subc $car2,0,$car2 ! handle upmost overflow bit |
+ and $tp,$car2,$ap |
+ andn $rp,$car2,$np |
+ or $ap,$np,$ap |
+ sub %g0,$num,%o7 |
+ |
+.Lcopy: |
+ ld [$ap+%o7],%o0 ! copy or in-place refresh |
+ st %g0,[$tp+%o7] ! zap tp |
+ st %o0,[$rp+%o7] |
+ add %o7,4,%o7 |
+ brnz %o7,.Lcopy |
+ nop |
+ mov 1,%i0 |
+ ret |
+ restore |
+___ |
+ |
+######## |
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over |
+######## code without the dedicated squaring procedure that follows. |
+######## |
+$sbit="%i2"; # re-use $bp! |
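+# A note on the squaring path (as read from the code below): for row i |
+# only the products ap[j]*ap[i] with j>=i are formed, and since each |
+# cross product occurs twice in a square, the partial sums are doubled |
+# on the fly (the add/or/srlx triplet around $acc0), with the bit that |
+# falls out of each doubling passed to the next word through $sbit. |
+# The carry out of the diagonal term ap[i]^2 is pre-halved so that the |
+# uniform doubling in the loop restores its weight, while the np[j]*m |
+# reduction chain is the same as in the generic multiply above. |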
+ |
+$code.=<<___; |
+.align 32 |
+.Lbn_sqr_mont: |
+ mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] |
+ mulx $apj,$mul0,$tmp0 !prologue! |
+ and $car0,$mask,$acc0 |
+ add %sp,$bias+$frame,$tp |
+ ld [$ap+8],$apj !prologue! |
+ |
+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0 |
+ srlx $car0,32,$car0 |
+ and $mul1,$mask,$mul1 |
+ |
+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 |
+ mulx $npj,$mul1,$acc1 !prologue! |
+ and $car0,1,$sbit |
+ ld [$np+8],$npj !prologue! |
+ srlx $car0,1,$car0 |
+ add $acc0,$car1,$car1 |
+ srlx $car1,32,$car1 |
+ mov $tmp0,$acc0 !prologue! |
+ |
+.Lsqr_1st: |
+ mulx $apj,$mul0,$tmp0 |
+ mulx $npj,$mul1,$tmp1 |
+ add $acc0,$car0,$car0 ! ap[j]*a0+c0 |
+ add $acc1,$car1,$car1 |
+ ld [$ap+$j],$apj ! ap[j] |
+ and $car0,$mask,$acc0 |
+ ld [$np+$j],$npj ! np[j] |
+ srlx $car0,32,$car0 |
+ add $acc0,$acc0,$acc0 |
+ or $sbit,$acc0,$acc0 |
+ mov $tmp1,$acc1 |
+ srlx $acc0,32,$sbit |
+ add $j,4,$j ! j++ |
+ and $acc0,$mask,$acc0 |
+ cmp $j,$num |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp] |
+ mov $tmp0,$acc0 |
+ srlx $car1,32,$car1 |
+ bl %icc,.Lsqr_1st |
+ add $tp,4,$tp ! tp++ |
+!.Lsqr_1st |
+ |
+ mulx $apj,$mul0,$tmp0 ! epilogue |
+ mulx $npj,$mul1,$tmp1 |
+ add $acc0,$car0,$car0 ! ap[j]*a0+c0 |
+ add $acc1,$car1,$car1 |
+ and $car0,$mask,$acc0 |
+ srlx $car0,32,$car0 |
+ add $acc0,$acc0,$acc0 |
+ or $sbit,$acc0,$acc0 |
+ srlx $acc0,32,$sbit |
+ and $acc0,$mask,$acc0 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp] |
+ srlx $car1,32,$car1 |
+ |
+ add $tmp0,$car0,$car0 ! ap[j]*a0+c0 |
+ add $tmp1,$car1,$car1 |
+ and $car0,$mask,$acc0 |
+ srlx $car0,32,$car0 |
+ add $acc0,$acc0,$acc0 |
+ or $sbit,$acc0,$acc0 |
+ srlx $acc0,32,$sbit |
+ and $acc0,$mask,$acc0 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp+4] |
+ srlx $car1,32,$car1 |
+ |
+ add $car0,$car0,$car0 |
+ or $sbit,$car0,$car0 |
+ add $car0,$car1,$car1 |
+ st $car1,[$tp+8] |
+ srlx $car1,32,$car2 |
+ |
+ ld [%sp+$bias+$frame],$tmp0 ! tp[0] |
+ ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] |
+ ld [%sp+$bias+$frame+8],$tpj ! tp[2] |
+ ld [$ap+4],$mul0 ! ap[1] |
+ ld [$ap+8],$apj ! ap[2] |
+ ld [$np],$car1 ! np[0] |
+ ld [$np+4],$npj ! np[1] |
+ mulx $n0,$tmp0,$mul1 |
+ |
+ mulx $mul0,$mul0,$car0 |
+ and $mul1,$mask,$mul1 |
+ |
+ mulx $car1,$mul1,$car1 |
+ mulx $npj,$mul1,$acc1 |
+ add $tmp0,$car1,$car1 |
+ and $car0,$mask,$acc0 |
+ ld [$np+8],$npj ! np[2] |
+ srlx $car1,32,$car1 |
+ add $tmp1,$car1,$car1 |
+ srlx $car0,32,$car0 |
+ add $acc0,$car1,$car1 |
+ and $car0,1,$sbit |
+ add $acc1,$car1,$car1 |
+ srlx $car0,1,$car0 |
+ mov 12,$j |
+ st $car1,[%sp+$bias+$frame] ! tp[0]= |
+ srlx $car1,32,$car1 |
+ add %sp,$bias+$frame+4,$tp |
+ |
+.Lsqr_2nd: |
+ mulx $apj,$mul0,$acc0 |
+ mulx $npj,$mul1,$acc1 |
+ add $acc0,$car0,$car0 |
+ add $tpj,$car1,$car1 |
+ ld [$ap+$j],$apj ! ap[j] |
+ and $car0,$mask,$acc0 |
+ ld [$np+$j],$npj ! np[j] |
+ srlx $car0,32,$car0 |
+ add $acc1,$car1,$car1 |
+ ld [$tp+8],$tpj ! tp[j] |
+ add $acc0,$acc0,$acc0 |
+ add $j,4,$j ! j++ |
+ or $sbit,$acc0,$acc0 |
+ srlx $acc0,32,$sbit |
+ and $acc0,$mask,$acc0 |
+ cmp $j,$num |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp] ! tp[j-1] |
+ srlx $car1,32,$car1 |
+ bl %icc,.Lsqr_2nd |
+ add $tp,4,$tp ! tp++ |
+!.Lsqr_2nd |
+ |
+ mulx $apj,$mul0,$acc0 |
+ mulx $npj,$mul1,$acc1 |
+ add $acc0,$car0,$car0 |
+ add $tpj,$car1,$car1 |
+ and $car0,$mask,$acc0 |
+ srlx $car0,32,$car0 |
+ add $acc1,$car1,$car1 |
+ add $acc0,$acc0,$acc0 |
+ or $sbit,$acc0,$acc0 |
+ srlx $acc0,32,$sbit |
+ and $acc0,$mask,$acc0 |
+ add $acc0,$car1,$car1 |
+ st $car1,[$tp] ! tp[j-1] |
+ srlx $car1,32,$car1 |
+ |
+ add $car0,$car0,$car0 |
+ or $sbit,$car0,$car0 |
+ add $car0,$car1,$car1 |
+ add $car2,$car1,$car1 |
+ st $car1,[$tp+4] |
+ srlx $car1,32,$car2 |
+ |
+ ld [%sp+$bias+$frame],$tmp1 ! tp[0] |
+ ld [%sp+$bias+$frame+4],$tpj ! tp[1] |
+ ld [$ap+8],$mul0 ! ap[2] |
+ ld [$np],$car1 ! np[0] |
+ ld [$np+4],$npj ! np[1] |
+ mulx $n0,$tmp1,$mul1 |
+ and $mul1,$mask,$mul1 |
+ mov 8,$i |
+ |
+ mulx $mul0,$mul0,$car0 |
+ mulx $car1,$mul1,$car1 |
+ and $car0,$mask,$acc0 |
+ add $tmp1,$car1,$car1 |
+ srlx $car0,32,$car0 |
+ add %sp,$bias+$frame,$tp |
+ srlx $car1,32,$car1 |
+ and $car0,1,$sbit |
+ srlx $car0,1,$car0 |
+ mov 4,$j |
+ |
+.Lsqr_outer: |
+.Lsqr_inner1: |
+ mulx $npj,$mul1,$acc1 |
+ add $tpj,$car1,$car1 |
+ add $j,4,$j |
+ ld [$tp+8],$tpj |
+ cmp $j,$i |
+ add $acc1,$car1,$car1 |
+ ld [$np+$j],$npj |
+ st $car1,[$tp] |
+ srlx $car1,32,$car1 |
+ bl %icc,.Lsqr_inner1 |
+ add $tp,4,$tp |
+!.Lsqr_inner1 |
+ |
+ add $j,4,$j |
+ ld [$ap+$j],$apj ! ap[j] |
+ mulx $npj,$mul1,$acc1 |
+ add $tpj,$car1,$car1 |
+ ld [$np+$j],$npj ! np[j] |
+ add $acc0,$car1,$car1 |
+ ld [$tp+8],$tpj ! tp[j] |
+ add $acc1,$car1,$car1 |
+ st $car1,[$tp] |
+ srlx $car1,32,$car1 |
+ |
+ add $j,4,$j |
+ cmp $j,$num |
+ be,pn %icc,.Lsqr_no_inner2 |
+ add $tp,4,$tp |
+ |
+.Lsqr_inner2: |
+ mulx $apj,$mul0,$acc0 |
+ mulx $npj,$mul1,$acc1 |
+ add $tpj,$car1,$car1 |
+ add $acc0,$car0,$car0 |
+ ld [$ap+$j],$apj ! ap[j] |
+ and $car0,$mask,$acc0 |
+ ld [$np+$j],$npj ! np[j] |
+ srlx $car0,32,$car0 |
+ add $acc0,$acc0,$acc0 |
+ ld [$tp+8],$tpj ! tp[j] |
+ or $sbit,$acc0,$acc0 |
+ add $j,4,$j ! j++ |
+ srlx $acc0,32,$sbit |
+ and $acc0,$mask,$acc0 |
+ cmp $j,$num |
+ add $acc0,$car1,$car1 |
+ add $acc1,$car1,$car1 |
+ st $car1,[$tp] ! tp[j-1] |
+ srlx $car1,32,$car1 |
+ bl %icc,.Lsqr_inner2 |
+ add $tp,4,$tp ! tp++ |
+ |
+.Lsqr_no_inner2: |
+ mulx $apj,$mul0,$acc0 |
+ mulx $npj,$mul1,$acc1 |
+ add $tpj,$car1,$car1 |
+ add $acc0,$car0,$car0 |
+ and $car0,$mask,$acc0 |
+ srlx $car0,32,$car0 |
+ add $acc0,$acc0,$acc0 |
+ or $sbit,$acc0,$acc0 |
+ srlx $acc0,32,$sbit |
+ and $acc0,$mask,$acc0 |
+ add $acc0,$car1,$car1 |
+ add $acc1,$car1,$car1 |
+ st $car1,[$tp] ! tp[j-1] |
+ srlx $car1,32,$car1 |
+ |
+ add $car0,$car0,$car0 |
+ or $sbit,$car0,$car0 |
+ add $car0,$car1,$car1 |
+ add $car2,$car1,$car1 |
+ st $car1,[$tp+4] |
+ srlx $car1,32,$car2 |
+ |
+ add $i,4,$i ! i++ |
+ ld [%sp+$bias+$frame],$tmp1 ! tp[0] |
+ ld [%sp+$bias+$frame+4],$tpj ! tp[1] |
+ ld [$ap+$i],$mul0 ! ap[j] |
+ ld [$np],$car1 ! np[0] |
+ ld [$np+4],$npj ! np[1] |
+ mulx $n0,$tmp1,$mul1 |
+ and $mul1,$mask,$mul1 |
+ add $i,4,$tmp0 |
+ |
+ mulx $mul0,$mul0,$car0 |
+ mulx $car1,$mul1,$car1 |
+ and $car0,$mask,$acc0 |
+ add $tmp1,$car1,$car1 |
+ srlx $car0,32,$car0 |
+ add %sp,$bias+$frame,$tp |
+ srlx $car1,32,$car1 |
+ and $car0,1,$sbit |
+ srlx $car0,1,$car0 |
+ |
+ cmp $tmp0,$num ! i<num-1 |
+ bl %icc,.Lsqr_outer |
+ mov 4,$j |
+ |
+.Lsqr_last: |
+ mulx $npj,$mul1,$acc1 |
+ add $tpj,$car1,$car1 |
+ add $j,4,$j |
+ ld [$tp+8],$tpj |
+ cmp $j,$i |
+ add $acc1,$car1,$car1 |
+ ld [$np+$j],$npj |
+ st $car1,[$tp] |
+ srlx $car1,32,$car1 |
+ bl %icc,.Lsqr_last |
+ add $tp,4,$tp |
+!.Lsqr_last |
+ |
+ mulx $npj,$mul1,$acc1 |
+ add $tpj,$car1,$car1 |
+ add $acc0,$car1,$car1 |
+ add $acc1,$car1,$car1 |
+ st $car1,[$tp] |
+ srlx $car1,32,$car1 |
+ |
+ add $car0,$car0,$car0 ! recover $car0 |
+ or $sbit,$car0,$car0 |
+ add $car0,$car1,$car1 |
+ add $car2,$car1,$car1 |
+ st $car1,[$tp+4] |
+ srlx $car1,32,$car2 |
+ |
+ ba .Ltail |
+ add $tp,8,$tp |
+.type $fname,#function |
+.size $fname,(.-$fname) |
+.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" |
+.align 32 |
+___ |
+$code =~ s/\`([^\`]*)\`/eval($1)/gem; |
+print $code; |
+close STDOUT; |