Index: openssl/crypto/bn/asm/alpha-mont.pl
===================================================================
--- openssl/crypto/bn/asm/alpha-mont.pl (revision 0)
+++ openssl/crypto/bn/asm/alpha-mont.pl (revision 0)
@@ -0,0 +1,336 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. The baseline is code built by
+# the vendor compiler with '-tune host' and in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64,
+# i.e. if you compare a 1GHz 21264 and a 2GHz Opteron, you'll
+# observe a ~2x difference.
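+#
+# The routine computes rp[]=ap[]*bp[]*2^(-64*num) mod np[], where n0
+# points at the precomputed -np[0]^(-1) mod 2^64. It returns 1, or 0
+# when num<4, in which case the caller falls back to the generic C
+# implementation.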
+
+# int bn_mul_mont(
+$rp="a0"; # BN_ULONG *rp,
+$ap="a1"; # const BN_ULONG *ap,
+$bp="a2"; # const BN_ULONG *bp,
+$np="a3"; # const BN_ULONG *np,
+$n0="a4"; # const BN_ULONG *n0,
+$num="a5"; # int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+
+.globl bn_mul_mont
+.align 5
+.ent bn_mul_mont
+bn_mul_mont:
+ lda sp,-48(sp)
+ stq ra,0(sp)
+ stq s3,8(sp)
+ stq s4,16(sp)
+ stq s5,24(sp)
+ stq fp,32(sp)
+ mov sp,fp
+ .mask 0x0400f000,-48
+ .frame fp,48,ra
+ .prologue 0
+
+ .align 4
+ .set reorder
+ sextl $num,$num
+ mov 0,v0
+ cmplt $num,4,AT
+ bne AT,.Lexit
+
+ ldq $hi0,0($ap) # ap[0]
+ s8addq $num,16,AT
+ ldq $aj,8($ap)
+ subq sp,AT,sp
+ ldq $bi,0($bp) # bp[0]
+ lda AT,-4096(zero) # mov -4096,AT
+ ldq $n0,0($n0)
+ and sp,AT,sp
+
+ mulq $hi0,$bi,$lo0
+ ldq $hi1,0($np) # np[0]
+ umulh $hi0,$bi,$hi0
+ ldq $nj,8($np)
+
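+ # m1 = ap[0]*bp[0]*n0 mod 2^64, chosen so that the sum
+ # ap[0]*bp[0]+m1*np[0] below comes out divisible by 2^64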
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov 2,$j
+ umulh $aj,$bi,$ahi
+ mov sp,$tp
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+ s8addq $j,$np,$nj
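+ # 1st outer iteration: tp[]=(ap[]*bp[0]+m1*np[])/2^64, with separate
+ # carry chains for the two products; storing each limb one slot
+ # down implements the division by 2^64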
+.align 4
+.L1st:
+ .set noreorder
+ ldq $aj,0($aj)
+ addl $j,1,$j
+ ldq $nj,0($nj)
+ lda $tp,8($tp)
+
+ addq $alo,$hi0,$lo0
+ mulq $aj,$bi,$alo
+ cmpult $lo0,$hi0,AT
+ addq $nlo,$hi1,$lo1
+
+ mulq $nj,$m1,$nlo
+ addq $ahi,AT,$hi0
+ cmpult $lo1,$hi1,v0
+ cmplt $j,$num,$tj
+
+ umulh $aj,$bi,$ahi
+ addq $nhi,v0,$hi1
+ addq $lo1,$lo0,$lo1
+ s8addq $j,$ap,$aj
+
+ umulh $nj,$m1,$nhi
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+ s8addq $j,$np,$nj
+
+ stq $lo1,-8($tp)
+ nop
+ unop
+ bne $tj,.L1st
+ .set reorder
+
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ stq $lo1,0($tp)
+
+ addq $hi1,$hi0,$hi1
+ cmpult $hi1,$hi0,AT
+ stq $hi1,8($tp)
+ stq AT,16($tp)
+
+ mov 1,$i
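+ # remaining iterations: tp[]=(tp[]+ap[]*bp[i]+m1*np[])/2^64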
+.align 4
+.Louter:
+ s8addq $i,$bp,$bi
+ ldq $hi0,0($ap)
+ ldq $aj,8($ap)
+ ldq $bi,0($bi)
+ ldq $hi1,0($np)
+ ldq $nj,8($np)
+ ldq $tj,0(sp)
+
+ mulq $hi0,$bi,$lo0
+ umulh $hi0,$bi,$hi0
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ mov 2,$j
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov sp,$tp
+ umulh $aj,$bi,$ahi
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
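+ # inner loop is hand-scheduled for the quad-issue 21264; the U0/U1/
+ # L0/L1 tags denote the intended upper/lower integer pipe slots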
+.align 4
+.Linner:
+ .set noreorder
+ ldq $tj,8($tp) #L0
+ nop #U1
+ ldq $aj,0($aj) #L1
+ s8addq $j,$np,$nj #U0
+
+ ldq $nj,0($nj) #L0
+ nop #U1
+ addq $alo,$hi0,$lo0 #L1
+ lda $tp,8($tp)
+
+ mulq $aj,$bi,$alo #U1
+ cmpult $lo0,$hi0,AT #L0
+ addq $nlo,$hi1,$lo1 #L1
+ addl $j,1,$j
+
+ mulq $nj,$m1,$nlo #U1
+ addq $ahi,AT,$hi0 #L0
+ addq $lo0,$tj,$lo0 #L1
+ cmpult $lo1,$hi1,v0 #U0
+
+ umulh $aj,$bi,$ahi #U1
+ cmpult $lo0,$tj,AT #L0
+ addq $lo1,$lo0,$lo1 #L1
+ addq $nhi,v0,$hi1 #U0
+
+ umulh $nj,$m1,$nhi #U1
+ s8addq $j,$ap,$aj #L0
+ cmpult $lo1,$lo0,v0 #L1
+ cmplt $j,$num,$tj #U0 # borrow $tj
+
+ addq $hi0,AT,$hi0 #L0
+ addq $hi1,v0,$hi1 #U1
+ stq $lo1,-8($tp) #L1
+ bne $tj,.Linner #U0
+ .set reorder
+
+ ldq $tj,8($tp)
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ ldq $tj,16($tp)
+ addq $lo1,$lo0,$j
+ cmpult $j,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ addq $hi1,$hi0,$lo1
+ stq $j,0($tp)
+ cmpult $lo1,$hi0,$hi1
+ addq $lo1,$tj,$lo1
+ cmpult $lo1,$tj,AT
+ addl $i,1,$i
+ addq $hi1,AT,$hi1
+ stq $lo1,8($tp)
+ cmplt $i,$num,$tj # borrow $tj
+ stq $hi1,16($tp)
+ bne $tj,.Louter
+
+ s8addq $num,sp,$tj # &tp[num]
+ mov $rp,$bp # put rp aside
+ mov sp,$tp
+ mov sp,$ap
+ mov 0,$hi0 # clear borrow bit
+
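+ # rp[]=tp[]-np[], with borrow accumulating in hi0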
+.align 4
+.Lsub: ldq $lo0,0($tp)
+ ldq $lo1,0($np)
+ lda $tp,8($tp)
+ lda $np,8($np)
+ subq $lo0,$lo1,$lo1 # tp[i]-np[i]
+ cmpult $lo0,$lo1,AT
+ subq $lo1,$hi0,$lo0
+ cmpult $lo1,$lo0,$hi0
+ or $hi0,AT,$hi0
+ stq $lo0,0($rp)
+ cmpult $tp,$tj,v0
+ lda $rp,8($rp)
+ bne v0,.Lsub
+
+ subq $hi1,$hi0,$hi0 # handle upmost overflow bit
+ mov sp,$tp
+ mov $bp,$rp # restore rp
+
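+ # mask: hi0 is 0 to keep the difference in rp, all ones to keep tp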
+ and sp,$hi0,$ap
+ bic $bp,$hi0,$bp
+ bis $bp,$ap,$ap # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
+ lda $tp,8($tp)
+ lda $rp,8($rp)
+ lda $ap,8($ap)
+ stq zero,-8($tp) # zap tp
+ cmpult $tp,$tj,AT
+ stq $aj,-8($rp)
+ bne AT,.Lcopy
+ mov 1,v0
+
+.Lexit:
+ .set noreorder
+ mov fp,sp
+ /*ldq ra,0(sp)*/
+ ldq s3,8(sp)
+ ldq s4,16(sp)
+ ldq s5,24(sp)
+ ldq fp,32(sp)
+ lda sp,48(sp)
+ ret (ra)
+.end bn_mul_mont
+.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+print $code;
+close STDOUT;
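
Below, for reference, is a minimal C model of the word-by-word Montgomery
multiplication the assembly implements. It is a sketch, not OpenSSL code:
it assumes 64-bit limbs and the non-standard unsigned __int128 extension
(gcc/clang), and the names (mont_mul, limb, the caller-supplied scratch
tp[] of num+1 limbs) are illustrative; the assembly instead carves its
scratch area out of the stack in the prologue.

#include <stdint.h>
#include <string.h>

typedef uint64_t limb;

/* rp[] = ap[]*bp[]*2^(-64*num) mod np[]; n0 = -np[0]^(-1) mod 2^64;
 * tp[] is num+1 limbs of scratch; assumes num >= 1 and ap, bp < np */
static void mont_mul(limb *rp, const limb *ap, const limb *bp,
                     const limb *np, limb n0, int num, limb *tp)
{
    int i, j;

    memset(tp, 0, (size_t)(num + 1) * sizeof(limb));

    for (i = 0; i < num; i++) {
        unsigned __int128 t;
        limb m1, lo, ca = 0, cn = 0;    /* two carry chains: hi0, hi1 */

        /* m1 makes tp[0] + ap[0]*bp[i] + m1*np[0] divisible by 2^64 */
        m1 = (tp[0] + ap[0] * bp[i]) * n0;

        for (j = 0; j < num; j++) {
            t  = (unsigned __int128)ap[j] * bp[i] + tp[j] + ca;
            lo = (limb)t;
            ca = (limb)(t >> 64);
            t  = (unsigned __int128)np[j] * m1 + lo + cn;
            cn = (limb)(t >> 64);
            if (j > 0)                /* limb 0 is zero by choice of m1; */
                tp[j - 1] = (limb)t;  /* storing one down divides by 2^64 */
        }
        t = (unsigned __int128)tp[num] + ca + cn;
        tp[num - 1] = (limb)t;
        tp[num] = (limb)(t >> 64);    /* upmost carry word, 0 or 1 */
    }

    /* final reduction, as .Lsub/.Lcopy: rp = tp - np with borrow
     * propagation; keep tp iff the borrow exceeds the upmost carry */
    {
        limb borrow = 0;
        for (j = 0; j < num; j++) {
            limb d = tp[j] - np[j];
            limb b = tp[j] < np[j];
            rp[j] = d - borrow;
            borrow = b | (d < borrow);
        }
        if (tp[num] < borrow)
            memcpy(rp, tp, (size_t)num * sizeof(limb));
        memset(tp, 0, (size_t)(num + 1) * sizeof(limb));  /* zap tp */
    }
}

The two carry chains ca/cn map onto hi0/hi1 above, and the branchless
and/bic/bis select in the assembly replaces the memcpy branch here.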