Index: openssl/crypto/bn/asm/alpha-mont.pl
diff --git a/openssl/crypto/bn/asm/alpha-mont.pl b/openssl/crypto/bn/asm/alpha-mont.pl
deleted file mode 100644
index 03596e2014d4035d02440316cc1cf025f726d772..0000000000000000000000000000000000000000
--- a/openssl/crypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,321 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# On 21264 RSA sign performance improves by 70/35/20/15 percent for
-# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
-# instructed to '-tune host' code with in-line assembler. Other
-# benchmarks improve by 15-20%. To anchor it to something else, the
-# code provides approximately the same performance per GHz as AMD64.
-# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
-# difference.
-
-# int bn_mul_mont(
-$rp="a0";	# BN_ULONG *rp,
-$ap="a1";	# const BN_ULONG *ap,
-$bp="a2";	# const BN_ULONG *bp,
-$np="a3";	# const BN_ULONG *np,
-$n0="a4";	# const BN_ULONG *n0,
-$num="a5";	# int num);
-
-$lo0="t0";
-$hi0="t1";
-$lo1="t2";
-$hi1="t3";
-$aj="t4";
-$bi="t5";
-$nj="t6";
-$tp="t7";
-$alo="t8";
-$ahi="t9";
-$nlo="t10";
-$nhi="t11";
-$tj="t12";
-$i="s3";
-$j="s4";
-$m1="s5";
-
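For readers who don't speak Alpha assembly, the routine implements word-serial Montgomery multiplication: rp[] = ap[]*bp[]*R^-1 mod np[], with R = 2^(64*num) and n0 pointing at -np[0]^-1 mod 2^64 (note the asm dereferences $n0 with `ldq $n0,0($n0)`). Below is a minimal C sketch of the same interleaved multiply-and-reduce schedule, not OpenSSL code: it assumes a 64-bit BN_ULONG and a compiler with unsigned __int128, and bn_mul_mont_ref is a hypothetical name.

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical reference model of the routine's contract:
 * rp[] = ap[]*bp[]*R^-1 mod np[], R = 2^(64*num),
 * n0p points at -np[0]^-1 mod 2^64, as in the assembly. */
static int bn_mul_mont_ref(uint64_t *rp, const uint64_t *ap,
                           const uint64_t *bp, const uint64_t *np,
                           const uint64_t *n0p, int num)
{
    if (num < 4)                 /* the asm bails out below 4 words too */
        return 0;

    uint64_t n0 = *n0p;
    uint64_t tp[num + 2];        /* scratch: num words + 2 overflow words */
    memset(tp, 0, sizeof(tp));

    for (int i = 0; i < num; i++) {
        unsigned __int128 t;
        uint64_t c = 0;

        /* tp[] += ap[] * bp[i] */
        for (int j = 0; j < num; j++) {
            t = (unsigned __int128)ap[j] * bp[i] + tp[j] + c;
            tp[j] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + c;
        tp[num] = (uint64_t)t;
        tp[num + 1] = (uint64_t)(t >> 64);

        /* tp[] += np[] * m1; the low word cancels by construction,
         * so the result is stored shifted down one word */
        uint64_t m1 = tp[0] * n0;
        t = (unsigned __int128)np[0] * m1 + tp[0];
        c = (uint64_t)(t >> 64);
        for (int j = 1; j < num; j++) {
            t = (unsigned __int128)np[j] * m1 + tp[j] + c;
            tp[j - 1] = (uint64_t)t;
            c = (uint64_t)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + c;
        tp[num - 1] = (uint64_t)t;
        tp[num] = tp[num + 1] + (uint64_t)(t >> 64);
        tp[num + 1] = 0;
    }

    /* conditional final subtraction, mirroring .Lsub/.Lcopy below */
    uint64_t borrow = 0;
    for (int j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
        rp[j] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    uint64_t mask = tp[num] - borrow;  /* 0 or all-ones, like the asm */
    for (int j = 0; j < num; j++)
        rp[j] = (tp[j] & mask) | (rp[j] & ~mask);
    return 1;
}
```

As in the assembly, the final result is committed with a mask rather than a branch, so the store pattern does not depend on whether the subtraction was needed.
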
-$code=<<___;
-#ifdef __linux__
-#include <asm/regdef.h>
-#else
-#include <asm.h>
-#include <regdef.h>
-#endif
-
-.text
-
-.set noat
-.set noreorder
-
-.globl bn_mul_mont
-.align 5
-.ent bn_mul_mont
-bn_mul_mont:
-	lda sp,-48(sp)
-	stq ra,0(sp)
-	stq s3,8(sp)
-	stq s4,16(sp)
-	stq s5,24(sp)
-	stq fp,32(sp)
-	mov sp,fp
-	.mask 0x0400f000,-48
-	.frame fp,48,ra
-	.prologue 0
-
-	.align 4
-	.set reorder
-	sextl $num,$num
-	mov 0,v0
-	cmplt $num,4,AT
-	bne AT,.Lexit
-
-	ldq $hi0,0($ap)	# ap[0]
-	s8addq $num,16,AT
-	ldq $aj,8($ap)
-	subq sp,AT,sp
-	ldq $bi,0($bp)	# bp[0]
-	lda AT,-4096(zero)	# mov -4096,AT
-	ldq $n0,0($n0)
-	and sp,AT,sp
-
-	mulq $hi0,$bi,$lo0
-	ldq $hi1,0($np)	# np[0]
-	umulh $hi0,$bi,$hi0
-	ldq $nj,8($np)
-
-	mulq $lo0,$n0,$m1
-
-	mulq $hi1,$m1,$lo1
-	umulh $hi1,$m1,$hi1
-
-	addq $lo1,$lo0,$lo1
-	cmpult $lo1,$lo0,AT
-	addq $hi1,AT,$hi1
-
-	mulq $aj,$bi,$alo
-	mov 2,$j
-	umulh $aj,$bi,$ahi
-	mov sp,$tp
-
-	mulq $nj,$m1,$nlo
-	s8addq $j,$ap,$aj
-	umulh $nj,$m1,$nhi
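Alpha has no add-with-carry and no single widening multiply; `mulq`/`umulh` produce the low and high 64 bits of a product separately, and carries are recovered with `cmpult` (compare unsigned less-than) after each `addq`. A small C model of both idioms, with hypothetical helper names:

```c
#include <stdint.h>

/* mulq/umulh pair: low and high 64 bits of a 64x64 product,
 * modeled with unsigned __int128 (helper name is illustrative) */
static inline void mulq_umulh(uint64_t a, uint64_t b,
                              uint64_t *lo, uint64_t *hi)
{
    unsigned __int128 t = (unsigned __int128)a * b;
    *lo = (uint64_t)t;
    *hi = (uint64_t)(t >> 64);
}

/* the addq/cmpult idiom: s = a + b wraps mod 2^64, and the carry
 * out is exactly (s < b), which is what "cmpult $s,$b,AT" tests */
static inline uint64_t addq_cmpult(uint64_t a, uint64_t b, uint64_t *carry)
{
    uint64_t s = a + b;
    *carry = s < b;     /* 1 iff the addition wrapped */
    return s;
}
```
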
-.align 4
-.L1st:
-	.set noreorder
-	ldq $aj,0($aj)
-	addl $j,1,$j
-	ldq $nj,0($nj)
-	lda $tp,8($tp)
-
-	addq $alo,$hi0,$lo0
-	mulq $aj,$bi,$alo
-	cmpult $lo0,$hi0,AT
-	addq $nlo,$hi1,$lo1
-
-	mulq $nj,$m1,$nlo
-	addq $ahi,AT,$hi0
-	cmpult $lo1,$hi1,v0
-	cmplt $j,$num,$tj
-
-	umulh $aj,$bi,$ahi
-	addq $nhi,v0,$hi1
-	addq $lo1,$lo0,$lo1
-	s8addq $j,$ap,$aj
-
-	umulh $nj,$m1,$nhi
-	cmpult $lo1,$lo0,v0
-	addq $hi1,v0,$hi1
-	s8addq $j,$np,$nj
-
-	stq $lo1,-8($tp)
-	nop
-	unop
-	bne $tj,.L1st
-	.set reorder
-
-	addq $alo,$hi0,$lo0
-	addq $nlo,$hi1,$lo1
-	cmpult $lo0,$hi0,AT
-	cmpult $lo1,$hi1,v0
-	addq $ahi,AT,$hi0
-	addq $nhi,v0,$hi1
-
-	addq $lo1,$lo0,$lo1
-	cmpult $lo1,$lo0,v0
-	addq $hi1,v0,$hi1
-
-	stq $lo1,0($tp)
-
-	addq $hi1,$hi0,$hi1
-	cmpult $hi1,$hi0,AT
-	stq $hi1,8($tp)
-	stq AT,16($tp)
-
-	mov 1,$i
-.align 4
-.Louter:
-	s8addq $i,$bp,$bi
-	ldq $hi0,0($ap)
-	ldq $aj,8($ap)
-	ldq $bi,0($bi)
-	ldq $hi1,0($np)
-	ldq $nj,8($np)
-	ldq $tj,0(sp)
-
-	mulq $hi0,$bi,$lo0
-	umulh $hi0,$bi,$hi0
-
-	addq $lo0,$tj,$lo0
-	cmpult $lo0,$tj,AT
-	addq $hi0,AT,$hi0
-
-	mulq $lo0,$n0,$m1
-
-	mulq $hi1,$m1,$lo1
-	umulh $hi1,$m1,$hi1
-
-	addq $lo1,$lo0,$lo1
-	cmpult $lo1,$lo0,AT
-	mov 2,$j
-	addq $hi1,AT,$hi1
-
-	mulq $aj,$bi,$alo
-	mov sp,$tp
-	umulh $aj,$bi,$ahi
-
-	mulq $nj,$m1,$nlo
-	s8addq $j,$ap,$aj
-	umulh $nj,$m1,$nhi
-.align 4
-.Linner:
-	.set noreorder
-	ldq $tj,8($tp)	#L0
-	nop	#U1
-	ldq $aj,0($aj)	#L1
-	s8addq $j,$np,$nj	#U0
-
-	ldq $nj,0($nj)	#L0
-	nop	#U1
-	addq $alo,$hi0,$lo0	#L1
-	lda $tp,8($tp)
-
-	mulq $aj,$bi,$alo	#U1
-	cmpult $lo0,$hi0,AT	#L0
-	addq $nlo,$hi1,$lo1	#L1
-	addl $j,1,$j
-
-	mulq $nj,$m1,$nlo	#U1
-	addq $ahi,AT,$hi0	#L0
-	addq $lo0,$tj,$lo0	#L1
-	cmpult $lo1,$hi1,v0	#U0
-
-	umulh $aj,$bi,$ahi	#U1
-	cmpult $lo0,$tj,AT	#L0
-	addq $lo1,$lo0,$lo1	#L1
-	addq $nhi,v0,$hi1	#U0
-
-	umulh $nj,$m1,$nhi	#U1
-	s8addq $j,$ap,$aj	#L0
-	cmpult $lo1,$lo0,v0	#L1
-	cmplt $j,$num,$tj	#U0	# borrow $tj
-
-	addq $hi0,AT,$hi0	#L0
-	addq $hi1,v0,$hi1	#U1
-	stq $lo1,-8($tp)	#L1
-	bne $tj,.Linner	#U0
-	.set reorder
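The `#U0/#U1/#L0/#L1` comments in .Linner appear to be hand-scheduling annotations for the 21264's issue slots (upper and lower pipes of its two integer clusters); they do not affect the emitted assembly. Functionally, each .Linner iteration folds the previous pass's word tp[j] into the two running rows ap[]*bp[i] and np[]*m1. A hedged C model of one step follows, with illustrative state names that mirror the register map above:

```c
#include <stdint.h>

/* Illustrative state for one .Linner step: the two pending products
 * and the two row carries, mirroring $alo/$ahi, $nlo/$nhi, $hi0/$hi1. */
typedef struct {
    uint64_t alo, ahi;   /* ap[j]*bp[i], low/high words */
    uint64_t nlo, nhi;   /* np[j]*m1, low/high words */
    uint64_t hi0, hi1;   /* carries of the ap and np rows */
} inner_state;

/* One inner-loop step: absorb the previous tp word tj and return the
 * word written back to tp[j-1] (the reduction shifts down one word). */
static uint64_t linner_step(inner_state *s, uint64_t tj)
{
    uint64_t lo0 = s->alo + s->hi0;   /* ap row plus its carry */
    uint64_t c   = lo0 < s->hi0;      /* cmpult-style carry */
    s->hi0 = s->ahi + c;

    uint64_t lo1 = s->nlo + s->hi1;   /* np row plus its carry */
    c = lo1 < s->hi1;
    s->hi1 = s->nhi + c;

    lo0 += tj;                        /* add tp[j] from the last pass */
    s->hi0 += lo0 < tj;

    lo1 += lo0;                       /* merge the two rows */
    s->hi1 += lo1 < lo0;
    return lo1;                       /* stq $lo1,-8($tp) */
}
```
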
-
-	ldq $tj,8($tp)
-	addq $alo,$hi0,$lo0
-	addq $nlo,$hi1,$lo1
-	cmpult $lo0,$hi0,AT
-	cmpult $lo1,$hi1,v0
-	addq $ahi,AT,$hi0
-	addq $nhi,v0,$hi1
-
-	addq $lo0,$tj,$lo0
-	cmpult $lo0,$tj,AT
-	addq $hi0,AT,$hi0
-
-	ldq $tj,16($tp)
-	addq $lo1,$lo0,$j
-	cmpult $j,$lo0,v0
-	addq $hi1,v0,$hi1
-
-	addq $hi1,$hi0,$lo1
-	stq $j,0($tp)
-	cmpult $lo1,$hi0,$hi1
-	addq $lo1,$tj,$lo1
-	cmpult $lo1,$tj,AT
-	addl $i,1,$i
-	addq $hi1,AT,$hi1
-	stq $lo1,8($tp)
-	cmplt $i,$num,$tj	# borrow $tj
-	stq $hi1,16($tp)
-	bne $tj,.Louter
-
-	s8addq $num,sp,$tj	# &tp[num]
-	mov $rp,$bp	# put rp aside
-	mov sp,$tp
-	mov sp,$ap
-	mov 0,$hi0	# clear borrow bit
-
-.align 4
-.Lsub:	ldq $lo0,0($tp)
-	ldq $lo1,0($np)
-	lda $tp,8($tp)
-	lda $np,8($np)
-	subq $lo0,$lo1,$lo1	# tp[i]-np[i]
-	cmpult $lo0,$lo1,AT
-	subq $lo1,$hi0,$lo0
-	cmpult $lo1,$lo0,$hi0
-	or $hi0,AT,$hi0
-	stq $lo0,0($rp)
-	cmpult $tp,$tj,v0
-	lda $rp,8($rp)
-	bne v0,.Lsub
-
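With no subtract-with-borrow instruction, .Lsub chains the borrow through two `cmpult` tests per word, one for tp[i]-np[i] and one for subtracting the incoming borrow, then ORs them; the two can never fire at once, because tp[i]-np[i] only underflows to a nonzero value. A small C rendering of one step, with illustrative names:

```c
#include <stdint.h>

/* One .Lsub step: *r = t - n - borrow_in, returning borrow_out.
 * Two cmpult-style tests, OR-combined, exactly one of which can fire. */
static uint64_t lsub_step(uint64_t t, uint64_t n,
                          uint64_t borrow_in, uint64_t *r)
{
    uint64_t d  = t - n;          /* subq $lo0,$lo1,$lo1 */
    uint64_t b1 = t < d;          /* cmpult: did t - n wrap? */
    *r = d - borrow_in;           /* subq $lo1,$hi0,$lo0 */
    uint64_t b2 = d < *r;         /* cmpult: did d - borrow wrap? */
    return b1 | b2;               /* or $hi0,AT,$hi0 */
}
```
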
-	subq $hi1,$hi0,$hi0	# handle upmost overflow bit
-	mov sp,$tp
-	mov $bp,$rp	# restore rp
-
-	and sp,$hi0,$ap
-	bic $bp,$hi0,$bp
-	bis $bp,$ap,$ap	# ap=borrow?tp:rp
-
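The and/bic/bis triple is a branchless select of the copy source pointer: $hi0 holds tp[num] minus the accumulated borrow, which can only come out as 0 (keep the subtracted result in rp) or all-ones (keep tp), so no data-dependent branch decides which value is copied. The same trick in C, under the assumption that the mask is 0 or ~0:

```c
#include <stdint.h>

/* and/bic/bis select: pick t when mask is all-ones, r when it is 0.
 * Here mask = tp[num] - borrow, which is never anything but 0 or ~0. */
static inline uint64_t select_masked(uint64_t mask, uint64_t t, uint64_t r)
{
    return (t & mask) | (r & ~mask);   /* and / bic / bis */
}
```
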
-.align 4
-.Lcopy:	ldq $aj,0($ap)	# copy or in-place refresh
-	lda $tp,8($tp)
-	lda $rp,8($rp)
-	lda $ap,8($ap)
-	stq zero,-8($tp)	# zap tp
-	cmpult $tp,$tj,AT
-	stq $aj,-8($rp)
-	bne AT,.Lcopy
-	mov 1,v0
-
-.Lexit:
-	.set noreorder
-	mov fp,sp
-	/*ldq ra,0(sp)*/
-	ldq s3,8(sp)
-	ldq s4,16(sp)
-	ldq s5,24(sp)
-	ldq fp,32(sp)
-	lda sp,48(sp)
-	ret (ra)
-.end bn_mul_mont
-.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
-.align	2
-___
-
-print $code;
-close STDOUT;
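As the closing `print $code` shows, the script simply writes the generated assembly to STDOUT, so a build would typically capture it with something like `perl alpha-mont.pl > alpha-mont.s` before assembling.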