| Index: openssl/crypto/bn/asm/alpha-mont.pl
|
| diff --git a/openssl/crypto/bn/asm/alpha-mont.pl b/openssl/crypto/bn/asm/alpha-mont.pl
|
| deleted file mode 100644
|
| index 03596e2014d4035d02440316cc1cf025f726d772..0000000000000000000000000000000000000000
|
| --- a/openssl/crypto/bn/asm/alpha-mont.pl
|
| +++ /dev/null
|
| @@ -1,321 +0,0 @@
|
| -#!/usr/bin/env perl
|
| -#
|
| -# ====================================================================
|
| -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
| -# project. The module is, however, dual licensed under OpenSSL and
|
| -# CRYPTOGAMS licenses depending on where you obtain it. For further
|
| -# details see http://www.openssl.org/~appro/cryptogams/.
|
| -# ====================================================================
|
| -#
|
| -# On 21264 RSA sign performance improves by 70/35/20/15 percent for
|
| -# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
|
| -# instructed to '-tune host' code with in-line assembler. Other
|
| -# benchmarks improve by 15-20%. To anchor it to something else, the
|
| -# code provides approximately the same performance per GHz as AMD64.
|
| -# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
|
| -# difference.
|
| -
|
| -# int bn_mul_mont(
|
| -$rp="a0"; # BN_ULONG *rp,
|
| -$ap="a1"; # const BN_ULONG *ap,
|
| -$bp="a2"; # const BN_ULONG *bp,
|
| -$np="a3"; # const BN_ULONG *np,
|
| -$n0="a4"; # const BN_ULONG *n0,
|
| -$num="a5"; # int num);
|
| -
|
| -$lo0="t0";
|
| -$hi0="t1";
|
| -$lo1="t2";
|
| -$hi1="t3";
|
| -$aj="t4";
|
| -$bi="t5";
|
| -$nj="t6";
|
| -$tp="t7";
|
| -$alo="t8";
|
| -$ahi="t9";
|
| -$nlo="t10";
|
| -$nhi="t11";
|
| -$tj="t12";
|
| -$i="s3";
|
| -$j="s4";
|
| -$m1="s5";
|
| -
|
| -$code=<<___;
|
| -#ifdef __linux__
|
| -#include <asm/regdef.h>
|
| -#else
|
| -#include <asm.h>
|
| -#include <regdef.h>
|
| -#endif
|
| -
|
| -.text
|
| -
|
| -.set noat
|
| -.set noreorder
|
| -
|
| -.globl bn_mul_mont
|
| -.align 5
|
| -.ent bn_mul_mont
|
| -bn_mul_mont:
|
| - lda sp,-48(sp)
|
| - stq ra,0(sp)
|
| - stq s3,8(sp)
|
| - stq s4,16(sp)
|
| - stq s5,24(sp)
|
| - stq fp,32(sp)
|
| - mov sp,fp
|
| - .mask 0x0400f000,-48
|
| - .frame fp,48,ra
|
| - .prologue 0
|
| -
|
| - .align 4
|
| - .set reorder
|
| - sextl $num,$num
|
| - mov 0,v0
|
| - cmplt $num,4,AT
|
| - bne AT,.Lexit
|
| -
|
| - ldq $hi0,0($ap) # ap[0]
|
| - s8addq $num,16,AT
|
| - ldq $aj,8($ap)
|
| - subq sp,AT,sp
|
| - ldq $bi,0($bp) # bp[0]
|
| - lda AT,-4096(zero) # mov -4096,AT
|
| - ldq $n0,0($n0)
|
| - and sp,AT,sp
|
| -
|
| - mulq $hi0,$bi,$lo0
|
| - ldq $hi1,0($np) # np[0]
|
| - umulh $hi0,$bi,$hi0
|
| - ldq $nj,8($np)
|
| -
|
| - mulq $lo0,$n0,$m1
|
| -
|
| - mulq $hi1,$m1,$lo1
|
| - umulh $hi1,$m1,$hi1
|
| -
|
| - addq $lo1,$lo0,$lo1
|
| - cmpult $lo1,$lo0,AT
|
| - addq $hi1,AT,$hi1
|
| -
|
| - mulq $aj,$bi,$alo
|
| - mov 2,$j
|
| - umulh $aj,$bi,$ahi
|
| - mov sp,$tp
|
| -
|
| - mulq $nj,$m1,$nlo
|
| - s8addq $j,$ap,$aj
|
| - umulh $nj,$m1,$nhi
|
| - s8addq $j,$np,$nj
|
| -.align 4
|
| -.L1st:
|
| - .set noreorder
|
| - ldq $aj,0($aj)
|
| - addl $j,1,$j
|
| - ldq $nj,0($nj)
|
| - lda $tp,8($tp)
|
| -
|
| - addq $alo,$hi0,$lo0
|
| - mulq $aj,$bi,$alo
|
| - cmpult $lo0,$hi0,AT
|
| - addq $nlo,$hi1,$lo1
|
| -
|
| - mulq $nj,$m1,$nlo
|
| - addq $ahi,AT,$hi0
|
| - cmpult $lo1,$hi1,v0
|
| - cmplt $j,$num,$tj
|
| -
|
| - umulh $aj,$bi,$ahi
|
| - addq $nhi,v0,$hi1
|
| - addq $lo1,$lo0,$lo1
|
| - s8addq $j,$ap,$aj
|
| -
|
| - umulh $nj,$m1,$nhi
|
| - cmpult $lo1,$lo0,v0
|
| - addq $hi1,v0,$hi1
|
| - s8addq $j,$np,$nj
|
| -
|
| - stq $lo1,-8($tp)
|
| - nop
|
| - unop
|
| - bne $tj,.L1st
|
| - .set reorder
|
| -
|
| - addq $alo,$hi0,$lo0
|
| - addq $nlo,$hi1,$lo1
|
| - cmpult $lo0,$hi0,AT
|
| - cmpult $lo1,$hi1,v0
|
| - addq $ahi,AT,$hi0
|
| - addq $nhi,v0,$hi1
|
| -
|
| - addq $lo1,$lo0,$lo1
|
| - cmpult $lo1,$lo0,v0
|
| - addq $hi1,v0,$hi1
|
| -
|
| - stq $lo1,0($tp)
|
| -
|
| - addq $hi1,$hi0,$hi1
|
| - cmpult $hi1,$hi0,AT
|
| - stq $hi1,8($tp)
|
| - stq AT,16($tp)
|
| -
|
| - mov 1,$i
|
| -.align 4
|
| -.Louter:
|
| - s8addq $i,$bp,$bi
|
| - ldq $hi0,0($ap)
|
| - ldq $aj,8($ap)
|
| - ldq $bi,0($bi)
|
| - ldq $hi1,0($np)
|
| - ldq $nj,8($np)
|
| - ldq $tj,0(sp)
|
| -
|
| - mulq $hi0,$bi,$lo0
|
| - umulh $hi0,$bi,$hi0
|
| -
|
| - addq $lo0,$tj,$lo0
|
| - cmpult $lo0,$tj,AT
|
| - addq $hi0,AT,$hi0
|
| -
|
| - mulq $lo0,$n0,$m1
|
| -
|
| - mulq $hi1,$m1,$lo1
|
| - umulh $hi1,$m1,$hi1
|
| -
|
| - addq $lo1,$lo0,$lo1
|
| - cmpult $lo1,$lo0,AT
|
| - mov 2,$j
|
| - addq $hi1,AT,$hi1
|
| -
|
| - mulq $aj,$bi,$alo
|
| - mov sp,$tp
|
| - umulh $aj,$bi,$ahi
|
| -
|
| - mulq $nj,$m1,$nlo
|
| - s8addq $j,$ap,$aj
|
| - umulh $nj,$m1,$nhi
|
| -.align 4
|
| -.Linner:
|
| - .set noreorder
|
| - ldq $tj,8($tp) #L0
|
| - nop #U1
|
| - ldq $aj,0($aj) #L1
|
| - s8addq $j,$np,$nj #U0
|
| -
|
| - ldq $nj,0($nj) #L0
|
| - nop #U1
|
| - addq $alo,$hi0,$lo0 #L1
|
| - lda $tp,8($tp)
|
| -
|
| - mulq $aj,$bi,$alo #U1
|
| - cmpult $lo0,$hi0,AT #L0
|
| - addq $nlo,$hi1,$lo1 #L1
|
| - addl $j,1,$j
|
| -
|
| - mulq $nj,$m1,$nlo #U1
|
| - addq $ahi,AT,$hi0 #L0
|
| - addq $lo0,$tj,$lo0 #L1
|
| - cmpult $lo1,$hi1,v0 #U0
|
| -
|
| - umulh $aj,$bi,$ahi #U1
|
| - cmpult $lo0,$tj,AT #L0
|
| - addq $lo1,$lo0,$lo1 #L1
|
| - addq $nhi,v0,$hi1 #U0
|
| -
|
| - umulh $nj,$m1,$nhi #U1
|
| - s8addq $j,$ap,$aj #L0
|
| - cmpult $lo1,$lo0,v0 #L1
|
| - cmplt $j,$num,$tj #U0 # borrow $tj
|
| -
|
| - addq $hi0,AT,$hi0 #L0
|
| - addq $hi1,v0,$hi1 #U1
|
| - stq $lo1,-8($tp) #L1
|
| - bne $tj,.Linner #U0
|
| - .set reorder
|
| -
|
| - ldq $tj,8($tp)
|
| - addq $alo,$hi0,$lo0
|
| - addq $nlo,$hi1,$lo1
|
| - cmpult $lo0,$hi0,AT
|
| - cmpult $lo1,$hi1,v0
|
| - addq $ahi,AT,$hi0
|
| - addq $nhi,v0,$hi1
|
| -
|
| - addq $lo0,$tj,$lo0
|
| - cmpult $lo0,$tj,AT
|
| - addq $hi0,AT,$hi0
|
| -
|
| - ldq $tj,16($tp)
|
| - addq $lo1,$lo0,$j
|
| - cmpult $j,$lo0,v0
|
| - addq $hi1,v0,$hi1
|
| -
|
| - addq $hi1,$hi0,$lo1
|
| - stq $j,0($tp)
|
| - cmpult $lo1,$hi0,$hi1
|
| - addq $lo1,$tj,$lo1
|
| - cmpult $lo1,$tj,AT
|
| - addl $i,1,$i
|
| - addq $hi1,AT,$hi1
|
| - stq $lo1,8($tp)
|
| - cmplt $i,$num,$tj # borrow $tj
|
| - stq $hi1,16($tp)
|
| - bne $tj,.Louter
|
| -
|
| - s8addq $num,sp,$tj # &tp[num]
|
| - mov $rp,$bp # put rp aside
|
| - mov sp,$tp
|
| - mov sp,$ap
|
| - mov 0,$hi0 # clear borrow bit
|
| -
|
| -.align 4
|
| -.Lsub: ldq $lo0,0($tp)
|
| - ldq $lo1,0($np)
|
| - lda $tp,8($tp)
|
| - lda $np,8($np)
|
| - subq $lo0,$lo1,$lo1 # tp[i]-np[i]
|
| - cmpult $lo0,$lo1,AT
|
| - subq $lo1,$hi0,$lo0
|
| - cmpult $lo1,$lo0,$hi0
|
| - or $hi0,AT,$hi0
|
| - stq $lo0,0($rp)
|
| - cmpult $tp,$tj,v0
|
| - lda $rp,8($rp)
|
| - bne v0,.Lsub
|
| -
|
| - subq $hi1,$hi0,$hi0 # handle upmost overflow bit
|
| - mov sp,$tp
|
| - mov $bp,$rp # restore rp
|
| -
|
| - and sp,$hi0,$ap
|
| - bic $bp,$hi0,$bp
|
| - bis $bp,$ap,$ap # ap=borrow?tp:rp
|
| -
|
| -.align 4
|
| -.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
|
| - lda $tp,8($tp)
|
| - lda $rp,8($rp)
|
| - lda $ap,8($ap)
|
| - stq zero,-8($tp) # zap tp
|
| - cmpult $tp,$tj,AT
|
| - stq $aj,-8($rp)
|
| - bne AT,.Lcopy
|
| - mov 1,v0
|
| -
|
| -.Lexit:
|
| - .set noreorder
|
| - mov fp,sp
|
| - /*ldq ra,0(sp)*/
|
| - ldq s3,8(sp)
|
| - ldq s4,16(sp)
|
| - ldq s5,24(sp)
|
| - ldq fp,32(sp)
|
| - lda sp,48(sp)
|
| - ret (ra)
|
| -.end bn_mul_mont
|
| -.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
|
| -.align 2
|
| -___
|
| -
|
| -print $code;
|
| -close STDOUT;
|
|
|