OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env perl |
| 2 |
| 3 # ==================================================================== |
| 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 # project. The module is, however, dual licensed under OpenSSL and |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 # ==================================================================== |
| 9 |
| 10 # April 2007. |
| 11 # |
| 12 # Performance improvement over vanilla C code varies from 85% to 45% |
| 13 # depending on key length and benchmark. Unfortunately in this context |
| 14 # these are not very impressive results [for code that utilizes "wide" |
| 15 # 64x64=128-bit multiplication, which is not commonly available to C |
| 16 # programmers], at least hand-coded bn_asm.c replacement is known to |
| 17 # provide 30-40% better results for longest keys. Well, on second |
| 18 # thought it's not very surprising, because z-CPUs are single-issue |
| 19 # and _strictly_ in-order execution, while bn_mul_mont is more or less |
| 20 # dependent on CPU ability to pipe-line instructions and have several |
| 21 # of them "in-flight" at the same time. I mean while other methods, |
| 22 # for example Karatsuba, aim to minimize the number of multiplications at |
| 23 # the cost of an increase in other operations, bn_mul_mont aims to neatly |
| 24 # "overlap" multiplications and the other operations [and on most |
| 25 # platforms even minimize the amount of the other operations, in |
| 26 # particular references to memory]. But it's possible to improve this |
| 27 # module performance by implementing dedicated squaring code-path and |
| 28 # possibly by unrolling loops... |
| 29 |
| 30 # January 2009. |
| 31 # |
| 32 # Reschedule to minimize/avoid Address Generation Interlock hazard, |
| 33 # make inner loops counter-based. |
| 34 |
| 35 $mn0 = '%r0'; # m1 = "tp[0]"*n0, the multiplier applied to np[] |
| 36 $num = '%r1'; # word count, pulled from the stack on entry |
| 37 |
| 38 # Registers carrying the arguments of: int bn_mul_mont( |
| 39 $rp = '%r2'; # BN_ULONG *rp, |
| 40 $ap = '%r3'; # const BN_ULONG *ap, |
| 41 $bp = '%r4'; # const BN_ULONG *bp, |
| 42 $np = '%r5'; # const BN_ULONG *np, |
| 43 $n0 = '%r6'; # const BN_ULONG *n0, |
| 44 # $num has no argument register: stack slot at 160(%r15) # int num); |
| 45 |
| 46 $bi = '%r2'; # current bp[i]; shares %r2 with rp, which is reloaded later |
| 47 $j = '%r7'; # byte offset used to index ap[], np[] and tp[] |
| 48 |
| 49 $ahi = '%r8'; # high half of ap[j]*bp[i] |
| 50 $alo = '%r9'; # low half of ap[j]*bp[i] |
| 51 $nhi = '%r10'; # high half of np[j]*m1 |
| 52 $nlo = '%r11'; # low half of np[j]*m1 |
| 53 $AHI = '%r12'; # carry word of the ap[]*bp[i] chain |
| 54 $NHI = '%r13'; # carry word of the np[]*m1 chain |
| 55 $count = '%r14'; # loop counter consumed by brct |
| 56 $sp = '%r15'; # stack pointer |
| 57 |
# Append the s390x assembly for
#
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# to $code.  The heredoc interpolates, so every $name below expands to the
# register assigned to that Perl variable earlier in this script.
#
# Outline of the emitted routine:
#  * num is loaded as a 32-bit value from 164(%r15) and scaled to a byte
#    count.  The routine returns 0 immediately when that byte count is
#    below 16 or above 96, i.e. num < 2 or num > 12 eight-byte words.
#  * rp is stored at 16(%r15) and %r3-%r15 at 24(%r15), both relative to the
#    incoming stack pointer.  bp is advanced to &bp[num] before the save so
#    the saved copy can serve as the outer loop's end marker.
#  * A new frame of 160+8+8*num bytes is carved out below the incoming
#    stack pointer; tp[], the num+1 word accumulator, starts at 160(%r15)
#    of the new frame and its last word holds the upmost overflow bit.
#  * .L1st handles bp[0]; .Louter/.Linner handle bp[1..num-1].  Each pass
#    forms m1 = tp[0]*n0 and accumulates ap[j]*bp[i] + np[j]*m1, storing
#    each result word one slot lower (tp[j-1]).
#  * .Lsub writes tp - np to rp.  The final borrow is folded into the
#    upmost overflow word to form a mask that selects tp or rp as the
#    source for .Lcopy, which writes the result to rp and overwrites tp[].
#  * %r6-%r15 are reloaded from the save area and 1 is returned in %r2.
#
# The .string identification literal must stay on a single line so that
# its closing quote terminates it.
| 58 $code.=<<___; |
| 59 .text |
| 60 .globl bn_mul_mont |
| 61 .type bn_mul_mont,\@function |
| 62 bn_mul_mont: |
| 63 lgf $num,164($sp) # pull $num |
| 64 sla $num,3 # $num to enumerate bytes |
| 65 la $bp,0($num,$bp) |
| 66 |
| 67 stg %r2,16($sp) |
| 68 |
| 69 cghi $num,16 # |
| 70 lghi %r2,0 # |
| 71 blr %r14 # if($num<16) return 0; |
| 72 cghi $num,96 # |
| 73 bhr %r14 # if($num>96) return 0; |
| 74 |
| 75 stmg %r3,%r15,24($sp) |
| 76 |
| 77 lghi $rp,-160-8 # leave room for carry bit |
| 78 lcgr $j,$num # -$num |
| 79 lgr %r0,$sp |
| 80 la $rp,0($rp,$sp) |
| 81 la $sp,0($j,$rp) # alloca |
| 82 stg %r0,0($sp) # back chain |
| 83 |
| 84 sra $num,3 # restore $num |
| 85 la $bp,0($j,$bp) # restore $bp |
| 86 ahi $num,-1 # adjust $num for inner loop |
| 87 lg $n0,0($n0) # pull n0 |
| 88 |
| 89 lg $bi,0($bp) |
| 90 lg $alo,0($ap) |
| 91 mlgr $ahi,$bi # ap[0]*bp[0] |
| 92 lgr $AHI,$ahi |
| 93 |
| 94 lgr $mn0,$alo # "tp[0]"*n0 |
| 95 msgr $mn0,$n0 |
| 96 |
| 97 lg $nlo,0($np) # |
| 98 mlgr $nhi,$mn0 # np[0]*m1 |
| 99 algr $nlo,$alo # +="tp[0]" |
| 100 lghi $NHI,0 |
| 101 alcgr $NHI,$nhi |
| 102 |
| 103 la $j,8(%r0) # j=1 |
| 104 lr $count,$num |
| 105 |
| 106 .align 16 |
| 107 .L1st: |
| 108 lg $alo,0($j,$ap) |
| 109 mlgr $ahi,$bi # ap[j]*bp[0] |
| 110 algr $alo,$AHI |
| 111 lghi $AHI,0 |
| 112 alcgr $AHI,$ahi |
| 113 |
| 114 lg $nlo,0($j,$np) |
| 115 mlgr $nhi,$mn0 # np[j]*m1 |
| 116 algr $nlo,$NHI |
| 117 lghi $NHI,0 |
| 118 alcgr $nhi,$NHI # +="tp[j]" |
| 119 algr $nlo,$alo |
| 120 alcgr $NHI,$nhi |
| 121 |
| 122 stg $nlo,160-8($j,$sp) # tp[j-1]= |
| 123 la $j,8($j) # j++ |
| 124 brct $count,.L1st |
| 125 |
| 126 algr $NHI,$AHI |
| 127 lghi $AHI,0 |
| 128 alcgr $AHI,$AHI # upmost overflow bit |
| 129 stg $NHI,160-8($j,$sp) |
| 130 stg $AHI,160($j,$sp) |
| 131 la $bp,8($bp) # bp++ |
| 132 |
| 133 .Louter: |
| 134 lg $bi,0($bp) # bp[i] |
| 135 lg $alo,0($ap) |
| 136 mlgr $ahi,$bi # ap[0]*bp[i] |
| 137 alg $alo,160($sp) # +=tp[0] |
| 138 lghi $AHI,0 |
| 139 alcgr $AHI,$ahi |
| 140 |
| 141 lgr $mn0,$alo |
| 142 msgr $mn0,$n0 # tp[0]*n0 |
| 143 |
| 144 lg $nlo,0($np) # np[0] |
| 145 mlgr $nhi,$mn0 # np[0]*m1 |
| 146 algr $nlo,$alo # +="tp[0]" |
| 147 lghi $NHI,0 |
| 148 alcgr $NHI,$nhi |
| 149 |
| 150 la $j,8(%r0) # j=1 |
| 151 lr $count,$num |
| 152 |
| 153 .align 16 |
| 154 .Linner: |
| 155 lg $alo,0($j,$ap) |
| 156 mlgr $ahi,$bi # ap[j]*bp[i] |
| 157 algr $alo,$AHI |
| 158 lghi $AHI,0 |
| 159 alcgr $ahi,$AHI |
| 160 alg $alo,160($j,$sp)# +=tp[j] |
| 161 alcgr $AHI,$ahi |
| 162 |
| 163 lg $nlo,0($j,$np) |
| 164 mlgr $nhi,$mn0 # np[j]*m1 |
| 165 algr $nlo,$NHI |
| 166 lghi $NHI,0 |
| 167 alcgr $nhi,$NHI |
| 168 algr $nlo,$alo # +="tp[j]" |
| 169 alcgr $NHI,$nhi |
| 170 |
| 171 stg $nlo,160-8($j,$sp) # tp[j-1]= |
| 172 la $j,8($j) # j++ |
| 173 brct $count,.Linner |
| 174 |
| 175 algr $NHI,$AHI |
| 176 lghi $AHI,0 |
| 177 alcgr $AHI,$AHI |
| 178 alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit |
| 179 lghi $ahi,0 |
| 180 alcgr $AHI,$ahi # new upmost overflow bit |
| 181 stg $NHI,160-8($j,$sp) |
| 182 stg $AHI,160($j,$sp) |
| 183 |
| 184 la $bp,8($bp) # bp++ |
| 185 clg $bp,160+8+32($j,$sp) # compare to &bp[num] |
| 186 jne .Louter |
| 187 |
| 188 lg $rp,160+8+16($j,$sp) # reincarnate rp |
| 189 la $ap,160($sp) |
| 190 ahi $num,1 # restore $num, incidentally clears "borrow" |
| 191 |
| 192 la $j,0(%r0) |
| 193 lr $count,$num |
| 194 .Lsub: lg $alo,0($j,$ap) |
| 195 slbg $alo,0($j,$np) |
| 196 stg $alo,0($j,$rp) |
| 197 la $j,8($j) |
| 198 brct $count,.Lsub |
| 199 lghi $ahi,0 |
| 200 slbgr $AHI,$ahi # handle upmost carry |
| 201 |
| 202 ngr $ap,$AHI |
| 203 lghi $np,-1 |
| 204 xgr $np,$AHI |
| 205 ngr $np,$rp |
| 206 ogr $ap,$np # ap=borrow?tp:rp |
| 207 |
| 208 la $j,0(%r0) |
| 209 lgr $count,$num |
| 210 .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh |
| 211 stg $j,160($j,$sp) # zap tp |
| 212 stg $alo,0($j,$rp) |
| 213 la $j,8($j) |
| 214 brct $count,.Lcopy |
| 215 |
| 216 la %r1,160+8+48($j,$sp) |
| 217 lmg %r6,%r15,0(%r1) |
| 218 lghi %r2,1 # signal "processed" |
| 219 br %r14 |
| 220 .size bn_mul_mont,.-bn_mul_mont |
| 221 .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| 222 ___ |
| 223 |
# Write the accumulated assembly text to STDOUT.  The close is checked
# because a failed buffered write (for example a full disk or a closed
# pipe) is only reported when the handle is flushed; ignoring the result
# would leave truncated output behind with a successful exit status.
| 224 print $code; |
| 225 close STDOUT or die "error closing STDOUT: $!"; |
OLD | NEW |