| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # ==================================================================== | |
| 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
| 5 # project. The module is, however, dual licensed under OpenSSL and | |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further | |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. | |
| 8 # ==================================================================== | |
| 9 | |
| 10 # April 2007. | |
| 11 # | |
| 12 # Performance improvement over vanilla C code varies from 85% to 45% | |
| 13 # depending on key length and benchmark. Unfortunately in this context | |
| 14 # these are not very impressive results [for code that utilizes "wide" | |
| 15 # 64x64=128-bit multiplication, which is not commonly available to C | |
| 16 # programmers], at least hand-coded bn_asm.c replacement is known to | |
| 17 # provide 30-40% better results for the longest keys. Well, on second | |
| 18 # thought it's not very surprising, because z-CPUs are single-issue | |
| 19 # and _strictly_ in-order execution, while bn_mul_mont is more or less | |
| 20 # dependent on CPU ability to pipe-line instructions and have several | |
| 21 # of them "in-flight" at the same time. I mean while other methods, | |
| 22 # for example Karatsuba, aim to minimize the number of multiplications at | |
| 23 # the cost of an increase in other operations, bn_mul_mont aims to neatly | |
| 24 # "overlap" multiplications and the other operations [and on most | |
| 25 # platforms even minimize the amount of the other operations, in | |
| 26 # particular references to memory]. But it's possible to improve this | |
| 27 # module performance by implementing dedicated squaring code-path and | |
| 28 # possibly by unrolling loops... | |
| 29 | |
| 30 # January 2009. | |
| 31 # | |
| 32 # Reschedule to minimize/avoid Address Generation Interlock hazard, | |
| 33 # make inner loops counter-based. | |
| 34 | |
| 35 # November 2010. | |
| 36 # | |
| 37 # Adapt for -m31 build. If kernel supports what's called "highgprs" | |
| 38 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | |
| 39 # instructions and achieve "64-bit" performance even in 31-bit legacy | |
| 40 # application context. The feature is not specific to any particular | |
| 41 # processor, as long as it's "z-CPU". The latter implies that the code | |
| 42 # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG | |
| 43 # is achieved by swapping words after 64-bit loads, follow _dswap-s. | |
| 44 # On z990 it was measured to perform 2.6-2.2 times better than | |
| 45 # compiler-generated code, less for longer keys... | |
| 46 | |
# Command line: <flavour> [...] [<output-file>]
#
# A flavour containing "31" or "32" selects 4-byte pointers/BN_ULONG and
# the plain load/store mnemonics (st, stm, l, lm, cl); any other flavour,
# or none at all, selects 8-byte values and the "g"-suffixed mnemonics.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip arguments until one looks like a file name ("name.ext"); that one
# becomes the output file.  $output is left false when there is none.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named; otherwise
# the generated assembly goes to the inherited STDOUT.  The three-argument
# form keeps the name from ever being parsed as part of the mode, and a
# failed open aborts instead of silently producing no file.
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Frame size used by the generated code: 16 slots of $SIZE_T bytes plus
# 4*8 further bytes.
$stdframe=16*$SIZE_T+4*8;
| 61 | |
# Register allocation for the generated bn_mul_mont code.

# Per-iteration multiplier ("tp[0]"*n0) and the word count.
($mn0,$num)=map("%r$_",0..1);

# Incoming arguments of
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# occupy %r2..%r6.  num is not among them: it is pulled from the stack
# (historically "160(%r15)") into $num by the generated prologue.
($rp,$ap,$bp,$np,$n0)=map("%r$_",2..6);

# $bi shares %r2 with $rp, i.e. loading bp[i] zaps rp.
($bi,$j)=('%r2','%r7');

# High/low halves of the ap[]*bp[] and np[]*m1 products, the running high
# words, the inner-loop counter and the stack pointer.
($ahi,$alo,$nhi,$nlo,$AHI,$NHI,$count,$sp)=map("%r$_",8..15);
| 84 | |
# Entry of bn_mul_mont: pull num from the stack, scale it to bytes, point
# $bp at &bp[num], stash %r2 (rp) in its save slot, and return 0 right
# away when num is below the supported minimum.  Backtick expressions are
# evaluated later by the post-processing loop at the end of this script.
$code.=<<___;
.text
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)

st${g} %r2,2*$SIZE_T($sp)

cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
___

# Exactly one extra bail-out check is emitted, chosen by flavour.
if ($flavour =~ /3[12]/) {
	# 31/32-bit flavours: refuse an odd number of words.
$code.=<<___;
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
} else {
	# All other flavours: refuse operands above the upper size limit.
$code.=<<___;
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
}
# Main body of bn_mul_mont, appended after the argument checks.
#
# Pseudo-syntax resolved by the post-processing loop at the end of this
# script rather than by the assembler:
#   `expr`      constant expression, evaluated by Perl at generation time
#   _dswap reg  becomes "rllg reg,reg,32" when $SIZE_T==4 and disappears
#               otherwise
#
# Outline, following the labels below:
#   prologue          save %r3-%r15, carve tp[] (num words plus room for
#                     the carry) out of the stack, store the back chain
#   .L1st             first pass over ap[]/np[] using bp[0]
#   .Louter/.Linner   remaining bp[i], accumulating into tp[]
#   .Lsub             rp[] = tp[] - np[], propagating the borrow
#   .Lcopy            copy tp[] or refresh rp[] in place, depending on
#                     the borrow, and zap tp[]
#   epilogue          restore %r6-%r15 and return 1 ("processed")
#
# The .string operand must stay on one physical line: the text is split on
# newlines and printed verbatim, so a line break inside the quotes would
# reach the assembler as an unterminated string.
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)

lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
st${g} %r0,0($sp) # back chain

sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0

lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi

lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0

lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi

la $j,8(%r0) # j=1
lr $count,$num

.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi

lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI # +="tp[j]"
algr $nlo,$alo
alcgr $NHI,$nhi

stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st

algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++

.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi

lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0

lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi

la $j,8(%r0) # j=1
lr $count,$num

.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi

lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi

stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner

algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)

la $bp,8($bp) # bp++
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter

l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"

la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry

ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp

la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy

la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
| 271 | |
# Post-process the accumulated assembly one line at a time and print it.
foreach (split("\n",$code)) {
	# Replace every `expr` span with the value Perl computes for it.
	s/\`([^\`]*)\`/eval $1/ge;

	# Expand the _dswap pseudo-instruction: with 4-byte BN_ULONG it
	# becomes a 32-bit rotate of the register onto itself; otherwise it
	# is replaced by nothing.  The empty string is spelled out rather
	# than left to the value of a false statement modifier.
	s/_dswap\s+(%r[0-9]+)/$SIZE_T==4 ? sprintf("rllg\t%s,%s,32",$1,$1) : ""/e;

	print $_,"\n";
}

# Buffered write errors (full disk, closed pipe, ...) are only reported
# when the handle is closed, so an unchecked close could leave a truncated
# assembly file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";
| OLD | NEW |