OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env perl |
| 2 |
| 3 # ==================================================================== |
| 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
| 5 # project. The module is, however, dual licensed under OpenSSL and |
| 6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
| 7 # details see http://www.openssl.org/~appro/cryptogams/. |
| 8 # ==================================================================== |
| 9 |
| 10 # October 2005 |
| 11 # |
| 12 # This is a "teaser" code, as it can be improved in several ways... |
| 13 # First of all non-SSE2 path should be implemented (yes, for now it |
| 14 # performs Montgomery multiplication/convolution only on SSE2-capable |
| 15 # CPUs such as P4, others fall down to original code). Then inner loop |
| 16 # can be unrolled and modulo-scheduled to improve ILP and possibly |
| 17 # moved to 128-bit XMM register bank (though it would require input |
| 18 # rearrangement and/or increase bus bandwidth utilization). Dedicated |
| 19 # squaring procedure should give further performance improvement... |
| 20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by |
| 21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) |
| 22 |
| 23 # December 2006 |
| 24 # |
| 25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement. |
| 26 # Integer-only code [being equipped with dedicated squaring procedure] |
| 27 # gives ~40% on rsa512 sign benchmark... |
| 28 |
| 29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path, trailing separator included |
| 30 push(@INC,"${dir}","${dir}../../perlasm"); # search the script's own directory and ../../perlasm relative to it |
| 31 require "x86asm.pl"; # presumably supplies asm_init, DWP, label and the &<mnemonic> emitters used below |
| 32 |
| 33 &asm_init($ARGV[0],$0); # first command-line argument and the script name go to the framework |
| 34 |
| 35 $sse2=0; |
| 36 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # any argument containing this define enables generation of the SSE2 path |
| 37 |
| 38 &external_label("OPENSSL_ia32cap_P") if ($sse2); # symbol is read by the SSE2 path's run-time capability test |
| 39 |
| 40 &function_begin("bn_mul_mont"); |
| 41 # Symbolic register names; these are plain Perl strings substituted into the emitted instructions. |
| 42 $i="edx"; |
| 43 $j="ecx"; |
| 44 $ap="esi"; $tp="esi"; # overlapping variables!!! |
| 45 $rp="edi"; $bp="edi"; # overlapping variables!!! |
| 46 $np="ebp"; |
| 47 $num="ebx"; |
| 48 # Memory operands for the eight 32-bit slots at the bottom of the frame; the tp vector starts at $frame(%esp). |
| 49 $_num=&DWP(4*0,"esp"); # stack top layout |
| 50 $_rp=&DWP(4*1,"esp"); |
| 51 $_ap=&DWP(4*2,"esp"); |
| 52 $_bp=&DWP(4*3,"esp"); |
| 53 $_np=&DWP(4*4,"esp"); |
| 54 $_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); # same slot, 32-bit and 64-bit operand forms |
| 55 $_sp=&DWP(4*6,"esp"); |
| 56 $_bpend=&DWP(4*7,"esp"); |
| 57 $frame=32; # size of above frame rounded up to 16n |
| 58 |
| 59 &xor ("eax","eax"); # eax=0 is the return value if we bail out below |
| 60 &mov ("edi",&wparam(5)); # int num |
| 61 &cmp ("edi",4); |
| 62 &jl (&label("just_leave")); # num<4: leave with eax==0 |
| 63 |
| 64 &lea ("esi",&wparam(0)); # put aside pointer to argument block |
| 65 &lea ("edx",&wparam(1)); # lea: address of the ap argument slot, ap itself is not loaded |
| 66 &mov ("ebp","esp"); # saved stack pointer! |
| 67 &add ("edi",2); # extra two words on top of tp |
| 68 &neg ("edi"); |
| 69 &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) |
| 70 &neg ("edi"); # edi is num+2 again |
| 71 |
| 72 # minimize cache contention by arranging 2K window between stack |
| 73 # pointer and ap argument [np is also position sensitive vector, |
| 74 # but it's assumed to be near ap, as it's allocated at ~same |
| 75 # time]. |
| 76 &mov ("eax","esp"); |
| 77 &sub ("eax","edx"); |
| 78 &and ("eax",2047); |
| 79 &sub ("esp","eax"); # this aligns sp and ap modulo 2048 |
| 80 |
| 81 &xor ("edx","esp"); |
| 82 &and ("edx",2048); |
| 83 &xor ("edx",2048); |
| 84 &sub ("esp","edx"); # this splits them apart modulo 4096 |
| 85 |
| 86 &and ("esp",-64); # align to cache line |
| 87 |
| 88 ################################# load argument block... |
| 89 &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp |
| 90 &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap |
| 91 &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp |
| 92 &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np |
| 93 &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 |
| 94 #&mov ("edi",&DWP(5*4,"esi"));# int num |
| 95 |
| 96 &mov ("esi",&DWP(0,"esi")); # pull n0[0] |
| 97 &mov ($_rp,"eax"); # ... save a copy of argument block |
| 98 &mov ($_ap,"ebx"); |
| 99 &mov ($_bp,"ecx"); |
| 100 &mov ($_np,"edx"); |
| 101 &mov ($_n0,"esi"); |
| 102 &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling (edi holds num+2 here) |
| 103 #&mov ($_num,$num); # redundant as $num is not reused |
| 104 &mov ($_sp,"ebp"); # saved stack pointer! |
| 105 |
| 106 if($sse2) { # emitted only when -DOPENSSL_IA32_SSE2 was on the command line |
| 107 $acc0="mm0"; # mmx register bank layout |
| 108 $acc1="mm1"; |
| 109 $car0="mm2"; # running carry of the ap[]*bp[i] column sums |
| 110 $car1="mm3"; # running carry of the np[]*m1 column sums |
| 111 $mul0="mm4"; # bp[i] |
| 112 $mul1="mm5"; # ap[0]*bp[i] first, then multiplied by n0 and used as m1 |
| 113 $temp="mm6"; |
| 114 $mask="mm7"; |
| 115 |
| 116 &picmeup("eax","OPENSSL_ia32cap_P"); # presumably leaves the address of OPENSSL_ia32cap_P in eax (helper is outside this file) |
| 117 &bt (&DWP(0,"eax"),26); # test bit 26 of the first capability word |
| 118 &jnc (&label("non_sse2")); # bit clear: use the integer-only code below |
| 119 |
| 120 &mov ("eax",-1); |
| 121 &movd ($mask,"eax"); # mask 32 lower bits |
| 122 |
| 123 &mov ($ap,$_ap); # load input pointers |
| 124 &mov ($bp,$_bp); |
| 125 &mov ($np,$_np); |
| 126 |
| 127 &xor ($i,$i); # i=0 |
| 128 &xor ($j,$j); # j=0 |
| 129 # First pass (i=0): tp is not read yet, it is only written. |
| 130 &movd ($mul0,&DWP(0,$bp)); # bp[0] |
| 131 &movd ($mul1,&DWP(0,$ap)); # ap[0] |
| 132 &movd ($car1,&DWP(0,$np)); # np[0] |
| 133 |
| 134 &pmuludq($mul1,$mul0); # ap[0]*bp[0] |
| 135 &movq ($car0,$mul1); |
| 136 &movq ($acc0,$mul1); # I wish movd worked for |
| 137 &pand ($acc0,$mask); # inter-register transfers |
| 138 |
| 139 &pmuludq($mul1,$_n0q); # *=n0 |
| 140 |
| 141 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 |
| 142 &paddq ($car1,$acc0); |
| 143 |
| 144 &movd ($acc1,&DWP(4,$np)); # np[1] |
| 145 &movd ($acc0,&DWP(4,$ap)); # ap[1] |
| 146 |
| 147 &psrlq ($car0,32); |
| 148 &psrlq ($car1,32); |
| 149 |
| 150 &inc ($j); # j++ |
| 151 &set_label("1st",16); |
| 152 &pmuludq($acc0,$mul0); # ap[j]*bp[0] |
| 153 &pmuludq($acc1,$mul1); # np[j]*m1 |
| 154 &paddq ($car0,$acc0); # +=c0 |
| 155 &paddq ($car1,$acc1); # +=c1 |
| 156 |
| 157 &movq ($acc0,$car0); |
| 158 &pand ($acc0,$mask); |
| 159 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] |
| 160 &paddq ($car1,$acc0); # +=ap[j]*bp[0]; |
| 161 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] |
| 162 &psrlq ($car0,32); |
| 163 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= |
| 164 &psrlq ($car1,32); |
| 165 |
| 166 &lea ($j,&DWP(1,$j)); |
| 167 &cmp ($j,$num); # $num register holds num-1 |
| 168 &jl (&label("1st")); |
| 169 # Tail of the first pass: the last products were preloaded by the final loop iteration. |
| 170 &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] |
| 171 &pmuludq($acc1,$mul1); # np[num-1]*m1 |
| 172 &paddq ($car0,$acc0); # +=c0 |
| 173 &paddq ($car1,$acc1); # +=c1 |
| 174 |
| 175 &movq ($acc0,$car0); |
| 176 &pand ($acc0,$mask); |
| 177 &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; |
| 178 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= |
| 179 |
| 180 &psrlq ($car0,32); |
| 181 &psrlq ($car1,32); |
| 182 |
| 183 &paddq ($car1,$car0); |
| 184 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] |
| 185 |
| 186 &inc ($i); # i++ |
| 187 &set_label("outer"); |
| 188 &xor ($j,$j); # j=0 |
| 189 |
| 190 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] |
| 191 &movd ($mul1,&DWP(0,$ap)); # ap[0] |
| 192 &movd ($temp,&DWP($frame,"esp")); # tp[0] |
| 193 &movd ($car1,&DWP(0,$np)); # np[0] |
| 194 &pmuludq($mul1,$mul0); # ap[0]*bp[i] |
| 195 |
| 196 &paddq ($mul1,$temp); # +=tp[0] |
| 197 &movq ($acc0,$mul1); |
| 198 &movq ($car0,$mul1); |
| 199 &pand ($acc0,$mask); |
| 200 |
| 201 &pmuludq($mul1,$_n0q); # *=n0 |
| 202 |
| 203 &pmuludq($car1,$mul1); # np[0]*m1 |
| 204 &paddq ($car1,$acc0); |
| 205 |
| 206 &movd ($temp,&DWP($frame+4,"esp")); # tp[1] |
| 207 &movd ($acc1,&DWP(4,$np)); # np[1] |
| 208 &movd ($acc0,&DWP(4,$ap)); # ap[1] |
| 209 |
| 210 &psrlq ($car0,32); |
| 211 &psrlq ($car1,32); |
| 212 &paddq ($car0,$temp); # +=tp[1] |
| 213 |
| 214 &inc ($j); # j++ |
| 215 &dec ($num); # $num doubles as the inner-loop countdown; it is restored from $j after the loop |
| 216 &set_label("inner"); |
| 217 &pmuludq($acc0,$mul0); # ap[j]*bp[i] |
| 218 &pmuludq($acc1,$mul1); # np[j]*m1 |
| 219 &paddq ($car0,$acc0); # +=c0 |
| 220 &paddq ($car1,$acc1); # +=c1 |
| 221 |
| 222 &movq ($acc0,$car0); |
| 223 &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] |
| 224 &pand ($acc0,$mask); |
| 225 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] |
| 226 &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] |
| 227 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] |
| 228 &psrlq ($car0,32); |
| 229 &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= |
| 230 &psrlq ($car1,32); |
| 231 &paddq ($car0,$temp); # +=tp[j+1] |
| 232 |
| 233 &dec ($num); # sets ZF for the jnz below; the lea in between leaves flags alone |
| 234 &lea ($j,&DWP(1,$j)); # j++ |
| 235 &jnz (&label("inner")); |
| 236 |
| 237 &mov ($num,$j); # restore $num to num-1 |
| 238 &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] |
| 239 &pmuludq($acc1,$mul1); # np[num-1]*m1 |
| 240 &paddq ($car0,$acc0); # +=c0 |
| 241 &paddq ($car1,$acc1); # +=c1 |
| 242 |
| 243 &movq ($acc0,$car0); |
| 244 &pand ($acc0,$mask); |
| 245 &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] |
| 246 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= |
| 247 &psrlq ($car0,32); |
| 248 &psrlq ($car1,32); |
| 249 |
| 250 &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] |
| 251 &paddq ($car1,$car0); |
| 252 &paddq ($car1,$temp); |
| 253 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] |
| 254 |
| 255 &lea ($i,&DWP(1,$i)); # i++ |
| 256 &cmp ($i,$num); |
| 257 &jle (&label("outer")); # repeat while i<=num-1 |
| 258 |
| 259 &emms (); # done with mmx bank |
| 260 &jmp (&label("common_tail")); |
| 261 |
| 262 &set_label("non_sse2",16); |
| 263 } |
| 264 |
| 265 if (0) { # disabled generator branch: it would emit an immediate "return 0" instead of the integer code |
| 266 &mov ("esp",$_sp); |
| 267 &xor ("eax","eax"); # signal "not fast enough [yet]" |
| 268 &jmp (&label("just_leave")); |
| 269 # While the below code provides competitive performance for |
| 270 # all key lengths on modern Intel cores, it's still more |
| 271 # than 10% slower for 4096-bit key elsewhere:-( "Competitive" |
| 272 # means compared to the original integer-only assembler. |
| 273 # 512-bit RSA sign is better by ~40%, but that's about all |
| 274 # one can say about all CPUs... |
| 275 } else { |
| 276 $inp="esi"; # integer path uses these registers differently |
| 277 $word="edi"; |
| 278 $carry="ebp"; |
| 279 |
| 280 &mov ($inp,$_ap); |
| 281 &lea ($carry,&DWP(1,$num)); # $num register holds num-1, so this is num |
| 282 &mov ($word,$_bp); |
| 283 &xor ($j,$j); # j=0 |
| 284 &mov ("edx",$inp); |
| 285 &and ($carry,1); # see if num is even |
| 286 &sub ("edx",$word); # see if ap==bp |
| 287 &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] |
| 288 &or ($carry,"edx"); # zero only when num is even and ap==bp |
| 289 &mov ($word,&DWP(0,$word)); # bp[0] |
| 290 &jz (&label("bn_sqr_mont")); # uses ZF from the or above |
| 291 &mov ($_bpend,"eax"); |
| 292 &mov ("eax",&DWP(0,$inp)); |
| 293 &xor ("edx","edx"); |
| 294 |
| 295 &set_label("mull",16); |
| 296 &mov ($carry,"edx"); |
| 297 &mul ($word); # ap[j]*bp[0] |
| 298 &add ($carry,"eax"); |
| 299 &lea ($j,&DWP(1,$j)); |
| 300 &adc ("edx",0); |
| 301 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] |
| 302 &cmp ($j,$num); |
| 303 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= |
| 304 &jl (&label("mull")); |
| 305 |
| 306 &mov ($carry,"edx"); |
| 307 &mul ($word); # ap[num-1]*bp[0] |
| 308 &mov ($word,$_n0); |
| 309 &add ("eax",$carry); |
| 310 &mov ($inp,$_np); |
| 311 &adc ("edx",0); |
| 312 &imul ($word,&DWP($frame,"esp")); # n0*tp[0] |
| 313 |
| 314 &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= |
| 315 &xor ($j,$j); |
| 316 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= |
| 317 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= |
| 318 |
| 319 &mov ("eax",&DWP(0,$inp)); # np[0] |
| 320 &mul ($word); # np[0]*m |
| 321 &add ("eax",&DWP($frame,"esp")); # +=tp[0] |
| 322 &mov ("eax",&DWP(4,$inp)); # np[1] |
| 323 &adc ("edx",0); # only the carry of the addition above is kept; eax was already reloaded |
| 324 &inc ($j); |
| 325 |
| 326 &jmp (&label("2ndmadd")); |
| 327 |
| 328 &set_label("1stmadd",16); |
| 329 &mov ($carry,"edx"); |
| 330 &mul ($word); # ap[j]*bp[i] |
| 331 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] |
| 332 &lea ($j,&DWP(1,$j)); |
| 333 &adc ("edx",0); |
| 334 &add ($carry,"eax"); |
| 335 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] |
| 336 &adc ("edx",0); |
| 337 &cmp ($j,$num); |
| 338 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= |
| 339 &jl (&label("1stmadd")); |
| 340 |
| 341 &mov ($carry,"edx"); |
| 342 &mul ($word); # ap[num-1]*bp[i] |
| 343 &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] |
| 344 &mov ($word,$_n0); |
| 345 &adc ("edx",0); |
| 346 &mov ($inp,$_np); |
| 347 &add ($carry,"eax"); |
| 348 &adc ("edx",0); |
| 349 &imul ($word,&DWP($frame,"esp")); # n0*tp[0] |
| 350 |
| 351 &xor ($j,$j); |
| 352 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] |
| 353 &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= |
| 354 &adc ($j,0); |
| 355 &mov ("eax",&DWP(0,$inp)); # np[0] |
| 356 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= |
| 357 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= |
| 358 |
| 359 &mul ($word); # np[0]*m |
| 360 &add ("eax",&DWP($frame,"esp")); # +=tp[0] |
| 361 &mov ("eax",&DWP(4,$inp)); # np[1] |
| 362 &adc ("edx",0); |
| 363 &mov ($j,1); |
| 364 |
| 365 &set_label("2ndmadd",16); |
| 366 &mov ($carry,"edx"); |
| 367 &mul ($word); # np[j]*m |
| 368 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] |
| 369 &lea ($j,&DWP(1,$j)); |
| 370 &adc ("edx",0); |
| 371 &add ($carry,"eax"); |
| 372 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] |
| 373 &adc ("edx",0); |
| 374 &cmp ($j,$num); |
| 375 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= |
| 376 &jl (&label("2ndmadd")); |
| 377 |
| 378 &mov ($carry,"edx"); |
| 379 &mul ($word); # np[j]*m |
| 380 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] |
| 381 &adc ("edx",0); |
| 382 &add ($carry,"eax"); |
| 383 &adc ("edx",0); |
| 384 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= |
| 385 |
| 386 &xor ("eax","eax"); |
| 387 &mov ($j,$_bp); # &bp[i] |
| 388 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] |
| 389 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] |
| 390 &lea ($j,&DWP(4,$j)); |
| 391 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= |
| 392 &cmp ($j,$_bpend); # advanced pointer vs. the &bp[num] saved before the first pass |
| 393 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= |
| 394 &je (&label("common_tail")); |
| 395 |
| 396 &mov ($word,&DWP(0,$j)); # bp[i+1] |
| 397 &mov ($inp,$_ap); |
| 398 &mov ($_bp,$j); # &bp[++i] |
| 399 &xor ($j,$j); |
| 400 &xor ("edx","edx"); |
| 401 &mov ("eax",&DWP(0,$inp)); |
| 402 &jmp (&label("1stmadd")); |
| 403 |
| 404 &set_label("bn_sqr_mont",16); |
| 405 $sbit=$num; # Perl-level alias: $sbit and $num name the same register |
| 406 &mov ($_num,$num); # keep the loop bound in memory because the register is reused as $sbit |
| 407 &mov ($_bp,$j); # i=0 |
| 408 |
| 409 &mov ("eax",$word); # ap[0] |
| 410 &mul ($word); # ap[0]*ap[0] |
| 411 &mov (&DWP($frame,"esp"),"eax"); # tp[0]= |
| 412 &mov ($sbit,"edx"); |
| 413 &shr ("edx",1); |
| 414 &and ($sbit,1); |
| 415 &inc ($j); |
| 416 &set_label("sqr",16); |
| 417 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] |
| 418 &mov ($carry,"edx"); |
| 419 &mul ($word); # ap[j]*ap[0] |
| 420 &add ("eax",$carry); |
| 421 &lea ($j,&DWP(1,$j)); |
| 422 &adc ("edx",0); |
| 423 &lea ($carry,&DWP(0,$sbit,"eax",2)); # 2*eax plus the bit carried from the previous word |
| 424 &shr ("eax",31); # top bit of eax is carried into the next word |
| 425 &cmp ($j,$_num); |
| 426 &mov ($sbit,"eax"); |
| 427 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= |
| 428 &jl (&label("sqr")); |
| 429 |
| 430 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] |
| 431 &mov ($carry,"edx"); |
| 432 &mul ($word); # ap[num-1]*ap[0] |
| 433 &add ("eax",$carry); |
| 434 &mov ($word,$_n0); |
| 435 &adc ("edx",0); |
| 436 &mov ($inp,$_np); |
| 437 &lea ($carry,&DWP(0,$sbit,"eax",2)); |
| 438 &imul ($word,&DWP($frame,"esp")); # n0*tp[0] |
| 439 &shr ("eax",31); |
| 440 &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= |
| 441 |
| 442 &lea ($carry,&DWP(0,"eax","edx",2)); |
| 443 &mov ("eax",&DWP(0,$inp)); # np[0] |
| 444 &shr ("edx",31); |
| 445 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= |
| 446 &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= |
| 447 |
| 448 &mul ($word); # np[0]*m |
| 449 &add ("eax",&DWP($frame,"esp")); # +=tp[0] |
| 450 &mov ($num,$j); |
| 451 &adc ("edx",0); |
| 452 &mov ("eax",&DWP(4,$inp)); # np[1] |
| 453 &mov ($j,1); |
| 454 |
| 455 &set_label("3rdmadd",16); # two words per iteration |
| 456 &mov ($carry,"edx"); |
| 457 &mul ($word); # np[j]*m |
| 458 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] |
| 459 &adc ("edx",0); |
| 460 &add ($carry,"eax"); |
| 461 &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] |
| 462 &adc ("edx",0); |
| 463 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= |
| 464 |
| 465 &mov ($carry,"edx"); |
| 466 &mul ($word); # np[j+1]*m |
| 467 &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] |
| 468 &lea ($j,&DWP(2,$j)); |
| 469 &adc ("edx",0); |
| 470 &add ($carry,"eax"); |
| 471 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] |
| 472 &adc ("edx",0); |
| 473 &cmp ($j,$num); |
| 474 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= |
| 475 &jl (&label("3rdmadd")); |
| 476 |
| 477 &mov ($carry,"edx"); |
| 478 &mul ($word); # np[j]*m |
| 479 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] |
| 480 &adc ("edx",0); |
| 481 &add ($carry,"eax"); |
| 482 &adc ("edx",0); |
| 483 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= |
| 484 |
| 485 &mov ($j,$_bp); # i |
| 486 &xor ("eax","eax"); |
| 487 &mov ($inp,$_ap); |
| 488 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] |
| 489 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] |
| 490 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= |
| 491 &cmp ($j,$num); |
| 492 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= |
| 493 &je (&label("common_tail")); |
| 494 |
| 495 &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] |
| 496 &lea ($j,&DWP(1,$j)); |
| 497 &mov ("eax",$word); |
| 498 &mov ($_bp,$j); # ++i |
| 499 &mul ($word); # ap[i]*ap[i] |
| 500 &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] |
| 501 &adc ("edx",0); |
| 502 &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= |
| 503 &xor ($carry,$carry); |
| 504 &cmp ($j,$num); |
| 505 &lea ($j,&DWP(1,$j)); # lea keeps the flags from the cmp for the je below |
| 506 &je (&label("sqrlast")); |
| 507 |
| 508 &mov ($sbit,"edx"); # zaps $num |
| 509 &shr ("edx",1); |
| 510 &and ($sbit,1); |
| 511 &set_label("sqradd",16); |
| 512 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] |
| 513 &mov ($carry,"edx"); |
| 514 &mul ($word); # ap[j]*ap[i] |
| 515 &add ("eax",$carry); |
| 516 &lea ($carry,&DWP(0,"eax","eax")); # 2*eax without touching the carry flag |
| 517 &adc ("edx",0); |
| 518 &shr ("eax",31); |
| 519 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] |
| 520 &lea ($j,&DWP(1,$j)); |
| 521 &adc ("eax",0); |
| 522 &add ($carry,$sbit); |
| 523 &adc ("eax",0); |
| 524 &cmp ($j,$_num); |
| 525 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= |
| 526 &mov ($sbit,"eax"); |
| 527 &jle (&label("sqradd")); |
| 528 |
| 529 &mov ($carry,"edx"); |
| 530 &add ("edx","edx"); |
| 531 &shr ($carry,31); |
| 532 &add ("edx",$sbit); |
| 533 &adc ($carry,0); |
| 534 &set_label("sqrlast"); |
| 535 &mov ($word,$_n0); |
| 536 &mov ($inp,$_np); |
| 537 &imul ($word,&DWP($frame,"esp")); # n0*tp[0] |
| 538 |
| 539 &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] |
| 540 &mov ("eax",&DWP(0,$inp)); # np[0] |
| 541 &adc ($carry,0); |
| 542 &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= |
| 543 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= |
| 544 |
| 545 &mul ($word); # np[0]*m |
| 546 &add ("eax",&DWP($frame,"esp")); # +=tp[0] |
| 547 &lea ($num,&DWP(-1,$j)); # rebuild the loop bound in the register that was reused as $sbit |
| 548 &adc ("edx",0); |
| 549 &mov ($j,1); |
| 550 &mov ("eax",&DWP(4,$inp)); # np[1] |
| 551 |
| 552 &jmp (&label("3rdmadd")); |
| 553 } |
| 554 |
| 555 &set_label("common_tail",16); # both the SSE2 and the integer code join here |
| 556 &mov ($np,$_np); # load modulus pointer |
| 557 &mov ($rp,$_rp); # load result pointer |
| 558 &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] |
| 559 |
| 560 &mov ("eax",&DWP(0,$tp)); # tp[0] |
| 561 &mov ($j,$num); # j=num-1 |
| 562 &xor ($i,$i); # i=0 and clear CF! |
| 563 |
| 564 &set_label("sub",16); |
| 565 &sbb ("eax",&DWP(0,$np,$i,4)); |
| 566 &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] |
| 567 &dec ($j); # doesn't affect CF! |
| 568 &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] |
| 569 &lea ($i,&DWP(1,$i)); # i++ |
| 570 &jge (&label("sub")); # flags come from the dec; mov and lea leave them alone |
| 571 |
| 572 &sbb ("eax",0); # handle upmost overflow bit |
| 573 &and ($tp,"eax"); # keep the tp address where the mask bits are set |
| 574 &not ("eax"); # invert the mask so the rp address is kept in the opposite case |
| 575 &mov ($np,$rp); |
| 576 &and ($np,"eax"); |
| 577 &or ($tp,$np); # tp=carry?tp:rp |
| 578 |
| 579 &set_label("copy",16); # copy or in-place refresh |
| 580 &mov ("eax",&DWP(0,$tp,$num,4)); |
| 581 &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] |
| 582 &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector |
| 583 &dec ($num); |
| 584 &jge (&label("copy")); # runs from index num-1 down to 0 |
| 585 |
| 586 &mov ("esp",$_sp); # pull saved stack pointer |
| 587 &mov ("eax",1); # return 1: the result was written to rp |
| 588 &set_label("just_leave"); # early exits arrive here with eax==0 |
| 589 &function_end("bn_mul_mont"); |
| 590 |
| 591 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); # identification string placed in the generated output |
| 592 |
| 593 &asm_finish(); # presumably finalizes and writes the generated assembly (helper is outside this file) |
OLD | NEW |