| OLD | NEW |
| (Empty) |
| 1 .ident "sparcv8plus.s, Version 1.4" | |
| 2 .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | |
| 3 | |
| 4 /* | |
| 5 * ==================================================================== | |
| 6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
| 7 * project. | |
| 8 * | |
| 9 * Rights for redistribution and usage in source and binary forms are | |
| 10 * granted according to the OpenSSL license. Warranty of any kind is | |
| 11 * disclaimed. | |
| 12 * ==================================================================== | |
| 13 */ | |
| 14 | |
| 15 /* | |
| 16 * This is my modest contribution to OpenSSL project (see | |
| 17 * http://www.openssl.org/ for more information about it) and is | |
| 18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c | |
| 19 * module. For updates see http://fy.chalmers.se/~appro/hpe/. | |
| 20 * | |
| 21 * Questions-n-answers. | |
| 22 * | |
| 23 * Q. How to compile? | |
| 24 * A. With SC4.x/SC5.x: | |
| 25 * | |
| 26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
| 27 * | |
| 28 * and with gcc: | |
| 29 * | |
| 30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
| 31 * | |
| 32 * or if above fails (it does if you have gas installed): | |
| 33 * | |
| 34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o | |
| 35 * | |
| 36 * Quick-n-dirty way to fuse the module into the library. | |
| 37 * Provided that the library is already configured and built | |
| 38 * (in 0.9.2 case with no-asm option): | |
| 39 * | |
| 40 * # cd crypto/bn | |
| 41 * # cp /some/place/bn_asm.sparc.v8plus.S . | |
| 42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
| 43 * # make | |
| 44 * # cd ../.. | |
| 45 * # make; make test | |
| 46 * | |
| 47 * Quick-n-dirty way to get rid of it: | |
| 48 * | |
| 49 * # cd crypto/bn | |
| 50 * # touch bn_asm.c | |
| 51 * # make | |
| 52 * # cd ../.. | |
| 53 * # make; make test | |
| 54 * | |
| 55 * Q. V8plus architecture? What kind of beast is that? | |
| 56 * A. Well, it's rather a programming model than an architecture... | |
| 57 * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under | |
| 58 * special conditions, namely when kernel doesn't preserve upper | |
| 59 * 32 bits of otherwise 64-bit registers during a context switch. | |
| 60 * | |
| 61 * Q. Why just UltraSPARC? What about SuperSPARC? | |
| 62 * A. Original release did target UltraSPARC only. Now SuperSPARC | |
| 63 * version is provided along. Both versions share bn_*comba[48] | |
| 64 * implementations (see comment later in code for explanation). | |
| 65 * But what's so special about this UltraSPARC implementation? | |
| 66 * Why didn't I let compiler do the job? Trouble is that most of | |
| 67 * available compilers (well, SC5.0 is the only exception) don't | |
| 68 * attempt to take advantage of UltraSPARC's 64-bitness under | |
| 69 * 32-bit kernels even though it's perfectly possible (see next | |
| 70 * question). | |
| 71 * | |
| 72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it | |
| 73 * doesn't work? | |
| 74 * A. You can't address *all* registers as 64-bit wide:-( The catch is | |
| 75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully | |
| 76 * preserved if you're in a leaf function, i.e. such never calling | |
| 77 * any other functions. All functions in this module are leaf and | |
| 78 * 10 registers is a handful. And as a matter of fact non-"comba" | |
| 79 * routines don't require even that much and I could even afford to | |
| 80 * not allocate own stack frame for 'em:-) | |
| 81 * | |
| 82 * Q. What about 64-bit kernels? | |
| 83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently | |
| 84 * under evaluation and development... | |
| 85 * | |
| 86 * Q. What about shared libraries? | |
| 87 * A. What about 'em? Kidding again:-) Code does *not* contain any | |
| 88 * code position dependencies and it's safe to include it into | |
| 89 * shared library as is. | |
| 90 * | |
| 91 * Q. How much faster does it go? | |
| 92 * A. Do you have a good benchmark? In either case below is what I | |
| 93 * experience with crypto/bn/expspeed.c test program: | |
| 94 * | |
| 95 * v8plus module on U10/300MHz against bn_asm.c compiled with: | |
| 96 * | |
| 97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12% | |
| 98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35% | |
| 99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45% | |
| 100 * | |
| 101 * v8 module on SS10/60MHz against bn_asm.c compiled with: | |
| 102 * | |
| 103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10% | |
| 104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10% | |
| 105 * egcs-1.1.2 -mv8 -O3 +35-45% | |
| 106 * | |
| 107 * As you can see it's damn hard to beat the new Sun C compiler | |
| 108 * and it's in first place GNU C users who will appreciate this | |
| 109 * assembler implementation:-) | |
| 110 */ | |
| 111 | |
| 112 /* | |
| 113 * Revision history. | |
| 114 * | |
| 115 * 1.0 - initial release; | |
| 116 * 1.1 - new loop unrolling model(*); | |
| 117 * - some more fine tuning; | |
| 118 * 1.2 - made gas friendly; | |
| 119 * - updates to documentation concerning v9; | |
| 120 * - new performance comparison matrix; | |
| 121 * 1.3 - fixed problem with /usr/ccs/lib/cpp; | |
| 122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient) | |
| 123 * resulting in slight overall performance kick; | |
| 124 * - some retunes; | |
| 125 * - support for GNU as added; | |
| 126 * | |
| 127 * (*) Originally unrolled loop looked like this: | |
| 128 * for (;;) { | |
| 129 * op(p+0); if (--n==0) break; | |
| 130 * op(p+1); if (--n==0) break; | |
| 131 * op(p+2); if (--n==0) break; | |
| 132 * op(p+3); if (--n==0) break; | |
| 133 * p+=4; | |
| 134 * } | |
| 135 * I unroll according to following: | |
| 136 * while (n&~3) { | |
| 137 * op(p+0); op(p+1); op(p+2); op(p+3); | |
| 138 * p+=4; n-=4; | |
| 139 * } | |
| 140 * if (n) { | |
| 141 * op(p+0); if (--n==0) return; | |
| 142 * op(p+1); if (--n==0) return; | |
| 143 * op(p+2); return; | |
| 144 * } | |
| 145 */ | |
| 146 | |
| 147 #if defined(__SUNPRO_C) && defined(__sparcv9) | |
| 148 /* They've said -xarch=v9 at command line */ | |
| 149 .register %g2,#scratch | |
| 150 .register %g3,#scratch | |
| 151 # define FRAME_SIZE -192 | |
| 152 #elif defined(__GNUC__) && defined(__arch64__) | |
| 153 /* They've said -m64 at command line */ | |
| 154 .register %g2,#scratch | |
| 155 .register %g3,#scratch | |
| 156 # define FRAME_SIZE -192 | |
| 157 #else | |
| 158 # define FRAME_SIZE -96 | |
| 159 #endif | |
| 160 /* | |
| 161 * GNU assembler can't stand stuw:-( | |
| * | |
| * Every stuw in this file is therefore rewritten by the preprocessor | |
| * into a plain st via the define below. | |
| * | |
| * FRAME_SIZE, selected by the conditional above, is the stack | |
| * adjustment handed to the save instruction at the top of the | |
| * bn_mul_comba routines: -192 in the two branches that also declare | |
| * %g2 and %g3 as scratch registers, -96 in the default branch. | |
| 162 */ | |
| 163 #define stuw st | |
| 164 | |
| 165 .section ".text",#alloc,#execinstr | |
| 166 .file "bn_asm.sparc.v8plus.S" | |
| 167 | |
| 168 .align 32 | |
| 169 | |
| 170 .global bn_mul_add_words | |
| 171 /* | |
| 172 * BN_ULONG bn_mul_add_words(rp,ap,num,w) | |
| 173 * BN_ULONG *rp,*ap; | |
| 174 * int num; | |
| 175 * BN_ULONG w; | |
| * | |
| * Multiply-accumulate of a word vector by a single 32-bit word. | |
| * For each i in 0..num-1 the 64-bit sum rp[i] + ap[i]*w + carry is | |
| * formed, its low 32 bits are stored back to rp[i] and its upper | |
| * 32 bits become the carry into the next word. | |
| * | |
| * In: %o0 = rp, %o1 = ap, %o2 = num (sign-extended on entry), | |
| * %o3 = w (upper 32 bits cleared before use). | |
| * Out: %o0 = final carry word, or 0 straight away when num <= 0. | |
| * Uses: %g1-%g3, %o4, %o5 (running carry) and %icc. Leaf routine, it | |
| * returns with retl and never allocates a stack frame. | |
| * | |
| * Layout: the loop handles four words per pass for as long as | |
| * (num AND -4) is non-zero, the tail then handles the remaining one | |
| * to three words. %g2 is preloaded from [%o1] in the annulled delay | |
| * slots of the entry branch, the loop back-branch and the branch into | |
| * the tail, so both the loop body and the tail start with the current | |
| * ap word already in %g2. | |
| * NOTE(review): the three nops after the early return look like | |
| * padding that keeps the loop label 32-byte aligned (see the remark | |
| * on that label) - confirm before touching them. | |
| 176 */ | |
| 177 bn_mul_add_words: | |
| 178 sra %o2,%g0,%o2 ! signx %o2 | |
| 179 brgz,a %o2,.L_bn_mul_add_words_proceed | |
| 180 lduw [%o1],%g2 | |
| 181 retl | |
| 182 clr %o0 | |
| 183 nop | |
| 184 nop | |
| 185 nop | |
| 186 | |
| 187 .L_bn_mul_add_words_proceed: | |
| 188 srl %o3,%g0,%o3 ! clruw %o3 | |
| 189 andcc %o2,-4,%g0 | |
| 190 bz,pn %icc,.L_bn_mul_add_words_tail | |
| 191 clr %o5 | |
| 192 | |
| 193 .L_bn_mul_add_words_loop: ! wow! 32 aligned! | |
| 194 lduw [%o0],%g1 | |
| 195 lduw [%o1+4],%g3 | |
| 196 mulx %o3,%g2,%g2 | |
| 197 add %g1,%o5,%o4 | |
| 198 nop | |
| 199 add %o4,%g2,%o4 | |
| 200 stuw %o4,[%o0] | |
| 201 srlx %o4,32,%o5 | |
| 202 | |
| 203 lduw [%o0+4],%g1 | |
| 204 lduw [%o1+8],%g2 | |
| 205 mulx %o3,%g3,%g3 | |
| 206 add %g1,%o5,%o4 | |
| 207 dec 4,%o2 | |
| 208 add %o4,%g3,%o4 | |
| 209 stuw %o4,[%o0+4] | |
| 210 srlx %o4,32,%o5 | |
| 211 | |
| 212 lduw [%o0+8],%g1 | |
| 213 lduw [%o1+12],%g3 | |
| 214 mulx %o3,%g2,%g2 | |
| 215 add %g1,%o5,%o4 | |
| 216 inc 16,%o1 | |
| 217 add %o4,%g2,%o4 | |
| 218 stuw %o4,[%o0+8] | |
| 219 srlx %o4,32,%o5 | |
| 220 | |
| 221 lduw [%o0+12],%g1 | |
| 222 mulx %o3,%g3,%g3 | |
| 223 add %g1,%o5,%o4 | |
| 224 inc 16,%o0 | |
| 225 add %o4,%g3,%o4 | |
| 226 andcc %o2,-4,%g0 | |
| 227 stuw %o4,[%o0-4] | |
| 228 srlx %o4,32,%o5 | |
| 229 bnz,a,pt %icc,.L_bn_mul_add_words_loop | |
| 230 lduw [%o1],%g2 | |
| 231 | |
| 232 brnz,a,pn %o2,.L_bn_mul_add_words_tail | |
| 233 lduw [%o1],%g2 | |
| 234 .L_bn_mul_add_words_return: | |
| 235 retl | |
| 236 mov %o5,%o0 | |
| 237 | |
| 238 .L_bn_mul_add_words_tail: | |
| 239 lduw [%o0],%g1 | |
| 240 mulx %o3,%g2,%g2 | |
| 241 add %g1,%o5,%o4 | |
| 242 dec %o2 | |
| 243 add %o4,%g2,%o4 | |
| 244 srlx %o4,32,%o5 | |
| 245 brz,pt %o2,.L_bn_mul_add_words_return | |
| 246 stuw %o4,[%o0] | |
| 247 | |
| 248 lduw [%o1+4],%g2 | |
| 249 lduw [%o0+4],%g1 | |
| 250 mulx %o3,%g2,%g2 | |
| 251 add %g1,%o5,%o4 | |
| 252 dec %o2 | |
| 253 add %o4,%g2,%o4 | |
| 254 srlx %o4,32,%o5 | |
| 255 brz,pt %o2,.L_bn_mul_add_words_return | |
| 256 stuw %o4,[%o0+4] | |
| 257 | |
| 258 lduw [%o1+8],%g2 | |
| 259 lduw [%o0+8],%g1 | |
| 260 mulx %o3,%g2,%g2 | |
| 261 add %g1,%o5,%o4 | |
| 262 add %o4,%g2,%o4 | |
| 263 stuw %o4,[%o0+8] | |
| 264 retl | |
| 265 srlx %o4,32,%o0 | |
| 266 | |
| 267 .type bn_mul_add_words,#function | |
| 268 .size bn_mul_add_words,(.-bn_mul_add_words) | |
| 269 | |
| 270 .align 32 | |
| 271 | |
| 272 .global bn_mul_words | |
| 273 /* | |
| 274 * BN_ULONG bn_mul_words(rp,ap,num,w) | |
| 275 * BN_ULONG *rp,*ap; | |
| 276 * int num; | |
| 277 * BN_ULONG w; | |
| * | |
| * Multiplies a word vector by a single 32-bit word. For each i in | |
| * 0..num-1 the 64-bit value ap[i]*w + carry is formed, its low | |
| * 32 bits are stored to rp[i] and its upper 32 bits become the carry | |
| * into the next word. Unlike bn_mul_add_words the previous contents | |
| * of rp are never read. | |
| * | |
| * In: %o0 = rp, %o1 = ap, %o2 = num (sign-extended on entry), | |
| * %o3 = w (upper 32 bits cleared before use). | |
| * Out: %o0 = final carry word, or 0 straight away when num <= 0. | |
| * Uses: %g2, %g3, %o4, %o5 (running carry) and %icc. Leaf routine, | |
| * it returns with retl and never allocates a stack frame. | |
| * | |
| * Layout: a four-words-per-pass loop that runs while (num AND -4) is | |
| * non-zero, followed by a tail for the remaining one to three words. | |
| * %g2 is preloaded from [%o1] in the annulled delay slots of the | |
| * entry branch, the loop back-branch and the branch into the tail. | |
| * NOTE(review): the nops look like alignment or scheduling padding | |
| * (the loop label is remarked as 32-byte aligned) - confirm before | |
| * removing any of them. | |
| 278 */ | |
| 279 bn_mul_words: | |
| 280 sra %o2,%g0,%o2 ! signx %o2 | |
| 281 brgz,a %o2,.L_bn_mul_words_proceeed | |
| 282 lduw [%o1],%g2 | |
| 283 retl | |
| 284 clr %o0 | |
| 285 nop | |
| 286 nop | |
| 287 nop | |
| 288 | |
| 289 .L_bn_mul_words_proceeed: | |
| 290 srl %o3,%g0,%o3 ! clruw %o3 | |
| 291 andcc %o2,-4,%g0 | |
| 292 bz,pn %icc,.L_bn_mul_words_tail | |
| 293 clr %o5 | |
| 294 | |
| 295 .L_bn_mul_words_loop: ! wow! 32 aligned! | |
| 296 lduw [%o1+4],%g3 | |
| 297 mulx %o3,%g2,%g2 | |
| 298 add %g2,%o5,%o4 | |
| 299 nop | |
| 300 stuw %o4,[%o0] | |
| 301 srlx %o4,32,%o5 | |
| 302 | |
| 303 lduw [%o1+8],%g2 | |
| 304 mulx %o3,%g3,%g3 | |
| 305 add %g3,%o5,%o4 | |
| 306 dec 4,%o2 | |
| 307 stuw %o4,[%o0+4] | |
| 308 srlx %o4,32,%o5 | |
| 309 | |
| 310 lduw [%o1+12],%g3 | |
| 311 mulx %o3,%g2,%g2 | |
| 312 add %g2,%o5,%o4 | |
| 313 inc 16,%o1 | |
| 314 stuw %o4,[%o0+8] | |
| 315 srlx %o4,32,%o5 | |
| 316 | |
| 317 mulx %o3,%g3,%g3 | |
| 318 add %g3,%o5,%o4 | |
| 319 inc 16,%o0 | |
| 320 stuw %o4,[%o0-4] | |
| 321 srlx %o4,32,%o5 | |
| 322 andcc %o2,-4,%g0 | |
| 323 bnz,a,pt %icc,.L_bn_mul_words_loop | |
| 324 lduw [%o1],%g2 | |
| 325 nop | |
| 326 nop | |
| 327 | |
| 328 brnz,a,pn %o2,.L_bn_mul_words_tail | |
| 329 lduw [%o1],%g2 | |
| 330 .L_bn_mul_words_return: | |
| 331 retl | |
| 332 mov %o5,%o0 | |
| 333 | |
| 334 .L_bn_mul_words_tail: | |
| 335 mulx %o3,%g2,%g2 | |
| 336 add %g2,%o5,%o4 | |
| 337 dec %o2 | |
| 338 srlx %o4,32,%o5 | |
| 339 brz,pt %o2,.L_bn_mul_words_return | |
| 340 stuw %o4,[%o0] | |
| 341 | |
| 342 lduw [%o1+4],%g2 | |
| 343 mulx %o3,%g2,%g2 | |
| 344 add %g2,%o5,%o4 | |
| 345 dec %o2 | |
| 346 srlx %o4,32,%o5 | |
| 347 brz,pt %o2,.L_bn_mul_words_return | |
| 348 stuw %o4,[%o0+4] | |
| 349 | |
| 350 lduw [%o1+8],%g2 | |
| 351 mulx %o3,%g2,%g2 | |
| 352 add %g2,%o5,%o4 | |
| 353 stuw %o4,[%o0+8] | |
| 354 retl | |
| 355 srlx %o4,32,%o0 | |
| 356 | |
| 357 .type bn_mul_words,#function | |
| 358 .size bn_mul_words,(.-bn_mul_words) | |
| 359 | |
| 360 .align 32 | |
| 361 .global bn_sqr_words | |
| 362 /* | |
| 363 * void bn_sqr_words(r,a,n) | |
| 364 * BN_ULONG *r,*a; | |
| 365 * int n; | |
| * | |
| * Squares each input word on its own. For each i in 0..n-1 the | |
| * 64-bit square a[i]*a[i] is computed with mulx, its low 32 bits are | |
| * stored to r[2*i] and its upper 32 bits to r[2*i+1]. No carry is | |
| * passed between words. | |
| * | |
| * In: %o0 = r, %o1 = a, %o2 = n (sign-extended on entry). | |
| * Out: declared void, the code nevertheless clears %o0 on every | |
| * return path. Returns at once when n <= 0. | |
| * Uses: %g2, %g3, %o4, %o5 and %icc. Leaf routine, it returns with | |
| * retl and never allocates a stack frame. | |
| * | |
| * Layout: a four-words-per-pass loop that runs while (n AND -4) is | |
| * non-zero, followed by a tail for the remaining one to three words. | |
| * %g2 is preloaded from [%o1] in the annulled delay slots of the | |
| * entry branch, the loop back-branch and the branch into the tail. | |
| * The andcc at the bottom of the loop also writes its result to %g2, | |
| * that value is overwritten by one of those delay-slot loads before | |
| * %g2 is read again. | |
| * NOTE(review): the nops look like alignment or scheduling padding | |
| * (the loop label is remarked as 32-byte aligned) - confirm before | |
| * removing any of them. | |
| 366 */ | |
| 367 bn_sqr_words: | |
| 368 sra %o2,%g0,%o2 ! signx %o2 | |
| 369 brgz,a %o2,.L_bn_sqr_words_proceeed | |
| 370 lduw [%o1],%g2 | |
| 371 retl | |
| 372 clr %o0 | |
| 373 nop | |
| 374 nop | |
| 375 nop | |
| 376 | |
| 377 .L_bn_sqr_words_proceeed: | |
| 378 andcc %o2,-4,%g0 | |
| 379 nop | |
| 380 bz,pn %icc,.L_bn_sqr_words_tail | |
| 381 nop | |
| 382 | |
| 383 .L_bn_sqr_words_loop: ! wow! 32 aligned! | |
| 384 lduw [%o1+4],%g3 | |
| 385 mulx %g2,%g2,%o4 | |
| 386 stuw %o4,[%o0] | |
| 387 srlx %o4,32,%o5 | |
| 388 stuw %o5,[%o0+4] | |
| 389 nop | |
| 390 | |
| 391 lduw [%o1+8],%g2 | |
| 392 mulx %g3,%g3,%o4 | |
| 393 dec 4,%o2 | |
| 394 stuw %o4,[%o0+8] | |
| 395 srlx %o4,32,%o5 | |
| 396 stuw %o5,[%o0+12] | |
| 397 | |
| 398 lduw [%o1+12],%g3 | |
| 399 mulx %g2,%g2,%o4 | |
| 400 srlx %o4,32,%o5 | |
| 401 stuw %o4,[%o0+16] | |
| 402 inc 16,%o1 | |
| 403 stuw %o5,[%o0+20] | |
| 404 | |
| 405 mulx %g3,%g3,%o4 | |
| 406 inc 32,%o0 | |
| 407 stuw %o4,[%o0-8] | |
| 408 srlx %o4,32,%o5 | |
| 409 andcc %o2,-4,%g2 | |
| 410 stuw %o5,[%o0-4] | |
| 411 bnz,a,pt %icc,.L_bn_sqr_words_loop | |
| 412 lduw [%o1],%g2 | |
| 413 nop | |
| 414 | |
| 415 brnz,a,pn %o2,.L_bn_sqr_words_tail | |
| 416 lduw [%o1],%g2 | |
| 417 .L_bn_sqr_words_return: | |
| 418 retl | |
| 419 clr %o0 | |
| 420 | |
| 421 .L_bn_sqr_words_tail: | |
| 422 mulx %g2,%g2,%o4 | |
| 423 dec %o2 | |
| 424 stuw %o4,[%o0] | |
| 425 srlx %o4,32,%o5 | |
| 426 brz,pt %o2,.L_bn_sqr_words_return | |
| 427 stuw %o5,[%o0+4] | |
| 428 | |
| 429 lduw [%o1+4],%g2 | |
| 430 mulx %g2,%g2,%o4 | |
| 431 dec %o2 | |
| 432 stuw %o4,[%o0+8] | |
| 433 srlx %o4,32,%o5 | |
| 434 brz,pt %o2,.L_bn_sqr_words_return | |
| 435 stuw %o5,[%o0+12] | |
| 436 | |
| 437 lduw [%o1+8],%g2 | |
| 438 mulx %g2,%g2,%o4 | |
| 439 srlx %o4,32,%o5 | |
| 440 stuw %o4,[%o0+16] | |
| 441 stuw %o5,[%o0+20] | |
| 442 retl | |
| 443 clr %o0 | |
| 444 | |
| 445 .type bn_sqr_words,#function | |
| 446 .size bn_sqr_words,(.-bn_sqr_words) | |
| 447 | |
| 448 .align 32 | |
| 449 .global bn_div_words | |
| 450 /* | |
| 451 * BN_ULONG bn_div_words(h,l,d) | |
| 452 * BN_ULONG h,l,d; | |
| * | |
| * Divides a two-word value by a single word. The dividend is built | |
| * as (h shifted left by 32) OR l, divided by d with the unsigned | |
| * 64-bit udivx, and the quotient is returned with its upper 32 bits | |
| * cleared by the srl in the return delay slot. | |
| * | |
| * In: %o0 = h, %o1 = l, %o2 = d. | |
| * Out: %o0 = low 32 bits of the quotient. | |
| * Leaf routine, no other register is written. | |
| * | |
| * NOTE(review): l and d are consumed as full 64-bit register values | |
| * without their upper halves being cleared first, and d is not | |
| * checked for zero. Presumably callers guarantee clean upper halves | |
| * and a non-zero divisor - confirm. | |
| 453 */ | |
| 454 bn_div_words: | |
| 455 sllx %o0,32,%o0 | |
| 456 or %o0,%o1,%o0 | |
| 457 udivx %o0,%o2,%o0 | |
| 458 retl | |
| 459 srl %o0,%g0,%o0 ! clruw %o0 | |
| 460 | |
| 461 .type bn_div_words,#function | |
| 462 .size bn_div_words,(.-bn_div_words) | |
| 463 | |
| 464 .align 32 | |
| 465 | |
| 466 .global bn_add_words | |
| 467 /* | |
| 468 * BN_ULONG bn_add_words(rp,ap,bp,n) | |
| 469 * BN_ULONG *rp,*ap,*bp; | |
| 470 * int n; | |
| * | |
| * Word-wise addition with carry propagation. For each i in 0..n-1 | |
| * rp[i] receives the low 32 bits of ap[i] + bp[i] + carry, the | |
| * carry being kept in the %icc carry flag by the addccc chain. | |
| * | |
| * In: %o0 = rp, %o1 = ap, %o2 = bp, %o3 = n (sign-extended on | |
| * entry). | |
| * Out: %o0 = 1 when the last addccc left the carry flag set, else | |
| * 0 (clr followed by movcs). Returns 0 at once when n <= 0. | |
| * Uses: %g1-%g4, %o4, %o5 and %icc. Leaf routine, it returns with | |
| * retl and never allocates a stack frame. | |
| * | |
| * Layout: a four-words-per-pass loop that runs while (n AND -4) is | |
| * non-zero, followed by a tail for the remaining one to three words. | |
| * %o4 is preloaded from [%o1] in the annulled delay slots of the | |
| * entry branch, the loop back-branch and the branch into the tail. | |
| * The carry flag is cleared once, in the delay slot of the branch | |
| * that selects loop or tail. From then on the loop test uses a plain | |
| * and plus a register branch (brnz) rather than andcc, and the | |
| * counter and pointer updates use dec and inc, so nothing between | |
| * two addccc instructions rewrites the condition codes. | |
| 471 */ | |
| 472 bn_add_words: | |
| 473 sra %o3,%g0,%o3 ! signx %o3 | |
| 474 brgz,a %o3,.L_bn_add_words_proceed | |
| 475 lduw [%o1],%o4 | |
| 476 retl | |
| 477 clr %o0 | |
| 478 | |
| 479 .L_bn_add_words_proceed: | |
| 480 andcc %o3,-4,%g0 | |
| 481 bz,pn %icc,.L_bn_add_words_tail | |
| 482 addcc %g0,0,%g0 ! clear carry flag | |
| 483 | |
| 484 .L_bn_add_words_loop: ! wow! 32 aligned! | |
| 485 dec 4,%o3 | |
| 486 lduw [%o2],%o5 | |
| 487 lduw [%o1+4],%g1 | |
| 488 lduw [%o2+4],%g2 | |
| 489 lduw [%o1+8],%g3 | |
| 490 lduw [%o2+8],%g4 | |
| 491 addccc %o5,%o4,%o5 | |
| 492 stuw %o5,[%o0] | |
| 493 | |
| 494 lduw [%o1+12],%o4 | |
| 495 lduw [%o2+12],%o5 | |
| 496 inc 16,%o1 | |
| 497 addccc %g1,%g2,%g1 | |
| 498 stuw %g1,[%o0+4] | |
| 499 | |
| 500 inc 16,%o2 | |
| 501 addccc %g3,%g4,%g3 | |
| 502 stuw %g3,[%o0+8] | |
| 503 | |
| 504 inc 16,%o0 | |
| 505 addccc %o5,%o4,%o5 | |
| 506 stuw %o5,[%o0-4] | |
| 507 and %o3,-4,%g1 | |
| 508 brnz,a,pt %g1,.L_bn_add_words_loop | |
| 509 lduw [%o1],%o4 | |
| 510 | |
| 511 brnz,a,pn %o3,.L_bn_add_words_tail | |
| 512 lduw [%o1],%o4 | |
| 513 .L_bn_add_words_return: | |
| 514 clr %o0 | |
| 515 retl | |
| 516 movcs %icc,1,%o0 | |
| 517 nop | |
| 518 | |
| 519 .L_bn_add_words_tail: | |
| 520 lduw [%o2],%o5 | |
| 521 dec %o3 | |
| 522 addccc %o5,%o4,%o5 | |
| 523 brz,pt %o3,.L_bn_add_words_return | |
| 524 stuw %o5,[%o0] | |
| 525 | |
| 526 lduw [%o1+4],%o4 | |
| 527 lduw [%o2+4],%o5 | |
| 528 dec %o3 | |
| 529 addccc %o5,%o4,%o5 | |
| 530 brz,pt %o3,.L_bn_add_words_return | |
| 531 stuw %o5,[%o0+4] | |
| 532 | |
| 533 lduw [%o1+8],%o4 | |
| 534 lduw [%o2+8],%o5 | |
| 535 addccc %o5,%o4,%o5 | |
| 536 stuw %o5,[%o0+8] | |
| 537 clr %o0 | |
| 538 retl | |
| 539 movcs %icc,1,%o0 | |
| 540 | |
| 541 .type bn_add_words,#function | |
| 542 .size bn_add_words,(.-bn_add_words) | |
| 543 | |
| 544 .global bn_sub_words | |
| 545 /* | |
| 546 * BN_ULONG bn_sub_words(rp,ap,bp,n) | |
| 547 * BN_ULONG *rp,*ap,*bp; | |
| 548 * int n; | |
| * | |
| * Word-wise subtraction with borrow propagation. For each i in | |
| * 0..n-1 rp[i] receives the low 32 bits of ap[i] - bp[i] - borrow, | |
| * the borrow being kept in the %icc carry flag by the subccc chain. | |
| * | |
| * In: %o0 = rp, %o1 = ap, %o2 = bp, %o3 = n (sign-extended on | |
| * entry). | |
| * Out: %o0 = 1 when the last subccc left the carry flag set, else | |
| * 0 (clr followed by movcs). Returns 0 at once when n <= 0. | |
| * Uses: %g1-%g4, %o4, %o5 and %icc. Leaf routine, it returns with | |
| * retl and never allocates a stack frame. | |
| * | |
| * Layout: a four-words-per-pass loop that runs while (n AND -4) is | |
| * non-zero, followed by a tail for the remaining one to three words. | |
| * %o4 is preloaded from [%o1] in the annulled delay slots of the | |
| * entry branch, the loop back-branch and the branch into the tail. | |
| * The carry flag is cleared once, in the delay slot of the branch | |
| * that selects loop or tail. From then on the loop test uses a plain | |
| * and plus a register branch (brnz) rather than andcc, and the | |
| * counter and pointer updates use dec and inc, so nothing between | |
| * two subccc instructions rewrites the condition codes. | |
| 549 */ | |
| 550 bn_sub_words: | |
| 551 sra %o3,%g0,%o3 ! signx %o3 | |
| 552 brgz,a %o3,.L_bn_sub_words_proceed | |
| 553 lduw [%o1],%o4 | |
| 554 retl | |
| 555 clr %o0 | |
| 556 | |
| 557 .L_bn_sub_words_proceed: | |
| 558 andcc %o3,-4,%g0 | |
| 559 bz,pn %icc,.L_bn_sub_words_tail | |
| 560 addcc %g0,0,%g0 ! clear carry flag | |
| 561 | |
| 562 .L_bn_sub_words_loop: ! wow! 32 aligned! | |
| 563 dec 4,%o3 | |
| 564 lduw [%o2],%o5 | |
| 565 lduw [%o1+4],%g1 | |
| 566 lduw [%o2+4],%g2 | |
| 567 lduw [%o1+8],%g3 | |
| 568 lduw [%o2+8],%g4 | |
| 569 subccc %o4,%o5,%o5 | |
| 570 stuw %o5,[%o0] | |
| 571 | |
| 572 lduw [%o1+12],%o4 | |
| 573 lduw [%o2+12],%o5 | |
| 574 inc 16,%o1 | |
| 575 subccc %g1,%g2,%g2 | |
| 576 stuw %g2,[%o0+4] | |
| 577 | |
| 578 inc 16,%o2 | |
| 579 subccc %g3,%g4,%g4 | |
| 580 stuw %g4,[%o0+8] | |
| 581 | |
| 582 inc 16,%o0 | |
| 583 subccc %o4,%o5,%o5 | |
| 584 stuw %o5,[%o0-4] | |
| 585 and %o3,-4,%g1 | |
| 586 brnz,a,pt %g1,.L_bn_sub_words_loop | |
| 587 lduw [%o1],%o4 | |
| 588 | |
| 589 brnz,a,pn %o3,.L_bn_sub_words_tail | |
| 590 lduw [%o1],%o4 | |
| 591 .L_bn_sub_words_return: | |
| 592 clr %o0 | |
| 593 retl | |
| 594 movcs %icc,1,%o0 | |
| 595 nop | |
| 596 | |
| 597 .L_bn_sub_words_tail: ! wow! 32 aligned! | |
| 598 lduw [%o2],%o5 | |
| 599 dec %o3 | |
| 600 subccc %o4,%o5,%o5 | |
| 601 brz,pt %o3,.L_bn_sub_words_return | |
| 602 stuw %o5,[%o0] | |
| 603 | |
| 604 lduw [%o1+4],%o4 | |
| 605 lduw [%o2+4],%o5 | |
| 606 dec %o3 | |
| 607 subccc %o4,%o5,%o5 | |
| 608 brz,pt %o3,.L_bn_sub_words_return | |
| 609 stuw %o5,[%o0+4] | |
| 610 | |
| 611 lduw [%o1+8],%o4 | |
| 612 lduw [%o2+8],%o5 | |
| 613 subccc %o4,%o5,%o5 | |
| 614 stuw %o5,[%o0+8] | |
| 615 clr %o0 | |
| 616 retl | |
| 617 movcs %icc,1,%o0 | |
| 618 | |
| 619 .type bn_sub_words,#function | |
| 620 .size bn_sub_words,(.-bn_sub_words) | |
| 621 | |
| 622 /* | |
| 623 * Code below depends on the fact that upper parts of the %l0-%l7 | |
| 624 * and %i0-%i7 are zeroed by kernel after context switch. In | |
| 625 * previous versions this comment stated that "the trouble is that | |
| 626 * it's not feasible to implement the mumbo-jumbo in less V9 | |
| 627 * instructions:-(" which apparently isn't true thanks to | |
| 628 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement | |
| 629 * results not from the shorter code, but from elimination of | |
| 630 * multicycle non-pairable 'rd %y,%rd' instructions. | |
| 631 * | |
| 632 * Andy. | |
| 633 */ | |
| 634 | |
| 635 /* | |
| 636 * Here is register usage map for *all* routines below. | |
| * | |
| * t_1 (%o0) 64-bit scratch: receives every mulx product and the | |
| * finished column sum that is stored to r. | |
| * t_2 (%o1) the constant 1 shifted left by 32, set up once at | |
| * the top of each routine. | |
| * c_12 (%o2) 64-bit running sum of the current column. | |
| * c_3 (%o3) collects the carries out of c_12, one t_2 per carry. | |
| * | |
| * ap(I), bp(I) and rp(I) expand to the address of 32-bit word I | |
| * relative to %i1, %i2 and %i0. The routines below take (r,a,b) and | |
| * start with a save, so those registers hold a, b and r. | |
| * | |
| * a_0..a_7 live in %l0-%l7. b_0..b_2 live in %i3-%i5, b_3 and b_4 in | |
| * %o4 and %o5, b_5 in %o7, b_6 and b_7 in %g1 and %g4. | |
| 637 */ | |
| 638 #define t_1 %o0 | |
| 639 #define t_2 %o1 | |
| 640 #define c_12 %o2 | |
| 641 #define c_3 %o3 | |
| 642 | |
| 643 #define ap(I) [%i1+4*I] | |
| 644 #define bp(I) [%i2+4*I] | |
| 645 #define rp(I) [%i0+4*I] | |
| 646 | |
| 647 #define a_0 %l0 | |
| 648 #define a_1 %l1 | |
| 649 #define a_2 %l2 | |
| 650 #define a_3 %l3 | |
| 651 #define a_4 %l4 | |
| 652 #define a_5 %l5 | |
| 653 #define a_6 %l6 | |
| 654 #define a_7 %l7 | |
| 655 | |
| 656 #define b_0 %i3 | |
| 657 #define b_1 %i4 | |
| 658 #define b_2 %i5 | |
| 659 #define b_3 %o4 | |
| 660 #define b_4 %o5 | |
| 661 #define b_5 %o7 | |
| 662 #define b_6 %g1 | |
| 663 #define b_7 %g4 | |
| 664 | |
| 665 .align 32 | |
| 666 .global bn_mul_comba8 | |
| 667 /* | |
| 668 * void bn_mul_comba8(r,a,b) | |
| 669 * BN_ULONG *r,*a,*b; | |
| * | |
| * Computes the 16-word product r[0..15] of the two 8-word operands | |
| * a[0..7] and b[0..7], one result word (column) at a time. The | |
| * comment on each mulx names the mul_add_c() step it implements. | |
| * | |
| * In: after the save, %i0 = r, %i1 = a, %i2 = b. | |
| * Out: r[0..15] written. The closing restore writes 0 to the | |
| * caller's %o0. | |
| * | |
| * Scheme, using the register names defined just before this routine: | |
| * - t_2 is set to 1 shifted left by 32 at the top. | |
| * - Each product is formed in t_1 by mulx and added into the 64-bit | |
| * column sum c_12 with addcc. | |
| * - The annulled 'bcs,a %xcc,.+8' that follows executes its delay | |
| * slot only when that 64-bit add carried, so c_3 gains one t_2 per | |
| * carry and is left alone otherwise. c_3 is cleared near the top | |
| * of every column that checks for carries. | |
| * - The last add of a column targets t_1. Its low 32 bits are stored | |
| * to r[k], then c_12 is rebuilt as (t_1 shifted right by 32) OR | |
| * c_3 to seed the next column. | |
| * - Operand words are loaded with lduw shortly before their first | |
| * use rather than all up front. | |
| * | |
| * r[13] is stored with a plain st, which is what the stuw define at | |
| * the top of the file expands to anyway. | |
| 670 */ | |
| 671 bn_mul_comba8: | |
| 672 save %sp,FRAME_SIZE,%sp | |
| 673 mov 1,t_2 | |
| 674 lduw ap(0),a_0 | |
| 675 sllx t_2,32,t_2 | |
| 676 lduw bp(0),b_0 != | |
| 677 lduw bp(1),b_1 | |
| 678 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | |
| 679 srlx t_1,32,c_12 | |
| 680 stuw t_1,rp(0) !=!r[0]=c1; | |
| 681 | |
| 682 lduw ap(1),a_1 | |
| 683 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | |
| 684 addcc c_12,t_1,c_12 | |
| 685 clr c_3 != | |
| 686 bcs,a %xcc,.+8 | |
| 687 add c_3,t_2,c_3 | |
| 688 lduw ap(2),a_2 | |
| 689 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
| 690 addcc c_12,t_1,t_1 | |
| 691 bcs,a %xcc,.+8 | |
| 692 add c_3,t_2,c_3 | |
| 693 srlx t_1,32,c_12 != | |
| 694 stuw t_1,rp(1) !r[1]=c2; | |
| 695 or c_12,c_3,c_12 | |
| 696 | |
| 697 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
| 698 addcc c_12,t_1,c_12 != | |
| 699 clr c_3 | |
| 700 bcs,a %xcc,.+8 | |
| 701 add c_3,t_2,c_3 | |
| 702 lduw bp(2),b_2 != | |
| 703 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
| 704 addcc c_12,t_1,c_12 | |
| 705 bcs,a %xcc,.+8 | |
| 706 add c_3,t_2,c_3 != | |
| 707 lduw bp(3),b_3 | |
| 708 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
| 709 addcc c_12,t_1,t_1 | |
| 710 bcs,a %xcc,.+8 != | |
| 711 add c_3,t_2,c_3 | |
| 712 srlx t_1,32,c_12 | |
| 713 stuw t_1,rp(2) !r[2]=c3; | |
| 714 or c_12,c_3,c_12 != | |
| 715 | |
| 716 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
| 717 addcc c_12,t_1,c_12 | |
| 718 clr c_3 | |
| 719 bcs,a %xcc,.+8 != | |
| 720 add c_3,t_2,c_3 | |
| 721 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); | |
| 722 addcc c_12,t_1,c_12 | |
| 723 bcs,a %xcc,.+8 != | |
| 724 add c_3,t_2,c_3 | |
| 725 lduw ap(3),a_3 | |
| 726 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
| 727 addcc c_12,t_1,c_12 != | |
| 728 bcs,a %xcc,.+8 | |
| 729 add c_3,t_2,c_3 | |
| 730 lduw ap(4),a_4 | |
| 731 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!= | |
| 732 addcc c_12,t_1,t_1 | |
| 733 bcs,a %xcc,.+8 | |
| 734 add c_3,t_2,c_3 | |
| 735 srlx t_1,32,c_12 != | |
| 736 stuw t_1,rp(3) !r[3]=c1; | |
| 737 or c_12,c_3,c_12 | |
| 738 | |
| 739 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); | |
| 740 addcc c_12,t_1,c_12 != | |
| 741 clr c_3 | |
| 742 bcs,a %xcc,.+8 | |
| 743 add c_3,t_2,c_3 | |
| 744 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1); | |
| 745 addcc c_12,t_1,c_12 | |
| 746 bcs,a %xcc,.+8 | |
| 747 add c_3,t_2,c_3 | |
| 748 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); | |
| 749 addcc c_12,t_1,c_12 | |
| 750 bcs,a %xcc,.+8 | |
| 751 add c_3,t_2,c_3 | |
| 752 lduw bp(4),b_4 != | |
| 753 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
| 754 addcc c_12,t_1,c_12 | |
| 755 bcs,a %xcc,.+8 | |
| 756 add c_3,t_2,c_3 != | |
| 757 lduw bp(5),b_5 | |
| 758 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1); | |
| 759 addcc c_12,t_1,t_1 | |
| 760 bcs,a %xcc,.+8 != | |
| 761 add c_3,t_2,c_3 | |
| 762 srlx t_1,32,c_12 | |
| 763 stuw t_1,rp(4) !r[4]=c2; | |
| 764 or c_12,c_3,c_12 != | |
| 765 | |
| 766 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); | |
| 767 addcc c_12,t_1,c_12 | |
| 768 clr c_3 | |
| 769 bcs,a %xcc,.+8 != | |
| 770 add c_3,t_2,c_3 | |
| 771 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); | |
| 772 addcc c_12,t_1,c_12 | |
| 773 bcs,a %xcc,.+8 != | |
| 774 add c_3,t_2,c_3 | |
| 775 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
| 776 addcc c_12,t_1,c_12 | |
| 777 bcs,a %xcc,.+8 != | |
| 778 add c_3,t_2,c_3 | |
| 779 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
| 780 addcc c_12,t_1,c_12 | |
| 781 bcs,a %xcc,.+8 != | |
| 782 add c_3,t_2,c_3 | |
| 783 lduw ap(5),a_5 | |
| 784 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); | |
| 785 addcc c_12,t_1,c_12 != | |
| 786 bcs,a %xcc,.+8 | |
| 787 add c_3,t_2,c_3 | |
| 788 lduw ap(6),a_6 | |
| 789 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2); | |
| 790 addcc c_12,t_1,t_1 | |
| 791 bcs,a %xcc,.+8 | |
| 792 add c_3,t_2,c_3 | |
| 793 srlx t_1,32,c_12 != | |
| 794 stuw t_1,rp(5) !r[5]=c3; | |
| 795 or c_12,c_3,c_12 | |
| 796 | |
| 797 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); | |
| 798 addcc c_12,t_1,c_12 != | |
| 799 clr c_3 | |
| 800 bcs,a %xcc,.+8 | |
| 801 add c_3,t_2,c_3 | |
| 802 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); | |
| 803 addcc c_12,t_1,c_12 | |
| 804 bcs,a %xcc,.+8 | |
| 805 add c_3,t_2,c_3 | |
| 806 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3); | |
| 807 addcc c_12,t_1,c_12 | |
| 808 bcs,a %xcc,.+8 | |
| 809 add c_3,t_2,c_3 | |
| 810 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3); | |
| 811 addcc c_12,t_1,c_12 | |
| 812 bcs,a %xcc,.+8 | |
| 813 add c_3,t_2,c_3 | |
| 814 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3); | |
| 815 addcc c_12,t_1,c_12 | |
| 816 bcs,a %xcc,.+8 | |
| 817 add c_3,t_2,c_3 | |
| 818 lduw bp(6),b_6 != | |
| 819 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); | |
| 820 addcc c_12,t_1,c_12 | |
| 821 bcs,a %xcc,.+8 | |
| 822 add c_3,t_2,c_3 != | |
| 823 lduw bp(7),b_7 | |
| 824 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); | |
| 825 addcc c_12,t_1,t_1 | |
| 826 bcs,a %xcc,.+8 != | |
| 827 add c_3,t_2,c_3 | |
| 828 srlx t_1,32,c_12 | |
| 829 stuw t_1,rp(6) !r[6]=c1; | |
| 830 or c_12,c_3,c_12 != | |
| 831 | |
| 832 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); | |
| 833 addcc c_12,t_1,c_12 | |
| 834 clr c_3 | |
| 835 bcs,a %xcc,.+8 != | |
| 836 add c_3,t_2,c_3 | |
| 837 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); | |
| 838 addcc c_12,t_1,c_12 | |
| 839 bcs,a %xcc,.+8 != | |
| 840 add c_3,t_2,c_3 | |
| 841 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); | |
| 842 addcc c_12,t_1,c_12 | |
| 843 bcs,a %xcc,.+8 != | |
| 844 add c_3,t_2,c_3 | |
| 845 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1); | |
| 846 addcc c_12,t_1,c_12 | |
| 847 bcs,a %xcc,.+8 != | |
| 848 add c_3,t_2,c_3 | |
| 849 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); | |
| 850 addcc c_12,t_1,c_12 | |
| 851 bcs,a %xcc,.+8 != | |
| 852 add c_3,t_2,c_3 | |
| 853 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); | |
| 854 addcc c_12,t_1,c_12 | |
| 855 bcs,a %xcc,.+8 != | |
| 856 add c_3,t_2,c_3 | |
| 857 lduw ap(7),a_7 | |
| 858 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); | |
| 859 addcc c_12,t_1,c_12 | |
| 860 bcs,a %xcc,.+8 | |
| 861 add c_3,t_2,c_3 | |
| 862 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1); | |
| 863 addcc c_12,t_1,t_1 | |
| 864 bcs,a %xcc,.+8 | |
| 865 add c_3,t_2,c_3 | |
| 866 srlx t_1,32,c_12 != | |
| 867 stuw t_1,rp(7) !r[7]=c2; | |
| 868 or c_12,c_3,c_12 | |
| 869 | |
| 870 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2); | |
| 871 addcc c_12,t_1,c_12 | |
| 872 clr c_3 | |
| 873 bcs,a %xcc,.+8 | |
| 874 add c_3,t_2,c_3 != | |
| 875 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2); | |
| 876 addcc c_12,t_1,c_12 | |
| 877 bcs,a %xcc,.+8 | |
| 878 add c_3,t_2,c_3 != | |
| 879 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); | |
| 880 addcc c_12,t_1,c_12 | |
| 881 bcs,a %xcc,.+8 | |
| 882 add c_3,t_2,c_3 != | |
| 883 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); | |
| 884 addcc c_12,t_1,c_12 | |
| 885 bcs,a %xcc,.+8 | |
| 886 add c_3,t_2,c_3 != | |
| 887 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); | |
| 888 addcc c_12,t_1,c_12 | |
| 889 bcs,a %xcc,.+8 | |
| 890 add c_3,t_2,c_3 != | |
| 891 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2); | |
| 892 addcc c_12,t_1,c_12 | |
| 893 bcs,a %xcc,.+8 | |
| 894 add c_3,t_2,c_3 != | |
| 895 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); | |
| 896 addcc c_12,t_1,t_1 | |
| 897 bcs,a %xcc,.+8 | |
| 898 add c_3,t_2,c_3 != | |
| 899 srlx t_1,32,c_12 | |
| 900 stuw t_1,rp(8) !r[8]=c3; | |
| 901 or c_12,c_3,c_12 | |
| 902 | |
| 903 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3); | |
| 904 addcc c_12,t_1,c_12 | |
| 905 clr c_3 | |
| 906 bcs,a %xcc,.+8 | |
| 907 add c_3,t_2,c_3 != | |
| 908 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3); | |
| 909 addcc c_12,t_1,c_12 | |
| 910 bcs,a %xcc,.+8 != | |
| 911 add c_3,t_2,c_3 | |
| 912 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); | |
| 913 addcc c_12,t_1,c_12 | |
| 914 bcs,a %xcc,.+8 != | |
| 915 add c_3,t_2,c_3 | |
| 916 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); | |
| 917 addcc c_12,t_1,c_12 | |
| 918 bcs,a %xcc,.+8 != | |
| 919 add c_3,t_2,c_3 | |
| 920 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); | |
| 921 addcc c_12,t_1,c_12 | |
| 922 bcs,a %xcc,.+8 != | |
| 923 add c_3,t_2,c_3 | |
| 924 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3); | |
| 925 addcc c_12,t_1,t_1 | |
| 926 bcs,a %xcc,.+8 != | |
| 927 add c_3,t_2,c_3 | |
| 928 srlx t_1,32,c_12 | |
| 929 stuw t_1,rp(9) !r[9]=c1; | |
| 930 or c_12,c_3,c_12 != | |
| 931 | |
| 932 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); | |
| 933 addcc c_12,t_1,c_12 | |
| 934 clr c_3 | |
| 935 bcs,a %xcc,.+8 != | |
| 936 add c_3,t_2,c_3 | |
| 937 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); | |
| 938 addcc c_12,t_1,c_12 | |
| 939 bcs,a %xcc,.+8 != | |
| 940 add c_3,t_2,c_3 | |
| 941 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1); | |
| 942 addcc c_12,t_1,c_12 | |
| 943 bcs,a %xcc,.+8 != | |
| 944 add c_3,t_2,c_3 | |
| 945 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); | |
| 946 addcc c_12,t_1,c_12 | |
| 947 bcs,a %xcc,.+8 != | |
| 948 add c_3,t_2,c_3 | |
| 949 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); | |
| 950 addcc c_12,t_1,t_1 | |
| 951 bcs,a %xcc,.+8 != | |
| 952 add c_3,t_2,c_3 | |
| 953 srlx t_1,32,c_12 | |
| 954 stuw t_1,rp(10) !r[10]=c2; | |
| 955 or c_12,c_3,c_12 != | |
| 956 | |
| 957 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2); | |
| 958 addcc c_12,t_1,c_12 | |
| 959 clr c_3 | |
| 960 bcs,a %xcc,.+8 != | |
| 961 add c_3,t_2,c_3 | |
| 962 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); | |
| 963 addcc c_12,t_1,c_12 | |
| 964 bcs,a %xcc,.+8 != | |
| 965 add c_3,t_2,c_3 | |
| 966 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); | |
| 967 addcc c_12,t_1,c_12 | |
| 968 bcs,a %xcc,.+8 != | |
| 969 add c_3,t_2,c_3 | |
| 970 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); | |
| 971 addcc c_12,t_1,t_1 | |
| 972 bcs,a %xcc,.+8 != | |
| 973 add c_3,t_2,c_3 | |
| 974 srlx t_1,32,c_12 | |
| 975 stuw t_1,rp(11) !r[11]=c3; | |
| 976 or c_12,c_3,c_12 != | |
| 977 | |
| 978 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); | |
| 979 addcc c_12,t_1,c_12 | |
| 980 clr c_3 | |
| 981 bcs,a %xcc,.+8 != | |
| 982 add c_3,t_2,c_3 | |
| 983 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); | |
| 984 addcc c_12,t_1,c_12 | |
| 985 bcs,a %xcc,.+8 != | |
| 986 add c_3,t_2,c_3 | |
| 987 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); | |
| 988 addcc c_12,t_1,t_1 | |
| 989 bcs,a %xcc,.+8 != | |
| 990 add c_3,t_2,c_3 | |
| 991 srlx t_1,32,c_12 | |
| 992 stuw t_1,rp(12) !r[12]=c1; | |
| 993 or c_12,c_3,c_12 != | |
| 994 | |
| 995 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); | |
| 996 addcc c_12,t_1,c_12 | |
| 997 clr c_3 | |
| 998 bcs,a %xcc,.+8 != | |
| 999 add c_3,t_2,c_3 | |
| 1000 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); | |
| 1001 addcc c_12,t_1,t_1 | |
| 1002 bcs,a %xcc,.+8 != | |
| 1003 add c_3,t_2,c_3 | |
| 1004 srlx t_1,32,c_12 | |
| 1005 st t_1,rp(13) !r[13]=c2; | |
| 1006 or c_12,c_3,c_12 != | |
| 1007 | |
| 1008 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2); | |
| 1009 addcc c_12,t_1,t_1 | |
| 1010 srlx t_1,32,c_12 != | |
| 1011 stuw t_1,rp(14) !r[14]=c3; | |
| 1012 stuw c_12,rp(15) !r[15]=c1; | |
| 1013 | |
| 1014 ret | |
| 1015 restore %g0,%g0,%o0 != | |
| 1016 | |
| 1017 .type bn_mul_comba8,#function | |
| 1018 .size bn_mul_comba8,(.-bn_mul_comba8) | |
| 1019 | |
| 1020 .align 32 | |
| 1021 | |
| 1022 .global bn_mul_comba4 | |
| 1023 /* | |
| 1024 * void bn_mul_comba4(r,a,b) | |
| 1025 * BN_ULONG *r,*a,*b; | |
| 1026 */ | |
/*
 * Fully unrolled 4x4 word multiplication. Reads a[0..3] and b[0..3]
 * and stores the eight result words r[0..7], one result column per
 * group of instructions below.
 *
 * Register roles as used in this routine. The symbolic register names,
 * FRAME_SIZE and the ap, bp and rp accessors are defined outside this
 * part of the file, so their expansion is assumed here, not shown.
 *   t_1   64-bit product written by mulx, also the final column sum
 *   t_2   constant 1<<32, the unit added to c_3 for each carry out
 *   c_12  running 64-bit accumulator, its low word is the next r[k]
 *   c_3   carries out of the accumulator for the current column, kept
 *         at bit 32 and above so it can be merged back in with or
 *
 * Pattern for each term, matching the mul_add_c notes on the mulx lines:
 *   mulx, then addcc into c_12 (into t_1 for the last term of a column),
 *   then bcs,a with the add in its annulled delay slot, so that c_3
 *   grows by t_2 only when the 64-bit addition carried.
 * End of each column: srlx keeps the upper half of the sum as the new
 * accumulator, stuw stores the low word to r[k], or folds c_3 back in.
 * c_3 is cleared once per column, between the first addcc and its
 * carry branch.
 *
 * Loads of operands needed by later terms are interleaved with the
 * arithmetic. The restore in the delay slot of ret writes 0 to the
 * o0 register of the caller.
 */
| 1027 bn_mul_comba4: | |
/* New frame, then build t_2 = 1<<32 while the first operands load. */
| 1028 save %sp,FRAME_SIZE,%sp | |
| 1029 lduw ap(0),a_0 | |
| 1030 mov 1,t_2 | |
| 1031 lduw bp(0),b_0 | |
| 1032 sllx t_2,32,t_2 != | |
| 1033 lduw bp(1),b_1 | |
/* Column 0: a0*b0. Low word goes to r[0], high word seeds c_12. */
| 1034 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | |
| 1035 srlx t_1,32,c_12 | |
| 1036 stuw t_1,rp(0) !=!r[0]=c1; | |
| 1037 | |
/* Column 1: a0*b1 + a1*b0. a[2] is loaded for the next column. */
| 1038 lduw ap(1),a_1 | |
| 1039 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | |
| 1040 addcc c_12,t_1,c_12 | |
| 1041 clr c_3 != | |
| 1042 bcs,a %xcc,.+8 | |
| 1043 add c_3,t_2,c_3 | |
| 1044 lduw ap(2),a_2 | |
| 1045 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
| 1046 addcc c_12,t_1,t_1 | |
| 1047 bcs,a %xcc,.+8 | |
| 1048 add c_3,t_2,c_3 | |
| 1049 srlx t_1,32,c_12 != | |
| 1050 stuw t_1,rp(1) !r[1]=c2; | |
| 1051 or c_12,c_3,c_12 | |
| 1052 | |
/* Column 2: a2*b0 + a1*b1 + a0*b2. b[2] and b[3] are loaded on the way. */
| 1053 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
| 1054 addcc c_12,t_1,c_12 != | |
| 1055 clr c_3 | |
| 1056 bcs,a %xcc,.+8 | |
| 1057 add c_3,t_2,c_3 | |
| 1058 lduw bp(2),b_2 != | |
| 1059 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
| 1060 addcc c_12,t_1,c_12 | |
| 1061 bcs,a %xcc,.+8 | |
| 1062 add c_3,t_2,c_3 != | |
| 1063 lduw bp(3),b_3 | |
| 1064 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
| 1065 addcc c_12,t_1,t_1 | |
| 1066 bcs,a %xcc,.+8 != | |
| 1067 add c_3,t_2,c_3 | |
| 1068 srlx t_1,32,c_12 | |
| 1069 stuw t_1,rp(2) !r[2]=c3; | |
| 1070 or c_12,c_3,c_12 != | |
| 1071 | |
/* Column 3: a0*b3 + a1*b2 + a2*b1 + a3*b0. a[3] is loaded before its use. */
| 1072 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
| 1073 addcc c_12,t_1,c_12 | |
| 1074 clr c_3 | |
| 1075 bcs,a %xcc,.+8 != | |
| 1076 add c_3,t_2,c_3 | |
| 1077 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); | |
| 1078 addcc c_12,t_1,c_12 | |
| 1079 bcs,a %xcc,.+8 != | |
| 1080 add c_3,t_2,c_3 | |
| 1081 lduw ap(3),a_3 | |
| 1082 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
| 1083 addcc c_12,t_1,c_12 != | |
| 1084 bcs,a %xcc,.+8 | |
| 1085 add c_3,t_2,c_3 | |
| 1086 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= | |
| 1087 addcc c_12,t_1,t_1 != | |
| 1088 bcs,a %xcc,.+8 | |
| 1089 add c_3,t_2,c_3 | |
| 1090 srlx t_1,32,c_12 | |
| 1091 stuw t_1,rp(3) !=!r[3]=c1; | |
| 1092 or c_12,c_3,c_12 | |
| 1093 | |
/* Column 4: a3*b1 + a2*b2 + a1*b3. */
| 1094 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | |
| 1095 addcc c_12,t_1,c_12 | |
| 1096 clr c_3 != | |
| 1097 bcs,a %xcc,.+8 | |
| 1098 add c_3,t_2,c_3 | |
| 1099 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); | |
| 1100 addcc c_12,t_1,c_12 != | |
| 1101 bcs,a %xcc,.+8 | |
| 1102 add c_3,t_2,c_3 | |
| 1103 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
| 1104 addcc c_12,t_1,t_1 != | |
| 1105 bcs,a %xcc,.+8 | |
| 1106 add c_3,t_2,c_3 | |
| 1107 srlx t_1,32,c_12 | |
| 1108 stuw t_1,rp(4) !=!r[4]=c2; | |
| 1109 or c_12,c_3,c_12 | |
| 1110 | |
/* Column 5: a2*b3 + a3*b2. */
| 1111 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
| 1112 addcc c_12,t_1,c_12 | |
| 1113 clr c_3 != | |
| 1114 bcs,a %xcc,.+8 | |
| 1115 add c_3,t_2,c_3 | |
| 1116 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
| 1117 addcc c_12,t_1,t_1 != | |
| 1118 bcs,a %xcc,.+8 | |
| 1119 add c_3,t_2,c_3 | |
| 1120 srlx t_1,32,c_12 | |
| 1121 stuw t_1,rp(5) !=!r[5]=c3; | |
| 1122 or c_12,c_3,c_12 | |
| 1123 | |
/* Column 6: a3*b3. No carry branch follows this addcc. The low word goes to r[6], the high word to r[7]. */
| 1124 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | |
| 1125 addcc c_12,t_1,t_1 | |
| 1126 srlx t_1,32,c_12 != | |
| 1127 stuw t_1,rp(6) !r[6]=c1; | |
| 1128 stuw c_12,rp(7) !r[7]=c2; | |
| 1129 | |
/* Return. The restore in the delay slot leaves 0 in the o0 register of the caller. */
| 1130 ret | |
| 1131 restore %g0,%g0,%o0 | |
| 1132 | |
| 1133 .type bn_mul_comba4,#function | |
| 1134 .size bn_mul_comba4,(.-bn_mul_comba4) | |
| 1135 | |
| 1136 .align 32 | |
| 1137 | |
| 1138 .global bn_sqr_comba8 | |
/*
 * Fully unrolled squaring of an 8-word operand. Reads a[0..7] through
 * ap and stores the sixteen result words r[0..15] through rp.
 * The C-level prototype is not shown in this part of the file.
 * Presumably it is void bn_sqr_comba8(r,a) with BN_ULONG pointers,
 * like the documented bn_sqr_comba4 - TODO confirm.
 *
 * Register roles as used in this routine. The symbolic register names,
 * FRAME_SIZE and the ap and rp accessors are defined outside this part
 * of the file, so their expansion is assumed here, not shown.
 *   t_1   64-bit product written by mulx, also the final column sum
 *   t_2   constant 1<<32, the unit added to c_3 for each carry out
 *   c_12  running 64-bit accumulator, its low word is the next r[k]
 *   c_3   carries out of the accumulator for the current column, kept
 *         at bit 32 and above so it can be merged back in with or
 *
 * Terms annotated sqr_add_c2 are cross products that count twice. The
 * product is computed once and added twice, and each of the two addcc
 * instructions has its own carry branch. Terms annotated sqr_add_c are
 * squares and are added once.
 * Carry branch: bcs,a with the add in its annulled delay slot, so that
 * c_3 grows by t_2 only when the 64-bit addition carried.
 * End of each column: the last addcc writes t_1, srlx keeps the upper
 * half of the sum as the new accumulator, the low word is stored to
 * r[k], and or folds c_3 back in.
 *
 * Loads of a[k] needed by later columns are interleaved with the
 * arithmetic. The restore in the delay slot of ret writes 0 to the
 * o0 register of the caller.
 */
| 1139 bn_sqr_comba8: | |
/* New frame, then build t_2 = 1<<32 while a[0] and a[1] load. */
| 1140 save %sp,FRAME_SIZE,%sp | |
| 1141 mov 1,t_2 | |
| 1142 lduw ap(0),a_0 | |
| 1143 sllx t_2,32,t_2 | |
| 1144 lduw ap(1),a_1 | |
/* Column 0: a0*a0. Low word goes to r[0], high word seeds c_12. */
| 1145 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | |
| 1146 srlx t_1,32,c_12 | |
| 1147 stuw t_1,rp(0) !r[0]=c1; | |
| 1148 | |
/* Column 1: 2*a0*a1. */
| 1149 lduw ap(2),a_2 | |
| 1150 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); | |
| 1151 addcc c_12,t_1,c_12 | |
| 1152 clr c_3 | |
| 1153 bcs,a %xcc,.+8 | |
| 1154 add c_3,t_2,c_3 | |
| 1155 addcc c_12,t_1,t_1 | |
| 1156 bcs,a %xcc,.+8 | |
| 1157 add c_3,t_2,c_3 | |
| 1158 srlx t_1,32,c_12 | |
| 1159 stuw t_1,rp(1) !r[1]=c2; | |
| 1160 or c_12,c_3,c_12 | |
| 1161 | |
/* Column 2: 2*a2*a0 + a1*a1. a[3] is loaded for the next column. */
| 1162 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
| 1163 addcc c_12,t_1,c_12 | |
| 1164 clr c_3 | |
| 1165 bcs,a %xcc,.+8 | |
| 1166 add c_3,t_2,c_3 | |
| 1167 addcc c_12,t_1,c_12 | |
| 1168 bcs,a %xcc,.+8 | |
| 1169 add c_3,t_2,c_3 | |
| 1170 lduw ap(3),a_3 | |
| 1171 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
| 1172 addcc c_12,t_1,t_1 | |
| 1173 bcs,a %xcc,.+8 | |
| 1174 add c_3,t_2,c_3 | |
| 1175 srlx t_1,32,c_12 | |
| 1176 stuw t_1,rp(2) !r[2]=c3; | |
| 1177 or c_12,c_3,c_12 | |
| 1178 | |
/* Column 3: 2*a0*a3 + 2*a1*a2. a[4] is loaded for the next column. */
| 1179 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | |
| 1180 addcc c_12,t_1,c_12 | |
| 1181 clr c_3 | |
| 1182 bcs,a %xcc,.+8 | |
| 1183 add c_3,t_2,c_3 | |
| 1184 addcc c_12,t_1,c_12 | |
| 1185 bcs,a %xcc,.+8 | |
| 1186 add c_3,t_2,c_3 | |
| 1187 lduw ap(4),a_4 | |
| 1188 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
| 1189 addcc c_12,t_1,c_12 | |
| 1190 bcs,a %xcc,.+8 | |
| 1191 add c_3,t_2,c_3 | |
| 1192 addcc c_12,t_1,t_1 | |
| 1193 bcs,a %xcc,.+8 | |
| 1194 add c_3,t_2,c_3 | |
| 1195 srlx t_1,32,c_12 | |
/* NOTE(review): st is used here while every other result store in this routine uses stuw. Presumably the same store-word operation - confirm against the SPARC V9 manual. */
| 1196 st t_1,rp(3) !r[3]=c1; | |
| 1197 or c_12,c_3,c_12 | |
| 1198 | |
/* Column 4: 2*a4*a0 + 2*a3*a1 + a2*a2. a[5] is loaded for the next column. */
| 1199 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); | |
| 1200 addcc c_12,t_1,c_12 | |
| 1201 clr c_3 | |
| 1202 bcs,a %xcc,.+8 | |
| 1203 add c_3,t_2,c_3 | |
| 1204 addcc c_12,t_1,c_12 | |
| 1205 bcs,a %xcc,.+8 | |
| 1206 add c_3,t_2,c_3 | |
| 1207 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
| 1208 addcc c_12,t_1,c_12 | |
| 1209 bcs,a %xcc,.+8 | |
| 1210 add c_3,t_2,c_3 | |
| 1211 addcc c_12,t_1,c_12 | |
| 1212 bcs,a %xcc,.+8 | |
| 1213 add c_3,t_2,c_3 | |
| 1214 lduw ap(5),a_5 | |
| 1215 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
| 1216 addcc c_12,t_1,t_1 | |
| 1217 bcs,a %xcc,.+8 | |
| 1218 add c_3,t_2,c_3 | |
| 1219 srlx t_1,32,c_12 | |
| 1220 stuw t_1,rp(4) !r[4]=c2; | |
| 1221 or c_12,c_3,c_12 | |
| 1222 | |
/* Column 5: 2*a0*a5 + 2*a1*a4 + 2*a2*a3. a[6] is loaded for the next column. */
| 1223 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); | |
| 1224 addcc c_12,t_1,c_12 | |
| 1225 clr c_3 | |
| 1226 bcs,a %xcc,.+8 | |
| 1227 add c_3,t_2,c_3 | |
| 1228 addcc c_12,t_1,c_12 | |
| 1229 bcs,a %xcc,.+8 | |
| 1230 add c_3,t_2,c_3 | |
| 1231 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); | |
| 1232 addcc c_12,t_1,c_12 | |
| 1233 bcs,a %xcc,.+8 | |
| 1234 add c_3,t_2,c_3 | |
| 1235 addcc c_12,t_1,c_12 | |
| 1236 bcs,a %xcc,.+8 | |
| 1237 add c_3,t_2,c_3 | |
| 1238 lduw ap(6),a_6 | |
| 1239 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
| 1240 addcc c_12,t_1,c_12 | |
| 1241 bcs,a %xcc,.+8 | |
| 1242 add c_3,t_2,c_3 | |
| 1243 addcc c_12,t_1,t_1 | |
| 1244 bcs,a %xcc,.+8 | |
| 1245 add c_3,t_2,c_3 | |
| 1246 srlx t_1,32,c_12 | |
| 1247 stuw t_1,rp(5) !r[5]=c3; | |
| 1248 or c_12,c_3,c_12 | |
| 1249 | |
/* Column 6: 2*a6*a0 + 2*a5*a1 + 2*a4*a2 + a3*a3. a[7] is loaded for the next column. */
| 1250 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); | |
| 1251 addcc c_12,t_1,c_12 | |
| 1252 clr c_3 | |
| 1253 bcs,a %xcc,.+8 | |
| 1254 add c_3,t_2,c_3 | |
| 1255 addcc c_12,t_1,c_12 | |
| 1256 bcs,a %xcc,.+8 | |
| 1257 add c_3,t_2,c_3 | |
| 1258 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); | |
| 1259 addcc c_12,t_1,c_12 | |
| 1260 bcs,a %xcc,.+8 | |
| 1261 add c_3,t_2,c_3 | |
| 1262 addcc c_12,t_1,c_12 | |
| 1263 bcs,a %xcc,.+8 | |
| 1264 add c_3,t_2,c_3 | |
| 1265 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); | |
| 1266 addcc c_12,t_1,c_12 | |
| 1267 bcs,a %xcc,.+8 | |
| 1268 add c_3,t_2,c_3 | |
| 1269 addcc c_12,t_1,c_12 | |
| 1270 bcs,a %xcc,.+8 | |
| 1271 add c_3,t_2,c_3 | |
| 1272 lduw ap(7),a_7 | |
| 1273 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); | |
| 1274 addcc c_12,t_1,t_1 | |
| 1275 bcs,a %xcc,.+8 | |
| 1276 add c_3,t_2,c_3 | |
| 1277 srlx t_1,32,c_12 | |
| 1278 stuw t_1,rp(6) !r[6]=c1; | |
| 1279 or c_12,c_3,c_12 | |
| 1280 | |
/* Column 7: 2*a0*a7 + 2*a1*a6 + 2*a2*a5 + 2*a3*a4. All operands are loaded from here on. */
| 1281 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); | |
| 1282 addcc c_12,t_1,c_12 | |
| 1283 clr c_3 | |
| 1284 bcs,a %xcc,.+8 | |
| 1285 add c_3,t_2,c_3 | |
| 1286 addcc c_12,t_1,c_12 | |
| 1287 bcs,a %xcc,.+8 | |
| 1288 add c_3,t_2,c_3 | |
| 1289 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); | |
| 1290 addcc c_12,t_1,c_12 | |
| 1291 bcs,a %xcc,.+8 | |
| 1292 add c_3,t_2,c_3 | |
| 1293 addcc c_12,t_1,c_12 | |
| 1294 bcs,a %xcc,.+8 | |
| 1295 add c_3,t_2,c_3 | |
| 1296 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); | |
| 1297 addcc c_12,t_1,c_12 | |
| 1298 bcs,a %xcc,.+8 | |
| 1299 add c_3,t_2,c_3 | |
| 1300 addcc c_12,t_1,c_12 | |
| 1301 bcs,a %xcc,.+8 | |
| 1302 add c_3,t_2,c_3 | |
| 1303 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); | |
| 1304 addcc c_12,t_1,c_12 | |
| 1305 bcs,a %xcc,.+8 | |
| 1306 add c_3,t_2,c_3 | |
| 1307 addcc c_12,t_1,t_1 | |
| 1308 bcs,a %xcc,.+8 | |
| 1309 add c_3,t_2,c_3 | |
| 1310 srlx t_1,32,c_12 | |
| 1311 stuw t_1,rp(7) !r[7]=c2; | |
| 1312 or c_12,c_3,c_12 | |
| 1313 | |
/* Column 8: 2*a7*a1 + 2*a6*a2 + 2*a5*a3 + a4*a4. */
| 1314 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); | |
| 1315 addcc c_12,t_1,c_12 | |
| 1316 clr c_3 | |
| 1317 bcs,a %xcc,.+8 | |
| 1318 add c_3,t_2,c_3 | |
| 1319 addcc c_12,t_1,c_12 | |
| 1320 bcs,a %xcc,.+8 | |
| 1321 add c_3,t_2,c_3 | |
| 1322 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); | |
| 1323 addcc c_12,t_1,c_12 | |
| 1324 bcs,a %xcc,.+8 | |
| 1325 add c_3,t_2,c_3 | |
| 1326 addcc c_12,t_1,c_12 | |
| 1327 bcs,a %xcc,.+8 | |
| 1328 add c_3,t_2,c_3 | |
| 1329 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); | |
| 1330 addcc c_12,t_1,c_12 | |
| 1331 bcs,a %xcc,.+8 | |
| 1332 add c_3,t_2,c_3 | |
| 1333 addcc c_12,t_1,c_12 | |
| 1334 bcs,a %xcc,.+8 | |
| 1335 add c_3,t_2,c_3 | |
| 1336 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); | |
| 1337 addcc c_12,t_1,t_1 | |
| 1338 bcs,a %xcc,.+8 | |
| 1339 add c_3,t_2,c_3 | |
| 1340 srlx t_1,32,c_12 | |
| 1341 stuw t_1,rp(8) !r[8]=c3; | |
| 1342 or c_12,c_3,c_12 | |
| 1343 | |
/* Column 9: 2*a2*a7 + 2*a3*a6 + 2*a4*a5. */
| 1344 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); | |
| 1345 addcc c_12,t_1,c_12 | |
| 1346 clr c_3 | |
| 1347 bcs,a %xcc,.+8 | |
| 1348 add c_3,t_2,c_3 | |
| 1349 addcc c_12,t_1,c_12 | |
| 1350 bcs,a %xcc,.+8 | |
| 1351 add c_3,t_2,c_3 | |
| 1352 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); | |
| 1353 addcc c_12,t_1,c_12 | |
| 1354 bcs,a %xcc,.+8 | |
| 1355 add c_3,t_2,c_3 | |
| 1356 addcc c_12,t_1,c_12 | |
| 1357 bcs,a %xcc,.+8 | |
| 1358 add c_3,t_2,c_3 | |
| 1359 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); | |
| 1360 addcc c_12,t_1,c_12 | |
| 1361 bcs,a %xcc,.+8 | |
| 1362 add c_3,t_2,c_3 | |
| 1363 addcc c_12,t_1,t_1 | |
| 1364 bcs,a %xcc,.+8 | |
| 1365 add c_3,t_2,c_3 | |
| 1366 srlx t_1,32,c_12 | |
| 1367 stuw t_1,rp(9) !r[9]=c1; | |
| 1368 or c_12,c_3,c_12 | |
| 1369 | |
/* Column 10: 2*a7*a3 + 2*a6*a4 + a5*a5. */
| 1370 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1); | |
| 1371 addcc c_12,t_1,c_12 | |
| 1372 clr c_3 | |
| 1373 bcs,a %xcc,.+8 | |
| 1374 add c_3,t_2,c_3 | |
| 1375 addcc c_12,t_1,c_12 | |
| 1376 bcs,a %xcc,.+8 | |
| 1377 add c_3,t_2,c_3 | |
| 1378 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); | |
| 1379 addcc c_12,t_1,c_12 | |
| 1380 bcs,a %xcc,.+8 | |
| 1381 add c_3,t_2,c_3 | |
| 1382 addcc c_12,t_1,c_12 | |
| 1383 bcs,a %xcc,.+8 | |
| 1384 add c_3,t_2,c_3 | |
| 1385 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); | |
| 1386 addcc c_12,t_1,t_1 | |
| 1387 bcs,a %xcc,.+8 | |
| 1388 add c_3,t_2,c_3 | |
| 1389 srlx t_1,32,c_12 | |
| 1390 stuw t_1,rp(10) !r[10]=c2; | |
| 1391 or c_12,c_3,c_12 | |
| 1392 | |
/* Column 11: 2*a4*a7 + 2*a5*a6. */
| 1393 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2); | |
| 1394 addcc c_12,t_1,c_12 | |
| 1395 clr c_3 | |
| 1396 bcs,a %xcc,.+8 | |
| 1397 add c_3,t_2,c_3 | |
| 1398 addcc c_12,t_1,c_12 | |
| 1399 bcs,a %xcc,.+8 | |
| 1400 add c_3,t_2,c_3 | |
| 1401 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2); | |
| 1402 addcc c_12,t_1,c_12 | |
| 1403 bcs,a %xcc,.+8 | |
| 1404 add c_3,t_2,c_3 | |
| 1405 addcc c_12,t_1,t_1 | |
| 1406 bcs,a %xcc,.+8 | |
| 1407 add c_3,t_2,c_3 | |
| 1408 srlx t_1,32,c_12 | |
| 1409 stuw t_1,rp(11) !r[11]=c3; | |
| 1410 or c_12,c_3,c_12 | |
| 1411 | |
/* Column 12: 2*a7*a5 + a6*a6. */
| 1412 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); | |
| 1413 addcc c_12,t_1,c_12 | |
| 1414 clr c_3 | |
| 1415 bcs,a %xcc,.+8 | |
| 1416 add c_3,t_2,c_3 | |
| 1417 addcc c_12,t_1,c_12 | |
| 1418 bcs,a %xcc,.+8 | |
| 1419 add c_3,t_2,c_3 | |
| 1420 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); | |
| 1421 addcc c_12,t_1,t_1 | |
| 1422 bcs,a %xcc,.+8 | |
| 1423 add c_3,t_2,c_3 | |
| 1424 srlx t_1,32,c_12 | |
| 1425 stuw t_1,rp(12) !r[12]=c1; | |
| 1426 or c_12,c_3,c_12 | |
| 1427 | |
/* Column 13: 2*a6*a7. */
| 1428 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); | |
| 1429 addcc c_12,t_1,c_12 | |
| 1430 clr c_3 | |
| 1431 bcs,a %xcc,.+8 | |
| 1432 add c_3,t_2,c_3 | |
| 1433 addcc c_12,t_1,t_1 | |
| 1434 bcs,a %xcc,.+8 | |
| 1435 add c_3,t_2,c_3 | |
| 1436 srlx t_1,32,c_12 | |
| 1437 stuw t_1,rp(13) !r[13]=c2; | |
| 1438 or c_12,c_3,c_12 | |
| 1439 | |
/* Column 14: a7*a7. No carry branch follows this addcc. The low word goes to r[14], the high word to r[15]. */
| 1440 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); | |
| 1441 addcc c_12,t_1,t_1 | |
| 1442 srlx t_1,32,c_12 | |
| 1443 stuw t_1,rp(14) !r[14]=c3; | |
| 1444 stuw c_12,rp(15) !r[15]=c1; | |
| 1445 | |
/* Return. The restore in the delay slot leaves 0 in the o0 register of the caller. */
| 1446 ret | |
| 1447 restore %g0,%g0,%o0 | |
| 1448 | |
| 1449 .type bn_sqr_comba8,#function | |
| 1450 .size bn_sqr_comba8,(.-bn_sqr_comba8) | |
| 1451 | |
| 1452 .align 32 | |
| 1453 | |
| 1454 .global bn_sqr_comba4 | |
| 1455 /* | |
| 1456 * void bn_sqr_comba4(r,a) | |
| 1457 * BN_ULONG *r,*a; | |
| 1458 */ | |
/*
 * Fully unrolled squaring of a 4-word operand. Reads a[0..3] through
 * ap and stores the eight result words r[0..7] through rp.
 *
 * Register roles as used in this routine. The symbolic register names,
 * FRAME_SIZE and the ap and rp accessors are defined outside this part
 * of the file, so their expansion is assumed here, not shown.
 *   t_1   64-bit product written by mulx, also the final column sum
 *   t_2   constant 1<<32, the unit added to c_3 for each carry out
 *   c_12  running 64-bit accumulator, its low word is the next r[k]
 *   c_3   carries out of the accumulator for the current column, kept
 *         at bit 32 and above so it can be merged back in with or
 *
 * Terms annotated sqr_add_c2 are cross products that count twice. The
 * product is computed once and added twice, and each of the two addcc
 * instructions has its own carry branch. Terms annotated sqr_add_c are
 * squares and are added once.
 * Carry branch: bcs,a with the add in its annulled delay slot, so that
 * c_3 grows by t_2 only when the 64-bit addition carried.
 * End of each column: the last addcc writes t_1, srlx keeps the upper
 * half of the sum as the new accumulator, stuw stores the low word to
 * r[k], and or folds c_3 back in.
 *
 * The restore in the delay slot of ret writes 0 to the o0 register of
 * the caller.
 */
| 1459 bn_sqr_comba4: | |
/* New frame, then build t_2 = 1<<32 while a[0] and a[1] load. */
| 1460 save %sp,FRAME_SIZE,%sp | |
| 1461 mov 1,t_2 | |
| 1462 lduw ap(0),a_0 | |
| 1463 sllx t_2,32,t_2 | |
| 1464 lduw ap(1),a_1 | |
/* Column 0: a0*a0. Low word goes to r[0], high word seeds c_12. */
| 1465 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | |
| 1466 srlx t_1,32,c_12 | |
| 1467 stuw t_1,rp(0) !r[0]=c1; | |
| 1468 | |
/* Column 1: 2*a0*a1. a[2] is loaded for the next column. */
| 1469 lduw ap(2),a_2 | |
| 1470 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1); | |
| 1471 addcc c_12,t_1,c_12 | |
| 1472 clr c_3 | |
| 1473 bcs,a %xcc,.+8 | |
| 1474 add c_3,t_2,c_3 | |
| 1475 addcc c_12,t_1,t_1 | |
| 1476 bcs,a %xcc,.+8 | |
| 1477 add c_3,t_2,c_3 | |
| 1478 srlx t_1,32,c_12 | |
| 1479 stuw t_1,rp(1) !r[1]=c2; | |
| 1480 or c_12,c_3,c_12 | |
| 1481 | |
/* Column 2: 2*a2*a0 + a1*a1. a[3] is loaded for the next column. */
| 1482 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
| 1483 addcc c_12,t_1,c_12 | |
| 1484 clr c_3 | |
| 1485 bcs,a %xcc,.+8 | |
| 1486 add c_3,t_2,c_3 | |
| 1487 addcc c_12,t_1,c_12 | |
| 1488 bcs,a %xcc,.+8 | |
| 1489 add c_3,t_2,c_3 | |
| 1490 lduw ap(3),a_3 | |
| 1491 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
| 1492 addcc c_12,t_1,t_1 | |
| 1493 bcs,a %xcc,.+8 | |
| 1494 add c_3,t_2,c_3 | |
| 1495 srlx t_1,32,c_12 | |
| 1496 stuw t_1,rp(2) !r[2]=c3; | |
| 1497 or c_12,c_3,c_12 | |
| 1498 | |
/* Column 3: 2*a0*a3 + 2*a1*a2. */
| 1499 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | |
| 1500 addcc c_12,t_1,c_12 | |
| 1501 clr c_3 | |
| 1502 bcs,a %xcc,.+8 | |
| 1503 add c_3,t_2,c_3 | |
| 1504 addcc c_12,t_1,c_12 | |
| 1505 bcs,a %xcc,.+8 | |
| 1506 add c_3,t_2,c_3 | |
| 1507 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
| 1508 addcc c_12,t_1,c_12 | |
| 1509 bcs,a %xcc,.+8 | |
| 1510 add c_3,t_2,c_3 | |
| 1511 addcc c_12,t_1,t_1 | |
| 1512 bcs,a %xcc,.+8 | |
| 1513 add c_3,t_2,c_3 | |
| 1514 srlx t_1,32,c_12 | |
| 1515 stuw t_1,rp(3) !r[3]=c1; | |
| 1516 or c_12,c_3,c_12 | |
| 1517 | |
/* Column 4: 2*a3*a1 + a2*a2. */
| 1518 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
| 1519 addcc c_12,t_1,c_12 | |
| 1520 clr c_3 | |
| 1521 bcs,a %xcc,.+8 | |
| 1522 add c_3,t_2,c_3 | |
| 1523 addcc c_12,t_1,c_12 | |
| 1524 bcs,a %xcc,.+8 | |
| 1525 add c_3,t_2,c_3 | |
| 1526 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
| 1527 addcc c_12,t_1,t_1 | |
| 1528 bcs,a %xcc,.+8 | |
| 1529 add c_3,t_2,c_3 | |
| 1530 srlx t_1,32,c_12 | |
| 1531 stuw t_1,rp(4) !r[4]=c2; | |
| 1532 or c_12,c_3,c_12 | |
| 1533 | |
/* Column 5: 2*a2*a3. */
| 1534 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
| 1535 addcc c_12,t_1,c_12 | |
| 1536 clr c_3 | |
| 1537 bcs,a %xcc,.+8 | |
| 1538 add c_3,t_2,c_3 | |
| 1539 addcc c_12,t_1,t_1 | |
| 1540 bcs,a %xcc,.+8 | |
| 1541 add c_3,t_2,c_3 | |
| 1542 srlx t_1,32,c_12 | |
| 1543 stuw t_1,rp(5) !r[5]=c3; | |
| 1544 or c_12,c_3,c_12 | |
| 1545 | |
/* Column 6: a3*a3. No carry branch follows this addcc. The low word goes to r[6], the high word to r[7]. */
| 1546 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); | |
| 1547 addcc c_12,t_1,t_1 | |
| 1548 srlx t_1,32,c_12 | |
| 1549 stuw t_1,rp(6) !r[6]=c1; | |
| 1550 stuw c_12,rp(7) !r[7]=c2; | |
| 1551 | |
/* Return. The restore in the delay slot leaves 0 in the o0 register of the caller. */
| 1552 ret | |
| 1553 restore %g0,%g0,%o0 | |
| 1554 | |
| 1555 .type bn_sqr_comba4,#function | |
| 1556 .size bn_sqr_comba4,(.-bn_sqr_comba4) | |
| 1557 | |
| 1558 .align 32 | |
| OLD | NEW |