1 .ident "sparcv8plus.s, Version 1.4" | |
2 .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | |
3 | |
4 /* | |
5 * ==================================================================== | |
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
7 * project. | |
8 * | |
9 * Rights for redistribution and usage in source and binary forms are | |
10 * granted according to the OpenSSL license. Warranty of any kind is | |
11 * disclaimed. | |
12 * ==================================================================== | |
13 */ | |
14 | |
15 /* | |
 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is | |
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c | |
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/. | |
20 * | |
21 * Questions-n-answers. | |
22 * | |
23 * Q. How to compile? | |
24 * A. With SC4.x/SC5.x: | |
25 * | |
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
27 * | |
28 * and with gcc: | |
29 * | |
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
31 * | |
 * or, if the above fails (it does if you have gas installed):
33 * | |
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o | |
35 * | |
36 * Quick-n-dirty way to fuse the module into the library. | |
37 * Provided that the library is already configured and built | |
 * (in the 0.9.2 case, with the no-asm option):
39 * | |
40 * # cd crypto/bn | |
41 * # cp /some/place/bn_asm.sparc.v8plus.S . | |
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
43 * # make | |
44 * # cd ../.. | |
45 * # make; make test | |
46 * | |
47 * Quick-n-dirty way to get rid of it: | |
48 * | |
49 * # cd crypto/bn | |
50 * # touch bn_asm.c | |
51 * # make | |
52 * # cd ../.. | |
53 * # make; make test | |
54 * | |
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually a v9-compliant CPU, i.e. *any* UltraSPARC, run
 *    under special conditions, namely when the kernel doesn't
 *    preserve the upper 32 bits of the otherwise 64-bit registers
 *    during a context switch.
60 * | |
61 * Q. Why just UltraSPARC? What about SuperSPARC? | |
 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
 *    version is provided alongside. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the
 *    code for an explanation). But what's so special about this
 *    UltraSPARC implementation? Why didn't I let the compiler do
 *    the job? The trouble is that most of the available compilers
 *    (well, SC5.0 is the only exception) don't attempt to take
 *    advantage of UltraSPARC's 64-bitness under 32-bit kernels,
 *    even though it's perfectly possible (see the next question).
71 * | |
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it | |
73 * doesn't work? | |
 * A. You can't address *all* registers as 64-bit wide:-( The catch
 *    is that you may actually rely upon %o0-%o5 and %g1-%g4 being
 *    fully preserved if you're in a leaf function, i.e. one that
 *    never calls any other function. All functions in this module
 *    are leaf, and 10 registers is a handful. As a matter of fact,
 *    the non-"comba" routines don't require even that much, and I
 *    could even afford not to allocate a stack frame of their own
 *    for 'em:-)
81 * | |
82 * Q. What about 64-bit kernels? | |
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
85 * | |
86 * Q. What about shared libraries? | |
 * A. What about 'em? Kidding again:-) The code does *not* contain
 *    any position dependencies and it's safe to include it in a
 *    shared library as is.
90 * | |
91 * Q. How much faster does it go? | |
 * A. Do you have a good benchmark? In any case, below is what I
 *    experience with the crypto/bn/expspeed.c test program:
94 * | |
95 * v8plus module on U10/300MHz against bn_asm.c compiled with: | |
96 * | |
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12% | |
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35% | |
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45% | |
100 * | |
101 * v8 module on SS10/60MHz against bn_asm.c compiled with: | |
102 * | |
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10% | |
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10% | |
105 * egcs-1.1.2 -mv8 -O3 +35-45% | |
106 * | |
 * As you can see, it's damn hard to beat the new Sun C compiler,
 * and it's first and foremost GNU C users who will appreciate
 * this assembler implementation:-)
110 */ | |
111 | |
112 /* | |
113 * Revision history. | |
114 * | |
115 * 1.0 - initial release; | |
116 * 1.1 - new loop unrolling model(*); | |
117 * - some more fine tuning; | |
118 * 1.2 - made gas friendly; | |
119 * - updates to documentation concerning v9; | |
120 * - new performance comparison matrix; | |
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp; | |
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient) | |
 *	      resulting in a slight overall performance kick;
124 * - some retunes; | |
125 * - support for GNU as added; | |
126 * | |
127 * (*) Originally unrolled loop looked like this: | |
128 * for (;;) { | |
129 * op(p+0); if (--n==0) break; | |
130 * op(p+1); if (--n==0) break; | |
131 * op(p+2); if (--n==0) break; | |
132 * op(p+3); if (--n==0) break; | |
133 * p+=4; | |
134 * } | |
 *	I unroll according to the following:
136 * while (n&~3) { | |
137 * op(p+0); op(p+1); op(p+2); op(p+3); | |
 *	    p+=4; n-=4;
139 * } | |
140 * if (n) { | |
 *	    op(p+0); if (--n==0) return;
 *	    op(p+1); if (--n==0) return;
 *	    op(p+2); return;
144 * } | |
145 */ | |
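
/*
 * For the curious, here is the second scheme above as compilable C,
 * with a trivial op() standing in for the real per-word operation.
 * It's an illustration of the control flow only (the op and walk
 * names are made up for the example) and no part of the build:
 *
 *	static void op(unsigned int *p) { *p += 1; }
 *
 *	void walk(unsigned int *p, int n)
 *	{
 *		while (n & ~3) {
 *			op(p+0); op(p+1); op(p+2); op(p+3);
 *			p += 4; n -= 4;
 *		}
 *		if (n) {
 *			op(p+0); if (--n == 0) return;
 *			op(p+1); if (--n == 0) return;
 *			op(p+2); return;
 *		}
 *	}
 */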
146 | |
147 #if defined(__SUNPRO_C) && defined(__sparcv9) | |
/* They've said -xarch=v9 at the command line */
149 .register %g2,#scratch | |
150 .register %g3,#scratch | |
151 # define FRAME_SIZE -192 | |
152 #elif defined(__GNUC__) && defined(__arch64__) | |
/* They've said -m64 at the command line */
154 .register %g2,#scratch | |
155 .register %g3,#scratch | |
156 # define FRAME_SIZE -192 | |
157 #else | |
158 # define FRAME_SIZE -96 | |
159 #endif | |
160 /* | |
161 * GNU assembler can't stand stuw:-( | |
162 */ | |
163 #define stuw st | |
164 | |
165 .section ".text",#alloc,#execinstr | |
166 .file "bn_asm.sparc.v8plus.S" | |
167 | |
168 .align 32 | |
169 | |
170 .global bn_mul_add_words | |
171 /* | |
172 * BN_ULONG bn_mul_add_words(rp,ap,num,w) | |
173 * BN_ULONG *rp,*ap; | |
174 * int num; | |
175 * BN_ULONG w; | |
176 */ | |
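/*
 * A C sketch of what this routine computes, no part of the build:
 * with 32-bit BN_ULONG words, as in this v8plus model, it performs
 * rp[i] += ap[i]*w over num words and returns the final carry word
 * (the _ref name is made up for the example):
 *
 *	unsigned int bn_mul_add_words_ref(unsigned int *rp,
 *	                                  const unsigned int *ap,
 *	                                  int num, unsigned int w)
 *	{
 *		unsigned long long acc = 0;
 *
 *		while (num-- > 0) {
 *			acc += (unsigned long long)*ap++ * w + *rp;
 *			*rp++ = (unsigned int)acc;	// stuw: low word
 *			acc >>= 32;			// srlx: keep carry
 *		}
 *		return (unsigned int)acc;
 *	}
 *
 * The mulx/srlx pairs below are that 64-bit arithmetic; the unrolled
 * loop merely processes four such steps per iteration.
 */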
177 bn_mul_add_words: | |
178 sra %o2,%g0,%o2 ! signx %o2 | |
179 brgz,a %o2,.L_bn_mul_add_words_proceed | |
180 lduw [%o1],%g2 | |
181 retl | |
182 clr %o0 | |
183 nop | |
184 nop | |
185 nop | |
186 | |
187 .L_bn_mul_add_words_proceed: | |
188 srl %o3,%g0,%o3 ! clruw %o3 | |
189 andcc %o2,-4,%g0 | |
190 bz,pn %icc,.L_bn_mul_add_words_tail | |
191 clr %o5 | |
192 | |
193 .L_bn_mul_add_words_loop: ! wow! 32 aligned! | |
194 lduw [%o0],%g1 | |
195 lduw [%o1+4],%g3 | |
196 mulx %o3,%g2,%g2 | |
197 add %g1,%o5,%o4 | |
198 nop | |
199 add %o4,%g2,%o4 | |
200 stuw %o4,[%o0] | |
201 srlx %o4,32,%o5 | |
202 | |
203 lduw [%o0+4],%g1 | |
204 lduw [%o1+8],%g2 | |
205 mulx %o3,%g3,%g3 | |
206 add %g1,%o5,%o4 | |
207 dec 4,%o2 | |
208 add %o4,%g3,%o4 | |
209 stuw %o4,[%o0+4] | |
210 srlx %o4,32,%o5 | |
211 | |
212 lduw [%o0+8],%g1 | |
213 lduw [%o1+12],%g3 | |
214 mulx %o3,%g2,%g2 | |
215 add %g1,%o5,%o4 | |
216 inc 16,%o1 | |
217 add %o4,%g2,%o4 | |
218 stuw %o4,[%o0+8] | |
219 srlx %o4,32,%o5 | |
220 | |
221 lduw [%o0+12],%g1 | |
222 mulx %o3,%g3,%g3 | |
223 add %g1,%o5,%o4 | |
224 inc 16,%o0 | |
225 add %o4,%g3,%o4 | |
226 andcc %o2,-4,%g0 | |
227 stuw %o4,[%o0-4] | |
228 srlx %o4,32,%o5 | |
229 bnz,a,pt %icc,.L_bn_mul_add_words_loop | |
230 lduw [%o1],%g2 | |
231 | |
232 brnz,a,pn %o2,.L_bn_mul_add_words_tail | |
233 lduw [%o1],%g2 | |
234 .L_bn_mul_add_words_return: | |
235 retl | |
236 mov %o5,%o0 | |
237 | |
238 .L_bn_mul_add_words_tail: | |
239 lduw [%o0],%g1 | |
240 mulx %o3,%g2,%g2 | |
241 add %g1,%o5,%o4 | |
242 dec %o2 | |
243 add %o4,%g2,%o4 | |
244 srlx %o4,32,%o5 | |
245 brz,pt %o2,.L_bn_mul_add_words_return | |
246 stuw %o4,[%o0] | |
247 | |
248 lduw [%o1+4],%g2 | |
249 lduw [%o0+4],%g1 | |
250 mulx %o3,%g2,%g2 | |
251 add %g1,%o5,%o4 | |
252 dec %o2 | |
253 add %o4,%g2,%o4 | |
254 srlx %o4,32,%o5 | |
255 brz,pt %o2,.L_bn_mul_add_words_return | |
256 stuw %o4,[%o0+4] | |
257 | |
258 lduw [%o1+8],%g2 | |
259 lduw [%o0+8],%g1 | |
260 mulx %o3,%g2,%g2 | |
261 add %g1,%o5,%o4 | |
262 add %o4,%g2,%o4 | |
263 stuw %o4,[%o0+8] | |
264 retl | |
265 srlx %o4,32,%o0 | |
266 | |
267 .type bn_mul_add_words,#function | |
268 .size bn_mul_add_words,(.-bn_mul_add_words) | |
269 | |
270 .align 32 | |
271 | |
272 .global bn_mul_words | |
273 /* | |
274 * BN_ULONG bn_mul_words(rp,ap,num,w) | |
275 * BN_ULONG *rp,*ap; | |
276 * int num; | |
277 * BN_ULONG w; | |
278 */ | |
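/*
 * C sketch of the semantics, no part of the build: rp[i] receives
 * the low word of ap[i]*w plus the running carry, and the final
 * carry word is returned (the _ref name is made up):
 *
 *	unsigned int bn_mul_words_ref(unsigned int *rp,
 *	                              const unsigned int *ap,
 *	                              int num, unsigned int w)
 *	{
 *		unsigned long long acc = 0;
 *
 *		while (num-- > 0) {
 *			acc += (unsigned long long)*ap++ * w;
 *			*rp++ = (unsigned int)acc;
 *			acc >>= 32;
 *		}
 *		return (unsigned int)acc;
 *	}
 */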
279 bn_mul_words: | |
280 sra %o2,%g0,%o2 ! signx %o2 | |
	brgz,a	%o2,.L_bn_mul_words_proceed
282 lduw [%o1],%g2 | |
283 retl | |
284 clr %o0 | |
285 nop | |
286 nop | |
287 nop | |
288 | |
.L_bn_mul_words_proceed:
290 srl %o3,%g0,%o3 ! clruw %o3 | |
291 andcc %o2,-4,%g0 | |
292 bz,pn %icc,.L_bn_mul_words_tail | |
293 clr %o5 | |
294 | |
295 .L_bn_mul_words_loop: ! wow! 32 aligned! | |
296 lduw [%o1+4],%g3 | |
297 mulx %o3,%g2,%g2 | |
298 add %g2,%o5,%o4 | |
299 nop | |
300 stuw %o4,[%o0] | |
301 srlx %o4,32,%o5 | |
302 | |
303 lduw [%o1+8],%g2 | |
304 mulx %o3,%g3,%g3 | |
305 add %g3,%o5,%o4 | |
306 dec 4,%o2 | |
307 stuw %o4,[%o0+4] | |
308 srlx %o4,32,%o5 | |
309 | |
310 lduw [%o1+12],%g3 | |
311 mulx %o3,%g2,%g2 | |
312 add %g2,%o5,%o4 | |
313 inc 16,%o1 | |
314 stuw %o4,[%o0+8] | |
315 srlx %o4,32,%o5 | |
316 | |
317 mulx %o3,%g3,%g3 | |
318 add %g3,%o5,%o4 | |
319 inc 16,%o0 | |
320 stuw %o4,[%o0-4] | |
321 srlx %o4,32,%o5 | |
322 andcc %o2,-4,%g0 | |
323 bnz,a,pt %icc,.L_bn_mul_words_loop | |
324 lduw [%o1],%g2 | |
325 nop | |
326 nop | |
327 | |
328 brnz,a,pn %o2,.L_bn_mul_words_tail | |
329 lduw [%o1],%g2 | |
330 .L_bn_mul_words_return: | |
331 retl | |
332 mov %o5,%o0 | |
333 | |
334 .L_bn_mul_words_tail: | |
335 mulx %o3,%g2,%g2 | |
336 add %g2,%o5,%o4 | |
337 dec %o2 | |
338 srlx %o4,32,%o5 | |
339 brz,pt %o2,.L_bn_mul_words_return | |
340 stuw %o4,[%o0] | |
341 | |
342 lduw [%o1+4],%g2 | |
343 mulx %o3,%g2,%g2 | |
344 add %g2,%o5,%o4 | |
345 dec %o2 | |
346 srlx %o4,32,%o5 | |
347 brz,pt %o2,.L_bn_mul_words_return | |
348 stuw %o4,[%o0+4] | |
349 | |
350 lduw [%o1+8],%g2 | |
351 mulx %o3,%g2,%g2 | |
352 add %g2,%o5,%o4 | |
353 stuw %o4,[%o0+8] | |
354 retl | |
355 srlx %o4,32,%o0 | |
356 | |
357 .type bn_mul_words,#function | |
358 .size bn_mul_words,(.-bn_mul_words) | |
359 | |
360 .align 32 | |
361 .global bn_sqr_words | |
362 /* | |
363 * void bn_sqr_words(r,a,n) | |
364 * BN_ULONG *r,*a; | |
365 * int n; | |
366 */ | |
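/*
 * C sketch of the semantics, no part of the build: each input word
 * is squared into a double-word of the result, low word first:
 *
 *	void bn_sqr_words_ref(unsigned int *r, const unsigned int *a,
 *	                      int n)
 *	{
 *		while (n-- > 0) {
 *			unsigned long long t = (unsigned long long)*a * *a;
 *			a++;
 *			*r++ = (unsigned int)t;		// low word
 *			*r++ = (unsigned int)(t >> 32);	// high word
 *		}
 *	}
 */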
367 bn_sqr_words: | |
368 sra %o2,%g0,%o2 ! signx %o2 | |
	brgz,a	%o2,.L_bn_sqr_words_proceed
370 lduw [%o1],%g2 | |
371 retl | |
372 clr %o0 | |
373 nop | |
374 nop | |
375 nop | |
376 | |
.L_bn_sqr_words_proceed:
378 andcc %o2,-4,%g0 | |
379 nop | |
380 bz,pn %icc,.L_bn_sqr_words_tail | |
381 nop | |
382 | |
383 .L_bn_sqr_words_loop: ! wow! 32 aligned! | |
384 lduw [%o1+4],%g3 | |
385 mulx %g2,%g2,%o4 | |
386 stuw %o4,[%o0] | |
387 srlx %o4,32,%o5 | |
388 stuw %o5,[%o0+4] | |
389 nop | |
390 | |
391 lduw [%o1+8],%g2 | |
392 mulx %g3,%g3,%o4 | |
393 dec 4,%o2 | |
394 stuw %o4,[%o0+8] | |
395 srlx %o4,32,%o5 | |
396 stuw %o5,[%o0+12] | |
397 | |
398 lduw [%o1+12],%g3 | |
399 mulx %g2,%g2,%o4 | |
400 srlx %o4,32,%o5 | |
401 stuw %o4,[%o0+16] | |
402 inc 16,%o1 | |
403 stuw %o5,[%o0+20] | |
404 | |
405 mulx %g3,%g3,%o4 | |
406 inc 32,%o0 | |
407 stuw %o4,[%o0-8] | |
408 srlx %o4,32,%o5 | |
409 andcc %o2,-4,%g2 | |
410 stuw %o5,[%o0-4] | |
411 bnz,a,pt %icc,.L_bn_sqr_words_loop | |
412 lduw [%o1],%g2 | |
413 nop | |
414 | |
415 brnz,a,pn %o2,.L_bn_sqr_words_tail | |
416 lduw [%o1],%g2 | |
417 .L_bn_sqr_words_return: | |
418 retl | |
419 clr %o0 | |
420 | |
421 .L_bn_sqr_words_tail: | |
422 mulx %g2,%g2,%o4 | |
423 dec %o2 | |
424 stuw %o4,[%o0] | |
425 srlx %o4,32,%o5 | |
426 brz,pt %o2,.L_bn_sqr_words_return | |
427 stuw %o5,[%o0+4] | |
428 | |
429 lduw [%o1+4],%g2 | |
430 mulx %g2,%g2,%o4 | |
431 dec %o2 | |
432 stuw %o4,[%o0+8] | |
433 srlx %o4,32,%o5 | |
434 brz,pt %o2,.L_bn_sqr_words_return | |
435 stuw %o5,[%o0+12] | |
436 | |
437 lduw [%o1+8],%g2 | |
438 mulx %g2,%g2,%o4 | |
439 srlx %o4,32,%o5 | |
440 stuw %o4,[%o0+16] | |
441 stuw %o5,[%o0+20] | |
442 retl | |
443 clr %o0 | |
444 | |
445 .type bn_sqr_words,#function | |
446 .size bn_sqr_words,(.-bn_sqr_words) | |
447 | |
448 .align 32 | |
449 .global bn_div_words | |
450 /* | |
451 * BN_ULONG bn_div_words(h,l,d) | |
452 * BN_ULONG h,l,d; | |
453 */ | |
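/*
 * C sketch of the semantics, no part of the build; it mirrors the
 * four instructions below one to one, assuming the quotient fits in
 * 32 bits as the caller is expected to guarantee:
 *
 *	unsigned int bn_div_words_ref(unsigned int h, unsigned int l,
 *	                              unsigned int d)
 *	{
 *		unsigned long long hl =
 *		    ((unsigned long long)h << 32) | l;	// sllx + or
 *		return (unsigned int)(hl / d);		// udivx + srl
 *	}
 */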
454 bn_div_words: | |
455 sllx %o0,32,%o0 | |
456 or %o0,%o1,%o0 | |
457 udivx %o0,%o2,%o0 | |
458 retl | |
459 srl %o0,%g0,%o0 ! clruw %o0 | |
460 | |
461 .type bn_div_words,#function | |
462 .size bn_div_words,(.-bn_div_words) | |
463 | |
464 .align 32 | |
465 | |
466 .global bn_add_words | |
467 /* | |
468 * BN_ULONG bn_add_words(rp,ap,bp,n) | |
469 * BN_ULONG *rp,*ap,*bp; | |
470 * int n; | |
471 */ | |
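/*
 * C sketch of the semantics, no part of the build: word-wise
 * addition with carry propagation, returning the final carry bit
 * (the _ref name is made up):
 *
 *	unsigned int bn_add_words_ref(unsigned int *rp,
 *	                              const unsigned int *ap,
 *	                              const unsigned int *bp, int n)
 *	{
 *		unsigned long long t = 0;
 *
 *		while (n-- > 0) {
 *			t = (unsigned long long)*ap++ + *bp++ + (t >> 32);
 *			*rp++ = (unsigned int)t;
 *		}
 *		return (unsigned int)(t >> 32);
 *	}
 *
 * The assembly keeps the carry in %icc instead, and the loop body is
 * arranged so that no instruction between the addccc steps touches
 * the flags (note the flag-preserving 'and' before the loop branch).
 */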
472 bn_add_words: | |
473 sra %o3,%g0,%o3 ! signx %o3 | |
474 brgz,a %o3,.L_bn_add_words_proceed | |
475 lduw [%o1],%o4 | |
476 retl | |
477 clr %o0 | |
478 | |
479 .L_bn_add_words_proceed: | |
480 andcc %o3,-4,%g0 | |
481 bz,pn %icc,.L_bn_add_words_tail | |
482 addcc %g0,0,%g0 ! clear carry flag | |
483 | |
484 .L_bn_add_words_loop: ! wow! 32 aligned! | |
485 dec 4,%o3 | |
486 lduw [%o2],%o5 | |
487 lduw [%o1+4],%g1 | |
488 lduw [%o2+4],%g2 | |
489 lduw [%o1+8],%g3 | |
490 lduw [%o2+8],%g4 | |
491 addccc %o5,%o4,%o5 | |
492 stuw %o5,[%o0] | |
493 | |
494 lduw [%o1+12],%o4 | |
495 lduw [%o2+12],%o5 | |
496 inc 16,%o1 | |
497 addccc %g1,%g2,%g1 | |
498 stuw %g1,[%o0+4] | |
499 | |
500 inc 16,%o2 | |
501 addccc %g3,%g4,%g3 | |
502 stuw %g3,[%o0+8] | |
503 | |
504 inc 16,%o0 | |
505 addccc %o5,%o4,%o5 | |
506 stuw %o5,[%o0-4] | |
507 and %o3,-4,%g1 | |
508 brnz,a,pt %g1,.L_bn_add_words_loop | |
509 lduw [%o1],%o4 | |
510 | |
511 brnz,a,pn %o3,.L_bn_add_words_tail | |
512 lduw [%o1],%o4 | |
513 .L_bn_add_words_return: | |
514 clr %o0 | |
515 retl | |
516 movcs %icc,1,%o0 | |
517 nop | |
518 | |
519 .L_bn_add_words_tail: | |
520 lduw [%o2],%o5 | |
521 dec %o3 | |
522 addccc %o5,%o4,%o5 | |
523 brz,pt %o3,.L_bn_add_words_return | |
524 stuw %o5,[%o0] | |
525 | |
526 lduw [%o1+4],%o4 | |
527 lduw [%o2+4],%o5 | |
528 dec %o3 | |
529 addccc %o5,%o4,%o5 | |
530 brz,pt %o3,.L_bn_add_words_return | |
531 stuw %o5,[%o0+4] | |
532 | |
533 lduw [%o1+8],%o4 | |
534 lduw [%o2+8],%o5 | |
535 addccc %o5,%o4,%o5 | |
536 stuw %o5,[%o0+8] | |
537 clr %o0 | |
538 retl | |
539 movcs %icc,1,%o0 | |
540 | |
541 .type bn_add_words,#function | |
542 .size bn_add_words,(.-bn_add_words) | |
543 | |
544 .global bn_sub_words | |
545 /* | |
546 * BN_ULONG bn_sub_words(rp,ap,bp,n) | |
547 * BN_ULONG *rp,*ap,*bp; | |
548 * int n; | |
549 */ | |
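/*
 * C sketch of the semantics, no part of the build: word-wise
 * subtraction with borrow propagation, returning the final borrow
 * (the _ref name is made up):
 *
 *	unsigned int bn_sub_words_ref(unsigned int *rp,
 *	                              const unsigned int *ap,
 *	                              const unsigned int *bp, int n)
 *	{
 *		unsigned int borrow = 0;
 *
 *		while (n-- > 0) {
 *			unsigned long long t =
 *			    (unsigned long long)*ap++ - *bp++ - borrow;
 *			*rp++ = (unsigned int)t;
 *			borrow = (unsigned int)(t >> 63);  // 1 on underflow
 *		}
 *		return borrow;
 *	}
 */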
550 bn_sub_words: | |
551 sra %o3,%g0,%o3 ! signx %o3 | |
552 brgz,a %o3,.L_bn_sub_words_proceed | |
553 lduw [%o1],%o4 | |
554 retl | |
555 clr %o0 | |
556 | |
557 .L_bn_sub_words_proceed: | |
558 andcc %o3,-4,%g0 | |
559 bz,pn %icc,.L_bn_sub_words_tail | |
560 addcc %g0,0,%g0 ! clear carry flag | |
561 | |
562 .L_bn_sub_words_loop: ! wow! 32 aligned! | |
563 dec 4,%o3 | |
564 lduw [%o2],%o5 | |
565 lduw [%o1+4],%g1 | |
566 lduw [%o2+4],%g2 | |
567 lduw [%o1+8],%g3 | |
568 lduw [%o2+8],%g4 | |
569 subccc %o4,%o5,%o5 | |
570 stuw %o5,[%o0] | |
571 | |
572 lduw [%o1+12],%o4 | |
573 lduw [%o2+12],%o5 | |
574 inc 16,%o1 | |
575 subccc %g1,%g2,%g2 | |
576 stuw %g2,[%o0+4] | |
577 | |
578 inc 16,%o2 | |
579 subccc %g3,%g4,%g4 | |
580 stuw %g4,[%o0+8] | |
581 | |
582 inc 16,%o0 | |
583 subccc %o4,%o5,%o5 | |
584 stuw %o5,[%o0-4] | |
585 and %o3,-4,%g1 | |
586 brnz,a,pt %g1,.L_bn_sub_words_loop | |
587 lduw [%o1],%o4 | |
588 | |
589 brnz,a,pn %o3,.L_bn_sub_words_tail | |
590 lduw [%o1],%o4 | |
591 .L_bn_sub_words_return: | |
592 clr %o0 | |
593 retl | |
594 movcs %icc,1,%o0 | |
595 nop | |
596 | |
597 .L_bn_sub_words_tail: ! wow! 32 aligned! | |
598 lduw [%o2],%o5 | |
599 dec %o3 | |
600 subccc %o4,%o5,%o5 | |
601 brz,pt %o3,.L_bn_sub_words_return | |
602 stuw %o5,[%o0] | |
603 | |
604 lduw [%o1+4],%o4 | |
605 lduw [%o2+4],%o5 | |
606 dec %o3 | |
607 subccc %o4,%o5,%o5 | |
608 brz,pt %o3,.L_bn_sub_words_return | |
609 stuw %o5,[%o0+4] | |
610 | |
611 lduw [%o1+8],%o4 | |
612 lduw [%o2+8],%o5 | |
613 subccc %o4,%o5,%o5 | |
614 stuw %o5,[%o0+8] | |
615 clr %o0 | |
616 retl | |
617 movcs %icc,1,%o0 | |
618 | |
619 .type bn_sub_words,#function | |
620 .size bn_sub_words,(.-bn_sub_words) | |
621 | |
622 /* | |
 * The code below depends on the fact that the upper parts of the
 * %l0-%l7 and %i0-%i7 registers are zeroed by the kernel after a
 * context switch. In previous versions this comment stated that
 * "the trouble is that it's not feasible to implement the
 * mumbo-jumbo in less V9 instructions:-(" which apparently isn't
 * true thanks to the 'bcs,a %xcc,.+8; inc %rd' pair. But the
 * performance improvement results not from the shorter code, but
 * from the elimination of multicycle, non-pairable 'rd %y,%rd'
 * instructions.
631 * | |
632 * Andy. | |
633 */ | |
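
/*
 * For reference, a C model of the mul_add_c primitive that the
 * annotations below refer to; c3:c2:c1 is a 96-bit accumulator of
 * 32-bit words, and each step folds in one 32x32->64-bit product
 * (the _ref name is made up, no part of the build):
 *
 *	void mul_add_c_ref(unsigned int a, unsigned int b,
 *	                   unsigned int *c1, unsigned int *c2,
 *	                   unsigned int *c3)
 *	{
 *		unsigned long long prod = (unsigned long long)a * b;
 *		unsigned long long acc  =
 *		    ((unsigned long long)*c2 << 32) | *c1;
 *		unsigned long long sum  = acc + prod;
 *
 *		*c3 += (sum < prod);	// carry out of the 64-bit add
 *		*c1  = (unsigned int)sum;
 *		*c2  = (unsigned int)(sum >> 32);
 *	}
 *
 * In the assembly, c_12 holds c2:c1 as one 64-bit register, and the
 * 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair adds 1<<32 to c_3 on carry,
 * so that the later 'or c_12,c_3,c_12' folds the carries back in.
 */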
634 | |
635 /* | |
 * Here is the register usage map for *all* the routines below.
637 */ | |
638 #define t_1 %o0 | |
639 #define t_2 %o1 | |
640 #define c_12 %o2 | |
641 #define c_3 %o3 | |
642 | |
643 #define ap(I) [%i1+4*I] | |
644 #define bp(I) [%i2+4*I] | |
645 #define rp(I) [%i0+4*I] | |
646 | |
647 #define a_0 %l0 | |
648 #define a_1 %l1 | |
649 #define a_2 %l2 | |
650 #define a_3 %l3 | |
651 #define a_4 %l4 | |
652 #define a_5 %l5 | |
653 #define a_6 %l6 | |
654 #define a_7 %l7 | |
655 | |
656 #define b_0 %i3 | |
657 #define b_1 %i4 | |
658 #define b_2 %i5 | |
659 #define b_3 %o4 | |
660 #define b_4 %o5 | |
661 #define b_5 %o7 | |
662 #define b_6 %g1 | |
663 #define b_7 %g4 | |
664 | |
665 .align 32 | |
666 .global bn_mul_comba8 | |
667 /* | |
668 * void bn_mul_comba8(r,a,b) | |
669 * BN_ULONG *r,*a,*b; | |
670 */ | |
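/*
 * A C model of the column-wise (comba) walk this routine unrolls,
 * no part of the build (the _ref name is made up): the inner bounds
 * pick out the a[i]*b[k-i] products that feed column k, and carries
 * collect in the upper half of the 64-bit accumulator just as c_3
 * does in the unrolled code:
 *
 *	void bn_mul_comba8_ref(unsigned int *r, const unsigned int *a,
 *	                       const unsigned int *b)
 *	{
 *		unsigned long long acc = 0;
 *		unsigned int c3 = 0;
 *		int i, k;
 *
 *		for (k = 0; k < 15; k++) {
 *			int lo = k > 7 ? k - 7 : 0;
 *			int hi = k < 7 ? k : 7;
 *			for (i = lo; i <= hi; i++) {
 *				unsigned long long p =
 *				    (unsigned long long)a[i] * b[k-i];
 *				acc += p;
 *				c3  += (acc < p);	// 64-bit carry
 *			}
 *			r[k] = (unsigned int)acc;
 *			acc  = (acc >> 32) |
 *			       ((unsigned long long)c3 << 32);
 *			c3   = 0;
 *		}
 *		r[15] = (unsigned int)acc;
 *	}
 */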
671 bn_mul_comba8: | |
672 save %sp,FRAME_SIZE,%sp | |
673 mov 1,t_2 | |
674 lduw ap(0),a_0 | |
675 sllx t_2,32,t_2 | |
676 lduw bp(0),b_0 != | |
677 lduw bp(1),b_1 | |
678 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | |
679 srlx t_1,32,c_12 | |
680 stuw t_1,rp(0) !=!r[0]=c1; | |
681 | |
682 lduw ap(1),a_1 | |
683 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | |
684 addcc c_12,t_1,c_12 | |
685 clr c_3 != | |
686 bcs,a %xcc,.+8 | |
687 add c_3,t_2,c_3 | |
688 lduw ap(2),a_2 | |
689 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
690 addcc c_12,t_1,t_1 | |
691 bcs,a %xcc,.+8 | |
692 add c_3,t_2,c_3 | |
693 srlx t_1,32,c_12 != | |
694 stuw t_1,rp(1) !r[1]=c2; | |
695 or c_12,c_3,c_12 | |
696 | |
697 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
698 addcc c_12,t_1,c_12 != | |
699 clr c_3 | |
700 bcs,a %xcc,.+8 | |
701 add c_3,t_2,c_3 | |
702 lduw bp(2),b_2 != | |
703 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
704 addcc c_12,t_1,c_12 | |
705 bcs,a %xcc,.+8 | |
706 add c_3,t_2,c_3 != | |
707 lduw bp(3),b_3 | |
708 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
709 addcc c_12,t_1,t_1 | |
710 bcs,a %xcc,.+8 != | |
711 add c_3,t_2,c_3 | |
712 srlx t_1,32,c_12 | |
713 stuw t_1,rp(2) !r[2]=c3; | |
714 or c_12,c_3,c_12 != | |
715 | |
716 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
717 addcc c_12,t_1,c_12 | |
718 clr c_3 | |
719 bcs,a %xcc,.+8 != | |
720 add c_3,t_2,c_3 | |
721 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); | |
722 addcc c_12,t_1,c_12 | |
723 bcs,a %xcc,.+8 != | |
724 add c_3,t_2,c_3 | |
725 lduw ap(3),a_3 | |
726 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
727 addcc c_12,t_1,c_12 != | |
728 bcs,a %xcc,.+8 | |
729 add c_3,t_2,c_3 | |
730 lduw ap(4),a_4 | |
731 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!= | |
732 addcc c_12,t_1,t_1 | |
733 bcs,a %xcc,.+8 | |
734 add c_3,t_2,c_3 | |
735 srlx t_1,32,c_12 != | |
736 stuw t_1,rp(3) !r[3]=c1; | |
737 or c_12,c_3,c_12 | |
738 | |
739 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); | |
740 addcc c_12,t_1,c_12 != | |
741 clr c_3 | |
742 bcs,a %xcc,.+8 | |
743 add c_3,t_2,c_3 | |
744 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1); | |
745 addcc c_12,t_1,c_12 | |
746 bcs,a %xcc,.+8 | |
747 add c_3,t_2,c_3 | |
748 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); | |
749 addcc c_12,t_1,c_12 | |
750 bcs,a %xcc,.+8 | |
751 add c_3,t_2,c_3 | |
752 lduw bp(4),b_4 != | |
753 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
754 addcc c_12,t_1,c_12 | |
755 bcs,a %xcc,.+8 | |
756 add c_3,t_2,c_3 != | |
757 lduw bp(5),b_5 | |
758 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1); | |
759 addcc c_12,t_1,t_1 | |
760 bcs,a %xcc,.+8 != | |
761 add c_3,t_2,c_3 | |
762 srlx t_1,32,c_12 | |
763 stuw t_1,rp(4) !r[4]=c2; | |
764 or c_12,c_3,c_12 != | |
765 | |
766 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); | |
767 addcc c_12,t_1,c_12 | |
768 clr c_3 | |
769 bcs,a %xcc,.+8 != | |
770 add c_3,t_2,c_3 | |
771 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); | |
772 addcc c_12,t_1,c_12 | |
773 bcs,a %xcc,.+8 != | |
774 add c_3,t_2,c_3 | |
775 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
776 addcc c_12,t_1,c_12 | |
777 bcs,a %xcc,.+8 != | |
778 add c_3,t_2,c_3 | |
779 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
780 addcc c_12,t_1,c_12 | |
781 bcs,a %xcc,.+8 != | |
782 add c_3,t_2,c_3 | |
783 lduw ap(5),a_5 | |
784 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); | |
785 addcc c_12,t_1,c_12 != | |
786 bcs,a %xcc,.+8 | |
787 add c_3,t_2,c_3 | |
788 lduw ap(6),a_6 | |
789 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2); | |
790 addcc c_12,t_1,t_1 | |
791 bcs,a %xcc,.+8 | |
792 add c_3,t_2,c_3 | |
793 srlx t_1,32,c_12 != | |
794 stuw t_1,rp(5) !r[5]=c3; | |
795 or c_12,c_3,c_12 | |
796 | |
797 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); | |
798 addcc c_12,t_1,c_12 != | |
799 clr c_3 | |
800 bcs,a %xcc,.+8 | |
801 add c_3,t_2,c_3 | |
802 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); | |
803 addcc c_12,t_1,c_12 | |
804 bcs,a %xcc,.+8 | |
805 add c_3,t_2,c_3 | |
806 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3); | |
807 addcc c_12,t_1,c_12 | |
808 bcs,a %xcc,.+8 | |
809 add c_3,t_2,c_3 | |
810 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3); | |
811 addcc c_12,t_1,c_12 | |
812 bcs,a %xcc,.+8 | |
813 add c_3,t_2,c_3 | |
814 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3); | |
815 addcc c_12,t_1,c_12 | |
816 bcs,a %xcc,.+8 | |
817 add c_3,t_2,c_3 | |
818 lduw bp(6),b_6 != | |
819 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); | |
820 addcc c_12,t_1,c_12 | |
821 bcs,a %xcc,.+8 | |
822 add c_3,t_2,c_3 != | |
823 lduw bp(7),b_7 | |
824 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); | |
825 addcc c_12,t_1,t_1 | |
826 bcs,a %xcc,.+8 != | |
827 add c_3,t_2,c_3 | |
828 srlx t_1,32,c_12 | |
829 stuw t_1,rp(6) !r[6]=c1; | |
830 or c_12,c_3,c_12 != | |
831 | |
832 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); | |
833 addcc c_12,t_1,c_12 | |
834 clr c_3 | |
835 bcs,a %xcc,.+8 != | |
836 add c_3,t_2,c_3 | |
837 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); | |
838 addcc c_12,t_1,c_12 | |
839 bcs,a %xcc,.+8 != | |
840 add c_3,t_2,c_3 | |
841 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); | |
842 addcc c_12,t_1,c_12 | |
843 bcs,a %xcc,.+8 != | |
844 add c_3,t_2,c_3 | |
845 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1); | |
846 addcc c_12,t_1,c_12 | |
847 bcs,a %xcc,.+8 != | |
848 add c_3,t_2,c_3 | |
849 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); | |
850 addcc c_12,t_1,c_12 | |
851 bcs,a %xcc,.+8 != | |
852 add c_3,t_2,c_3 | |
853 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); | |
854 addcc c_12,t_1,c_12 | |
855 bcs,a %xcc,.+8 != | |
856 add c_3,t_2,c_3 | |
857 lduw ap(7),a_7 | |
858 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); | |
859 addcc c_12,t_1,c_12 | |
860 bcs,a %xcc,.+8 | |
861 add c_3,t_2,c_3 | |
862 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1); | |
863 addcc c_12,t_1,t_1 | |
864 bcs,a %xcc,.+8 | |
865 add c_3,t_2,c_3 | |
866 srlx t_1,32,c_12 != | |
867 stuw t_1,rp(7) !r[7]=c2; | |
868 or c_12,c_3,c_12 | |
869 | |
870 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2); | |
871 addcc c_12,t_1,c_12 | |
872 clr c_3 | |
873 bcs,a %xcc,.+8 | |
874 add c_3,t_2,c_3 != | |
875 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2); | |
876 addcc c_12,t_1,c_12 | |
877 bcs,a %xcc,.+8 | |
878 add c_3,t_2,c_3 != | |
879 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); | |
880 addcc c_12,t_1,c_12 | |
881 bcs,a %xcc,.+8 | |
882 add c_3,t_2,c_3 != | |
883 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); | |
884 addcc c_12,t_1,c_12 | |
885 bcs,a %xcc,.+8 | |
886 add c_3,t_2,c_3 != | |
887 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); | |
888 addcc c_12,t_1,c_12 | |
889 bcs,a %xcc,.+8 | |
890 add c_3,t_2,c_3 != | |
891 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2); | |
892 addcc c_12,t_1,c_12 | |
893 bcs,a %xcc,.+8 | |
894 add c_3,t_2,c_3 != | |
895 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); | |
896 addcc c_12,t_1,t_1 | |
897 bcs,a %xcc,.+8 | |
898 add c_3,t_2,c_3 != | |
899 srlx t_1,32,c_12 | |
900 stuw t_1,rp(8) !r[8]=c3; | |
901 or c_12,c_3,c_12 | |
902 | |
903 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3); | |
904 addcc c_12,t_1,c_12 | |
905 clr c_3 | |
906 bcs,a %xcc,.+8 | |
907 add c_3,t_2,c_3 != | |
908 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3); | |
909 addcc c_12,t_1,c_12 | |
910 bcs,a %xcc,.+8 != | |
911 add c_3,t_2,c_3 | |
912 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); | |
913 addcc c_12,t_1,c_12 | |
914 bcs,a %xcc,.+8 != | |
915 add c_3,t_2,c_3 | |
916 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); | |
917 addcc c_12,t_1,c_12 | |
918 bcs,a %xcc,.+8 != | |
919 add c_3,t_2,c_3 | |
920 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); | |
921 addcc c_12,t_1,c_12 | |
922 bcs,a %xcc,.+8 != | |
923 add c_3,t_2,c_3 | |
924 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3); | |
925 addcc c_12,t_1,t_1 | |
926 bcs,a %xcc,.+8 != | |
927 add c_3,t_2,c_3 | |
928 srlx t_1,32,c_12 | |
929 stuw t_1,rp(9) !r[9]=c1; | |
930 or c_12,c_3,c_12 != | |
931 | |
932 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); | |
933 addcc c_12,t_1,c_12 | |
934 clr c_3 | |
935 bcs,a %xcc,.+8 != | |
936 add c_3,t_2,c_3 | |
937 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); | |
938 addcc c_12,t_1,c_12 | |
939 bcs,a %xcc,.+8 != | |
940 add c_3,t_2,c_3 | |
941 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1); | |
942 addcc c_12,t_1,c_12 | |
943 bcs,a %xcc,.+8 != | |
944 add c_3,t_2,c_3 | |
945 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); | |
946 addcc c_12,t_1,c_12 | |
947 bcs,a %xcc,.+8 != | |
948 add c_3,t_2,c_3 | |
949 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); | |
950 addcc c_12,t_1,t_1 | |
951 bcs,a %xcc,.+8 != | |
952 add c_3,t_2,c_3 | |
953 srlx t_1,32,c_12 | |
954 stuw t_1,rp(10) !r[10]=c2; | |
955 or c_12,c_3,c_12 != | |
956 | |
957 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2); | |
958 addcc c_12,t_1,c_12 | |
959 clr c_3 | |
960 bcs,a %xcc,.+8 != | |
961 add c_3,t_2,c_3 | |
962 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); | |
963 addcc c_12,t_1,c_12 | |
964 bcs,a %xcc,.+8 != | |
965 add c_3,t_2,c_3 | |
966 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); | |
967 addcc c_12,t_1,c_12 | |
968 bcs,a %xcc,.+8 != | |
969 add c_3,t_2,c_3 | |
970 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); | |
971 addcc c_12,t_1,t_1 | |
972 bcs,a %xcc,.+8 != | |
973 add c_3,t_2,c_3 | |
974 srlx t_1,32,c_12 | |
975 stuw t_1,rp(11) !r[11]=c3; | |
976 or c_12,c_3,c_12 != | |
977 | |
978 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); | |
979 addcc c_12,t_1,c_12 | |
980 clr c_3 | |
981 bcs,a %xcc,.+8 != | |
982 add c_3,t_2,c_3 | |
983 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); | |
984 addcc c_12,t_1,c_12 | |
985 bcs,a %xcc,.+8 != | |
986 add c_3,t_2,c_3 | |
987 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); | |
988 addcc c_12,t_1,t_1 | |
989 bcs,a %xcc,.+8 != | |
990 add c_3,t_2,c_3 | |
991 srlx t_1,32,c_12 | |
992 stuw t_1,rp(12) !r[12]=c1; | |
993 or c_12,c_3,c_12 != | |
994 | |
995 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); | |
996 addcc c_12,t_1,c_12 | |
997 clr c_3 | |
998 bcs,a %xcc,.+8 != | |
999 add c_3,t_2,c_3 | |
1000 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); | |
1001 addcc c_12,t_1,t_1 | |
1002 bcs,a %xcc,.+8 != | |
1003 add c_3,t_2,c_3 | |
1004 srlx t_1,32,c_12 | |
	stuw	t_1,rp(13)	!r[13]=c2;
1006 or c_12,c_3,c_12 != | |
1007 | |
1008 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2); | |
1009 addcc c_12,t_1,t_1 | |
1010 srlx t_1,32,c_12 != | |
1011 stuw t_1,rp(14) !r[14]=c3; | |
1012 stuw c_12,rp(15) !r[15]=c1; | |
1013 | |
1014 ret | |
1015 restore %g0,%g0,%o0 != | |
1016 | |
1017 .type bn_mul_comba8,#function | |
1018 .size bn_mul_comba8,(.-bn_mul_comba8) | |
1019 | |
1020 .align 32 | |
1021 | |
1022 .global bn_mul_comba4 | |
1023 /* | |
1024 * void bn_mul_comba4(r,a,b) | |
1025 * BN_ULONG *r,*a,*b; | |
1026 */ | |
1027 bn_mul_comba4: | |
1028 save %sp,FRAME_SIZE,%sp | |
1029 lduw ap(0),a_0 | |
1030 mov 1,t_2 | |
1031 lduw bp(0),b_0 | |
1032 sllx t_2,32,t_2 != | |
1033 lduw bp(1),b_1 | |
1034 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | |
1035 srlx t_1,32,c_12 | |
1036 stuw t_1,rp(0) !=!r[0]=c1; | |
1037 | |
1038 lduw ap(1),a_1 | |
1039 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | |
1040 addcc c_12,t_1,c_12 | |
1041 clr c_3 != | |
1042 bcs,a %xcc,.+8 | |
1043 add c_3,t_2,c_3 | |
1044 lduw ap(2),a_2 | |
1045 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
1046 addcc c_12,t_1,t_1 | |
1047 bcs,a %xcc,.+8 | |
1048 add c_3,t_2,c_3 | |
1049 srlx t_1,32,c_12 != | |
1050 stuw t_1,rp(1) !r[1]=c2; | |
1051 or c_12,c_3,c_12 | |
1052 | |
1053 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
1054 addcc c_12,t_1,c_12 != | |
1055 clr c_3 | |
1056 bcs,a %xcc,.+8 | |
1057 add c_3,t_2,c_3 | |
1058 lduw bp(2),b_2 != | |
1059 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
1060 addcc c_12,t_1,c_12 | |
1061 bcs,a %xcc,.+8 | |
1062 add c_3,t_2,c_3 != | |
1063 lduw bp(3),b_3 | |
1064 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
1065 addcc c_12,t_1,t_1 | |
1066 bcs,a %xcc,.+8 != | |
1067 add c_3,t_2,c_3 | |
1068 srlx t_1,32,c_12 | |
1069 stuw t_1,rp(2) !r[2]=c3; | |
1070 or c_12,c_3,c_12 != | |
1071 | |
1072 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
1073 addcc c_12,t_1,c_12 | |
1074 clr c_3 | |
1075 bcs,a %xcc,.+8 != | |
1076 add c_3,t_2,c_3 | |
1077 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); | |
1078 addcc c_12,t_1,c_12 | |
1079 bcs,a %xcc,.+8 != | |
1080 add c_3,t_2,c_3 | |
1081 lduw ap(3),a_3 | |
1082 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
1083 addcc c_12,t_1,c_12 != | |
1084 bcs,a %xcc,.+8 | |
1085 add c_3,t_2,c_3 | |
1086 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= | |
1087 addcc c_12,t_1,t_1 != | |
1088 bcs,a %xcc,.+8 | |
1089 add c_3,t_2,c_3 | |
1090 srlx t_1,32,c_12 | |
1091 stuw t_1,rp(3) !=!r[3]=c1; | |
1092 or c_12,c_3,c_12 | |
1093 | |
1094 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | |
1095 addcc c_12,t_1,c_12 | |
1096 clr c_3 != | |
1097 bcs,a %xcc,.+8 | |
1098 add c_3,t_2,c_3 | |
1099 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); | |
1100 addcc c_12,t_1,c_12 != | |
1101 bcs,a %xcc,.+8 | |
1102 add c_3,t_2,c_3 | |
1103 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
1104 addcc c_12,t_1,t_1 != | |
1105 bcs,a %xcc,.+8 | |
1106 add c_3,t_2,c_3 | |
1107 srlx t_1,32,c_12 | |
1108 stuw t_1,rp(4) !=!r[4]=c2; | |
1109 or c_12,c_3,c_12 | |
1110 | |
1111 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
1112 addcc c_12,t_1,c_12 | |
1113 clr c_3 != | |
1114 bcs,a %xcc,.+8 | |
1115 add c_3,t_2,c_3 | |
1116 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
1117 addcc c_12,t_1,t_1 != | |
1118 bcs,a %xcc,.+8 | |
1119 add c_3,t_2,c_3 | |
1120 srlx t_1,32,c_12 | |
1121 stuw t_1,rp(5) !=!r[5]=c3; | |
1122 or c_12,c_3,c_12 | |
1123 | |
1124 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | |
1125 addcc c_12,t_1,t_1 | |
1126 srlx t_1,32,c_12 != | |
1127 stuw t_1,rp(6) !r[6]=c1; | |
1128 stuw c_12,rp(7) !r[7]=c2; | |
1129 | |
1130 ret | |
1131 restore %g0,%g0,%o0 | |
1132 | |
1133 .type bn_mul_comba4,#function | |
1134 .size bn_mul_comba4,(.-bn_mul_comba4) | |
1135 | |
1136 .align 32 | |
1137 | |
1138 .global bn_sqr_comba8 | |
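/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Note on the annotations: sqr_add_c2(a,i,j,...) denotes a cross
 * product, which enters the accumulator twice (as a[i]*a[j] and
 * a[j]*a[i]); hence each such product below goes through two addcc
 * steps with two separate carry checks, while sqr_add_c(a,i,...)
 * is a square term added only once.
 */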
1139 bn_sqr_comba8: | |
1140 save %sp,FRAME_SIZE,%sp | |
1141 mov 1,t_2 | |
1142 lduw ap(0),a_0 | |
1143 sllx t_2,32,t_2 | |
1144 lduw ap(1),a_1 | |
1145 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | |
1146 srlx t_1,32,c_12 | |
1147 stuw t_1,rp(0) !r[0]=c1; | |
1148 | |
1149 lduw ap(2),a_2 | |
1150 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); | |
1151 addcc c_12,t_1,c_12 | |
1152 clr c_3 | |
1153 bcs,a %xcc,.+8 | |
1154 add c_3,t_2,c_3 | |
1155 addcc c_12,t_1,t_1 | |
1156 bcs,a %xcc,.+8 | |
1157 add c_3,t_2,c_3 | |
1158 srlx t_1,32,c_12 | |
1159 stuw t_1,rp(1) !r[1]=c2; | |
1160 or c_12,c_3,c_12 | |
1161 | |
1162 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
1163 addcc c_12,t_1,c_12 | |
1164 clr c_3 | |
1165 bcs,a %xcc,.+8 | |
1166 add c_3,t_2,c_3 | |
1167 addcc c_12,t_1,c_12 | |
1168 bcs,a %xcc,.+8 | |
1169 add c_3,t_2,c_3 | |
1170 lduw ap(3),a_3 | |
1171 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
1172 addcc c_12,t_1,t_1 | |
1173 bcs,a %xcc,.+8 | |
1174 add c_3,t_2,c_3 | |
1175 srlx t_1,32,c_12 | |
1176 stuw t_1,rp(2) !r[2]=c3; | |
1177 or c_12,c_3,c_12 | |
1178 | |
1179 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | |
1180 addcc c_12,t_1,c_12 | |
1181 clr c_3 | |
1182 bcs,a %xcc,.+8 | |
1183 add c_3,t_2,c_3 | |
1184 addcc c_12,t_1,c_12 | |
1185 bcs,a %xcc,.+8 | |
1186 add c_3,t_2,c_3 | |
1187 lduw ap(4),a_4 | |
1188 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
1189 addcc c_12,t_1,c_12 | |
1190 bcs,a %xcc,.+8 | |
1191 add c_3,t_2,c_3 | |
1192 addcc c_12,t_1,t_1 | |
1193 bcs,a %xcc,.+8 | |
1194 add c_3,t_2,c_3 | |
1195 srlx t_1,32,c_12 | |
	stuw	t_1,rp(3)	!r[3]=c1;
1197 or c_12,c_3,c_12 | |
1198 | |
1199 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); | |
1200 addcc c_12,t_1,c_12 | |
1201 clr c_3 | |
1202 bcs,a %xcc,.+8 | |
1203 add c_3,t_2,c_3 | |
1204 addcc c_12,t_1,c_12 | |
1205 bcs,a %xcc,.+8 | |
1206 add c_3,t_2,c_3 | |
1207 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
1208 addcc c_12,t_1,c_12 | |
1209 bcs,a %xcc,.+8 | |
1210 add c_3,t_2,c_3 | |
1211 addcc c_12,t_1,c_12 | |
1212 bcs,a %xcc,.+8 | |
1213 add c_3,t_2,c_3 | |
1214 lduw ap(5),a_5 | |
1215 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
1216 addcc c_12,t_1,t_1 | |
1217 bcs,a %xcc,.+8 | |
1218 add c_3,t_2,c_3 | |
1219 srlx t_1,32,c_12 | |
1220 stuw t_1,rp(4) !r[4]=c2; | |
1221 or c_12,c_3,c_12 | |
1222 | |
1223 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); | |
1224 addcc c_12,t_1,c_12 | |
1225 clr c_3 | |
1226 bcs,a %xcc,.+8 | |
1227 add c_3,t_2,c_3 | |
1228 addcc c_12,t_1,c_12 | |
1229 bcs,a %xcc,.+8 | |
1230 add c_3,t_2,c_3 | |
1231 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); | |
1232 addcc c_12,t_1,c_12 | |
1233 bcs,a %xcc,.+8 | |
1234 add c_3,t_2,c_3 | |
1235 addcc c_12,t_1,c_12 | |
1236 bcs,a %xcc,.+8 | |
1237 add c_3,t_2,c_3 | |
1238 lduw ap(6),a_6 | |
1239 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
1240 addcc c_12,t_1,c_12 | |
1241 bcs,a %xcc,.+8 | |
1242 add c_3,t_2,c_3 | |
1243 addcc c_12,t_1,t_1 | |
1244 bcs,a %xcc,.+8 | |
1245 add c_3,t_2,c_3 | |
1246 srlx t_1,32,c_12 | |
1247 stuw t_1,rp(5) !r[5]=c3; | |
1248 or c_12,c_3,c_12 | |
1249 | |
1250 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); | |
1251 addcc c_12,t_1,c_12 | |
1252 clr c_3 | |
1253 bcs,a %xcc,.+8 | |
1254 add c_3,t_2,c_3 | |
1255 addcc c_12,t_1,c_12 | |
1256 bcs,a %xcc,.+8 | |
1257 add c_3,t_2,c_3 | |
1258 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); | |
1259 addcc c_12,t_1,c_12 | |
1260 bcs,a %xcc,.+8 | |
1261 add c_3,t_2,c_3 | |
1262 addcc c_12,t_1,c_12 | |
1263 bcs,a %xcc,.+8 | |
1264 add c_3,t_2,c_3 | |
1265 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); | |
1266 addcc c_12,t_1,c_12 | |
1267 bcs,a %xcc,.+8 | |
1268 add c_3,t_2,c_3 | |
1269 addcc c_12,t_1,c_12 | |
1270 bcs,a %xcc,.+8 | |
1271 add c_3,t_2,c_3 | |
1272 lduw ap(7),a_7 | |
1273 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); | |
1274 addcc c_12,t_1,t_1 | |
1275 bcs,a %xcc,.+8 | |
1276 add c_3,t_2,c_3 | |
1277 srlx t_1,32,c_12 | |
1278 stuw t_1,rp(6) !r[6]=c1; | |
1279 or c_12,c_3,c_12 | |
1280 | |
1281 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); | |
1282 addcc c_12,t_1,c_12 | |
1283 clr c_3 | |
1284 bcs,a %xcc,.+8 | |
1285 add c_3,t_2,c_3 | |
1286 addcc c_12,t_1,c_12 | |
1287 bcs,a %xcc,.+8 | |
1288 add c_3,t_2,c_3 | |
1289 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); | |
1290 addcc c_12,t_1,c_12 | |
1291 bcs,a %xcc,.+8 | |
1292 add c_3,t_2,c_3 | |
1293 addcc c_12,t_1,c_12 | |
1294 bcs,a %xcc,.+8 | |
1295 add c_3,t_2,c_3 | |
1296 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); | |
1297 addcc c_12,t_1,c_12 | |
1298 bcs,a %xcc,.+8 | |
1299 add c_3,t_2,c_3 | |
1300 addcc c_12,t_1,c_12 | |
1301 bcs,a %xcc,.+8 | |
1302 add c_3,t_2,c_3 | |
1303 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); | |
1304 addcc c_12,t_1,c_12 | |
1305 bcs,a %xcc,.+8 | |
1306 add c_3,t_2,c_3 | |
1307 addcc c_12,t_1,t_1 | |
1308 bcs,a %xcc,.+8 | |
1309 add c_3,t_2,c_3 | |
1310 srlx t_1,32,c_12 | |
1311 stuw t_1,rp(7) !r[7]=c2; | |
1312 or c_12,c_3,c_12 | |
1313 | |
1314 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); | |
1315 addcc c_12,t_1,c_12 | |
1316 clr c_3 | |
1317 bcs,a %xcc,.+8 | |
1318 add c_3,t_2,c_3 | |
1319 addcc c_12,t_1,c_12 | |
1320 bcs,a %xcc,.+8 | |
1321 add c_3,t_2,c_3 | |
1322 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); | |
1323 addcc c_12,t_1,c_12 | |
1324 bcs,a %xcc,.+8 | |
1325 add c_3,t_2,c_3 | |
1326 addcc c_12,t_1,c_12 | |
1327 bcs,a %xcc,.+8 | |
1328 add c_3,t_2,c_3 | |
1329 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); | |
1330 addcc c_12,t_1,c_12 | |
1331 bcs,a %xcc,.+8 | |
1332 add c_3,t_2,c_3 | |
1333 addcc c_12,t_1,c_12 | |
1334 bcs,a %xcc,.+8 | |
1335 add c_3,t_2,c_3 | |
1336 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); | |
1337 addcc c_12,t_1,t_1 | |
1338 bcs,a %xcc,.+8 | |
1339 add c_3,t_2,c_3 | |
1340 srlx t_1,32,c_12 | |
1341 stuw t_1,rp(8) !r[8]=c3; | |
1342 or c_12,c_3,c_12 | |
1343 | |
1344 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); | |
1345 addcc c_12,t_1,c_12 | |
1346 clr c_3 | |
1347 bcs,a %xcc,.+8 | |
1348 add c_3,t_2,c_3 | |
1349 addcc c_12,t_1,c_12 | |
1350 bcs,a %xcc,.+8 | |
1351 add c_3,t_2,c_3 | |
1352 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); | |
1353 addcc c_12,t_1,c_12 | |
1354 bcs,a %xcc,.+8 | |
1355 add c_3,t_2,c_3 | |
1356 addcc c_12,t_1,c_12 | |
1357 bcs,a %xcc,.+8 | |
1358 add c_3,t_2,c_3 | |
1359 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); | |
1360 addcc c_12,t_1,c_12 | |
1361 bcs,a %xcc,.+8 | |
1362 add c_3,t_2,c_3 | |
1363 addcc c_12,t_1,t_1 | |
1364 bcs,a %xcc,.+8 | |
1365 add c_3,t_2,c_3 | |
1366 srlx t_1,32,c_12 | |
1367 stuw t_1,rp(9) !r[9]=c1; | |
1368 or c_12,c_3,c_12 | |
1369 | |
1370 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1); | |
1371 addcc c_12,t_1,c_12 | |
1372 clr c_3 | |
1373 bcs,a %xcc,.+8 | |
1374 add c_3,t_2,c_3 | |
1375 addcc c_12,t_1,c_12 | |
1376 bcs,a %xcc,.+8 | |
1377 add c_3,t_2,c_3 | |
1378 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); | |
1379 addcc c_12,t_1,c_12 | |
1380 bcs,a %xcc,.+8 | |
1381 add c_3,t_2,c_3 | |
1382 addcc c_12,t_1,c_12 | |
1383 bcs,a %xcc,.+8 | |
1384 add c_3,t_2,c_3 | |
1385 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); | |
1386 addcc c_12,t_1,t_1 | |
1387 bcs,a %xcc,.+8 | |
1388 add c_3,t_2,c_3 | |
1389 srlx t_1,32,c_12 | |
1390 stuw t_1,rp(10) !r[10]=c2; | |
1391 or c_12,c_3,c_12 | |
1392 | |
1393 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2); | |
1394 addcc c_12,t_1,c_12 | |
1395 clr c_3 | |
1396 bcs,a %xcc,.+8 | |
1397 add c_3,t_2,c_3 | |
1398 addcc c_12,t_1,c_12 | |
1399 bcs,a %xcc,.+8 | |
1400 add c_3,t_2,c_3 | |
1401 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2); | |
1402 addcc c_12,t_1,c_12 | |
1403 bcs,a %xcc,.+8 | |
1404 add c_3,t_2,c_3 | |
1405 addcc c_12,t_1,t_1 | |
1406 bcs,a %xcc,.+8 | |
1407 add c_3,t_2,c_3 | |
1408 srlx t_1,32,c_12 | |
1409 stuw t_1,rp(11) !r[11]=c3; | |
1410 or c_12,c_3,c_12 | |
1411 | |
1412 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); | |
1413 addcc c_12,t_1,c_12 | |
1414 clr c_3 | |
1415 bcs,a %xcc,.+8 | |
1416 add c_3,t_2,c_3 | |
1417 addcc c_12,t_1,c_12 | |
1418 bcs,a %xcc,.+8 | |
1419 add c_3,t_2,c_3 | |
1420 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); | |
1421 addcc c_12,t_1,t_1 | |
1422 bcs,a %xcc,.+8 | |
1423 add c_3,t_2,c_3 | |
1424 srlx t_1,32,c_12 | |
1425 stuw t_1,rp(12) !r[12]=c1; | |
1426 or c_12,c_3,c_12 | |
1427 | |
1428 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); | |
1429 addcc c_12,t_1,c_12 | |
1430 clr c_3 | |
1431 bcs,a %xcc,.+8 | |
1432 add c_3,t_2,c_3 | |
1433 addcc c_12,t_1,t_1 | |
1434 bcs,a %xcc,.+8 | |
1435 add c_3,t_2,c_3 | |
1436 srlx t_1,32,c_12 | |
1437 stuw t_1,rp(13) !r[13]=c2; | |
1438 or c_12,c_3,c_12 | |
1439 | |
1440 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); | |
1441 addcc c_12,t_1,t_1 | |
1442 srlx t_1,32,c_12 | |
1443 stuw t_1,rp(14) !r[14]=c3; | |
1444 stuw c_12,rp(15) !r[15]=c1; | |
1445 | |
1446 ret | |
1447 restore %g0,%g0,%o0 | |
1448 | |
1449 .type bn_sqr_comba8,#function | |
1450 .size bn_sqr_comba8,(.-bn_sqr_comba8) | |
1451 | |
1452 .align 32 | |
1453 | |
1454 .global bn_sqr_comba4 | |
1455 /* | |
1456 * void bn_sqr_comba4(r,a) | |
1457 * BN_ULONG *r,*a; | |
1458 */ | |
1459 bn_sqr_comba4: | |
1460 save %sp,FRAME_SIZE,%sp | |
1461 mov 1,t_2 | |
1462 lduw ap(0),a_0 | |
1463 sllx t_2,32,t_2 | |
1464 lduw ap(1),a_1 | |
1465 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | |
1466 srlx t_1,32,c_12 | |
1467 stuw t_1,rp(0) !r[0]=c1; | |
1468 | |
1469 lduw ap(2),a_2 | |
1470 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1); | |
1471 addcc c_12,t_1,c_12 | |
1472 clr c_3 | |
1473 bcs,a %xcc,.+8 | |
1474 add c_3,t_2,c_3 | |
1475 addcc c_12,t_1,t_1 | |
1476 bcs,a %xcc,.+8 | |
1477 add c_3,t_2,c_3 | |
1478 srlx t_1,32,c_12 | |
1479 stuw t_1,rp(1) !r[1]=c2; | |
1480 or c_12,c_3,c_12 | |
1481 | |
1482 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
1483 addcc c_12,t_1,c_12 | |
1484 clr c_3 | |
1485 bcs,a %xcc,.+8 | |
1486 add c_3,t_2,c_3 | |
1487 addcc c_12,t_1,c_12 | |
1488 bcs,a %xcc,.+8 | |
1489 add c_3,t_2,c_3 | |
1490 lduw ap(3),a_3 | |
1491 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
1492 addcc c_12,t_1,t_1 | |
1493 bcs,a %xcc,.+8 | |
1494 add c_3,t_2,c_3 | |
1495 srlx t_1,32,c_12 | |
1496 stuw t_1,rp(2) !r[2]=c3; | |
1497 or c_12,c_3,c_12 | |
1498 | |
1499 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | |
1500 addcc c_12,t_1,c_12 | |
1501 clr c_3 | |
1502 bcs,a %xcc,.+8 | |
1503 add c_3,t_2,c_3 | |
1504 addcc c_12,t_1,c_12 | |
1505 bcs,a %xcc,.+8 | |
1506 add c_3,t_2,c_3 | |
1507 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
1508 addcc c_12,t_1,c_12 | |
1509 bcs,a %xcc,.+8 | |
1510 add c_3,t_2,c_3 | |
1511 addcc c_12,t_1,t_1 | |
1512 bcs,a %xcc,.+8 | |
1513 add c_3,t_2,c_3 | |
1514 srlx t_1,32,c_12 | |
1515 stuw t_1,rp(3) !r[3]=c1; | |
1516 or c_12,c_3,c_12 | |
1517 | |
1518 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
1519 addcc c_12,t_1,c_12 | |
1520 clr c_3 | |
1521 bcs,a %xcc,.+8 | |
1522 add c_3,t_2,c_3 | |
1523 addcc c_12,t_1,c_12 | |
1524 bcs,a %xcc,.+8 | |
1525 add c_3,t_2,c_3 | |
1526 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
1527 addcc c_12,t_1,t_1 | |
1528 bcs,a %xcc,.+8 | |
1529 add c_3,t_2,c_3 | |
1530 srlx t_1,32,c_12 | |
1531 stuw t_1,rp(4) !r[4]=c2; | |
1532 or c_12,c_3,c_12 | |
1533 | |
1534 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
1535 addcc c_12,t_1,c_12 | |
1536 clr c_3 | |
1537 bcs,a %xcc,.+8 | |
1538 add c_3,t_2,c_3 | |
1539 addcc c_12,t_1,t_1 | |
1540 bcs,a %xcc,.+8 | |
1541 add c_3,t_2,c_3 | |
1542 srlx t_1,32,c_12 | |
1543 stuw t_1,rp(5) !r[5]=c3; | |
1544 or c_12,c_3,c_12 | |
1545 | |
1546 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); | |
1547 addcc c_12,t_1,t_1 | |
1548 srlx t_1,32,c_12 | |
1549 stuw t_1,rp(6) !r[6]=c1; | |
1550 stuw c_12,rp(7) !r[7]=c2; | |
1551 | |
1552 ret | |
1553 restore %g0,%g0,%o0 | |
1554 | |
1555 .type bn_sqr_comba4,#function | |
1556 .size bn_sqr_comba4,(.-bn_sqr_comba4) | |
1557 | |
1558 .align 32 | |