1 .ident "sparcv8plus.s, Version 1.4" | |
2 .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | |
3 | |
4 /* | |
5 * ==================================================================== | |
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
7 * project. | |
8 * | |
9 * Rights for redistribution and usage in source and binary forms are | |
10 * granted according to the OpenSSL license. Warranty of any kind is | |
11 * disclaimed. | |
12 * ==================================================================== | |
13 */ | |
14 | |
15 /* | |
 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is | |
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c | |
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/. | |
20 * | |
21 * Questions-n-answers. | |
22 * | |
23 * Q. How to compile? | |
24 * A. With SC4.x/SC5.x: | |
25 * | |
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
27 * | |
28 * and with gcc: | |
29 * | |
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
31 * | |
 * or, if the above fails (it does if you have gas installed):
33 * | |
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o | |
35 * | |
36 * Quick-n-dirty way to fuse the module into the library. | |
37 * Provided that the library is already configured and built | |
 * (in the 0.9.2 case, with the no-asm option):
39 * | |
40 * # cd crypto/bn | |
41 * # cp /some/place/bn_asm.sparc.v8plus.S . | |
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | |
43 * # make | |
44 * # cd ../.. | |
45 * # make; make test | |
46 * | |
47 * Quick-n-dirty way to get rid of it: | |
48 * | |
49 * # cd crypto/bn | |
50 * # touch bn_asm.c | |
51 * # make | |
52 * # cd ../.. | |
53 * # make; make test | |
54 * | |
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually a v9-compliant CPU, i.e. *any* UltraSPARC, run
 *    under special conditions, namely when the kernel doesn't
 *    preserve the upper 32 bits of the otherwise 64-bit registers
 *    during a context switch.
60 * | |
61 * Q. Why just UltraSPARC? What about SuperSPARC? | |
 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
 *    version is provided alongside. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the
 *    code for an explanation). But what's so special about this
 *    UltraSPARC implementation? Why didn't I let the compiler do
 *    the job? The trouble is that most of the available compilers
 *    (well, SC5.0 is the only exception) don't attempt to take
 *    advantage of UltraSPARC's 64-bitness under 32-bit kernels,
 *    even though it's perfectly possible (see the next question).
71 * | |
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it | |
73 * doesn't work? | |
 * A. You can't address *all* registers as 64-bit wide:-( The catch
 *    is that you may actually rely upon %o0-%o5 and %g1-%g4 being
 *    fully preserved if you're in a leaf function, i.e. one that
 *    never calls any other function. All functions in this module
 *    are leaf, and 10 registers is a handful. As a matter of fact,
 *    the non-"comba" routines don't require even that much, and I
 *    could even afford not to allocate a stack frame of their own
 *    for 'em:-)
81 * | |
82 * Q. What about 64-bit kernels? | |
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
85 * | |
86 * Q. What about shared libraries? | |
 * A. What about 'em? Kidding again:-) The code does *not* contain
 *    any position dependencies and it's safe to include it in a
 *    shared library as is.
90 * | |
91 * Q. How much faster does it go? | |
 * A. Do you have a good benchmark? In any case, below is what I
 *    experience with the crypto/bn/expspeed.c test program:
94 * | |
95 * v8plus module on U10/300MHz against bn_asm.c compiled with: | |
96 * | |
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12% | |
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35% | |
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45% | |
100 * | |
101 * v8 module on SS10/60MHz against bn_asm.c compiled with: | |
102 * | |
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10% | |
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10% | |
105 * egcs-1.1.2 -mv8 -O3 +35-45% | |
106 * | |
 * As you can see, it's damn hard to beat the new Sun C compiler,
 * and it's first and foremost GNU C users who will appreciate
 * this assembler implementation:-)
110 */ | |
111 | |
112 /* | |
113 * Revision history. | |
114 * | |
115 * 1.0 - initial release; | |
116 * 1.1 - new loop unrolling model(*); | |
117 * - some more fine tuning; | |
118 * 1.2 - made gas friendly; | |
119 * - updates to documentation concerning v9; | |
120 * - new performance comparison matrix; | |
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp; | |
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient) | |
 *	      resulting in a slight overall performance kick;
124 * - some retunes; | |
125 * - support for GNU as added; | |
126 * | |
127 * (*) Originally unrolled loop looked like this: | |
128 * for (;;) { | |
129 * op(p+0); if (--n==0) break; | |
130 * op(p+1); if (--n==0) break; | |
131 * op(p+2); if (--n==0) break; | |
132 * op(p+3); if (--n==0) break; | |
133 * p+=4; | |
134 * } | |
 *	I unroll according to the following:
136 * while (n&~3) { | |
137 * op(p+0); op(p+1); op(p+2); op(p+3); | |
 *	    p+=4; n-=4;
139 * } | |
140 * if (n) { | |
 *	    op(p+0); if (--n==0) return;
 *	    op(p+1); if (--n==0) return;
 *	    op(p+2); return;
144 * } | |
145 */ | |
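
/*
 * For the curious, here is the second scheme above as compilable C,
 * with a trivial op() standing in for the real per-word operation.
 * It's an illustration of the control flow only (the op and walk
 * names are made up for the example) and no part of the build:
 *
 *	static void op(unsigned int *p) { *p += 1; }
 *
 *	void walk(unsigned int *p, int n)
 *	{
 *		while (n & ~3) {
 *			op(p+0); op(p+1); op(p+2); op(p+3);
 *			p += 4; n -= 4;
 *		}
 *		if (n) {
 *			op(p+0); if (--n == 0) return;
 *			op(p+1); if (--n == 0) return;
 *			op(p+2); return;
 *		}
 *	}
 */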
146 | |
147 #if defined(__SUNPRO_C) && defined(__sparcv9) | |
/* They've said -xarch=v9 at the command line */
149 .register %g2,#scratch | |
150 .register %g3,#scratch | |
151 # define FRAME_SIZE -192 | |
152 #elif defined(__GNUC__) && defined(__arch64__) | |
/* They've said -m64 at the command line */
154 .register %g2,#scratch | |
155 .register %g3,#scratch | |
156 # define FRAME_SIZE -192 | |
157 #else | |
158 # define FRAME_SIZE -96 | |
159 #endif | |
160 /* | |
161 * GNU assembler can't stand stuw:-( | |
162 */ | |
163 #define stuw st | |
164 | |
165 .section ".text",#alloc,#execinstr | |
166 .file "bn_asm.sparc.v8plus.S" | |
167 | |
168 .align 32 | |
169 | |
170 .global bn_mul_add_words | |
171 /* | |
172 * BN_ULONG bn_mul_add_words(rp,ap,num,w) | |
173 * BN_ULONG *rp,*ap; | |
174 * int num; | |
175 * BN_ULONG w; | |
176 */ | |
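/*
 * A C sketch of what this routine computes, no part of the build:
 * with 32-bit BN_ULONG words, as in this v8plus model, it performs
 * rp[i] += ap[i]*w over num words and returns the final carry word
 * (the _ref name is made up for the example):
 *
 *	unsigned int bn_mul_add_words_ref(unsigned int *rp,
 *	                                  const unsigned int *ap,
 *	                                  int num, unsigned int w)
 *	{
 *		unsigned long long acc = 0;
 *
 *		while (num-- > 0) {
 *			acc += (unsigned long long)*ap++ * w + *rp;
 *			*rp++ = (unsigned int)acc;	// stuw: low word
 *			acc >>= 32;			// srlx: keep carry
 *		}
 *		return (unsigned int)acc;
 *	}
 *
 * The mulx/srlx pairs below are that 64-bit arithmetic; the unrolled
 * loop merely processes four such steps per iteration.
 */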
177 bn_mul_add_words: | |
178 sra %o2,%g0,%o2 ! signx %o2 | |
179 brgz,a %o2,.L_bn_mul_add_words_proceed | |
180 lduw [%o1],%g2 | |
181 retl | |
182 clr %o0 | |
183 nop | |
184 nop | |
185 nop | |
186 | |
187 .L_bn_mul_add_words_proceed: | |
188 srl %o3,%g0,%o3 ! clruw %o3 | |
189 andcc %o2,-4,%g0 | |
190 bz,pn %icc,.L_bn_mul_add_words_tail | |
191 clr %o5 | |
192 | |
193 .L_bn_mul_add_words_loop: ! wow! 32 aligned! | |
194 lduw [%o0],%g1 | |
195 lduw [%o1+4],%g3 | |
196 mulx %o3,%g2,%g2 | |
197 add %g1,%o5,%o4 | |
198 nop | |
199 add %o4,%g2,%o4 | |
200 stuw %o4,[%o0] | |
201 srlx %o4,32,%o5 | |
202 | |
203 lduw [%o0+4],%g1 | |
204 lduw [%o1+8],%g2 | |
205 mulx %o3,%g3,%g3 | |
206 add %g1,%o5,%o4 | |
207 dec 4,%o2 | |
208 add %o4,%g3,%o4 | |
209 stuw %o4,[%o0+4] | |
210 srlx %o4,32,%o5 | |
211 | |
212 lduw [%o0+8],%g1 | |
213 lduw [%o1+12],%g3 | |
214 mulx %o3,%g2,%g2 | |
215 add %g1,%o5,%o4 | |
216 inc 16,%o1 | |
217 add %o4,%g2,%o4 | |
218 stuw %o4,[%o0+8] | |
219 srlx %o4,32,%o5 | |
220 | |
221 lduw [%o0+12],%g1 | |
222 mulx %o3,%g3,%g3 | |
223 add %g1,%o5,%o4 | |
224 inc 16,%o0 | |
225 add %o4,%g3,%o4 | |
226 andcc %o2,-4,%g0 | |
227 stuw %o4,[%o0-4] | |
228 srlx %o4,32,%o5 | |
229 bnz,a,pt %icc,.L_bn_mul_add_words_loop | |
230 lduw [%o1],%g2 | |
231 | |
232 brnz,a,pn %o2,.L_bn_mul_add_words_tail | |
233 lduw [%o1],%g2 | |
234 .L_bn_mul_add_words_return: | |
235 retl | |
236 mov %o5,%o0 | |
237 | |
238 .L_bn_mul_add_words_tail: | |
239 lduw [%o0],%g1 | |
240 mulx %o3,%g2,%g2 | |
241 add %g1,%o5,%o4 | |
242 dec %o2 | |
243 add %o4,%g2,%o4 | |
244 srlx %o4,32,%o5 | |
245 brz,pt %o2,.L_bn_mul_add_words_return | |
246 stuw %o4,[%o0] | |
247 | |
248 lduw [%o1+4],%g2 | |
249 lduw [%o0+4],%g1 | |
250 mulx %o3,%g2,%g2 | |
251 add %g1,%o5,%o4 | |
252 dec %o2 | |
253 add %o4,%g2,%o4 | |
254 srlx %o4,32,%o5 | |
255 brz,pt %o2,.L_bn_mul_add_words_return | |
256 stuw %o4,[%o0+4] | |
257 | |
258 lduw [%o1+8],%g2 | |
259 lduw [%o0+8],%g1 | |
260 mulx %o3,%g2,%g2 | |
261 add %g1,%o5,%o4 | |
262 add %o4,%g2,%o4 | |
263 stuw %o4,[%o0+8] | |
264 retl | |
265 srlx %o4,32,%o0 | |
266 | |
267 .type bn_mul_add_words,#function | |
268 .size bn_mul_add_words,(.-bn_mul_add_words) | |
269 | |
270 .align 32 | |
271 | |
272 .global bn_mul_words | |
273 /* | |
274 * BN_ULONG bn_mul_words(rp,ap,num,w) | |
275 * BN_ULONG *rp,*ap; | |
276 * int num; | |
277 * BN_ULONG w; | |
278 */ | |
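/*
 * C sketch of the semantics, no part of the build: rp[i] receives
 * the low word of ap[i]*w plus the running carry, and the final
 * carry word is returned (the _ref name is made up):
 *
 *	unsigned int bn_mul_words_ref(unsigned int *rp,
 *	                              const unsigned int *ap,
 *	                              int num, unsigned int w)
 *	{
 *		unsigned long long acc = 0;
 *
 *		while (num-- > 0) {
 *			acc += (unsigned long long)*ap++ * w;
 *			*rp++ = (unsigned int)acc;
 *			acc >>= 32;
 *		}
 *		return (unsigned int)acc;
 *	}
 */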
279 bn_mul_words: | |
280 sra %o2,%g0,%o2 ! signx %o2 | |
	brgz,a	%o2,.L_bn_mul_words_proceed
282 lduw [%o1],%g2 | |
283 retl | |
284 clr %o0 | |
285 nop | |
286 nop | |
287 nop | |
288 | |
.L_bn_mul_words_proceed:
290 srl %o3,%g0,%o3 ! clruw %o3 | |
291 andcc %o2,-4,%g0 | |
292 bz,pn %icc,.L_bn_mul_words_tail | |
293 clr %o5 | |
294 | |
295 .L_bn_mul_words_loop: ! wow! 32 aligned! | |
296 lduw [%o1+4],%g3 | |
297 mulx %o3,%g2,%g2 | |
298 add %g2,%o5,%o4 | |
299 nop | |
300 stuw %o4,[%o0] | |
301 srlx %o4,32,%o5 | |
302 | |
303 lduw [%o1+8],%g2 | |
304 mulx %o3,%g3,%g3 | |
305 add %g3,%o5,%o4 | |
306 dec 4,%o2 | |
307 stuw %o4,[%o0+4] | |
308 srlx %o4,32,%o5 | |
309 | |
310 lduw [%o1+12],%g3 | |
311 mulx %o3,%g2,%g2 | |
312 add %g2,%o5,%o4 | |
313 inc 16,%o1 | |
314 stuw %o4,[%o0+8] | |
315 srlx %o4,32,%o5 | |
316 | |
317 mulx %o3,%g3,%g3 | |
318 add %g3,%o5,%o4 | |
319 inc 16,%o0 | |
320 stuw %o4,[%o0-4] | |
321 srlx %o4,32,%o5 | |
322 andcc %o2,-4,%g0 | |
323 bnz,a,pt %icc,.L_bn_mul_words_loop | |
324 lduw [%o1],%g2 | |
325 nop | |
326 nop | |
327 | |
328 brnz,a,pn %o2,.L_bn_mul_words_tail | |
329 lduw [%o1],%g2 | |
330 .L_bn_mul_words_return: | |
331 retl | |
332 mov %o5,%o0 | |
333 | |
334 .L_bn_mul_words_tail: | |
335 mulx %o3,%g2,%g2 | |
336 add %g2,%o5,%o4 | |
337 dec %o2 | |
338 srlx %o4,32,%o5 | |
339 brz,pt %o2,.L_bn_mul_words_return | |
340 stuw %o4,[%o0] | |
341 | |
342 lduw [%o1+4],%g2 | |
343 mulx %o3,%g2,%g2 | |
344 add %g2,%o5,%o4 | |
345 dec %o2 | |
346 srlx %o4,32,%o5 | |
347 brz,pt %o2,.L_bn_mul_words_return | |
348 stuw %o4,[%o0+4] | |
349 | |
350 lduw [%o1+8],%g2 | |
351 mulx %o3,%g2,%g2 | |
352 add %g2,%o5,%o4 | |
353 stuw %o4,[%o0+8] | |
354 retl | |
355 srlx %o4,32,%o0 | |
356 | |
357 .type bn_mul_words,#function | |
358 .size bn_mul_words,(.-bn_mul_words) | |
359 | |
360 .align 32 | |
361 .global bn_sqr_words | |
362 /* | |
363 * void bn_sqr_words(r,a,n) | |
364 * BN_ULONG *r,*a; | |
365 * int n; | |
366 */ | |
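/*
 * C sketch of the semantics, no part of the build: each input word
 * is squared into a double-word of the result, low word first:
 *
 *	void bn_sqr_words_ref(unsigned int *r, const unsigned int *a,
 *	                      int n)
 *	{
 *		while (n-- > 0) {
 *			unsigned long long t = (unsigned long long)*a * *a;
 *			a++;
 *			*r++ = (unsigned int)t;		// low word
 *			*r++ = (unsigned int)(t >> 32);	// high word
 *		}
 *	}
 */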
367 bn_sqr_words: | |
368 sra %o2,%g0,%o2 ! signx %o2 | |
	brgz,a	%o2,.L_bn_sqr_words_proceed
370 lduw [%o1],%g2 | |
371 retl | |
372 clr %o0 | |
373 nop | |
374 nop | |
375 nop | |
376 | |
.L_bn_sqr_words_proceed:
378 andcc %o2,-4,%g0 | |
379 nop | |
380 bz,pn %icc,.L_bn_sqr_words_tail | |
381 nop | |
382 | |
383 .L_bn_sqr_words_loop: ! wow! 32 aligned! | |
384 lduw [%o1+4],%g3 | |
385 mulx %g2,%g2,%o4 | |
386 stuw %o4,[%o0] | |
387 srlx %o4,32,%o5 | |
388 stuw %o5,[%o0+4] | |
389 nop | |
390 | |
391 lduw [%o1+8],%g2 | |
392 mulx %g3,%g3,%o4 | |
393 dec 4,%o2 | |
394 stuw %o4,[%o0+8] | |
395 srlx %o4,32,%o5 | |
396 stuw %o5,[%o0+12] | |
397 | |
398 lduw [%o1+12],%g3 | |
399 mulx %g2,%g2,%o4 | |
400 srlx %o4,32,%o5 | |
401 stuw %o4,[%o0+16] | |
402 inc 16,%o1 | |
403 stuw %o5,[%o0+20] | |
404 | |
405 mulx %g3,%g3,%o4 | |
406 inc 32,%o0 | |
407 stuw %o4,[%o0-8] | |
408 srlx %o4,32,%o5 | |
409 andcc %o2,-4,%g2 | |
410 stuw %o5,[%o0-4] | |
411 bnz,a,pt %icc,.L_bn_sqr_words_loop | |
412 lduw [%o1],%g2 | |
413 nop | |
414 | |
415 brnz,a,pn %o2,.L_bn_sqr_words_tail | |
416 lduw [%o1],%g2 | |
417 .L_bn_sqr_words_return: | |
418 retl | |
419 clr %o0 | |
420 | |
421 .L_bn_sqr_words_tail: | |
422 mulx %g2,%g2,%o4 | |
423 dec %o2 | |
424 stuw %o4,[%o0] | |
425 srlx %o4,32,%o5 | |
426 brz,pt %o2,.L_bn_sqr_words_return | |
427 stuw %o5,[%o0+4] | |
428 | |
429 lduw [%o1+4],%g2 | |
430 mulx %g2,%g2,%o4 | |
431 dec %o2 | |
432 stuw %o4,[%o0+8] | |
433 srlx %o4,32,%o5 | |
434 brz,pt %o2,.L_bn_sqr_words_return | |
435 stuw %o5,[%o0+12] | |
436 | |
437 lduw [%o1+8],%g2 | |
438 mulx %g2,%g2,%o4 | |
439 srlx %o4,32,%o5 | |
440 stuw %o4,[%o0+16] | |
441 stuw %o5,[%o0+20] | |
442 retl | |
443 clr %o0 | |
444 | |
445 .type bn_sqr_words,#function | |
446 .size bn_sqr_words,(.-bn_sqr_words) | |
447 | |
448 .align 32 | |
449 .global bn_div_words | |
450 /* | |
451 * BN_ULONG bn_div_words(h,l,d) | |
452 * BN_ULONG h,l,d; | |
453 */ | |
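/*
 * C sketch of the semantics, no part of the build; it mirrors the
 * four instructions below one to one, assuming the quotient fits in
 * 32 bits as the caller is expected to guarantee:
 *
 *	unsigned int bn_div_words_ref(unsigned int h, unsigned int l,
 *	                              unsigned int d)
 *	{
 *		unsigned long long hl =
 *		    ((unsigned long long)h << 32) | l;	// sllx + or
 *		return (unsigned int)(hl / d);		// udivx + srl
 *	}
 */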
454 bn_div_words: | |
455 sllx %o0,32,%o0 | |
456 or %o0,%o1,%o0 | |
457 udivx %o0,%o2,%o0 | |
458 retl | |
459 srl %o0,%g0,%o0 ! clruw %o0 | |
460 | |
461 .type bn_div_words,#function | |
462 .size bn_div_words,(.-bn_div_words) | |
463 | |
464 .align 32 | |
465 | |
466 .global bn_add_words | |
467 /* | |
468 * BN_ULONG bn_add_words(rp,ap,bp,n) | |
469 * BN_ULONG *rp,*ap,*bp; | |
470 * int n; | |
471 */ | |
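/*
 * C sketch of the semantics, no part of the build: word-wise
 * addition with carry propagation, returning the final carry bit
 * (the _ref name is made up):
 *
 *	unsigned int bn_add_words_ref(unsigned int *rp,
 *	                              const unsigned int *ap,
 *	                              const unsigned int *bp, int n)
 *	{
 *		unsigned long long t = 0;
 *
 *		while (n-- > 0) {
 *			t = (unsigned long long)*ap++ + *bp++ + (t >> 32);
 *			*rp++ = (unsigned int)t;
 *		}
 *		return (unsigned int)(t >> 32);
 *	}
 *
 * The assembly keeps the carry in %icc instead, and the loop body is
 * arranged so that no instruction between the addccc steps touches
 * the flags (note the flag-preserving 'and' before the loop branch).
 */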
472 bn_add_words: | |
473 sra %o3,%g0,%o3 ! signx %o3 | |
474 brgz,a %o3,.L_bn_add_words_proceed | |
475 lduw [%o1],%o4 | |
476 retl | |
477 clr %o0 | |
478 | |
479 .L_bn_add_words_proceed: | |
480 andcc %o3,-4,%g0 | |
481 bz,pn %icc,.L_bn_add_words_tail | |
482 addcc %g0,0,%g0 ! clear carry flag | |
483 | |
484 .L_bn_add_words_loop: ! wow! 32 aligned! | |
485 dec 4,%o3 | |
486 lduw [%o2],%o5 | |
487 lduw [%o1+4],%g1 | |
488 lduw [%o2+4],%g2 | |
489 lduw [%o1+8],%g3 | |
490 lduw [%o2+8],%g4 | |
491 addccc %o5,%o4,%o5 | |
492 stuw %o5,[%o0] | |
493 | |
494 lduw [%o1+12],%o4 | |
495 lduw [%o2+12],%o5 | |
496 inc 16,%o1 | |
497 addccc %g1,%g2,%g1 | |
498 stuw %g1,[%o0+4] | |
499 | |
500 inc 16,%o2 | |
501 addccc %g3,%g4,%g3 | |
502 stuw %g3,[%o0+8] | |
503 | |
504 inc 16,%o0 | |
505 addccc %o5,%o4,%o5 | |
506 stuw %o5,[%o0-4] | |
507 and %o3,-4,%g1 | |
508 brnz,a,pt %g1,.L_bn_add_words_loop | |
509 lduw [%o1],%o4 | |
510 | |
511 brnz,a,pn %o3,.L_bn_add_words_tail | |
512 lduw [%o1],%o4 | |
513 .L_bn_add_words_return: | |
514 clr %o0 | |
515 retl | |
516 movcs %icc,1,%o0 | |
517 nop | |
518 | |
519 .L_bn_add_words_tail: | |
520 lduw [%o2],%o5 | |
521 dec %o3 | |
522 addccc %o5,%o4,%o5 | |
523 brz,pt %o3,.L_bn_add_words_return | |
524 stuw %o5,[%o0] | |
525 | |
526 lduw [%o1+4],%o4 | |
527 lduw [%o2+4],%o5 | |
528 dec %o3 | |
529 addccc %o5,%o4,%o5 | |
530 brz,pt %o3,.L_bn_add_words_return | |
531 stuw %o5,[%o0+4] | |
532 | |
533 lduw [%o1+8],%o4 | |
534 lduw [%o2+8],%o5 | |
535 addccc %o5,%o4,%o5 | |
536 stuw %o5,[%o0+8] | |
537 clr %o0 | |
538 retl | |
539 movcs %icc,1,%o0 | |
540 | |
541 .type bn_add_words,#function | |
542 .size bn_add_words,(.-bn_add_words) | |
543 | |
544 .global bn_sub_words | |
545 /* | |
546 * BN_ULONG bn_sub_words(rp,ap,bp,n) | |
547 * BN_ULONG *rp,*ap,*bp; | |
548 * int n; | |
549 */ | |
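/*
 * C sketch of the semantics, no part of the build: word-wise
 * subtraction with borrow propagation, returning the final borrow
 * (the _ref name is made up):
 *
 *	unsigned int bn_sub_words_ref(unsigned int *rp,
 *	                              const unsigned int *ap,
 *	                              const unsigned int *bp, int n)
 *	{
 *		unsigned int borrow = 0;
 *
 *		while (n-- > 0) {
 *			unsigned long long t =
 *			    (unsigned long long)*ap++ - *bp++ - borrow;
 *			*rp++ = (unsigned int)t;
 *			borrow = (unsigned int)(t >> 63);  // 1 on underflow
 *		}
 *		return borrow;
 *	}
 */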
550 bn_sub_words: | |
551 sra %o3,%g0,%o3 ! signx %o3 | |
552 brgz,a %o3,.L_bn_sub_words_proceed | |
553 lduw [%o1],%o4 | |
554 retl | |
555 clr %o0 | |
556 | |
557 .L_bn_sub_words_proceed: | |
558 andcc %o3,-4,%g0 | |
559 bz,pn %icc,.L_bn_sub_words_tail | |
560 addcc %g0,0,%g0 ! clear carry flag | |
561 | |
562 .L_bn_sub_words_loop: ! wow! 32 aligned! | |
563 dec 4,%o3 | |
564 lduw [%o2],%o5 | |
565 lduw [%o1+4],%g1 | |
566 lduw [%o2+4],%g2 | |
567 lduw [%o1+8],%g3 | |
568 lduw [%o2+8],%g4 | |
569 subccc %o4,%o5,%o5 | |
570 stuw %o5,[%o0] | |
571 | |
572 lduw [%o1+12],%o4 | |
573 lduw [%o2+12],%o5 | |
574 inc 16,%o1 | |
575 subccc %g1,%g2,%g2 | |
576 stuw %g2,[%o0+4] | |
577 | |
578 inc 16,%o2 | |
579 subccc %g3,%g4,%g4 | |
580 stuw %g4,[%o0+8] | |
581 | |
582 inc 16,%o0 | |
583 subccc %o4,%o5,%o5 | |
584 stuw %o5,[%o0-4] | |
585 and %o3,-4,%g1 | |
586 brnz,a,pt %g1,.L_bn_sub_words_loop | |
587 lduw [%o1],%o4 | |
588 | |
589 brnz,a,pn %o3,.L_bn_sub_words_tail | |
590 lduw [%o1],%o4 | |
591 .L_bn_sub_words_return: | |
592 clr %o0 | |
593 retl | |
594 movcs %icc,1,%o0 | |
595 nop | |
596 | |
597 .L_bn_sub_words_tail: ! wow! 32 aligned! | |
598 lduw [%o2],%o5 | |
599 dec %o3 | |
600 subccc %o4,%o5,%o5 | |
601 brz,pt %o3,.L_bn_sub_words_return | |
602 stuw %o5,[%o0] | |
603 | |
604 lduw [%o1+4],%o4 | |
605 lduw [%o2+4],%o5 | |
606 dec %o3 | |
607 subccc %o4,%o5,%o5 | |
608 brz,pt %o3,.L_bn_sub_words_return | |
609 stuw %o5,[%o0+4] | |
610 | |
611 lduw [%o1+8],%o4 | |
612 lduw [%o2+8],%o5 | |
613 subccc %o4,%o5,%o5 | |
614 stuw %o5,[%o0+8] | |
615 clr %o0 | |
616 retl | |
617 movcs %icc,1,%o0 | |
618 | |
619 .type bn_sub_words,#function | |
620 .size bn_sub_words,(.-bn_sub_words) | |
621 | |
622 /* | |
 * The code below depends on the fact that the upper parts of the
 * %l0-%l7 and %i0-%i7 registers are zeroed by the kernel after a
 * context switch. In previous versions this comment stated that
 * "the trouble is that it's not feasible to implement the
 * mumbo-jumbo in less V9 instructions:-(" which apparently isn't
 * true thanks to the 'bcs,a %xcc,.+8; inc %rd' pair. But the
 * performance improvement results not from the shorter code, but
 * from the elimination of multicycle, non-pairable 'rd %y,%rd'
 * instructions.
631 * | |
632 * Andy. | |
633 */ | |
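
/*
 * For reference, a C model of the mul_add_c primitive that the
 * annotations below refer to; c3:c2:c1 is a 96-bit accumulator of
 * 32-bit words, and each step folds in one 32x32->64-bit product
 * (the _ref name is made up, no part of the build):
 *
 *	void mul_add_c_ref(unsigned int a, unsigned int b,
 *	                   unsigned int *c1, unsigned int *c2,
 *	                   unsigned int *c3)
 *	{
 *		unsigned long long prod = (unsigned long long)a * b;
 *		unsigned long long acc  =
 *		    ((unsigned long long)*c2 << 32) | *c1;
 *		unsigned long long sum  = acc + prod;
 *
 *		*c3 += (sum < prod);	// carry out of the 64-bit add
 *		*c1  = (unsigned int)sum;
 *		*c2  = (unsigned int)(sum >> 32);
 *	}
 *
 * In the assembly, c_12 holds c2:c1 as one 64-bit register, and the
 * 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair adds 1<<32 to c_3 on carry,
 * so that the later 'or c_12,c_3,c_12' folds the carries back in.
 */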
634 | |
635 /* | |
 * Here is the register usage map for *all* the routines below.
637 */ | |
638 #define t_1 %o0 | |
639 #define t_2 %o1 | |
640 #define c_12 %o2 | |
641 #define c_3 %o3 | |
642 | |
643 #define ap(I) [%i1+4*I] | |
644 #define bp(I) [%i2+4*I] | |
645 #define rp(I) [%i0+4*I] | |
646 | |
647 #define a_0 %l0 | |
648 #define a_1 %l1 | |
649 #define a_2 %l2 | |
650 #define a_3 %l3 | |
651 #define a_4 %l4 | |
652 #define a_5 %l5 | |
653 #define a_6 %l6 | |
654 #define a_7 %l7 | |
655 | |
656 #define b_0 %i3 | |
657 #define b_1 %i4 | |
658 #define b_2 %i5 | |
659 #define b_3 %o4 | |
660 #define b_4 %o5 | |
661 #define b_5 %o7 | |
662 #define b_6 %g1 | |
663 #define b_7 %g4 | |
664 | |
665 .align 32 | |
666 .global bn_mul_comba8 | |
667 /* | |
668 * void bn_mul_comba8(r,a,b) | |
669 * BN_ULONG *r,*a,*b; | |
670 */ | |
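/*
 * A C model of the column-wise (comba) walk this routine unrolls,
 * no part of the build (the _ref name is made up): the inner bounds
 * pick out the a[i]*b[k-i] products that feed column k, and carries
 * collect in the upper half of the 64-bit accumulator just as c_3
 * does in the unrolled code:
 *
 *	void bn_mul_comba8_ref(unsigned int *r, const unsigned int *a,
 *	                       const unsigned int *b)
 *	{
 *		unsigned long long acc = 0;
 *		unsigned int c3 = 0;
 *		int i, k;
 *
 *		for (k = 0; k < 15; k++) {
 *			int lo = k > 7 ? k - 7 : 0;
 *			int hi = k < 7 ? k : 7;
 *			for (i = lo; i <= hi; i++) {
 *				unsigned long long p =
 *				    (unsigned long long)a[i] * b[k-i];
 *				acc += p;
 *				c3  += (acc < p);	// 64-bit carry
 *			}
 *			r[k] = (unsigned int)acc;
 *			acc  = (acc >> 32) |
 *			       ((unsigned long long)c3 << 32);
 *			c3   = 0;
 *		}
 *		r[15] = (unsigned int)acc;
 *	}
 */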
671 bn_mul_comba8: | |
672 save %sp,FRAME_SIZE,%sp | |
673 mov 1,t_2 | |
674 lduw ap(0),a_0 | |
675 sllx t_2,32,t_2 | |
676 lduw bp(0),b_0 != | |
677 lduw bp(1),b_1 | |
678 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | |
679 srlx t_1,32,c_12 | |
680 stuw t_1,rp(0) !=!r[0]=c1; | |
681 | |
682 lduw ap(1),a_1 | |
683 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | |
684 addcc c_12,t_1,c_12 | |
685 clr c_3 != | |
686 bcs,a %xcc,.+8 | |
687 add c_3,t_2,c_3 | |
688 lduw ap(2),a_2 | |
689 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
690 addcc c_12,t_1,t_1 | |
691 bcs,a %xcc,.+8 | |
692 add c_3,t_2,c_3 | |
693 srlx t_1,32,c_12 != | |
694 stuw t_1,rp(1) !r[1]=c2; | |
695 or c_12,c_3,c_12 | |
696 | |
697 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
698 addcc c_12,t_1,c_12 != | |
699 clr c_3 | |
700 bcs,a %xcc,.+8 | |
701 add c_3,t_2,c_3 | |
702 lduw bp(2),b_2 != | |
703 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
704 addcc c_12,t_1,c_12 | |
705 bcs,a %xcc,.+8 | |
706 add c_3,t_2,c_3 != | |
707 lduw bp(3),b_3 | |
708 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
709 addcc c_12,t_1,t_1 | |
710 bcs,a %xcc,.+8 != | |
711 add c_3,t_2,c_3 | |
712 srlx t_1,32,c_12 | |
713 stuw t_1,rp(2) !r[2]=c3; | |
714 or c_12,c_3,c_12 != | |
715 | |
716 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
717 addcc c_12,t_1,c_12 | |
718 clr c_3 | |
719 bcs,a %xcc,.+8 != | |
720 add c_3,t_2,c_3 | |
721 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); | |
722 addcc c_12,t_1,c_12 | |
723 bcs,a %xcc,.+8 != | |
724 add c_3,t_2,c_3 | |
725 lduw ap(3),a_3 | |
726 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
727 addcc c_12,t_1,c_12 != | |
728 bcs,a %xcc,.+8 | |
729 add c_3,t_2,c_3 | |
730 lduw ap(4),a_4 | |
731 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!= | |
732 addcc c_12,t_1,t_1 | |
733 bcs,a %xcc,.+8 | |
734 add c_3,t_2,c_3 | |
735 srlx t_1,32,c_12 != | |
736 stuw t_1,rp(3) !r[3]=c1; | |
737 or c_12,c_3,c_12 | |
738 | |
739 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); | |
740 addcc c_12,t_1,c_12 != | |
741 clr c_3 | |
742 bcs,a %xcc,.+8 | |
743 add c_3,t_2,c_3 | |
744 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1); | |
745 addcc c_12,t_1,c_12 | |
746 bcs,a %xcc,.+8 | |
747 add c_3,t_2,c_3 | |
748 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); | |
749 addcc c_12,t_1,c_12 | |
750 bcs,a %xcc,.+8 | |
751 add c_3,t_2,c_3 | |
752 lduw bp(4),b_4 != | |
753 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
754 addcc c_12,t_1,c_12 | |
755 bcs,a %xcc,.+8 | |
756 add c_3,t_2,c_3 != | |
757 lduw bp(5),b_5 | |
758 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1); | |
759 addcc c_12,t_1,t_1 | |
760 bcs,a %xcc,.+8 != | |
761 add c_3,t_2,c_3 | |
762 srlx t_1,32,c_12 | |
763 stuw t_1,rp(4) !r[4]=c2; | |
764 or c_12,c_3,c_12 != | |
765 | |
766 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); | |
767 addcc c_12,t_1,c_12 | |
768 clr c_3 | |
769 bcs,a %xcc,.+8 != | |
770 add c_3,t_2,c_3 | |
771 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); | |
772 addcc c_12,t_1,c_12 | |
773 bcs,a %xcc,.+8 != | |
774 add c_3,t_2,c_3 | |
775 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
776 addcc c_12,t_1,c_12 | |
777 bcs,a %xcc,.+8 != | |
778 add c_3,t_2,c_3 | |
779 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
780 addcc c_12,t_1,c_12 | |
781 bcs,a %xcc,.+8 != | |
782 add c_3,t_2,c_3 | |
783 lduw ap(5),a_5 | |
784 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); | |
785 addcc c_12,t_1,c_12 != | |
786 bcs,a %xcc,.+8 | |
787 add c_3,t_2,c_3 | |
788 lduw ap(6),a_6 | |
789 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2); | |
790 addcc c_12,t_1,t_1 | |
791 bcs,a %xcc,.+8 | |
792 add c_3,t_2,c_3 | |
793 srlx t_1,32,c_12 != | |
794 stuw t_1,rp(5) !r[5]=c3; | |
795 or c_12,c_3,c_12 | |
796 | |
797 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); | |
798 addcc c_12,t_1,c_12 != | |
799 clr c_3 | |
800 bcs,a %xcc,.+8 | |
801 add c_3,t_2,c_3 | |
802 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); | |
803 addcc c_12,t_1,c_12 | |
804 bcs,a %xcc,.+8 | |
805 add c_3,t_2,c_3 | |
806 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3); | |
807 addcc c_12,t_1,c_12 | |
808 bcs,a %xcc,.+8 | |
809 add c_3,t_2,c_3 | |
810 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3); | |
811 addcc c_12,t_1,c_12 | |
812 bcs,a %xcc,.+8 | |
813 add c_3,t_2,c_3 | |
814 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3); | |
815 addcc c_12,t_1,c_12 | |
816 bcs,a %xcc,.+8 | |
817 add c_3,t_2,c_3 | |
818 lduw bp(6),b_6 != | |
819 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); | |
820 addcc c_12,t_1,c_12 | |
821 bcs,a %xcc,.+8 | |
822 add c_3,t_2,c_3 != | |
823 lduw bp(7),b_7 | |
824 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); | |
825 addcc c_12,t_1,t_1 | |
826 bcs,a %xcc,.+8 != | |
827 add c_3,t_2,c_3 | |
828 srlx t_1,32,c_12 | |
829 stuw t_1,rp(6) !r[6]=c1; | |
830 or c_12,c_3,c_12 != | |
831 | |
832 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); | |
833 addcc c_12,t_1,c_12 | |
834 clr c_3 | |
835 bcs,a %xcc,.+8 != | |
836 add c_3,t_2,c_3 | |
837 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); | |
838 addcc c_12,t_1,c_12 | |
839 bcs,a %xcc,.+8 != | |
840 add c_3,t_2,c_3 | |
841 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); | |
842 addcc c_12,t_1,c_12 | |
843 bcs,a %xcc,.+8 != | |
844 add c_3,t_2,c_3 | |
845 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1); | |
846 addcc c_12,t_1,c_12 | |
847 bcs,a %xcc,.+8 != | |
848 add c_3,t_2,c_3 | |
849 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); | |
850 addcc c_12,t_1,c_12 | |
851 bcs,a %xcc,.+8 != | |
852 add c_3,t_2,c_3 | |
853 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); | |
854 addcc c_12,t_1,c_12 | |
855 bcs,a %xcc,.+8 != | |
856 add c_3,t_2,c_3 | |
857 lduw ap(7),a_7 | |
858 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); | |
859 addcc c_12,t_1,c_12 | |
860 bcs,a %xcc,.+8 | |
861 add c_3,t_2,c_3 | |
862 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1); | |
863 addcc c_12,t_1,t_1 | |
864 bcs,a %xcc,.+8 | |
865 add c_3,t_2,c_3 | |
866 srlx t_1,32,c_12 != | |
867 stuw t_1,rp(7) !r[7]=c2; | |
868 or c_12,c_3,c_12 | |
869 | |
870 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2); | |
871 addcc c_12,t_1,c_12 | |
872 clr c_3 | |
873 bcs,a %xcc,.+8 | |
874 add c_3,t_2,c_3 != | |
875 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2); | |
876 addcc c_12,t_1,c_12 | |
877 bcs,a %xcc,.+8 | |
878 add c_3,t_2,c_3 != | |
879 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); | |
880 addcc c_12,t_1,c_12 | |
881 bcs,a %xcc,.+8 | |
882 add c_3,t_2,c_3 != | |
883 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); | |
884 addcc c_12,t_1,c_12 | |
885 bcs,a %xcc,.+8 | |
886 add c_3,t_2,c_3 != | |
887 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); | |
888 addcc c_12,t_1,c_12 | |
889 bcs,a %xcc,.+8 | |
890 add c_3,t_2,c_3 != | |
891 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2); | |
892 addcc c_12,t_1,c_12 | |
893 bcs,a %xcc,.+8 | |
894 add c_3,t_2,c_3 != | |
895 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); | |
896 addcc c_12,t_1,t_1 | |
897 bcs,a %xcc,.+8 | |
898 add c_3,t_2,c_3 != | |
899 srlx t_1,32,c_12 | |
900 stuw t_1,rp(8) !r[8]=c3; | |
901 or c_12,c_3,c_12 | |
902 | |
903 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3); | |
904 addcc c_12,t_1,c_12 | |
905 clr c_3 | |
906 bcs,a %xcc,.+8 | |
907 add c_3,t_2,c_3 != | |
908 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3); | |
909 addcc c_12,t_1,c_12 | |
910 bcs,a %xcc,.+8 != | |
911 add c_3,t_2,c_3 | |
912 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); | |
913 addcc c_12,t_1,c_12 | |
914 bcs,a %xcc,.+8 != | |
915 add c_3,t_2,c_3 | |
916 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); | |
917 addcc c_12,t_1,c_12 | |
918 bcs,a %xcc,.+8 != | |
919 add c_3,t_2,c_3 | |
920 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); | |
921 addcc c_12,t_1,c_12 | |
922 bcs,a %xcc,.+8 != | |
923 add c_3,t_2,c_3 | |
924 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3); | |
925 addcc c_12,t_1,t_1 | |
926 bcs,a %xcc,.+8 != | |
927 add c_3,t_2,c_3 | |
928 srlx t_1,32,c_12 | |
929 stuw t_1,rp(9) !r[9]=c1; | |
930 or c_12,c_3,c_12 != | |
931 | |
932 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); | |
933 addcc c_12,t_1,c_12 | |
934 clr c_3 | |
935 bcs,a %xcc,.+8 != | |
936 add c_3,t_2,c_3 | |
937 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); | |
938 addcc c_12,t_1,c_12 | |
939 bcs,a %xcc,.+8 != | |
940 add c_3,t_2,c_3 | |
941 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1); | |
942 addcc c_12,t_1,c_12 | |
943 bcs,a %xcc,.+8 != | |
944 add c_3,t_2,c_3 | |
945 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); | |
946 addcc c_12,t_1,c_12 | |
947 bcs,a %xcc,.+8 != | |
948 add c_3,t_2,c_3 | |
949 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); | |
950 addcc c_12,t_1,t_1 | |
951 bcs,a %xcc,.+8 != | |
952 add c_3,t_2,c_3 | |
953 srlx t_1,32,c_12 | |
954 stuw t_1,rp(10) !r[10]=c2; | |
955 or c_12,c_3,c_12 != | |
956 | |
957 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2); | |
958 addcc c_12,t_1,c_12 | |
959 clr c_3 | |
960 bcs,a %xcc,.+8 != | |
961 add c_3,t_2,c_3 | |
962 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); | |
963 addcc c_12,t_1,c_12 | |
964 bcs,a %xcc,.+8 != | |
965 add c_3,t_2,c_3 | |
966 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); | |
967 addcc c_12,t_1,c_12 | |
968 bcs,a %xcc,.+8 != | |
969 add c_3,t_2,c_3 | |
970 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); | |
971 addcc c_12,t_1,t_1 | |
972 bcs,a %xcc,.+8 != | |
973 add c_3,t_2,c_3 | |
974 srlx t_1,32,c_12 | |
975 stuw t_1,rp(11) !r[11]=c3; | |
976 or c_12,c_3,c_12 != | |
977 | |
978 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); | |
979 addcc c_12,t_1,c_12 | |
980 clr c_3 | |
981 bcs,a %xcc,.+8 != | |
982 add c_3,t_2,c_3 | |
983 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); | |
984 addcc c_12,t_1,c_12 | |
985 bcs,a %xcc,.+8 != | |
986 add c_3,t_2,c_3 | |
987 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); | |
988 addcc c_12,t_1,t_1 | |
989 bcs,a %xcc,.+8 != | |
990 add c_3,t_2,c_3 | |
991 srlx t_1,32,c_12 | |
992 stuw t_1,rp(12) !r[12]=c1; | |
993 or c_12,c_3,c_12 != | |
994 | |
995 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); | |
996 addcc c_12,t_1,c_12 | |
997 clr c_3 | |
998 bcs,a %xcc,.+8 != | |
999 add c_3,t_2,c_3 | |
1000 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); | |
1001 addcc c_12,t_1,t_1 | |
1002 bcs,a %xcc,.+8 != | |
1003 add c_3,t_2,c_3 | |
1004 srlx t_1,32,c_12 | |
	stuw	t_1,rp(13)	!r[13]=c2;
1006 or c_12,c_3,c_12 != | |
1007 | |
1008 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2); | |
1009 addcc c_12,t_1,t_1 | |
1010 srlx t_1,32,c_12 != | |
1011 stuw t_1,rp(14) !r[14]=c3; | |
1012 stuw c_12,rp(15) !r[15]=c1; | |
1013 | |
1014 ret | |
1015 restore %g0,%g0,%o0 != | |
1016 | |
1017 .type bn_mul_comba8,#function | |
1018 .size bn_mul_comba8,(.-bn_mul_comba8) | |
1019 | |
1020 .align 32 | |
1021 | |
1022 .global bn_mul_comba4 | |
1023 /* | |
1024 * void bn_mul_comba4(r,a,b) | |
1025 * BN_ULONG *r,*a,*b; | |
1026 */ | |
1027 bn_mul_comba4: | |
1028 save %sp,FRAME_SIZE,%sp | |
1029 lduw ap(0),a_0 | |
1030 mov 1,t_2 | |
1031 lduw bp(0),b_0 | |
1032 sllx t_2,32,t_2 != | |
1033 lduw bp(1),b_1 | |
1034 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | |
1035 srlx t_1,32,c_12 | |
1036 stuw t_1,rp(0) !=!r[0]=c1; | |
1037 | |
1038 lduw ap(1),a_1 | |
1039 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | |
1040 addcc c_12,t_1,c_12 | |
1041 clr c_3 != | |
1042 bcs,a %xcc,.+8 | |
1043 add c_3,t_2,c_3 | |
1044 lduw ap(2),a_2 | |
1045 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
1046 addcc c_12,t_1,t_1 | |
1047 bcs,a %xcc,.+8 | |
1048 add c_3,t_2,c_3 | |
1049 srlx t_1,32,c_12 != | |
1050 stuw t_1,rp(1) !r[1]=c2; | |
1051 or c_12,c_3,c_12 | |
1052 | |
1053 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
1054 addcc c_12,t_1,c_12 != | |
1055 clr c_3 | |
1056 bcs,a %xcc,.+8 | |
1057 add c_3,t_2,c_3 | |
1058 lduw bp(2),b_2 != | |
1059 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
1060 addcc c_12,t_1,c_12 | |
1061 bcs,a %xcc,.+8 | |
1062 add c_3,t_2,c_3 != | |
1063 lduw bp(3),b_3 | |
1064 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
1065 addcc c_12,t_1,t_1 | |
1066 bcs,a %xcc,.+8 != | |
1067 add c_3,t_2,c_3 | |
1068 srlx t_1,32,c_12 | |
1069 stuw t_1,rp(2) !r[2]=c3; | |
1070 or c_12,c_3,c_12 != | |
1071 | |
1072 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
1073 addcc c_12,t_1,c_12 | |
1074 clr c_3 | |
1075 bcs,a %xcc,.+8 != | |
1076 add c_3,t_2,c_3 | |
1077 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); | |
1078 addcc c_12,t_1,c_12 | |
1079 bcs,a %xcc,.+8 != | |
1080 add c_3,t_2,c_3 | |
1081 lduw ap(3),a_3 | |
1082 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
1083 addcc c_12,t_1,c_12 != | |
1084 bcs,a %xcc,.+8 | |
1085 add c_3,t_2,c_3 | |
1086 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= | |
1087 addcc c_12,t_1,t_1 != | |
1088 bcs,a %xcc,.+8 | |
1089 add c_3,t_2,c_3 | |
1090 srlx t_1,32,c_12 | |
1091 stuw t_1,rp(3) !=!r[3]=c1; | |
1092 or c_12,c_3,c_12 | |
1093 | |
1094 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | |
1095 addcc c_12,t_1,c_12 | |
1096 clr c_3 != | |
1097 bcs,a %xcc,.+8 | |
1098 add c_3,t_2,c_3 | |
1099 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); | |
1100 addcc c_12,t_1,c_12 != | |
1101 bcs,a %xcc,.+8 | |
1102 add c_3,t_2,c_3 | |
1103 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
1104 addcc c_12,t_1,t_1 != | |
1105 bcs,a %xcc,.+8 | |
1106 add c_3,t_2,c_3 | |
1107 srlx t_1,32,c_12 | |
1108 stuw t_1,rp(4) !=!r[4]=c2; | |
1109 or c_12,c_3,c_12 | |
1110 | |
1111 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
1112 addcc c_12,t_1,c_12 | |
1113 clr c_3 != | |
1114 bcs,a %xcc,.+8 | |
1115 add c_3,t_2,c_3 | |
1116 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
1117 addcc c_12,t_1,t_1 != | |
1118 bcs,a %xcc,.+8 | |
1119 add c_3,t_2,c_3 | |
1120 srlx t_1,32,c_12 | |
1121 stuw t_1,rp(5) !=!r[5]=c3; | |
1122 or c_12,c_3,c_12 | |
1123 | |
1124 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | |
1125 addcc c_12,t_1,t_1 | |
1126 srlx t_1,32,c_12 != | |
1127 stuw t_1,rp(6) !r[6]=c1; | |
1128 stuw c_12,rp(7) !r[7]=c2; | |
1129 | |
1130 ret | |
1131 restore %g0,%g0,%o0 | |
1132 | |
1133 .type bn_mul_comba4,#function | |
1134 .size bn_mul_comba4,(.-bn_mul_comba4) | |
1135 | |
1136 .align 32 | |
1137 | |
1138 .global bn_sqr_comba8 | |
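/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Note on the annotations: sqr_add_c2(a,i,j,...) denotes a cross
 * product, which enters the accumulator twice (as a[i]*a[j] and
 * a[j]*a[i]); hence each such product below goes through two addcc
 * steps with two separate carry checks, while sqr_add_c(a,i,...)
 * is a square term added only once.
 */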
1139 bn_sqr_comba8: | |
1140 save %sp,FRAME_SIZE,%sp | |
1141 mov 1,t_2 | |
1142 lduw ap(0),a_0 | |
1143 sllx t_2,32,t_2 | |
1144 lduw ap(1),a_1 | |
1145 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | |
1146 srlx t_1,32,c_12 | |
1147 stuw t_1,rp(0) !r[0]=c1; | |
1148 | |
1149 lduw ap(2),a_2 | |
1150 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); | |
1151 addcc c_12,t_1,c_12 | |
1152 clr c_3 | |
1153 bcs,a %xcc,.+8 | |
1154 add c_3,t_2,c_3 | |
1155 addcc c_12,t_1,t_1 | |
1156 bcs,a %xcc,.+8 | |
1157 add c_3,t_2,c_3 | |
1158 srlx t_1,32,c_12 | |
1159 stuw t_1,rp(1) !r[1]=c2; | |
1160 or c_12,c_3,c_12 | |
1161 | |
1162 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
1163 addcc c_12,t_1,c_12 | |
1164 clr c_3 | |
1165 bcs,a %xcc,.+8 | |
1166 add c_3,t_2,c_3 | |
1167 addcc c_12,t_1,c_12 | |
1168 bcs,a %xcc,.+8 | |
1169 add c_3,t_2,c_3 | |
1170 lduw ap(3),a_3 | |
1171 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
1172 addcc c_12,t_1,t_1 | |
1173 bcs,a %xcc,.+8 | |
1174 add c_3,t_2,c_3 | |
1175 srlx t_1,32,c_12 | |
1176 stuw t_1,rp(2) !r[2]=c3; | |
1177 or c_12,c_3,c_12 | |
1178 | |
1179 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | |
1180 addcc c_12,t_1,c_12 | |
1181 clr c_3 | |
1182 bcs,a %xcc,.+8 | |
1183 add c_3,t_2,c_3 | |
1184 addcc c_12,t_1,c_12 | |
1185 bcs,a %xcc,.+8 | |
1186 add c_3,t_2,c_3 | |
1187 lduw ap(4),a_4 | |
1188 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
1189 addcc c_12,t_1,c_12 | |
1190 bcs,a %xcc,.+8 | |
1191 add c_3,t_2,c_3 | |
1192 addcc c_12,t_1,t_1 | |
1193 bcs,a %xcc,.+8 | |
1194 add c_3,t_2,c_3 | |
1195 srlx t_1,32,c_12 | |
	stuw	t_1,rp(3)	!r[3]=c1;
1197 or c_12,c_3,c_12 | |
1198 | |
1199 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); | |
1200 addcc c_12,t_1,c_12 | |
1201 clr c_3 | |
1202 bcs,a %xcc,.+8 | |
1203 add c_3,t_2,c_3 | |
1204 addcc c_12,t_1,c_12 | |
1205 bcs,a %xcc,.+8 | |
1206 add c_3,t_2,c_3 | |
1207 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
1208 addcc c_12,t_1,c_12 | |
1209 bcs,a %xcc,.+8 | |
1210 add c_3,t_2,c_3 | |
1211 addcc c_12,t_1,c_12 | |
1212 bcs,a %xcc,.+8 | |
1213 add c_3,t_2,c_3 | |
1214 lduw ap(5),a_5 | |
1215 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
1216 addcc c_12,t_1,t_1 | |
1217 bcs,a %xcc,.+8 | |
1218 add c_3,t_2,c_3 | |
1219 srlx t_1,32,c_12 | |
1220 stuw t_1,rp(4) !r[4]=c2; | |
1221 or c_12,c_3,c_12 | |
1222 | |
1223 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); | |
1224 addcc c_12,t_1,c_12 | |
1225 clr c_3 | |
1226 bcs,a %xcc,.+8 | |
1227 add c_3,t_2,c_3 | |
1228 addcc c_12,t_1,c_12 | |
1229 bcs,a %xcc,.+8 | |
1230 add c_3,t_2,c_3 | |
1231 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); | |
1232 addcc c_12,t_1,c_12 | |
1233 bcs,a %xcc,.+8 | |
1234 add c_3,t_2,c_3 | |
1235 addcc c_12,t_1,c_12 | |
1236 bcs,a %xcc,.+8 | |
1237 add c_3,t_2,c_3 | |
1238 lduw ap(6),a_6 | |
1239 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
1240 addcc c_12,t_1,c_12 | |
1241 bcs,a %xcc,.+8 | |
1242 add c_3,t_2,c_3 | |
1243 addcc c_12,t_1,t_1 | |
1244 bcs,a %xcc,.+8 | |
1245 add c_3,t_2,c_3 | |
1246 srlx t_1,32,c_12 | |
1247 stuw t_1,rp(5) !r[5]=c3; | |
1248 or c_12,c_3,c_12 | |
1249 | |
1250 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); | |
1251 addcc c_12,t_1,c_12 | |
1252 clr c_3 | |
1253 bcs,a %xcc,.+8 | |
1254 add c_3,t_2,c_3 | |
1255 addcc c_12,t_1,c_12 | |
1256 bcs,a %xcc,.+8 | |
1257 add c_3,t_2,c_3 | |
1258 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); | |
1259 addcc c_12,t_1,c_12 | |
1260 bcs,a %xcc,.+8 | |
1261 add c_3,t_2,c_3 | |
1262 addcc c_12,t_1,c_12 | |
1263 bcs,a %xcc,.+8 | |
1264 add c_3,t_2,c_3 | |
1265 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); | |
1266 addcc c_12,t_1,c_12 | |
1267 bcs,a %xcc,.+8 | |
1268 add c_3,t_2,c_3 | |
1269 addcc c_12,t_1,c_12 | |
1270 bcs,a %xcc,.+8 | |
1271 add c_3,t_2,c_3 | |
1272 lduw ap(7),a_7 | |
1273 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); | |
1274 addcc c_12,t_1,t_1 | |
1275 bcs,a %xcc,.+8 | |
1276 add c_3,t_2,c_3 | |
1277 srlx t_1,32,c_12 | |
1278 stuw t_1,rp(6) !r[6]=c1; | |
1279 or c_12,c_3,c_12 | |
1280 | |
1281 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); | |
1282 addcc c_12,t_1,c_12 | |
1283 clr c_3 | |
1284 bcs,a %xcc,.+8 | |
1285 add c_3,t_2,c_3 | |
1286 addcc c_12,t_1,c_12 | |
1287 bcs,a %xcc,.+8 | |
1288 add c_3,t_2,c_3 | |
1289 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); | |
1290 addcc c_12,t_1,c_12 | |
1291 bcs,a %xcc,.+8 | |
1292 add c_3,t_2,c_3 | |
1293 addcc c_12,t_1,c_12 | |
1294 bcs,a %xcc,.+8 | |
1295 add c_3,t_2,c_3 | |
1296 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); | |
1297 addcc c_12,t_1,c_12 | |
1298 bcs,a %xcc,.+8 | |
1299 add c_3,t_2,c_3 | |
1300 addcc c_12,t_1,c_12 | |
1301 bcs,a %xcc,.+8 | |
1302 add c_3,t_2,c_3 | |
1303 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); | |
1304 addcc c_12,t_1,c_12 | |
1305 bcs,a %xcc,.+8 | |
1306 add c_3,t_2,c_3 | |
1307 addcc c_12,t_1,t_1 | |
1308 bcs,a %xcc,.+8 | |
1309 add c_3,t_2,c_3 | |
1310 srlx t_1,32,c_12 | |
1311 stuw t_1,rp(7) !r[7]=c2; | |
1312 or c_12,c_3,c_12 | |
1313 | |
1314 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); | |
1315 addcc c_12,t_1,c_12 | |
1316 clr c_3 | |
1317 bcs,a %xcc,.+8 | |
1318 add c_3,t_2,c_3 | |
1319 addcc c_12,t_1,c_12 | |
1320 bcs,a %xcc,.+8 | |
1321 add c_3,t_2,c_3 | |
1322 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); | |
1323 addcc c_12,t_1,c_12 | |
1324 bcs,a %xcc,.+8 | |
1325 add c_3,t_2,c_3 | |
1326 addcc c_12,t_1,c_12 | |
1327 bcs,a %xcc,.+8 | |
1328 add c_3,t_2,c_3 | |
1329 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); | |
1330 addcc c_12,t_1,c_12 | |
1331 bcs,a %xcc,.+8 | |
1332 add c_3,t_2,c_3 | |
1333 addcc c_12,t_1,c_12 | |
1334 bcs,a %xcc,.+8 | |
1335 add c_3,t_2,c_3 | |
1336 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); | |
1337 addcc c_12,t_1,t_1 | |
1338 bcs,a %xcc,.+8 | |
1339 add c_3,t_2,c_3 | |
1340 srlx t_1,32,c_12 | |
1341 stuw t_1,rp(8) !r[8]=c3; | |
1342 or c_12,c_3,c_12 | |
1343 | |
1344 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); | |
1345 addcc c_12,t_1,c_12 | |
1346 clr c_3 | |
1347 bcs,a %xcc,.+8 | |
1348 add c_3,t_2,c_3 | |
1349 addcc c_12,t_1,c_12 | |
1350 bcs,a %xcc,.+8 | |
1351 add c_3,t_2,c_3 | |
1352 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); | |
1353 addcc c_12,t_1,c_12 | |
1354 bcs,a %xcc,.+8 | |
1355 add c_3,t_2,c_3 | |
1356 addcc c_12,t_1,c_12 | |
1357 bcs,a %xcc,.+8 | |
1358 add c_3,t_2,c_3 | |
1359 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); | |
1360 addcc c_12,t_1,c_12 | |
1361 bcs,a %xcc,.+8 | |
1362 add c_3,t_2,c_3 | |
1363 addcc c_12,t_1,t_1 | |
1364 bcs,a %xcc,.+8 | |
1365 add c_3,t_2,c_3 | |
1366 srlx t_1,32,c_12 | |
1367 stuw t_1,rp(9) !r[9]=c1; | |
1368 or c_12,c_3,c_12 | |
1369 | |
1370 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1); | |
1371 addcc c_12,t_1,c_12 | |
1372 clr c_3 | |
1373 bcs,a %xcc,.+8 | |
1374 add c_3,t_2,c_3 | |
1375 addcc c_12,t_1,c_12 | |
1376 bcs,a %xcc,.+8 | |
1377 add c_3,t_2,c_3 | |
1378 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); | |
1379 addcc c_12,t_1,c_12 | |
1380 bcs,a %xcc,.+8 | |
1381 add c_3,t_2,c_3 | |
1382 addcc c_12,t_1,c_12 | |
1383 bcs,a %xcc,.+8 | |
1384 add c_3,t_2,c_3 | |
1385 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); | |
1386 addcc c_12,t_1,t_1 | |
1387 bcs,a %xcc,.+8 | |
1388 add c_3,t_2,c_3 | |
1389 srlx t_1,32,c_12 | |
1390 stuw t_1,rp(10) !r[10]=c2; | |
1391 or c_12,c_3,c_12 | |
1392 | |
1393 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2); | |
1394 addcc c_12,t_1,c_12 | |
1395 clr c_3 | |
1396 bcs,a %xcc,.+8 | |
1397 add c_3,t_2,c_3 | |
1398 addcc c_12,t_1,c_12 | |
1399 bcs,a %xcc,.+8 | |
1400 add c_3,t_2,c_3 | |
1401 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2); | |
1402 addcc c_12,t_1,c_12 | |
1403 bcs,a %xcc,.+8 | |
1404 add c_3,t_2,c_3 | |
1405 addcc c_12,t_1,t_1 | |
1406 bcs,a %xcc,.+8 | |
1407 add c_3,t_2,c_3 | |
1408 srlx t_1,32,c_12 | |
1409 stuw t_1,rp(11) !r[11]=c3; | |
1410 or c_12,c_3,c_12 | |
1411 | |
1412 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); | |
1413 addcc c_12,t_1,c_12 | |
1414 clr c_3 | |
1415 bcs,a %xcc,.+8 | |
1416 add c_3,t_2,c_3 | |
1417 addcc c_12,t_1,c_12 | |
1418 bcs,a %xcc,.+8 | |
1419 add c_3,t_2,c_3 | |
1420 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); | |
1421 addcc c_12,t_1,t_1 | |
1422 bcs,a %xcc,.+8 | |
1423 add c_3,t_2,c_3 | |
1424 srlx t_1,32,c_12 | |
1425 stuw t_1,rp(12) !r[12]=c1; | |
1426 or c_12,c_3,c_12 | |
1427 | |
1428 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); | |
1429 addcc c_12,t_1,c_12 | |
1430 clr c_3 | |
1431 bcs,a %xcc,.+8 | |
1432 add c_3,t_2,c_3 | |
1433 addcc c_12,t_1,t_1 | |
1434 bcs,a %xcc,.+8 | |
1435 add c_3,t_2,c_3 | |
1436 srlx t_1,32,c_12 | |
1437 stuw t_1,rp(13) !r[13]=c2; | |
1438 or c_12,c_3,c_12 | |
1439 | |
1440 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); | |
1441 addcc c_12,t_1,t_1 | |
1442 srlx t_1,32,c_12 | |
1443 stuw t_1,rp(14) !r[14]=c3; | |
1444 stuw c_12,rp(15) !r[15]=c1; | |
1445 | |
1446 ret | |
1447 restore %g0,%g0,%o0 | |
1448 | |
1449 .type bn_sqr_comba8,#function | |
1450 .size bn_sqr_comba8,(.-bn_sqr_comba8) | |
1451 | |
1452 .align 32 | |
1453 | |
1454 .global bn_sqr_comba4 | |
1455 /* | |
1456 * void bn_sqr_comba4(r,a) | |
1457 * BN_ULONG *r,*a; | |
1458 */ | |
1459 bn_sqr_comba4: | |
1460 save %sp,FRAME_SIZE,%sp | |
1461 mov 1,t_2 | |
1462 lduw ap(0),a_0 | |
1463 sllx t_2,32,t_2 | |
1464 lduw ap(1),a_1 | |
1465 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | |
1466 srlx t_1,32,c_12 | |
1467 stuw t_1,rp(0) !r[0]=c1; | |
1468 | |
1469 lduw ap(2),a_2 | |
1470 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1); | |
1471 addcc c_12,t_1,c_12 | |
1472 clr c_3 | |
1473 bcs,a %xcc,.+8 | |
1474 add c_3,t_2,c_3 | |
1475 addcc c_12,t_1,t_1 | |
1476 bcs,a %xcc,.+8 | |
1477 add c_3,t_2,c_3 | |
1478 srlx t_1,32,c_12 | |
1479 stuw t_1,rp(1) !r[1]=c2; | |
1480 or c_12,c_3,c_12 | |
1481 | |
1482 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
1483 addcc c_12,t_1,c_12 | |
1484 clr c_3 | |
1485 bcs,a %xcc,.+8 | |
1486 add c_3,t_2,c_3 | |
1487 addcc c_12,t_1,c_12 | |
1488 bcs,a %xcc,.+8 | |
1489 add c_3,t_2,c_3 | |
1490 lduw ap(3),a_3 | |
1491 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
1492 addcc c_12,t_1,t_1 | |
1493 bcs,a %xcc,.+8 | |
1494 add c_3,t_2,c_3 | |
1495 srlx t_1,32,c_12 | |
1496 stuw t_1,rp(2) !r[2]=c3; | |
1497 or c_12,c_3,c_12 | |
1498 | |
1499 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | |
1500 addcc c_12,t_1,c_12 | |
1501 clr c_3 | |
1502 bcs,a %xcc,.+8 | |
1503 add c_3,t_2,c_3 | |
1504 addcc c_12,t_1,c_12 | |
1505 bcs,a %xcc,.+8 | |
1506 add c_3,t_2,c_3 | |
1507 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
1508 addcc c_12,t_1,c_12 | |
1509 bcs,a %xcc,.+8 | |
1510 add c_3,t_2,c_3 | |
1511 addcc c_12,t_1,t_1 | |
1512 bcs,a %xcc,.+8 | |
1513 add c_3,t_2,c_3 | |
1514 srlx t_1,32,c_12 | |
1515 stuw t_1,rp(3) !r[3]=c1; | |
1516 or c_12,c_3,c_12 | |
1517 | |
1518 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
1519 addcc c_12,t_1,c_12 | |
1520 clr c_3 | |
1521 bcs,a %xcc,.+8 | |
1522 add c_3,t_2,c_3 | |
1523 addcc c_12,t_1,c_12 | |
1524 bcs,a %xcc,.+8 | |
1525 add c_3,t_2,c_3 | |
1526 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
1527 addcc c_12,t_1,t_1 | |
1528 bcs,a %xcc,.+8 | |
1529 add c_3,t_2,c_3 | |
1530 srlx t_1,32,c_12 | |
1531 stuw t_1,rp(4) !r[4]=c2; | |
1532 or c_12,c_3,c_12 | |
1533 | |
1534 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
1535 addcc c_12,t_1,c_12 | |
1536 clr c_3 | |
1537 bcs,a %xcc,.+8 | |
1538 add c_3,t_2,c_3 | |
1539 addcc c_12,t_1,t_1 | |
1540 bcs,a %xcc,.+8 | |
1541 add c_3,t_2,c_3 | |
1542 srlx t_1,32,c_12 | |
1543 stuw t_1,rp(5) !r[5]=c3; | |
1544 or c_12,c_3,c_12 | |
1545 | |
1546 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); | |
1547 addcc c_12,t_1,t_1 | |
1548 srlx t_1,32,c_12 | |
1549 stuw t_1,rp(6) !r[6]=c1; | |
1550 stuw c_12,rp(7) !r[7]=c2; | |
1551 | |
1552 ret | |
1553 restore %g0,%g0,%o0 | |
1554 | |
1555 .type bn_sqr_comba4,#function | |
1556 .size bn_sqr_comba4,(.-bn_sqr_comba4) | |
1557 | |
1558 .align 32 | |