OLD | NEW |
| (Empty) |
1 .ident "sparcv8.s, Version 1.4" | |
2 .ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | |
3 | |
4 /* | |
5 * ==================================================================== | |
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
7 * project. | |
8 * | |
9 * Rights for redistribution and usage in source and binary forms are | |
10 * granted according to the OpenSSL license. Warranty of any kind is | |
11 * disclaimed. | |
12 * ==================================================================== | |
13 */ | |
14 | |
15 /* | |
16  * This is my modest contribution to the OpenSSL project (see | |
17 * http://www.openssl.org/ for more information about it) and is | |
18 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c | |
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/. | |
20 * | |
21 * See bn_asm.sparc.v8plus.S for more details. | |
22 */ | |
23 | |
24 /* | |
25 * Revision history. | |
26 * | |
27 * 1.1 - new loop unrolling model(*); | |
28 * 1.2 - made gas friendly; | |
29 * 1.3 - fixed problem with /usr/ccs/lib/cpp; | |
30 * 1.4 - some retunes; | |
31 * | |
32 * (*) see bn_asm.sparc.v8plus.S for details | |
33 */ | |
34 | |
35 .section ".text",#alloc,#execinstr | |
36 .file "bn_asm.sparc.v8.S" | |
37 | |
38 .align 32 | |
39 | |
40 .global bn_mul_add_words | |
41 /* | |
42  * BN_ULONG bn_mul_add_words(rp,ap,num,w) | |
43  * BN_ULONG *rp,*ap; | |
44  * int num; | |
45  * BN_ULONG w; | |
46  */ | |
/* rp[i] += ap[i]*w for i in [0,num); returns the final carry word. | |
   %o0=rp %o1=ap %o2=num %o3=w; %o5 holds the running carry and the | |
   umul high word is fetched from %y via rd. Loads sit in annulled | |
   branch delay slots (b*,a) so they execute only on the taken path. */ | |
47 bn_mul_add_words: | |
48 cmp %o2,0 | |
49 bg,a .L_bn_mul_add_words_proceed | |
50 ld [%o1],%g2 | |
51 retl | |
52 clr %o0 | |
53 | |
54 .L_bn_mul_add_words_proceed: | |
55 andcc %o2,-4,%g0 | |
56 bz .L_bn_mul_add_words_tail | |
57 clr %o5 | |
58 | |
/* Main loop: 4 words per iteration, software-pipelined (next ap word | |
   is loaded while the current product is being accumulated). */ | |
59 .L_bn_mul_add_words_loop: | |
60 ld [%o0],%o4 | |
61 ld [%o1+4],%g3 | |
62 umul %o3,%g2,%g2 | |
63 rd %y,%g1 | |
64 addcc %o4,%o5,%o4 | |
65 addx %g1,0,%g1 | |
66 addcc %o4,%g2,%o4 | |
67 st %o4,[%o0] | |
68 addx %g1,0,%o5 | |
69 | |
70 ld [%o0+4],%o4 | |
71 ld [%o1+8],%g2 | |
72 umul %o3,%g3,%g3 | |
73 dec 4,%o2 | |
74 rd %y,%g1 | |
75 addcc %o4,%o5,%o4 | |
76 addx %g1,0,%g1 | |
77 addcc %o4,%g3,%o4 | |
78 st %o4,[%o0+4] | |
79 addx %g1,0,%o5 | |
80 | |
81 ld [%o0+8],%o4 | |
82 ld [%o1+12],%g3 | |
83 umul %o3,%g2,%g2 | |
84 inc 16,%o1 | |
85 rd %y,%g1 | |
86 addcc %o4,%o5,%o4 | |
87 addx %g1,0,%g1 | |
88 addcc %o4,%g2,%o4 | |
89 st %o4,[%o0+8] | |
90 addx %g1,0,%o5 | |
91 | |
92 ld [%o0+12],%o4 | |
93 umul %o3,%g3,%g3 | |
94 inc 16,%o0 | |
95 rd %y,%g1 | |
96 addcc %o4,%o5,%o4 | |
97 addx %g1,0,%g1 | |
98 addcc %o4,%g3,%o4 | |
99 st %o4,[%o0-4] | |
100 addx %g1,0,%o5 | |
101 andcc %o2,-4,%g0 | |
102 bnz,a .L_bn_mul_add_words_loop | |
103 ld [%o1],%g2 | |
104 | |
105 tst %o2 | |
106 bnz,a .L_bn_mul_add_words_tail | |
107 ld [%o1],%g2 | |
108 .L_bn_mul_add_words_return: | |
109 retl | |
110 mov %o5,%o0 | |
111 nop | |
112 | |
/* Tail: handles the remaining 1..3 words one at a time. */ | |
113 .L_bn_mul_add_words_tail: | |
114 ld [%o0],%o4 | |
115 umul %o3,%g2,%g2 | |
116 addcc %o4,%o5,%o4 | |
117 rd %y,%g1 | |
118 addx %g1,0,%g1 | |
119 addcc %o4,%g2,%o4 | |
120 addx %g1,0,%o5 | |
121 deccc %o2 | |
122 bz .L_bn_mul_add_words_return | |
123 st %o4,[%o0] | |
124 | |
125 ld [%o1+4],%g2 | |
126 ld [%o0+4],%o4 | |
127 umul %o3,%g2,%g2 | |
128 rd %y,%g1 | |
129 addcc %o4,%o5,%o4 | |
130 addx %g1,0,%g1 | |
131 addcc %o4,%g2,%o4 | |
132 addx %g1,0,%o5 | |
133 deccc %o2 | |
134 bz .L_bn_mul_add_words_return | |
135 st %o4,[%o0+4] | |
136 | |
137 ld [%o1+8],%g2 | |
138 ld [%o0+8],%o4 | |
139 umul %o3,%g2,%g2 | |
140 rd %y,%g1 | |
141 addcc %o4,%o5,%o4 | |
142 addx %g1,0,%g1 | |
143 addcc %o4,%g2,%o4 | |
144 st %o4,[%o0+8] | |
145 retl | |
146 addx %g1,0,%o0 | |
147 | |
148 .type bn_mul_add_words,#function | |
149 .size bn_mul_add_words,(.-bn_mul_add_words) | |
150 | |
151 .align 32 | |
152 | |
153 .global bn_mul_words | |
154 /* | |
155  * BN_ULONG bn_mul_words(rp,ap,num,w) | |
156  * BN_ULONG *rp,*ap; | |
157  * int num; | |
158  * BN_ULONG w; | |
159  */ | |
/* rp[i] = low word of ap[i]*w + carry; returns the final carry. | |
   Same register roles as bn_mul_add_words, but rp[] is write-only | |
   here, so each step needs only one addcc/addx pair. */ | |
160 bn_mul_words: | |
161 cmp %o2,0 | |
162 bg,a .L_bn_mul_words_proceeed | |
163 ld [%o1],%g2 | |
164 retl | |
165 clr %o0 | |
166 | |
167 .L_bn_mul_words_proceeed: | |
168 andcc %o2,-4,%g0 | |
169 bz .L_bn_mul_words_tail | |
170 clr %o5 | |
171 | |
/* Main loop: 4 words per iteration, ap word for the next step is | |
   preloaded while the current product is accumulated. */ | |
172 .L_bn_mul_words_loop: | |
173 ld [%o1+4],%g3 | |
174 umul %o3,%g2,%g2 | |
175 addcc %g2,%o5,%g2 | |
176 rd %y,%g1 | |
177 addx %g1,0,%o5 | |
178 st %g2,[%o0] | |
179 | |
180 ld [%o1+8],%g2 | |
181 umul %o3,%g3,%g3 | |
182 addcc %g3,%o5,%g3 | |
183 rd %y,%g1 | |
184 dec 4,%o2 | |
185 addx %g1,0,%o5 | |
186 st %g3,[%o0+4] | |
187 | |
188 ld [%o1+12],%g3 | |
189 umul %o3,%g2,%g2 | |
190 addcc %g2,%o5,%g2 | |
191 rd %y,%g1 | |
192 inc 16,%o1 | |
193 st %g2,[%o0+8] | |
194 addx %g1,0,%o5 | |
195 | |
196 umul %o3,%g3,%g3 | |
197 addcc %g3,%o5,%g3 | |
198 rd %y,%g1 | |
199 inc 16,%o0 | |
200 addx %g1,0,%o5 | |
201 st %g3,[%o0-4] | |
202 andcc %o2,-4,%g0 | |
203 nop | |
204 bnz,a .L_bn_mul_words_loop | |
205 ld [%o1],%g2 | |
206 | |
207 tst %o2 | |
208 bnz,a .L_bn_mul_words_tail | |
209 ld [%o1],%g2 | |
210 .L_bn_mul_words_return: | |
211 retl | |
212 mov %o5,%o0 | |
213 nop | |
214 | |
/* Tail: remaining 1..3 words. */ | |
215 .L_bn_mul_words_tail: | |
216 umul %o3,%g2,%g2 | |
217 addcc %g2,%o5,%g2 | |
218 rd %y,%g1 | |
219 addx %g1,0,%o5 | |
220 deccc %o2 | |
221 bz .L_bn_mul_words_return | |
222 st %g2,[%o0] | |
223 nop | |
224 | |
225 ld [%o1+4],%g2 | |
226 umul %o3,%g2,%g2 | |
227 addcc %g2,%o5,%g2 | |
228 rd %y,%g1 | |
229 addx %g1,0,%o5 | |
230 deccc %o2 | |
231 bz .L_bn_mul_words_return | |
232 st %g2,[%o0+4] | |
233 | |
234 ld [%o1+8],%g2 | |
235 umul %o3,%g2,%g2 | |
236 addcc %g2,%o5,%g2 | |
237 rd %y,%g1 | |
238 st %g2,[%o0+8] | |
239 retl | |
240 addx %g1,0,%o0 | |
241 | |
242 .type bn_mul_words,#function | |
243 .size bn_mul_words,(.-bn_mul_words) | |
244 | |
245 .align 32 | |
246 .global bn_sqr_words | |
247 /* | |
248  * void bn_sqr_words(r,a,n) | |
249  * BN_ULONG *r,*a; | |
250  * int n; | |
251  */ | |
/* r[2i] = low word, r[2i+1] = high word of a[i]^2; r[] holds 2n words. | |
   No carry propagation is needed between squares, so there is no | |
   running-carry register here. */ | |
252 bn_sqr_words: | |
253 cmp %o2,0 | |
254 bg,a .L_bn_sqr_words_proceeed | |
255 ld [%o1],%g2 | |
256 retl | |
257 clr %o0 | |
258 | |
259 .L_bn_sqr_words_proceeed: | |
260 andcc %o2,-4,%g0 | |
261 bz .L_bn_sqr_words_tail | |
262 clr %o5 | |
263 | |
/* Main loop: 4 input words -> 8 output words per iteration. */ | |
264 .L_bn_sqr_words_loop: | |
265 ld [%o1+4],%g3 | |
266 umul %g2,%g2,%o4 | |
267 st %o4,[%o0] | |
268 rd %y,%o5 | |
269 st %o5,[%o0+4] | |
270 | |
271 ld [%o1+8],%g2 | |
272 umul %g3,%g3,%o4 | |
273 dec 4,%o2 | |
274 st %o4,[%o0+8] | |
275 rd %y,%o5 | |
276 st %o5,[%o0+12] | |
277 nop | |
278 | |
279 ld [%o1+12],%g3 | |
280 umul %g2,%g2,%o4 | |
281 st %o4,[%o0+16] | |
282 rd %y,%o5 | |
283 inc 16,%o1 | |
284 st %o5,[%o0+20] | |
285 | |
286 umul %g3,%g3,%o4 | |
287 inc 32,%o0 | |
288 st %o4,[%o0-8] | |
289 rd %y,%o5 | |
290 st %o5,[%o0-4] | |
291 andcc %o2,-4,%g2 | |
292 bnz,a .L_bn_sqr_words_loop | |
293 ld [%o1],%g2 | |
294 | |
295 tst %o2 | |
296 nop | |
297 bnz,a .L_bn_sqr_words_tail | |
298 ld [%o1],%g2 | |
299 .L_bn_sqr_words_return: | |
300 retl | |
301 clr %o0 | |
302 | |
/* Tail: remaining 1..3 words. */ | |
303 .L_bn_sqr_words_tail: | |
304 umul %g2,%g2,%o4 | |
305 st %o4,[%o0] | |
306 deccc %o2 | |
307 rd %y,%o5 | |
308 bz .L_bn_sqr_words_return | |
309 st %o5,[%o0+4] | |
310 | |
311 ld [%o1+4],%g2 | |
312 umul %g2,%g2,%o4 | |
313 st %o4,[%o0+8] | |
314 deccc %o2 | |
315 rd %y,%o5 | |
316 nop | |
317 bz .L_bn_sqr_words_return | |
318 st %o5,[%o0+12] | |
319 | |
320 ld [%o1+8],%g2 | |
321 umul %g2,%g2,%o4 | |
322 st %o4,[%o0+16] | |
323 rd %y,%o5 | |
324 st %o5,[%o0+20] | |
325 retl | |
326 clr %o0 | |
327 | |
328 .type bn_sqr_words,#function | |
329 .size bn_sqr_words,(.-bn_sqr_words) | |
330 | |
331 .align 32 | |
332 | |
333 .global bn_div_words | |
334 /* | |
335  * BN_ULONG bn_div_words(h,l,d) | |
336  * BN_ULONG h,l,d; | |
337  */ | |
/* Returns the 64-by-32-bit quotient (h:l)/d. SPARC v8 udiv takes its | |
   dividend's high word from %y, which is loaded from h first. */ | |
338 bn_div_words: | |
339 wr %o0,%y | |
340 udiv %o1,%o2,%o0 | |
341 retl | |
342 nop | |
343 | |
344 .type bn_div_words,#function | |
345 .size bn_div_words,(.-bn_div_words) | |
346 | |
347 .align 32 | |
348 | |
349 .global bn_add_words | |
350 /* | |
351  * BN_ULONG bn_add_words(rp,ap,bp,n) | |
352  * BN_ULONG *rp,*ap,*bp; | |
353  * int n; | |
354  */ | |
/* rp[i] = ap[i]+bp[i] with carry chained through addxcc; returns the | |
   final carry. Loop-control ops (dec/andcc) clobber the icc carry, so | |
   %g1 latches it (addx %g0,0,%g1) and "addcc %g1,-1,%g0" restores it | |
   before the next addxcc chain. */ | |
355 bn_add_words: | |
356 cmp %o3,0 | |
357 bg,a .L_bn_add_words_proceed | |
358 ld [%o1],%o4 | |
359 retl | |
360 clr %o0 | |
361 | |
362 .L_bn_add_words_proceed: | |
363 andcc %o3,-4,%g0 | |
364 bz .L_bn_add_words_tail | |
365 clr %g1 | |
366 ba .L_bn_add_words_warn_loop | |
367 addcc %g0,0,%g0 ! clear carry flag | |
368 | |
/* Main loop: 4 words per iteration. */ | |
369 .L_bn_add_words_loop: | |
370 ld [%o1],%o4 | |
371 .L_bn_add_words_warn_loop: | |
372 ld [%o2],%o5 | |
373 ld [%o1+4],%g3 | |
374 ld [%o2+4],%g4 | |
375 dec 4,%o3 | |
376 addxcc %o5,%o4,%o5 | |
377 st %o5,[%o0] | |
378 | |
379 ld [%o1+8],%o4 | |
380 ld [%o2+8],%o5 | |
381 inc 16,%o1 | |
382 addxcc %g3,%g4,%g3 | |
383 st %g3,[%o0+4] | |
384 | |
385 ld [%o1-4],%g3 | |
386 ld [%o2+12],%g4 | |
387 inc 16,%o2 | |
388 addxcc %o5,%o4,%o5 | |
389 st %o5,[%o0+8] | |
390 | |
391 inc 16,%o0 | |
392 addxcc %g3,%g4,%g3 | |
393 st %g3,[%o0-4] | |
394 addx %g0,0,%g1 | |
395 andcc %o3,-4,%g0 | |
396 bnz,a .L_bn_add_words_loop | |
397 addcc %g1,-1,%g0 | |
398 | |
399 tst %o3 | |
400 bnz,a .L_bn_add_words_tail | |
401 ld [%o1],%o4 | |
402 .L_bn_add_words_return: | |
403 retl | |
404 mov %g1,%o0 | |
405 | |
/* Tail: remaining 1..3 words; carry re-latched into %g1 each step. */ | |
406 .L_bn_add_words_tail: | |
407 addcc %g1,-1,%g0 | |
408 ld [%o2],%o5 | |
409 addxcc %o5,%o4,%o5 | |
410 addx %g0,0,%g1 | |
411 deccc %o3 | |
412 bz .L_bn_add_words_return | |
413 st %o5,[%o0] | |
414 | |
415 ld [%o1+4],%o4 | |
416 addcc %g1,-1,%g0 | |
417 ld [%o2+4],%o5 | |
418 addxcc %o5,%o4,%o5 | |
419 addx %g0,0,%g1 | |
420 deccc %o3 | |
421 bz .L_bn_add_words_return | |
422 st %o5,[%o0+4] | |
423 | |
424 ld [%o1+8],%o4 | |
425 addcc %g1,-1,%g0 | |
426 ld [%o2+8],%o5 | |
427 addxcc %o5,%o4,%o5 | |
428 st %o5,[%o0+8] | |
429 retl | |
430 addx %g0,0,%o0 | |
431 | |
432 .type bn_add_words,#function | |
433 .size bn_add_words,(.-bn_add_words) | |
434 | |
435 .align 32 | |
436 | |
437 .global bn_sub_words | |
438 /* | |
439  * BN_ULONG bn_sub_words(rp,ap,bp,n) | |
440  * BN_ULONG *rp,*ap,*bp; | |
441  * int n; | |
442  */ | |
/* rp[i] = ap[i]-bp[i] with borrow chained through subxcc; returns the | |
   final borrow. Mirror of bn_add_words: %g1 latches the borrow around | |
   the flag-clobbering loop-control instructions. */ | |
443 bn_sub_words: | |
444 cmp %o3,0 | |
445 bg,a .L_bn_sub_words_proceed | |
446 ld [%o1],%o4 | |
447 retl | |
448 clr %o0 | |
449 | |
450 .L_bn_sub_words_proceed: | |
451 andcc %o3,-4,%g0 | |
452 bz .L_bn_sub_words_tail | |
453 clr %g1 | |
454 ba .L_bn_sub_words_warm_loop | |
455 addcc %g0,0,%g0 ! clear carry flag | |
456 | |
/* Main loop: 4 words per iteration. */ | |
457 .L_bn_sub_words_loop: | |
458 ld [%o1],%o4 | |
459 .L_bn_sub_words_warm_loop: | |
460 ld [%o2],%o5 | |
461 ld [%o1+4],%g3 | |
462 ld [%o2+4],%g4 | |
463 dec 4,%o3 | |
464 subxcc %o4,%o5,%o5 | |
465 st %o5,[%o0] | |
466 | |
467 ld [%o1+8],%o4 | |
468 ld [%o2+8],%o5 | |
469 inc 16,%o1 | |
470 subxcc %g3,%g4,%g4 | |
471 st %g4,[%o0+4] | |
472 | |
473 ld [%o1-4],%g3 | |
474 ld [%o2+12],%g4 | |
475 inc 16,%o2 | |
476 subxcc %o4,%o5,%o5 | |
477 st %o5,[%o0+8] | |
478 | |
479 inc 16,%o0 | |
480 subxcc %g3,%g4,%g4 | |
481 st %g4,[%o0-4] | |
482 addx %g0,0,%g1 | |
483 andcc %o3,-4,%g0 | |
484 bnz,a .L_bn_sub_words_loop | |
485 addcc %g1,-1,%g0 | |
486 | |
487 tst %o3 | |
488 nop | |
489 bnz,a .L_bn_sub_words_tail | |
490 ld [%o1],%o4 | |
491 .L_bn_sub_words_return: | |
492 retl | |
493 mov %g1,%o0 | |
494 | |
/* Tail: remaining 1..3 words; borrow re-latched into %g1 each step. */ | |
495 .L_bn_sub_words_tail: | |
496 addcc %g1,-1,%g0 | |
497 ld [%o2],%o5 | |
498 subxcc %o4,%o5,%o5 | |
499 addx %g0,0,%g1 | |
500 deccc %o3 | |
501 bz .L_bn_sub_words_return | |
502 st %o5,[%o0] | |
503 nop | |
504 | |
505 ld [%o1+4],%o4 | |
506 addcc %g1,-1,%g0 | |
507 ld [%o2+4],%o5 | |
508 subxcc %o4,%o5,%o5 | |
509 addx %g0,0,%g1 | |
510 deccc %o3 | |
511 bz .L_bn_sub_words_return | |
512 st %o5,[%o0+4] | |
513 | |
514 ld [%o1+8],%o4 | |
515 addcc %g1,-1,%g0 | |
516 ld [%o2+8],%o5 | |
517 subxcc %o4,%o5,%o5 | |
518 st %o5,[%o0+8] | |
519 retl | |
520 addx %g0,0,%o0 | |
521 | |
522 .type bn_sub_words,#function | |
523 .size bn_sub_words,(.-bn_sub_words) | |
524 | |
525 #define FRAME_SIZE -96 | |
526 | |
527 /* | |
528  * Here is register usage map for *all* routines below. | |
529  */ | |
/* t_1/t_2 = umul product low/high temporaries; c_1..c_3 = rotating | |
   three-word comba accumulator; a_*/b_* = cached operand words; | |
   ap/bp/rp index the register-window input pointers %i1/%i2/%i0. */ | |
530 #define t_1 %o0 | |
531 #define t_2 %o1 | |
532 #define c_1 %o2 | |
533 #define c_2 %o3 | |
534 #define c_3 %o4 | |
535 | |
536 #define ap(I) [%i1+4*I] | |
537 #define bp(I) [%i2+4*I] | |
538 #define rp(I) [%i0+4*I] | |
539 | |
540 #define a_0 %l0 | |
541 #define a_1 %l1 | |
542 #define a_2 %l2 | |
543 #define a_3 %l3 | |
544 #define a_4 %l4 | |
545 #define a_5 %l5 | |
546 #define a_6 %l6 | |
547 #define a_7 %l7 | |
548 | |
549 #define b_0 %i3 | |
550 #define b_1 %i4 | |
551 #define b_2 %i5 | |
552 #define b_3 %o5 | |
553 #define b_4 %g1 | |
554 #define b_5 %g2 | |
555 #define b_6 %g3 | |
556 #define b_7 %g4 | |
558 .align 32 | |
559 .global bn_mul_comba8 | |
560 /* | |
561  * void bn_mul_comba8(r,a,b) | |
562  * BN_ULONG *r,*a,*b; | |
563  */ | |
/* Fully unrolled 8x8-word comba multiplication: r[0..15] = a[]*b[]. | |
   Products along each anti-diagonal are accumulated into the rotating | |
   (c_1,c_2,c_3) accumulator via addcc/addxcc/addx; one result word is | |
   stored per diagonal. "!=" marks presumably denote instruction-group | |
   boundaries for SuperSPARC scheduling. */ | |
564 bn_mul_comba8: | |
565 save %sp,FRAME_SIZE,%sp | |
566 ld ap(0),a_0 | |
567 ld bp(0),b_0 | |
568 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3); | |
569 ld bp(1),b_1 | |
570 rd %y,c_2 | |
571 st c_1,rp(0) !r[0]=c1; | |
572 | |
573 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1); | |
574 ld ap(1),a_1 | |
575 addcc c_2,t_1,c_2 | |
576 rd %y,t_2 | |
577 addxcc %g0,t_2,c_3 != | |
578 addx %g0,%g0,c_1 | |
579 ld ap(2),a_2 | |
580 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1); | |
581 addcc c_2,t_1,c_2 != | |
582 rd %y,t_2 | |
583 addxcc c_3,t_2,c_3 | |
584 st c_2,rp(1) !r[1]=c2; | |
585 addx c_1,%g0,c_1 != | |
586 | |
587 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
588 addcc c_3,t_1,c_3 | |
589 rd %y,t_2 | |
590 addxcc c_1,t_2,c_1 != | |
591 addx %g0,%g0,c_2 | |
592 ld bp(2),b_2 | |
593 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | |
594 addcc c_3,t_1,c_3 != | |
595 rd %y,t_2 | |
596 addxcc c_1,t_2,c_1 | |
597 ld bp(3),b_3 | |
598 addx c_2,%g0,c_2 != | |
599 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
600 addcc c_3,t_1,c_3 | |
601 rd %y,t_2 | |
602 addxcc c_1,t_2,c_1 != | |
603 addx c_2,%g0,c_2 | |
604 st c_3,rp(2) !r[2]=c3; | |
605 | |
606 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | |
607 addcc c_1,t_1,c_1 != | |
608 rd %y,t_2 | |
609 addxcc c_2,t_2,c_2 | |
610 addx %g0,%g0,c_3 | |
611 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); | |
612 addcc c_1,t_1,c_1 | |
613 rd %y,t_2 | |
614 addxcc c_2,t_2,c_2 | |
615 addx c_3,%g0,c_3 != | |
616 ld ap(3),a_3 | |
617 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
618 addcc c_1,t_1,c_1 | |
619 rd %y,t_2 != | |
620 addxcc c_2,t_2,c_2 | |
621 addx c_3,%g0,c_3 | |
622 ld ap(4),a_4 | |
623 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= | |
624 addcc c_1,t_1,c_1 | |
625 rd %y,t_2 | |
626 addxcc c_2,t_2,c_2 | |
627 addx c_3,%g0,c_3 != | |
628 st c_1,rp(3) !r[3]=c1; | |
629 | |
630 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); | |
631 addcc c_2,t_1,c_2 | |
632 rd %y,t_2 != | |
633 addxcc c_3,t_2,c_3 | |
634 addx %g0,%g0,c_1 | |
635 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | |
636 addcc c_2,t_1,c_2 != | |
637 rd %y,t_2 | |
638 addxcc c_3,t_2,c_3 | |
639 addx c_1,%g0,c_1 | |
640 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); | |
641 addcc c_2,t_1,c_2 | |
642 rd %y,t_2 | |
643 addxcc c_3,t_2,c_3 | |
644 addx c_1,%g0,c_1 != | |
645 ld bp(4),b_4 | |
646 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | |
647 addcc c_2,t_1,c_2 | |
648 rd %y,t_2 != | |
649 addxcc c_3,t_2,c_3 | |
650 addx c_1,%g0,c_1 | |
651 ld bp(5),b_5 | |
652 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1); | |
653 addcc c_2,t_1,c_2 | |
654 rd %y,t_2 | |
655 addxcc c_3,t_2,c_3 | |
656 addx c_1,%g0,c_1 != | |
657 st c_2,rp(4) !r[4]=c2; | |
658 | |
659 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); | |
660 addcc c_3,t_1,c_3 | |
661 rd %y,t_2 != | |
662 addxcc c_1,t_2,c_1 | |
663 addx %g0,%g0,c_2 | |
664 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); | |
665 addcc c_3,t_1,c_3 != | |
666 rd %y,t_2 | |
667 addxcc c_1,t_2,c_1 | |
668 addx c_2,%g0,c_2 | |
669 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2); | |
670 addcc c_3,t_1,c_3 | |
671 rd %y,t_2 | |
672 addxcc c_1,t_2,c_1 | |
673 addx c_2,%g0,c_2 != | |
674 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
675 addcc c_3,t_1,c_3 | |
676 rd %y,t_2 | |
677 addxcc c_1,t_2,c_1 != | |
678 addx c_2,%g0,c_2 | |
679 ld ap(5),a_5 | |
680 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); | |
681 addcc c_3,t_1,c_3 != | |
682 rd %y,t_2 | |
683 addxcc c_1,t_2,c_1 | |
684 ld ap(6),a_6 | |
685 addx c_2,%g0,c_2 != | |
686 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2); | |
687 addcc c_3,t_1,c_3 | |
688 rd %y,t_2 | |
689 addxcc c_1,t_2,c_1 != | |
690 addx c_2,%g0,c_2 | |
691 st c_3,rp(5) !r[5]=c3; | |
692 | |
693 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); | |
694 addcc c_1,t_1,c_1 != | |
695 rd %y,t_2 | |
696 addxcc c_2,t_2,c_2 | |
697 addx %g0,%g0,c_3 | |
698 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); | |
699 addcc c_1,t_1,c_1 | |
700 rd %y,t_2 | |
701 addxcc c_2,t_2,c_2 | |
702 addx c_3,%g0,c_3 != | |
703 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3); | |
704 addcc c_1,t_1,c_1 | |
705 rd %y,t_2 | |
706 addxcc c_2,t_2,c_2 != | |
707 addx c_3,%g0,c_3 | |
708 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | |
709 addcc c_1,t_1,c_1 | |
710 rd %y,t_2 != | |
711 addxcc c_2,t_2,c_2 | |
712 addx c_3,%g0,c_3 | |
713 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3); | |
714 addcc c_1,t_1,c_1 != | |
715 rd %y,t_2 | |
716 addxcc c_2,t_2,c_2 | |
717 ld bp(6),b_6 | |
718 addx c_3,%g0,c_3 != | |
719 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); | |
720 addcc c_1,t_1,c_1 | |
721 rd %y,t_2 | |
722 addxcc c_2,t_2,c_2 != | |
723 addx c_3,%g0,c_3 | |
724 ld bp(7),b_7 | |
725 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); | |
726 addcc c_1,t_1,c_1 != | |
727 rd %y,t_2 | |
728 addxcc c_2,t_2,c_2 | |
729 st c_1,rp(6) !r[6]=c1; | |
730 addx c_3,%g0,c_3 != | |
731 | |
732 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); | |
733 addcc c_2,t_1,c_2 | |
734 rd %y,t_2 | |
735 addxcc c_3,t_2,c_3 != | |
736 addx %g0,%g0,c_1 | |
737 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); | |
738 addcc c_2,t_1,c_2 | |
739 rd %y,t_2 != | |
740 addxcc c_3,t_2,c_3 | |
741 addx c_1,%g0,c_1 | |
742 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); | |
743 addcc c_2,t_1,c_2 != | |
744 rd %y,t_2 | |
745 addxcc c_3,t_2,c_3 | |
746 addx c_1,%g0,c_1 | |
747 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1); | |
748 addcc c_2,t_1,c_2 | |
749 rd %y,t_2 | |
750 addxcc c_3,t_2,c_3 | |
751 addx c_1,%g0,c_1 != | |
752 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); | |
753 addcc c_2,t_1,c_2 | |
754 rd %y,t_2 | |
755 addxcc c_3,t_2,c_3 != | |
756 addx c_1,%g0,c_1 | |
757 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); | |
758 addcc c_2,t_1,c_2 | |
759 rd %y,t_2 != | |
760 addxcc c_3,t_2,c_3 | |
761 addx c_1,%g0,c_1 | |
762 ld ap(7),a_7 | |
763 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); | |
764 addcc c_2,t_1,c_2 | |
765 rd %y,t_2 | |
766 addxcc c_3,t_2,c_3 | |
767 addx c_1,%g0,c_1 != | |
768 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1); | |
769 addcc c_2,t_1,c_2 | |
770 rd %y,t_2 | |
771 addxcc c_3,t_2,c_3 != | |
772 addx c_1,%g0,c_1 | |
773 st c_2,rp(7) !r[7]=c2; | |
774 | |
775 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2); | |
776 addcc c_3,t_1,c_3 != | |
777 rd %y,t_2 | |
778 addxcc c_1,t_2,c_1 | |
779 addx %g0,%g0,c_2 | |
780 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2); | |
781 addcc c_3,t_1,c_3 | |
782 rd %y,t_2 | |
783 addxcc c_1,t_2,c_1 | |
784 addx c_2,%g0,c_2 != | |
785 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); | |
786 addcc c_3,t_1,c_3 | |
787 rd %y,t_2 | |
788 addxcc c_1,t_2,c_1 != | |
789 addx c_2,%g0,c_2 | |
790 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); | |
791 addcc c_3,t_1,c_3 | |
792 rd %y,t_2 != | |
793 addxcc c_1,t_2,c_1 | |
794 addx c_2,%g0,c_2 | |
795 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); | |
796 addcc c_3,t_1,c_3 != | |
797 rd %y,t_2 | |
798 addxcc c_1,t_2,c_1 | |
799 addx c_2,%g0,c_2 | |
800 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2); | |
801 addcc c_3,t_1,c_3 | |
802 rd %y,t_2 | |
803 addxcc c_1,t_2,c_1 | |
804 addx c_2,%g0,c_2 != | |
805 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); | |
806 addcc c_3,t_1,c_3 | |
807 rd %y,t_2 | |
808 addxcc c_1,t_2,c_1 ! | |
809 addx c_2,%g0,c_2 | |
810 st c_3,rp(8) !r[8]=c3; | |
811 | |
812 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3); | |
813 addcc c_1,t_1,c_1 != | |
814 rd %y,t_2 | |
815 addxcc c_2,t_2,c_2 | |
816 addx %g0,%g0,c_3 | |
817 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3); | |
818 addcc c_1,t_1,c_1 | |
819 rd %y,t_2 | |
820 addxcc c_2,t_2,c_2 | |
821 addx c_3,%g0,c_3 != | |
822 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); | |
823 addcc c_1,t_1,c_1 | |
824 rd %y,t_2 | |
825 addxcc c_2,t_2,c_2 != | |
826 addx c_3,%g0,c_3 | |
827 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); | |
828 addcc c_1,t_1,c_1 | |
829 rd %y,t_2 != | |
830 addxcc c_2,t_2,c_2 | |
831 addx c_3,%g0,c_3 | |
832 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); | |
833 addcc c_1,t_1,c_1 != | |
834 rd %y,t_2 | |
835 addxcc c_2,t_2,c_2 | |
836 addx c_3,%g0,c_3 | |
837 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3); | |
838 addcc c_1,t_1,c_1 | |
839 rd %y,t_2 | |
840 addxcc c_2,t_2,c_2 | |
841 addx c_3,%g0,c_3 != | |
842 st c_1,rp(9) !r[9]=c1; | |
843 | |
844 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); | |
845 addcc c_2,t_1,c_2 | |
846 rd %y,t_2 != | |
847 addxcc c_3,t_2,c_3 | |
848 addx %g0,%g0,c_1 | |
849 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); | |
850 addcc c_2,t_1,c_2 != | |
851 rd %y,t_2 | |
852 addxcc c_3,t_2,c_3 | |
853 addx c_1,%g0,c_1 | |
854 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1); | |
855 addcc c_2,t_1,c_2 | |
856 rd %y,t_2 | |
857 addxcc c_3,t_2,c_3 | |
858 addx c_1,%g0,c_1 != | |
859 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); | |
860 addcc c_2,t_1,c_2 | |
861 rd %y,t_2 | |
862 addxcc c_3,t_2,c_3 != | |
863 addx c_1,%g0,c_1 | |
864 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); | |
865 addcc c_2,t_1,c_2 | |
866 rd %y,t_2 != | |
867 addxcc c_3,t_2,c_3 | |
868 addx c_1,%g0,c_1 | |
869 st c_2,rp(10) !r[10]=c2; | |
870 | |
871 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2); | |
872 addcc c_3,t_1,c_3 | |
873 rd %y,t_2 | |
874 addxcc c_1,t_2,c_1 | |
875 addx %g0,%g0,c_2 != | |
876 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); | |
877 addcc c_3,t_1,c_3 | |
878 rd %y,t_2 | |
879 addxcc c_1,t_2,c_1 != | |
880 addx c_2,%g0,c_2 | |
881 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); | |
882 addcc c_3,t_1,c_3 | |
883 rd %y,t_2 != | |
884 addxcc c_1,t_2,c_1 | |
885 addx c_2,%g0,c_2 | |
886 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); | |
887 addcc c_3,t_1,c_3 != | |
888 rd %y,t_2 | |
889 addxcc c_1,t_2,c_1 | |
890 st c_3,rp(11) !r[11]=c3; | |
891 addx c_2,%g0,c_2 != | |
892 | |
893 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); | |
894 addcc c_1,t_1,c_1 | |
895 rd %y,t_2 | |
896 addxcc c_2,t_2,c_2 != | |
897 addx %g0,%g0,c_3 | |
898 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); | |
899 addcc c_1,t_1,c_1 | |
900 rd %y,t_2 != | |
901 addxcc c_2,t_2,c_2 | |
902 addx c_3,%g0,c_3 | |
903 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); | |
904 addcc c_1,t_1,c_1 != | |
905 rd %y,t_2 | |
906 addxcc c_2,t_2,c_2 | |
907 st c_1,rp(12) !r[12]=c1; | |
908 addx c_3,%g0,c_3 != | |
909 | |
910 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); | |
911 addcc c_2,t_1,c_2 | |
912 rd %y,t_2 | |
913 addxcc c_3,t_2,c_3 != | |
914 addx %g0,%g0,c_1 | |
915 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); | |
916 addcc c_2,t_1,c_2 | |
917 rd %y,t_2 != | |
918 addxcc c_3,t_2,c_3 | |
919 addx c_1,%g0,c_1 | |
920 st c_2,rp(13) !r[13]=c2; | |
921 | |
922 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2); | |
923 addcc c_3,t_1,c_3 | |
924 rd %y,t_2 | |
925 addxcc c_1,t_2,c_1 | |
926 nop != | |
927 st c_3,rp(14) !r[14]=c3; | |
928 st c_1,rp(15) !r[15]=c1; | |
929 | |
930 ret | |
931 restore %g0,%g0,%o0 | |
932 | |
933 .type bn_mul_comba8,#function | |
934 .size bn_mul_comba8,(.-bn_mul_comba8) | |
935 | |
936 .align 32 | |
937 | |
938 .global bn_mul_comba4 | |
939 /* | |
940  * void bn_mul_comba4(r,a,b) | |
941  * BN_ULONG *r,*a,*b; | |
942  */ | |
/* Fully unrolled 4x4-word comba multiplication: r[0..7] = a[]*b[]. | |
   Same rotating (c_1,c_2,c_3) accumulator scheme as bn_mul_comba8; | |
   one anti-diagonal of products per stored result word. */ | |
943 bn_mul_comba4: | |
944 save %sp,FRAME_SIZE,%sp | |
945 ld ap(0),a_0 | |
946 ld bp(0),b_0 | |
947 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3); | |
948 ld bp(1),b_1 | |
949 rd %y,c_2 | |
950 st c_1,rp(0) !r[0]=c1; | |
951 | |
952 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1); | |
953 ld ap(1),a_1 | |
954 addcc c_2,t_1,c_2 | |
955 rd %y,t_2 != | |
956 addxcc %g0,t_2,c_3 | |
957 addx %g0,%g0,c_1 | |
958 ld ap(2),a_2 | |
959 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | |
960 addcc c_2,t_1,c_2 | |
961 rd %y,t_2 | |
962 addxcc c_3,t_2,c_3 | |
963 addx c_1,%g0,c_1 != | |
964 st c_2,rp(1) !r[1]=c2; | |
965 | |
966 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | |
967 addcc c_3,t_1,c_3 | |
968 rd %y,t_2 != | |
969 addxcc c_1,t_2,c_1 | |
970 addx %g0,%g0,c_2 | |
971 ld bp(2),b_2 | |
972 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2); | |
973 addcc c_3,t_1,c_3 | |
974 rd %y,t_2 | |
975 addxcc c_1,t_2,c_1 | |
976 addx c_2,%g0,c_2 != | |
977 ld bp(3),b_3 | |
978 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | |
979 addcc c_3,t_1,c_3 | |
980 rd %y,t_2 != | |
981 addxcc c_1,t_2,c_1 | |
982 addx c_2,%g0,c_2 | |
983 st c_3,rp(2) !r[2]=c3; | |
984 | |
985 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3); | |
986 addcc c_1,t_1,c_1 | |
987 rd %y,t_2 | |
988 addxcc c_2,t_2,c_2 | |
989 addx %g0,%g0,c_3 != | |
990 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); | |
991 addcc c_1,t_1,c_1 | |
992 rd %y,t_2 | |
993 addxcc c_2,t_2,c_2 != | |
994 addx c_3,%g0,c_3 | |
995 ld ap(3),a_3 | |
996 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | |
997 addcc c_1,t_1,c_1 != | |
998 rd %y,t_2 | |
999 addxcc c_2,t_2,c_2 | |
1000 addx c_3,%g0,c_3 | |
1001 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3); | |
1002 addcc c_1,t_1,c_1 | |
1003 rd %y,t_2 | |
1004 addxcc c_2,t_2,c_2 | |
1005 addx c_3,%g0,c_3 != | |
1006 st c_1,rp(3) !r[3]=c1; | |
1007 | |
1008 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | |
1009 addcc c_2,t_1,c_2 | |
1010 rd %y,t_2 != | |
1011 addxcc c_3,t_2,c_3 | |
1012 addx %g0,%g0,c_1 | |
1013 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); | |
1014 addcc c_2,t_1,c_2 != | |
1015 rd %y,t_2 | |
1016 addxcc c_3,t_2,c_3 | |
1017 addx c_1,%g0,c_1 | |
1018 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1); | |
1019 addcc c_2,t_1,c_2 | |
1020 rd %y,t_2 | |
1021 addxcc c_3,t_2,c_3 | |
1022 addx c_1,%g0,c_1 != | |
1023 st c_2,rp(4) !r[4]=c2; | |
1024 | |
1025 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | |
1026 addcc c_3,t_1,c_3 | |
1027 rd %y,t_2 != | |
1028 addxcc c_1,t_2,c_1 | |
1029 addx %g0,%g0,c_2 | |
1030 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | |
1031 addcc c_3,t_1,c_3 != | |
1032 rd %y,t_2 | |
1033 addxcc c_1,t_2,c_1 | |
1034 st c_3,rp(5) !r[5]=c3; | |
1035 addx c_2,%g0,c_2 != | |
1036 | |
1037 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | |
1038 addcc c_1,t_1,c_1 | |
1039 rd %y,t_2 | |
1040 addxcc c_2,t_2,c_2 != | |
1041 st c_1,rp(6) !r[6]=c1; | |
1042 st c_2,rp(7) !r[7]=c2; | |
1043 | |
1044 ret | |
1045 restore %g0,%g0,%o0 | |
1046 | |
1047 .type bn_mul_comba4,#function | |
1048 .size bn_mul_comba4,(.-bn_mul_comba4) | |
1049 | |
1050 .align 32 | |
1051 | |
1052 .global bn_sqr_comba8 | |
1053 bn_sqr_comba8: | |
1054 save %sp,FRAME_SIZE,%sp | |
1055 ld ap(0),a_0 | |
1056 ld ap(1),a_1 | |
1057 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3); | |
1058 rd %y,c_2 | |
1059 st c_1,rp(0) !r[0]=c1; | |
1060 | |
1061 ld ap(2),a_2 | |
1062 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); | |
1063 addcc c_2,t_1,c_2 | |
1064 rd %y,t_2 | |
1065 addxcc %g0,t_2,c_3 | |
1066 addx %g0,%g0,c_1 != | |
1067 addcc c_2,t_1,c_2 | |
1068 addxcc c_3,t_2,c_3 | |
1069 st c_2,rp(1) !r[1]=c2; | |
1070 addx c_1,%g0,c_1 != | |
1071 | |
1072 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | |
1073 addcc c_3,t_1,c_3 | |
1074 rd %y,t_2 | |
1075 addxcc c_1,t_2,c_1 != | |
1076 addx %g0,%g0,c_2 | |
1077 addcc c_3,t_1,c_3 | |
1078 addxcc c_1,t_2,c_1 | |
1079 addx c_2,%g0,c_2 != | |
1080 ld ap(3),a_3 | |
1081 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | |
1082 addcc c_3,t_1,c_3 | |
1083 rd %y,t_2 != | |
1084 addxcc c_1,t_2,c_1 | |
1085 addx c_2,%g0,c_2 | |
1086 st c_3,rp(2) !r[2]=c3; | |
1087 | |
1088 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3); | |
1089 addcc c_1,t_1,c_1 | |
1090 rd %y,t_2 | |
1091 addxcc c_2,t_2,c_2 | |
1092 addx %g0,%g0,c_3 != | |
1093 addcc c_1,t_1,c_1 | |
1094 addxcc c_2,t_2,c_2 | |
1095 ld ap(4),a_4 | |
1096 addx c_3,%g0,c_3 != | |
1097 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | |
1098 addcc c_1,t_1,c_1 | |
1099 rd %y,t_2 | |
1100 addxcc c_2,t_2,c_2 != | |
1101 addx c_3,%g0,c_3 | |
1102 addcc c_1,t_1,c_1 | |
1103 addxcc c_2,t_2,c_2 | |
1104 addx c_3,%g0,c_3 != | |
1105 st c_1,rp(3) !r[3]=c1; | |
1106 | |
1107 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); | |
1108 addcc c_2,t_1,c_2 | |
1109 rd %y,t_2 != | |
1110 addxcc c_3,t_2,c_3 | |
1111 addx %g0,%g0,c_1 | |
1112 addcc c_2,t_1,c_2 | |
1113 addxcc c_3,t_2,c_3 != | |
1114 addx c_1,%g0,c_1 | |
1115 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | |
1116 addcc c_2,t_1,c_2 | |
1117 rd %y,t_2 != | |
1118 addxcc c_3,t_2,c_3 | |
1119 addx c_1,%g0,c_1 | |
1120 addcc c_2,t_1,c_2 | |
1121 addxcc c_3,t_2,c_3 != | |
1122 addx c_1,%g0,c_1 | |
1123 ld ap(5),a_5 | |
1124 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | |
1125 addcc c_2,t_1,c_2 != | |
1126 rd %y,t_2 | |
1127 addxcc c_3,t_2,c_3 | |
1128 st c_2,rp(4) !r[4]=c2; | |
1129 addx c_1,%g0,c_1 != | |
1130 | |
1131 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); | |
1132 addcc c_3,t_1,c_3 | |
1133 rd %y,t_2 | |
1134 addxcc c_1,t_2,c_1 != | |
1135 addx %g0,%g0,c_2 | |
1136 addcc c_3,t_1,c_3 | |
1137 addxcc c_1,t_2,c_1 | |
1138 addx c_2,%g0,c_2 != | |
1139 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); | |
1140 addcc c_3,t_1,c_3 | |
1141 rd %y,t_2 | |
1142 addxcc c_1,t_2,c_1 != | |
1143 addx c_2,%g0,c_2 | |
1144 addcc c_3,t_1,c_3 | |
1145 addxcc c_1,t_2,c_1 | |
1146 addx c_2,%g0,c_2 != | |
1147 ld ap(6),a_6 | |
1148 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | |
1149 addcc c_3,t_1,c_3 | |
1150 rd %y,t_2 != | |
1151 addxcc c_1,t_2,c_1 | |
1152 addx c_2,%g0,c_2 | |
1153 addcc c_3,t_1,c_3 | |
1154 addxcc c_1,t_2,c_1 != | |
1155 addx c_2,%g0,c_2 | |
1156 st c_3,rp(5) !r[5]=c3; | |
1157 | |
1158 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); | |
1159 addcc c_1,t_1,c_1 != | |
1160 rd %y,t_2 | |
1161 addxcc c_2,t_2,c_2 | |
1162 addx %g0,%g0,c_3 | |
1163 addcc c_1,t_1,c_1 != | |
1164 addxcc c_2,t_2,c_2 | |
1165 addx c_3,%g0,c_3 | |
1166 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); | |
1167 addcc c_1,t_1,c_1 != | |
1168 rd %y,t_2 | |
1169 addxcc c_2,t_2,c_2 | |
1170 addx c_3,%g0,c_3 | |
1171 addcc c_1,t_1,c_1 != | |
1172 addxcc c_2,t_2,c_2 | |
1173 addx c_3,%g0,c_3 | |
1174 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); | |
1175 addcc c_1,t_1,c_1 != | |
1176 rd %y,t_2 | |
1177 addxcc c_2,t_2,c_2 | |
1178 addx c_3,%g0,c_3 | |
1179 addcc c_1,t_1,c_1 != | |
1180 addxcc c_2,t_2,c_2 | |
1181 addx c_3,%g0,c_3 | |
1182 ld ap(7),a_7 | |
1183 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); | |
1184 addcc c_1,t_1,c_1 | |
1185 rd %y,t_2 | |
1186 addxcc c_2,t_2,c_2 | |
1187 addx c_3,%g0,c_3 != | |
1188 st c_1,rp(6) !r[6]=c1; | |
1189 | |
1190 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); | |
1191 addcc c_2,t_1,c_2 | |
1192 rd %y,t_2 != | |
1193 addxcc c_3,t_2,c_3 | |
1194 addx %g0,%g0,c_1 | |
1195 addcc c_2,t_1,c_2 | |
1196 addxcc c_3,t_2,c_3 != | |
1197 addx c_1,%g0,c_1 | |
1198 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); | |
1199 addcc c_2,t_1,c_2 | |
1200 rd %y,t_2 != | |
1201 addxcc c_3,t_2,c_3 | |
1202 addx c_1,%g0,c_1 | |
1203 addcc c_2,t_1,c_2 | |
1204 addxcc c_3,t_2,c_3 != | |
1205 addx c_1,%g0,c_1 | |
1206 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); | |
1207 addcc c_2,t_1,c_2 | |
1208 rd %y,t_2 != | |
1209 addxcc c_3,t_2,c_3 | |
1210 addx c_1,%g0,c_1 | |
1211 addcc c_2,t_1,c_2 | |
1212 addxcc c_3,t_2,c_3 != | |
1213 addx c_1,%g0,c_1 | |
1214 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); | |
1215 addcc c_2,t_1,c_2 | |
1216 rd %y,t_2 != | |
1217 addxcc c_3,t_2,c_3 | |
1218 addx c_1,%g0,c_1 | |
1219 addcc c_2,t_1,c_2 | |
1220 addxcc c_3,t_2,c_3 != | |
1221 addx c_1,%g0,c_1 | |
1222 st c_2,rp(7) !r[7]=c2; | |
1223 | |
1224 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); | |
1225 addcc c_3,t_1,c_3 != | |
1226 rd %y,t_2 | |
1227 addxcc c_1,t_2,c_1 | |
1228 addx %g0,%g0,c_2 | |
1229 addcc c_3,t_1,c_3 != | |
1230 addxcc c_1,t_2,c_1 | |
1231 addx c_2,%g0,c_2 | |
1232 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); | |
1233 addcc c_3,t_1,c_3 != | |
1234 rd %y,t_2 | |
1235 addxcc c_1,t_2,c_1 | |
1236 addx c_2,%g0,c_2 | |
1237 addcc c_3,t_1,c_3 != | |
1238 addxcc c_1,t_2,c_1 | |
1239 addx c_2,%g0,c_2 | |
1240 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); | |
1241 addcc c_3,t_1,c_3 != | |
1242 rd %y,t_2 | |
1243 addxcc c_1,t_2,c_1 | |
1244 addx c_2,%g0,c_2 | |
1245 addcc c_3,t_1,c_3 != | |
1246 addxcc c_1,t_2,c_1 | |
1247 addx c_2,%g0,c_2 | |
1248 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); | |
1249 addcc c_3,t_1,c_3 != | |
1250 rd %y,t_2 | |
1251 addxcc c_1,t_2,c_1 | |
1252 st c_3,rp(8) !r[8]=c3; | |
1253 addx c_2,%g0,c_2 != | |
1254 | |
1255 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); | |
1256 addcc c_1,t_1,c_1 | |
1257 rd %y,t_2 | |
1258 addxcc c_2,t_2,c_2 != | |
1259 addx %g0,%g0,c_3 | |
1260 addcc c_1,t_1,c_1 | |
1261 addxcc c_2,t_2,c_2 | |
1262 addx c_3,%g0,c_3 != | |
1263 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); | |
1264 addcc c_1,t_1,c_1 | |
1265 rd %y,t_2 | |
1266 addxcc c_2,t_2,c_2 != | |
1267 addx c_3,%g0,c_3 | |
1268 addcc c_1,t_1,c_1 | |
1269 addxcc c_2,t_2,c_2 | |
1270 addx c_3,%g0,c_3 != | |
1271 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); | |
1272 addcc c_1,t_1,c_1 | |
1273 rd %y,t_2 | |
1274 addxcc c_2,t_2,c_2 != | |
1275 addx c_3,%g0,c_3 | |
1276 addcc c_1,t_1,c_1 | |
1277 addxcc c_2,t_2,c_2 | |
1278 addx c_3,%g0,c_3 != | |
1279 st c_1,rp(9) !r[9]=c1; | |
1280 | |
1281 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1); | |
1282 addcc c_2,t_1,c_2 | |
1283 rd %y,t_2 != | |
1284 addxcc c_3,t_2,c_3 | |
1285 addx %g0,%g0,c_1 | |
1286 addcc c_2,t_1,c_2 | |
1287 addxcc c_3,t_2,c_3 != | |
1288 addx c_1,%g0,c_1 | |
1289 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); | |
1290 addcc c_2,t_1,c_2 | |
1291 rd %y,t_2 != | |
1292 addxcc c_3,t_2,c_3 | |
1293 addx c_1,%g0,c_1 | |
1294 addcc c_2,t_1,c_2 | |
1295 addxcc c_3,t_2,c_3 != | |
1296 addx c_1,%g0,c_1 | |
1297 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); | |
1298 addcc c_2,t_1,c_2 | |
1299 rd %y,t_2 != | |
1300 addxcc c_3,t_2,c_3 | |
1301 addx c_1,%g0,c_1 | |
1302 st c_2,rp(10) !r[10]=c2; | |
1303 | |
1304 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2); | |
1305 addcc c_3,t_1,c_3 | |
1306 rd %y,t_2 | |
1307 addxcc c_1,t_2,c_1 | |
1308 addx %g0,%g0,c_2 != | |
1309 addcc c_3,t_1,c_3 | |
1310 addxcc c_1,t_2,c_1 | |
1311 addx c_2,%g0,c_2 | |
1312 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2); | |
1313 addcc c_3,t_1,c_3 | |
1314 rd %y,t_2 | |
1315 addxcc c_1,t_2,c_1 | |
1316 addx c_2,%g0,c_2 != | |
1317 addcc c_3,t_1,c_3 | |
1318 addxcc c_1,t_2,c_1 | |
1319 st c_3,rp(11) !r[11]=c3; | |
1320 addx c_2,%g0,c_2 != | |
1321 | |
1322 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); | |
1323 addcc c_1,t_1,c_1 | |
1324 rd %y,t_2 | |
1325 addxcc c_2,t_2,c_2 != | |
1326 addx %g0,%g0,c_3 | |
1327 addcc c_1,t_1,c_1 | |
1328 addxcc c_2,t_2,c_2 | |
1329 addx c_3,%g0,c_3 != | |
1330 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); | |
1331 addcc c_1,t_1,c_1 | |
1332 rd %y,t_2 | |
1333 addxcc c_2,t_2,c_2 != | |
1334 addx c_3,%g0,c_3 | |
1335 st c_1,rp(12) !r[12]=c1; | |
1336 | |
1337 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); | |
1338 addcc c_2,t_1,c_2 != | |
1339 rd %y,t_2 | |
1340 addxcc c_3,t_2,c_3 | |
1341 addx %g0,%g0,c_1 | |
1342 addcc c_2,t_1,c_2 != | |
1343 addxcc c_3,t_2,c_3 | |
1344 st c_2,rp(13) !r[13]=c2; | |
1345 addx c_1,%g0,c_1 != | |
1346 | |
1347 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); | |
1348 addcc c_3,t_1,c_3 | |
1349 rd %y,t_2 | |
1350 addxcc c_1,t_2,c_1 != | |
1351 st c_3,rp(14) !r[14]=c3; | |
1352 st c_1,rp(15) !r[15]=c1; | |
1353 | |
1354 ret | |
1355 restore %g0,%g0,%o0 | |
1356 | |
1357 .type bn_sqr_comba8,#function | |
1358 .size bn_sqr_comba8,(.-bn_sqr_comba8) | |
1359 | |
1360 .align 32 | |
1361 | |
.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 *
 * Comba (column-wise) squaring of a 4-word number: r[0..7] = a[0..3]^2.
 *
 * Register roles (names are macros #define'd earlier in this file —
 * NOTE(review): confirm against the file header, not visible here):
 *   ap(i)/rp(i)  memory operands for a[i] / r[i] off the input pointers
 *   a_0..a_3     cached input words
 *   c_1,c_2,c_3  the rotating three-word column accumulator
 *   t_1,t_2      low/high halves of the current 32x32->64 product
 *
 * Each sqr_add_c adds one square a_i*a_i into the accumulator once;
 * each sqr_add_c2 adds a cross product a_i*a_j (i!=j) TWICE, by
 * repeating the add/carry chain — hence the doubled addcc/addxcc/addx
 * groups.  umul leaves the low 32 bits in the destination and the high
 * 32 bits in %y (read via "rd %y").  The trailing "!=" comments are
 * deliberate instruction-group markers for SuperSPARC scheduling —
 * keep them with their lines.
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	ld	ap(0),a_0
	umul	a_0,a_0,c_1	!sqr_add_c(a,0,c1,c2,c3);
	ld	ap(1),a_1	!=
	rd	%y,c_2
	st	c_1,rp(0)	!r[0]=c1;

	! column 1: 2*a0*a1
	ld	ap(2),a_2
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	%g0,t_2,c_3
	addx	%g0,%g0,c_1	!=
	addcc	c_2,t_1,c_2	! add the cross product a second time
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	st	c_2,rp(1)	!r[1]=c2;

	! column 2: 2*a0*a2 + a1^2
	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2	!=
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	ld	ap(3),a_3
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	st	c_3,rp(2)	!r[2]=c3;
	addx	c_2,%g0,c_2	!=

	! column 3: 2*a0*a3 + 2*a1*a2
	umul	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	%g0,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(3)	!r[3]=c1;

	! column 4: 2*a1*a3 + a2^2
	umul	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2	!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2	!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	st	c_2,rp(4)	!r[4]=c2;

	! column 5: 2*a2*a3
	umul	a_2,a_3,t_1	!=!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2	!=
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	st	c_3,rp(5)	!r[5]=c3;
	addx	c_2,%g0,c_2	!=

	! columns 6,7: a3^2 — top column, no further carry out
	umul	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	st	c_1,rp(6)	!r[6]=c1;
	st	c_2,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)
1457 | |
1458 .align 32 | |
OLD | NEW |