OLD | NEW |
| (Empty) |
1 .rdata | |
2 .asciiz "mips3.s, Version 1.1" | |
3 .asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | |
4 | |
5 /* | |
6 * ==================================================================== | |
7 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
8 * project. | |
9 * | |
10 * Rights for redistribution and usage in source and binary forms are | |
11 * granted according to the OpenSSL license. Warranty of any kind is | |
12 * disclaimed. | |
13 * ==================================================================== | |
14 */ | |
15 | |
16 /* | |
17 * This is my modest contribution to the OpenSSL project (see | |
18 * http://www.openssl.org/ for more information about it) and is | |
19 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c | |
20 * module. For updates see http://fy.chalmers.se/~appro/hpe/. | |
21 * | |
22 * The module is designed to work with either of the "new" MIPS ABI(5), | |
23 * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under | |
24 * IRIX 5.x not only because it doesn't support new ABIs but also | |
25 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those | |
26 * 64-bit instructions (daddu, dmultu, etc.) found below gonna only | |
27 * cause illegal instruction exception:-( | |
28 * | |
29 * In addition the code depends on preprocessor flags set up by MIPSpro | |
30 * compiler driver (either as or cc) and therefore (probably?) can't be | |
31 * compiled by the GNU assembler. GNU C driver manages fine though... | |
32 * I mean as long as -mmips-as is specified or is the default option, | |
33 * because then it simply invokes /usr/bin/as which in turn takes | |
34 * perfect care of the preprocessor definitions. Another neat feature | |
35 * offered by the MIPSpro assembler is an optimization pass. This gave | |
36 * me the opportunity to have the code looking more regular as all those | |
37 * architecture dependent instruction rescheduling details were left to | |
38 * the assembler. Cool, huh? | |
39 * | |
40 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | |
41 * goes way over 3 times faster! | |
42 * | |
43 * <appro@fy.chalmers.se> | |
44 */ | |
45 #include <asm.h> | |
46 #include <regdef.h> | |
47 | |
48 #if _MIPS_ISA>=4 /* MIPS IV and later: use the native conditional move */ | |
49 #define MOVNZ(cond,dst,src) \ | |
50 movn dst,src,cond | |
51 #else /* older ISA: emulate with a branch-likely whose delay slot holds the move */ | |
52 #define MOVNZ(cond,dst,src) \ | |
53 .set noreorder; \ | |
54 bnezl cond,.+8; \ | |
55 move dst,src; \ | |
56 .set reorder | |
57 #endif /* MOVNZ(cond,dst,src): dst = src when cond != 0, otherwise dst is left unchanged */ | |
58 | |
59 .text | |
60 | |
61 .set noat /* AT is used explicitly as a scratch register by the routines below */ | |
62 .set reorder | |
63 | |
64 #define MINUS4 v1 /* v1 is loaded with -4 by the word-array routines and used as the mask in (count & -4) */ | |
65 | |
66 .align 5 | |
67 LEAF(bn_mul_add_words) /* a0 = result words (read and written), a1 = source words, a2 = word count, a3 = multiplier word; each result word becomes [a0] + [a1]*a3 + carry and the final carry is returned in v0. Presumably mirrors bn_mul_add_words() in crypto/bn/bn_asm.c -- confirm the C prototype there. */ | |
68 .set noreorder | |
69 bgtzl a2,.L_bn_mul_add_words_proceed /* count > 0: proceed (branch-likely: delay slot runs only when taken) */ | |
70 ld t0,0(a1) /* delay slot: preload the first source word */ | |
71 jr ra /* count <= 0: nothing to do */ | |
72 move v0,zero /* delay slot: return carry 0 */ | |
73 .set reorder | |
74 | |
75 .L_bn_mul_add_words_proceed: | |
76 li MINUS4,-4 | |
77 and ta0,a2,MINUS4 /* ta0 = count rounded down to a multiple of 4 */ | |
78 move v0,zero /* running carry = 0 */ | |
79 beqz ta0,.L_bn_mul_add_words_tail /* fewer than 4 words: only the tail runs */ | |
80 /* Main loop, unrolled 4x. Per word: t = [a0] + carry; t += lo(src*a3); carry = hi(src*a3) + carry-outs of both additions. */ | |
81 .L_bn_mul_add_words_loop: | |
82 dmultu t0,a3 | |
83 ld t1,0(a0) | |
84 ld t2,8(a1) | |
85 ld t3,8(a0) | |
86 ld ta0,16(a1) | |
87 ld ta1,16(a0) | |
88 daddu t1,v0 /* add the incoming carry */ | |
89 sltu v0,t1,v0 /* All manuals say it "compares 32-bit | |
90 * values", but it seems to work fine | |
91 * even on 64-bit registers. */ | |
92 mflo AT | |
93 mfhi t0 | |
94 daddu t1,AT /* add the low half of the product */ | |
95 daddu v0,t0 /* carry = high half + first carry-out */ | |
96 sltu AT,t1,AT /* carry-out of the low-half addition */ | |
97 sd t1,0(a0) | |
98 daddu v0,AT | |
99 /* second word of the group */ | |
100 dmultu t2,a3 | |
101 ld ta2,24(a1) | |
102 ld ta3,24(a0) | |
103 daddu t3,v0 | |
104 sltu v0,t3,v0 | |
105 mflo AT | |
106 mfhi t2 | |
107 daddu t3,AT | |
108 daddu v0,t2 | |
109 sltu AT,t3,AT | |
110 sd t3,8(a0) | |
111 daddu v0,AT | |
112 /* third word of the group; count and pointers are advanced here */ | |
113 dmultu ta0,a3 | |
114 subu a2,4 /* four words are consumed per iteration */ | |
115 PTR_ADD a0,32 /* advance by 4 words; the remaining stores use negative offsets */ | |
116 PTR_ADD a1,32 | |
117 daddu ta1,v0 | |
118 sltu v0,ta1,v0 | |
119 mflo AT | |
120 mfhi ta0 | |
121 daddu ta1,AT | |
122 daddu v0,ta0 | |
123 sltu AT,ta1,AT | |
124 sd ta1,-16(a0) | |
125 daddu v0,AT | |
126 | |
127 /* fourth word of the group */ | |
128 dmultu ta2,a3 | |
129 and ta0,a2,MINUS4 /* non-zero while another full group of 4 remains */ | |
130 daddu ta3,v0 | |
131 sltu v0,ta3,v0 | |
132 mflo AT | |
133 mfhi ta2 | |
134 daddu ta3,AT | |
135 daddu v0,ta2 | |
136 sltu AT,ta3,AT | |
137 sd ta3,-8(a0) | |
138 daddu v0,AT | |
139 .set noreorder | |
140 bgtzl ta0,.L_bn_mul_add_words_loop | |
141 ld t0,0(a1) /* delay slot (only if taken): preload the next source word */ | |
142 | |
143 bnezl a2,.L_bn_mul_add_words_tail /* some words left (fewer than 4) */ | |
144 ld t0,0(a1) /* delay slot (only if taken): preload for the tail */ | |
145 .set reorder | |
146 | |
147 .L_bn_mul_add_words_return: | |
148 jr ra | |
149 /* Tail: up to three remaining words; the first one uses the preloaded t0. */ | |
150 .L_bn_mul_add_words_tail: | |
151 dmultu t0,a3 | |
152 ld t1,0(a0) | |
153 subu a2,1 | |
154 daddu t1,v0 | |
155 sltu v0,t1,v0 | |
156 mflo AT | |
157 mfhi t0 | |
158 daddu t1,AT | |
159 daddu v0,t0 | |
160 sltu AT,t1,AT | |
161 sd t1,0(a0) | |
162 daddu v0,AT | |
163 beqz a2,.L_bn_mul_add_words_return | |
164 | |
165 ld t0,8(a1) | |
166 dmultu t0,a3 | |
167 ld t1,8(a0) | |
168 subu a2,1 | |
169 daddu t1,v0 | |
170 sltu v0,t1,v0 | |
171 mflo AT | |
172 mfhi t0 | |
173 daddu t1,AT | |
174 daddu v0,t0 | |
175 sltu AT,t1,AT | |
176 sd t1,8(a0) | |
177 daddu v0,AT | |
178 beqz a2,.L_bn_mul_add_words_return | |
179 /* third and last possible tail word: no further count check */ | |
180 ld t0,16(a1) | |
181 dmultu t0,a3 | |
182 ld t1,16(a0) | |
183 daddu t1,v0 | |
184 sltu v0,t1,v0 | |
185 mflo AT | |
186 mfhi t0 | |
187 daddu t1,AT | |
188 daddu v0,t0 | |
189 sltu AT,t1,AT | |
190 sd t1,16(a0) | |
191 daddu v0,AT | |
192 jr ra | |
193 END(bn_mul_add_words) | |
194 | |
195 .align 5 | |
196 LEAF(bn_mul_words) /* a0 = result words (written only), a1 = source words, a2 = word count, a3 = multiplier word; each result word becomes lo([a1]*a3) + carry and the final carry is returned in v0. Presumably mirrors bn_mul_words() in crypto/bn/bn_asm.c -- confirm the C prototype there. */ | |
197 .set noreorder | |
198 bgtzl a2,.L_bn_mul_words_proceed /* count > 0: proceed (delay slot runs only when taken) */ | |
199 ld t0,0(a1) /* delay slot: preload the first source word */ | |
200 jr ra /* count <= 0: nothing to do */ | |
201 move v0,zero /* delay slot: return carry 0 */ | |
202 .set reorder | |
203 | |
204 .L_bn_mul_words_proceed: | |
205 li MINUS4,-4 | |
206 and ta0,a2,MINUS4 /* ta0 = count rounded down to a multiple of 4 */ | |
207 move v0,zero /* running carry = 0 */ | |
208 beqz ta0,.L_bn_mul_words_tail /* fewer than 4 words: only the tail runs */ | |
209 /* Main loop, unrolled 4x. Per word: v0 += lo(product); store v0; v0 = hi(product) + carry-out. */ | |
210 .L_bn_mul_words_loop: | |
211 dmultu t0,a3 | |
212 ld t2,8(a1) | |
213 ld ta0,16(a1) | |
214 ld ta2,24(a1) | |
215 mflo AT | |
216 mfhi t0 | |
217 daddu v0,AT /* carry + low half of the product */ | |
218 sltu t1,v0,AT /* carry-out of that addition */ | |
219 sd v0,0(a0) | |
220 daddu v0,t1,t0 /* next carry = high half + carry-out */ | |
221 /* second word; count and pointers are advanced here */ | |
222 dmultu t2,a3 | |
223 subu a2,4 /* four words are consumed per iteration */ | |
224 PTR_ADD a0,32 /* advance by 4 words; the remaining stores use negative offsets */ | |
225 PTR_ADD a1,32 | |
226 mflo AT | |
227 mfhi t2 | |
228 daddu v0,AT | |
229 sltu t3,v0,AT | |
230 sd v0,-24(a0) | |
231 daddu v0,t3,t2 | |
232 /* third word */ | |
233 dmultu ta0,a3 | |
234 mflo AT | |
235 mfhi ta0 | |
236 daddu v0,AT | |
237 sltu ta1,v0,AT | |
238 sd v0,-16(a0) | |
239 daddu v0,ta1,ta0 | |
240 | |
241 /* fourth word */ | |
242 dmultu ta2,a3 | |
243 and ta0,a2,MINUS4 /* non-zero while another full group of 4 remains */ | |
244 mflo AT | |
245 mfhi ta2 | |
246 daddu v0,AT | |
247 sltu ta3,v0,AT | |
248 sd v0,-8(a0) | |
249 daddu v0,ta3,ta2 | |
250 .set noreorder | |
251 bgtzl ta0,.L_bn_mul_words_loop | |
252 ld t0,0(a1) /* delay slot (only if taken): preload the next source word */ | |
253 | |
254 bnezl a2,.L_bn_mul_words_tail /* some words left (fewer than 4) */ | |
255 ld t0,0(a1) /* delay slot (only if taken): preload for the tail */ | |
256 .set reorder | |
257 | |
258 .L_bn_mul_words_return: | |
259 jr ra | |
260 /* Tail: up to three remaining words; the first one uses the preloaded t0. */ | |
261 .L_bn_mul_words_tail: | |
262 dmultu t0,a3 | |
263 subu a2,1 | |
264 mflo AT | |
265 mfhi t0 | |
266 daddu v0,AT | |
267 sltu t1,v0,AT | |
268 sd v0,0(a0) | |
269 daddu v0,t1,t0 | |
270 beqz a2,.L_bn_mul_words_return | |
271 | |
272 ld t0,8(a1) | |
273 dmultu t0,a3 | |
274 subu a2,1 | |
275 mflo AT | |
276 mfhi t0 | |
277 daddu v0,AT | |
278 sltu t1,v0,AT | |
279 sd v0,8(a0) | |
280 daddu v0,t1,t0 | |
281 beqz a2,.L_bn_mul_words_return | |
282 /* third and last possible tail word: no further count check */ | |
283 ld t0,16(a1) | |
284 dmultu t0,a3 | |
285 mflo AT | |
286 mfhi t0 | |
287 daddu v0,AT | |
288 sltu t1,v0,AT | |
289 sd v0,16(a0) | |
290 daddu v0,t1,t0 | |
291 jr ra | |
292 END(bn_mul_words) | |
293 | |
294 .align 5 | |
295 LEAF(bn_sqr_words) /* a0 = result words (two written per source word), a1 = source words, a2 = source word count; stores lo and hi of each [a1]^2 and returns 0 in v0. Presumably mirrors bn_sqr_words() in crypto/bn/bn_asm.c -- confirm the C prototype there. */ | |
296 .set noreorder | |
297 bgtzl a2,.L_bn_sqr_words_proceed /* count > 0: proceed (delay slot runs only when taken) */ | |
298 ld t0,0(a1) /* delay slot: preload the first source word */ | |
299 jr ra /* count <= 0: nothing to do */ | |
300 move v0,zero /* delay slot: return 0 */ | |
301 .set reorder | |
302 | |
303 .L_bn_sqr_words_proceed: | |
304 li MINUS4,-4 | |
305 and ta0,a2,MINUS4 /* ta0 = count rounded down to a multiple of 4 */ | |
306 move v0,zero /* v0 is not written again except at .L_bn_sqr_words_return */ | |
307 beqz ta0,.L_bn_sqr_words_tail /* fewer than 4 words: only the tail runs */ | |
308 /* Main loop, unrolled 4x: four source words in, eight result words out. */ | |
309 .L_bn_sqr_words_loop: | |
310 dmultu t0,t0 | |
311 ld t2,8(a1) | |
312 ld ta0,16(a1) | |
313 ld ta2,24(a1) | |
314 mflo t1 | |
315 mfhi t0 | |
316 sd t1,0(a0) /* low half of the square */ | |
317 sd t0,8(a0) /* high half of the square */ | |
318 | |
319 dmultu t2,t2 | |
320 subu a2,4 /* four source words are consumed per iteration */ | |
321 PTR_ADD a0,64 /* result advances by 8 words; the remaining stores use negative offsets */ | |
322 PTR_ADD a1,32 | |
323 mflo t3 | |
324 mfhi t2 | |
325 sd t3,-48(a0) | |
326 sd t2,-40(a0) | |
327 | |
328 dmultu ta0,ta0 | |
329 mflo ta1 | |
330 mfhi ta0 | |
331 sd ta1,-32(a0) | |
332 sd ta0,-24(a0) | |
333 | |
334 | |
335 dmultu ta2,ta2 | |
336 and ta0,a2,MINUS4 /* non-zero while another full group of 4 remains */ | |
337 mflo ta3 | |
338 mfhi ta2 | |
339 sd ta3,-16(a0) | |
340 sd ta2,-8(a0) | |
341 | |
342 .set noreorder | |
343 bgtzl ta0,.L_bn_sqr_words_loop | |
344 ld t0,0(a1) /* delay slot (only if taken): preload the next source word */ | |
345 | |
346 bnezl a2,.L_bn_sqr_words_tail /* some words left (fewer than 4) */ | |
347 ld t0,0(a1) /* delay slot (only if taken): preload for the tail */ | |
348 .set reorder | |
349 | |
350 .L_bn_sqr_words_return: | |
351 move v0,zero | |
352 jr ra | |
353 /* Tail: up to three remaining source words; the first one uses the preloaded t0. */ | |
354 .L_bn_sqr_words_tail: | |
355 dmultu t0,t0 | |
356 subu a2,1 | |
357 mflo t1 | |
358 mfhi t0 | |
359 sd t1,0(a0) | |
360 sd t0,8(a0) | |
361 beqz a2,.L_bn_sqr_words_return | |
362 | |
363 ld t0,8(a1) | |
364 dmultu t0,t0 | |
365 subu a2,1 | |
366 mflo t1 | |
367 mfhi t0 | |
368 sd t1,16(a0) | |
369 sd t0,24(a0) | |
370 beqz a2,.L_bn_sqr_words_return | |
371 /* third and last possible tail word: no further count check */ | |
372 ld t0,16(a1) | |
373 dmultu t0,t0 | |
374 mflo t1 | |
375 mfhi t0 | |
376 sd t1,32(a0) | |
377 sd t0,40(a0) | |
378 jr ra /* v0 still holds the zero written at .L_bn_sqr_words_proceed */ | |
379 END(bn_sqr_words) | |
380 | |
381 .align 5 | |
382 LEAF(bn_add_words) /* a0 = result words, a1 and a2 = the two source word arrays, a3 = word count; each result word is [a1] + [a2] + carry and the final carry is returned in v0. Presumably mirrors bn_add_words() in crypto/bn/bn_asm.c -- confirm the C prototype there. */ | |
383 .set noreorder | |
384 bgtzl a3,.L_bn_add_words_proceed /* count > 0: proceed (delay slot runs only when taken) */ | |
385 ld t0,0(a1) /* delay slot: preload the first word from a1 */ | |
386 jr ra /* count <= 0: nothing to do */ | |
387 move v0,zero /* delay slot: return carry 0 */ | |
388 .set reorder | |
389 | |
390 .L_bn_add_words_proceed: | |
391 li MINUS4,-4 | |
392 and AT,a3,MINUS4 /* AT = count rounded down to a multiple of 4 */ | |
393 move v0,zero /* running carry = 0 */ | |
394 beqz AT,.L_bn_add_words_tail /* fewer than 4 words: only the tail runs */ | |
395 /* Main loop, unrolled 4x; t8/t9 hold the carry-out of the first addition of each word. */ | |
396 .L_bn_add_words_loop: | |
397 ld ta0,0(a2) | |
398 subu a3,4 /* four words are consumed per iteration */ | |
399 ld t1,8(a1) | |
400 and AT,a3,MINUS4 /* loop condition for the bgtzl at the bottom */ | |
401 ld t2,16(a1) | |
402 PTR_ADD a2,32 /* pointers advance early; later accesses use negative offsets */ | |
403 ld t3,24(a1) | |
404 PTR_ADD a0,32 | |
405 ld ta1,-24(a2) | |
406 PTR_ADD a1,32 | |
407 ld ta2,-16(a2) | |
408 ld ta3,-8(a2) | |
409 daddu ta0,t0 /* ta0 = [a2] word + [a1] word */ | |
410 sltu t8,ta0,t0 /* t8 = carry-out of that sum */ | |
411 daddu t0,ta0,v0 /* add the incoming carry */ | |
412 sltu v0,t0,ta0 /* carry-out of the carry addition */ | |
413 sd t0,-32(a0) | |
414 daddu v0,t8 /* combine both carry-outs into the next carry */ | |
415 | |
416 daddu ta1,t1 | |
417 sltu t9,ta1,t1 | |
418 daddu t1,ta1,v0 | |
419 sltu v0,t1,ta1 | |
420 sd t1,-24(a0) | |
421 daddu v0,t9 | |
422 | |
423 daddu ta2,t2 | |
424 sltu t8,ta2,t2 | |
425 daddu t2,ta2,v0 | |
426 sltu v0,t2,ta2 | |
427 sd t2,-16(a0) | |
428 daddu v0,t8 | |
429 | |
430 daddu ta3,t3 | |
431 sltu t9,ta3,t3 | |
432 daddu t3,ta3,v0 | |
433 sltu v0,t3,ta3 | |
434 sd t3,-8(a0) | |
435 daddu v0,t9 | |
436 | |
437 .set noreorder | |
438 bgtzl AT,.L_bn_add_words_loop | |
439 ld t0,0(a1) /* delay slot (only if taken): preload the next word from a1 */ | |
440 | |
441 bnezl a3,.L_bn_add_words_tail /* some words left (fewer than 4) */ | |
442 ld t0,0(a1) /* delay slot (only if taken): preload for the tail */ | |
443 .set reorder | |
444 | |
445 .L_bn_add_words_return: | |
446 jr ra | |
447 /* Tail: up to three remaining words; the first one uses the preloaded t0. */ | |
448 .L_bn_add_words_tail: | |
449 ld ta0,0(a2) | |
450 daddu ta0,t0 | |
451 subu a3,1 | |
452 sltu t8,ta0,t0 | |
453 daddu t0,ta0,v0 | |
454 sltu v0,t0,ta0 | |
455 sd t0,0(a0) | |
456 daddu v0,t8 | |
457 beqz a3,.L_bn_add_words_return | |
458 | |
459 ld t1,8(a1) | |
460 ld ta1,8(a2) | |
461 daddu ta1,t1 | |
462 subu a3,1 | |
463 sltu t9,ta1,t1 | |
464 daddu t1,ta1,v0 | |
465 sltu v0,t1,ta1 | |
466 sd t1,8(a0) | |
467 daddu v0,t9 | |
468 beqz a3,.L_bn_add_words_return | |
469 /* third and last possible tail word: no further count check */ | |
470 ld t2,16(a1) | |
471 ld ta2,16(a2) | |
472 daddu ta2,t2 | |
473 sltu t8,ta2,t2 | |
474 daddu t2,ta2,v0 | |
475 sltu v0,t2,ta2 | |
476 sd t2,16(a0) | |
477 daddu v0,t8 | |
478 jr ra | |
479 END(bn_add_words) | |
480 | |
481 .align 5 | |
482 LEAF(bn_sub_words) /* a0 = result words, a1 = minuend words, a2 = subtrahend words, a3 = word count; each result word is [a1] - [a2] - borrow and the final borrow is returned in v0. Presumably mirrors bn_sub_words() in crypto/bn/bn_asm.c -- confirm the C prototype there. */ | |
483 .set noreorder | |
484 bgtzl a3,.L_bn_sub_words_proceed /* count > 0: proceed (delay slot runs only when taken) */ | |
485 ld t0,0(a1) /* delay slot: preload the first word from a1 */ | |
486 jr ra /* count <= 0: nothing to do */ | |
487 move v0,zero /* delay slot: return borrow 0 */ | |
488 .set reorder | |
489 | |
490 .L_bn_sub_words_proceed: | |
491 li MINUS4,-4 | |
492 and AT,a3,MINUS4 /* AT = count rounded down to a multiple of 4 */ | |
493 move v0,zero /* running borrow = 0 */ | |
494 beqz AT,.L_bn_sub_words_tail /* fewer than 4 words: only the tail runs */ | |
495 /* Main loop, unrolled 4x; t8/t9 hold the borrow of the plain [a1] - [a2] difference of each word. */ | |
496 .L_bn_sub_words_loop: | |
497 ld ta0,0(a2) | |
498 subu a3,4 /* four words are consumed per iteration */ | |
499 ld t1,8(a1) | |
500 and AT,a3,MINUS4 /* loop condition for the bgtzl at the bottom */ | |
501 ld t2,16(a1) | |
502 PTR_ADD a2,32 /* pointers advance early; later accesses use negative offsets */ | |
503 ld t3,24(a1) | |
504 PTR_ADD a0,32 | |
505 ld ta1,-24(a2) | |
506 PTR_ADD a1,32 | |
507 ld ta2,-16(a2) | |
508 ld ta3,-8(a2) | |
509 sltu t8,t0,ta0 /* t8 = 1 when [a1] word < [a2] word */ | |
510 dsubu t0,ta0 /* t0 = plain difference */ | |
511 dsubu ta0,t0,v0 /* subtract the incoming borrow */ | |
512 sd ta0,-32(a0) | |
513 MOVNZ (t0,v0,t8) /* difference != 0: new borrow is t8; difference == 0: the incoming borrow carries over */ | |
514 | |
515 sltu t9,t1,ta1 | |
516 dsubu t1,ta1 | |
517 dsubu ta1,t1,v0 | |
518 sd ta1,-24(a0) | |
519 MOVNZ (t1,v0,t9) | |
520 | |
521 | |
522 sltu t8,t2,ta2 | |
523 dsubu t2,ta2 | |
524 dsubu ta2,t2,v0 | |
525 sd ta2,-16(a0) | |
526 MOVNZ (t2,v0,t8) | |
527 | |
528 sltu t9,t3,ta3 | |
529 dsubu t3,ta3 | |
530 dsubu ta3,t3,v0 | |
531 sd ta3,-8(a0) | |
532 MOVNZ (t3,v0,t9) | |
533 | |
534 .set noreorder | |
535 bgtzl AT,.L_bn_sub_words_loop | |
536 ld t0,0(a1) /* delay slot (only if taken): preload the next word from a1 */ | |
537 | |
538 bnezl a3,.L_bn_sub_words_tail /* some words left (fewer than 4) */ | |
539 ld t0,0(a1) /* delay slot (only if taken): preload for the tail */ | |
540 .set reorder | |
541 | |
542 .L_bn_sub_words_return: | |
543 jr ra | |
544 /* Tail: up to three remaining words; the first one uses the preloaded t0. */ | |
545 .L_bn_sub_words_tail: | |
546 ld ta0,0(a2) | |
547 subu a3,1 | |
548 sltu t8,t0,ta0 | |
549 dsubu t0,ta0 | |
550 dsubu ta0,t0,v0 | |
551 MOVNZ (t0,v0,t8) | |
552 sd ta0,0(a0) | |
553 beqz a3,.L_bn_sub_words_return | |
554 | |
555 ld t1,8(a1) | |
556 subu a3,1 | |
557 ld ta1,8(a2) | |
558 sltu t9,t1,ta1 | |
559 dsubu t1,ta1 | |
560 dsubu ta1,t1,v0 | |
561 MOVNZ (t1,v0,t9) | |
562 sd ta1,8(a0) | |
563 beqz a3,.L_bn_sub_words_return | |
564 /* third and last possible tail word: no further count check */ | |
565 ld t2,16(a1) | |
566 ld ta2,16(a2) | |
567 sltu t8,t2,ta2 | |
568 dsubu t2,ta2 | |
569 dsubu ta2,t2,v0 | |
570 MOVNZ (t2,v0,t8) | |
571 sd ta2,16(a0) | |
572 jr ra | |
573 END(bn_sub_words) | |
574 | |
575 #undef MINUS4 | |
576 | |
577 .align 5 | |
578 LEAF(bn_div_3_words) /* a0 = pointer to the top of three consecutive words (read at offsets 0, -8 and -16), a1 and a2 = two divisor words; returns a quotient word in v0, or all ones when the word at offset 0 equals a2. Presumably mirrors bn_div_3_words() in OpenSSL's BN division code -- confirm the C prototype there. */ | |
579 .set reorder | |
580 move a3,a0 /* we know that bn_div_words doesn't | |
581 * touch a3, ta2, ta3 and preserves a2 | |
582 * so that we can save two arguments | |
583 * and return address in registers | |
584 * instead of stack:-) | |
585 */ | |
586 ld a0,(a3) /* a0:a1 = top two words, the dividend handed to bn_div_words */ | |
587 move ta2,a1 /* keep the caller's a1 (second divisor word) in ta2 */ | |
588 ld a1,-8(a3) | |
589 bne a0,a2,.L_bn_div_3_words_proceed | |
590 li v0,-1 /* top word equals a2: return all ones without dividing */ | |
591 jr ra | |
592 .L_bn_div_3_words_proceed: | |
593 move ta3,ra /* save the return address in a register across the call */ | |
594 bal bn_div_words /* v0 = quotient of a0:a1 by a2, v1 = remainder */ | |
595 move ra,ta3 | |
596 dmultu ta2,v0 /* t1:t0 = saved a1 * quotient estimate */ | |
597 ld t2,-16(a3) /* third (lowest) word */ | |
598 move ta0,zero | |
599 mfhi t1 | |
600 mflo t0 | |
601 sltu t8,t1,v1 /* t8 = 1 when hi(product) < remainder: estimate accepted */ | |
602 .L_bn_div_3_words_inner_loop: | |
603 bnez t8,.L_bn_div_3_words_inner_loop_done | |
604 sgeu AT,t2,t0 /* lowest word >= lo(product)? */ | |
605 seq t9,t1,v1 /* hi(product) == remainder? */ | |
606 and AT,t9 /* AT != 0: product does not exceed remainder:lowest word, so stop */ | |
607 sltu t3,t0,ta2 /* borrow for the 128-bit subtraction below */ | |
608 daddu v1,a2 /* remainder += a2 (matches one decrement of the quotient) */ | |
609 dsubu t1,t3 | |
610 dsubu t0,ta2 /* product -= saved a1 */ | |
611 sltu t8,t1,v1 | |
612 sltu ta0,v1,a2 /* ta0 = 1 when the remainder addition wrapped */ | |
613 or t8,ta0 /* either condition ends the loop at the next check */ | |
614 .set noreorder | |
615 beqzl AT,.L_bn_div_3_words_inner_loop | |
616 dsubu v0,1 /* delay slot (only when looping): quotient estimate -= 1 */ | |
617 .set reorder | |
618 .L_bn_div_3_words_inner_loop_done: | |
619 jr ra | |
620 END(bn_div_3_words) | |
621 | |
622 .align 5 | |
623 LEAF(bn_div_words) /* divides the two-word value a0:a1 (a0 = high word) by a2; quotient in v0, remainder in v1, a2 restored on exit; returns v0 = -1 when a2 == 0. bn_div_3_words relies on this routine not touching a3, ta2 and ta3. Presumably mirrors bn_div_words() in crypto/bn/bn_asm.c -- confirm the C prototype there. */ | |
624 .set noreorder | |
625 bnezl a2,.L_bn_div_words_proceed /* divisor != 0: proceed (delay slot runs only when taken) */ | |
626 move v1,zero /* delay slot: v1 = 0, copied into t9 below */ | |
627 jr ra | |
628 li v0,-1 /* I'd rather signal div-by-zero | |
629 * which can be done with 'break 7' */ | |
630 | |
631 .L_bn_div_words_proceed: | |
632 bltz a2,.L_bn_div_words_body /* top bit already set: no normalization needed */ | |
633 move t9,v1 /* delay slot (always runs): t9 = shift count = 0 */ | |
634 dsll a2,1 /* normalize: shift the divisor left ... */ | |
635 bgtz a2,.-4 /* ... repeating the dsll while its top bit is still clear */ | |
636 addu t9,1 /* delay slot (always runs): count the shift */ | |
637 | |
638 .set reorder | |
639 negu t1,t9 /* used as shift amount 64 - t9; relies on variable shifts using only the low 6 bits -- confirm against the ISA manual */ | |
640 li t2,-1 | |
641 dsll t2,t1 | |
642 and t2,a0 /* t2 = the top t9 bits of a0, which the shift below would lose */ | |
643 dsrl AT,a1,t1 /* AT = the top t9 bits of a1, which move into a0 */ | |
644 .set noreorder | |
645 bnezl t2,.+8 | |
646 break 6 /* signal overflow */ | |
647 .set reorder | |
648 dsll a0,t9 /* shift the dividend a0:a1 left by the same count as the divisor */ | |
649 dsll a1,t9 | |
650 or a0,AT | |
651 | |
652 #define QT ta0 | |
653 #define HH ta1 | |
654 #define DH v1 | |
655 .L_bn_div_words_body: | |
656 dsrl DH,a2,32 /* DH = upper 32 bits of the divisor */ | |
657 sgeu AT,a0,a2 | |
658 .set noreorder | |
659 bnezl AT,.+8 | |
660 dsubu a0,a2 /* delay slot (only when a0 >= a2): a0 -= a2 */ | |
661 .set reorder | |
662 /* First round: estimate the upper 32 bits of the quotient. */ | |
663 li QT,-1 | |
664 dsrl HH,a0,32 /* HH = upper 32 bits of a0 */ | |
665 dsrl QT,32 /* q=0xffffffff */ | |
666 beq DH,HH,.L_bn_div_words_skip_div1 /* equal upper halves: keep q = 0xffffffff */ | |
667 ddivu zero,a0,DH | |
668 mflo QT /* q = a0 / DH */ | |
669 .L_bn_div_words_skip_div1: | |
670 dmultu a2,QT /* t1:t0 = divisor * q */ | |
671 dsll t3,a0,32 | |
672 dsrl AT,a1,32 | |
673 or t3,AT /* t3 = low 32 bits of a0 followed by high 32 bits of a1 */ | |
674 mflo t0 | |
675 mfhi t1 | |
676 .L_bn_div_words_inner_loop1: | |
677 sltu t2,t3,t0 | |
678 seq t8,HH,t1 | |
679 sltu AT,HH,t1 | |
680 and t2,t8 | |
681 sltu v0,t0,a2 /* borrow for the 128-bit subtraction below */ | |
682 or AT,t2 /* AT != 0 when t1:t0 > HH:t3, i.e. q is still too large */ | |
683 .set noreorder | |
684 beqz AT,.L_bn_div_words_inner_loop1_done | |
685 dsubu t1,v0 /* delay slot (always runs; t1 is not read after the loop exits) */ | |
686 dsubu t0,a2 /* product -= divisor */ | |
687 b .L_bn_div_words_inner_loop1 | |
688 dsubu QT,1 /* delay slot: q -= 1 */ | |
689 .set reorder | |
690 .L_bn_div_words_inner_loop1_done: | |
691 | |
692 dsll a1,32 /* move the low 32 bits of a1 up for the second round */ | |
693 dsubu a0,t3,t0 /* partial remainder after the first round */ | |
694 dsll v0,QT,32 /* first estimate becomes the upper half of the quotient */ | |
695 /* Second round: same steps for the lower 32 bits of the quotient. */ | |
696 li QT,-1 | |
697 dsrl HH,a0,32 | |
698 dsrl QT,32 /* q=0xffffffff */ | |
699 beq DH,HH,.L_bn_div_words_skip_div2 | |
700 ddivu zero,a0,DH | |
701 mflo QT | |
702 .L_bn_div_words_skip_div2: | |
703 #undef DH | |
704 dmultu a2,QT | |
705 dsll t3,a0,32 | |
706 dsrl AT,a1,32 | |
707 or t3,AT | |
708 mflo t0 | |
709 mfhi t1 | |
710 .L_bn_div_words_inner_loop2: | |
711 sltu t2,t3,t0 | |
712 seq t8,HH,t1 | |
713 sltu AT,HH,t1 | |
714 and t2,t8 | |
715 sltu v1,t0,a2 /* v1 (no longer DH) holds the borrow in this round */ | |
716 or AT,t2 | |
717 .set noreorder | |
718 beqz AT,.L_bn_div_words_inner_loop2_done | |
719 dsubu t1,v1 /* delay slot (always runs) */ | |
720 dsubu t0,a2 | |
721 b .L_bn_div_words_inner_loop2 | |
722 dsubu QT,1 /* delay slot: q -= 1 */ | |
723 .set reorder | |
724 .L_bn_div_words_inner_loop2_done: | |
725 #undef HH | |
726 | |
727 dsubu a0,t3,t0 /* final remainder, still shifted left by t9 */ | |
728 or v0,QT /* combine the upper and lower quotient halves */ | |
729 dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ | |
730 dsrl a2,t9 /* restore a2 */ | |
731 jr ra | |
732 #undef QT | |
733 END(bn_div_words) | |
734 | |
735 #define a_0 t0 /* a_N / b_N: registers that bn_mul_comba8 loads with the words at offset 8*N from a1 / a2 */ | |
736 #define a_1 t1 | |
737 #define a_2 t2 | |
738 #define a_3 t3 | |
739 #define b_0 ta0 | |
740 #define b_1 ta1 | |
741 #define b_2 ta2 | |
742 #define b_3 ta3 | |
743 /* The aliases below map onto s0-s5, which bn_mul_comba8 saves on its stack frame before use. */ | |
744 #define a_4 s0 | |
745 #define a_5 s2 | |
746 #define a_6 s4 | |
747 #define a_7 a1 /* once we load a[7] we don't need a anymore */ | |
748 #define b_4 s1 | |
749 #define b_5 s3 | |
750 #define b_6 s5 | |
751 #define b_7 a2 /* once we load b[7] we don't need b anymore */ | |
752 | |
753 #define t_1 t8 /* t_1 / t_2 receive the low / high half of each product (mflo / mfhi) */ | |
754 #define t_2 t9 | |
755 | |
756 #define c_1 v0 /* c_1..c_3: the three accumulator words named in the mul_add_c() comments */ | |
757 #define c_2 v1 | |
758 #define c_3 a3 | |
759 | |
760 #define FRAME_SIZE 48 /* six 8-byte slots, matching the sd of s0..s5 at 0..40(sp) */ | |
761 | |
762 .align 5 | |
763 LEAF(bn_mul_comba8) | |
764 .set noreorder | |
765 PTR_SUB sp,FRAME_SIZE | |
766 .frame sp,64,ra | |
767 .set reorder | |
768 ld a_0,0(a1) /* If compiled with -mips3 option on | |
769 * R5000 box assembler barks on this | |
770 * line with "shouldn't have mult/div | |
771 * as last instruction in bb (R10K | |
772 * bug)" warning. If anybody out there | |
773 * has a clue about how to circumvent | |
774 * this do send me a note. | |
775 * <appro@fy.chalmers.se> | |
776 */ | |
777 ld b_0,0(a2) | |
778 ld a_1,8(a1) | |
779 ld a_2,16(a1) | |
780 ld a_3,24(a1) | |
781 ld b_1,8(a2) | |
782 ld b_2,16(a2) | |
783 ld b_3,24(a2) | |
784 dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ | |
785 sd s0,0(sp) | |
786 sd s1,8(sp) | |
787 sd s2,16(sp) | |
788 sd s3,24(sp) | |
789 sd s4,32(sp) | |
790 sd s5,40(sp) | |
791 mflo c_1 | |
792 mfhi c_2 | |
793 | |
794 dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ | |
795 ld a_4,32(a1) | |
796 ld a_5,40(a1) | |
797 ld a_6,48(a1) | |
798 ld a_7,56(a1) | |
799 ld b_4,32(a2) | |
800 ld b_5,40(a2) | |
801 mflo t_1 | |
802 mfhi t_2 | |
803 daddu c_2,t_1 | |
804 sltu AT,c_2,t_1 | |
805 daddu c_3,t_2,AT | |
806 dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ | |
807 ld b_6,48(a2) | |
808 ld b_7,56(a2) | |
809 sd c_1,0(a0) /* r[0]=c1; */ | |
810 mflo t_1 | |
811 mfhi t_2 | |
812 daddu c_2,t_1 | |
813 sltu AT,c_2,t_1 | |
814 daddu t_2,AT | |
815 daddu c_3,t_2 | |
816 sltu c_1,c_3,t_2 | |
817 sd c_2,8(a0) /* r[1]=c2; */ | |
818 | |
819 dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ | |
820 mflo t_1 | |
821 mfhi t_2 | |
822 daddu c_3,t_1 | |
823 sltu AT,c_3,t_1 | |
824 daddu t_2,AT | |
825 daddu c_1,t_2 | |
826 dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ | |
827 mflo t_1 | |
828 mfhi t_2 | |
829 daddu c_3,t_1 | |
830 sltu AT,c_3,t_1 | |
831 daddu t_2,AT | |
832 daddu c_1,t_2 | |
833 sltu c_2,c_1,t_2 | |
834 dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ | |
835 mflo t_1 | |
836 mfhi t_2 | |
837 daddu c_3,t_1 | |
838 sltu AT,c_3,t_1 | |
839 daddu t_2,AT | |
840 daddu c_1,t_2 | |
841 sltu AT,c_1,t_2 | |
842 daddu c_2,AT | |
843 sd c_3,16(a0) /* r[2]=c3; */ | |
844 | |
845 dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ | |
846 mflo t_1 | |
847 mfhi t_2 | |
848 daddu c_1,t_1 | |
849 sltu AT,c_1,t_1 | |
850 daddu t_2,AT | |
851 daddu c_2,t_2 | |
852 sltu c_3,c_2,t_2 | |
853 dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ | |
854 mflo t_1 | |
855 mfhi t_2 | |
856 daddu c_1,t_1 | |
857 sltu AT,c_1,t_1 | |
858 daddu t_2,AT | |
859 daddu c_2,t_2 | |
860 sltu AT,c_2,t_2 | |
861 daddu c_3,AT | |
862 dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ | |
863 mflo t_1 | |
864 mfhi t_2 | |
865 daddu c_1,t_1 | |
866 sltu AT,c_1,t_1 | |
867 daddu t_2,AT | |
868 daddu c_2,t_2 | |
869 sltu AT,c_2,t_2 | |
870 daddu c_3,AT | |
871 dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ | |
872 mflo t_1 | |
873 mfhi t_2 | |
874 daddu c_1,t_1 | |
875 sltu AT,c_1,t_1 | |
876 daddu t_2,AT | |
877 daddu c_2,t_2 | |
878 sltu AT,c_2,t_2 | |
879 daddu c_3,AT | |
880 sd c_1,24(a0) /* r[3]=c1; */ | |
881 | |
882 dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ | |
883 mflo t_1 | |
884 mfhi t_2 | |
885 daddu c_2,t_1 | |
886 sltu AT,c_2,t_1 | |
887 daddu t_2,AT | |
888 daddu c_3,t_2 | |
889 sltu c_1,c_3,t_2 | |
890 dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ | |
891 mflo t_1 | |
892 mfhi t_2 | |
893 daddu c_2,t_1 | |
894 sltu AT,c_2,t_1 | |
895 daddu t_2,AT | |
896 daddu c_3,t_2 | |
897 sltu AT,c_3,t_2 | |
898 daddu c_1,AT | |
899 dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ | |
900 mflo t_1 | |
901 mfhi t_2 | |
902 daddu c_2,t_1 | |
903 sltu AT,c_2,t_1 | |
904 daddu t_2,AT | |
905 daddu c_3,t_2 | |
906 sltu AT,c_3,t_2 | |
907 daddu c_1,AT | |
908 dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ | |
909 mflo t_1 | |
910 mfhi t_2 | |
911 daddu c_2,t_1 | |
912 sltu AT,c_2,t_1 | |
913 daddu t_2,AT | |
914 daddu c_3,t_2 | |
915 sltu AT,c_3,t_2 | |
916 daddu c_1,AT | |
917 dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ | |
918 mflo t_1 | |
919 mfhi t_2 | |
920 daddu c_2,t_1 | |
921 sltu AT,c_2,t_1 | |
922 daddu t_2,AT | |
923 daddu c_3,t_2 | |
924 sltu AT,c_3,t_2 | |
925 daddu c_1,AT | |
926 sd c_2,32(a0) /* r[4]=c2; */ | |
927 | |
928 dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ | |
929 mflo t_1 | |
930 mfhi t_2 | |
931 daddu c_3,t_1 | |
932 sltu AT,c_3,t_1 | |
933 daddu t_2,AT | |
934 daddu c_1,t_2 | |
935 sltu c_2,c_1,t_2 | |
936 dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ | |
937 mflo t_1 | |
938 mfhi t_2 | |
939 daddu c_3,t_1 | |
940 sltu AT,c_3,t_1 | |
941 daddu t_2,AT | |
942 daddu c_1,t_2 | |
943 sltu AT,c_1,t_2 | |
944 daddu c_2,AT | |
945 dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ | |
946 mflo t_1 | |
947 mfhi t_2 | |
948 daddu c_3,t_1 | |
949 sltu AT,c_3,t_1 | |
950 daddu t_2,AT | |
951 daddu c_1,t_2 | |
952 sltu AT,c_1,t_2 | |
953 daddu c_2,AT | |
954 dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ | |
955 mflo t_1 | |
956 mfhi t_2 | |
957 daddu c_3,t_1 | |
958 sltu AT,c_3,t_1 | |
959 daddu t_2,AT | |
960 daddu c_1,t_2 | |
961 sltu AT,c_1,t_2 | |
962 daddu c_2,AT | |
963 dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ | |
964 mflo t_1 | |
965 mfhi t_2 | |
966 daddu c_3,t_1 | |
967 sltu AT,c_3,t_1 | |
968 daddu t_2,AT | |
969 daddu c_1,t_2 | |
970 sltu AT,c_1,t_2 | |
971 daddu c_2,AT | |
972 dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ | |
973 mflo t_1 | |
974 mfhi t_2 | |
975 daddu c_3,t_1 | |
976 sltu AT,c_3,t_1 | |
977 daddu t_2,AT | |
978 daddu c_1,t_2 | |
979 sltu AT,c_1,t_2 | |
980 daddu c_2,AT | |
981 sd c_3,40(a0) /* r[5]=c3; */ | |
982 | |
983 dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ | |
984 mflo t_1 | |
985 mfhi t_2 | |
986 daddu c_1,t_1 | |
987 sltu AT,c_1,t_1 | |
988 daddu t_2,AT | |
989 daddu c_2,t_2 | |
990 sltu c_3,c_2,t_2 | |
991 dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ | |
992 mflo t_1 | |
993 mfhi t_2 | |
994 daddu c_1,t_1 | |
995 sltu AT,c_1,t_1 | |
996 daddu t_2,AT | |
997 daddu c_2,t_2 | |
998 sltu AT,c_2,t_2 | |
999 daddu c_3,AT | |
1000 dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ | |
1001 mflo t_1 | |
1002 mfhi t_2 | |
1003 daddu c_1,t_1 | |
1004 sltu AT,c_1,t_1 | |
1005 daddu t_2,AT | |
1006 daddu c_2,t_2 | |
1007 sltu AT,c_2,t_2 | |
1008 daddu c_3,AT | |
1009 dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ | |
1010 mflo t_1 | |
1011 mfhi t_2 | |
1012 daddu c_1,t_1 | |
1013 sltu AT,c_1,t_1 | |
1014 daddu t_2,AT | |
1015 daddu c_2,t_2 | |
1016 sltu AT,c_2,t_2 | |
1017 daddu c_3,AT | |
1018 dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ | |
1019 mflo t_1 | |
1020 mfhi t_2 | |
1021 daddu c_1,t_1 | |
1022 sltu AT,c_1,t_1 | |
1023 daddu t_2,AT | |
1024 daddu c_2,t_2 | |
1025 sltu AT,c_2,t_2 | |
1026 daddu c_3,AT | |
1027 dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ | |
1028 mflo t_1 | |
1029 mfhi t_2 | |
1030 daddu c_1,t_1 | |
1031 sltu AT,c_1,t_1 | |
1032 daddu t_2,AT | |
1033 daddu c_2,t_2 | |
1034 sltu AT,c_2,t_2 | |
1035 daddu c_3,AT | |
1036 dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ | |
1037 mflo t_1 | |
1038 mfhi t_2 | |
1039 daddu c_1,t_1 | |
1040 sltu AT,c_1,t_1 | |
1041 daddu t_2,AT | |
1042 daddu c_2,t_2 | |
1043 sltu AT,c_2,t_2 | |
1044 daddu c_3,AT | |
1045 sd c_1,48(a0) /* r[6]=c1; */ | |
1046 | |
1047 dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ | |
1048 mflo t_1 | |
1049 mfhi t_2 | |
1050 daddu c_2,t_1 | |
1051 sltu AT,c_2,t_1 | |
1052 daddu t_2,AT | |
1053 daddu c_3,t_2 | |
1054 sltu c_1,c_3,t_2 | |
1055 dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ | |
1056 mflo t_1 | |
1057 mfhi t_2 | |
1058 daddu c_2,t_1 | |
1059 sltu AT,c_2,t_1 | |
1060 daddu t_2,AT | |
1061 daddu c_3,t_2 | |
1062 sltu AT,c_3,t_2 | |
1063 daddu c_1,AT | |
1064 dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ | |
1065 mflo t_1 | |
1066 mfhi t_2 | |
1067 daddu c_2,t_1 | |
1068 sltu AT,c_2,t_1 | |
1069 daddu t_2,AT | |
1070 daddu c_3,t_2 | |
1071 sltu AT,c_3,t_2 | |
1072 daddu c_1,AT | |
1073 dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ | |
1074 mflo t_1 | |
1075 mfhi t_2 | |
1076 daddu c_2,t_1 | |
1077 sltu AT,c_2,t_1 | |
1078 daddu t_2,AT | |
1079 daddu c_3,t_2 | |
1080 sltu AT,c_3,t_2 | |
1081 daddu c_1,AT | |
1082 dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ | |
1083 mflo t_1 | |
1084 mfhi t_2 | |
1085 daddu c_2,t_1 | |
1086 sltu AT,c_2,t_1 | |
1087 daddu t_2,AT | |
1088 daddu c_3,t_2 | |
1089 sltu AT,c_3,t_2 | |
1090 daddu c_1,AT | |
1091 dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ | |
1092 mflo t_1 | |
1093 mfhi t_2 | |
1094 daddu c_2,t_1 | |
1095 sltu AT,c_2,t_1 | |
1096 daddu t_2,AT | |
1097 daddu c_3,t_2 | |
1098 sltu AT,c_3,t_2 | |
1099 daddu c_1,AT | |
1100 dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ | |
1101 mflo t_1 | |
1102 mfhi t_2 | |
1103 daddu c_2,t_1 | |
1104 sltu AT,c_2,t_1 | |
1105 daddu t_2,AT | |
1106 daddu c_3,t_2 | |
1107 sltu AT,c_3,t_2 | |
1108 daddu c_1,AT | |
1109 dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ | |
1110 mflo t_1 | |
1111 mfhi t_2 | |
1112 daddu c_2,t_1 | |
1113 sltu AT,c_2,t_1 | |
1114 daddu t_2,AT | |
1115 daddu c_3,t_2 | |
1116 sltu AT,c_3,t_2 | |
1117 daddu c_1,AT | |
1118 sd c_2,56(a0) /* r[7]=c2; */ | |
1119 | |
1120 dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ | |
1121 mflo t_1 | |
1122 mfhi t_2 | |
1123 daddu c_3,t_1 | |
1124 sltu AT,c_3,t_1 | |
1125 daddu t_2,AT | |
1126 daddu c_1,t_2 | |
1127 sltu c_2,c_1,t_2 | |
1128 dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ | |
1129 mflo t_1 | |
1130 mfhi t_2 | |
1131 daddu c_3,t_1 | |
1132 sltu AT,c_3,t_1 | |
1133 daddu t_2,AT | |
1134 daddu c_1,t_2 | |
1135 sltu AT,c_1,t_2 | |
1136 daddu c_2,AT | |
1137 dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ | |
1138 mflo t_1 | |
1139 mfhi t_2 | |
1140 daddu c_3,t_1 | |
1141 sltu AT,c_3,t_1 | |
1142 daddu t_2,AT | |
1143 daddu c_1,t_2 | |
1144 sltu AT,c_1,t_2 | |
1145 daddu c_2,AT | |
1146 dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ | |
1147 mflo t_1 | |
1148 mfhi t_2 | |
1149 daddu c_3,t_1 | |
1150 sltu AT,c_3,t_1 | |
1151 daddu t_2,AT | |
1152 daddu c_1,t_2 | |
1153 sltu AT,c_1,t_2 | |
1154 daddu c_2,AT | |
1155 dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ | |
1156 mflo t_1 | |
1157 mfhi t_2 | |
1158 daddu c_3,t_1 | |
1159 sltu AT,c_3,t_1 | |
1160 daddu t_2,AT | |
1161 daddu c_1,t_2 | |
1162 sltu AT,c_1,t_2 | |
1163 daddu c_2,AT | |
1164 dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ | |
1165 mflo t_1 | |
1166 mfhi t_2 | |
1167 daddu c_3,t_1 | |
1168 sltu AT,c_3,t_1 | |
1169 daddu t_2,AT | |
1170 daddu c_1,t_2 | |
1171 sltu AT,c_1,t_2 | |
1172 daddu c_2,AT | |
1173 dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ | |
1174 mflo t_1 | |
1175 mfhi t_2 | |
1176 daddu c_3,t_1 | |
1177 sltu AT,c_3,t_1 | |
1178 daddu t_2,AT | |
1179 daddu c_1,t_2 | |
1180 sltu AT,c_1,t_2 | |
1181 daddu c_2,AT | |
1182 sd c_3,64(a0) /* r[8]=c3; */ | |
1183 | |
1184 dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ | |
1185 mflo t_1 | |
1186 mfhi t_2 | |
1187 daddu c_1,t_1 | |
1188 sltu AT,c_1,t_1 | |
1189 daddu t_2,AT | |
1190 daddu c_2,t_2 | |
1191 sltu c_3,c_2,t_2 | |
1192 dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ | |
1193 mflo t_1 | |
1194 mfhi t_2 | |
1195 daddu c_1,t_1 | |
1196 sltu AT,c_1,t_1 | |
1197 daddu t_2,AT | |
1198 daddu c_2,t_2 | |
1199 sltu AT,c_2,t_2 | |
1200 daddu c_3,AT | |
1201 dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ | |
1202 mflo t_1 | |
1203 mfhi t_2 | |
1204 daddu c_1,t_1 | |
1205 sltu AT,c_1,t_1 | |
1206 daddu t_2,AT | |
1207 daddu c_2,t_2 | |
1208 sltu AT,c_2,t_2 | |
1209 daddu c_3,AT | |
1210 dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ | |
1211 mflo t_1 | |
1212 mfhi t_2 | |
1213 daddu c_1,t_1 | |
1214 sltu AT,c_1,t_1 | |
1215 daddu t_2,AT | |
1216 daddu c_2,t_2 | |
1217 sltu AT,c_2,t_2 | |
1218 daddu c_3,AT | |
1219 dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ | |
1220 mflo t_1 | |
1221 mfhi t_2 | |
1222 daddu c_1,t_1 | |
1223 sltu AT,c_1,t_1 | |
1224 daddu t_2,AT | |
1225 daddu c_2,t_2 | |
1226 sltu AT,c_2,t_2 | |
1227 daddu c_3,AT | |
1228 dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ | |
1229 mflo t_1 | |
1230 mfhi t_2 | |
1231 daddu c_1,t_1 | |
1232 sltu AT,c_1,t_1 | |
1233 daddu t_2,AT | |
1234 daddu c_2,t_2 | |
1235 sltu AT,c_2,t_2 | |
1236 daddu c_3,AT | |
1237 sd c_1,72(a0) /* r[9]=c1; */ | |
1238 | |
1239 dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ | |
1240 mflo t_1 | |
1241 mfhi t_2 | |
1242 daddu c_2,t_1 | |
1243 sltu AT,c_2,t_1 | |
1244 daddu t_2,AT | |
1245 daddu c_3,t_2 | |
1246 sltu c_1,c_3,t_2 | |
1247 dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ | |
1248 mflo t_1 | |
1249 mfhi t_2 | |
1250 daddu c_2,t_1 | |
1251 sltu AT,c_2,t_1 | |
1252 daddu t_2,AT | |
1253 daddu c_3,t_2 | |
1254 sltu AT,c_3,t_2 | |
1255 daddu c_1,AT | |
1256 dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ | |
1257 mflo t_1 | |
1258 mfhi t_2 | |
1259 daddu c_2,t_1 | |
1260 sltu AT,c_2,t_1 | |
1261 daddu t_2,AT | |
1262 daddu c_3,t_2 | |
1263 sltu AT,c_3,t_2 | |
1264 daddu c_1,AT | |
1265 dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ | |
1266 mflo t_1 | |
1267 mfhi t_2 | |
1268 daddu c_2,t_1 | |
1269 sltu AT,c_2,t_1 | |
1270 daddu t_2,AT | |
1271 daddu c_3,t_2 | |
1272 sltu AT,c_3,t_2 | |
1273 daddu c_1,AT | |
1274 dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ | |
1275 mflo t_1 | |
1276 mfhi t_2 | |
1277 daddu c_2,t_1 | |
1278 sltu AT,c_2,t_1 | |
1279 daddu t_2,AT | |
1280 daddu c_3,t_2 | |
1281 sltu AT,c_3,t_2 | |
1282 daddu c_1,AT | |
1283 sd c_2,80(a0) /* r[10]=c2; */ | |
1284 | |
1285 dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ | |
1286 mflo t_1 | |
1287 mfhi t_2 | |
1288 daddu c_3,t_1 | |
1289 sltu AT,c_3,t_1 | |
1290 daddu t_2,AT | |
1291 daddu c_1,t_2 | |
1292 sltu c_2,c_1,t_2 | |
1293 dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ | |
1294 mflo t_1 | |
1295 mfhi t_2 | |
1296 daddu c_3,t_1 | |
1297 sltu AT,c_3,t_1 | |
1298 daddu t_2,AT | |
1299 daddu c_1,t_2 | |
1300 sltu AT,c_1,t_2 | |
1301 daddu c_2,AT | |
1302 dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ | |
1303 mflo t_1 | |
1304 mfhi t_2 | |
1305 daddu c_3,t_1 | |
1306 sltu AT,c_3,t_1 | |
1307 daddu t_2,AT | |
1308 daddu c_1,t_2 | |
1309 sltu AT,c_1,t_2 | |
1310 daddu c_2,AT | |
1311 dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ | |
1312 mflo t_1 | |
1313 mfhi t_2 | |
1314 daddu c_3,t_1 | |
1315 sltu AT,c_3,t_1 | |
1316 daddu t_2,AT | |
1317 daddu c_1,t_2 | |
1318 sltu AT,c_1,t_2 | |
1319 daddu c_2,AT | |
1320 sd c_3,88(a0) /* r[11]=c3; */ | |
1321 | |
1322 dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ | |
1323 mflo t_1 | |
1324 mfhi t_2 | |
1325 daddu c_1,t_1 | |
1326 sltu AT,c_1,t_1 | |
1327 daddu t_2,AT | |
1328 daddu c_2,t_2 | |
1329 sltu c_3,c_2,t_2 | |
1330 dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ | |
1331 mflo t_1 | |
1332 mfhi t_2 | |
1333 daddu c_1,t_1 | |
1334 sltu AT,c_1,t_1 | |
1335 daddu t_2,AT | |
1336 daddu c_2,t_2 | |
1337 sltu AT,c_2,t_2 | |
1338 daddu c_3,AT | |
1339 dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ | |
1340 mflo t_1 | |
1341 mfhi t_2 | |
1342 daddu c_1,t_1 | |
1343 sltu AT,c_1,t_1 | |
1344 daddu t_2,AT | |
1345 daddu c_2,t_2 | |
1346 sltu AT,c_2,t_2 | |
1347 daddu c_3,AT | |
1348 sd c_1,96(a0) /* r[12]=c1; */ | |
1349 | |
1350 dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ | |
1351 mflo t_1 | |
1352 mfhi t_2 | |
1353 daddu c_2,t_1 | |
1354 sltu AT,c_2,t_1 | |
1355 daddu t_2,AT | |
1356 daddu c_3,t_2 | |
1357 sltu c_1,c_3,t_2 | |
1358 dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ | |
1359 mflo t_1 | |
1360 mfhi t_2 | |
1361 daddu c_2,t_1 | |
1362 sltu AT,c_2,t_1 | |
1363 daddu t_2,AT | |
1364 daddu c_3,t_2 | |
1365 sltu AT,c_3,t_2 | |
1366 daddu c_1,AT | |
1367 sd c_2,104(a0) /* r[13]=c2; */ | |
1368 | |
1369 dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ | |
1370 ld s0,0(sp) | |
1371 ld s1,8(sp) | |
1372 ld s2,16(sp) | |
1373 ld s3,24(sp) | |
1374 ld s4,32(sp) | |
1375 ld s5,40(sp) | |
1376 mflo t_1 | |
1377 mfhi t_2 | |
1378 daddu c_3,t_1 | |
1379 sltu AT,c_3,t_1 | |
1380 daddu t_2,AT | |
1381 daddu c_1,t_2 | |
1382 sd c_3,112(a0) /* r[14]=c3; */ | |
1383 sd c_1,120(a0) /* r[15]=c1; */ | |
1384 | |
1385 PTR_ADD sp,FRAME_SIZE | |
1386 | |
1387 jr ra | |
1388 END(bn_mul_comba8) | |
1389 | |
1390 .align 5 | |
/*
 * bn_mul_comba4 -- 4x4-word column-wise ("comba") multiplication.
 *
 * As used by the code below:
 *   a0  result pointer: eight 64-bit words are stored at 0..56(a0)
 *   a1  first operand:  four 64-bit words are loaded from 0..24(a1)
 *   a2  second operand: four 64-bit words are loaded from 0..24(a2)
 *
 * c_1/c_2/c_3 form a rotating three-word accumulator (low, middle,
 * high).  Once every product of a column has been added the low word
 * is stored and the roles rotate by one.  t_1/t_2 receive LO/HI of
 * each product and AT carries the 0/1 carry produced by sltu.
 * The a_N, b_N, c_N and t_N names are preprocessor aliases defined
 * earlier in the file.
 */
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	dmultu	a_0,b_0		/* r[0] column: a[0]*b[0] */
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	mfhi	c_2
	mflo	c_1
	sd	c_1,0(a0)	/* r[0] */

	/* r[1] column, accumulator (c2,c3,c1) */
	dmultu	a_0,b_1		/* += a[0]*b[1]; starts c3 */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	dmultu	a_1,b_0		/* += a[1]*b[0]; starts c1 */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2
	sd	c_2,8(a0)	/* r[1] */

	/* r[2] column, accumulator (c3,c1,c2) */
	dmultu	a_2,b_0		/* += a[2]*b[0] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	/*
	 * No carry-out is recorded for this addition: the two-word sum
	 * carried over from the previous columns is small enough that
	 * adding one more product cannot overflow c1.
	 */
	daddu	c_1,c_1,t_2
	dmultu	a_1,b_1		/* += a[1]*b[1]; starts c2 */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_0,b_2		/* += a[0]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,16(a0)	/* r[2] */

	/* r[3] column, accumulator (c1,c2,c3) */
	dmultu	a_0,b_3		/* += a[0]*b[3]; starts c3 */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_1,b_2		/* += a[1]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_2,b_1		/* += a[2]*b[1] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_3,b_0		/* += a[3]*b[0] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	sd	c_1,24(a0)	/* r[3] */

	/* r[4] column, accumulator (c2,c3,c1) */
	dmultu	a_3,b_1		/* += a[3]*b[1]; starts c1 */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_2,b_2		/* += a[2]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	dmultu	a_1,b_3		/* += a[1]*b[3] */
	mfhi	t_2
	mflo	t_1
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	sd	c_2,32(a0)	/* r[4] */

	/* r[5] column, accumulator (c3,c1,c2) */
	dmultu	a_2,b_3		/* += a[2]*b[3]; starts c2 */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_3,b_2		/* += a[3]*b[2] */
	mfhi	t_2
	mflo	t_1
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,40(a0)	/* r[5] */

	/* r[6] and r[7]: last product, only two words remain */
	dmultu	a_3,b_3		/* (c1,c2) += a[3]*b[3] */
	mfhi	t_2
	mflo	t_1
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sd	c_1,48(a0)	/* r[6] */
	sd	c_2,56(a0)	/* r[7] */

	jr	ra
END(bn_mul_comba4)
1544 | |
/*
 * Rebind a_4..a_7 onto the names b_0..b_3 for the squaring routines
 * that follow: they load only a[] (through a1) and never load a b[]
 * operand, so the b_N names are otherwise unused there.
 */
#undef	a_4
#define	a_4	b_0
#undef	a_5
#define	a_5	b_1
#undef	a_6
#define	a_6	b_2
#undef	a_7
#define	a_7	b_3
1553 | |
1554 .align 5 | |
/*
 * bn_sqr_comba8 -- column-wise ("comba") squaring of an 8-word number.
 *
 * As used by the code below:
 *   a0  result pointer: sixteen 64-bit words are stored at 0..120(a0)
 *   a1  operand:        eight 64-bit words are loaded from 0..56(a1)
 *   a2  scratch (this routine has no second operand to keep there)
 *
 * c_1/c_2/c_3 form a rotating three-word accumulator (low, middle,
 * high).  t_1/t_2 receive LO/HI of each product and AT holds the 0/1
 * carry produced by sltu.
 *
 * Every cross product a[i]*a[j] with i != j contributes twice to its
 * column.  It is accumulated here by adding the 128-bit product two
 * times instead of doubling it with shifts first.  With the shift
 * approach the doubled high word can be 2^64-1, and adding the carry
 * from the low word then wraps to zero so that the carry into the next
 * word is silently lost (wrong BN_sqr results, CVE-2014-3570).  The
 * high word of an undoubled product is at most 2^64-2, so "hi + carry"
 * can never wrap.
 *
 * The first cross product of a column also initialises the high
 * accumulator word (sltu writes it directly instead of adding to it).
 */
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	dmultu	a_0,a_0		/* mul_add_c(a[0],a[0],c1,c2,c3); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0]=c1; */

	dmultu	a_0,a_1		/* mul_add_c2(a[0],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1		/* first addition of the product */
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* c3 starts as hi + carry */
	daddu	c_2,t_1		/* second addition of the product */
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2	/* c1 starts as the carry out of c3 */
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,a_0		/* mul_add_c2(a[2],a[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT	/* keep t_2 intact for the second pass */
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,a_3		/* mul_add_c2(a[0],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_4,a_0		/* mul_add_c2(a[4],a[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_1		/* mul_add_c2(a[3],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],a[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_0,a_5		/* mul_add_c2(a[0],a[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_4		/* mul_add_c2(a[1],a[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,a_3		/* mul_add_c2(a[2],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_6,a_0		/* mul_add_c2(a[6],a[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,a_1		/* mul_add_c2(a[5],a[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_2		/* mul_add_c2(a[4],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_3		/* mul_add_c(a[3],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6]=c1; */

	dmultu	a_0,a_7		/* mul_add_c2(a[0],a[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,a_6		/* mul_add_c2(a[1],a[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_5		/* mul_add_c2(a[2],a[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_4		/* mul_add_c2(a[3],a[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)	/* r[7]=c2; */

	dmultu	a_7,a_1		/* mul_add_c2(a[7],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,a_2		/* mul_add_c2(a[6],a[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_3		/* mul_add_c2(a[5],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,a_4		/* mul_add_c(a[4],a[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8]=c3; */

	dmultu	a_2,a_7		/* mul_add_c2(a[2],a[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_6		/* mul_add_c2(a[3],a[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_5		/* mul_add_c2(a[4],a[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)	/* r[9]=c1; */

	dmultu	a_7,a_3		/* mul_add_c2(a[7],a[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,a_4		/* mul_add_c2(a[6],a[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	AT,c_3,a2
	daddu	c_1,AT
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,a_5		/* mul_add_c(a[5],a[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10]=c2; */

	dmultu	a_4,a_7		/* mul_add_c2(a[4],a[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_6		/* mul_add_c2(a[5],a[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	AT,c_1,a2
	daddu	c_2,AT
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)	/* r[11]=c3; */

	dmultu	a_7,a_5		/* mul_add_c2(a[7],a[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,a_6		/* mul_add_c(a[6],a[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12]=c1; */

	dmultu	a_6,a_7		/* mul_add_c2(a[6],a[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)	/* r[13]=c2; */

	dmultu	a_7,a_7		/* mul_add_c(a[7],a[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2		/* top word: nothing above it to carry into */
	sd	c_3,112(a0)	/* r[14]=c3; */
	sd	c_1,120(a0)	/* r[15]=c1; */

	jr	ra
END(bn_sqr_comba8)
2067 | |
2068 .align 5 | |
/*
 * bn_sqr_comba4 -- column-wise ("comba") squaring of a 4-word number.
 *
 * As used by the code below:
 *   a0  result pointer: eight 64-bit words are stored at 0..56(a0)
 *   a1  operand:        four 64-bit words are loaded from 0..24(a1)
 *   a2  scratch (this routine has no second operand to keep there)
 *
 * c_1/c_2/c_3 form a rotating three-word accumulator (low, middle,
 * high).  t_1/t_2 receive LO/HI of each product and AT holds the 0/1
 * carry produced by sltu.
 *
 * Every cross product a[i]*a[j] with i != j contributes twice to its
 * column.  It is accumulated here by adding the 128-bit product two
 * times instead of doubling it with shifts first.  With the shift
 * approach the doubled high word can be 2^64-1, and adding the carry
 * from the low word then wraps to zero so that the carry into the next
 * word is silently lost (wrong BN_sqr results, CVE-2014-3570).  The
 * high word of an undoubled product is at most 2^64-2, so "hi + carry"
 * can never wrap.
 *
 * The first cross product of a column also initialises the high
 * accumulator word (sltu writes it directly instead of adding to it).
 */
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	dmultu	a_0,a_0		/* mul_add_c(a[0],a[0],c1,c2,c3); */
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0]=c1; */

	dmultu	a_0,a_1		/* mul_add_c2(a[0],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1		/* first addition of the product */
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* c3 starts as hi + carry */
	daddu	c_2,t_1		/* second addition of the product */
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2	/* c1 starts as the carry out of c3 */
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,a_0		/* mul_add_c2(a[2],a[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT	/* keep t_2 intact for the second pass */
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,a_3		/* mul_add_c2(a[0],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	c_3,c_2,a2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	a2,t_2,AT
	daddu	c_2,a2
	sltu	AT,c_2,a2
	daddu	c_3,AT
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_3,a_1		/* mul_add_c2(a[3],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	a2,t_2,AT
	daddu	c_3,a2
	sltu	c_1,c_3,a2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],a[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_2,a_3		/* mul_add_c2(a[2],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	a2,t_2,AT
	daddu	c_1,a2
	sltu	c_2,c_1,a2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_3,a_3		/* mul_add_c(a[3],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2		/* top word: nothing above it to carry into */
	sd	c_1,48(a0)	/* r[6]=c1; */
	sd	c_2,56(a0)	/* r[7]=c2; */

	jr	ra
END(bn_sqr_comba4)
OLD | NEW |