OLD | NEW |
| (Empty) |
1 ; | |
2 ; PA-RISC 2.0 implementation of bn_asm code, based on the | |
3 ; 64-bit version of the code. This code is effectively the | |
4 ; same as the 64-bit version except the register model is | |
5 ; slightly different given all values must be 32-bit between | |
6 ; function calls. Thus the 64-bit return values are returned | |
7 ; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit | |
8 ; | |
9 ; | |
10 ; This code is approximately 2x faster than the C version | |
11 ; for RSA/DSA. | |
12 ; | |
13 ; See http://devresource.hp.com/ for more details on the PA-RISC | |
14 ; architecture. Also see the book "PA-RISC 2.0 Architecture" | |
15 ; by Gerry Kane for information on the instruction set architecture. | |
16 ; | |
17 ; Code written by Chris Ruemmler (with some help from the HP C | |
18 ; compiler). | |
19 ; | |
20 ; The code compiles with HP's assembler | |
21 ; | |
22 | |
23 .level 2.0N | |
24 .space $TEXT$ | |
25 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY | |
26 | |
27 ; | |
28 ; Global Register definitions used for the routines. | |
29 ; | |
30 ; Some information about HP's runtime architecture for 32-bits. | |
31 ; | |
32 ; "Caller save" means the calling function must save the register | |
33 ; if it wants the register to be preserved. | |
34 ; "Callee save" means if a function uses the register, it must save | |
35 ; the value before using it. | |
36 ; | |
37 ; For the floating point registers | |
38 ; | |
39 ; "caller save" registers: fr4-fr11, fr22-fr31 | |
40 ; "callee save" registers: fr12-fr21 | |
41 ; "special" registers: fr0-fr3 (status and exception registers) | |
42 ; | |
43 ; For the integer registers | |
44 ; value zero : r0 | |
45 ; "caller save" registers: r1,r19-r26 | |
46 ; "callee save" registers: r3-r18 | |
47 ; return register : r2 (rp) | |
48 ; return values ; r28,r29 (ret0,ret1) | |
49 ; Stack pointer ; r30 (sp) | |
50 ; millicode return ptr ; r31 (also a caller save register) | |
51 | |
52 | |
53 ; | |
54 ; Arguments to the routines | |
55 ; | |
; Symbolic names for the incoming argument registers. b_ptr and num are
; two names for the same register (%r24); each routine in this file uses
; whichever of the two matches its own signature.
56 r_ptr .reg %r26 | | ; arg0: result pointer
57 a_ptr .reg %r25 | | ; arg1: source pointer
58 b_ptr .reg %r24 | | ; arg2 when it is a second source pointer (add/sub)
59 num .reg %r24 | | ; arg2 when it is a word count (mul/mul_add/sqr)
60 n .reg %r23 | | ; arg3: word count for add/sub
61 | |
62 ; | |
63 ; Note that the "w" argument for bn_mul_add_words and bn_mul_words | |
64 ; is passed on the stack at a delta of -56 from the top of stack | |
65 ; as the routine is entered. | |
66 ; | |
67 | |
68 ; | |
69 ; Globals used in some routines | |
70 ; | |
71 | |
72 top_overflow .reg %r23 | | ; set to 1 << 32 by the multiply routines; same register as n
73 high_mask .reg %r22 ; value 0xffffffff80000000L (built with DEPDI,Z in bn_sqr_words) | |
74 | |
75 | |
76 ;------------------------------------------------------------------------------ | |
77 ; | |
78 ; bn_mul_add_words | |
79 ; | |
80 ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, | |
81 ; int num, BN_ULONG w) | |
82 ; | |
83 ; arg0 = r_ptr | |
84 ; arg1 = a_ptr | |
85 ; arg2 = num | |
86 ; -56(sp) = w | |
87 ; | |
88 ; Local register definitions | |
89 ; | |
90 | |
; Floating point registers that receive the 32x32->64 XMPYU partial
; products. The "_1" names belong to the second word of the 2x unrolled loop.
91 fm1 .reg %fr22 | |
92 fm .reg %fr23 | |
93 ht_temp .reg %fr24 | |
94 ht_temp_1 .reg %fr25 | |
95 lt_temp .reg %fr26 | |
96 lt_temp_1 .reg %fr27 | |
97 fm1_1 .reg %fr28 | |
98 fm_1 .reg %fr29 | |
99 | |
; fw holds the 64-bit multiplier w; fw_h / fw_l name its left / right halves.
100 fw_h .reg %fr7L | |
101 fw_l .reg %fr7R | |
102 fw .reg %fr7 | |
103 | |
; t_float_0 holds the first source word; fht_0 / flt_0 name its halves.
104 fht_0 .reg %fr8L | |
105 flt_0 .reg %fr8R | |
106 t_float_0 .reg %fr8 | |
107 | |
; t_float_1 holds the second source word; fht_1 / flt_1 name its halves.
108 fht_1 .reg %fr9L | |
109 flt_1 .reg %fr9R | |
110 t_float_1 .reg %fr9 | |
111 | |
; Integer work registers. %r3-%r9 are in the callee-save range listed at
; the top of the file, so each routine stores the ones it uses on entry
; and reloads them on exit.
112 tmp_0 .reg %r31 | |
113 tmp_1 .reg %r21 | |
114 m_0 .reg %r20 | |
115 m_1 .reg %r19 | |
116 ht_0 .reg %r1 | |
117 ht_1 .reg %r3 | |
118 lt_0 .reg %r4 | |
119 lt_1 .reg %r5 | |
120 m1_0 .reg %r6 | |
121 m1_1 .reg %r7 | |
122 rp_val .reg %r8 | |
123 rp_val_1 .reg %r9 | |
124 | |
; bn_mul_add_words: for each of num 64-bit words, rp[i] = rp[i] + ap[i]*w + c,
; where the running carry c lives in %ret1. Every 64x64 product is built
; from four 32x32 XMPYU partial products that are stored to the scratch
; area just below %sp and reloaded into integer registers. On exit %ret1
; holds the final carry and %ret0 receives its upper 32 bits.
125 bn_mul_add_words | |
126 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN | |
127 .proc | |
128 .callinfo frame=128 | |
129 .entry | |
130 .align 64 | |
131 | |
132 STD %r3,0(%sp) ; save r3 | |
133 STD %r4,8(%sp) ; save r4 | |
134 NOP ; Needed to make the loop 16-byte aligned | |
135 NOP ; needed to make the loop 16-byte aligned | |
136 | |
137 STD %r5,16(%sp) ; save r5 | |
138 NOP | |
139 STD %r6,24(%sp) ; save r6 | |
140 STD %r7,32(%sp) ; save r7 | |
141 | |
142 STD %r8,40(%sp) ; save r8 | |
143 STD %r9,48(%sp) ; save r9 | |
144 COPY %r0,%ret1 ; return 0 by default | |
145 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | |
146 | |
147 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit | |
148 LDO 128(%sp),%sp ; bump stack (delay slot: the exit path also expects sp+128) | |
149 | |
150 ; | |
151 ; The loop is unrolled twice, so if there is only 1 number | |
152 ; then go straight to the cleanup code. | |
153 ; | |
154 CMPIB,= 1,num,bn_mul_add_words_single_top | |
155 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) | |
156 | |
157 ; | |
158 ; This loop is unrolled 2 times (64-byte aligned as well) | |
159 ; | |
160 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | |
161 ; two 32-bit multiplies can be issued per cycle. | |
162 ; | |
163 bn_mul_add_words_unroll2 | |
164 | |
165 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | |
166 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9L) ht(L)/lt(R) | |
167 LDD 0(r_ptr),rp_val ; rp[0] | |
168 LDD 8(r_ptr),rp_val_1 ; rp[1] | |
169 | |
170 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | |
171 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | |
172 FSTD fm1,-16(%sp) ; -16(sp) = m1[0] | |
173 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] | |
174 | |
175 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h | |
176 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h | |
177 FSTD fm,-8(%sp) ; -8(sp) = m[0] | |
178 FSTD fm_1,-40(%sp) ; -40(sp) = m[1] | |
179 | |
180 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | |
181 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h | |
182 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp | |
183 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 | |
184 | |
185 XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l | |
186 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l | |
187 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp | |
188 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 | |
189 | |
190 LDD -8(%sp),m_0 ; m[0] | |
191 LDD -40(%sp),m_1 ; m[1] | |
192 LDD -16(%sp),m1_0 ; m1[0] | |
193 LDD -48(%sp),m1_1 ; m1[1] | |
194 | |
195 LDD -24(%sp),ht_0 ; ht[0] | |
196 LDD -56(%sp),ht_1 ; ht[1] | |
197 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; | |
198 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; | |
199 | |
200 LDD -32(%sp),lt_0 | |
201 LDD -64(%sp),lt_1 | |
202 CMPCLR,*>>= tmp_0,m1_0, %r0 ; skip next unless tmp_0 < m1[0] (the sum wrapped) | |
203 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) | |
204 | |
205 CMPCLR,*>>= tmp_1,m1_1,%r0 ; skip next unless tmp_1 < m1[1] (the sum wrapped) | |
206 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) | |
207 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 | |
208 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 | |
209 | |
210 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 | |
211 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 | |
212 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) | |
213 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) | |
214 | |
215 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; | |
216 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
217 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; | |
218 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
219 | |
220 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c; | |
221 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
222 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] | |
223 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
224 | |
225 LDO -2(num),num ; num = num - 2; | |
226 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); | |
227 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
228 STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
229 | |
230 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] | |
231 ADD,DC ht_1,%r0,%ret1 ; c = ht[1] + carry | |
232 LDO 16(a_ptr),a_ptr ; a_ptr += 2 | |
233 | |
234 STD lt_1,8(r_ptr) ; rp[1] = lt[1] | |
235 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do | |
236 LDO 16(r_ptr),r_ptr ; r_ptr += 2 | |
237 | |
238 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one | |
239 | |
240 ; | |
241 ; Top of loop aligned on 64-byte boundary | |
242 ; | |
243 bn_mul_add_words_single_top | |
244 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | |
245 LDD 0(r_ptr),rp_val ; rp[0] | |
246 LDO 8(a_ptr),a_ptr ; a_ptr++ | |
247 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | |
248 FSTD fm1,-16(%sp) ; -16(sp) = m1 | |
249 XMPYU flt_0,fw_h,fm ; m = lt*fw_h | |
250 FSTD fm,-8(%sp) ; -8(sp) = m | |
251 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | |
252 FSTD ht_temp,-24(%sp) ; -24(sp) = ht | |
253 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | |
254 FSTD lt_temp,-32(%sp) ; -32(sp) = lt | |
255 | |
256 LDD -8(%sp),m_0 | |
257 LDD -16(%sp),m1_0 ; m1 = temp1 | |
258 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | |
259 LDD -24(%sp),ht_0 | |
260 LDD -32(%sp),lt_0 | |
261 | |
262 CMPCLR,*>>= tmp_0,m1_0,%r0 ; skip next unless tmp_0 < m1 (the sum wrapped) | |
263 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | |
264 | |
265 EXTRD,U tmp_0,31,32,m_0 ; m>>32 | |
266 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | |
267 | |
268 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | |
269 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; | |
270 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
271 ADD %ret1,tmp_0,lt_0 ; lt = lt + c; | |
272 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
273 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] | |
274 ADD,DC ht_0,%r0,%ret1 ; c = ht + carry | |
275 STD lt_0,0(r_ptr) ; rp[0] = lt | |
276 | |
277 bn_mul_add_words_exit | |
278 .EXIT | |
279 | |
280 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | |
281 LDD -80(%sp),%r9 ; restore r9 | |
282 LDD -88(%sp),%r8 ; restore r8 | |
283 LDD -96(%sp),%r7 ; restore r7 | |
284 LDD -104(%sp),%r6 ; restore r6 | |
285 LDD -112(%sp),%r5 ; restore r5 | |
286 LDD -120(%sp),%r4 ; restore r4 | |
287 BVE (%rp) | |
288 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame | |
289 .PROCEND ;in=23,24,25,26,29;out=28; | |
290 | |
291 ;---------------------------------------------------------------------------- | |
292 ; | |
293 ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
294 ; | |
295 ; arg0 = rp | |
296 ; arg1 = ap | |
297 ; arg2 = num | |
298 ; w on stack at -56(sp) | |
299 | |
; bn_mul_words: for each of num 64-bit words, rp[i] = low64(ap[i]*w + c),
; with the running carry c kept in %ret1. Same partial-product scheme as
; bn_mul_add_words, but the old contents of rp[] are not read. On exit
; %ret1 holds the final carry and %ret0 receives its upper 32 bits.
300 bn_mul_words | |
301 .proc | |
302 .callinfo frame=128 | |
303 .entry | |
304 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
305 .align 64 | |
306 | |
307 STD %r3,0(%sp) ; save r3 | |
308 STD %r4,8(%sp) ; save r4 | |
309 NOP | |
310 STD %r5,16(%sp) ; save r5 | |
311 | |
312 STD %r6,24(%sp) ; save r6 | |
313 STD %r7,32(%sp) ; save r7 | |
314 COPY %r0,%ret1 ; return 0 by default | |
315 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | |
316 | |
317 CMPIB,>= 0,num,bn_mul_words_exit | |
318 LDO 128(%sp),%sp ; bump stack (delay slot: the exit path also expects sp+128) | |
319 | |
320 ; | |
321 ; See if only 1 word to do, thus just do cleanup | |
322 ; | |
323 CMPIB,= 1,num,bn_mul_words_single_top | |
324 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) | |
325 | |
326 ; | |
327 ; This loop is unrolled 2 times (64-byte aligned as well) | |
328 ; | |
329 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | |
330 ; two 32-bit multiplies can be issued per cycle. | |
331 ; | |
332 bn_mul_words_unroll2 | |
333 | |
334 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | |
335 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9L) ht(L)/lt(R) | |
336 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | |
337 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | |
338 | |
339 FSTD fm1,-16(%sp) ; -16(sp) = m1[0] | |
340 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] | |
341 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h | |
342 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h | |
343 | |
344 FSTD fm,-8(%sp) ; -8(sp) = m[0] | |
345 FSTD fm_1,-40(%sp) ; -40(sp) = m[1] | |
346 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | |
347 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h | |
348 | |
349 FSTD ht_temp,-24(%sp) ; -24(sp) = ht[0] | |
350 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht[1] | |
351 XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l | |
352 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l | |
353 | |
354 FSTD lt_temp,-32(%sp) ; -32(sp) = lt[0] | |
355 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt[1] | |
356 LDD -8(%sp),m_0 | |
357 LDD -40(%sp),m_1 | |
358 | |
359 LDD -16(%sp),m1_0 | |
360 LDD -48(%sp),m1_1 | |
361 LDD -24(%sp),ht_0 | |
362 LDD -56(%sp),ht_1 | |
363 | |
364 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; | |
365 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; | |
366 LDD -32(%sp),lt_0 | |
367 LDD -64(%sp),lt_1 | |
368 | |
369 CMPCLR,*>>= tmp_0,m1_0, %r0 ; skip next unless tmp_0 < m1[0] (the sum wrapped) | |
370 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) | |
371 CMPCLR,*>>= tmp_1,m1_1,%r0 ; skip next unless tmp_1 < m1[1] (the sum wrapped) | |
372 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) | |
373 | |
374 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 | |
375 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 | |
376 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 | |
377 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 | |
378 | |
379 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) | |
380 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) | |
381 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; | |
382 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
383 | |
384 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; | |
385 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
386 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c (ret1); | |
387 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
388 | |
389 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + c (ht_0) | |
390 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
391 STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
392 STD lt_1,8(r_ptr) ; rp[1] = lt[1] | |
393 | |
394 COPY ht_1,%ret1 ; carry = ht[1] | |
395 LDO -2(num),num ; num = num - 2; | |
396 LDO 16(a_ptr),a_ptr ; ap += 2 | |
397 CMPIB,<= 2,num,bn_mul_words_unroll2 | |
398 LDO 16(r_ptr),r_ptr ; rp += 2 | |
399 | |
400 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? | |
401 | |
402 ; | |
403 ; Top of loop aligned on 64-byte boundary | |
404 ; | |
405 bn_mul_words_single_top | |
406 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | |
407 | |
408 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | |
409 FSTD fm1,-16(%sp) ; -16(sp) = m1 | |
410 XMPYU flt_0,fw_h,fm ; m = lt*fw_h | |
411 FSTD fm,-8(%sp) ; -8(sp) = m | |
412 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | |
413 FSTD ht_temp,-24(%sp) ; -24(sp) = ht | |
414 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | |
415 FSTD lt_temp,-32(%sp) ; -32(sp) = lt | |
416 | |
417 LDD -8(%sp),m_0 | |
418 LDD -16(%sp),m1_0 | |
419 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | |
420 LDD -24(%sp),ht_0 | |
421 LDD -32(%sp),lt_0 | |
422 | |
423 CMPCLR,*>>= tmp_0,m1_0,%r0 ; skip next unless tmp_0 < m1 (the sum wrapped) | |
424 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | |
425 | |
426 EXTRD,U tmp_0,31,32,m_0 ; m>>32 | |
427 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | |
428 | |
429 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | |
430 ADD lt_0,m1_0,lt_0 ; lt= lt+m1; | |
431 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
432 | |
433 ADD %ret1,lt_0,lt_0 ; lt = lt + c; | |
434 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
435 | |
436 COPY ht_0,%ret1 ; copy carry | |
437 STD lt_0,0(r_ptr) ; rp[0] = lt | |
438 | |
439 bn_mul_words_exit | |
440 .EXIT | |
441 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | |
442 LDD -96(%sp),%r7 ; restore r7 | |
443 LDD -104(%sp),%r6 ; restore r6 | |
444 LDD -112(%sp),%r5 ; restore r5 | |
445 LDD -120(%sp),%r4 ; restore r4 | |
446 BVE (%rp) | |
447 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame | |
448 .PROCEND | |
449 | |
450 ;---------------------------------------------------------------------------- | |
451 ; | |
452 ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) | |
453 ; | |
454 ; arg0 = rp | |
455 ; arg1 = ap | |
456 ; arg2 = num | |
457 ; | |
458 | |
; bn_sqr_words: for each of num 64-bit source words a, stores the 128-bit
; square as two result words (low word first), so rp advances twice as
; fast as ap. With a = ht_half*2^32 + lt_half and m = ht_half*lt_half, the
; cross term 2*m*2^32 is split into (m << 33) for the low word and
; (m >> 31) for the high word; high_mask selects the bits of m that move
; into the high word.
459 bn_sqr_words | |
460 .proc | |
461 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
462 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
463 .entry | |
464 .align 64 | |
465 | |
466 STD %r3,0(%sp) ; save r3 | |
467 STD %r4,8(%sp) ; save r4 | |
468 NOP | |
469 STD %r5,16(%sp) ; save r5 | |
470 | |
471 CMPIB,>= 0,num,bn_sqr_words_exit | |
472 LDO 128(%sp),%sp ; bump stack (delay slot: the exit path also expects sp+128) | |
473 | |
474 ; | |
475 ; If only 1, then go straight to cleanup | |
476 ; | |
477 CMPIB,= 1,num,bn_sqr_words_single_top | |
478 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | |
479 | |
480 ; | |
481 ; This loop is unrolled 2 times (64-byte aligned as well) | |
482 ; | |
483 | |
484 bn_sqr_words_unroll2 | |
485 FLDD 0(a_ptr),t_float_0 ; a[0] | |
486 FLDD 8(a_ptr),t_float_1 ; a[1] | |
487 XMPYU fht_0,flt_0,fm ; m[0] | |
488 XMPYU fht_1,flt_1,fm_1 ; m[1] | |
489 | |
490 FSTD fm,-24(%sp) ; store m[0] | |
491 FSTD fm_1,-56(%sp) ; store m[1] | |
492 XMPYU flt_0,flt_0,lt_temp ; lt[0] | |
493 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] | |
494 | |
495 FSTD lt_temp,-16(%sp) ; store lt[0] | |
496 FSTD lt_temp_1,-48(%sp) ; store lt[1] | |
497 XMPYU fht_0,fht_0,ht_temp ; ht[0] | |
498 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] | |
499 | |
500 FSTD ht_temp,-8(%sp) ; store ht[0] | |
501 FSTD ht_temp_1,-40(%sp) ; store ht[1] | |
502 LDD -24(%sp),m_0 | |
503 LDD -56(%sp),m_1 | |
504 | |
505 AND m_0,high_mask,tmp_0 ; m[0] & Mask | |
506 AND m_1,high_mask,tmp_1 ; m[1] & Mask | |
507 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 | |
508 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 | |
509 | |
510 LDD -16(%sp),lt_0 | |
511 LDD -48(%sp),lt_1 | |
512 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 | |
513 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 | |
514 | |
515 LDD -8(%sp),ht_0 | |
516 LDD -40(%sp),ht_1 | |
517 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 | |
518 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 | |
519 | |
520 ADD lt_0,m_0,lt_0 ; lt[0] = lt[0]+m[0] | |
521 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
522 STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
523 STD ht_0,8(r_ptr) ; rp[1] = ht[0] | |
524 | |
525 ADD lt_1,m_1,lt_1 ; lt[1] = lt[1]+m[1] | |
526 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
527 STD lt_1,16(r_ptr) ; rp[2] = lt[1] | |
528 STD ht_1,24(r_ptr) ; rp[3] = ht[1] | |
529 | |
530 LDO -2(num),num ; num = num - 2; | |
531 LDO 16(a_ptr),a_ptr ; ap += 2 | |
532 CMPIB,<= 2,num,bn_sqr_words_unroll2 | |
533 LDO 32(r_ptr),r_ptr ; rp += 4 | |
534 | |
535 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? | |
536 | |
537 ; | |
538 ; Top of loop aligned on 64-byte boundary | |
539 ; | |
540 bn_sqr_words_single_top | |
541 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | |
542 | |
543 XMPYU fht_0,flt_0,fm ; m | |
544 FSTD fm,-24(%sp) ; store m | |
545 | |
546 XMPYU flt_0,flt_0,lt_temp ; lt | |
547 FSTD lt_temp,-16(%sp) ; store lt | |
548 | |
549 XMPYU fht_0,fht_0,ht_temp ; ht | |
550 FSTD ht_temp,-8(%sp) ; store ht | |
551 | |
552 LDD -24(%sp),m_0 ; load m | |
553 AND m_0,high_mask,tmp_0 ; m & Mask | |
554 DEPD,Z m_0,30,31,m_0 ; m << 32+1 | |
555 LDD -16(%sp),lt_0 ; lt | |
556 | |
557 LDD -8(%sp),ht_0 ; ht | |
558 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 | |
559 ADD m_0,lt_0,lt_0 ; lt = lt+m | |
560 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (ADD,L leaves the carry from the line above intact) | |
561 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
562 | |
563 STD lt_0,0(r_ptr) ; rp[0] = lt | |
564 STD ht_0,8(r_ptr) ; rp[1] = ht | |
565 | |
566 bn_sqr_words_exit | |
567 .EXIT | |
568 LDD -112(%sp),%r5 ; restore r5 | |
569 LDD -120(%sp),%r4 ; restore r4 | |
570 BVE (%rp) | |
571 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame | |
572 .PROCEND ;in=23,24,25,26,29;out=28; | |
573 | |
574 | |
575 ;---------------------------------------------------------------------------- | |
576 ; | |
577 ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
578 ; | |
579 ; arg0 = rp | |
580 ; arg1 = ap | |
581 ; arg2 = bp | |
582 ; arg3 = n | |
583 | |
; bn_add_words: r[i] = a[i] + b[i] + c over n 64-bit words, two words per
; loop pass plus a single-word tail. The carry c is kept in %ret1 and is
; rebuilt after each word from the carry-outs of the two additions. On
; exit %ret1 holds the final carry and %ret0 its upper 32 bits. No stack
; frame is used and only caller-save registers are touched.
584 t .reg %r22 | |
585 b .reg %r21 | |
586 l .reg %r20 | |
587 | |
588 bn_add_words | |
589 .proc | |
590 .entry | |
591 .callinfo | |
592 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
593 .align 64 | |
594 | |
595 CMPIB,>= 0,n,bn_add_words_exit | |
596 COPY %r0,%ret1 ; return 0 by default | |
597 | |
598 ; | |
599 ; If 2 or more numbers do the loop | |
600 ; | |
601 CMPIB,= 1,n,bn_add_words_single_top | |
602 NOP | |
603 | |
604 ; | |
605 ; This loop is unrolled 2 times (64-byte aligned as well) | |
606 ; | |
607 bn_add_words_unroll2 | |
608 LDD 0(a_ptr),t | |
609 LDD 0(b_ptr),b | |
610 ADD t,%ret1,t ; t = a[0]+c; | |
611 ADD,DC %r0,%r0,%ret1 ; set c to carry | |
612 ADD t,b,l ; l = t + b[0] | |
613 ADD,DC %ret1,%r0,%ret1 ; c+= carry | |
614 STD l,0(r_ptr) | |
615 | |
616 LDD 8(a_ptr),t | |
617 LDD 8(b_ptr),b | |
618 ADD t,%ret1,t ; t = a[1]+c; | |
619 ADD,DC %r0,%r0,%ret1 ; set c to carry | |
620 ADD t,b,l ; l = t + b[1] | |
621 ADD,DC %ret1,%r0,%ret1 ; c+= carry | |
622 STD l,8(r_ptr) | |
623 | |
624 LDO -2(n),n | |
625 LDO 16(a_ptr),a_ptr | |
626 LDO 16(b_ptr),b_ptr | |
627 | |
628 CMPIB,<= 2,n,bn_add_words_unroll2 | |
629 LDO 16(r_ptr),r_ptr | |
630 | |
631 CMPIB,=,N 0,n,bn_add_words_exit ; are we done? | |
632 | |
633 bn_add_words_single_top | |
634 LDD 0(a_ptr),t | |
635 LDD 0(b_ptr),b | |
636 | |
637 ADD t,%ret1,t ; t = t+c; | |
638 ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??) | |
639 ADD t,b,l ; l = t + b[0] | |
640 ADD,DC %ret1,%r0,%ret1 ; c+= carry | |
641 STD l,0(r_ptr) | |
642 | |
643 bn_add_words_exit | |
644 .EXIT | |
645 BVE (%rp) | |
646 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | |
647 .PROCEND ;in=23,24,25,26,29;out=28; | |
648 | |
649 ;---------------------------------------------------------------------------- | |
650 ; | |
651 ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
652 ; | |
653 ; arg0 = rp | |
654 ; arg1 = ap | |
655 ; arg2 = bp | |
656 ; arg3 = n | |
657 | |
; bn_sub_words: r[i] = a[i] - b[i] - c over n 64-bit words, two words per
; loop pass plus a single-word tail. The borrow c is kept in %ret1. After
; each word the borrow becomes (a[i] < b[i]) when the two words differ and
; is left unchanged when they are equal. On exit %ret1 holds the final
; borrow and %ret0 its upper 32 bits.
658 t1 .reg %r22 | |
659 t2 .reg %r21 | |
660 sub_tmp1 .reg %r20 | |
661 sub_tmp2 .reg %r19 | |
662 | |
663 | |
664 bn_sub_words | |
665 .proc | |
666 .callinfo | |
667 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
668 .entry | |
669 .align 64 | |
670 | |
671 CMPIB,>= 0,n,bn_sub_words_exit | |
672 COPY %r0,%ret1 ; return 0 by default | |
673 | |
674 ; | |
675 ; If 2 or more numbers do the loop | |
676 ; | |
677 CMPIB,= 1,n,bn_sub_words_single_top | |
678 NOP | |
679 | |
680 ; | |
681 ; This loop is unrolled 2 times (64-byte aligned as well) | |
682 ; | |
683 bn_sub_words_unroll2 | |
684 LDD 0(a_ptr),t1 | |
685 LDD 0(b_ptr),t2 | |
686 SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | |
687 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; | |
688 | |
689 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2 | |
690 LDO 1(%r0),sub_tmp2 | | ; otherwise sub_tmp2 = 1
691 | |
692 CMPCLR,*= t1,t2,%r0 | | ; if t1 == t2 skip next, keeping the old borrow
693 COPY sub_tmp2,%ret1 | | ; c = (t1 < t2)
694 STD sub_tmp1,0(r_ptr) | |
695 | |
696 LDD 8(a_ptr),t1 | |
697 LDD 8(b_ptr),t2 | |
698 SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | |
699 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; | |
700 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2 | |
701 LDO 1(%r0),sub_tmp2 | | ; otherwise sub_tmp2 = 1
702 | |
703 CMPCLR,*= t1,t2,%r0 | | ; if t1 == t2 skip next, keeping the old borrow
704 COPY sub_tmp2,%ret1 | | ; c = (t1 < t2)
705 STD sub_tmp1,8(r_ptr) | |
706 | |
707 LDO -2(n),n | |
708 LDO 16(a_ptr),a_ptr | |
709 LDO 16(b_ptr),b_ptr | |
710 | |
711 CMPIB,<= 2,n,bn_sub_words_unroll2 | |
712 LDO 16(r_ptr),r_ptr | |
713 | |
714 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? | |
715 | |
716 bn_sub_words_single_top | |
717 LDD 0(a_ptr),t1 | |
718 LDD 0(b_ptr),t2 | |
719 SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | |
720 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; | |
721 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2 | |
722 LDO 1(%r0),sub_tmp2 | | ; otherwise sub_tmp2 = 1
723 | |
724 CMPCLR,*= t1,t2,%r0 | | ; if t1 == t2 skip next, keeping the old borrow
725 COPY sub_tmp2,%ret1 | | ; c = (t1 < t2)
726 | |
727 STD sub_tmp1,0(r_ptr) | |
728 | |
729 bn_sub_words_exit | |
730 .EXIT | |
731 BVE (%rp) | |
732 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | |
733 .PROCEND ;in=23,24,25,26,29;out=28; | |
734 | |
735 ;------------------------------------------------------------------------------ | |
736 ; | |
737 ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) | |
738 ; | |
739 ; arg0/arg1 = h (low word in %r26, high word in %r25) | |
740 ; arg2/arg3 = l (low word in %r24, high word in %r23) | |
741 ; -56(sp) = d | |
742 ; | |
743 ; This is mainly just output from the HP C compiler. | |
744 ; | |
745 ;------------------------------------------------------------------------------ | |
; bn_div_words: divides the two-word value h:l by d and returns the
; quotient word in %ret0/%ret1 (%r28 = upper 32 bits, %r29 = value).
;
; What the code below shows:
;   - h is assembled in %r4 from %r25 (upper half) and %r26 (lower half),
;     l in %r5 from %r23 / %r24, and d is loaded into %r3 from the stack
;     slot that was -56(sp) on entry (-248 after the 192-byte frame bump).
;   - d == 0 returns all ones without dividing.
;   - BN_num_bits_word(d) is called; when the result is not 64 and h is
;     above (1 << bits) the routine calls abort.
;   - d, h and l are then shifted left by (64 - bits) and the quotient is
;     produced in two passes (pass counter in %r8, preset to 2), each pass
;     using the $$div2U millicode and XMPYU partial products.
; NOTE(review): this looks like the generic C bn_div_words compiled for
; PA-RISC 2.0 -- confirm against the C source before relying on details.
;
; The .EXPORT and .CALLINFO directives and one commented-out .CALL line
; had been split across two rows in the middle of a token; each is
; rejoined into a single row here. No instruction was changed.
746 bn_div_words | |
747 .PROC | |
748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN | |
749 .IMPORT BN_num_bits_word,CODE | |
750 ;--- not PIC .IMPORT __iob,DATA | |
751 ;--- not PIC .IMPORT fprintf,CODE | |
752 .IMPORT abort,CODE | |
753 .IMPORT $$div2U,MILLICODE | |
754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | |
755 .ENTRY | |
; Prologue: save rp and %r3-%r9 and open a 192-byte frame. %r5/%r6 and
; %r7/%r8 are packed pairwise into one doubleword each before the store.
756 STW %r2,-20(%r30) ;offset 0x8ec | |
757 STW,MA %r3,192(%r30) ;offset 0x8f0 | |
758 STW %r4,-188(%r30) ;offset 0x8f4 | |
759 DEPD %r5,31,32,%r6 ;offset 0x8f8 | |
760 STD %r6,-184(%r30) ;offset 0x8fc | |
761 DEPD %r7,31,32,%r8 ;offset 0x900 | |
762 STD %r8,-176(%r30) ;offset 0x904 | |
763 STW %r9,-168(%r30) ;offset 0x908 | |
; %r3 = d, %r4 = h, %r5 = l; branch on d != 0, else return all ones.
764 LDD -248(%r30),%r3 ;offset 0x90c | |
765 COPY %r26,%r4 ;offset 0x910 | |
766 COPY %r24,%r5 ;offset 0x914 | |
767 DEPD %r25,31,32,%r4 ;offset 0x918 | |
768 CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c | |
769 DEPD %r23,31,32,%r5 ;offset 0x920 | |
770 MOVIB,TR -1,%r29,$00060002 ;offset 0x924 | |
771 EXTRD,U %r29,31,32,%r28 ;offset 0x928 | |
772 $0006002A | |
773 LDO -1(%r29),%r29 ;offset 0x92c | |
774 SUB %r23,%r7,%r23 ;offset 0x930 | |
775 $00060024 | |
776 SUB %r4,%r31,%r25 ;offset 0x934 | |
777 AND %r25,%r19,%r26 ;offset 0x938 | |
778 CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c | |
779 DEPD,Z %r25,31,32,%r20 ;offset 0x940 | |
780 OR %r20,%r24,%r21 ;offset 0x944 | |
781 CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948 | |
782 SUB %r31,%r2,%r31 ;offset 0x94c | |
783 $00060046 | |
784 $0006002E | |
785 DEPD,Z %r23,31,32,%r25 ;offset 0x950 | |
786 EXTRD,U %r23,31,32,%r26 ;offset 0x954 | |
787 AND %r25,%r19,%r24 ;offset 0x958 | |
788 ADD,L %r31,%r26,%r31 ;offset 0x95c | |
789 CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960 | |
790 LDO 1(%r31),%r31 ;offset 0x964 | |
791 $00060032 | |
792 CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968 | |
793 LDO -1(%r29),%r29 ;offset 0x96c | |
794 ADD,L %r4,%r3,%r4 ;offset 0x970 | |
795 $00060036 | |
; Pass counter: leave the loop through $D0 when %r8 reaches zero.
796 ADDIB,=,N -1,%r8,$D0 ;offset 0x974 | |
797 SUB %r5,%r24,%r28 ;offset 0x978 | |
798 $0006003A | |
799 SUB %r4,%r31,%r24 ;offset 0x97c | |
800 SHRPD %r24,%r28,32,%r4 ;offset 0x980 | |
801 DEPD,Z %r29,31,32,%r9 ;offset 0x984 | |
802 DEPD,Z %r28,31,32,%r5 ;offset 0x988 | |
803 $0006001C | |
804 EXTRD,U %r4,31,32,%r31 ;offset 0x98c | |
805 CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 | |
806 MOVB,TR %r6,%r29,$D1 ;offset 0x994 | |
807 STD %r29,-152(%r30) ;offset 0x998 | |
808 $0006000C | |
; d != 0: call BN_num_bits_word(d). The upper halves of %r3/%r4/%r5 are
; parked in %r9/%r8/%r7 around the call and re-deposited afterwards.
809 EXTRD,U %r3,31,32,%r25 ;offset 0x99c | |
810 COPY %r3,%r26 ;offset 0x9a0 | |
811 EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 | |
812 EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 | |
813 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28; | |
814 B,L BN_num_bits_word,%r2 ;offset 0x9ac | |
815 EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 | |
816 LDI 64,%r20 ;offset 0x9b4 | |
817 DEPD %r7,31,32,%r5 ;offset 0x9b8 | |
818 DEPD %r8,31,32,%r4 ;offset 0x9bc | |
819 DEPD %r9,31,32,%r3 ;offset 0x9c0 | |
820 CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 | |
821 COPY %r28,%r24 ;offset 0x9c8 | |
822 MTSARCM %r24 ;offset 0x9cc | |
823 DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0 | |
824 CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 | |
825 $00060012 | |
; %r31 = 64 - bits; reduce h by d once if h >= d.
826 SUBI 64,%r24,%r31 ;offset 0x9d8 | |
827 CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc | |
828 SUB %r4,%r3,%r4 ;offset 0x9e0 | |
829 $00060016 | |
; Normalise: when the shift count is non-zero, shift d, h:l left by it.
830 CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 | |
831 COPY %r0,%r9 ;offset 0x9e8 | |
832 MTSARCM %r31 ;offset 0x9ec | |
833 DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0 | |
834 SUBI 64,%r31,%r26 ;offset 0x9f4 | |
835 MTSAR %r26 ;offset 0x9f8 | |
836 SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc | |
837 MTSARCM %r31 ;offset 0xa00 | |
838 DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04 | |
839 $0006001A | |
840 DEPDI,Z -1,31,32,%r19 ;offset 0xa08 | |
841 AND %r3,%r19,%r29 ;offset 0xa0c | |
842 EXTRD,U %r29,31,32,%r2 ;offset 0xa10 | |
843 DEPDI,Z -1,63,32,%r6 ;offset 0xa14 | |
844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 | |
845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c | |
846 $D2 | |
847 ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 | |
848 ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 | |
849 ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 | |
850 ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; | |
851 ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c | |
852 ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 | |
853 .CALL ; | |
854 B,L abort,%r2 ;offset 0xa34 | |
855 NOP ;offset 0xa38 | |
856 B $D3 ;offset 0xa3c | |
857 LDW -212(%r30),%r2 ;offset 0xa40 | |
858 $00060020 | |
859 COPY %r4,%r26 ;offset 0xa44 | |
860 EXTRD,U %r4,31,32,%r25 ;offset 0xa48 | |
861 COPY %r2,%r24 ;offset 0xa4c | |
862 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) | |
863 B,L $$div2U,%r31 ;offset 0xa50 | |
864 EXTRD,U %r2,31,32,%r23 ;offset 0xa54 | |
865 DEPD %r28,31,32,%r29 ;offset 0xa58 | |
866 $00060022 | |
867 STD %r29,-152(%r30) ;offset 0xa5c | |
868 $D1 | |
869 AND %r5,%r19,%r24 ;offset 0xa60 | |
870 EXTRD,U %r24,31,32,%r24 ;offset 0xa64 | |
871 STW %r2,-160(%r30) ;offset 0xa68 | |
872 STW %r7,-128(%r30) ;offset 0xa6c | |
873 FLDD -152(%r30),%fr4 ;offset 0xa70 | |
874 FLDD -152(%r30),%fr7 ;offset 0xa74 | |
875 FLDW -160(%r30),%fr8L ;offset 0xa78 | |
876 FLDW -128(%r30),%fr5L ;offset 0xa7c | |
877 XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80 | |
878 FSTD %fr10,-136(%r30) ;offset 0xa84 | |
879 XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88 | |
880 FSTD %fr22,-144(%r30) ;offset 0xa8c | |
881 XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90 | |
882 XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94 | |
883 FSTD %fr11,-112(%r30) ;offset 0xa98 | |
884 FSTD %fr23,-120(%r30) ;offset 0xa9c | |
885 LDD -136(%r30),%r28 ;offset 0xaa0 | |
886 DEPD,Z %r28,31,32,%r31 ;offset 0xaa4 | |
887 LDD -144(%r30),%r20 ;offset 0xaa8 | |
888 ADD,L %r20,%r31,%r31 ;offset 0xaac | |
889 LDD -112(%r30),%r22 ;offset 0xab0 | |
890 DEPD,Z %r22,31,32,%r22 ;offset 0xab4 | |
891 LDD -120(%r30),%r21 ;offset 0xab8 | |
892 B $00060024 ;offset 0xabc | |
893 ADD,L %r21,%r22,%r23 ;offset 0xac0 | |
894 $D0 | |
895 OR %r9,%r29,%r29 ;offset 0xac4 | |
896 $00060040 | |
897 EXTRD,U %r29,31,32,%r28 ;offset 0xac8 | |
898 $00060002 | |
899 $L2 | |
; Epilogue: reload rp and the saved registers, unpacking the %r7/%r8 and
; %r5/%r6 pairs, then return and pop the frame in the delay slot.
900 LDW -212(%r30),%r2 ;offset 0xacc | |
901 $D3 | |
902 LDW -168(%r30),%r9 ;offset 0xad0 | |
903 LDD -176(%r30),%r8 ;offset 0xad4 | |
904 EXTRD,U %r8,31,32,%r7 ;offset 0xad8 | |
905 LDD -184(%r30),%r6 ;offset 0xadc | |
906 EXTRD,U %r6,31,32,%r5 ;offset 0xae0 | |
907 LDW -188(%r30),%r4 ;offset 0xae4 | |
908 BVE (%r2) ;offset 0xae8 | |
909 .EXIT | |
910 LDW,MB -192(%r30),%r3 ;offset 0xaec | |
911 .PROCEND ;in=23,25;out=28,29;fpin=105,107; | |
912 | |
913 | |
914 | |
915 | |
916 ;---------------------------------------------------------------------------- | |
917 ; | |
918 ; Registers to hold 64-bit values to manipulate. The "L" part | |
919 ; of the register corresponds to the upper 32-bits, while the "R" | |
920 ; part corresponds to the lower 32-bits | |
921 ; | |
922 ; Note, that when using b6 and b7, the code must save these before | |
923 ; using them because they are callee save registers | |
924 ; | |
925 ; | |
926 ; Floating point registers to use to save values that | |
927 ; are manipulated. These don't collide with ftemp1-6 and | |
928 ; are all caller save registers | |
929 ; | |
; Register names used by the macros that follow. Each 64-bit floating
; point register gets three names: the whole register, its left half
; ("L", upper 32 bits) and its right half ("R", lower 32 bits).
; a0-a7 and b0-b1 map onto %fr22-%fr31.
930 a0 .reg %fr22 | |
931 a0L .reg %fr22L | |
932 a0R .reg %fr22R | |
933 | |
934 a1 .reg %fr23 | |
935 a1L .reg %fr23L | |
936 a1R .reg %fr23R | |
937 | |
938 a2 .reg %fr24 | |
939 a2L .reg %fr24L | |
940 a2R .reg %fr24R | |
941 | |
942 a3 .reg %fr25 | |
943 a3L .reg %fr25L | |
944 a3R .reg %fr25R | |
945 | |
946 a4 .reg %fr26 | |
947 a4L .reg %fr26L | |
948 a4R .reg %fr26R | |
949 | |
950 a5 .reg %fr27 | |
951 a5L .reg %fr27L | |
952 a5R .reg %fr27R | |
953 | |
954 a6 .reg %fr28 | |
955 a6L .reg %fr28L | |
956 a6R .reg %fr28R | |
957 | |
958 a7 .reg %fr29 | |
959 a7L .reg %fr29L | |
960 a7R .reg %fr29R | |
961 | |
962 b0 .reg %fr30 | |
963 b0L .reg %fr30L | |
964 b0R .reg %fr30R | |
965 | |
966 b1 .reg %fr31 | |
967 b1L .reg %fr31L | |
968 b1R .reg %fr31R | |
969 | |
970 ; | |
971 ; Temporary floating point variables, these are all caller save | |
972 ; registers | |
973 ; | |
974 ftemp1 .reg %fr4 | |
975 ftemp2 .reg %fr5 | |
976 ftemp3 .reg %fr6 | |
977 ftemp4 .reg %fr7 | |
978 | |
979 ; | |
980 ; The B set of registers when used. | |
981 ; | |
982 | |
983 b2 .reg %fr8 | |
984 b2L .reg %fr8L | |
985 b2R .reg %fr8R | |
986 | |
987 b3 .reg %fr9 | |
988 b3L .reg %fr9L | |
989 b3R .reg %fr9R | |
990 | |
991 b4 .reg %fr10 | |
992 b4L .reg %fr10L | |
993 b4R .reg %fr10R | |
994 | |
995 b5 .reg %fr11 | |
996 b5L .reg %fr11L | |
997 b5R .reg %fr11R | |
998 | |
; b6 and b7 (%fr12, %fr13) fall in the callee-save range fr12-fr21.
999 b6 .reg %fr12 | |
1000 b6L .reg %fr12L | |
1001 b6R .reg %fr12R | |
1002 | |
1003 b7 .reg %fr13 | |
1004 b7L .reg %fr13L | |
1005 b7R .reg %fr13R | |
1006 | |
; Integer registers. m1 is %r28, the same register as %ret0. ht, lt, m
; and c3 (%r6, %r5, %r4, %r3) fall in the callee-save range r3-r18.
1007 c1 .reg %r21 ; only reg | |
1008 temp1 .reg %r20 ; only reg | |
1009 temp2 .reg %r19 ; only reg | |
1010 temp3 .reg %r31 ; only reg | |
1011 | |
1012 m1 .reg %r28 | |
1013 c2 .reg %r23 | |
1014 high_one .reg %r1 | |
1015 ht .reg %r6 | |
1016 lt .reg %r5 | |
1017 m .reg %r4 | |
1018 c3 .reg %r3 | |
1019 | |
;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Adds the 128-bit square of the 64-bit value A0L:A0R (upper:lower
; 32-bit halves of one FP register) into the accumulator C3:C2:C1.
;
; The three 32x32 partial products are moved from the FP registers to
; the general registers through the scratch slots at -24..-1(%sp).
; Requires high_mask to be set up by the caller.
; Clobbers ftemp1-ftemp3, m, lt, ht, temp1-temp3.
;
1020 SQR_ADD_C .macro A0L,A0R,C1,C2,C3 | |
1021 XMPYU A0L,A0R,ftemp1 ; m = high * low (cross product) | |
1022 FSTD ftemp1,-24(%sp) ; store m | |
1023 | |
1024 XMPYU A0R,A0R,ftemp2 ; lt = low * low | |
1025 FSTD ftemp2,-16(%sp) ; store lt | |
1026 | |
1027 XMPYU A0L,A0L,ftemp3 ; ht = high * high | |
1028 FSTD ftemp3,-8(%sp) ; store ht | |
1029 | |
1030 LDD -24(%sp),m ; load m | |
1031 AND m,high_mask,temp2 ; m & Mask (keep the bits that land in ht) | |
1032 DEPD,Z m,30,31,temp3 ; temp3 = m << 32+1 (cross product counted twice) | |
1033 LDD -16(%sp),lt ; lt | |
1034 | |
1035 LDD -8(%sp),ht ; ht | |
1036 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 | |
1037 ADD temp3,lt,lt ; lt = lt + (m << 33), sets carry | |
1038 ADD,L ht,temp1,ht ; ht += temp1 (logical add, carry untouched) | |
1039 ADD,DC ht,%r0,ht ; ht += carry from the lt addition | |
1040 | |
1041 ADD C1,lt,C1 ; c1=c1+lt | |
1042 ADD,DC ht,%r0,ht ; ht += carry | |
1043 | |
1044 ADD C2,ht,C2 ; c2=c2+ht | |
1045 ADD,DC C3,%r0,C3 ; c3 += carry | |
1046 .endm | |
1047 | |
;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Adds twice the 128-bit product of the two 64-bit values A0L:A0R and
; A1L:A1R into the accumulator C3:C2:C1.
;
; The four 32x32 partial products are moved from the FP registers to
; the general registers through the scratch slots at -32..-1(%sp).
; Requires high_one (1 << 32) to be set up by the caller.
; Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3.
;
1048 SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 | |
1049 XMPYU A0L,A1R,ftemp1 ; m1 = A0 high * A1 low | |
1050 FSTD ftemp1,-16(%sp) ; | |
1051 XMPYU A0R,A1L,ftemp2 ; m = A0 low * A1 high | |
1052 FSTD ftemp2,-8(%sp) ; | |
1053 XMPYU A0R,A1R,ftemp3 ; lt = A0 low * A1 low | |
1054 FSTD ftemp3,-32(%sp) | |
1055 XMPYU A0L,A1L,ftemp4 ; ht = A0 high * A1 high | |
1056 FSTD ftemp4,-24(%sp) ; | |
1057 | |
1058 LDD -8(%sp),m ; m | |
1059 LDD -16(%sp),m1 ; m1 | |
1060 ADD,L m,m1,m ; m+m1 (may wrap; checked below) | |
1061 | |
1062 DEPD,Z m,31,32,temp3 ; (m+m1<<32) | |
1063 LDD -24(%sp),ht ; ht | |
1064 | |
1065 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. m+m1 wrapped | |
1066 ADD,L ht,high_one,ht ; ht+=high_one | |
1067 | |
1068 EXTRD,U m,31,32,temp1 ; m >> 32 | |
1069 LDD -32(%sp),lt ; lt | |
1070 ADD,L ht,temp1,ht ; ht+= m>>32 | |
1071 ADD lt,temp3,lt ; lt = lt + (m << 32), sets carry | |
1072 ADD,DC ht,%r0,ht ; ht += carry | |
1073 | |
1074 ADD ht,ht,ht ; ht=ht+ht; | |
1075 ADD,DC C3,%r0,C3 ; add in carry (c3++) | |
1076 | |
1077 ADD lt,lt,lt ; lt=lt+lt; | |
1078 ADD,DC ht,%r0,ht ; add in carry (ht++) | |
1079 | |
1080 ADD C1,lt,C1 ; c1=c1+lt | |
1081 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) | |
1082 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise | |
1083 | |
1084 ADD C2,ht,C2 ; c2 = c2 + ht | |
1085 ADD,DC C3,%r0,C3 ; add in carry (c3++) | |
1086 .endm | |
1087 | |
1088 ; | |
1089 ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | |
1090 ; arg0 = r_ptr | |
1091 ; arg1 = a_ptr | |
; Squares the eight 64-bit words a[0..7] and stores the sixteen-word
; result in r[0..15]. The product is built one result word at a time:
; c1/c2/c3 rotate as a three-word accumulator, each finished word is
; stored and its register is cleared to become the new top word.
; Callee save registers r3-r6 are kept in the first 32 bytes of the
; 128-byte frame; the macros use -32..-1(%sp) as scratch.
1092 ; | |
1093 | |
1094 bn_sqr_comba8 | |
1095 .PROC | |
1096 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1097 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1098 .ENTRY | |
1099 .align 64 | |
1100 | |
1101 STD %r3,0(%sp) ; save r3 | |
1102 STD %r4,8(%sp) ; save r4 | |
1103 STD %r5,16(%sp) ; save r5 | |
1104 STD %r6,24(%sp) ; save r6 | |
1105 | |
1106 ; | |
1107 ; Zero out carries | |
1108 ; | |
1109 COPY %r0,c1 | |
1110 COPY %r0,c2 | |
1111 COPY %r0,c3 | |
1112 | |
1113 LDO 128(%sp),%sp ; bump stack (allocate the 128-byte frame) | |
1114 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | |
1115 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | |
1116 | |
1117 ; | |
1118 ; Load up all of the values we are going to use | |
1119 ; | |
1120 FLDD 0(a_ptr),a0 | |
1121 FLDD 8(a_ptr),a1 | |
1122 FLDD 16(a_ptr),a2 | |
1123 FLDD 24(a_ptr),a3 | |
1124 FLDD 32(a_ptr),a4 | |
1125 FLDD 40(a_ptr),a5 | |
1126 FLDD 48(a_ptr),a6 | |
1127 FLDD 56(a_ptr),a7 | |
1128 | |
1129 SQR_ADD_C a0L,a0R,c1,c2,c3 | |
1130 STD c1,0(r_ptr) ; r[0] = c1; | |
1131 COPY %r0,c1 | |
1132 | |
1133 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | |
1134 STD c2,8(r_ptr) ; r[1] = c2; | |
1135 COPY %r0,c2 | |
1136 | |
1137 SQR_ADD_C a1L,a1R,c3,c1,c2 | |
1138 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | |
1139 STD c3,16(r_ptr) ; r[2] = c3; | |
1140 COPY %r0,c3 | |
1141 | |
1142 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | |
1143 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | |
1144 STD c1,24(r_ptr) ; r[3] = c1; | |
1145 COPY %r0,c1 | |
1146 | |
1147 SQR_ADD_C a2L,a2R,c2,c3,c1 | |
1148 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | |
1149 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 | |
1150 STD c2,32(r_ptr) ; r[4] = c2; | |
1151 COPY %r0,c2 | |
1152 | |
1153 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 | |
1154 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 | |
1155 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | |
1156 STD c3,40(r_ptr) ; r[5] = c3; | |
1157 COPY %r0,c3 | |
1158 | |
1159 SQR_ADD_C a3L,a3R,c1,c2,c3 | |
1160 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 | |
1161 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 | |
1162 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 | |
1163 STD c1,48(r_ptr) ; r[6] = c1; | |
1164 COPY %r0,c1 | |
1165 | |
1166 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 | |
1167 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 | |
1168 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 | |
1169 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 | |
1170 STD c2,56(r_ptr) ; r[7] = c2; | |
1171 COPY %r0,c2 | |
1172 | |
1173 SQR_ADD_C a4L,a4R,c3,c1,c2 | |
1174 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 | |
1175 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 | |
1176 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 | |
1177 STD c3,64(r_ptr) ; r[8] = c3; | |
1178 COPY %r0,c3 | |
1179 | |
1180 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 | |
1181 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 | |
1182 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 | |
1183 STD c1,72(r_ptr) ; r[9] = c1; | |
1184 COPY %r0,c1 | |
1185 | |
1186 SQR_ADD_C a5L,a5R,c2,c3,c1 | |
1187 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 | |
1188 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 | |
1189 STD c2,80(r_ptr) ; r[10] = c2; | |
1190 COPY %r0,c2 | |
1191 | |
1192 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 | |
1193 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 | |
1194 STD c3,88(r_ptr) ; r[11] = c3; | |
1195 COPY %r0,c3 | |
1196 | |
1197 SQR_ADD_C a6L,a6R,c1,c2,c3 | |
1198 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 | |
1199 STD c1,96(r_ptr) ; r[12] = c1; | |
1200 COPY %r0,c1 | |
1201 | |
1202 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 | |
1203 STD c2,104(r_ptr) ; r[13] = c2; | |
1204 COPY %r0,c2 | |
1205 | |
1206 SQR_ADD_C a7L,a7R,c3,c1,c2 | |
1207 STD c3, 112(r_ptr) ; r[14] = c3 | |
1208 STD c1, 120(r_ptr) ; r[15] = c1 | |
1209 | |
1210 .EXIT | |
1211 LDD -104(%sp),%r6 ; restore r6 (saved at 24 of the frame base) | |
1212 LDD -112(%sp),%r5 ; restore r5 | |
1213 LDD -120(%sp),%r4 ; restore r4 | |
1214 BVE (%rp) | |
1215 LDD,MB -128(%sp),%r3 ; pop the frame, then restore r3 (follows the branch) | |
1216 | |
1217 .PROCEND | |
1218 | |
1219 ;----------------------------------------------------------------------------- | |
1220 ; | |
1221 ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | |
1222 ; arg0 = r_ptr | |
1223 ; arg1 = a_ptr | |
; Squares the four 64-bit words a[0..3] and stores the eight-word
; result in r[0..7]. c1/c2/c3 rotate as a three-word accumulator: each
; finished result word is stored and its register is cleared to become
; the new top word.
; Callee save registers r3-r6 are kept in the first 32 bytes of the
; 128-byte frame; the macros use -32..-1(%sp) as scratch.
1224 ; | |
1225 | |
1226 bn_sqr_comba4 | |
1227 .proc | |
1228 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1229 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1230 .entry | |
1231 .align 64 | |
1232 STD %r3,0(%sp) ; save r3 | |
1233 STD %r4,8(%sp) ; save r4 | |
1234 STD %r5,16(%sp) ; save r5 | |
1235 STD %r6,24(%sp) ; save r6 | |
1236 | |
1237 ; | |
1238 ; Zero out carries | |
1239 ; | |
1240 COPY %r0,c1 | |
1241 COPY %r0,c2 | |
1242 COPY %r0,c3 | |
1243 | |
1244 LDO 128(%sp),%sp ; bump stack (allocate the 128-byte frame) | |
1245 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | |
1246 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | |
1247 | |
1248 ; | |
1249 ; Load up all of the values we are going to use | |
; Only a[0..3] exist for the 4-word variant; loading a4-a7 here would
; read 32 bytes past the end of the input array.
1250 ; | |
1251 FLDD 0(a_ptr),a0 | |
1252 FLDD 8(a_ptr),a1 | |
1253 FLDD 16(a_ptr),a2 | |
1254 FLDD 24(a_ptr),a3 | |
1259 | |
1260 SQR_ADD_C a0L,a0R,c1,c2,c3 | |
1261 | |
1262 STD c1,0(r_ptr) ; r[0] = c1; | |
1263 COPY %r0,c1 | |
1264 | |
1265 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | |
1266 | |
1267 STD c2,8(r_ptr) ; r[1] = c2; | |
1268 COPY %r0,c2 | |
1269 | |
1270 SQR_ADD_C a1L,a1R,c3,c1,c2 | |
1271 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | |
1272 | |
1273 STD c3,16(r_ptr) ; r[2] = c3; | |
1274 COPY %r0,c3 | |
1275 | |
1276 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | |
1277 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | |
1278 | |
1279 STD c1,24(r_ptr) ; r[3] = c1; | |
1280 COPY %r0,c1 | |
1281 | |
1282 SQR_ADD_C a2L,a2R,c2,c3,c1 | |
1283 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | |
1284 | |
1285 STD c2,32(r_ptr) ; r[4] = c2; | |
1286 COPY %r0,c2 | |
1287 | |
1288 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | |
1289 STD c3,40(r_ptr) ; r[5] = c3; | |
1290 COPY %r0,c3 | |
1291 | |
1292 SQR_ADD_C a3L,a3R,c1,c2,c3 | |
1293 STD c1,48(r_ptr) ; r[6] = c1; | |
1294 STD c2,56(r_ptr) ; r[7] = c2; | |
1295 | |
1296 .EXIT | |
1297 LDD -104(%sp),%r6 ; restore r6 (saved at 24 of the frame base) | |
1298 LDD -112(%sp),%r5 ; restore r5 | |
1299 LDD -120(%sp),%r4 ; restore r4 | |
1300 BVE (%rp) | |
1301 LDD,MB -128(%sp),%r3 ; pop the frame, then restore r3 (follows the branch) | |
1302 | |
1303 .PROCEND | |
1304 | |
1305 | |
1306 ;--------------------------------------------------------------------------- | |
1307 | |
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Adds the 128-bit product of the two 64-bit values A0L:A0R and
; B0L:B0R into the accumulator C3:C2:C1.
;
; The four 32x32 partial products are moved from the FP registers to
; the general registers through the scratch slots at -32..-1(%sp).
; Requires high_one (1 << 32) to be set up by the caller.
; Clobbers ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3.
;
1308 MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 | |
1309 XMPYU A0L,B0R,ftemp1 ; m1 = A high * B low | |
1310 FSTD ftemp1,-16(%sp) ; | |
1311 XMPYU A0R,B0L,ftemp2 ; m = A low * B high | |
1312 FSTD ftemp2,-8(%sp) ; | |
1313 XMPYU A0R,B0R,ftemp3 ; lt = A low * B low | |
1314 FSTD ftemp3,-32(%sp) | |
1315 XMPYU A0L,B0L,ftemp4 ; ht = A high * B high | |
1316 FSTD ftemp4,-24(%sp) ; | |
1317 | |
1318 LDD -8(%sp),m ; m | |
1319 LDD -16(%sp),m1 ; m1 | |
1320 ADD,L m,m1,m ; m+m1 (may wrap; checked below) | |
1321 | |
1322 DEPD,Z m,31,32,temp3 ; (m+m1<<32) | |
1323 LDD -24(%sp),ht ; ht | |
1324 | |
1325 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. m+m1 wrapped | |
1326 ADD,L ht,high_one,ht ; ht+=high_one | |
1327 | |
1328 EXTRD,U m,31,32,temp1 ; m >> 32 | |
1329 LDD -32(%sp),lt ; lt | |
1330 ADD,L ht,temp1,ht ; ht+= m>>32 | |
1331 ADD lt,temp3,lt ; lt = lt + (m << 32), sets carry | |
1332 ADD,DC ht,%r0,ht ; ht += carry | |
1333 | |
1334 ADD C1,lt,C1 ; c1=c1+lt | |
1335 ADD,DC ht,%r0,ht ; ht += carry out of c1+lt | |
1336 | |
1337 ADD C2,ht,C2 ; c2 = c2 + ht | |
1338 ADD,DC C3,%r0,C3 ; add in carry (c3++) | |
1339 .endm | |
1340 | |
1341 | |
1342 ; | |
1343 ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
1344 ; arg0 = r_ptr | |
1345 ; arg1 = a_ptr | |
1346 ; arg2 = b_ptr | |
; Multiplies the eight 64-bit words a[0..7] by b[0..7] and stores the
; sixteen-word product in r[0..15]. The product is built one result
; word at a time: c1/c2/c3 rotate as a three-word accumulator, each
; finished word is stored and its register is cleared to become the
; new top word.
; Callee save registers r3-r6 and fr12/fr13 (b6/b7) are kept in the
; first 48 bytes of the 128-byte frame; MUL_ADD_C uses -32..-1(%sp)
; as scratch.
1347 ; | |
1348 | |
1349 bn_mul_comba8 | |
1350 .proc | |
1351 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1352 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1353 .entry | |
1354 .align 64 | |
1355 | |
1356 STD %r3,0(%sp) ; save r3 | |
1357 STD %r4,8(%sp) ; save r4 | |
1358 STD %r5,16(%sp) ; save r5 | |
1359 STD %r6,24(%sp) ; save r6 | |
1360 FSTD %fr12,32(%sp) ; save fr12 (b6, callee save) | |
1361 FSTD %fr13,40(%sp) ; save fr13 (b7, callee save) | |
1362 | |
1363 ; | |
1364 ; Zero out carries | |
1365 ; | |
1366 COPY %r0,c1 | |
1367 COPY %r0,c2 | |
1368 COPY %r0,c3 | |
1369 | |
1370 LDO 128(%sp),%sp ; bump stack (allocate the 128-byte frame) | |
1371 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | |
1372 | |
1373 ; | |
1374 ; Load up all of the values we are going to use | |
1375 ; | |
1376 FLDD 0(a_ptr),a0 | |
1377 FLDD 8(a_ptr),a1 | |
1378 FLDD 16(a_ptr),a2 | |
1379 FLDD 24(a_ptr),a3 | |
1380 FLDD 32(a_ptr),a4 | |
1381 FLDD 40(a_ptr),a5 | |
1382 FLDD 48(a_ptr),a6 | |
1383 FLDD 56(a_ptr),a7 | |
1384 | |
1385 FLDD 0(b_ptr),b0 | |
1386 FLDD 8(b_ptr),b1 | |
1387 FLDD 16(b_ptr),b2 | |
1388 FLDD 24(b_ptr),b3 | |
1389 FLDD 32(b_ptr),b4 | |
1390 FLDD 40(b_ptr),b5 | |
1391 FLDD 48(b_ptr),b6 | |
1392 FLDD 56(b_ptr),b7 | |
1393 | |
1394 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | |
1395 STD c1,0(r_ptr) | |
1396 COPY %r0,c1 | |
1397 | |
1398 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | |
1399 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | |
1400 STD c2,8(r_ptr) | |
1401 COPY %r0,c2 | |
1402 | |
1403 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | |
1404 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | |
1405 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | |
1406 STD c3,16(r_ptr) | |
1407 COPY %r0,c3 | |
1408 | |
1409 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | |
1410 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | |
1411 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | |
1412 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | |
1413 STD c1,24(r_ptr) | |
1414 COPY %r0,c1 | |
1415 | |
1416 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 | |
1417 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | |
1418 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | |
1419 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | |
1420 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 | |
1421 STD c2,32(r_ptr) | |
1422 COPY %r0,c2 | |
1423 | |
1424 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 | |
1425 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 | |
1426 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | |
1427 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | |
1428 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 | |
1429 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 | |
1430 STD c3,40(r_ptr) | |
1431 COPY %r0,c3 | |
1432 | |
1433 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 | |
1434 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 | |
1435 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 | |
1436 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | |
1437 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 | |
1438 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 | |
1439 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 | |
1440 STD c1,48(r_ptr) | |
1441 COPY %r0,c1 | |
1442 | |
1443 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 | |
1444 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 | |
1445 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 | |
1446 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 | |
1447 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 | |
1448 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 | |
1449 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 | |
1450 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 | |
1451 STD c2,56(r_ptr) | |
1452 COPY %r0,c2 | |
1453 | |
1454 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 | |
1455 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 | |
1456 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 | |
1457 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 | |
1458 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 | |
1459 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 | |
1460 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 | |
1461 STD c3,64(r_ptr) | |
1462 COPY %r0,c3 | |
1463 | |
1464 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 | |
1465 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 | |
1466 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 | |
1467 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 | |
1468 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 | |
1469 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 | |
1470 STD c1,72(r_ptr) | |
1471 COPY %r0,c1 | |
1472 | |
1473 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 | |
1474 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 | |
1475 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 | |
1476 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 | |
1477 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 | |
1478 STD c2,80(r_ptr) | |
1479 COPY %r0,c2 | |
1480 | |
1481 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 | |
1482 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 | |
1483 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 | |
1484 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 | |
1485 STD c3,88(r_ptr) | |
1486 COPY %r0,c3 | |
1487 | |
1488 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 | |
1489 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 | |
1490 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 | |
1491 STD c1,96(r_ptr) | |
1492 COPY %r0,c1 | |
1493 | |
1494 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 | |
1495 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 | |
1496 STD c2,104(r_ptr) | |
1497 COPY %r0,c2 | |
1498 | |
1499 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 | |
1500 STD c3,112(r_ptr) | |
1501 STD c1,120(r_ptr) | |
1502 | |
1503 .EXIT | |
1504 FLDD -88(%sp),%fr13 ; restore fr13 (saved at 40 of the frame base) | |
1505 FLDD -96(%sp),%fr12 ; restore fr12 (saved at 32 of the frame base) | |
1506 LDD -104(%sp),%r6 ; restore r6 | |
1507 LDD -112(%sp),%r5 ; restore r5 | |
1508 LDD -120(%sp),%r4 ; restore r4 | |
1509 BVE (%rp) | |
1510 LDD,MB -128(%sp),%r3 ; pop the frame, then restore r3 (follows the branch) | |
1511 | |
1512 .PROCEND | |
1513 | |
1514 ;----------------------------------------------------------------------------- | |
1515 ; | |
1516 ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
1517 ; arg0 = r_ptr | |
1518 ; arg1 = a_ptr | |
1519 ; arg2 = b_ptr | |
; Multiplies the four 64-bit words a[0..3] by b[0..3] and stores the
; eight-word product in r[0..7]. c1/c2/c3 rotate as a three-word
; accumulator: each finished result word is stored and its register is
; cleared to become the new top word.
; Callee save registers r3-r6 are kept at the base of the 128-byte
; frame; MUL_ADD_C uses -32..-1(%sp) as scratch.
; NOTE(review): fr12/fr13 (b6/b7) are saved and restored here but are
; not referenced by this routine -- confirm whether that is intended.
1520 ; | |
1521 | |
1522 bn_mul_comba4 | |
1523 .proc | |
1524 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1525 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1526 .entry | |
1527 .align 64 | |
1528 | |
1529 STD %r3,0(%sp) ; save r3 | |
1530 STD %r4,8(%sp) ; save r4 | |
1531 STD %r5,16(%sp) ; save r5 | |
1532 STD %r6,24(%sp) ; save r6 | |
1533 FSTD %fr12,32(%sp) ; save fr12 (callee save) | |
1534 FSTD %fr13,40(%sp) ; save fr13 (callee save) | |
1535 | |
1536 ; | |
1537 ; Zero out carries | |
1538 ; | |
1539 COPY %r0,c1 | |
1540 COPY %r0,c2 | |
1541 COPY %r0,c3 | |
1542 | |
1543 LDO 128(%sp),%sp ; bump stack (allocate the 128-byte frame) | |
1544 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | |
1545 | |
1546 ; | |
1547 ; Load up all of the values we are going to use | |
1548 ; | |
1549 FLDD 0(a_ptr),a0 | |
1550 FLDD 8(a_ptr),a1 | |
1551 FLDD 16(a_ptr),a2 | |
1552 FLDD 24(a_ptr),a3 | |
1553 | |
1554 FLDD 0(b_ptr),b0 | |
1555 FLDD 8(b_ptr),b1 | |
1556 FLDD 16(b_ptr),b2 | |
1557 FLDD 24(b_ptr),b3 | |
1558 | |
1559 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | |
1560 STD c1,0(r_ptr) | |
1561 COPY %r0,c1 | |
1562 | |
1563 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | |
1564 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | |
1565 STD c2,8(r_ptr) | |
1566 COPY %r0,c2 | |
1567 | |
1568 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | |
1569 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | |
1570 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | |
1571 STD c3,16(r_ptr) | |
1572 COPY %r0,c3 | |
1573 | |
1574 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | |
1575 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | |
1576 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | |
1577 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | |
1578 STD c1,24(r_ptr) | |
1579 COPY %r0,c1 | |
1580 | |
1581 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | |
1582 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | |
1583 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | |
1584 STD c2,32(r_ptr) | |
1585 COPY %r0,c2 | |
1586 | |
1587 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | |
1588 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | |
1589 STD c3,40(r_ptr) | |
1590 COPY %r0,c3 | |
1591 | |
1592 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | |
1593 STD c1,48(r_ptr) | |
1594 STD c2,56(r_ptr) | |
1595 | |
1596 .EXIT | |
1597 FLDD -88(%sp),%fr13 ; restore fr13 (saved at 40 of the frame base) | |
1598 FLDD -96(%sp),%fr12 ; restore fr12 (saved at 32 of the frame base) | |
1599 LDD -104(%sp),%r6 ; restore r6 | |
1600 LDD -112(%sp),%r5 ; restore r5 | |
1601 LDD -120(%sp),%r4 ; restore r4 | |
1602 BVE (%rp) | |
1603 LDD,MB -128(%sp),%r3 ; pop the frame, then restore r3 (follows the branch) | |
1604 | |
1605 .PROCEND | |
1606 | |
1607 | |
1608 ;--- not PIC .SPACE $TEXT$ | |
1609 ;--- not PIC .SUBSPA $CODE$ | |
1610 ;--- not PIC .SPACE $PRIVATE$,SORT=16 | |
1611 ;--- not PIC .IMPORT $global$,DATA | |
1612 ;--- not PIC .SPACE $TEXT$ | |
1613 ;--- not PIC .SUBSPA $CODE$ | |
1614 ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c | |
1615 ;--- not PIC C$7 | |
1616 ;--- not PIC .ALIGN 8 | |
1617 ;--- not PIC .STRINGZ "Division would overflow (%d)\n" | |
1618 .END | |
OLD | NEW |