OLD | NEW |
| (Empty) |
1 ; | |
2 ; PA-RISC 64-bit implementation of bn_asm code | |
3 ; | |
4 ; This code is approximately 2x faster than the C version | |
5 ; for RSA/DSA. | |
6 ; | |
7 ; See http://devresource.hp.com/ for more details on the PA-RISC | |
8 ; architecture. Also see the book "PA-RISC 2.0 Architecture" | |
9 ; by Gerry Kane for information on the instruction set architecture. | |
10 ; | |
11 ; Code written by Chris Ruemmler (with some help from the HP C | |
12 ; compiler). | |
13 ; | |
14 ; The code compiles with HP's assembler | |
15 ; | |
16 | |
17 .level 2.0W | |
18 .space $TEXT$ | |
19 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY | |
20 | |
21 ; | |
22 ; Global Register definitions used for the routines. | |
23 ; | |
24 ; Some information about HP's runtime architecture for 64-bits. | |
25 ; | |
26 ; "Caller save" means the calling function must save the register | |
27 ; if it wants the register to be preserved. | |
28 ; "Callee save" means if a function uses the register, it must save | |
29 ; the value before using it. | |
30 ; | |
31 ; For the floating point registers | |
32 ; | |
33 ; "caller save" registers: fr4-fr11, fr22-fr31 | |
34 ; "callee save" registers: fr12-fr21 | |
35 ; "special" registers: fr0-fr3 (status and exception registers) | |
36 ; | |
37 ; For the integer registers | |
38 ; value zero : r0 | |
39 ; "caller save" registers: r1,r19-r26 | |
40 ; "callee save" registers: r3-r18 | |
41 ; return register : r2 (rp) | |
42 ; return values ; r28 (ret0,ret1) | |
43 ; Stack pointer ; r30 (sp) | |
44 ; global data pointer ; r27 (dp) | |
45 ; argument pointer ; r29 (ap) | |
46 ; millicode return ptr ; r31 (also a caller save register) | |
47 | |
48 | |
49 ; | |
50 ; Arguments to the routines | |
51 ; | |
; Symbolic names for the four argument registers (arg0..arg3 = %r26..%r23).
; Two pairs alias the same register on purpose: %r24 is "num" for the
; multiply/square routines and "b_ptr" for add/sub, and %r23 is "w" for the
; multiply routines and "n" for add/sub (see each routine's argument list).
52 r_ptr .reg %r26 | |
53 a_ptr .reg %r25 | |
54 b_ptr .reg %r24 | |
55 num .reg %r24 | |
56 w .reg %r23 | |
57 n .reg %r23 | |
58 | |
59 | |
60 ; | |
61 ; Globals used in some routines (each routine loads them itself | |
62 ; with DEPDI before use; they are not preserved across routines) | |
63 | |
64 top_overflow .reg %r29 | |
65 high_mask .reg %r22 ; value 0xffffffff80000000L | |
66 | |
67 | |
68 ;------------------------------------------------------------------------------ | |
69 ; | |
70 ; bn_mul_add_words | |
71 ; | |
72 ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, | |
73 ; int num, BN_ULONG w) | |
74 ; | |
75 ; arg0 = r_ptr | |
76 ; arg1 = a_ptr | |
77 ; arg2 = num | |
78 ; arg3 = w | |
79 ; | |
80 ; Local register definitions | |
81 ; | |
82 | |
; FP registers that receive the four 32x32 partial products for word 0
; (no suffix) and word 1 (_1 suffix) of the unrolled loop.
83 fm1 .reg %fr22 | |
84 fm .reg %fr23 | |
85 ht_temp .reg %fr24 | |
86 ht_temp_1 .reg %fr25 | |
87 lt_temp .reg %fr26 | |
88 lt_temp_1 .reg %fr27 | |
89 fm1_1 .reg %fr28 | |
90 fm_1 .reg %fr29 | |
91 | |
; w as a 64-bit FP value; the L half is the upper 32 bits, R the lower.
92 fw_h .reg %fr7L | |
93 fw_l .reg %fr7R | |
94 fw .reg %fr7 | |
95 | |
; a[0] as a 64-bit FP value, split into high (L) and low (R) halves.
96 fht_0 .reg %fr8L | |
97 flt_0 .reg %fr8R | |
98 t_float_0 .reg %fr8 | |
99 | |
; a[1] as a 64-bit FP value, split into high (L) and low (R) halves.
100 fht_1 .reg %fr9L | |
101 flt_1 .reg %fr9R | |
102 t_float_1 .reg %fr9 | |
103 | |
; Integer working registers. %r3-%r9 are callee-save and are spilled in the
; prologue of each routine that uses them.
104 tmp_0 .reg %r31 | |
105 tmp_1 .reg %r21 | |
106 m_0 .reg %r20 | |
107 m_1 .reg %r19 | |
108 ht_0 .reg %r1 | |
109 ht_1 .reg %r3 | |
110 lt_0 .reg %r4 | |
111 lt_1 .reg %r5 | |
112 m1_0 .reg %r6 | |
113 m1_1 .reg %r7 | |
114 rp_val .reg %r8 | |
115 rp_val_1 .reg %r9 | |
116 | |
; bn_mul_add_words: for each of the num 64-bit words computes
; a_ptr[i]*w + r_ptr[i] + carry, stores the low 64 bits to r_ptr[i] and keeps
; the high 64 bits as the carry into the next word. The final carry is
; returned in %ret0 (0 when num <= 0).
;
; Each 64x64 multiply is assembled from four 32x32 XMPYU products, which are
; moved from the FP registers to the integer registers through stack slots
; -8..-64(%sp). Words are processed two per iteration, with a single-word
; tail for an odd count.
117 bn_mul_add_words | |
118 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN | |
119 .proc | |
120 .callinfo frame=128 | |
121 .entry | |
122 .align 64 | |
123 | |
124 STD %r3,0(%sp) ; save r3 | |
125 STD %r4,8(%sp) ; save r4 | |
126 NOP ; Needed to make the loop 16-byte aligned | |
127 NOP ; Needed to make the loop 16-byte aligned | |
128 | |
129 STD %r5,16(%sp) ; save r5 | |
130 STD %r6,24(%sp) ; save r6 | |
131 STD %r7,32(%sp) ; save r7 | |
132 STD %r8,40(%sp) ; save r8 | |
133 | |
134 STD %r9,48(%sp) ; save r9 | |
135 COPY %r0,%ret0 ; return 0 by default | |
136 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | |
137 STD w,56(%sp) ; store w on stack so it can be reloaded into an FP register | |
138 | |
139 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit | |
140 LDO 128(%sp),%sp ; bump stack (saves are now at negative offsets) | |
141 | |
142 ; | |
143 ; The loop is unrolled twice, so if there is only 1 number | |
144 ; then go straight to the cleanup code. | |
145 ; | |
146 CMPIB,= 1,num,bn_mul_add_words_single_top | |
147 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l); -72 = 56-128 | |
148 | |
149 ; | |
150 ; This loop is unrolled 2 times (64-byte aligned as well) | |
151 ; | |
152 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | |
153 ; two 32-bit multiplies can be issued per cycle. | |
154 ; | |
155 bn_mul_add_words_unroll2 | |
156 | |
157 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value a[0] (fr8) ht(L)/lt(R) | |
158 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value a[1] (fr9) ht(L)/lt(R) | |
159 LDD 0(r_ptr),rp_val ; rp[0] | |
160 LDD 8(r_ptr),rp_val_1 ; rp[1] | |
161 | |
162 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | |
163 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | |
164 FSTD fm1,-16(%sp) ; -16(sp) = m1[0] | |
165 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] | |
166 | |
167 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h | |
168 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h | |
169 FSTD fm,-8(%sp) ; -8(sp) = m[0] | |
170 FSTD fm_1,-40(%sp) ; -40(sp) = m[1] | |
171 | |
172 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | |
173 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h | |
174 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp | |
175 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 | |
176 | |
177 XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l | |
178 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l | |
179 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp | |
180 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 | |
181 | |
182 LDD -8(%sp),m_0 ; m[0] | |
183 LDD -40(%sp),m_1 ; m[1] | |
184 LDD -16(%sp),m1_0 ; m1[0] | |
185 LDD -48(%sp),m1_1 ; m1[1] | |
186 | |
187 LDD -24(%sp),ht_0 ; ht[0] | |
188 LDD -56(%sp),ht_1 ; ht[1] | |
189 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; | |
190 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; | |
191 | |
192 LDD -32(%sp),lt_0 | |
193 LDD -64(%sp),lt_1 | |
194 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (tmp_0 < m1[0]), i.e. the sum wrapped | |
195 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) | |
196 | |
197 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (tmp_1 < m1[1]), i.e. the sum wrapped | |
198 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) | |
199 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 | |
200 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 | |
201 | |
202 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 | |
203 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 | |
204 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) | |
205 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) | |
206 | |
207 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; | |
208 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
209 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; | |
210 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
211 | |
212 ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c; | |
213 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
214 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] | |
215 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
216 | |
217 LDO -2(num),num ; num = num - 2; | |
218 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); | |
219 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
220 STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
221 | |
222 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] | |
223 ADD,DC ht_1,%r0,%ret0 ; c = ht[1] + carry (carry into the next word) | |
224 LDO 16(a_ptr),a_ptr ; a_ptr += 2 | |
225 | |
226 STD lt_1,8(r_ptr) ; rp[1] = lt[1] | |
227 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do | |
228 LDO 16(r_ptr),r_ptr ; r_ptr += 2 | |
229 | |
230 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one | |
231 | |
232 ; | |
233 ; Top of loop aligned on 64-byte boundary | |
234 ; | |
235 bn_mul_add_words_single_top | |
236 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R) | |
237 LDD 0(r_ptr),rp_val ; rp[0] | |
238 LDO 8(a_ptr),a_ptr ; a_ptr++ | |
239 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | |
240 FSTD fm1,-16(%sp) ; -16(sp) = m1 | |
241 XMPYU flt_0,fw_h,fm ; m = lt*fw_h | |
242 FSTD fm,-8(%sp) ; -8(sp) = m | |
243 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | |
244 FSTD ht_temp,-24(%sp) ; -24(sp) = ht | |
245 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | |
246 FSTD lt_temp,-32(%sp) ; -32(sp) = lt | |
247 | |
248 LDD -8(%sp),m_0 | |
249 LDD -16(%sp),m1_0 ; m1 = temp1 | |
250 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | |
251 LDD -24(%sp),ht_0 | |
252 LDD -32(%sp),lt_0 | |
253 | |
254 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (tmp_0 < m1), i.e. the sum wrapped | |
255 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | |
256 | |
257 EXTRD,U tmp_0,31,32,m_0 ; m>>32 | |
258 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | |
259 | |
260 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | |
261 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; | |
262 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
263 ADD %ret0,tmp_0,lt_0 ; lt = lt + c; | |
264 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
265 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] | |
266 ADD,DC ht_0,%r0,%ret0 ; c = ht + carry (returned carry) | |
267 STD lt_0,0(r_ptr) ; rp[0] = lt | |
268 | |
269 bn_mul_add_words_exit | |
270 .EXIT | |
271 LDD -80(%sp),%r9 ; restore r9 | |
272 LDD -88(%sp),%r8 ; restore r8 | |
273 LDD -96(%sp),%r7 ; restore r7 | |
274 LDD -104(%sp),%r6 ; restore r6 | |
275 LDD -112(%sp),%r5 ; restore r5 | |
276 LDD -120(%sp),%r4 ; restore r4 | |
277 BVE (%rp) | |
278 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame | |
279 .PROCEND ;in=23,24,25,26,29;out=28; | |
280 | |
281 ;---------------------------------------------------------------------------- | |
282 ; | |
283 ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
284 ; | |
285 ; arg0 = rp | |
286 ; arg1 = ap | |
287 ; arg2 = num | |
288 ; arg3 = w | |
289 | |
; bn_mul_words: for each of the num 64-bit words computes a_ptr[i]*w + carry,
; stores the low 64 bits to r_ptr[i] and keeps the high 64 bits as the carry
; into the next word. The final carry is returned in %ret0 (0 when num <= 0).
;
; Same structure as bn_mul_add_words, without the addition of the old
; r_ptr[i] value: four 32x32 XMPYU products per word, passed through stack
; slots, two words per loop iteration plus a single-word tail.
290 bn_mul_words | |
291 .proc | |
292 .callinfo frame=128 | |
293 .entry | |
294 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
295 .align 64 | |
296 | |
297 STD %r3,0(%sp) ; save r3 | |
298 STD %r4,8(%sp) ; save r4 | |
299 STD %r5,16(%sp) ; save r5 | |
300 STD %r6,24(%sp) ; save r6 | |
301 | |
302 STD %r7,32(%sp) ; save r7 | |
303 COPY %r0,%ret0 ; return 0 by default | |
304 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | |
305 STD w,56(%sp) ; w on stack so it can be reloaded into an FP register | |
306 | |
307 CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit | |
308 LDO 128(%sp),%sp ; bump stack (saves are now at negative offsets) | |
309 | |
310 ; | |
311 ; See if only 1 word to do, thus just do cleanup | |
312 ; | |
313 CMPIB,= 1,num,bn_mul_words_single_top | |
314 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l); -72 = 56-128 | |
315 | |
316 ; | |
317 ; This loop is unrolled 2 times (64-byte aligned as well) | |
318 ; | |
319 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | |
320 ; two 32-bit multiplies can be issued per cycle. | |
321 ; | |
322 bn_mul_words_unroll2 | |
323 | |
324 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value a[0] (fr8) ht(L)/lt(R) | |
325 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value a[1] (fr9) ht(L)/lt(R) | |
326 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | |
327 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | |
328 | |
329 FSTD fm1,-16(%sp) ; -16(sp) = m1[0] | |
330 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] | |
331 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h | |
332 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h | |
333 | |
334 FSTD fm,-8(%sp) ; -8(sp) = m[0] | |
335 FSTD fm_1,-40(%sp) ; -40(sp) = m[1] | |
336 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | |
337 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h | |
338 | |
339 FSTD ht_temp,-24(%sp) ; -24(sp) = ht[0] | |
340 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht[1] | |
341 XMPYU flt_0,fw_l,lt_temp ; lt_temp = flt_0*fw_l | |
342 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l | |
343 | |
344 FSTD lt_temp,-32(%sp) ; -32(sp) = lt[0] | |
345 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt[1] | |
346 LDD -8(%sp),m_0 | |
347 LDD -40(%sp),m_1 | |
348 | |
349 LDD -16(%sp),m1_0 | |
350 LDD -48(%sp),m1_1 | |
351 LDD -24(%sp),ht_0 | |
352 LDD -56(%sp),ht_1 | |
353 | |
354 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; | |
355 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; | |
356 LDD -32(%sp),lt_0 | |
357 LDD -64(%sp),lt_1 | |
358 | |
359 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (tmp_0 < m1[0]), i.e. the sum wrapped | |
360 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) | |
361 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (tmp_1 < m1[1]), i.e. the sum wrapped | |
362 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) | |
363 | |
364 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 | |
365 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 | |
366 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 | |
367 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 | |
368 | |
369 ADD,L ht_0,m_0,ht_0 ; ht[0] += (m[0]>>32) | |
370 ADD,L ht_1,m_1,ht_1 ; ht[1] += (m[1]>>32) | |
371 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; | |
372 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
373 | |
374 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; | |
375 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
376 ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c (ret0); | |
377 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
378 | |
379 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + c (ht_0) | |
380 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
381 STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
382 STD lt_1,8(r_ptr) ; rp[1] = lt[1] | |
383 | |
384 COPY ht_1,%ret0 ; carry = ht[1] | |
385 LDO -2(num),num ; num = num - 2; | |
386 LDO 16(a_ptr),a_ptr ; ap += 2 | |
387 CMPIB,<= 2,num,bn_mul_words_unroll2 ; go again if at least 2 words remain | |
388 LDO 16(r_ptr),r_ptr ; rp += 2 | |
389 | |
390 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? | |
391 | |
392 ; | |
393 ; Top of loop aligned on 64-byte boundary | |
394 ; | |
395 bn_mul_words_single_top | |
396 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R) | |
397 | |
398 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | |
399 FSTD fm1,-16(%sp) ; -16(sp) = m1 | |
400 XMPYU flt_0,fw_h,fm ; m = lt*fw_h | |
401 FSTD fm,-8(%sp) ; -8(sp) = m | |
402 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | |
403 FSTD ht_temp,-24(%sp) ; -24(sp) = ht | |
404 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | |
405 FSTD lt_temp,-32(%sp) ; -32(sp) = lt | |
406 | |
407 LDD -8(%sp),m_0 | |
408 LDD -16(%sp),m1_0 | |
409 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | |
410 LDD -24(%sp),ht_0 | |
411 LDD -32(%sp),lt_0 | |
412 | |
413 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (tmp_0 < m1), i.e. the sum wrapped | |
414 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | |
415 | |
416 EXTRD,U tmp_0,31,32,m_0 ; m>>32 | |
417 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | |
418 | |
419 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | |
420 ADD lt_0,m1_0,lt_0 ; lt= lt+m1; | |
421 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
422 | |
423 ADD %ret0,lt_0,lt_0 ; lt = lt + c; | |
424 ADD,DC ht_0,%r0,ht_0 ; ht += carry | |
425 | |
426 COPY ht_0,%ret0 ; copy carry (return value) | |
427 STD lt_0,0(r_ptr) ; rp[0] = lt | |
428 | |
429 bn_mul_words_exit | |
430 .EXIT | |
431 LDD -96(%sp),%r7 ; restore r7 | |
432 LDD -104(%sp),%r6 ; restore r6 | |
433 LDD -112(%sp),%r5 ; restore r5 | |
434 LDD -120(%sp),%r4 ; restore r4 | |
435 BVE (%rp) | |
436 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame | |
437 .PROCEND ;in=23,24,25,26,29;out=28; | |
438 | |
439 ;---------------------------------------------------------------------------- | |
440 ; | |
441 ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) | |
442 ; | |
443 ; arg0 = rp | |
444 ; arg1 = ap | |
445 ; arg2 = num | |
446 ; | |
447 | |
; bn_sqr_words: for each of the num input words a[i], writes the 128-bit
; square a[i]*a[i] as two output words: rp[2i] = low 64 bits and
; rp[2i+1] = high 64 bits. Nothing is returned.
;
; With a = ah*2^32 + al the square is ah^2*2^64 + 2*ah*al*2^32 + al^2.
; m = ah*al is computed once; its doubled, shifted contribution is split into
; (m << 33) for the low word and (m >> 31) for the high word, the latter
; extracted through high_mask.
448 bn_sqr_words | |
449 .proc | |
450 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
451 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
452 .entry | |
453 .align 64 | |
454 | |
455 STD %r3,0(%sp) ; save r3 | |
456 STD %r4,8(%sp) ; save r4 | |
457 NOP | |
458 STD %r5,16(%sp) ; save r5 | |
459 | |
460 CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit | |
461 LDO 128(%sp),%sp ; bump stack (saves are now at negative offsets) | |
462 | |
463 ; | |
464 ; If only 1, then go straight to cleanup | |
465 ; | |
466 CMPIB,= 1,num,bn_sqr_words_single_top | |
467 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | |
468 | |
469 ; | |
470 ; This loop is unrolled 2 times (64-byte aligned as well) | |
471 ; | |
472 | |
473 bn_sqr_words_unroll2 | |
474 FLDD 0(a_ptr),t_float_0 ; a[0] | |
475 FLDD 8(a_ptr),t_float_1 ; a[1] | |
476 XMPYU fht_0,flt_0,fm ; m[0] = ah[0]*al[0] | |
477 XMPYU fht_1,flt_1,fm_1 ; m[1] = ah[1]*al[1] | |
478 | |
479 FSTD fm,-24(%sp) ; store m[0] | |
480 FSTD fm_1,-56(%sp) ; store m[1] | |
481 XMPYU flt_0,flt_0,lt_temp ; lt[0] = al[0]*al[0] | |
482 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] = al[1]*al[1] | |
483 | |
484 FSTD lt_temp,-16(%sp) ; store lt[0] | |
485 FSTD lt_temp_1,-48(%sp) ; store lt[1] | |
486 XMPYU fht_0,fht_0,ht_temp ; ht[0] = ah[0]*ah[0] | |
487 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] = ah[1]*ah[1] | |
488 | |
489 FSTD ht_temp,-8(%sp) ; store ht[0] | |
490 FSTD ht_temp_1,-40(%sp) ; store ht[1] | |
491 LDD -24(%sp),m_0 | |
492 LDD -56(%sp),m_1 | |
493 | |
494 AND m_0,high_mask,tmp_0 ; m[0] & Mask | |
495 AND m_1,high_mask,tmp_1 ; m[1] & Mask | |
496 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 | |
497 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 | |
498 | |
499 LDD -16(%sp),lt_0 | |
500 LDD -48(%sp),lt_1 | |
501 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 | |
502 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 | |
503 | |
504 LDD -8(%sp),ht_0 | |
505 LDD -40(%sp),ht_1 | |
506 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 | |
507 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 | |
508 | |
509 ADD lt_0,m_0,lt_0 ; lt[0] = lt[0]+m[0] | |
510 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry | |
511 STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
512 STD ht_0,8(r_ptr) ; rp[1] = ht[0] | |
513 | |
514 ADD lt_1,m_1,lt_1 ; lt[1] = lt[1]+m[1] | |
515 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry | |
516 STD lt_1,16(r_ptr) ; rp[2] = lt[1] | |
517 STD ht_1,24(r_ptr) ; rp[3] = ht[1] | |
518 | |
519 LDO -2(num),num ; num = num - 2; | |
520 LDO 16(a_ptr),a_ptr ; ap += 2 | |
521 CMPIB,<= 2,num,bn_sqr_words_unroll2 ; go again if at least 2 words remain | |
522 LDO 32(r_ptr),r_ptr ; rp += 4 | |
523 | |
524 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? | |
525 | |
526 ; | |
527 ; Top of loop aligned on 64-byte boundary | |
528 ; | |
529 bn_sqr_words_single_top | |
530 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R) | |
531 | |
532 XMPYU fht_0,flt_0,fm ; m = ah*al | |
533 FSTD fm,-24(%sp) ; store m | |
534 | |
535 XMPYU flt_0,flt_0,lt_temp ; lt = al*al | |
536 FSTD lt_temp,-16(%sp) ; store lt | |
537 | |
538 XMPYU fht_0,fht_0,ht_temp ; ht = ah*ah | |
539 FSTD ht_temp,-8(%sp) ; store ht | |
540 | |
541 LDD -24(%sp),m_0 ; load m | |
542 AND m_0,high_mask,tmp_0 ; m & Mask | |
543 DEPD,Z m_0,30,31,m_0 ; m << 32+1 | |
544 LDD -16(%sp),lt_0 ; lt | |
545 | |
546 LDD -8(%sp),ht_0 ; ht | |
547 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 | |
548 ADD m_0,lt_0,lt_0 ; lt = lt+m | |
549 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (ADD,L leaves the carry from lt+m intact) | |
550 ADD,DC ht_0,%r0,ht_0 ; ht += carry from lt+m | |
551 | |
552 STD lt_0,0(r_ptr) ; rp[0] = lt | |
553 STD ht_0,8(r_ptr) ; rp[1] = ht | |
554 | |
555 bn_sqr_words_exit | |
556 .EXIT | |
557 LDD -112(%sp),%r5 ; restore r5 | |
558 LDD -120(%sp),%r4 ; restore r4 | |
559 BVE (%rp) | |
560 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame | |
561 .PROCEND ;in=23,24,25,26,29;out=28; | |
562 | |
563 | |
564 ;---------------------------------------------------------------------------- | |
565 ; | |
566 ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
567 ; | |
568 ; arg0 = rp | |
569 ; arg1 = ap | |
570 ; arg2 = bp | |
571 ; arg3 = n | |
572 | |
; Scratch registers for bn_add_words. All three are caller-save, so the
; routine needs no stack frame. Note that t shares %r22 with high_mask.
573 t .reg %r22 | |
574 b .reg %r21 | |
575 l .reg %r20 | |
576 | |
; bn_add_words: r_ptr[i] = a_ptr[i] + b_ptr[i] + carry for i in 0..n-1,
; processing two words per loop iteration plus a single-word tail.
; Returns the final carry in %ret0 (0 when n <= 0).
;
; The carry is kept in %ret0 between words. Each word takes two additions
; (a + c, then + b), and the carry out of each one is accumulated into %ret0.
577 bn_add_words | |
578 .proc | |
579 .entry | |
580 .callinfo | |
581 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
582 .align 64 | |
583 | |
584 CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit | |
585 COPY %r0,%ret0 ; return 0 by default | |
586 | |
587 ; | |
588 ; If 2 or more numbers do the loop | |
589 ; | |
590 CMPIB,= 1,n,bn_add_words_single_top | |
591 NOP | |
592 | |
593 ; | |
594 ; This loop is unrolled 2 times (64-byte aligned as well) | |
595 ; | |
596 bn_add_words_unroll2 | |
597 LDD 0(a_ptr),t | |
598 LDD 0(b_ptr),b | |
599 ADD t,%ret0,t ; t = a[0]+c; | |
600 ADD,DC %r0,%r0,%ret0 ; set c to carry | |
601 ADD t,b,l ; l = t + b[0] | |
602 ADD,DC %ret0,%r0,%ret0 ; c+= carry | |
603 STD l,0(r_ptr) | |
604 | |
605 LDD 8(a_ptr),t | |
606 LDD 8(b_ptr),b | |
607 ADD t,%ret0,t ; t = a[1]+c; | |
608 ADD,DC %r0,%r0,%ret0 ; set c to carry | |
609 ADD t,b,l ; l = t + b[1] | |
610 ADD,DC %ret0,%r0,%ret0 ; c+= carry | |
611 STD l,8(r_ptr) | |
612 | |
613 LDO -2(n),n | |
614 LDO 16(a_ptr),a_ptr | |
615 LDO 16(b_ptr),b_ptr | |
616 | |
617 CMPIB,<= 2,n,bn_add_words_unroll2 ; go again if at least 2 words remain | |
618 LDO 16(r_ptr),r_ptr | |
619 | |
620 CMPIB,=,N 0,n,bn_add_words_exit ; are we done? | |
621 | |
622 bn_add_words_single_top | |
623 LDD 0(a_ptr),t | |
624 LDD 0(b_ptr),b | |
625 | |
626 ADD t,%ret0,t ; t = t+c; | |
627 ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??) | |
628 ADD t,b,l ; l = t + b[0] | |
629 ADD,DC %ret0,%r0,%ret0 ; c+= carry | |
630 STD l,0(r_ptr) | |
631 | |
632 bn_add_words_exit | |
633 .EXIT | |
634 BVE (%rp) | |
635 NOP | |
636 .PROCEND ;in=23,24,25,26,29;out=28; | |
637 | |
638 ;---------------------------------------------------------------------------- | |
639 ; | |
640 ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
641 ; | |
642 ; arg0 = rp | |
643 ; arg1 = ap | |
644 ; arg2 = bp | |
645 ; arg3 = n | |
646 | |
; Scratch registers for bn_sub_words. All four are caller-save, so the
; routine needs no stack frame.
647 t1 .reg %r22 | |
648 t2 .reg %r21 | |
649 sub_tmp1 .reg %r20 | |
650 sub_tmp2 .reg %r19 | |
651 | |
652 | |
; bn_sub_words: r_ptr[i] = a_ptr[i] - b_ptr[i] - borrow for i in 0..n-1,
; processing two words per loop iteration plus a single-word tail.
; Returns the final borrow in %ret0 (0 when n <= 0).
;
; The borrow is kept in %ret0. After each word it is left unchanged when
; a[i] == b[i], and otherwise set to 1 if a[i] < b[i] (unsigned) or 0 if
; a[i] > b[i].
653 bn_sub_words | |
654 .proc | |
655 .callinfo | |
656 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
657 .entry | |
658 .align 64 | |
659 | |
660 CMPIB,>= 0,n,bn_sub_words_exit ; if (n <= 0) then exit | |
661 COPY %r0,%ret0 ; return 0 by default | |
662 | |
663 ; | |
664 ; If 2 or more numbers do the loop | |
665 ; | |
666 CMPIB,= 1,n,bn_sub_words_single_top | |
667 NOP | |
668 | |
669 ; | |
670 ; This loop is unrolled 2 times (64-byte aligned as well) | |
671 ; | |
672 bn_sub_words_unroll2 | |
673 LDD 0(a_ptr),t1 | |
674 LDD 0(b_ptr),t2 | |
675 SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | |
676 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; | |
677 | |
678 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2 | |
679 LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2) | |
680 | |
681 CMPCLR,*= t1,t2,%r0 ; if t1 == t2 keep the old borrow | |
682 COPY sub_tmp2,%ret0 ; else c = (t1 < t2) | |
683 STD sub_tmp1,0(r_ptr) | |
684 | |
685 LDD 8(a_ptr),t1 | |
686 LDD 8(b_ptr),t2 | |
687 SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | |
688 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; | |
689 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2 | |
690 LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2) | |
691 | |
692 CMPCLR,*= t1,t2,%r0 ; if t1 == t2 keep the old borrow | |
693 COPY sub_tmp2,%ret0 ; else c = (t1 < t2) | |
694 STD sub_tmp1,8(r_ptr) | |
695 | |
696 LDO -2(n),n | |
697 LDO 16(a_ptr),a_ptr | |
698 LDO 16(b_ptr),b_ptr | |
699 | |
700 CMPIB,<= 2,n,bn_sub_words_unroll2 ; go again if at least 2 words remain | |
701 LDO 16(r_ptr),r_ptr | |
702 | |
703 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? | |
704 | |
705 bn_sub_words_single_top | |
706 LDD 0(a_ptr),t1 | |
707 LDD 0(b_ptr),t2 | |
708 SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | |
709 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; | |
710 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next if t1 > t2 | |
711 LDO 1(%r0),sub_tmp2 ; sub_tmp2 = 1 (t1 <= t2) | |
712 | |
713 CMPCLR,*= t1,t2,%r0 ; if t1 == t2 keep the old borrow | |
714 COPY sub_tmp2,%ret0 ; else c = (t1 < t2) | |
715 | |
716 STD sub_tmp1,0(r_ptr) | |
717 | |
718 bn_sub_words_exit | |
719 .EXIT | |
720 BVE (%rp) | |
721 NOP | |
722 .PROCEND ;in=23,24,25,26,29;out=28; | |
723 | |
724 ;------------------------------------------------------------------------------ | |
725 ; | |
726 ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) | |
727 ; | |
728 ; arg0 = h | |
729 ; arg1 = l | |
730 ; arg2 = d | |
731 ; | |
732 ; This is mainly just modified assembly from the compiler, thus the | |
733 ; lack of variable names. | |
734 ; | |
735 ;------------------------------------------------------------------------------ | |
; bn_div_words(h, l, d): divides the two-word value h:l (h = high 64 bits,
; l = low 64 bits) by d and returns the 64-bit result in %ret0.
;
;   - d == 0 returns -1 (all ones).
;   - i = BN_num_bits_word(d); if i != 64 and h > (1 << i), a message is
;     printed with fprintf and abort() is called.
;   - Otherwise d, h and l are shifted left by (64 - i) and the result is
;     produced as two 32-bit digits (loop counter %r9 starts at 2), each
;     estimated with the $$div2U millicode routine and then corrected.
;
; Stack: "STD,MA %r3,352(%r30)" stores %r3 at the old stack pointer and
; advances it by 352 bytes, so every later offset is relative to the bumped
; pointer; the return pointer saved at -16 before the bump is therefore
; reloaded from -368.
; NOTE(review): .callinfo declares FRAME=272 while the code allocates 352
; bytes - confirm the unwind descriptor is what is intended.
736 bn_div_words | |
737 .proc | |
738 .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | |
739 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
740 .IMPORT BN_num_bits_word,CODE,NO_RELOCATION | |
741 .IMPORT __iob,DATA | |
742 .IMPORT fprintf,CODE,NO_RELOCATION | |
743 .IMPORT abort,CODE,NO_RELOCATION | |
744 .IMPORT $$div2U,MILLICODE | |
745 .entry | |
746 STD %r2,-16(%r30) ; save return pointer | |
747 STD,MA %r3,352(%r30) ; save r3 and allocate 352 bytes | |
748 STD %r4,-344(%r30) | |
749 STD %r5,-336(%r30) | |
750 STD %r6,-328(%r30) | |
751 STD %r7,-320(%r30) | |
752 STD %r8,-312(%r30) | |
753 STD %r9,-304(%r30) | |
754 STD %r10,-296(%r30) | |
755 | |
756 STD %r27,-288(%r30) ; save gp | |
757 | |
758 COPY %r24,%r3 ; save d | |
759 COPY %r26,%r4 ; save h (high 64-bits) | |
760 LDO -1(%r0),%ret0 ; return -1 by default | |
761 | |
762 CMPB,*= %r0,%arg2,$D3 ; if (d == 0) | |
763 COPY %r25,%r5 ; save l (low 64-bits) | |
764 | |
765 LDO -48(%r30),%r29 ; create ap | |
766 .CALL ;in=26,29;out=28; | |
767 B,L BN_num_bits_word,%r2 | |
768 COPY %r3,%r26 ; arg0 = d | |
769 LDD -288(%r30),%r27 ; restore gp | |
770 LDI 64,%r21 | |
771 | |
772 CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward) | |
773 COPY %ret0,%r24 ; i | |
774 MTSARCM %r24 | |
775 DEPDI,Z -1,%sar,1,%r29 ; r29 = 1 << i | |
776 CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward) | |
777 | |
778 $00000012 | |
779 SUBI 64,%r24,%r31 ; i = 64 - i; | |
780 CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d) | |
781 SUB %r4,%r3,%r4 ; h -= d | |
782 CMPB,= %r31,%r0,$0000001A ; if (i) | |
783 COPY %r0,%r10 ; ret = 0 | |
784 MTSARCM %r31 ; i to shift | |
785 DEPD,Z %r3,%sar,64,%r3 ; d <<= i; | |
786 SUBI 64,%r31,%r19 ; 64 - i; redundant | |
787 MTSAR %r19 ; (64 -i) to shift | |
788 SHRPD %r4,%r5,%sar,%r4 ; h = (h<<i) | (l >> (64-i)) | |
789 MTSARCM %r31 ; i to shift | |
790 DEPD,Z %r5,%sar,64,%r5 ; l <<= i; | |
791 | |
792 $0000001A | |
793 DEPDI,Z -1,31,32,%r19 ; r19 = 0xffffffff00000000 | |
794 EXTRD,U %r3,31,32,%r6 ; dh = d >> 32 (upper 32 bits of d) | |
795 EXTRD,U %r3,63,32,%r8 ; dl = d & 0xffffffff | |
796 LDO 2(%r0),%r9 ; count = 2 | |
797 STD %r3,-280(%r30) ; "d" to stack | |
798 | |
799 $0000001C | |
800 DEPDI,Z -1,63,32,%r29 ; q = 0xffffffff | |
801 EXTRD,U %r4,31,32,%r31 ; h >> 32 | |
802 CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div | |
803 COPY %r4,%r26 | |
804 EXTRD,U %r4,31,32,%r25 | |
805 COPY %r6,%r24 | |
806 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) | |
807 B,L $$div2U,%r2 | |
808 EXTRD,U %r6,31,32,%r23 | |
809 DEPD %r28,31,32,%r29 ; merge low 32 bits of r28 into the upper half of r29 | |
810 $D2 | |
811 STD %r29,-272(%r30) ; q | |
812 AND %r5,%r19,%r24 ; l & 0xffffffff00000000; | |
813 EXTRD,U %r24,31,32,%r24 ; ... >> 32 (upper 32 bits of l) | |
814 FLDD -272(%r30),%fr7 ; q | |
815 FLDD -280(%r30),%fr8 ; d | |
816 XMPYU %fr8L,%fr7L,%fr10 | |
817 FSTD %fr10,-256(%r30) | |
818 XMPYU %fr8L,%fr7R,%fr22 | |
819 FSTD %fr22,-264(%r30) | |
820 XMPYU %fr8R,%fr7L,%fr11 | |
821 XMPYU %fr8R,%fr7R,%fr23 | |
822 FSTD %fr11,-232(%r30) | |
823 FSTD %fr23,-240(%r30) | |
824 LDD -256(%r30),%r28 | |
825 DEPD,Z %r28,31,32,%r2 | |
826 LDD -264(%r30),%r20 | |
827 ADD,L %r20,%r2,%r31 ; th (r31) built from the dh products | |
828 LDD -232(%r30),%r22 | |
829 DEPD,Z %r22,31,32,%r22 | |
830 LDD -240(%r30),%r21 | |
831 B $00000024 ; enter loop | |
832 ADD,L %r21,%r22,%r23 ; tl (r23) built from the dl products | |
833 | |
834 $0000002A | |
835 LDO -1(%r29),%r29 ; q--; | |
836 SUB %r23,%r8,%r23 ; tl -= dl; | |
837 $00000024 | |
838 SUB %r4,%r31,%r25 ; t = h - th | |
839 AND %r25,%r19,%r26 ; t & 0xffffffff00000000 | |
840 CMPB,*<>,N %r0,%r26,$00000046 ; (forward) | |
841 DEPD,Z %r25,31,32,%r20 ; t << 32 | |
842 OR %r20,%r24,%r21 ; (t << 32) | (upper 32 bits of l) | |
843 CMPB,*<<,N %r21,%r23,$0000002A ;(backward) | |
844 SUB %r31,%r6,%r31 ; th -= dh (delay slot, taken path only) | |
845 ;-------------Break path--------------------- | |
846 | |
847 $00000046 | |
848 DEPD,Z %r23,31,32,%r25 ;tl | |
849 EXTRD,U %r23,31,32,%r26 ;t | |
850 AND %r25,%r19,%r24 ;tl = (tl<<32)&0xffffffff00000000L | |
851 ADD,L %r31,%r26,%r31 ;th += t; | |
852 CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl) | |
853 LDO 1(%r31),%r31 ; th++; | |
854 CMPB,*<<=,N %r31,%r4,$00000036 ;if (h < th) (forward) | |
855 LDO -1(%r29),%r29 ;q--; | |
856 ADD,L %r4,%r3,%r4 ;h += d; | |
857 $00000036 | |
858 ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward) | |
859 SUB %r5,%r24,%r28 ; l -= tl; | |
860 SUB %r4,%r31,%r24 ; h -= th; | |
861 SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32)); | |
862 DEPD,Z %r29,31,32,%r10 ; ret = q<<32 | |
863 b $0000001C | |
864 DEPD,Z %r28,31,32,%r5 ; l = l << 32 | |
865 | |
866 $D1 | |
867 OR %r10,%r29,%r28 ; ret |= q | |
868 $D3 | |
869 LDD -368(%r30),%r2 ; reload return pointer | |
870 $D0 | |
871 LDD -296(%r30),%r10 | |
872 LDD -304(%r30),%r9 | |
873 LDD -312(%r30),%r8 | |
874 LDD -320(%r30),%r7 | |
875 LDD -328(%r30),%r6 | |
876 LDD -336(%r30),%r5 | |
877 LDD -344(%r30),%r4 | |
878 BVE (%r2) | |
879 .EXIT | |
880 LDD,MB -352(%r30),%r3 ; restore r3 and pop the 352-byte frame | |
881 | |
; Overflow path: print a message with fprintf and call abort().
; The format string is addressed as C$4 relative to bn_div_words; C$4 is not
; defined in the visible part of the file.
; NOTE(review): %r26 = __iob + 64 is presumably stderr - confirm against the
; platform's FILE layout.
882 bn_div_err_case | |
883 MFIA %r6 | |
884 ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1 | |
885 LDO R'bn_div_words-bn_div_err_case(%r1),%r6 | |
886 ADDIL LT'__iob,%r27,%r1 | |
887 LDD RT'__iob(%r1),%r26 | |
888 ADDIL L'C$4-bn_div_words,%r6,%r1 | |
889 LDO R'C$4-bn_div_words(%r1),%r25 | |
890 LDO 64(%r26),%r26 | |
891 .CALL ;in=24,25,26,29;out=28; | |
892 B,L fprintf,%r2 | |
893 LDO -48(%r30),%r29 | |
894 LDD -288(%r30),%r27 ; restore gp | |
895 .CALL ;in=29; | |
896 B,L abort,%r2 | |
897 LDO -48(%r30),%r29 | |
898 LDD -288(%r30),%r27 ; restore gp | |
899 B $D0 | |
900 LDD -368(%r30),%r2 ; reload return pointer | |
901 .PROCEND ;in=24,25,26,29;out=28; | |
902 | |
903 ;---------------------------------------------------------------------------- | |
904 ; | |
905 ; Registers to hold 64-bit values to manipulate. The "L" part | |
906 ; of the register corresponds to the upper 32-bits, while the "R" | |
907 ; part corresponds to the lower 32-bits | |
908 ; | |
909 ; Note, that when using b6 and b7, the code must save these before | |
910 ; using them because they are callee save registers | |
911 ; | |
912 ; | |
913 ; Floating point registers to use to save values that | |
914 ; are manipulated. These don't collide with ftemp1-6 and | |
915 ; are all caller save registers | |
916 ; | |
; Input words a[0]..a[7] (whole register, then L = upper and R = lower
; 32-bit halves), followed by b0/b1.
917 a0 .reg %fr22 | |
918 a0L .reg %fr22L | |
919 a0R .reg %fr22R | |
920 | |
921 a1 .reg %fr23 | |
922 a1L .reg %fr23L | |
923 a1R .reg %fr23R | |
924 | |
925 a2 .reg %fr24 | |
926 a2L .reg %fr24L | |
927 a2R .reg %fr24R | |
928 | |
929 a3 .reg %fr25 | |
930 a3L .reg %fr25L | |
931 a3R .reg %fr25R | |
932 | |
933 a4 .reg %fr26 | |
934 a4L .reg %fr26L | |
935 a4R .reg %fr26R | |
936 | |
937 a5 .reg %fr27 | |
938 a5L .reg %fr27L | |
939 a5R .reg %fr27R | |
940 | |
941 a6 .reg %fr28 | |
942 a6L .reg %fr28L | |
943 a6R .reg %fr28R | |
944 | |
945 a7 .reg %fr29 | |
946 a7L .reg %fr29L | |
947 a7R .reg %fr29R | |
948 | |
949 b0 .reg %fr30 | |
950 b0L .reg %fr30L | |
951 b0R .reg %fr30R | |
952 | |
953 b1 .reg %fr31 | |
954 b1L .reg %fr31L | |
955 b1R .reg %fr31R | |
956 | |
957 ; | |
958 ; Temporary floating point variables, these are all caller save | |
959 ; registers. They receive the XMPYU partial products in the | |
;  SQR_ADD_C / SQR_ADD_C2 macros.
960 ; | |
961 ftemp1 .reg %fr4 | |
962 ftemp2 .reg %fr5 | |
963 ftemp3 .reg %fr6 | |
964 ftemp4 .reg %fr7 | |
965 | |
966 ; | |
967 ; The B set of registers when used. | |
968 ; | |
969 | |
970 b2 .reg %fr8 | |
971 b2L .reg %fr8L | |
972 b2R .reg %fr8R | |
973 | |
974 b3 .reg %fr9 | |
975 b3L .reg %fr9L | |
976 b3R .reg %fr9R | |
977 | |
978 b4 .reg %fr10 | |
979 b4L .reg %fr10L | |
980 b4R .reg %fr10R | |
981 | |
982 b5 .reg %fr11 | |
983 b5L .reg %fr11L | |
984 b5R .reg %fr11R | |
985 | |
; b6 and b7 are %fr12/%fr13, which are callee-save FP registers.
986 b6 .reg %fr12 | |
987 b6L .reg %fr12L | |
988 b6R .reg %fr12R | |
989 | |
990 b7 .reg %fr13 | |
991 b7L .reg %fr13L | |
992 b7R .reg %fr13R | |
993 | |
; Integer registers for the comba accumulators (c1, c2, c3) and scratch.
; c1 and temp1-temp3 are caller-save registers.
994 c1 .reg %r21 ; only reg | |
995 temp1 .reg %r20 ; only reg | |
996 temp2 .reg %r19 ; only reg | |
997 temp3 .reg %r31 ; only reg | |
998 | |
; high_one holds 1 << 32 once loaded by the routine. ht, lt, m and c3 are
; callee-save registers (%r3-%r6) that the routine must spill first.
999 m1 .reg %r28 | |
1000 c2 .reg %r23 | |
1001 high_one .reg %r1 | |
1002 ht .reg %r6 | |
1003 lt .reg %r5 | |
1004 m .reg %r4 | |
1005 c3 .reg %r3 | |
1006 | |
; SQR_ADD_C A0L,A0R,C1,C2,C3
; Adds the 128-bit square of the 64-bit value A0L:A0R (upper:lower 32-bit
; halves of one FP register) into the three-word accumulator C3:C2:C1:
; C1 += low 64 bits, C2 += high 64 bits plus carry, C3 += carry out of C2.
;
; Requires high_mask to be loaded beforehand. Clobbers ftemp1-ftemp3,
; m, lt, ht, temp1-temp3 and the stack slots at -24, -16 and -8(%sp).
1007 SQR_ADD_C .macro A0L,A0R,C1,C2,C3 | |
1008 XMPYU A0L,A0R,ftemp1 ; m = ah*al | |
1009 FSTD ftemp1,-24(%sp) ; store m | |
1010 | |
1011 XMPYU A0R,A0R,ftemp2 ; lt = al*al | |
1012 FSTD ftemp2,-16(%sp) ; store lt | |
1013 | |
1014 XMPYU A0L,A0L,ftemp3 ; ht = ah*ah | |
1015 FSTD ftemp3,-8(%sp) ; store ht | |
1016 | |
1017 LDD -24(%sp),m ; load m | |
1018 AND m,high_mask,temp2 ; m & Mask | |
1019 DEPD,Z m,30,31,temp3 ; m << 32+1 | |
1020 LDD -16(%sp),lt ; lt | |
1021 | |
1022 LDD -8(%sp),ht ; ht | |
1023 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 | |
1024 ADD temp3,lt,lt ; lt = lt+m | |
1025 ADD,L ht,temp1,ht ; ht += temp1 (ADD,L leaves the carry from lt+m intact) | |
1026 ADD,DC ht,%r0,ht ; ht += carry from lt+m | |
1027 | |
1028 ADD C1,lt,C1 ; c1=c1+lt | |
1029 ADD,DC ht,%r0,ht ; ht += carry | |
1030 | |
1031 ADD C2,ht,C2 ; c2=c2+ht | |
1032 ADD,DC C3,%r0,C3 ; c3 += carry | |
1033 .endm | |
1034 | |
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
; Adds twice the 128-bit product of the 64-bit values A0L:A0R and A1L:A1R
; into the three-word accumulator C3:C2:C1. The product is formed from four
; 32x32 XMPYU products, doubled (ht+ht, lt+lt) with carries propagated
; into C3, and then added to C1 and C2.
;
; Requires high_one (1 << 32) to be loaded beforehand. Clobbers
; ftemp1-ftemp4, m, m1, ht, lt, temp1, temp3 and the stack slots at
; -32, -24, -16 and -8(%sp).
1035 SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 | |
1036 XMPYU A0L,A1R,ftemp1 ; m1 = A0L*A1R | |
1037 FSTD ftemp1,-16(%sp) ; | |
1038 XMPYU A0R,A1L,ftemp2 ; m = A0R*A1L | |
1039 FSTD ftemp2,-8(%sp) ; | |
1040 XMPYU A0R,A1R,ftemp3 ; lt = A0R*A1R | |
1041 FSTD ftemp3,-32(%sp) | |
1042 XMPYU A0L,A1L,ftemp4 ; ht = A0L*A1L | |
1043 FSTD ftemp4,-24(%sp) ; | |
1044 | |
1045 LDD -8(%sp),m ; m | |
1046 LDD -16(%sp),m1 ; m1 | |
1047 ADD,L m,m1,m ; m+m1 | |
1048 | |
1049 DEPD,Z m,31,32,temp3 ; (m+m1<<32) | |
1050 LDD -24(%sp),ht ; ht | |
1051 | |
1052 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. m+m1 wrapped | |
1053 ADD,L ht,high_one,ht ; ht+=high_one | |
1054 | |
1055 EXTRD,U m,31,32,temp1 ; m >> 32 | |
1056 LDD -32(%sp),lt ; lt | |
1057 ADD,L ht,temp1,ht ; ht+= m>>32 | |
1058 ADD lt,temp3,lt ; lt = lt+(m<<32) | |
1059 ADD,DC ht,%r0,ht ; ht += carry | |
1060 | |
1061 ADD ht,ht,ht ; ht=ht+ht; | |
1062 ADD,DC C3,%r0,C3 ; add in carry (c3++) | |
1063 | |
1064 ADD lt,lt,lt ; lt=lt+lt; | |
1065 ADD,DC ht,%r0,ht ; add in carry (ht++) | |
1066 | |
1067 ADD C1,lt,C1 ; c1=c1+lt | |
1068 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) | |
1069 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise | |
1070 | |
1071 ADD C2,ht,C2 ; c2 = c2 + ht | |
1072 ADD,DC C3,%r0,C3 ; add in carry (c3++) | |
1073 .endm | |
1074 | |
1075 ; | |
1076 ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | |
1077 ; arg0 = r_ptr (result: words r[0]..r[15] are all written) | |
1078 ; arg1 = a_ptr (input: words a[0]..a[7] are read) | |
1079 ; | |
; Squares the 8-word number at a_ptr into the 16-word result at r_ptr,
; one result column at a time.  (c1,c2,c3) is a three-word accumulator whose
; roles rotate after every column: the lowest word is stored to r[k], then
; cleared and reused as the new top word.  Squared terms a[i]*a[i] go through
; SQR_ADD_C, the doubled terms 2*a[i]*a[j] through SQR_ADD_C2.
;
; Frame: 128 bytes.  r3-r6 are kept at the bottom of the frame
; (-128..-104 from the bumped %sp); the macros use -32..-8(%sp) as scratch.
; NOTE(review): .CALLINFO says ENTRY_GR=%r3 although r3-r6 are saved here;
; confirm that the unwind descriptor is what was intended.

1081 bn_sqr_comba8 | |
1082 .PROC | |
1083 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1084 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1085 .ENTRY | |
1086 .align 64 | |
1087 | |
; Callee-save registers are written into the new frame before %sp is bumped.
1088 STD %r3,0(%sp) ; save r3 (c3) | |
1089 STD %r4,8(%sp) ; save r4 (m) | |
1090 STD %r5,16(%sp) ; save r5 (lt) | |
1091 STD %r6,24(%sp) ; save r6 (ht) | |
1092 | |
1093 ; | |
1094 ; Zero out carries | |
1095 ; | |
1096 COPY %r0,c1 | |
1097 COPY %r0,c2 | |
1098 COPY %r0,c3 | |
1099 | |
1100 LDO 128(%sp),%sp ; allocate the 128-byte frame | |
1101 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (used by SQR_ADD_C) | |
1102 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by SQR_ADD_C2) | |
1103 | |
1104 ; | |
1105 ; Load up all of the values we are going to use | |
1106 ; | |
1107 FLDD 0(a_ptr),a0 | |
1108 FLDD 8(a_ptr),a1 | |
1109 FLDD 16(a_ptr),a2 | |
1110 FLDD 24(a_ptr),a3 | |
1111 FLDD 32(a_ptr),a4 | |
1112 FLDD 40(a_ptr),a5 | |
1113 FLDD 48(a_ptr),a6 | |
1114 FLDD 56(a_ptr),a7 | |
1115 | |
1116 SQR_ADD_C a0L,a0R,c1,c2,c3 | |
1117 STD c1,0(r_ptr) ; r[0] = c1; | |
1118 COPY %r0,c1 | |
1119 | |
1120 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | |
1121 STD c2,8(r_ptr) ; r[1] = c2; | |
1122 COPY %r0,c2 | |
1123 | |
1124 SQR_ADD_C a1L,a1R,c3,c1,c2 | |
1125 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | |
1126 STD c3,16(r_ptr) ; r[2] = c3; | |
1127 COPY %r0,c3 | |
1128 | |
1129 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | |
1130 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | |
1131 STD c1,24(r_ptr) ; r[3] = c1; | |
1132 COPY %r0,c1 | |
1133 | |
1134 SQR_ADD_C a2L,a2R,c2,c3,c1 | |
1135 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | |
1136 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 | |
1137 STD c2,32(r_ptr) ; r[4] = c2; | |
1138 COPY %r0,c2 | |
1139 | |
1140 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 | |
1141 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 | |
1142 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | |
1143 STD c3,40(r_ptr) ; r[5] = c3; | |
1144 COPY %r0,c3 | |
1145 | |
1146 SQR_ADD_C a3L,a3R,c1,c2,c3 | |
1147 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 | |
1148 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 | |
1149 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 | |
1150 STD c1,48(r_ptr) ; r[6] = c1; | |
1151 COPY %r0,c1 | |
1152 | |
1153 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 | |
1154 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 | |
1155 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 | |
1156 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 | |
1157 STD c2,56(r_ptr) ; r[7] = c2; | |
1158 COPY %r0,c2 | |
1159 | |
1160 SQR_ADD_C a4L,a4R,c3,c1,c2 | |
1161 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 | |
1162 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 | |
1163 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 | |
1164 STD c3,64(r_ptr) ; r[8] = c3; | |
1165 COPY %r0,c3 | |
1166 | |
1167 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 | |
1168 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 | |
1169 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 | |
1170 STD c1,72(r_ptr) ; r[9] = c1; | |
1171 COPY %r0,c1 | |
1172 | |
1173 SQR_ADD_C a5L,a5R,c2,c3,c1 | |
1174 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 | |
1175 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 | |
1176 STD c2,80(r_ptr) ; r[10] = c2; | |
1177 COPY %r0,c2 | |
1178 | |
1179 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 | |
1180 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 | |
1181 STD c3,88(r_ptr) ; r[11] = c3; | |
1182 COPY %r0,c3 | |
1183 | |
1184 SQR_ADD_C a6L,a6R,c1,c2,c3 | |
1185 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 | |
1186 STD c1,96(r_ptr) ; r[12] = c1; | |
1187 COPY %r0,c1 | |
1188 | |
1189 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 | |
1190 STD c2,104(r_ptr) ; r[13] = c2; | |
1191 COPY %r0,c2 | |
1192 | |
1193 SQR_ADD_C a7L,a7R,c3,c1,c2 | |
1194 STD c3, 112(r_ptr) ; r[14] = c3 | |
1195 STD c1, 120(r_ptr) ; r[15] = c1 | |
1196 | |
1197 .EXIT | |
1198 LDD -104(%sp),%r6 ; restore r6 | |
1199 LDD -112(%sp),%r5 ; restore r5 | |
1200 LDD -120(%sp),%r4 ; restore r4 | |
; The LDD,MB after the branch releases the 128-byte frame and reloads r3.
1201 BVE (%rp) | |
1202 LDD,MB -128(%sp),%r3 | |
1203 | |
1204 .PROCEND | |
1205 | |
1206 ;----------------------------------------------------------------------------- | |
1207 ; | |
1208 ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | |
1209 ; arg0 = r_ptr (result: words r[0]..r[7] are all written) | |
1210 ; arg1 = a_ptr (input: only words a[0]..a[3] are read) | |
1211 ; | |
; Squares the 4-word number at a_ptr into the 8-word result at r_ptr,
; one result column at a time.  (c1,c2,c3) is a three-word accumulator whose
; roles rotate after every column: the lowest word is stored to r[k], then
; cleared and reused as the new top word.  Squared terms a[i]*a[i] go through
; SQR_ADD_C, the doubled terms 2*a[i]*a[j] through SQR_ADD_C2.
;
; Only a0..a3 are loaded.  Earlier revisions also loaded a4..a7 from
; 32..56(a_ptr); those values were never used and the loads read 32 bytes
; past the end of the 4-word input.
;
; Frame: 128 bytes.  r3-r6 are kept at the bottom of the frame
; (-128..-104 from the bumped %sp); the macros use -32..-8(%sp) as scratch.
; NOTE(review): .callinfo says ENTRY_GR=%r3 although r3-r6 are saved here;
; confirm that the unwind descriptor is what was intended.

1213 bn_sqr_comba4 | |
1214 .proc | |
1215 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1216 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1217 .entry | |
1218 .align 64 | |
1219 STD %r3,0(%sp) ; save r3 (c3) | |
1220 STD %r4,8(%sp) ; save r4 (m) | |
1221 STD %r5,16(%sp) ; save r5 (lt) | |
1222 STD %r6,24(%sp) ; save r6 (ht) | |
1223 | |
1224 ; | |
1225 ; Zero out carries | |
1226 ; | |
1227 COPY %r0,c1 | |
1228 COPY %r0,c2 | |
1229 COPY %r0,c3 | |
1230 | |
1231 LDO 128(%sp),%sp ; allocate the 128-byte frame | |
1232 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (used by SQR_ADD_C) | |
1233 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by SQR_ADD_C2) | |
1234 | |
1235 ; | |
1236 ; Load up the four input words (a[0]..a[3]) | |
1237 ; | |
1238 FLDD 0(a_ptr),a0 | |
1239 FLDD 8(a_ptr),a1 | |
1240 FLDD 16(a_ptr),a2 | |
1241 FLDD 24(a_ptr),a3 | |
1246 | |
1247 SQR_ADD_C a0L,a0R,c1,c2,c3 | |
1248 | |
1249 STD c1,0(r_ptr) ; r[0] = c1; | |
1250 COPY %r0,c1 | |
1251 | |
1252 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | |
1253 | |
1254 STD c2,8(r_ptr) ; r[1] = c2; | |
1255 COPY %r0,c2 | |
1256 | |
1257 SQR_ADD_C a1L,a1R,c3,c1,c2 | |
1258 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | |
1259 | |
1260 STD c3,16(r_ptr) ; r[2] = c3; | |
1261 COPY %r0,c3 | |
1262 | |
1263 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | |
1264 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | |
1265 | |
1266 STD c1,24(r_ptr) ; r[3] = c1; | |
1267 COPY %r0,c1 | |
1268 | |
1269 SQR_ADD_C a2L,a2R,c2,c3,c1 | |
1270 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | |
1271 | |
1272 STD c2,32(r_ptr) ; r[4] = c2; | |
1273 COPY %r0,c2 | |
1274 | |
1275 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | |
1276 STD c3,40(r_ptr) ; r[5] = c3; | |
1277 COPY %r0,c3 | |
1278 | |
1279 SQR_ADD_C a3L,a3R,c1,c2,c3 | |
1280 STD c1,48(r_ptr) ; r[6] = c1; | |
1281 STD c2,56(r_ptr) ; r[7] = c2; | |
1282 | |
1283 .EXIT | |
1284 LDD -104(%sp),%r6 ; restore r6 | |
1285 LDD -112(%sp),%r5 ; restore r5 | |
1286 LDD -120(%sp),%r4 ; restore r4 | |
; The LDD,MB after the branch releases the 128-byte frame and reloads r3.
1287 BVE (%rp) | |
1288 LDD,MB -128(%sp),%r3 | |
1289 | |
1290 .PROCEND | |
1291 | |
1292 | |
1293 ;--------------------------------------------------------------------------- | |
1294 | |
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Adds the 128-bit product of two 64-bit words into the three-word column
; accumulator C3:C2:C1.  Each word is supplied as its 32-bit FP register
; halves (xL = upper half, xR = lower half).
;
; The product is built from four 32x32 XMPYU partial products which are
; moved to integer registers through scratch slots at -32..-8(%sp).
; Requires high_one to hold 1<<32 (set up by the caller).
; Clobbers: ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the four stack
; scratch slots.
;
1295 MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 | |
1296 XMPYU A0L,B0R,ftemp1 ; m1 = A upper * B lower (cross product) | |
1297 FSTD ftemp1,-16(%sp) ; spill m1 | |
1298 XMPYU A0R,B0L,ftemp2 ; m = A lower * B upper (cross product) | |
1299 FSTD ftemp2,-8(%sp) ; spill m | |
1300 XMPYU A0R,B0R,ftemp3 ; lt = A lower * B lower | |
1301 FSTD ftemp3,-32(%sp) | |
1302 XMPYU A0L,B0L,ftemp4 ; ht = A upper * B upper | |
1303 FSTD ftemp4,-24(%sp) ; spill ht | |
1304 | |
1305 LDD -8(%sp),m ; reload m into an integer register | |
1306 LDD -16(%sp),m1 ; reload m1 into an integer register | |
1307 ADD,L m,m1,m ; m = m + m1 (may wrap; detected by the compare below) | |
1308 | |
1309 DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32, the part that lands in lt | |
1310 LDD -24(%sp),ht ; reload ht | |
1311 | |
1312 CMPCLR,*>>= m,m1,%r0 ; skip the next instruction unless the sum wrapped (m < m1) | |
1313 ADD,L ht,high_one,ht ; sum wrapped: the lost 2^64 is worth 1<<32 in ht | |
1314 | |
1315 EXTRD,U m,31,32,temp1 ; temp1 = (m+m1) >> 32, the part that lands in ht | |
1316 LDD -32(%sp),lt ; reload lt | |
1317 ADD,L ht,temp1,ht ; ht += (m+m1) >> 32; logical add leaves carry alone | |
1318 ADD lt,temp3,lt ; lt += (m+m1) << 32; sets carry | |
1319 ADD,DC ht,%r0,ht ; ht += carry out of the lt add | |
1320 | |
1321 ADD C1,lt,C1 ; C1 += lt | |
1322 ADD,DC ht,%r0,ht ; ht += carry out of the C1 add | |
1323 | |
1324 ADD C2,ht,C2 ; C2 += ht | |
1325 ADD,DC C3,%r0,C3 ; C3 += carry out of the C2 add | |
1326 .endm | |
1327 | |
1328 | |
1329 ; | |
1330 ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
1331 ; arg0 = r_ptr (result: words r[0]..r[15] are all written) | |
1332 ; arg1 = a_ptr (input: words a[0]..a[7] are read) | |
1333 ; arg2 = b_ptr (input: words b[0]..b[7] are read) | |
1334 ; | |
; Multiplies the 8-word numbers at a_ptr and b_ptr into the 16-word result
; at r_ptr, one result column at a time: column k accumulates every
; a[i]*b[j] with i+j == k through MUL_ADD_C.  (c1,c2,c3) is a three-word
; accumulator whose roles rotate after every column: the lowest word is
; stored to r[k], then cleared and reused as the new top word.
;
; Frame: 128 bytes.  r3-r6 and fr12/fr13 are kept at the bottom of the
; frame (-128..-88 from the bumped %sp); MUL_ADD_C uses -32..-8(%sp) as
; scratch.
; NOTE(review): .callinfo says ENTRY_GR=%r3 and names no FP save although
; r3-r6 and fr12/fr13 are saved here; confirm the unwind descriptor.
1335 | |
1336 bn_mul_comba8 | |
1337 .proc | |
1338 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1339 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1340 .entry | |
1341 .align 64 | |
1342 | |
1343 STD %r3,0(%sp) ; save r3 (c3) | |
1344 STD %r4,8(%sp) ; save r4 (m) | |
1345 STD %r5,16(%sp) ; save r5 (lt) | |
1346 STD %r6,24(%sp) ; save r6 (ht) | |
1347 FSTD %fr12,32(%sp) ; save fr12 (callee-save, used as b6) | |
1348 FSTD %fr13,40(%sp) ; save fr13 (callee-save, used as b7) | |
1349 | |
1350 ; | |
1351 ; Zero out carries | |
1352 ; | |
1353 COPY %r0,c1 | |
1354 COPY %r0,c2 | |
1355 COPY %r0,c3 | |
1356 | |
1357 LDO 128(%sp),%sp ; allocate the 128-byte frame | |
1358 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by MUL_ADD_C) | |
1359 | |
1360 ; | |
1361 ; Load up all of the values we are going to use | |
1362 ; | |
1363 FLDD 0(a_ptr),a0 | |
1364 FLDD 8(a_ptr),a1 | |
1365 FLDD 16(a_ptr),a2 | |
1366 FLDD 24(a_ptr),a3 | |
1367 FLDD 32(a_ptr),a4 | |
1368 FLDD 40(a_ptr),a5 | |
1369 FLDD 48(a_ptr),a6 | |
1370 FLDD 56(a_ptr),a7 | |
1371 | |
1372 FLDD 0(b_ptr),b0 | |
1373 FLDD 8(b_ptr),b1 | |
1374 FLDD 16(b_ptr),b2 | |
1375 FLDD 24(b_ptr),b3 | |
1376 FLDD 32(b_ptr),b4 | |
1377 FLDD 40(b_ptr),b5 | |
1378 FLDD 48(b_ptr),b6 | |
1379 FLDD 56(b_ptr),b7 | |
1380 | |
; column 0 -> r[0]
1381 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | |
1382 STD c1,0(r_ptr) | |
1383 COPY %r0,c1 | |
1384 | |
; column 1 -> r[1]
1385 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | |
1386 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | |
1387 STD c2,8(r_ptr) | |
1388 COPY %r0,c2 | |
1389 | |
; column 2 -> r[2]
1390 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | |
1391 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | |
1392 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | |
1393 STD c3,16(r_ptr) | |
1394 COPY %r0,c3 | |
1395 | |
; column 3 -> r[3]
1396 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | |
1397 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | |
1398 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | |
1399 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | |
1400 STD c1,24(r_ptr) | |
1401 COPY %r0,c1 | |
1402 | |
; column 4 -> r[4]
1403 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 | |
1404 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | |
1405 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | |
1406 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | |
1407 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 | |
1408 STD c2,32(r_ptr) | |
1409 COPY %r0,c2 | |
1410 | |
; column 5 -> r[5]
1411 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 | |
1412 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 | |
1413 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | |
1414 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | |
1415 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 | |
1416 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 | |
1417 STD c3,40(r_ptr) | |
1418 COPY %r0,c3 | |
1419 | |
; column 6 -> r[6]
1420 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 | |
1421 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 | |
1422 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 | |
1423 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | |
1424 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 | |
1425 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 | |
1426 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 | |
1427 STD c1,48(r_ptr) | |
1428 COPY %r0,c1 | |
1429 | |
; column 7 -> r[7]
1430 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 | |
1431 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 | |
1432 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 | |
1433 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 | |
1434 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 | |
1435 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 | |
1436 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 | |
1437 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 | |
1438 STD c2,56(r_ptr) | |
1439 COPY %r0,c2 | |
1440 | |
; column 8 -> r[8]
1441 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 | |
1442 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 | |
1443 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 | |
1444 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 | |
1445 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 | |
1446 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 | |
1447 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 | |
1448 STD c3,64(r_ptr) | |
1449 COPY %r0,c3 | |
1450 | |
; column 9 -> r[9]
1451 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 | |
1452 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 | |
1453 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 | |
1454 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 | |
1455 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 | |
1456 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 | |
1457 STD c1,72(r_ptr) | |
1458 COPY %r0,c1 | |
1459 | |
; column 10 -> r[10]
1460 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 | |
1461 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 | |
1462 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 | |
1463 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 | |
1464 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 | |
1465 STD c2,80(r_ptr) | |
1466 COPY %r0,c2 | |
1467 | |
; column 11 -> r[11]
1468 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 | |
1469 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 | |
1470 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 | |
1471 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 | |
1472 STD c3,88(r_ptr) | |
1473 COPY %r0,c3 | |
1474 | |
; column 12 -> r[12]
1475 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 | |
1476 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 | |
1477 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 | |
1478 STD c1,96(r_ptr) | |
1479 COPY %r0,c1 | |
1480 | |
; column 13 -> r[13]
1481 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 | |
1482 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 | |
1483 STD c2,104(r_ptr) | |
1484 COPY %r0,c2 | |
1485 | |
; column 14 -> r[14]; the word above it in the accumulator becomes r[15]
1486 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 | |
1487 STD c3,112(r_ptr) | |
1488 STD c1,120(r_ptr) | |
1489 | |
1490 .EXIT | |
; Reload fr13/fr12 from the slots written at 40/32(%sp) before the bump.
1491 FLDD -88(%sp),%fr13 | |
1492 FLDD -96(%sp),%fr12 | |
1493 LDD -104(%sp),%r6 ; restore r6 | |
1494 LDD -112(%sp),%r5 ; restore r5 | |
1495 LDD -120(%sp),%r4 ; restore r4 | |
; The LDD,MB after the branch releases the 128-byte frame and reloads r3.
1496 BVE (%rp) | |
1497 LDD,MB -128(%sp),%r3 | |
1498 | |
1499 .PROCEND | |
1500 | |
1501 ;----------------------------------------------------------------------------- | |
1502 ; | |
1503 ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
1504 ; arg0 = r_ptr (result: words r[0]..r[7] are all written) | |
1505 ; arg1 = a_ptr (input: words a[0]..a[3] are read) | |
1506 ; arg2 = b_ptr (input: words b[0]..b[3] are read) | |
1507 ; | |
; Multiplies the 4-word numbers at a_ptr and b_ptr into the 8-word result
; at r_ptr, one result column at a time: column k accumulates every
; a[i]*b[j] with i+j == k through MUL_ADD_C.  (c1,c2,c3) is a three-word
; accumulator whose roles rotate after every column: the lowest word is
; stored to r[k], then cleared and reused as the new top word.
;
; Frame: 128 bytes.  r3-r6 and fr12/fr13 are kept at the bottom of the
; frame (-128..-88 from the bumped %sp); MUL_ADD_C uses -32..-8(%sp) as
; scratch.
; NOTE(review): fr12/fr13 are saved and restored here, but this routine
; only names a0-a3/b0-b3 (b3 is %fr9); confirm whether the FP saves are
; actually needed.
1508 | |
1509 bn_mul_comba4 | |
1510 .proc | |
1511 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
1512 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
1513 .entry | |
1514 .align 64 | |
1515 | |
1516 STD %r3,0(%sp) ; save r3 (c3) | |
1517 STD %r4,8(%sp) ; save r4 (m) | |
1518 STD %r5,16(%sp) ; save r5 (lt) | |
1519 STD %r6,24(%sp) ; save r6 (ht) | |
1520 FSTD %fr12,32(%sp) ; save fr12 (callee-save FP register) | |
1521 FSTD %fr13,40(%sp) ; save fr13 (callee-save FP register) | |
1522 | |
1523 ; | |
1524 ; Zero out carries | |
1525 ; | |
1526 COPY %r0,c1 | |
1527 COPY %r0,c2 | |
1528 COPY %r0,c3 | |
1529 | |
1530 LDO 128(%sp),%sp ; allocate the 128-byte frame | |
1531 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 (used by MUL_ADD_C) | |
1532 | |
1533 ; | |
1534 ; Load up all of the values we are going to use | |
1535 ; | |
1536 FLDD 0(a_ptr),a0 | |
1537 FLDD 8(a_ptr),a1 | |
1538 FLDD 16(a_ptr),a2 | |
1539 FLDD 24(a_ptr),a3 | |
1540 | |
1541 FLDD 0(b_ptr),b0 | |
1542 FLDD 8(b_ptr),b1 | |
1543 FLDD 16(b_ptr),b2 | |
1544 FLDD 24(b_ptr),b3 | |
1545 | |
; column 0 -> r[0]
1546 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | |
1547 STD c1,0(r_ptr) | |
1548 COPY %r0,c1 | |
1549 | |
; column 1 -> r[1]
1550 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | |
1551 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | |
1552 STD c2,8(r_ptr) | |
1553 COPY %r0,c2 | |
1554 | |
; column 2 -> r[2]
1555 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | |
1556 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | |
1557 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | |
1558 STD c3,16(r_ptr) | |
1559 COPY %r0,c3 | |
1560 | |
; column 3 -> r[3]
1561 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | |
1562 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | |
1563 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | |
1564 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | |
1565 STD c1,24(r_ptr) | |
1566 COPY %r0,c1 | |
1567 | |
; column 4 -> r[4]
1568 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | |
1569 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | |
1570 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | |
1571 STD c2,32(r_ptr) | |
1572 COPY %r0,c2 | |
1573 | |
; column 5 -> r[5]
1574 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | |
1575 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | |
1576 STD c3,40(r_ptr) | |
1577 COPY %r0,c3 | |
1578 | |
; column 6 -> r[6]; the word above it in the accumulator becomes r[7]
1579 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | |
1580 STD c1,48(r_ptr) | |
1581 STD c2,56(r_ptr) | |
1582 | |
1583 .EXIT | |
; Reload fr13/fr12 from the slots written at 40/32(%sp) before the bump.
1584 FLDD -88(%sp),%fr13 | |
1585 FLDD -96(%sp),%fr12 | |
1586 LDD -104(%sp),%r6 ; restore r6 | |
1587 LDD -112(%sp),%r5 ; restore r5 | |
1588 LDD -120(%sp),%r4 ; restore r4 | |
; The LDD,MB after the branch releases the 128-byte frame and reloads r3.
1589 BVE (%rp) | |
1590 LDD,MB -128(%sp),%r3 | |
1591 | |
1592 .PROCEND | |
1593 | |
1594 | |
; Trailer: re-selects the code subspace, imports $global$, then emits one
; NUL-terminated string literal, labelled C$4, into the $LIT$ subspace.
; None of the comba routines above reference C$4.
; NOTE(review): presumably it is the message used by a division routine
; earlier in this file - confirm before changing the text.
1595 .SPACE $TEXT$ | |
1596 .SUBSPA $CODE$ | |
1597 .SPACE $PRIVATE$,SORT=16 | |
1598 .IMPORT $global$,DATA | |
1599 .SPACE $TEXT$ | |
1600 .SUBSPA $CODE$ | |
1601 .SUBSPA $LIT$,ACCESS=0x2c | |
1602 C$4 | |
1603 .ALIGN 8 | |
1604 .STRINGZ "Division would overflow (%d)\n" | |
1605 .END | |
OLD | NEW |