OLD | NEW |
| (Empty) |
1 dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and | |
2 dnl store the result in a second limb vector. | |
3 | |
4 dnl Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software | |
5 dnl Foundation, Inc. | |
6 | |
7 dnl This file is part of the GNU MP Library. | |
8 | |
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify | |
10 dnl it under the terms of the GNU Lesser General Public License as published | |
11 dnl by the Free Software Foundation; either version 3 of the License, or (at | |
12 dnl your option) any later version. | |
13 | |
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but | |
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
17 dnl License for more details. | |
18 | |
19 dnl You should have received a copy of the GNU Lesser General Public License | |
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
21 | |
22 include(`../config.m4') | |
23 | |
24 C cycles/limb | |
25 C Itanium: 4.0 | |
26 C Itanium 2: 2.0 | |
27 | |
28 C TODO | |
29 C * Further optimize feed-in and wind-down code, both for speed and code size. | |
30 C * Handle low limb input and results specially, using a common stf8 in the | |
31 C epilogue. | |
32 C * Use 1 c/l carry propagation scheme in wind-down code. | |
33 C * Use extra pointer register for `up' to speed up feed-in loads. | |
34 C * Work out final differences with addmul_1.asm. | |
35 | |
36 C INPUT PARAMETERS | |
37 define(`rp', `r32') C destination pointer, advanced by the post-increment stores | |
38 define(`up', `r33') C source pointer, advanced by the post-increment loads | |
39 define(`n', `r34') C limb count; n mod 4 selects the feed-in path | |
40 define(`vl', `r35') C multiplier limb, moved into f6 | |
41 define(`cy', `r36') C carry-in limb, moved into f9; for mpn_mul_1c | |
42 | |
43 ASM_START() | |
44 PROLOGUE(mpn_mul_1) | |
45 .prologue | |
46 .save ar.lc, r2 | |
47 .body | |
48 C Multiply the n limbs at up by vl, store the n result limbs at rp and leave the most significant limb in r8. This entry uses a zero carry-in in f9. With HAVE_ABI_32 the pointers are first widened with addp4 and n is zero-extended. | |
49 ifdef(`HAVE_ABI_32', | |
50 ` addp4 rp = 0, rp C M I | |
51 addp4 up = 0, up C M I | |
52 zxt4 n = n C I | |
53 ;; | |
54 ') | |
55 {.mfi C r15 = n - 1; f9 = 0 as carry-in; r2 keeps ar.lc for the restore before each return | |
56 adds r15 = -1, n C M I | |
57 mov f9 = f0 C F | |
58 mov.i r2 = ar.lc C I0 | |
59 } | |
60 {.mmi C f7 = first source limb; r14 = n mod 4 | |
61 ldf8 f7 = [up], 8 C M | |
62 nop.m 0 C M | |
63 and r14 = 3, n C M I | |
64 ;; | |
65 } | |
66 .Lcommon: C mpn_mul_1c branches here with f9 = cy | |
67 {.mii C f6 = multiplier; r31 = (n - 1) / 4; p10 is set when n mod 4 = 0 | |
68 setf.sig f6 = vl C M2 M3 | |
69 shr.u r31 = r15, 2 C I0 | |
70 cmp.eq p10, p0 = 0, r14 C M I | |
71 } | |
72 {.mii C p11 is set when n mod 4 = 2; p12 is set when n mod 4 = 3 | |
73 cmp.eq p11, p0 = 2, r14 C M I | |
74 cmp.eq p12, p0 = 3, r14 C M I | |
75 nop.i 0 C I | |
76 ;; | |
77 } | |
78 {.mii C r0 != r0 is false, so p6 and p8 are cleared and p7 and p9 are set: no carry pending; ar.lc = r31 | |
79 cmp.ne p6, p7 = r0, r0 C M I | |
80 mov.i ar.lc = r31 C I0 | |
81 cmp.ne p8, p9 = r0, r0 C M I | |
82 } | |
83 {.bbb C dispatch on n mod 4; the remaining case n mod 4 = 1 falls through to .Lb01 | |
84 (p10) br.dptk .Lb00 C B | |
85 (p11) br.dptk .Lb10 C B | |
86 (p12) br.dptk .Lb11 C B | |
87 ;; | |
88 } | |
89 | |
90 .Lb01: mov r20 = 0 C feed-in for n mod 4 = 1; r20 = 0 is the high-limb addend of the first sum when the loop is entered at .LL01 | |
91 br.cloop.dptk .grt1 C B | |
92 C n = 1: one product plus the carry-in f9; low limb is stored, high limb is returned in r8 | |
93 xma.l f39 = f7, f6, f9 C F | |
94 xma.hu f43 = f7, f6, f9 C F | |
95 ;; | |
96 getf.sig r8 = f43 C M2 | |
97 stf8 [rp] = f39 C M2 M3 | |
98 mov.i ar.lc = r2 C I0 | |
99 br.ret.sptk.many b0 C B | |
100 C n >= 5: load four more limbs while the first product is started | |
101 .grt1: | |
102 ldf8 f32 = [up], 8 | |
103 ;; | |
104 ldf8 f33 = [up], 8 | |
105 ;; | |
106 ldf8 f34 = [up], 8 | |
107 xma.l f39 = f7, f6, f9 | |
108 xma.hu f43 = f7, f6, f9 | |
109 ;; | |
110 ldf8 f35 = [up], 8 | |
111 br.cloop.dptk .grt5 | |
112 C n = 5: the first low limb f39 is stored directly; the other four products are summed from .Lcj5 on | |
113 xma.l f36 = f32, f6, f0 | |
114 xma.hu f40 = f32, f6, f0 | |
115 ;; | |
116 stf8 [rp] = f39, 8 | |
117 xma.l f37 = f33, f6, f0 | |
118 xma.hu f41 = f33, f6, f0 | |
119 ;; | |
120 getf.sig r21 = f43 | |
121 getf.sig r18 = f36 | |
122 xma.l f38 = f34, f6, f0 | |
123 xma.hu f42 = f34, f6, f0 | |
124 ;; | |
125 getf.sig r22 = f40 | |
126 getf.sig r19 = f37 | |
127 xma.l f39 = f35, f6, f0 | |
128 xma.hu f43 = f35, f6, f0 | |
129 ;; | |
130 getf.sig r23 = f41 | |
131 getf.sig r16 = f38 | |
132 br .Lcj5 | |
133 C n >= 9: start the products, prime r17, r21, r18, r22 and r19, reload f32 to f34, then join the main loop | |
134 .grt5: | |
135 xma.l f36 = f32, f6, f0 | |
136 xma.hu f40 = f32, f6, f0 | |
137 ;; | |
138 getf.sig r17 = f39 | |
139 ldf8 f32 = [up], 8 | |
140 xma.l f37 = f33, f6, f0 | |
141 xma.hu f41 = f33, f6, f0 | |
142 ;; | |
143 getf.sig r21 = f43 | |
144 ldf8 f33 = [up], 8 | |
145 xma.l f38 = f34, f6, f0 | |
146 ;; | |
147 getf.sig r18 = f36 | |
148 xma.hu f42 = f34, f6, f0 | |
149 ;; | |
150 getf.sig r22 = f40 | |
151 ldf8 f34 = [up], 8 | |
152 xma.l f39 = f35, f6, f0 | |
153 ;; | |
154 getf.sig r19 = f37 | |
155 xma.hu f43 = f35, f6, f0 | |
156 br .LL01 | |
157 | |
158 | |
159 .Lb10: ldf8 f35 = [up], 8 C feed-in for n mod 4 = 2; f35 = second source limb | |
160 mov r23 = 0 C r23 = 0 is the high-limb addend of the first sum when the loop is entered at .LL10 | |
161 br.cloop.dptk .grt2 | |
162 C n = 2: the second product takes the first high limb f42 as its addend, so no integer carry chain is used; r8 = final high limb | |
163 xma.l f38 = f7, f6, f9 | |
164 xma.hu f42 = f7, f6, f9 | |
165 ;; | |
166 stf8 [rp] = f38, 8 | |
167 xma.l f39 = f35, f6, f42 | |
168 xma.hu f43 = f35, f6, f42 | |
169 ;; | |
170 getf.sig r8 = f43 | |
171 stf8 [rp] = f39 | |
172 mov.i ar.lc = r2 | |
173 br.ret.sptk.many b0 | |
174 C n >= 6: load further limbs while the first two products are started | |
175 | |
176 .grt2: | |
177 ldf8 f32 = [up], 8 | |
178 ;; | |
179 ldf8 f33 = [up], 8 | |
180 xma.l f38 = f7, f6, f9 | |
181 xma.hu f42 = f7, f6, f9 | |
182 ;; | |
183 ldf8 f34 = [up], 8 | |
184 xma.l f39 = f35, f6, f0 | |
185 xma.hu f43 = f35, f6, f0 | |
186 ;; | |
187 ldf8 f35 = [up], 8 | |
188 br.cloop.dptk .grt6 | |
189 C n = 6: the first low limb f38 is stored directly; the remaining sums are done from .Lcj6 on | |
190 stf8 [rp] = f38, 8 | |
191 xma.l f36 = f32, f6, f0 | |
192 xma.hu f40 = f32, f6, f0 | |
193 ;; | |
194 getf.sig r20 = f42 | |
195 getf.sig r17 = f39 | |
196 xma.l f37 = f33, f6, f0 | |
197 xma.hu f41 = f33, f6, f0 | |
198 ;; | |
199 getf.sig r21 = f43 | |
200 getf.sig r18 = f36 | |
201 xma.l f38 = f34, f6, f0 | |
202 xma.hu f42 = f34, f6, f0 | |
203 ;; | |
204 getf.sig r22 = f40 | |
205 getf.sig r19 = f37 | |
206 xma.l f39 = f35, f6, f0 | |
207 xma.hu f43 = f35, f6, f0 | |
208 br .Lcj6 | |
209 C n >= 10: prime r16, r20, r17, r21 and r18, reload f32 and f33, then join the main loop | |
210 .grt6: | |
211 getf.sig r16 = f38 | |
212 xma.l f36 = f32, f6, f0 | |
213 xma.hu f40 = f32, f6, f0 | |
214 ;; | |
215 getf.sig r20 = f42 | |
216 ldf8 f32 = [up], 8 | |
217 xma.l f37 = f33, f6, f0 | |
218 ;; | |
219 getf.sig r17 = f39 | |
220 xma.hu f41 = f33, f6, f0 | |
221 ;; | |
222 getf.sig r21 = f43 | |
223 ldf8 f33 = [up], 8 | |
224 xma.l f38 = f34, f6, f0 | |
225 ;; | |
226 getf.sig r18 = f36 | |
227 xma.hu f42 = f34, f6, f0 | |
228 br .LL10 | |
229 | |
230 | |
231 .Lb11: ldf8 f34 = [up], 8 C feed-in for n mod 4 = 3; f34 = second source limb | |
232 mov r22 = 0 C r22 = 0 is the high-limb addend of the first sum when the loop is entered at .LL11 | |
233 ;; | |
234 ldf8 f35 = [up], 8 | |
235 br.cloop.dptk .grt3 | |
236 ;; | |
237 C n = 3: all three products are formed here and the first low limb f37 is stored directly; the sums are done from .Lcj3 on | |
238 xma.l f37 = f7, f6, f9 | |
239 xma.hu f41 = f7, f6, f9 | |
240 xma.l f38 = f34, f6, f0 | |
241 xma.hu f42 = f34, f6, f0 | |
242 xma.l f39 = f35, f6, f0 | |
243 xma.hu f43 = f35, f6, f0 | |
244 ;; | |
245 getf.sig r23 = f41 | |
246 stf8 [rp] = f37, 8 | |
247 getf.sig r16 = f38 | |
248 getf.sig r20 = f42 | |
249 getf.sig r17 = f39 | |
250 getf.sig r8 = f43 | |
251 br .Lcj3 | |
252 C n >= 7: load further limbs while the first three products are started | |
253 .grt3: | |
254 ldf8 f32 = [up], 8 | |
255 xma.l f37 = f7, f6, f9 | |
256 xma.hu f41 = f7, f6, f9 | |
257 ;; | |
258 ldf8 f33 = [up], 8 | |
259 xma.l f38 = f34, f6, f0 | |
260 xma.hu f42 = f34, f6, f0 | |
261 ;; | |
262 getf.sig r19 = f37 | |
263 ldf8 f34 = [up], 8 | |
264 xma.l f39 = f35, f6, f0 | |
265 xma.hu f43 = f35, f6, f0 | |
266 ;; | |
267 getf.sig r23 = f41 | |
268 ldf8 f35 = [up], 8 | |
269 br.cloop.dptk .grt7 | |
270 C n = 7: the first low limb r19 is stored directly below; the remaining sums are done from .Lcj7 on | |
271 getf.sig r16 = f38 | |
272 xma.l f36 = f32, f6, f0 | |
273 getf.sig r20 = f42 | |
274 xma.hu f40 = f32, f6, f0 | |
275 ;; | |
276 getf.sig r17 = f39 | |
277 xma.l f37 = f33, f6, f0 | |
278 getf.sig r21 = f43 | |
279 xma.hu f41 = f33, f6, f0 | |
280 ;; | |
281 getf.sig r18 = f36 | |
282 st8 [rp] = r19, 8 | |
283 xma.l f38 = f34, f6, f0 | |
284 xma.hu f42 = f34, f6, f0 | |
285 br .Lcj7 | |
286 C n >= 11: prime r16, r20 and r17, reload f32, then join the main loop | |
287 .grt7: | |
288 getf.sig r16 = f38 | |
289 xma.l f36 = f32, f6, f0 | |
290 xma.hu f40 = f32, f6, f0 | |
291 ;; | |
292 getf.sig r20 = f42 | |
293 ldf8 f32 = [up], 8 | |
294 xma.l f37 = f33, f6, f0 | |
295 ;; | |
296 getf.sig r17 = f39 | |
297 xma.hu f41 = f33, f6, f0 | |
298 br .LL11 | |
299 | |
300 | |
301 .Lb00: ldf8 f33 = [up], 8 C feed-in for n mod 4 = 0; f33 = second source limb | |
302 mov r21 = 0 C r21 = 0 is the high-limb addend of the first sum when the loop is entered at .LL00 | |
303 ;; | |
304 ldf8 f34 = [up], 8 | |
305 ;; | |
306 ldf8 f35 = [up], 8 | |
307 xma.l f36 = f7, f6, f9 | |
308 xma.hu f40 = f7, f6, f9 | |
309 br.cloop.dptk .grt4 | |
310 C n = 4: the first low limb f36 is stored directly; the remaining sums are done from .Lcj4 on | |
311 xma.l f37 = f33, f6, f0 | |
312 xma.hu f41 = f33, f6, f0 | |
313 xma.l f38 = f34, f6, f0 | |
314 xma.hu f42 = f34, f6, f0 | |
315 ;; | |
316 getf.sig r22 = f40 | |
317 stf8 [rp] = f36, 8 | |
318 xma.l f39 = f35, f6, f0 | |
319 getf.sig r19 = f37 | |
320 xma.hu f43 = f35, f6, f0 | |
321 ;; | |
322 getf.sig r23 = f41 | |
323 getf.sig r16 = f38 | |
324 getf.sig r20 = f42 | |
325 getf.sig r17 = f39 | |
326 br .Lcj4 | |
327 C n >= 8: load further limbs while the next three products are started | |
328 .grt4: | |
329 ldf8 f32 = [up], 8 | |
330 xma.l f37 = f33, f6, f0 | |
331 xma.hu f41 = f33, f6, f0 | |
332 ;; | |
333 getf.sig r18 = f36 | |
334 ldf8 f33 = [up], 8 | |
335 xma.l f38 = f34, f6, f0 | |
336 xma.hu f42 = f34, f6, f0 | |
337 ;; | |
338 getf.sig r22 = f40 | |
339 ldf8 f34 = [up], 8 | |
340 xma.l f39 = f35, f6, f0 | |
341 ;; | |
342 getf.sig r19 = f37 | |
343 getf.sig r23 = f41 | |
344 xma.hu f43 = f35, f6, f0 | |
345 ldf8 f35 = [up], 8 | |
346 br.cloop.dptk .grt8 | |
347 C n = 8: the first low limb r18 is stored directly below; the remaining sums are done from .Lcj8 on | |
348 getf.sig r16 = f38 | |
349 xma.l f36 = f32, f6, f0 | |
350 getf.sig r20 = f42 | |
351 xma.hu f40 = f32, f6, f0 | |
352 ;; | |
353 getf.sig r17 = f39 | |
354 st8 [rp] = r18, 8 | |
355 xma.l f37 = f33, f6, f0 | |
356 xma.hu f41 = f33, f6, f0 | |
357 br .Lcj8 | |
358 C n >= 12: prime r16, start the next product and join the main loop | |
359 .grt8: | |
360 getf.sig r16 = f38 | |
361 xma.l f36 = f32, f6, f0 | |
362 xma.hu f40 = f32, f6, f0 | |
363 br .LL00 | |
364 | |
365 | |
366 C *** MAIN LOOP START *** | |
367 ALIGN(32) | |
368 .Loop: C four limbs per pass; each step forms r24 = low limb + previous high limb + carry, stores r24 and tests for carry out; the carry flag alternates between the pairs p6/p7 and p8/p9 | |
369 .pred.rel "mutex",p6,p7 | |
370 getf.sig r16 = f38 | |
371 xma.l f36 = f32, f6, f0 | |
372 (p6) cmp.leu p8, p9 = r24, r17 C the sum included +1: carry out when r24 <= r17 | |
373 st8 [rp] = r24, 8 | |
374 xma.hu f40 = f32, f6, f0 | |
375 (p7) cmp.ltu p8, p9 = r24, r17 C the sum had no +1: carry out when r24 < r17 | |
376 ;; | |
377 .LL00: C loop entry used by the n mod 4 = 0 feed-in | |
378 .pred.rel "mutex",p8,p9 | |
379 getf.sig r20 = f42 | |
380 (p8) add r24 = r18, r21, 1 C low limb + previous high limb + carry | |
381 nop.b 0 | |
382 ldf8 f32 = [up], 8 | |
383 (p9) add r24 = r18, r21 C low limb + previous high limb, no carry | |
384 nop.b 0 | |
385 ;; | |
386 .pred.rel "mutex",p8,p9 | |
387 getf.sig r17 = f39 | |
388 xma.l f37 = f33, f6, f0 | |
389 (p8) cmp.leu p6, p7 = r24, r18 | |
390 st8 [rp] = r24, 8 | |
391 xma.hu f41 = f33, f6, f0 | |
392 (p9) cmp.ltu p6, p7 = r24, r18 | |
393 ;; | |
394 .LL11: C loop entry used by the n mod 4 = 3 feed-in | |
395 .pred.rel "mutex",p6,p7 | |
396 getf.sig r21 = f43 | |
397 (p6) add r24 = r19, r22, 1 | |
398 nop.b 0 | |
399 ldf8 f33 = [up], 8 | |
400 (p7) add r24 = r19, r22 | |
401 nop.b 0 | |
402 ;; | |
403 .pred.rel "mutex",p6,p7 | |
404 getf.sig r18 = f36 | |
405 xma.l f38 = f34, f6, f0 | |
406 (p6) cmp.leu p8, p9 = r24, r19 | |
407 st8 [rp] = r24, 8 | |
408 xma.hu f42 = f34, f6, f0 | |
409 (p7) cmp.ltu p8, p9 = r24, r19 | |
410 ;; | |
411 .LL10: C loop entry used by the n mod 4 = 2 feed-in | |
412 .pred.rel "mutex",p8,p9 | |
413 getf.sig r22 = f40 | |
414 (p8) add r24 = r16, r23, 1 | |
415 nop.b 0 | |
416 ldf8 f34 = [up], 8 | |
417 (p9) add r24 = r16, r23 | |
418 nop.b 0 | |
419 ;; | |
420 .pred.rel "mutex",p8,p9 | |
421 getf.sig r19 = f37 | |
422 xma.l f39 = f35, f6, f0 | |
423 (p8) cmp.leu p6, p7 = r24, r16 | |
424 st8 [rp] = r24, 8 | |
425 xma.hu f43 = f35, f6, f0 | |
426 (p9) cmp.ltu p6, p7 = r24, r16 | |
427 ;; | |
428 .LL01: C loop entry used by the n mod 4 = 1 feed-in | |
429 .pred.rel "mutex",p6,p7 | |
430 getf.sig r23 = f41 | |
431 (p6) add r24 = r17, r20, 1 | |
432 nop.b 0 | |
433 ldf8 f35 = [up], 8 | |
434 (p7) add r24 = r17, r20 | |
435 br.cloop.dptk .Loop C counted loop on ar.lc, which was set to (n - 1) / 4 and decremented by the feed-in br.cloop branches | |
436 C *** MAIN LOOP END *** | |
437 ;; | |
438 | |
439 .Lcj9: C wind-down: the same sum, store and carry steps as the main loop but with no further loads; reached by falling out of the loop | |
440 .pred.rel "mutex",p6,p7 | |
441 getf.sig r16 = f38 | |
442 xma.l f36 = f32, f6, f0 | |
443 (p6) cmp.leu p8, p9 = r24, r17 | |
444 st8 [rp] = r24, 8 | |
445 xma.hu f40 = f32, f6, f0 | |
446 (p7) cmp.ltu p8, p9 = r24, r17 | |
447 ;; | |
448 .pred.rel "mutex",p8,p9 | |
449 getf.sig r20 = f42 | |
450 (p8) add r24 = r18, r21, 1 | |
451 (p9) add r24 = r18, r21 | |
452 ;; | |
453 .pred.rel "mutex",p8,p9 | |
454 getf.sig r17 = f39 | |
455 xma.l f37 = f33, f6, f0 | |
456 (p8) cmp.leu p6, p7 = r24, r18 | |
457 st8 [rp] = r24, 8 | |
458 xma.hu f41 = f33, f6, f0 | |
459 (p9) cmp.ltu p6, p7 = r24, r18 | |
460 ;; | |
461 .Lcj8: C also entered directly by the feed-in code when n = 8 | |
462 .pred.rel "mutex",p6,p7 | |
463 getf.sig r21 = f43 | |
464 (p6) add r24 = r19, r22, 1 | |
465 (p7) add r24 = r19, r22 | |
466 ;; | |
467 .pred.rel "mutex",p6,p7 | |
468 getf.sig r18 = f36 | |
469 xma.l f38 = f34, f6, f0 | |
470 (p6) cmp.leu p8, p9 = r24, r19 | |
471 st8 [rp] = r24, 8 | |
472 xma.hu f42 = f34, f6, f0 | |
473 (p7) cmp.ltu p8, p9 = r24, r19 | |
474 ;; | |
475 .Lcj7: C also entered directly by the feed-in code when n = 7 | |
476 .pred.rel "mutex",p8,p9 | |
477 getf.sig r22 = f40 | |
478 (p8) add r24 = r16, r23, 1 | |
479 (p9) add r24 = r16, r23 | |
480 ;; | |
481 .pred.rel "mutex",p8,p9 | |
482 getf.sig r19 = f37 | |
483 xma.l f39 = f35, f6, f0 | |
484 (p8) cmp.leu p6, p7 = r24, r16 | |
485 st8 [rp] = r24, 8 | |
486 xma.hu f43 = f35, f6, f0 | |
487 (p9) cmp.ltu p6, p7 = r24, r16 | |
488 ;; | |
489 .Lcj6: C also entered directly by the feed-in code when n = 6; no products are started from here on | |
490 .pred.rel "mutex",p6,p7 | |
491 getf.sig r23 = f41 | |
492 (p6) add r24 = r17, r20, 1 | |
493 (p7) add r24 = r17, r20 | |
494 ;; | |
495 .pred.rel "mutex",p6,p7 | |
496 (p6) cmp.leu p8, p9 = r24, r17 | |
497 (p7) cmp.ltu p8, p9 = r24, r17 | |
498 getf.sig r16 = f38 | |
499 st8 [rp] = r24, 8 | |
500 ;; | |
501 .Lcj5: C also entered directly by the feed-in code when n = 5 | |
502 .pred.rel "mutex",p8,p9 | |
503 getf.sig r20 = f42 | |
504 (p8) add r24 = r18, r21, 1 | |
505 (p9) add r24 = r18, r21 | |
506 ;; | |
507 .pred.rel "mutex",p8,p9 | |
508 (p8) cmp.leu p6, p7 = r24, r18 | |
509 (p9) cmp.ltu p6, p7 = r24, r18 | |
510 getf.sig r17 = f39 | |
511 st8 [rp] = r24, 8 | |
512 ;; | |
513 .Lcj4: C also entered directly by the feed-in code when n = 4 | |
514 .pred.rel "mutex",p6,p7 | |
515 getf.sig r8 = f43 C r8 = high limb of the last product; the final carry is added to it below | |
516 (p6) add r24 = r19, r22, 1 | |
517 (p7) add r24 = r19, r22 | |
518 ;; | |
519 .pred.rel "mutex",p6,p7 | |
520 st8 [rp] = r24, 8 | |
521 (p6) cmp.leu p8, p9 = r24, r19 | |
522 (p7) cmp.ltu p8, p9 = r24, r19 | |
523 ;; | |
524 .Lcj3: C also entered directly by the feed-in code when n = 3, with r8 already loaded there | |
525 .pred.rel "mutex",p8,p9 | |
526 (p8) add r24 = r16, r23, 1 | |
527 (p9) add r24 = r16, r23 | |
528 ;; | |
529 .pred.rel "mutex",p8,p9 | |
530 st8 [rp] = r24, 8 | |
531 (p8) cmp.leu p6, p7 = r24, r16 | |
532 (p9) cmp.ltu p6, p7 = r24, r16 | |
533 ;; | |
534 .Lcj2: C last result limb; within the code shown this label is reached by fall-through only | |
535 .pred.rel "mutex",p6,p7 | |
536 (p6) add r24 = r17, r20, 1 | |
537 (p7) add r24 = r17, r20 | |
538 ;; | |
539 .pred.rel "mutex",p6,p7 | |
540 st8 [rp] = r24, 8 | |
541 (p6) cmp.leu p8, p9 = r24, r17 | |
542 (p7) cmp.ltu p8, p9 = r24, r17 | |
543 ;; | |
544 .pred.rel "mutex",p8,p9 | |
545 (p8) add r8 = 1, r8 C add the final carry into the returned high limb | |
546 mov.i ar.lc = r2 C restore the ar.lc value saved at entry | |
547 br.ret.sptk.many b0 | |
548 EPILOGUE() | |
549 | |
550 PROLOGUE(mpn_mul_1c) | |
551 .prologue | |
552 .save ar.lc, r2 | |
553 .body | |
554 C Same setup as mpn_mul_1 except that f9, the addend of the first product, is loaded from the carry-in limb cy instead of zero; the rest runs in the shared code at .Lcommon. | |
555 ifdef(`HAVE_ABI_32', | |
556 ` addp4 rp = 0, rp C M I | |
557 addp4 up = 0, up C M I | |
558 zxt4 n = n C I | |
559 ;; | |
560 ') | |
561 {.mmi C r15 = n - 1; f9 = cy; r2 keeps ar.lc for the restore before each return | |
562 adds r15 = -1, n C M I | |
563 setf.sig f9 = cy C M2 M3 | |
564 mov.i r2 = ar.lc C I0 | |
565 } | |
566 {.mmb C f7 = first source limb; r14 = n mod 4 | |
567 ldf8 f7 = [up], 8 C M | |
568 and r14 = 3, n C M I | |
569 br.sptk .Lcommon C continue in the shared code inside mpn_mul_1 | |
570 ;; | |
571 } | |
572 EPILOGUE() | |
573 ASM_END() | |
OLD | NEW |