Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5)

Side by Side Diff: gcc/gmp/mpn/ia64/mul_1.asm

Issue 3050029: [gcc] GCC 4.5.0=>4.5.1 (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/nacl-toolchain.git
Patch Set: Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « gcc/gmp/mpn/ia64/mode1o.asm ('k') | gcc/gmp/mpn/ia64/mul_2.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
2 dnl store the result in a second limb vector.
3
4 dnl Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
5 dnl Foundation, Inc.
6
7 dnl This file is part of the GNU MP Library.
8
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
13
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
18
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21
22 include(`../config.m4')
23
24 C cycles/limb
25 C Itanium: 4.0
26 C Itanium 2: 2.0
27
28 C TODO
29 C * Further optimize feed-in and wind-down code, both for speed and code size.
30 C * Handle low limb input and results specially, using a common stf8 in the
31 C epilogue.
32 C * Use 1 c/l carry propagation scheme in wind-down code.
33 C * Use extra pointer register for `up' to speed up feed-in loads.
34 C * Work out final differences with addmul_1.asm.
35
36 C INPUT PARAMETERS
37 define(`rp', `r32') C destination pointer: result limbs are stored through it
38 define(`up', `r33') C source pointer: limbs are loaded through it with a post-increment of 8
39 define(`n', `r34') C limb count; n - 1 and n mod 4 are derived from it at entry
40 define(`vl', `r35') C multiplier limb, moved into f6 at .Lcommon
41 define(`cy', `r36') C for mpn_mul_1c: carry-in limb, moved into f9 at that entry point
42
43 ASM_START()
44 PROLOGUE(mpn_mul_1)
45 .prologue
46 .save ar.lc, r2
47 .body
48 C Entry without carry-in: f9, the addend of the first product, is copied from f0.
49 ifdef(`HAVE_ABI_32',
50 ` addp4 rp = 0, rp C M I
51 addp4 up = 0, up C M I
52 zxt4 n = n C I
53 ;;
54 ')
55 {.mfi
56 adds r15 = -1, n C M I -- r15 = n - 1
57 mov f9 = f0 C F -- addend for the first product
58 mov.i r2 = ar.lc C I0 -- keep the incoming ar.lc in r2; it is written back before each return
59 }
60 {.mmi
61 ldf8 f7 = [up], 8 C M -- f7 = first source limb; up advances by 8
62 nop.m 0 C M
63 and r14 = 3, n C M I -- r14 = n mod 4, selects the feed-in path
64 ;;
65 }
66 .Lcommon: C mpn_mul_1c branches here after loading f9 from cy
67 {.mii
68 setf.sig f6 = vl C M2 M3 -- f6 = multiplier limb
69 shr.u r31 = r15, 2 C I0 -- r31 = (n - 1) >> 2
70 cmp.eq p10, p0 = 0, r14 C M I -- p10: n mod 4 == 0
71 }
72 {.mii
73 cmp.eq p11, p0 = 2, r14 C M I -- p11: n mod 4 == 2
74 cmp.eq p12, p0 = 3, r14 C M I -- p12: n mod 4 == 3
75 nop.i 0 C I
76 ;;
77 }
78 {.mii
79 cmp.ne p6, p7 = r0, r0 C M I -- compares r0 with itself, so p6 = 0 and p7 = 1
80 mov.i ar.lc = r31 C I0 -- loop counter consumed by the br.cloop instructions that follow
81 cmp.ne p8, p9 = r0, r0 C M I -- likewise p8 = 0 and p9 = 1
82 }
83 {.bbb
84 (p10) br.dptk .Lb00 C B
85 (p11) br.dptk .Lb10 C B
86 (p12) br.dptk .Lb11 C B -- if none is taken, fall through to .Lb01 (n mod 4 == 1)
87 ;;
88 }
89
90 .Lb01: mov r20 = 0 C feed-in for n mod 4 == 1; r20 is the zero addend of the first sum formed at .LL01
91 br.cloop.dptk .grt1 C B -- taken (and ar.lc decremented) unless ar.lc == 0
92 C Not taken: (n - 1) >> 2 == 0 on this path, i.e. n == 1. One product, then return.
93 xma.l f39 = f7, f6, f9 C F -- low limb of f7 * f6 + f9
94 xma.hu f43 = f7, f6, f9 C F -- high limb of f7 * f6 + f9
95 ;;
96 getf.sig r8 = f43 C M2 -- high limb goes to r8
97 stf8 [rp] = f39 C M2 M3 -- low limb goes to the destination
98 mov.i ar.lc = r2 C I0 -- put back the saved ar.lc
99 br.ret.sptk.many b0 C B
100
101 .grt1: C n > 1: load four more source limbs while the first product is formed
102 ldf8 f32 = [up], 8
103 ;;
104 ldf8 f33 = [up], 8
105 ;;
106 ldf8 f34 = [up], 8
107 xma.l f39 = f7, f6, f9
108 xma.hu f43 = f7, f6, f9
109 ;;
110 ldf8 f35 = [up], 8
111 br.cloop.dptk .grt5
112 C Not taken: n == 5. Form the remaining four products and join the wind-down at .Lcj5.
113 xma.l f36 = f32, f6, f0
114 xma.hu f40 = f32, f6, f0
115 ;;
116 stf8 [rp] = f39, 8 C first result limb, stored straight from the FP register
117 xma.l f37 = f33, f6, f0
118 xma.hu f41 = f33, f6, f0
119 ;;
120 getf.sig r21 = f43
121 getf.sig r18 = f36
122 xma.l f38 = f34, f6, f0
123 xma.hu f42 = f34, f6, f0
124 ;;
125 getf.sig r22 = f40
126 getf.sig r19 = f37
127 xma.l f39 = f35, f6, f0
128 xma.hu f43 = f35, f6, f0
129 ;;
130 getf.sig r23 = f41
131 getf.sig r16 = f38
132 br .Lcj5
133
134 .grt5: C n > 5: prime the pipeline, then enter the main loop at .LL01
135 xma.l f36 = f32, f6, f0
136 xma.hu f40 = f32, f6, f0
137 ;;
138 getf.sig r17 = f39 C first low limb; summed with r20 (zero) at .LL01
139 ldf8 f32 = [up], 8
140 xma.l f37 = f33, f6, f0
141 xma.hu f41 = f33, f6, f0
142 ;;
143 getf.sig r21 = f43
144 ldf8 f33 = [up], 8
145 xma.l f38 = f34, f6, f0
146 ;;
147 getf.sig r18 = f36
148 xma.hu f42 = f34, f6, f0
149 ;;
150 getf.sig r22 = f40
151 ldf8 f34 = [up], 8
152 xma.l f39 = f35, f6, f0
153 ;;
154 getf.sig r19 = f37
155 xma.hu f43 = f35, f6, f0
156 br .LL01
157
158
159 .Lb10: ldf8 f35 = [up], 8 C feed-in for n mod 4 == 2; f35 = second source limb
160 mov r23 = 0 C zero addend of the first sum formed at .LL10
161 br.cloop.dptk .grt2
162 C Not taken: (n - 1) >> 2 == 0 on this path, i.e. n == 2.
163 xma.l f38 = f7, f6, f9
164 xma.hu f42 = f7, f6, f9
165 ;;
166 stf8 [rp] = f38, 8
167 xma.l f39 = f35, f6, f42 C second product takes the first high limb (f42) as its addend
168 xma.hu f43 = f35, f6, f42
169 ;;
170 getf.sig r8 = f43 C final high limb goes to r8
171 stf8 [rp] = f39
172 mov.i ar.lc = r2 C put back the saved ar.lc
173 br.ret.sptk.many b0
174
175
176 .grt2: C n > 2: load four more source limbs while the first two products are formed
177 ldf8 f32 = [up], 8
178 ;;
179 ldf8 f33 = [up], 8
180 xma.l f38 = f7, f6, f9
181 xma.hu f42 = f7, f6, f9
182 ;;
183 ldf8 f34 = [up], 8
184 xma.l f39 = f35, f6, f0
185 xma.hu f43 = f35, f6, f0
186 ;;
187 ldf8 f35 = [up], 8
188 br.cloop.dptk .grt6
189 C Not taken: n == 6. Form the remaining products and join the wind-down at .Lcj6.
190 stf8 [rp] = f38, 8 C first result limb, stored straight from the FP register
191 xma.l f36 = f32, f6, f0
192 xma.hu f40 = f32, f6, f0
193 ;;
194 getf.sig r20 = f42
195 getf.sig r17 = f39
196 xma.l f37 = f33, f6, f0
197 xma.hu f41 = f33, f6, f0
198 ;;
199 getf.sig r21 = f43
200 getf.sig r18 = f36
201 xma.l f38 = f34, f6, f0
202 xma.hu f42 = f34, f6, f0
203 ;;
204 getf.sig r22 = f40
205 getf.sig r19 = f37
206 xma.l f39 = f35, f6, f0
207 xma.hu f43 = f35, f6, f0
208 br .Lcj6
209
210 .grt6: C n > 6: prime the pipeline, then enter the main loop at .LL10
211 getf.sig r16 = f38 C first low limb; summed with r23 (zero) at .LL10
212 xma.l f36 = f32, f6, f0
213 xma.hu f40 = f32, f6, f0
214 ;;
215 getf.sig r20 = f42
216 ldf8 f32 = [up], 8
217 xma.l f37 = f33, f6, f0
218 ;;
219 getf.sig r17 = f39
220 xma.hu f41 = f33, f6, f0
221 ;;
222 getf.sig r21 = f43
223 ldf8 f33 = [up], 8
224 xma.l f38 = f34, f6, f0
225 ;;
226 getf.sig r18 = f36
227 xma.hu f42 = f34, f6, f0
228 br .LL10
229
230
231 .Lb11: ldf8 f34 = [up], 8 C feed-in for n mod 4 == 3; f34 = second source limb
232 mov r22 = 0 C zero addend of the first sum formed at .LL11
233 ;;
234 ldf8 f35 = [up], 8
235 br.cloop.dptk .grt3
236 ;;
237 C Not taken: (n - 1) >> 2 == 0 on this path, i.e. n == 3.
238 xma.l f37 = f7, f6, f9
239 xma.hu f41 = f7, f6, f9
240 xma.l f38 = f34, f6, f0
241 xma.hu f42 = f34, f6, f0
242 xma.l f39 = f35, f6, f0
243 xma.hu f43 = f35, f6, f0
244 ;;
245 getf.sig r23 = f41
246 stf8 [rp] = f37, 8 C first result limb, stored straight from the FP register
247 getf.sig r16 = f38
248 getf.sig r20 = f42
249 getf.sig r17 = f39
250 getf.sig r8 = f43 C high limb of the last product; the wind-down adds the final carry to r8
251 br .Lcj3
252
253 .grt3: C n > 3: load four more source limbs while the first three products are formed
254 ldf8 f32 = [up], 8
255 xma.l f37 = f7, f6, f9
256 xma.hu f41 = f7, f6, f9
257 ;;
258 ldf8 f33 = [up], 8
259 xma.l f38 = f34, f6, f0
260 xma.hu f42 = f34, f6, f0
261 ;;
262 getf.sig r19 = f37 C first low limb
263 ldf8 f34 = [up], 8
264 xma.l f39 = f35, f6, f0
265 xma.hu f43 = f35, f6, f0
266 ;;
267 getf.sig r23 = f41
268 ldf8 f35 = [up], 8
269 br.cloop.dptk .grt7
270 C Not taken: n == 7. Join the wind-down at .Lcj7.
271 getf.sig r16 = f38
272 xma.l f36 = f32, f6, f0
273 getf.sig r20 = f42
274 xma.hu f40 = f32, f6, f0
275 ;;
276 getf.sig r17 = f39
277 xma.l f37 = f33, f6, f0
278 getf.sig r21 = f43
279 xma.hu f41 = f33, f6, f0
280 ;;
281 getf.sig r18 = f36
282 st8 [rp] = r19, 8 C first result limb
283 xma.l f38 = f34, f6, f0
284 xma.hu f42 = f34, f6, f0
285 br .Lcj7
286
287 .grt7: C n > 7: prime the pipeline, then enter the main loop at .LL11
288 getf.sig r16 = f38
289 xma.l f36 = f32, f6, f0
290 xma.hu f40 = f32, f6, f0
291 ;;
292 getf.sig r20 = f42
293 ldf8 f32 = [up], 8
294 xma.l f37 = f33, f6, f0
295 ;;
296 getf.sig r17 = f39
297 xma.hu f41 = f33, f6, f0
298 br .LL11
299
300
301 .Lb00: ldf8 f33 = [up], 8 C feed-in for n mod 4 == 0; f33 = second source limb
302 mov r21 = 0 C zero addend of the first sum formed at .LL00
303 ;;
304 ldf8 f34 = [up], 8
305 ;;
306 ldf8 f35 = [up], 8
307 xma.l f36 = f7, f6, f9
308 xma.hu f40 = f7, f6, f9
309 br.cloop.dptk .grt4
310 C Not taken: (n - 1) >> 2 == 0 on this path, i.e. n == 4. Join the wind-down at .Lcj4.
311 xma.l f37 = f33, f6, f0
312 xma.hu f41 = f33, f6, f0
313 xma.l f38 = f34, f6, f0
314 xma.hu f42 = f34, f6, f0
315 ;;
316 getf.sig r22 = f40
317 stf8 [rp] = f36, 8 C first result limb, stored straight from the FP register
318 xma.l f39 = f35, f6, f0
319 getf.sig r19 = f37
320 xma.hu f43 = f35, f6, f0
321 ;;
322 getf.sig r23 = f41
323 getf.sig r16 = f38
324 getf.sig r20 = f42
325 getf.sig r17 = f39
326 br .Lcj4
327
328 .grt4: C n > 4: keep loading source limbs while the first four products are formed
329 ldf8 f32 = [up], 8
330 xma.l f37 = f33, f6, f0
331 xma.hu f41 = f33, f6, f0
332 ;;
333 getf.sig r18 = f36 C first low limb
334 ldf8 f33 = [up], 8
335 xma.l f38 = f34, f6, f0
336 xma.hu f42 = f34, f6, f0
337 ;;
338 getf.sig r22 = f40
339 ldf8 f34 = [up], 8
340 xma.l f39 = f35, f6, f0
341 ;;
342 getf.sig r19 = f37
343 getf.sig r23 = f41
344 xma.hu f43 = f35, f6, f0
345 ldf8 f35 = [up], 8
346 br.cloop.dptk .grt8
347 C Not taken: n == 8. Join the wind-down at .Lcj8.
348 getf.sig r16 = f38
349 xma.l f36 = f32, f6, f0
350 getf.sig r20 = f42
351 xma.hu f40 = f32, f6, f0
352 ;;
353 getf.sig r17 = f39
354 st8 [rp] = r18, 8 C first result limb
355 xma.l f37 = f33, f6, f0
356 xma.hu f41 = f33, f6, f0
357 br .Lcj8
358
359 .grt8: C n > 8: enter the main loop at .LL00
360 getf.sig r16 = f38
361 xma.l f36 = f32, f6, f0
362 xma.hu f40 = f32, f6, f0
363 br .LL00
364
365
366 C *** MAIN LOOP START *** Four result limbs are stored per iteration; the feed-in code enters at .LL00, .LL11, .LL10 or .LL01.
367 ALIGN(32)
368 .Loop: C carry state alternates between the predicate pairs p6/p7 and p8/p9
369 .pred.rel "mutex",p6,p7
370 getf.sig r16 = f38
371 xma.l f36 = f32, f6, f0 C low limb of a later product
372 (p6) cmp.leu p8, p9 = r24, r17 C the sum in r24 included +1: carry out iff r24 <= r17
373 st8 [rp] = r24, 8 C store one result limb
374 xma.hu f40 = f32, f6, f0 C high limb of the same product
375 (p7) cmp.ltu p8, p9 = r24, r17 C the sum had no +1: carry out iff r24 < r17
376 ;;
377 .LL00: C entry point used by the n mod 4 == 0 feed-in
378 .pred.rel "mutex",p8,p9
379 getf.sig r20 = f42
380 (p8) add r24 = r18, r21, 1 C low limb + high limb of the preceding product + carry
381 nop.b 0
382 ldf8 f32 = [up], 8 C fetch a source limb for a later iteration
383 (p9) add r24 = r18, r21 C low limb + high limb of the preceding product
384 nop.b 0
385 ;;
386 .pred.rel "mutex",p8,p9
387 getf.sig r17 = f39
388 xma.l f37 = f33, f6, f0
389 (p8) cmp.leu p6, p7 = r24, r18 C carry out of the sum just formed, now into p6/p7
390 st8 [rp] = r24, 8
391 xma.hu f41 = f33, f6, f0
392 (p9) cmp.ltu p6, p7 = r24, r18
393 ;;
394 .LL11: C entry point used by the n mod 4 == 3 feed-in
395 .pred.rel "mutex",p6,p7
396 getf.sig r21 = f43
397 (p6) add r24 = r19, r22, 1
398 nop.b 0
399 ldf8 f33 = [up], 8
400 (p7) add r24 = r19, r22
401 nop.b 0
402 ;;
403 .pred.rel "mutex",p6,p7
404 getf.sig r18 = f36
405 xma.l f38 = f34, f6, f0
406 (p6) cmp.leu p8, p9 = r24, r19
407 st8 [rp] = r24, 8
408 xma.hu f42 = f34, f6, f0
409 (p7) cmp.ltu p8, p9 = r24, r19
410 ;;
411 .LL10: C entry point used by the n mod 4 == 2 feed-in
412 .pred.rel "mutex",p8,p9
413 getf.sig r22 = f40
414 (p8) add r24 = r16, r23, 1
415 nop.b 0
416 ldf8 f34 = [up], 8
417 (p9) add r24 = r16, r23
418 nop.b 0
419 ;;
420 .pred.rel "mutex",p8,p9
421 getf.sig r19 = f37
422 xma.l f39 = f35, f6, f0
423 (p8) cmp.leu p6, p7 = r24, r16
424 st8 [rp] = r24, 8
425 xma.hu f43 = f35, f6, f0
426 (p9) cmp.ltu p6, p7 = r24, r16
427 ;;
428 .LL01: C entry point used by the n mod 4 == 1 feed-in
429 .pred.rel "mutex",p6,p7
430 getf.sig r23 = f41
431 (p6) add r24 = r17, r20, 1
432 nop.b 0
433 ldf8 f35 = [up], 8
434 (p7) add r24 = r17, r20
435 br.cloop.dptk .Loop
436 C *** MAIN LOOP END ***
437 ;;
438
439 .Lcj9: C wind-down: same sum/compare/store pattern as the main loop but with no further loads; reached by falling out of the loop
440 .pred.rel "mutex",p6,p7
441 getf.sig r16 = f38
442 xma.l f36 = f32, f6, f0
443 (p6) cmp.leu p8, p9 = r24, r17 C the sum in r24 included +1: carry out iff r24 <= r17
444 st8 [rp] = r24, 8
445 xma.hu f40 = f32, f6, f0
446 (p7) cmp.ltu p8, p9 = r24, r17 C the sum had no +1: carry out iff r24 < r17
447 ;;
448 .pred.rel "mutex",p8,p9
449 getf.sig r20 = f42
450 (p8) add r24 = r18, r21, 1
451 (p9) add r24 = r18, r21
452 ;;
453 .pred.rel "mutex",p8,p9
454 getf.sig r17 = f39
455 xma.l f37 = f33, f6, f0
456 (p8) cmp.leu p6, p7 = r24, r18
457 st8 [rp] = r24, 8
458 xma.hu f41 = f33, f6, f0
459 (p9) cmp.ltu p6, p7 = r24, r18
460 ;;
461 .Lcj8: C also entered directly from the n == 8 feed-in path
462 .pred.rel "mutex",p6,p7
463 getf.sig r21 = f43
464 (p6) add r24 = r19, r22, 1
465 (p7) add r24 = r19, r22
466 ;;
467 .pred.rel "mutex",p6,p7
468 getf.sig r18 = f36
469 xma.l f38 = f34, f6, f0
470 (p6) cmp.leu p8, p9 = r24, r19
471 st8 [rp] = r24, 8
472 xma.hu f42 = f34, f6, f0
473 (p7) cmp.ltu p8, p9 = r24, r19
474 ;;
475 .Lcj7: C also entered directly from the n == 7 feed-in path
476 .pred.rel "mutex",p8,p9
477 getf.sig r22 = f40
478 (p8) add r24 = r16, r23, 1
479 (p9) add r24 = r16, r23
480 ;;
481 .pred.rel "mutex",p8,p9
482 getf.sig r19 = f37
483 xma.l f39 = f35, f6, f0 C last pair of multiplies in the wind-down
484 (p8) cmp.leu p6, p7 = r24, r16
485 st8 [rp] = r24, 8
486 xma.hu f43 = f35, f6, f0
487 (p9) cmp.ltu p6, p7 = r24, r16
488 ;;
489 .Lcj6: C also entered directly from the n == 6 feed-in path
490 .pred.rel "mutex",p6,p7
491 getf.sig r23 = f41
492 (p6) add r24 = r17, r20, 1
493 (p7) add r24 = r17, r20
494 ;;
495 .pred.rel "mutex",p6,p7
496 (p6) cmp.leu p8, p9 = r24, r17
497 (p7) cmp.ltu p8, p9 = r24, r17
498 getf.sig r16 = f38
499 st8 [rp] = r24, 8
500 ;;
501 .Lcj5: C also entered directly from the n == 5 feed-in path
502 .pred.rel "mutex",p8,p9
503 getf.sig r20 = f42
504 (p8) add r24 = r18, r21, 1
505 (p9) add r24 = r18, r21
506 ;;
507 .pred.rel "mutex",p8,p9
508 (p8) cmp.leu p6, p7 = r24, r18
509 (p9) cmp.ltu p6, p7 = r24, r18
510 getf.sig r17 = f39
511 st8 [rp] = r24, 8
512 ;;
513 .Lcj4: C also entered directly from the n == 4 feed-in path
514 .pred.rel "mutex",p6,p7
515 getf.sig r8 = f43 C high limb of the last product
516 (p6) add r24 = r19, r22, 1
517 (p7) add r24 = r19, r22
518 ;;
519 .pred.rel "mutex",p6,p7
520 st8 [rp] = r24, 8
521 (p6) cmp.leu p8, p9 = r24, r19
522 (p7) cmp.ltu p8, p9 = r24, r19
523 ;;
524 .Lcj3: C also entered directly from the n == 3 feed-in path (r8 already loaded there)
525 .pred.rel "mutex",p8,p9
526 (p8) add r24 = r16, r23, 1
527 (p9) add r24 = r16, r23
528 ;;
529 .pred.rel "mutex",p8,p9
530 st8 [rp] = r24, 8
531 (p8) cmp.leu p6, p7 = r24, r16
532 (p9) cmp.ltu p6, p7 = r24, r16
533 ;;
534 .Lcj2:
535 .pred.rel "mutex",p6,p7
536 (p6) add r24 = r17, r20, 1
537 (p7) add r24 = r17, r20
538 ;;
539 .pred.rel "mutex",p6,p7
540 st8 [rp] = r24, 8 C last result limb
541 (p6) cmp.leu p8, p9 = r24, r17
542 (p7) cmp.ltu p8, p9 = r24, r17
543 ;;
544 .pred.rel "mutex",p8,p9
545 (p8) add r8 = 1, r8 C fold the final carry into the high limb held in r8 (presumably the return value; confirm against the ABI)
546 mov.i ar.lc = r2 C put back the saved ar.lc
547 br.ret.sptk.many b0
548 EPILOGUE()
549
550 PROLOGUE(mpn_mul_1c)
551 .prologue
552 .save ar.lc, r2
553 .body
554 C Entry with carry-in: same setup as mpn_mul_1 except that f9 is loaded from cy; the rest is shared via .Lcommon.
555 ifdef(`HAVE_ABI_32',
556 ` addp4 rp = 0, rp C M I
557 addp4 up = 0, up C M I
558 zxt4 n = n C I
559 ;;
560 ')
561 {.mmi
562 adds r15 = -1, n C M I -- r15 = n - 1
563 setf.sig f9 = cy C M2 M3 -- carry-in limb becomes the addend of the first product
564 mov.i r2 = ar.lc C I0 -- keep the incoming ar.lc in r2; the shared code writes it back before returning
565 }
566 {.mmb
567 ldf8 f7 = [up], 8 C M -- f7 = first source limb; up advances by 8
568 and r14 = 3, n C M I -- r14 = n mod 4, selects the feed-in path
569 br.sptk .Lcommon C continue in the code shared with mpn_mul_1
570 ;;
571 }
572 EPILOGUE()
573 ASM_END()
OLDNEW
« no previous file with comments | « gcc/gmp/mpn/ia64/mode1o.asm ('k') | gcc/gmp/mpn/ia64/mul_2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698