OLD | NEW |
| (Empty) |
1 dnl IA-64 mpn_sqr_diagonal. Helper for sqr_basecase. | |
2 | |
3 dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc. | |
4 | |
5 dnl This file is part of the GNU MP Library. | |
6 | |
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify | |
8 dnl it under the terms of the GNU Lesser General Public License as published | |
9 dnl by the Free Software Foundation; either version 3 of the License, or (at | |
10 dnl your option) any later version. | |
11 | |
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but | |
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
15 dnl License for more details. | |
16 | |
17 dnl You should have received a copy of the GNU Lesser General Public License | |
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
19 | |
20 include(`../config.m4') | |
21 | |
22 C cycles/limb | |
23 C Itanium: 4 | |
24 C Itanium 2: 2 | |
25 | |
26 C TODO | |
27 C * Perhaps avoid ctop loop. Unfortunately, a cloop loop running at 1 c/l | |
28 C would need prohibitive 8-way unrolling. | |
29 C * Instead of messing too much with this, write a nifty mpn_sqr_basecase. | |
30 | |
31 C INPUT PARAMETERS | |
32 C rp = r32 | |
33 C sp = r33 | |
34 C n = r34 | |
35 | |
36 ASM_START() | |
C mpn_sqr_diagonal: for each limb read through r33 (sp), form the square with
C xma.l (low half) and xma.hu (high half), then store the low half through r32
C and the high half through r18 (= r32 + 8); both store pointers advance by 16
C bytes per limb, so the two halves land in adjacent 8-byte slots.
C The loop is a br.ctop loop using rotating registers and stage predicates:
C p16 gates the load, p19 the two multiplies, p21 the two stores.
C ar.lc, ar.ec and pr are saved in r2, r14 and r15 and restored before return.
37 PROLOGUE(mpn_sqr_diagonal) | |
38 .prologue | |
39 .save ar.lc, r2 | |
40 .save pr, r15 | |
41 .body | |
C With HAVE_ABI_32 defined, the two pointer arguments go through addp4 and n
C through zxt4 before use.
42 ifdef(`HAVE_ABI_32', | |
43 ` addp4 r32 = 0, r32 | |
44 addp4 r33 = 0, r33 | |
45 zxt4 r34 = r34 | |
46 ;; | |
47 ') | |
48 ldf8 f32 = [r33], 8 C M load sp[0] early | |
49 mov r2 = ar.lc C I0 save loop count | |
50 mov r14 = ar.ec C I0 save epilog count | |
51 mov r15 = pr C I0 save predicates | |
52 add r19 = -1, r34 C M I decr n | |
53 add r18 = 8, r32 C M I rp for high limb | |
54 ;; | |
55 mov ar.lc = r19 C I0 loop count = n-1 | |
56 mov ar.ec = 5 C I0 epilog count | |
57 mov pr.rot = 1<<16 C I0 p16 = 1, other rotating predicates 0 | |
58 ;; | |
C NOTE(review): with ar.ec = 5 this br.cexit looks like it always falls
C through, serving as one register rotation that moves the early-loaded limb
C one stage down the pipeline before the loop starts -- confirm against the
C br.cexit definition.
59 br.cexit.spnt .Ldone C B | |
60 ;; | |
61 ALIGN(32) | |
62 .Loop: | |
63 (p16) ldf8 f32 = [r33], 8 C M load next source limb | |
64 (p19) xma.l f36 = f35, f35, f0 C F low half of square | |
65 (p21) stf8 [r32] = f38, 16 C M2 M3 store low half, step 16 | |
66 (p19) xma.hu f40 = f35, f35, f0 C F high half of square | |
67 (p21) stf8 [r18] = f42, 16 C M2 M3 store high half, step 16 | |
68 br.ctop.dptk .Loop C B | |
69 ;; | |
C The last low/high pair is still in f38/f42 when the loop falls through, so
C it is stored here without post-increment.
70 .Ldone: | |
71 stf8 [r32] = f38 C M2 M3 final low half | |
72 stf8 [r18] = f42 C M2 M3 final high half | |
73 mov ar.ec = r14 C I0 restore epilog count | |
74 ;; | |
75 mov pr = r15, 0x1ffff C I0 restore predicates | |
76 mov ar.lc = r2 C I0 restore loop count | |
77 br.ret.sptk.many b0 C B | |
78 EPILOGUE(mpn_sqr_diagonal) | |
79 ASM_END() | |
OLD | NEW |