| OLD | NEW |
| (Empty) |
| 1 dnl IA-64 mpn_sqr_diagonal. Helper for sqr_basecase. | |
| 2 | |
| 3 dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc. | |
| 4 | |
| 5 dnl This file is part of the GNU MP Library. | |
| 6 | |
| 7 dnl The GNU MP Library is free software; you can redistribute it and/or modify | |
| 8 dnl it under the terms of the GNU Lesser General Public License as published | |
| 9 dnl by the Free Software Foundation; either version 3 of the License, or (at | |
| 10 dnl your option) any later version. | |
| 11 | |
| 12 dnl The GNU MP Library is distributed in the hope that it will be useful, but | |
| 13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
| 14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| 15 dnl License for more details. | |
| 16 | |
| 17 dnl You should have received a copy of the GNU Lesser General Public License | |
| 18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. | |
| 19 | |
| 20 include(`../config.m4') | |
| 21 | |
| 22 C cycles/limb | |
| 23 C Itanium: 4 | |
| 24 C Itanium 2: 2 | |
| 25 | |
| 26 C TODO | |
| 27 C * Perhaps avoid ctop loop. Unfortunately, a cloop loop running at 1 c/l | |
| 28 C would need prohibitive 8-way unrolling. | |
| 29 C * Instead of messing too much with this, write a nifty mpn_sqr_basecase. | |
| 30 | |
| 31 C INPUT PARAMETERS | |
| 32 C rp = r32 | |
| 33 C sp = r33 | |
| 34 C n = r34 | |
| 35 | |
| 36 ASM_START() | |
| 37 PROLOGUE(mpn_sqr_diagonal) C rp[2i], rp[2i+1] = low, high limb of sp[i]^2 for each of the n limbs | |
| 38 .prologue | |
| 39 .save ar.lc, r2 C unwind info: ar.lc is kept in r2 | |
| 40 .save pr, r15 C unwind info: pr is kept in r15 | |
| 41 .body C under HAVE_ABI_32 the block below widens the 32-bit rp, sp and n | |
| 42 ifdef(`HAVE_ABI_32', | |
| 43 ` addp4 r32 = 0, r32 | |
| 44 addp4 r33 = 0, r33 | |
| 45 zxt4 r34 = r34 | |
| 46 ;; | |
| 47 ') | |
| 48 ldf8 f32 = [r33], 8 C M load sp[0] early (r33 = sp), ahead of the loop | |
| 49 mov r2 = ar.lc C I0 save loop count, restored before return | |
| 50 mov r14 = ar.ec C I0 save epilogue count, restored before return | |
| 51 mov r15 = pr C I0 save predicates, restored before return | |
| 52 add r19 = -1, r34 C M I decr n (presumably callers guarantee n >= 1, confirm) | |
| 53 add r18 = 8, r32 C M I rp for high limb | |
| 54 ;; | |
| 55 mov ar.lc = r19 C I0 loop count = n-1 | |
| 56 mov ar.ec = 5 C I0 epilogue count used by cexit/ctop to drain the pipeline | |
| 57 mov pr.rot = 1<<16 C I0 rotating predicates: only p16 set | |
| 58 ;; | |
| 59 br.cexit.spnt .Ldone C B advance one stage for the early load; with ar.ec = 5 this should never branch (confirm) | |
| 60 ;; | |
| 61 ALIGN(32) | |
| 62 .Loop: C software pipelined; stages are selected by p16, p19 and p21 | |
| 63 (p16) ldf8 f32 = [r33], 8 C M load next source limb, sp advances one limb | |
| 64 (p19) xma.l f36 = f35, f35, f0 C F low half of limb squared | |
| 65 (p21) stf8 [r32] = f38, 16 C M2 M3 store low half, step over the high limb slot | |
| 66 (p19) xma.hu f40 = f35, f35, f0 C F high half of limb squared (unsigned) | |
| 67 (p21) stf8 [r18] = f42, 16 C M2 M3 store high half, step over the low limb slot | |
| 68 br.ctop.dptk .Loop C B rotate registers and continue while lc/ec allow | |
| 69 ;; | |
| 70 .Ldone: C the last limb's square is stored here, not by the loop | |
| 71 stf8 [r32] = f38 C M2 M3 low half of the last square | |
| 72 stf8 [r18] = f42 C M2 M3 high half of the last square | |
| 73 mov ar.ec = r14 C I0 restore epilogue count | |
| 74 ;; | |
| 75 mov pr = r15, 0x1ffff C I0 restore predicates saved on entry | |
| 76 mov ar.lc = r2 C I0 restore loop count | |
| 77 br.ret.sptk.many b0 C B | |
| 78 EPILOGUE(mpn_sqr_diagonal) | |
| 79 ASM_END() | |
| OLD | NEW |